Diffstat (limited to 'lib')
538 files changed, 36684 insertions, 21534 deletions
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp index be02ddb..c189a00 100644 --- a/lib/Analysis/AliasAnalysis.cpp +++ b/lib/Analysis/AliasAnalysis.cpp @@ -86,14 +86,20 @@ AliasAnalysis::getModRefInfo(ImmutableCallSite CS, if (onlyAccessesArgPointees(MRB)) { bool doesAlias = false; - if (doesAccessArgPointees(MRB)) + if (doesAccessArgPointees(MRB)) { + MDNode *CSTag = CS.getInstruction()->getMetadata(LLVMContext::MD_tbaa); for (ImmutableCallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end(); - AI != AE; ++AI) - if (!isNoAlias(Location(*AI), Loc)) { + AI != AE; ++AI) { + const Value *Arg = *AI; + if (!Arg->getType()->isPointerTy()) + continue; + Location CSLoc(Arg, UnknownSize, CSTag); + if (!isNoAlias(CSLoc, Loc)) { doesAlias = true; break; } - + } + } if (!doesAlias) return NoModRef; } @@ -138,13 +144,19 @@ AliasAnalysis::getModRefInfo(ImmutableCallSite CS1, ImmutableCallSite CS2) { // CS2's arguments. if (onlyAccessesArgPointees(CS2B)) { AliasAnalysis::ModRefResult R = NoModRef; - if (doesAccessArgPointees(CS2B)) + if (doesAccessArgPointees(CS2B)) { + MDNode *CS2Tag = CS2.getInstruction()->getMetadata(LLVMContext::MD_tbaa); for (ImmutableCallSite::arg_iterator I = CS2.arg_begin(), E = CS2.arg_end(); I != E; ++I) { - R = ModRefResult((R | getModRefInfo(CS1, *I, UnknownSize)) & Mask); + const Value *Arg = *I; + if (!Arg->getType()->isPointerTy()) + continue; + Location CS2Loc(Arg, UnknownSize, CS2Tag); + R = ModRefResult((R | getModRefInfo(CS1, CS2Loc)) & Mask); if (R == Mask) break; } + } return R; } @@ -152,13 +164,20 @@ AliasAnalysis::getModRefInfo(ImmutableCallSite CS1, ImmutableCallSite CS2) { // any of the memory referenced by CS1's arguments. If not, return NoModRef. if (onlyAccessesArgPointees(CS1B)) { AliasAnalysis::ModRefResult R = NoModRef; - if (doesAccessArgPointees(CS1B)) + if (doesAccessArgPointees(CS1B)) { + MDNode *CS1Tag = CS1.getInstruction()->getMetadata(LLVMContext::MD_tbaa); for (ImmutableCallSite::arg_iterator - I = CS1.arg_begin(), E = CS1.arg_end(); I != E; ++I) - if (getModRefInfo(CS2, *I, UnknownSize) != NoModRef) { + I = CS1.arg_begin(), E = CS1.arg_end(); I != E; ++I) { + const Value *Arg = *I; + if (!Arg->getType()->isPointerTy()) + continue; + Location CS1Loc(Arg, UnknownSize, CS1Tag); + if (getModRefInfo(CS2, CS1Loc) != NoModRef) { R = Mask; break; } + } + } if (R == NoModRef) return R; } diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp index 3a46976..2ed6949 100644 --- a/lib/Analysis/AliasSetTracker.cpp +++ b/lib/Analysis/AliasSetTracker.cpp @@ -602,6 +602,10 @@ void AliasSetTracker::ASTCallbackVH::deleted() { // this now dangles! 
} +void AliasSetTracker::ASTCallbackVH::allUsesReplacedWith(Value *V) { + AST->copyValue(getValPtr(), V); +} + AliasSetTracker::ASTCallbackVH::ASTCallbackVH(Value *V, AliasSetTracker *ast) : CallbackVH(V), AST(ast) {} diff --git a/lib/Analysis/Analysis.cpp b/lib/Analysis/Analysis.cpp index 6ebe100..e57ba78 100644 --- a/lib/Analysis/Analysis.cpp +++ b/lib/Analysis/Analysis.cpp @@ -23,6 +23,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) { initializeAliasSetPrinterPass(Registry); initializeNoAAPass(Registry); initializeBasicAliasAnalysisPass(Registry); + initializeBranchProbabilityInfoPass(Registry); initializeCFGViewerPass(Registry); initializeCFGPrinterPass(Registry); initializeCFGOnlyViewerPass(Registry); diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp index f7bcd9e..8330ea7 100644 --- a/lib/Analysis/BasicAliasAnalysis.cpp +++ b/lib/Analysis/BasicAliasAnalysis.cpp @@ -281,17 +281,20 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs, continue; } - if (const Instruction *I = dyn_cast<Instruction>(V)) - // TODO: Get a DominatorTree and use it here. - if (const Value *Simplified = - SimplifyInstruction(const_cast<Instruction *>(I), TD)) { - V = Simplified; - continue; - } - const GEPOperator *GEPOp = dyn_cast<GEPOperator>(Op); - if (GEPOp == 0) + if (GEPOp == 0) { + // If it's not a GEP, hand it off to SimplifyInstruction to see if it + // can come up with something. This matches what GetUnderlyingObject does. + if (const Instruction *I = dyn_cast<Instruction>(V)) + // TODO: Get a DominatorTree and use it here. + if (const Value *Simplified = + SimplifyInstruction(const_cast<Instruction *>(I), TD)) { + V = Simplified; + continue; + } + return V; + } // Don't attempt to analyze GEPs over unsized objects. if (!cast<PointerType>(GEPOp->getOperand(0)->getType()) @@ -350,7 +353,7 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs, Scale *= IndexScale.getSExtValue(); - // If we already had an occurrance of this index variable, merge this + // If we already had an occurrence of this index variable, merge this // scale into it. For example, we want to handle: // A[x][x] -> x*16 + x*4 -> x*20 // This also ensures that 'x' only appears in the index list once. @@ -448,7 +451,13 @@ namespace { /// BasicAliasAnalysis - This is the primary alias analysis implementation. struct BasicAliasAnalysis : public ImmutablePass, public AliasAnalysis { static char ID; // Class identification, replacement for typeinfo - BasicAliasAnalysis() : ImmutablePass(ID) { + BasicAliasAnalysis() : ImmutablePass(ID), + // AliasCache rarely has more than 1 or 2 elements, + // so start it off fairly small so that clear() + // doesn't have to tromp through 64 (the default) + // elements on each alias query. This really wants + // something like a SmallDenseMap. 
+ AliasCache(8) { initializeBasicAliasAnalysisPass(*PassRegistry::getPassRegistry()); } @@ -462,12 +471,12 @@ namespace { virtual AliasResult alias(const Location &LocA, const Location &LocB) { - assert(Visited.empty() && "Visited must be cleared after use!"); + assert(AliasCache.empty() && "AliasCache must be cleared after use!"); assert(notDifferentParent(LocA.Ptr, LocB.Ptr) && "BasicAliasAnalysis doesn't support interprocedural queries."); AliasResult Alias = aliasCheck(LocA.Ptr, LocA.Size, LocA.TBAATag, LocB.Ptr, LocB.Size, LocB.TBAATag); - Visited.clear(); + AliasCache.clear(); return Alias; } @@ -503,7 +512,12 @@ namespace { } private: - // Visited - Track instructions visited by a aliasPHI, aliasSelect(), and aliasGEP(). + // AliasCache - Track alias queries to guard against recursion. + typedef std::pair<Location, Location> LocPair; + typedef DenseMap<LocPair, AliasResult> AliasCacheTy; + AliasCacheTy AliasCache; + + // Visited - Track instructions visited by pointsToConstantMemory. SmallPtrSet<const Value*, 16> Visited; // aliasGEP - Provide a bunch of ad-hoc rules to disambiguate a GEP @@ -680,9 +694,12 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS, unsigned ArgNo = 0; for (ImmutableCallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end(); CI != CE; ++CI, ++ArgNo) { - // Only look at the no-capture pointer arguments. + // Only look at the no-capture or byval pointer arguments. If this + // pointer were passed to arguments that were neither of these, then it + // couldn't be no-capture. if (!(*CI)->getType()->isPointerTy() || - !CS.paramHasAttr(ArgNo+1, Attribute::NoCapture)) + (!CS.paramHasAttr(ArgNo+1, Attribute::NoCapture) && + !CS.paramHasAttr(ArgNo+1, Attribute::ByVal))) continue; // If this is a no-capture pointer argument, see if we can tell that it @@ -779,6 +796,26 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS, return NoModRef; break; } + case Intrinsic::arm_neon_vld1: { + // LLVM's vld1 and vst1 intrinsics currently only support a single + // vector register. + uint64_t Size = + TD ? TD->getTypeStoreSize(II->getType()) : UnknownSize; + if (isNoAlias(Location(II->getArgOperand(0), Size, + II->getMetadata(LLVMContext::MD_tbaa)), + Loc)) + return NoModRef; + break; + } + case Intrinsic::arm_neon_vst1: { + uint64_t Size = + TD ? TD->getTypeStoreSize(II->getArgOperand(1)->getType()) : UnknownSize; + if (isNoAlias(Location(II->getArgOperand(0), Size, + II->getMetadata(LLVMContext::MD_tbaa)), + Loc)) + return NoModRef; + break; + } } // The AliasAnalysis base class has some smarts, lets use them. @@ -796,13 +833,6 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size, const MDNode *V2TBAAInfo, const Value *UnderlyingV1, const Value *UnderlyingV2) { - // If this GEP has been visited before, we're on a use-def cycle. - // Such cycles are only valid when PHI nodes are involved or in unreachable - // code. The visitPHI function catches cycles containing PHIs, but there - // could still be a cycle without PHIs in unreachable code. 
- if (!Visited.insert(GEP1)) - return MayAlias; - int64_t GEP1BaseOffset; SmallVector<VariableGEPIndex, 4> GEP1VariableIndices; @@ -883,7 +913,7 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size, if (GEP1BaseOffset == 0 && GEP1VariableIndices.empty()) return MustAlias; - // If there is a difference betwen the pointers, but the difference is + // If there is a difference between the pointers, but the difference is // less than the size of the associated memory object, then we know // that the objects are partially overlapping. if (GEP1BaseOffset != 0 && GEP1VariableIndices.empty()) { @@ -920,7 +950,30 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size, return NoAlias; } - return MayAlias; + // Statically, we can see that the base objects are the same, but the + // pointers have dynamic offsets which we can't resolve. And none of our + // little tricks above worked. + // + // TODO: Returning PartialAlias instead of MayAlias is a mild hack; the + // practical effect of this is protecting TBAA in the case of dynamic + // indices into arrays of unions. An alternative way to solve this would + // be to have clang emit extra metadata for unions and/or union accesses. + // A union-specific solution wouldn't handle the problem for malloc'd + // memory however. + return PartialAlias; +} + +static AliasAnalysis::AliasResult +MergeAliasResults(AliasAnalysis::AliasResult A, AliasAnalysis::AliasResult B) { + // If the results agree, take it. + if (A == B) + return A; + // A mix of PartialAlias and MustAlias is PartialAlias. + if ((A == AliasAnalysis::PartialAlias && B == AliasAnalysis::MustAlias) || + (B == AliasAnalysis::PartialAlias && A == AliasAnalysis::MustAlias)) + return AliasAnalysis::PartialAlias; + // Otherwise, we don't know anything. + return AliasAnalysis::MayAlias; } /// aliasSelect - Provide a bunch of ad-hoc rules to disambiguate a Select @@ -930,13 +983,6 @@ BasicAliasAnalysis::aliasSelect(const SelectInst *SI, uint64_t SISize, const MDNode *SITBAAInfo, const Value *V2, uint64_t V2Size, const MDNode *V2TBAAInfo) { - // If this select has been visited before, we're on a use-def cycle. - // Such cycles are only valid when PHI nodes are involved or in unreachable - // code. The visitPHI function catches cycles containing PHIs, but there - // could still be a cycle without PHIs in unreachable code. - if (!Visited.insert(SI)) - return MayAlias; - // If the values are Selects with the same condition, we can do a more precise // check: just check for aliases between the values on corresponding arms. if (const SelectInst *SI2 = dyn_cast<SelectInst>(V2)) @@ -949,9 +995,7 @@ BasicAliasAnalysis::aliasSelect(const SelectInst *SI, uint64_t SISize, AliasResult ThisAlias = aliasCheck(SI->getFalseValue(), SISize, SITBAAInfo, SI2->getFalseValue(), V2Size, V2TBAAInfo); - if (ThisAlias != Alias) - return MayAlias; - return Alias; + return MergeAliasResults(ThisAlias, Alias); } // If both arms of the Select node NoAlias or MustAlias V2, then returns @@ -961,16 +1005,9 @@ BasicAliasAnalysis::aliasSelect(const SelectInst *SI, uint64_t SISize, if (Alias == MayAlias) return MayAlias; - // If V2 is visited, the recursive case will have been caught in the - // above aliasCheck call, so these subsequent calls to aliasCheck - // don't need to assume that V2 is being visited recursively. 
- Visited.erase(V2); - AliasResult ThisAlias = aliasCheck(V2, V2Size, V2TBAAInfo, SI->getFalseValue(), SISize, SITBAAInfo); - if (ThisAlias != Alias) - return MayAlias; - return Alias; + return MergeAliasResults(ThisAlias, Alias); } // aliasPHI - Provide a bunch of ad-hoc rules to disambiguate a PHI instruction @@ -980,10 +1017,6 @@ BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize, const MDNode *PNTBAAInfo, const Value *V2, uint64_t V2Size, const MDNode *V2TBAAInfo) { - // The PHI node has already been visited, avoid recursion any further. - if (!Visited.insert(PN)) - return MayAlias; - // If the values are PHIs in the same block, we can do a more precise // as well as efficient check: just check for aliases between the values // on corresponding edges. @@ -1000,8 +1033,9 @@ BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize, aliasCheck(PN->getIncomingValue(i), PNSize, PNTBAAInfo, PN2->getIncomingValueForBlock(PN->getIncomingBlock(i)), V2Size, V2TBAAInfo); - if (ThisAlias != Alias) - return MayAlias; + Alias = MergeAliasResults(ThisAlias, Alias); + if (Alias == MayAlias) + break; } return Alias; } @@ -1032,15 +1066,11 @@ BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize, for (unsigned i = 1, e = V1Srcs.size(); i != e; ++i) { Value *V = V1Srcs[i]; - // If V2 is visited, the recursive case will have been caught in the - // above aliasCheck call, so these subsequent calls to aliasCheck - // don't need to assume that V2 is being visited recursively. - Visited.erase(V2); - AliasResult ThisAlias = aliasCheck(V2, V2Size, V2TBAAInfo, V, PNSize, PNTBAAInfo); - if (ThisAlias != Alias || ThisAlias == MayAlias) - return MayAlias; + Alias = MergeAliasResults(ThisAlias, Alias); + if (Alias == MayAlias) + break; } return Alias; @@ -1125,6 +1155,17 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size, (V2Size != UnknownSize && isObjectSmallerThan(O1, V2Size, *TD))) return NoAlias; + // Check the cache before climbing up use-def chains. This also terminates + // otherwise infinitely recursive queries. + LocPair Locs(Location(V1, V1Size, V1TBAAInfo), + Location(V2, V2Size, V2TBAAInfo)); + if (V1 > V2) + std::swap(Locs.first, Locs.second); + std::pair<AliasCacheTy::iterator, bool> Pair = + AliasCache.insert(std::make_pair(Locs, MayAlias)); + if (!Pair.second) + return Pair.first->second; + // FIXME: This isn't aggressively handling alias(GEP, PHI) for example: if the // GEP can't simplify, we don't even look at the PHI cases. 
   if (!isa<GEPOperator>(V1) && isa<GEPOperator>(V2)) {
@@ -1134,7 +1175,7 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
   }
   if (const GEPOperator *GV1 = dyn_cast<GEPOperator>(V1)) {
     AliasResult Result = aliasGEP(GV1, V1Size, V2, V2Size, V2TBAAInfo, O1, O2);
-    if (Result != MayAlias) return Result;
+    if (Result != MayAlias) return AliasCache[Locs] = Result;
   }
 
   if (isa<PHINode>(V2) && !isa<PHINode>(V1)) {
@@ -1144,7 +1185,7 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
   if (const PHINode *PN = dyn_cast<PHINode>(V1)) {
     AliasResult Result = aliasPHI(PN, V1Size, V1TBAAInfo,
                                   V2, V2Size, V2TBAAInfo);
-    if (Result != MayAlias) return Result;
+    if (Result != MayAlias) return AliasCache[Locs] = Result;
   }
 
   if (isa<SelectInst>(V2) && !isa<SelectInst>(V1)) {
@@ -1154,7 +1195,7 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
   if (const SelectInst *S1 = dyn_cast<SelectInst>(V1)) {
     AliasResult Result = aliasSelect(S1, V1Size, V1TBAAInfo,
                                      V2, V2Size, V2TBAAInfo);
-    if (Result != MayAlias) return Result;
+    if (Result != MayAlias) return AliasCache[Locs] = Result;
   }
 
   // If both pointers are pointing into the same object and one of them
@@ -1163,8 +1204,10 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
   if (TD && O1 == O2)
     if ((V1Size != UnknownSize && isObjectSize(O1, V1Size, *TD)) ||
         (V2Size != UnknownSize && isObjectSize(O2, V2Size, *TD)))
-      return PartialAlias;
+      return AliasCache[Locs] = PartialAlias;
 
-  return AliasAnalysis::alias(Location(V1, V1Size, V1TBAAInfo),
-                              Location(V2, V2Size, V2TBAAInfo));
+  AliasResult Result =
+    AliasAnalysis::alias(Location(V1, V1Size, V1TBAAInfo),
+                         Location(V2, V2Size, V2TBAAInfo));
+  return AliasCache[Locs] = Result;
 }
diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp
new file mode 100644
index 0000000..15059c7
--- /dev/null
+++ b/lib/Analysis/BranchProbabilityInfo.cpp
@@ -0,0 +1,358 @@
+//===-- BranchProbabilityInfo.cpp - Branch Probability Analysis -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Loops should be simplified before this analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Instructions.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+INITIALIZE_PASS_BEGIN(BranchProbabilityInfo, "branch-prob",
+                      "Branch Probability Analysis", false, true)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_END(BranchProbabilityInfo, "branch-prob",
+                    "Branch Probability Analysis", false, true)
+
+char BranchProbabilityInfo::ID = 0;
+
+namespace {
+// Please note that BranchProbabilityAnalysis is not a FunctionPass.
+// It is created by BranchProbabilityInfo (which is a FunctionPass), which
+// provides a clear interface. Thanks to that, all heuristics and other
+// private methods are hidden in the .cpp file.
+class BranchProbabilityAnalysis {
+
+  typedef std::pair<BasicBlock *, BasicBlock *> Edge;
+
+  DenseMap<Edge, uint32_t> *Weights;
+
+  BranchProbabilityInfo *BP;
+
+  LoopInfo *LI;
+
+
+  // Weights are for internal use only. They are used by heuristics to help
+  // estimate edges' probability. Example:
+  //
+  // Using "Loop Branch Heuristics" we predict weights of edges for the
+  // block BB2.
+  //  ...
+  //   |
+  //   V
+  //  BB1<-+
+  //   |   |
+  //   |   | (Weight = 128)
+  //   V   |
+  //  BB2--+
+  //   |
+  //   | (Weight = 4)
+  //   V
+  //  BB3
+  //
+  // Probability of the edge BB2->BB1 = 128 / (128 + 4) = 0.9696..
+  // Probability of the edge BB2->BB3 = 4 / (128 + 4) = 0.0303..
+
+  static const uint32_t LBH_TAKEN_WEIGHT = 128;
+  static const uint32_t LBH_NONTAKEN_WEIGHT = 4;
+
+  // Standard weight value. Used when none of the heuristics set a weight for
+  // the edge.
+  static const uint32_t NORMAL_WEIGHT = 16;
+
+  // Minimum weight of an edge. Please note that a weight is NEVER 0.
+  static const uint32_t MIN_WEIGHT = 1;
+
+  // Return TRUE if BB leads directly to a Return Instruction.
+  static bool isReturningBlock(BasicBlock *BB) {
+    SmallPtrSet<BasicBlock *, 8> Visited;
+
+    while (true) {
+      TerminatorInst *TI = BB->getTerminator();
+      if (isa<ReturnInst>(TI))
+        return true;
+
+      if (TI->getNumSuccessors() > 1)
+        break;
+
+      // It is an unreachable block, which we can consider as a return
+      // instruction.
+      if (TI->getNumSuccessors() == 0)
+        return true;
+
+      Visited.insert(BB);
+      BB = TI->getSuccessor(0);
+
+      // Stop if a cycle is detected.
+      if (Visited.count(BB))
+        return false;
+    }
+
+    return false;
+  }
+
+  // Multiply Edge Weight by two.
+  void incEdgeWeight(BasicBlock *Src, BasicBlock *Dst) {
+    uint32_t Weight = BP->getEdgeWeight(Src, Dst);
+    uint32_t MaxWeight = getMaxWeightFor(Src);
+
+    if (Weight * 2 > MaxWeight)
+      BP->setEdgeWeight(Src, Dst, MaxWeight);
+    else
+      BP->setEdgeWeight(Src, Dst, Weight * 2);
+  }
+
+  // Divide Edge Weight by two.
+  void decEdgeWeight(BasicBlock *Src, BasicBlock *Dst) {
+    uint32_t Weight = BP->getEdgeWeight(Src, Dst);
+
+    assert(Weight > 0);
+    if (Weight / 2 < MIN_WEIGHT)
+      BP->setEdgeWeight(Src, Dst, MIN_WEIGHT);
+    else
+      BP->setEdgeWeight(Src, Dst, Weight / 2);
+  }
+
+
+  uint32_t getMaxWeightFor(BasicBlock *BB) const {
+    return UINT32_MAX / BB->getTerminator()->getNumSuccessors();
+  }
+
+public:
+  BranchProbabilityAnalysis(DenseMap<Edge, uint32_t> *W,
+                            BranchProbabilityInfo *BP, LoopInfo *LI)
+    : Weights(W), BP(BP), LI(LI) {
+  }
+
+  // Return Heuristics
+  void calcReturnHeuristics(BasicBlock *BB);
+
+  // Pointer Heuristics
+  void calcPointerHeuristics(BasicBlock *BB);
+
+  // Loop Branch Heuristics
+  void calcLoopBranchHeuristics(BasicBlock *BB);
+
+  bool runOnFunction(Function &F);
+};
+} // end anonymous namespace
+
+// Calculate Edge Weights using "Return Heuristics". Predict that a successor
+// which leads directly to a Return Instruction will not be taken.
+void BranchProbabilityAnalysis::calcReturnHeuristics(BasicBlock *BB){
+  if (BB->getTerminator()->getNumSuccessors() == 1)
+    return;
+
+  for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+    BasicBlock *Succ = *I;
+    if (isReturningBlock(Succ)) {
+      decEdgeWeight(BB, Succ);
+    }
+  }
+}
+
+// Calculate Edge Weights using "Pointer Heuristics". Predict that a
+// comparison between two pointers, or between a pointer and NULL, will fail.
+void BranchProbabilityAnalysis::calcPointerHeuristics(BasicBlock *BB) {
+  BranchInst * BI = dyn_cast<BranchInst>(BB->getTerminator());
+  if (!BI || !BI->isConditional())
+    return;
+
+  Value *Cond = BI->getCondition();
+  ICmpInst *CI = dyn_cast<ICmpInst>(Cond);
+  if (!CI)
+    return;
+
+  Value *LHS = CI->getOperand(0);
+
+  if (!LHS->getType()->isPointerTy())
+    return;
+
+  assert(CI->getOperand(1)->getType()->isPointerTy());
+
+  BasicBlock *Taken = BI->getSuccessor(0);
+  BasicBlock *NonTaken = BI->getSuccessor(1);
+
+  // p != 0 -> isProb = true
+  // p == 0 -> isProb = false
+  // p != q -> isProb = true
+  // p == q -> isProb = false;
+  bool isProb = !CI->isEquality();
+  if (!isProb)
+    std::swap(Taken, NonTaken);
+
+  incEdgeWeight(BB, Taken);
+  decEdgeWeight(BB, NonTaken);
+}
+
+// Calculate Edge Weights using "Loop Branch Heuristics". Predict backedges
+// as taken, exiting edges as not-taken.
+void BranchProbabilityAnalysis::calcLoopBranchHeuristics(BasicBlock *BB) {
+  uint32_t numSuccs = BB->getTerminator()->getNumSuccessors();
+
+  Loop *L = LI->getLoopFor(BB);
+  if (!L)
+    return;
+
+  SmallVector<BasicBlock *, 8> BackEdges;
+  SmallVector<BasicBlock *, 8> ExitingEdges;
+
+  for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+    BasicBlock *Succ = *I;
+    Loop *SuccL = LI->getLoopFor(Succ);
+    if (SuccL != L)
+      ExitingEdges.push_back(Succ);
+    else if (Succ == L->getHeader())
+      BackEdges.push_back(Succ);
+  }
+
+  if (uint32_t numBackEdges = BackEdges.size()) {
+    uint32_t backWeight = LBH_TAKEN_WEIGHT / numBackEdges;
+    if (backWeight < NORMAL_WEIGHT)
+      backWeight = NORMAL_WEIGHT;
+
+    for (SmallVector<BasicBlock *, 8>::iterator EI = BackEdges.begin(),
+         EE = BackEdges.end(); EI != EE; ++EI) {
+      BasicBlock *Back = *EI;
+      BP->setEdgeWeight(BB, Back, backWeight);
+    }
+  }
+
+  uint32_t numExitingEdges = ExitingEdges.size();
+  if (uint32_t numNonExitingEdges = numSuccs - numExitingEdges) {
+    uint32_t exitWeight = LBH_NONTAKEN_WEIGHT / numNonExitingEdges;
+    if (exitWeight < MIN_WEIGHT)
+      exitWeight = MIN_WEIGHT;
+
+    for (SmallVector<BasicBlock *, 8>::iterator EI = ExitingEdges.begin(),
+         EE = ExitingEdges.end(); EI != EE; ++EI) {
+      BasicBlock *Exiting = *EI;
+      BP->setEdgeWeight(BB, Exiting, exitWeight);
+    }
+  }
+}
+
+bool BranchProbabilityAnalysis::runOnFunction(Function &F) {
+
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ) {
+    BasicBlock *BB = I++;
+
+    // Only LBH uses the setEdgeWeight method.
+    calcLoopBranchHeuristics(BB);
+
+    // PH and RH use only the incEdgeWeight and decEdgeWeight methods, so as
+    // not to efface LBH results.
+    calcPointerHeuristics(BB);
+    calcReturnHeuristics(BB);
+  }
+
+  return false;
+}
+
+
+bool BranchProbabilityInfo::runOnFunction(Function &F) {
+  LoopInfo &LI = getAnalysis<LoopInfo>();
+  BranchProbabilityAnalysis BPA(&Weights, this, &LI);
+  return BPA.runOnFunction(F);
+}
+
+uint32_t BranchProbabilityInfo::getSumForBlock(BasicBlock *BB) const {
+  uint32_t Sum = 0;
+
+  for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+    BasicBlock *Succ = *I;
+    uint32_t Weight = getEdgeWeight(BB, Succ);
+    uint32_t PrevSum = Sum;
+
+    Sum += Weight;
+    assert(Sum > PrevSum); (void) PrevSum;
+  }
+
+  return Sum;
+}
+
+bool BranchProbabilityInfo::isEdgeHot(BasicBlock *Src, BasicBlock *Dst) const {
+  // Hot probability is at least 4/5 = 80%
+  uint32_t Weight = getEdgeWeight(Src, Dst);
+  uint32_t Sum = getSumForBlock(Src);
+
+  // FIXME: Implement BranchProbability::compare then change this code to
+  // compare this BranchProbability against a static "hot" BranchProbability.
+  return (uint64_t)Weight * 5 > (uint64_t)Sum * 4;
+}
+
+BasicBlock *BranchProbabilityInfo::getHotSucc(BasicBlock *BB) const {
+  uint32_t Sum = 0;
+  uint32_t MaxWeight = 0;
+  BasicBlock *MaxSucc = 0;
+
+  for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+    BasicBlock *Succ = *I;
+    uint32_t Weight = getEdgeWeight(BB, Succ);
+    uint32_t PrevSum = Sum;
+
+    Sum += Weight;
+    assert(Sum > PrevSum); (void) PrevSum;
+
+    if (Weight > MaxWeight) {
+      MaxWeight = Weight;
+      MaxSucc = Succ;
+    }
+  }
+
+  // FIXME: Use BranchProbability::compare.
+  if ((uint64_t)MaxWeight * 5 > (uint64_t)Sum * 4)
+    return MaxSucc;
+
+  return 0;
+}
+
+// Return the edge's weight. If we can't find it, return DEFAULT_WEIGHT.
+uint32_t
+BranchProbabilityInfo::getEdgeWeight(BasicBlock *Src, BasicBlock *Dst) const {
+  Edge E(Src, Dst);
+  DenseMap<Edge, uint32_t>::const_iterator I = Weights.find(E);
+
+  if (I != Weights.end())
+    return I->second;
+
+  return DEFAULT_WEIGHT;
+}
+
+void BranchProbabilityInfo::setEdgeWeight(BasicBlock *Src, BasicBlock *Dst,
+                                          uint32_t Weight) {
+  Weights[std::make_pair(Src, Dst)] = Weight;
+  DEBUG(dbgs() << "set edge " << Src->getNameStr() << " -> "
+               << Dst->getNameStr() << " weight to " << Weight
+               << (isEdgeHot(Src, Dst) ? " [is HOT now]\n" : "\n"));
+}
+
+
+BranchProbability BranchProbabilityInfo::
+getEdgeProbability(BasicBlock *Src, BasicBlock *Dst) const {
+
+  uint32_t N = getEdgeWeight(Src, Dst);
+  uint32_t D = getSumForBlock(Src);
+
+  return BranchProbability(N, D);
+}
+
+raw_ostream &
+BranchProbabilityInfo::printEdgeProbability(raw_ostream &OS, BasicBlock *Src,
+                                            BasicBlock *Dst) const {
+
+  const BranchProbability Prob = getEdgeProbability(Src, Dst);
+  OS << "edge " << Src->getNameStr() << " -> " << Dst->getNameStr()
+     << " probability is " << Prob
+     << (isEdgeHot(Src, Dst) ? " [HOT edge]\n" : "\n");
+
+  return OS;
+}
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index 6be5617..1a975bf 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_library(LLVMAnalysis
   AliasSetTracker.cpp
   Analysis.cpp
   BasicAliasAnalysis.cpp
+  BranchProbabilityInfo.cpp
   CFGPrinter.cpp
   CaptureTracking.cpp
   ConstantFolding.cpp
diff --git a/lib/Analysis/CaptureTracking.cpp b/lib/Analysis/CaptureTracking.cpp
index 42a54d9..b2c27d1 100644
--- a/lib/Analysis/CaptureTracking.cpp
+++ b/lib/Analysis/CaptureTracking.cpp
@@ -17,6 +17,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Constants.h"
 #include "llvm/Instructions.h"
 #include "llvm/Value.h"
 #include "llvm/Analysis/AliasAnalysis.h"
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index c6aad0c..08a6065 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -23,6 +23,7 @@
 #include "llvm/GlobalVariable.h"
 #include "llvm/Instructions.h"
 #include "llvm/Intrinsics.h"
+#include "llvm/Operator.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/ADT/SmallVector.h"
@@ -1084,7 +1085,7 @@ llvm::canConstantFoldCallTo(const Function *F) {
   case 'c':
     return Name == "cos" || Name == "ceil" || Name == "cosf" || Name == "cosh";
   case 'e':
-    return Name == "exp";
+    return Name == "exp" || Name == "exp2";
   case 'f':
     return Name == "fabs" || Name == "fmod" || Name == "floor";
   case 'l':
@@ -1220,6 +1221,12 @@ llvm::ConstantFoldCall(Function *F,
       case 'e':
         if (Name == "exp")
           return ConstantFoldFP(exp, V, Ty);
+
+        if (Name == "exp2") {
+          // Constant fold exp2(x) as pow(2,x) in case the host doesn't have a
+          // C99 library.
+          return ConstantFoldBinaryFP(pow, 2.0, V, Ty);
+        }
         break;
       case 'f':
         if (Name == "fabs")
diff --git a/lib/Analysis/DIBuilder.cpp b/lib/Analysis/DIBuilder.cpp
index 108d2d2..ef5d03a 100644
--- a/lib/Analysis/DIBuilder.cpp
+++ b/lib/Analysis/DIBuilder.cpp
@@ -50,7 +50,11 @@ void DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename,
     MDString::get(VMContext, Flags),
     ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeVer)
   };
-  TheCU = DICompileUnit(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)));
+  TheCU = DICompileUnit(MDNode::get(VMContext, Elts));
+
+  // Create a named metadata so that it is easier to find cu in a module.
+  NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
+  NMD->addOperand(TheCU);
 }
 
 /// createFile - Create a file descriptor to hold debugging information
@@ -63,7 +67,7 @@ DIFile DIBuilder::createFile(StringRef Filename, StringRef Directory) {
     MDString::get(VMContext, Directory),
     TheCU
   };
-  return DIFile(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)));
+  return DIFile(MDNode::get(VMContext, Elts));
 }
 
 /// createEnumerator - Create a single enumerator value.
@@ -73,7 +77,7 @@ DIEnumerator DIBuilder::createEnumerator(StringRef Name, uint64_t Val) { MDString::get(VMContext, Name), ConstantInt::get(Type::getInt64Ty(VMContext), Val) }; - return DIEnumerator(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts))); + return DIEnumerator(MDNode::get(VMContext, Elts)); } /// createBasicType - Create debugging information entry for a basic @@ -95,7 +99,7 @@ DIType DIBuilder::createBasicType(StringRef Name, uint64_t SizeInBits, ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags; ConstantInt::get(Type::getInt32Ty(VMContext), Encoding) }; - return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts))); + return DIType(MDNode::get(VMContext, Elts)); } /// createQaulifiedType - Create debugging information entry for a qualified @@ -114,7 +118,7 @@ DIType DIBuilder::createQualifiedType(unsigned Tag, DIType FromTy) { ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags FromTy }; - return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts))); + return DIType(MDNode::get(VMContext, Elts)); } /// createPointerType - Create debugging information entry for a pointer. @@ -133,7 +137,7 @@ DIType DIBuilder::createPointerType(DIType PointeeTy, uint64_t SizeInBits, ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags PointeeTy }; - return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts))); + return DIType(MDNode::get(VMContext, Elts)); } /// createReferenceType - Create debugging information entry for a reference. @@ -151,17 +155,17 @@ DIType DIBuilder::createReferenceType(DIType RTy) { ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags RTy }; - return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts))); + return DIType(MDNode::get(VMContext, Elts)); } /// createTypedef - Create debugging information entry for a typedef. DIType DIBuilder::createTypedef(DIType Ty, StringRef Name, DIFile File, - unsigned LineNo) { + unsigned LineNo, DIDescriptor Context) { // typedefs are encoded in DIDerivedType format. assert(Ty.Verify() && "Invalid typedef type!"); Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_typedef), - Ty.getContext(), + Context, MDString::get(VMContext, Name), File, ConstantInt::get(Type::getInt32Ty(VMContext), LineNo), @@ -171,7 +175,7 @@ DIType DIBuilder::createTypedef(DIType Ty, StringRef Name, DIFile File, ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags Ty }; - return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts))); + return DIType(MDNode::get(VMContext, Elts)); } /// createFriend - Create debugging information entry for a 'friend'. @@ -191,7 +195,7 @@ DIType DIBuilder::createFriend(DIType Ty, DIType FriendTy) { ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags FriendTy }; - return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts))); + return DIType(MDNode::get(VMContext, Elts)); } /// createInheritance - Create debugging information entry to establish @@ -211,7 +215,7 @@ DIType DIBuilder::createInheritance(DIType Ty, DIType BaseTy, ConstantInt::get(Type::getInt32Ty(VMContext), Flags), BaseTy }; - return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts))); + return DIType(MDNode::get(VMContext, Elts)); } /// createMemberType - Create debugging information entry for a member. 
@@ -233,7 +237,36 @@ DIType DIBuilder::createMemberType(StringRef Name, ConstantInt::get(Type::getInt32Ty(VMContext), Flags), Ty }; - return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts))); + return DIType(MDNode::get(VMContext, Elts)); +} + +/// createObjCIVar - Create debugging information entry for Objective-C +/// instance variable. +DIType DIBuilder::createObjCIVar(StringRef Name, + DIFile File, unsigned LineNumber, + uint64_t SizeInBits, uint64_t AlignInBits, + uint64_t OffsetInBits, unsigned Flags, + DIType Ty, StringRef PropertyName, + StringRef GetterName, StringRef SetterName, + unsigned PropertyAttributes) { + // TAG_member is encoded in DIDerivedType format. + Value *Elts[] = { + GetTagConstant(VMContext, dwarf::DW_TAG_member), + File, // Or TheCU ? Ty ? + MDString::get(VMContext, Name), + File, + ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber), + ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits), + ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits), + ConstantInt::get(Type::getInt64Ty(VMContext), OffsetInBits), + ConstantInt::get(Type::getInt32Ty(VMContext), Flags), + Ty, + MDString::get(VMContext, PropertyName), + MDString::get(VMContext, GetterName), + MDString::get(VMContext, SetterName), + ConstantInt::get(Type::getInt32Ty(VMContext), PropertyAttributes) + }; + return DIType(MDNode::get(VMContext, Elts)); } /// createClassType - Create debugging information entry for a class. @@ -260,7 +293,7 @@ DIType DIBuilder::createClassType(DIDescriptor Context, StringRef Name, VTableHoder, TemplateParams }; - return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts))); + return DIType(MDNode::get(VMContext, Elts)); } /// createTemplateTypeParameter - Create debugging information for template @@ -278,8 +311,7 @@ DIBuilder::createTemplateTypeParameter(DIDescriptor Context, StringRef Name, ConstantInt::get(Type::getInt32Ty(VMContext), LineNo), ConstantInt::get(Type::getInt32Ty(VMContext), ColumnNo) }; - return DITemplateTypeParameter(MDNode::get(VMContext, &Elts[0], - array_lengthof(Elts))); + return DITemplateTypeParameter(MDNode::get(VMContext, Elts)); } /// createTemplateValueParameter - Create debugging information for template @@ -299,8 +331,7 @@ DIBuilder::createTemplateValueParameter(DIDescriptor Context, StringRef Name, ConstantInt::get(Type::getInt32Ty(VMContext), LineNo), ConstantInt::get(Type::getInt32Ty(VMContext), ColumnNo) }; - return DITemplateValueParameter(MDNode::get(VMContext, &Elts[0], - array_lengthof(Elts))); + return DITemplateValueParameter(MDNode::get(VMContext, Elts)); } /// createStructType - Create debugging information entry for a struct. @@ -325,7 +356,7 @@ DIType DIBuilder::createStructType(DIDescriptor Context, StringRef Name, ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang), llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), }; - return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts))); + return DIType(MDNode::get(VMContext, Elts)); } /// createUnionType - Create debugging information entry for an union. @@ -350,7 +381,7 @@ DIType DIBuilder::createUnionType(DIDescriptor Scope, StringRef Name, ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang), llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), }; - return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts))); + return DIType(MDNode::get(VMContext, Elts)); } /// createSubroutineType - Create subroutine type. 
@@ -371,7 +402,7 @@ DIType DIBuilder::createSubroutineType(DIFile File, DIArray ParameterTypes) { ConstantInt::get(Type::getInt32Ty(VMContext), 0), llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), }; - return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts))); + return DIType(MDNode::get(VMContext, Elts)); } /// createEnumerationType - Create debugging information entry for an @@ -396,7 +427,7 @@ DIType DIBuilder::createEnumerationType(DIDescriptor Scope, StringRef Name, ConstantInt::get(Type::getInt32Ty(VMContext), 0), llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), }; - MDNode *Node = MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)); + MDNode *Node = MDNode::get(VMContext, Elts); NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.enum"); NMD->addOperand(Node); return DIType(Node); @@ -421,7 +452,7 @@ DIType DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits, ConstantInt::get(Type::getInt32Ty(VMContext), 0), llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), }; - return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts))); + return DIType(MDNode::get(VMContext, Elts)); } /// createVectorType - Create debugging information entry for a vector. @@ -443,7 +474,7 @@ DIType DIBuilder::createVectorType(uint64_t Size, uint64_t AlignInBits, ConstantInt::get(Type::getInt32Ty(VMContext), 0), llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), }; - return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts))); + return DIType(MDNode::get(VMContext, Elts)); } /// createArtificialType - Create a new DIType with "artificial" flag set. @@ -467,7 +498,7 @@ DIType DIBuilder::createArtificialType(DIType Ty) { // Flags are stored at this slot. Elts[8] = ConstantInt::get(Type::getInt32Ty(VMContext), CurFlags); - return DIType(MDNode::get(VMContext, Elts.data(), Elts.size())); + return DIType(MDNode::get(VMContext, Elts)); } /// retainType - Retain DIType in a module even if it is not referenced @@ -483,7 +514,7 @@ DIDescriptor DIBuilder::createUnspecifiedParameter() { Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_unspecified_parameters) }; - return DIDescriptor(MDNode::get(VMContext, &Elts[0], 1)); + return DIDescriptor(MDNode::get(VMContext, Elts)); } /// createTemporaryType - Create a temporary forward-declared type. @@ -491,7 +522,7 @@ DIType DIBuilder::createTemporaryType() { // Give the temporary MDNode a tag. It doesn't matter what tag we // use here as long as DIType accepts it. Value *Elts[] = { GetTagConstant(VMContext, DW_TAG_base_type) }; - MDNode *Node = MDNode::getTemporary(VMContext, Elts, array_lengthof(Elts)); + MDNode *Node = MDNode::getTemporary(VMContext, Elts); return DIType(Node); } @@ -505,17 +536,17 @@ DIType DIBuilder::createTemporaryType(DIFile F) { NULL, F }; - MDNode *Node = MDNode::getTemporary(VMContext, Elts, array_lengthof(Elts)); + MDNode *Node = MDNode::getTemporary(VMContext, Elts); return DIType(Node); } /// getOrCreateArray - Get a DIArray, create one if required. 
-DIArray DIBuilder::getOrCreateArray(Value *const *Elements, unsigned NumElements) { - if (NumElements == 0) { +DIArray DIBuilder::getOrCreateArray(ArrayRef<Value *> Elements) { + if (Elements.empty()) { Value *Null = llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)); - return DIArray(MDNode::get(VMContext, &Null, 1)); + return DIArray(MDNode::get(VMContext, Null)); } - return DIArray(MDNode::get(VMContext, Elements, NumElements)); + return DIArray(MDNode::get(VMContext, Elements)); } /// getOrCreateSubrange - Create a descriptor for a value range. This @@ -527,7 +558,7 @@ DISubrange DIBuilder::getOrCreateSubrange(int64_t Lo, int64_t Hi) { ConstantInt::get(Type::getInt64Ty(VMContext), Hi) }; - return DISubrange(MDNode::get(VMContext, &Elts[0], 3)); + return DISubrange(MDNode::get(VMContext, Elts)); } /// createGlobalVariable - Create a new descriptor for the specified global. @@ -548,7 +579,7 @@ createGlobalVariable(StringRef Name, DIFile F, unsigned LineNumber, ConstantInt::get(Type::getInt32Ty(VMContext), 1), /* isDefinition*/ Val }; - MDNode *Node = MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)); + MDNode *Node = MDNode::get(VMContext, Elts); // Create a named metadata so that we do not lose this mdnode. NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.gv"); NMD->addOperand(Node); @@ -575,7 +606,7 @@ createStaticVariable(DIDescriptor Context, StringRef Name, ConstantInt::get(Type::getInt32Ty(VMContext), 1), /* isDefinition*/ Val }; - MDNode *Node = MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)); + MDNode *Node = MDNode::get(VMContext, Elts); // Create a named metadata so that we do not lose this mdnode. NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.gv"); NMD->addOperand(Node); @@ -597,7 +628,7 @@ DIVariable DIBuilder::createLocalVariable(unsigned Tag, DIDescriptor Scope, Ty, ConstantInt::get(Type::getInt32Ty(VMContext), Flags) }; - MDNode *Node = MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)); + MDNode *Node = MDNode::get(VMContext, Elts); if (AlwaysPreserve) { // The optimizer may remove local variable. If there is an interest // to preserve variable info in such situation then stash it in a @@ -620,8 +651,8 @@ DIVariable DIBuilder::createLocalVariable(unsigned Tag, DIDescriptor Scope, DIVariable DIBuilder::createComplexVariable(unsigned Tag, DIDescriptor Scope, StringRef Name, DIFile F, unsigned LineNo, - DIType Ty, Value *const *Addr, - unsigned NumAddr, unsigned ArgNo) { + DIType Ty, ArrayRef<Value *> Addr, + unsigned ArgNo) { SmallVector<Value *, 15> Elts; Elts.push_back(GetTagConstant(VMContext, Tag)); Elts.push_back(Scope); @@ -629,9 +660,10 @@ DIVariable DIBuilder::createComplexVariable(unsigned Tag, DIDescriptor Scope, Elts.push_back(F); Elts.push_back(ConstantInt::get(Type::getInt32Ty(VMContext), (LineNo | (ArgNo << 24)))); Elts.push_back(Ty); - Elts.append(Addr, Addr+NumAddr); + Elts.push_back(llvm::Constant::getNullValue(Type::getInt32Ty(VMContext))); + Elts.append(Addr.begin(), Addr.end()); - return DIVariable(MDNode::get(VMContext, Elts.data(), Elts.size())); + return DIVariable(MDNode::get(VMContext, Elts)); } /// createFunction - Create a new descriptor for the specified function. 
@@ -643,7 +675,8 @@ DISubprogram DIBuilder::createFunction(DIDescriptor Context, bool isLocalToUnit, bool isDefinition, unsigned Flags, bool isOptimized, Function *Fn, - MDNode *TParams) { + MDNode *TParams, + MDNode *Decl) { Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_subprogram), llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), @@ -662,9 +695,10 @@ DISubprogram DIBuilder::createFunction(DIDescriptor Context, ConstantInt::get(Type::getInt32Ty(VMContext), Flags), ConstantInt::get(Type::getInt1Ty(VMContext), isOptimized), Fn, - TParams + TParams, + Decl }; - MDNode *Node = MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)); + MDNode *Node = MDNode::get(VMContext, Elts); // Create a named metadata so that we do not lose this mdnode. NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.sp"); @@ -706,7 +740,7 @@ DISubprogram DIBuilder::createMethod(DIDescriptor Context, Fn, TParam, }; - MDNode *Node = MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)); + MDNode *Node = MDNode::get(VMContext, Elts); // Create a named metadata so that we do not lose this mdnode. NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.sp"); @@ -725,7 +759,7 @@ DINameSpace DIBuilder::createNameSpace(DIDescriptor Scope, StringRef Name, File, ConstantInt::get(Type::getInt32Ty(VMContext), LineNo) }; - return DINameSpace(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts))); + return DINameSpace(MDNode::get(VMContext, Elts)); } DILexicalBlock DIBuilder::createLexicalBlock(DIDescriptor Scope, DIFile File, @@ -740,7 +774,7 @@ DILexicalBlock DIBuilder::createLexicalBlock(DIDescriptor Scope, DIFile File, File, ConstantInt::get(Type::getInt32Ty(VMContext), unique_id++) }; - return DILexicalBlock(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts))); + return DILexicalBlock(MDNode::get(VMContext, Elts)); } /// insertDeclare - Insert a new llvm.dbg.declare intrinsic call. @@ -751,7 +785,7 @@ Instruction *DIBuilder::insertDeclare(Value *Storage, DIVariable VarInfo, if (!DeclareFn) DeclareFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_declare); - Value *Args[] = { MDNode::get(Storage->getContext(), &Storage, 1), VarInfo }; + Value *Args[] = { MDNode::get(Storage->getContext(), Storage), VarInfo }; return CallInst::Create(DeclareFn, Args, Args+2, "", InsertBefore); } @@ -763,7 +797,7 @@ Instruction *DIBuilder::insertDeclare(Value *Storage, DIVariable VarInfo, if (!DeclareFn) DeclareFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_declare); - Value *Args[] = { MDNode::get(Storage->getContext(), &Storage, 1), VarInfo }; + Value *Args[] = { MDNode::get(Storage->getContext(), Storage), VarInfo }; // If this block already has a terminator then insert this intrinsic // before the terminator. 
@@ -782,7 +816,7 @@ Instruction *DIBuilder::insertDbgValueIntrinsic(Value *V, uint64_t Offset, if (!ValueFn) ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value); - Value *Args[] = { MDNode::get(V->getContext(), &V, 1), + Value *Args[] = { MDNode::get(V->getContext(), V), ConstantInt::get(Type::getInt64Ty(V->getContext()), Offset), VarInfo }; return CallInst::Create(ValueFn, Args, Args+3, "", InsertBefore); @@ -797,7 +831,7 @@ Instruction *DIBuilder::insertDbgValueIntrinsic(Value *V, uint64_t Offset, if (!ValueFn) ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value); - Value *Args[] = { MDNode::get(V->getContext(), &V, 1), + Value *Args[] = { MDNode::get(V->getContext(), V), ConstantInt::get(Type::getInt64Ty(V->getContext()), Offset), VarInfo }; return CallInst::Create(ValueFn, Args, Args+3, "", InsertAtEnd); diff --git a/lib/Analysis/IPA/CallGraph.cpp b/lib/Analysis/IPA/CallGraph.cpp index 690c4b4..2e79eab 100644 --- a/lib/Analysis/IPA/CallGraph.cpp +++ b/lib/Analysis/IPA/CallGraph.cpp @@ -148,7 +148,7 @@ private: for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE; ++II) { CallSite CS(cast<Value>(II)); - if (CS && !isa<DbgInfoIntrinsic>(II)) { + if (CS && !isa<IntrinsicInst>(II)) { const Function *Callee = CS.getCalledFunction(); if (Callee) Node->addCalledFunction(CS, getOrInsertFunction(Callee)); diff --git a/lib/Analysis/IPA/CallGraphSCCPass.cpp b/lib/Analysis/IPA/CallGraphSCCPass.cpp index 725ab72..659ffab 100644 --- a/lib/Analysis/IPA/CallGraphSCCPass.cpp +++ b/lib/Analysis/IPA/CallGraphSCCPass.cpp @@ -245,8 +245,8 @@ bool CGPassManager::RefreshCallGraph(CallGraphSCC &CurSCC, for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - CallSite CS(cast<Value>(I)); - if (!CS || isa<DbgInfoIntrinsic>(I)) continue; + CallSite CS(cast<Value>(I)); + if (!CS || isa<IntrinsicInst>(I)) continue; // If this call site already existed in the callgraph, just verify it // matches up to expectations and remove it from CallSites. diff --git a/lib/Analysis/IPA/FindUsedTypes.cpp b/lib/Analysis/IPA/FindUsedTypes.cpp index 06ae34c..dde2556 100644 --- a/lib/Analysis/IPA/FindUsedTypes.cpp +++ b/lib/Analysis/IPA/FindUsedTypes.cpp @@ -32,7 +32,7 @@ INITIALIZE_PASS(FindUsedTypes, "print-used-types", void FindUsedTypes::IncorporateType(const Type *Ty) { // If ty doesn't already exist in the used types map, add it now, otherwise // return. - if (!UsedTypes.insert(Ty).second) return; // Already contain Ty. + if (!UsedTypes.insert(Ty)) return; // Already contain Ty. // Make sure to add any types this type references now. // @@ -94,7 +94,7 @@ bool FindUsedTypes::runOnModule(Module &m) { // void FindUsedTypes::print(raw_ostream &OS, const Module *M) const { OS << "Types in use by this module:\n"; - for (std::set<const Type *>::const_iterator I = UsedTypes.begin(), + for (SetVector<const Type *>::const_iterator I = UsedTypes.begin(), E = UsedTypes.end(); I != E; ++I) { OS << " "; WriteTypeSymbolic(OS, *I, M); diff --git a/lib/Analysis/IPA/GlobalsModRef.cpp b/lib/Analysis/IPA/GlobalsModRef.cpp index 116aaf4..b226d66 100644 --- a/lib/Analysis/IPA/GlobalsModRef.cpp +++ b/lib/Analysis/IPA/GlobalsModRef.cpp @@ -602,7 +602,7 @@ void GlobalsModRef::addEscapingUse(Use &U) { // For the purposes of this analysis, it is conservatively correct to treat // a newly escaping value equivalently to a deleted one. 
We could perhaps // be more precise by processing the new use and attempting to update our - // saved analysis results to accomodate it. + // saved analysis results to accommodate it. deleteValue(U); AliasAnalysis::addEscapingUse(U); diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp index 2cda791..a0c42f0 100644 --- a/lib/Analysis/IVUsers.cpp +++ b/lib/Analysis/IVUsers.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetData.h" #include "llvm/Assembly/Writer.h" #include "llvm/ADT/STLExtras.h" @@ -38,6 +39,15 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_END(IVUsers, "iv-users", "Induction Variable Users", false, true) +// IVUsers behavior currently depends on this temporary indvars mode. The +// option must be defined upstream from its uses. +namespace llvm { + bool DisableIVRewrite = false; +} +cl::opt<bool, true> DisableIVRewriteOpt( + "disable-iv-rewrite", cl::Hidden, cl::location(llvm::DisableIVRewrite), + cl::desc("Disable canonical induction variable rewriting")); + Pass *llvm::createIVUsersPass() { return new IVUsers(); } @@ -79,7 +89,7 @@ static bool isInteresting(const SCEV *S, const Instruction *I, const Loop *L, /// AddUsersIfInteresting - Inspect the specified instruction. If it is a /// reducible SCEV, recursively add its users to the IVUsesByStride set and /// return true. Otherwise, return false. -bool IVUsers::AddUsersIfInteresting(Instruction *I) { +bool IVUsers::AddUsersIfInteresting(Instruction *I, PHINode *Phi) { if (!SE->isSCEVable(I->getType())) return false; // Void and FP expressions cannot be reduced. @@ -90,6 +100,11 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) { if (Width > 64 || (TD && !TD->isLegalInteger(Width))) return false; + // We expect Sign/Zero extension to be eliminated from the IR before analyzing + // any downstream uses. + if (DisableIVRewrite && (isa<SExtInst>(I) || isa<ZExtInst>(I))) + return false; + if (!Processed.insert(I)) return true; // Instruction already handled. @@ -121,13 +136,13 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) { bool AddUserToIVUsers = false; if (LI->getLoopFor(User->getParent()) != L) { if (isa<PHINode>(User) || Processed.count(User) || - !AddUsersIfInteresting(User)) { + !AddUsersIfInteresting(User, Phi)) { DEBUG(dbgs() << "FOUND USER in other loop: " << *User << '\n' << " OF SCEV: " << *ISE << '\n'); AddUserToIVUsers = true; } } else if (Processed.count(User) || - !AddUsersIfInteresting(User)) { + !AddUsersIfInteresting(User, Phi)) { DEBUG(dbgs() << "FOUND USER: " << *User << '\n' << " OF SCEV: " << *ISE << '\n'); AddUserToIVUsers = true; @@ -135,9 +150,11 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) { if (AddUserToIVUsers) { // Okay, we found a user that we cannot reduce. - IVUses.push_back(new IVStrideUse(this, User, I)); + IVUses.push_back(new IVStrideUse(this, User, I, Phi)); IVStrideUse &NewUse = IVUses.back(); - // Transform the expression into a normalized form. + // Autodetect the post-inc loop set, populating NewUse.PostIncLoops. + // The regular return value here is discarded; instead of recording + // it, we just recompute it when we need it. 
ISE = TransformForPostIncUse(NormalizeAutodetect, ISE, User, I, NewUse.PostIncLoops, @@ -148,8 +165,8 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) { return true; } -IVStrideUse &IVUsers::AddUser(Instruction *User, Value *Operand) { - IVUses.push_back(new IVStrideUse(this, User, Operand)); +IVStrideUse &IVUsers::AddUser(Instruction *User, Value *Operand, PHINode *Phi) { + IVUses.push_back(new IVStrideUse(this, User, Operand, Phi)); return IVUses.back(); } @@ -177,7 +194,7 @@ bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) { // them by stride. Start by finding all of the PHI nodes in the header for // this loop. If they are induction variables, inspect their uses. for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) - (void)AddUsersIfInteresting(I); + (void)AddUsersIfInteresting(I, cast<PHINode>(I)); return false; } diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index 47f91cf..efde598 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -66,21 +66,13 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB) { ImmutableCallSite CS(cast<Instruction>(II)); - // If this function contains a call to setjmp or _setjmp, never inline - // it. This is a hack because we depend on the user marking their local - // variables as volatile if they are live across a setjmp call, and they - // probably won't do this in callers. if (const Function *F = CS.getCalledFunction()) { // If a function is both internal and has a single use, then it is // extremely likely to get inlined in the future (it was probably // exposed by an interleaved devirtualization pass). if (F->hasInternalLinkage() && F->hasOneUse()) ++NumInlineCandidates; - - if (F->isDeclaration() && - (F->getName() == "setjmp" || F->getName() == "_setjmp")) - callsSetJmp = true; - + // If this call is to function itself, then the function is recursive. // Inlining it into other functions is a bad idea, because this is // basically just a form of loop peeling, and our metrics aren't useful @@ -226,6 +218,13 @@ unsigned CodeMetrics::CountCodeReductionForAlloca(Value *V) { /// analyzeFunction - Fill in the current structure with information gleaned /// from the specified function. void CodeMetrics::analyzeFunction(Function *F) { + // If this function contains a call to setjmp or _setjmp, never inline + // it. This is a hack because we depend on the user marking their local + // variables as volatile if they are live across a setjmp call, and they + // probably won't do this in callers. + if (F->callsFunctionThatReturnsTwice()) + callsSetJmp = true; + // Look at the size of the callee. for (Function::const_iterator BB = F->begin(), E = F->end(); BB != E; ++BB) analyzeBasicBlock(&*BB); @@ -501,7 +500,7 @@ InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, return InlineCost::getAlways(); if (CalleeFI->Metrics.usesDynamicAlloca) { - // Get infomation about the caller. + // Get information about the caller. FunctionInfo &CallerFI = CachedFunctionInfo[Caller]; // If we haven't calculated this information yet, do so now. @@ -549,7 +548,7 @@ InlineCost InlineCostAnalyzer::getSpecializationCost(Function *Callee, int Cost = 0; - // Look at the orginal size of the callee. Each instruction counts as 5. + // Look at the original size of the callee. Each instruction counts as 5. 
Cost += CalleeFI->Metrics.NumInsts * InlineConstants::InstrCost; // Offset that with the amount of code that can be constant-folded @@ -594,7 +593,7 @@ InlineCostAnalyzer::growCachedCostInfo(Function *Caller, Function *Callee) { CodeMetrics &CallerMetrics = CachedFunctionInfo[Caller].Metrics; // For small functions we prefer to recalculate the cost for better accuracy. - if (CallerMetrics.NumBlocks < 10 || CallerMetrics.NumInsts < 1000) { + if (CallerMetrics.NumBlocks < 10 && CallerMetrics.NumInsts < 1000) { resetCachedCostInfo(Caller); return; } diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index 9dd5f05..9d78f8b 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -18,6 +18,7 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "instsimplify" +#include "llvm/Operator.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ConstantFolding.h" @@ -900,6 +901,109 @@ Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, const TargetData *TD, return ::SimplifyFDivInst(Op0, Op1, TD, DT, RecursionLimit); } +/// SimplifyRem - Given operands for an SRem or URem, see if we can +/// fold the result. If not, this returns null. +static Value *SimplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, + const TargetData *TD, const DominatorTree *DT, + unsigned MaxRecurse) { + if (Constant *C0 = dyn_cast<Constant>(Op0)) { + if (Constant *C1 = dyn_cast<Constant>(Op1)) { + Constant *Ops[] = { C0, C1 }; + return ConstantFoldInstOperands(Opcode, C0->getType(), Ops, 2, TD); + } + } + + // X % undef -> undef + if (match(Op1, m_Undef())) + return Op1; + + // undef % X -> 0 + if (match(Op0, m_Undef())) + return Constant::getNullValue(Op0->getType()); + + // 0 % X -> 0, we don't need to preserve faults! + if (match(Op0, m_Zero())) + return Op0; + + // X % 0 -> undef, we don't need to preserve faults! + if (match(Op1, m_Zero())) + return UndefValue::get(Op0->getType()); + + // X % 1 -> 0 + if (match(Op1, m_One())) + return Constant::getNullValue(Op0->getType()); + + if (Op0->getType()->isIntegerTy(1)) + // It can't be remainder by zero, hence it must be remainder by one. + return Constant::getNullValue(Op0->getType()); + + // X % X -> 0 + if (Op0 == Op1) + return Constant::getNullValue(Op0->getType()); + + // If the operation is with the result of a select instruction, check whether + // operating on either branch of the select always yields the same value. + if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1)) + if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, TD, DT, MaxRecurse)) + return V; + + // If the operation is with the result of a phi instruction, check whether + // operating on all incoming values of the phi always yields the same value. + if (isa<PHINode>(Op0) || isa<PHINode>(Op1)) + if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, TD, DT, MaxRecurse)) + return V; + + return 0; +} + +/// SimplifySRemInst - Given operands for an SRem, see if we can +/// fold the result. If not, this returns null. 
+static Value *SimplifySRemInst(Value *Op0, Value *Op1, const TargetData *TD, + const DominatorTree *DT, unsigned MaxRecurse) { + if (Value *V = SimplifyRem(Instruction::SRem, Op0, Op1, TD, DT, MaxRecurse)) + return V; + + return 0; +} + +Value *llvm::SimplifySRemInst(Value *Op0, Value *Op1, const TargetData *TD, + const DominatorTree *DT) { + return ::SimplifySRemInst(Op0, Op1, TD, DT, RecursionLimit); +} + +/// SimplifyURemInst - Given operands for a URem, see if we can +/// fold the result. If not, this returns null. +static Value *SimplifyURemInst(Value *Op0, Value *Op1, const TargetData *TD, + const DominatorTree *DT, unsigned MaxRecurse) { + if (Value *V = SimplifyRem(Instruction::URem, Op0, Op1, TD, DT, MaxRecurse)) + return V; + + return 0; +} + +Value *llvm::SimplifyURemInst(Value *Op0, Value *Op1, const TargetData *TD, + const DominatorTree *DT) { + return ::SimplifyURemInst(Op0, Op1, TD, DT, RecursionLimit); +} + +static Value *SimplifyFRemInst(Value *Op0, Value *Op1, const TargetData *, + const DominatorTree *, unsigned) { + // undef % X -> undef (the undef could be a snan). + if (match(Op0, m_Undef())) + return Op0; + + // X % undef -> undef + if (match(Op1, m_Undef())) + return Op1; + + return 0; +} + +Value *llvm::SimplifyFRemInst(Value *Op0, Value *Op1, const TargetData *TD, + const DominatorTree *DT) { + return ::SimplifyFRemInst(Op0, Op1, TD, DT, RecursionLimit); +} + /// SimplifyShift - Given operands for an Shl, LShr or AShr, see if we can /// fold the result. If not, this returns null. static Value *SimplifyShift(unsigned Opcode, Value *Op0, Value *Op1, @@ -1272,6 +1376,26 @@ static const Type *GetCompareTy(Value *Op) { return CmpInst::makeCmpResultType(Op->getType()); } +/// ExtractEquivalentCondition - Rummage around inside V looking for something +/// equivalent to the comparison "LHS Pred RHS". Return such a value if found, +/// otherwise return null. Helper function for analyzing max/min idioms. +static Value *ExtractEquivalentCondition(Value *V, CmpInst::Predicate Pred, + Value *LHS, Value *RHS) { + SelectInst *SI = dyn_cast<SelectInst>(V); + if (!SI) + return 0; + CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition()); + if (!Cmp) + return 0; + Value *CmpLHS = Cmp->getOperand(0), *CmpRHS = Cmp->getOperand(1); + if (Pred == Cmp->getPredicate() && LHS == CmpLHS && RHS == CmpRHS) + return Cmp; + if (Pred == CmpInst::getSwappedPredicate(Cmp->getPredicate()) && + LHS == CmpRHS && RHS == CmpLHS) + return Cmp; + return 0; +} + /// SimplifyICmpInst - Given operands for an ICmpInst, see if we can /// fold the result. If not, this returns null. static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, @@ -1354,46 +1478,48 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, default: assert(false && "Unknown ICmp predicate!"); case ICmpInst::ICMP_ULT: - return ConstantInt::getFalse(LHS->getContext()); + // getNullValue also works for vectors, unlike getFalse. + return Constant::getNullValue(ITy); case ICmpInst::ICMP_UGE: - return ConstantInt::getTrue(LHS->getContext()); + // getAllOnesValue also works for vectors, unlike getTrue. 
+ return ConstantInt::getAllOnesValue(ITy); case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_ULE: if (isKnownNonZero(LHS, TD)) - return ConstantInt::getFalse(LHS->getContext()); + return Constant::getNullValue(ITy); break; case ICmpInst::ICMP_NE: case ICmpInst::ICMP_UGT: if (isKnownNonZero(LHS, TD)) - return ConstantInt::getTrue(LHS->getContext()); + return ConstantInt::getAllOnesValue(ITy); break; case ICmpInst::ICMP_SLT: ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, TD); if (LHSKnownNegative) - return ConstantInt::getTrue(LHS->getContext()); + return ConstantInt::getAllOnesValue(ITy); if (LHSKnownNonNegative) - return ConstantInt::getFalse(LHS->getContext()); + return Constant::getNullValue(ITy); break; case ICmpInst::ICMP_SLE: ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, TD); if (LHSKnownNegative) - return ConstantInt::getTrue(LHS->getContext()); + return ConstantInt::getAllOnesValue(ITy); if (LHSKnownNonNegative && isKnownNonZero(LHS, TD)) - return ConstantInt::getFalse(LHS->getContext()); + return Constant::getNullValue(ITy); break; case ICmpInst::ICMP_SGE: ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, TD); if (LHSKnownNegative) - return ConstantInt::getFalse(LHS->getContext()); + return Constant::getNullValue(ITy); if (LHSKnownNonNegative) - return ConstantInt::getTrue(LHS->getContext()); + return ConstantInt::getAllOnesValue(ITy); break; case ICmpInst::ICMP_SGT: ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, TD); if (LHSKnownNegative) - return ConstantInt::getFalse(LHS->getContext()); + return Constant::getNullValue(ITy); if (LHSKnownNonNegative && isKnownNonZero(LHS, TD)) - return ConstantInt::getTrue(LHS->getContext()); + return ConstantInt::getAllOnesValue(ITy); break; } } @@ -1685,7 +1811,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: - return ConstantInt::getFalse(RHS->getContext()); + // getNullValue also works for vectors, unlike getFalse. + return Constant::getNullValue(ITy); case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: ComputeSignBit(LHS, KnownNonNegative, KnownNegative, TD); @@ -1695,7 +1822,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, case ICmpInst::ICMP_NE: case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: - return ConstantInt::getTrue(RHS->getContext()); + // getAllOnesValue also works for vectors, unlike getTrue. + return Constant::getAllOnesValue(ITy); } } if (RBO && match(RBO, m_URem(m_Value(), m_Specific(LHS)))) { @@ -1712,7 +1840,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, case ICmpInst::ICMP_NE: case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: - return ConstantInt::getTrue(RHS->getContext()); + // getAllOnesValue also works for vectors, unlike getTrue. + return Constant::getAllOnesValue(ITy); case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: ComputeSignBit(RHS, KnownNonNegative, KnownNegative, TD); @@ -1722,7 +1851,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: - return ConstantInt::getFalse(RHS->getContext()); + // getNullValue also works for vectors, unlike getFalse. 
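The repeated getFalse/getTrue to getNullValue/getAllOnesValue substitutions in this hunk are all instances of one fact: the result type of an icmp is i1 for scalar operands but <N x i1> for vector operands, and only the Constant:: forms can build a constant of the vector type. A minimal sketch against the LLVM C++ API of this era (header names and const-ness assumed from the surrounding code, not compiled here):

#include "llvm/Constants.h"
#include "llvm/InstrTypes.h"

using namespace llvm;

// For an icmp known to fold, build the all-false / all-true result that
// matches the comparison's result type, scalar or vector.
static Constant *foldedFalse(const Type *OpTy) {
  const Type *ResTy = CmpInst::makeCmpResultType(OpTy); // i1 or <N x i1>
  return Constant::getNullValue(ResTy);    // i1 false, or zeroinitializer
}
static Constant *foldedTrue(const Type *OpTy) {
  const Type *ResTy = CmpInst::makeCmpResultType(OpTy);
  return Constant::getAllOnesValue(ResTy); // i1 true, or all-ones vector
}
// ConstantInt::getFalse(Ctx)/getTrue(Ctx) can only produce the scalar i1
// constants, which would be ill-typed whenever OpTy is a vector.
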
+ return Constant::getNullValue(ITy); } } @@ -1737,7 +1867,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, // fall-through case Instruction::SDiv: case Instruction::AShr: - if (!LBO->isExact() && !RBO->isExact()) + if (!LBO->isExact() || !RBO->isExact()) break; if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), RBO->getOperand(0), TD, DT, MaxRecurse-1)) @@ -1758,6 +1888,194 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, } } + // Simplify comparisons involving max/min. + Value *A, *B; + CmpInst::Predicate P = CmpInst::BAD_ICMP_PREDICATE; + CmpInst::Predicate EqP; // Chosen so that "A == max/min(A,B)" iff "A EqP B". + + // Signed variants on "max(a,b)>=a -> true". + if (match(LHS, m_SMax(m_Value(A), m_Value(B))) && (A == RHS || B == RHS)) { + if (A != RHS) std::swap(A, B); // smax(A, B) pred A. + EqP = CmpInst::ICMP_SGE; // "A == smax(A, B)" iff "A sge B". + // We analyze this as smax(A, B) pred A. + P = Pred; + } else if (match(RHS, m_SMax(m_Value(A), m_Value(B))) && + (A == LHS || B == LHS)) { + if (A != LHS) std::swap(A, B); // A pred smax(A, B). + EqP = CmpInst::ICMP_SGE; // "A == smax(A, B)" iff "A sge B". + // We analyze this as smax(A, B) swapped-pred A. + P = CmpInst::getSwappedPredicate(Pred); + } else if (match(LHS, m_SMin(m_Value(A), m_Value(B))) && + (A == RHS || B == RHS)) { + if (A != RHS) std::swap(A, B); // smin(A, B) pred A. + EqP = CmpInst::ICMP_SLE; // "A == smin(A, B)" iff "A sle B". + // We analyze this as smax(-A, -B) swapped-pred -A. + // Note that we do not need to actually form -A or -B thanks to EqP. + P = CmpInst::getSwappedPredicate(Pred); + } else if (match(RHS, m_SMin(m_Value(A), m_Value(B))) && + (A == LHS || B == LHS)) { + if (A != LHS) std::swap(A, B); // A pred smin(A, B). + EqP = CmpInst::ICMP_SLE; // "A == smin(A, B)" iff "A sle B". + // We analyze this as smax(-A, -B) pred -A. + // Note that we do not need to actually form -A or -B thanks to EqP. + P = Pred; + } + if (P != CmpInst::BAD_ICMP_PREDICATE) { + // Cases correspond to "max(A, B) p A". + switch (P) { + default: + break; + case CmpInst::ICMP_EQ: + case CmpInst::ICMP_SLE: + // Equivalent to "A EqP B". This may be the same as the condition tested + // in the max/min; if so, we can just return that. + if (Value *V = ExtractEquivalentCondition(LHS, EqP, A, B)) + return V; + if (Value *V = ExtractEquivalentCondition(RHS, EqP, A, B)) + return V; + // Otherwise, see if "A EqP B" simplifies. + if (MaxRecurse) + if (Value *V = SimplifyICmpInst(EqP, A, B, TD, DT, MaxRecurse-1)) + return V; + break; + case CmpInst::ICMP_NE: + case CmpInst::ICMP_SGT: { + CmpInst::Predicate InvEqP = CmpInst::getInversePredicate(EqP); + // Equivalent to "A InvEqP B". This may be the same as the condition + // tested in the max/min; if so, we can just return that. + if (Value *V = ExtractEquivalentCondition(LHS, InvEqP, A, B)) + return V; + if (Value *V = ExtractEquivalentCondition(RHS, InvEqP, A, B)) + return V; + // Otherwise, see if "A InvEqP B" simplifies. + if (MaxRecurse) + if (Value *V = SimplifyICmpInst(InvEqP, A, B, TD, DT, MaxRecurse-1)) + return V; + break; + } + case CmpInst::ICMP_SGE: + // Always true. + return Constant::getAllOnesValue(ITy); + case CmpInst::ICMP_SLT: + // Always false. + return Constant::getNullValue(ITy); + } + } + + // Unsigned variants on "max(a,b)>=a -> true". + P = CmpInst::BAD_ICMP_PREDICATE; + if (match(LHS, m_UMax(m_Value(A), m_Value(B))) && (A == RHS || B == RHS)) { + if (A != RHS) std::swap(A, B); // umax(A, B) pred A. 
+ EqP = CmpInst::ICMP_UGE; // "A == umax(A, B)" iff "A uge B". + // We analyze this as umax(A, B) pred A. + P = Pred; + } else if (match(RHS, m_UMax(m_Value(A), m_Value(B))) && + (A == LHS || B == LHS)) { + if (A != LHS) std::swap(A, B); // A pred umax(A, B). + EqP = CmpInst::ICMP_UGE; // "A == umax(A, B)" iff "A uge B". + // We analyze this as umax(A, B) swapped-pred A. + P = CmpInst::getSwappedPredicate(Pred); + } else if (match(LHS, m_UMin(m_Value(A), m_Value(B))) && + (A == RHS || B == RHS)) { + if (A != RHS) std::swap(A, B); // umin(A, B) pred A. + EqP = CmpInst::ICMP_ULE; // "A == umin(A, B)" iff "A ule B". + // We analyze this as umax(-A, -B) swapped-pred -A. + // Note that we do not need to actually form -A or -B thanks to EqP. + P = CmpInst::getSwappedPredicate(Pred); + } else if (match(RHS, m_UMin(m_Value(A), m_Value(B))) && + (A == LHS || B == LHS)) { + if (A != LHS) std::swap(A, B); // A pred umin(A, B). + EqP = CmpInst::ICMP_ULE; // "A == umin(A, B)" iff "A ule B". + // We analyze this as umax(-A, -B) pred -A. + // Note that we do not need to actually form -A or -B thanks to EqP. + P = Pred; + } + if (P != CmpInst::BAD_ICMP_PREDICATE) { + // Cases correspond to "max(A, B) p A". + switch (P) { + default: + break; + case CmpInst::ICMP_EQ: + case CmpInst::ICMP_ULE: + // Equivalent to "A EqP B". This may be the same as the condition tested + // in the max/min; if so, we can just return that. + if (Value *V = ExtractEquivalentCondition(LHS, EqP, A, B)) + return V; + if (Value *V = ExtractEquivalentCondition(RHS, EqP, A, B)) + return V; + // Otherwise, see if "A EqP B" simplifies. + if (MaxRecurse) + if (Value *V = SimplifyICmpInst(EqP, A, B, TD, DT, MaxRecurse-1)) + return V; + break; + case CmpInst::ICMP_NE: + case CmpInst::ICMP_UGT: { + CmpInst::Predicate InvEqP = CmpInst::getInversePredicate(EqP); + // Equivalent to "A InvEqP B". This may be the same as the condition + // tested in the max/min; if so, we can just return that. + if (Value *V = ExtractEquivalentCondition(LHS, InvEqP, A, B)) + return V; + if (Value *V = ExtractEquivalentCondition(RHS, InvEqP, A, B)) + return V; + // Otherwise, see if "A InvEqP B" simplifies. + if (MaxRecurse) + if (Value *V = SimplifyICmpInst(InvEqP, A, B, TD, DT, MaxRecurse-1)) + return V; + break; + } + case CmpInst::ICMP_UGE: + // Always true. + return Constant::getAllOnesValue(ITy); + case CmpInst::ICMP_ULT: + // Always false. + return Constant::getNullValue(ITy); + } + } + + // Variants on "max(x,y) >= min(x,z)". + Value *C, *D; + if (match(LHS, m_SMax(m_Value(A), m_Value(B))) && + match(RHS, m_SMin(m_Value(C), m_Value(D))) && + (A == C || A == D || B == C || B == D)) { + // max(x, ?) pred min(x, ?). + if (Pred == CmpInst::ICMP_SGE) + // Always true. + return Constant::getAllOnesValue(ITy); + if (Pred == CmpInst::ICMP_SLT) + // Always false. + return Constant::getNullValue(ITy); + } else if (match(LHS, m_SMin(m_Value(A), m_Value(B))) && + match(RHS, m_SMax(m_Value(C), m_Value(D))) && + (A == C || A == D || B == C || B == D)) { + // min(x, ?) pred max(x, ?). + if (Pred == CmpInst::ICMP_SLE) + // Always true. + return Constant::getAllOnesValue(ITy); + if (Pred == CmpInst::ICMP_SGT) + // Always false. + return Constant::getNullValue(ITy); + } else if (match(LHS, m_UMax(m_Value(A), m_Value(B))) && + match(RHS, m_UMin(m_Value(C), m_Value(D))) && + (A == C || A == D || B == C || B == D)) { + // max(x, ?) pred min(x, ?). + if (Pred == CmpInst::ICMP_UGE) + // Always true. 
+ return Constant::getAllOnesValue(ITy); + if (Pred == CmpInst::ICMP_ULT) + // Always false. + return Constant::getNullValue(ITy); + } else if (match(LHS, m_UMin(m_Value(A), m_Value(B))) && + match(RHS, m_UMax(m_Value(C), m_Value(D))) && + (A == C || A == D || B == C || B == D)) { + // min(x, ?) pred max(x, ?). + if (Pred == CmpInst::ICMP_ULE) + // Always true. + return Constant::getAllOnesValue(ITy); + if (Pred == CmpInst::ICMP_UGT) + // Always false. + return Constant::getNullValue(ITy); + } + // If the comparison is with the result of a select instruction, check whether // comparing with either branch of the select always yields the same value. if (isa<SelectInst>(LHS) || isa<SelectInst>(RHS)) @@ -1993,6 +2311,9 @@ static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, case Instruction::SDiv: return SimplifySDivInst(LHS, RHS, TD, DT, MaxRecurse); case Instruction::UDiv: return SimplifyUDivInst(LHS, RHS, TD, DT, MaxRecurse); case Instruction::FDiv: return SimplifyFDivInst(LHS, RHS, TD, DT, MaxRecurse); + case Instruction::SRem: return SimplifySRemInst(LHS, RHS, TD, DT, MaxRecurse); + case Instruction::URem: return SimplifyURemInst(LHS, RHS, TD, DT, MaxRecurse); + case Instruction::FRem: return SimplifyFRemInst(LHS, RHS, TD, DT, MaxRecurse); case Instruction::Shl: return SimplifyShlInst(LHS, RHS, /*isNSW*/false, /*isNUW*/false, TD, DT, MaxRecurse); @@ -2087,6 +2408,15 @@ Value *llvm::SimplifyInstruction(Instruction *I, const TargetData *TD, case Instruction::FDiv: Result = SimplifyFDivInst(I->getOperand(0), I->getOperand(1), TD, DT); break; + case Instruction::SRem: + Result = SimplifySRemInst(I->getOperand(0), I->getOperand(1), TD, DT); + break; + case Instruction::URem: + Result = SimplifyURemInst(I->getOperand(0), I->getOperand(1), TD, DT); + break; + case Instruction::FRem: + Result = SimplifyFRemInst(I->getOperand(0), I->getOperand(1), TD, DT); + break; case Instruction::Shl: Result = SimplifyShlInst(I->getOperand(0), I->getOperand(1), cast<BinaryOperator>(I)->hasNoSignedWrap(), diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp index 9e7da6c..6e27597 100644 --- a/lib/Analysis/LazyValueInfo.cpp +++ b/lib/Analysis/LazyValueInfo.cpp @@ -29,7 +29,6 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include <map> -#include <set> #include <stack> using namespace llvm; @@ -268,6 +267,8 @@ public: } // end anonymous namespace. namespace llvm { +raw_ostream &operator<<(raw_ostream &OS, const LVILatticeVal &Val) + LLVM_ATTRIBUTE_USED; raw_ostream &operator<<(raw_ostream &OS, const LVILatticeVal &Val) { if (Val.isUndefined()) return OS << "undefined"; @@ -588,16 +589,18 @@ static bool InstructionDereferencesPointer(Instruction *I, Value *Ptr) { } if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) { if (MI->isVolatile()) return false; - if (MI->getAddressSpace() != 0) return false; // FIXME: check whether it has a valuerange that excludes zero? 
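Stepping back to the max/min comparison folds added above: each always-true/always-false case is an ordinary order-theoretic fact, so it can be cross-checked exhaustively on small integers. A brute-force test in plain C++, with std::max/std::min standing in for smax/smin:

#include <algorithm>
#include <cassert>

int main() {
  for (int a = -4; a <= 4; ++a)
    for (int b = -4; b <= 4; ++b) {
      assert(std::max(a, b) >= a);  // "smax(A, B) sge A" is always true
      assert(std::min(a, b) <= a);  // "smin(A, B) sle A" is always true
      for (int c = -4; c <= 4; ++c)
        // "max(x, ?) sge min(x, ?)" whenever the two share an operand x:
        assert(std::max(a, b) >= std::min(a, c));
    }
  return 0;
}

The unsigned variants follow identically with unsigned values.
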
ConstantInt *Len = dyn_cast<ConstantInt>(MI->getLength()); if (!Len || Len->isZero()) return false; - if (MI->getRawDest() == Ptr || MI->getDest() == Ptr) - return true; + if (MI->getDestAddressSpace() == 0) + if (MI->getRawDest() == Ptr || MI->getDest() == Ptr) + return true; if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) - return MTI->getRawSource() == Ptr || MTI->getSource() == Ptr; + if (MTI->getSourceAddressSpace() == 0) + if (MTI->getRawSource() == Ptr || MTI->getSource() == Ptr) + return true; } return false; } diff --git a/lib/Analysis/Lint.cpp b/lib/Analysis/Lint.cpp index fc7edc0..f130f30 100644 --- a/lib/Analysis/Lint.cpp +++ b/lib/Analysis/Lint.cpp @@ -606,7 +606,7 @@ Value *Lint::findValueImpl(Value *V, bool OffsetOk, Type::getInt64Ty(V->getContext()))) return findValueImpl(CE->getOperand(0), OffsetOk, Visited); } else if (CE->getOpcode() == Instruction::ExtractValue) { - const SmallVector<unsigned, 4> &Indices = CE->getIndices(); + ArrayRef<unsigned> Indices = CE->getIndices(); if (Value *W = FindInsertedValue(CE->getOperand(0), Indices.begin(), Indices.end())) diff --git a/lib/Analysis/Loads.cpp b/lib/Analysis/Loads.cpp index 2ea27fb..c5c676b 100644 --- a/lib/Analysis/Loads.cpp +++ b/lib/Analysis/Loads.cpp @@ -17,6 +17,7 @@ #include "llvm/GlobalAlias.h" #include "llvm/GlobalVariable.h" #include "llvm/IntrinsicInst.h" +#include "llvm/Operator.h" using namespace llvm; /// AreEquivalentAddressValues - Test if A and B will obviously have the same @@ -30,7 +31,7 @@ using namespace llvm; static bool AreEquivalentAddressValues(const Value *A, const Value *B) { // Test if the values are trivially equivalent. if (A == B) return true; - + // Test if the values come from identical arithmetic instructions. // Use isIdenticalToWhenDefined instead of isIdenticalTo because // this function is only used when one address use dominates the @@ -41,7 +42,7 @@ static bool AreEquivalentAddressValues(const Value *A, const Value *B) { if (const Instruction *BI = dyn_cast<Instruction>(B)) if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI)) return true; - + // Otherwise they may not be equivalent. return false; } diff --git a/lib/Analysis/MemDepPrinter.cpp b/lib/Analysis/MemDepPrinter.cpp index 64d215c..2283db0 100644 --- a/lib/Analysis/MemDepPrinter.cpp +++ b/lib/Analysis/MemDepPrinter.cpp @@ -79,8 +79,8 @@ bool MemDepPrinter::runOnFunction(Function &F) { MemDepResult Res = MDA.getDependency(Inst); if (!Res.isNonLocal()) { - assert(Res.isClobber() != Res.isDef() && - "Local dep should be def or clobber!"); + assert((Res.isUnknown() || Res.isClobber() || Res.isDef()) && + "Local dep should be unknown, def or clobber!"); Deps[Inst].insert(std::make_pair(InstAndClobberFlag(Res.getInst(), Res.isClobber()), static_cast<BasicBlock *>(0))); @@ -92,8 +92,9 @@ bool MemDepPrinter::runOnFunction(Function &F) { for (MemoryDependenceAnalysis::NonLocalDepInfo::const_iterator I = NLDI.begin(), E = NLDI.end(); I != E; ++I) { const MemDepResult &Res = I->getResult(); - assert(Res.isClobber() != Res.isDef() && - "Resolved non-local call dep should be def or clobber!"); + assert((Res.isUnknown() || Res.isClobber() || Res.isDef()) && + "Resolved non-local call dep should be unknown, def or " + "clobber!"); InstDeps.insert(std::make_pair(InstAndClobberFlag(Res.getInst(), Res.isClobber()), I->getBB())); @@ -148,16 +149,24 @@ void MemDepPrinter::print(raw_ostream &OS, const Module *M) const { bool isClobber = I->first.getInt(); const BasicBlock *DepBB = I->second; - OS << " " << (isClobber ? 
"Clobber" : " Def"); + OS << " "; + if (!DepInst) + OS << "Unknown"; + else if (isClobber) + OS << "Clobber"; + else + OS << " Def"; if (DepBB) { OS << " in block "; WriteAsOperand(OS, DepBB, /*PrintType=*/false, M); } - OS << " from: "; - if (DepInst == Inst) - OS << "<unspecified>"; - else - DepInst->print(OS); + if (DepInst) { + OS << " from: "; + if (DepInst == Inst) + OS << "<unspecified>"; + else + DepInst->print(OS); + } OS << "\n"; } diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp index 35043bdd..bba4482 100644 --- a/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -16,6 +16,7 @@ #define DEBUG_TYPE "memdep" #include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/Function.h" @@ -46,6 +47,11 @@ STATISTIC(NumUncacheNonLocalPtr, STATISTIC(NumCacheCompleteNonLocalPtr, "Number of block queries that were completely cached"); +// Limit for the number of instructions to scan in a block. +// FIXME: Figure out what a sane value is for this. +// (500 is relatively insane.) +static const int BlockScanLimit = 500; + char MemoryDependenceAnalysis::ID = 0; // Register this pass... @@ -179,8 +185,16 @@ AliasAnalysis::ModRefResult GetLocation(const Instruction *Inst, MemDepResult MemoryDependenceAnalysis:: getCallSiteDependencyFrom(CallSite CS, bool isReadOnlyCall, BasicBlock::iterator ScanIt, BasicBlock *BB) { + unsigned Limit = BlockScanLimit; + // Walk backwards through the block, looking for dependencies while (ScanIt != BB->begin()) { + // Limit the amount of scanning we do so we don't end up with quadratic + // running time on extreme testcases. + --Limit; + if (!Limit) + return MemDepResult::getUnknown(); + Instruction *Inst = --ScanIt; // If this inst is a memory op, get the pointer it accessed @@ -214,11 +228,101 @@ getCallSiteDependencyFrom(CallSite CS, bool isReadOnlyCall, } } - // No dependence found. If this is the entry block of the function, it is a - // clobber, otherwise it is non-local. + // No dependence found. If this is the entry block of the function, it is + // unknown, otherwise it is non-local. if (BB != &BB->getParent()->getEntryBlock()) return MemDepResult::getNonLocal(); - return MemDepResult::getClobber(ScanIt); + return MemDepResult::getUnknown(); +} + +/// isLoadLoadClobberIfExtendedToFullWidth - Return true if LI is a load that +/// would fully overlap MemLoc if done as a wider legal integer load. +/// +/// MemLocBase, MemLocOffset are lazily computed here the first time the +/// base/offs of memloc is needed. +static bool +isLoadLoadClobberIfExtendedToFullWidth(const AliasAnalysis::Location &MemLoc, + const Value *&MemLocBase, + int64_t &MemLocOffs, + const LoadInst *LI, + const TargetData *TD) { + // If we have no target data, we can't do this. + if (TD == 0) return false; + + // If we haven't already computed the base/offset of MemLoc, do so now. + if (MemLocBase == 0) + MemLocBase = GetPointerBaseWithConstantOffset(MemLoc.Ptr, MemLocOffs, *TD); + + unsigned Size = MemoryDependenceAnalysis:: + getLoadLoadClobberFullWidthSize(MemLocBase, MemLocOffs, MemLoc.Size, + LI, *TD); + return Size != 0; +} + +/// getLoadLoadClobberFullWidthSize - This is a little bit of analysis that +/// looks at a memory location for a load (specified by MemLocBase, Offs, +/// and Size) and compares it against a load. 
If the specified load could +/// be safely widened to a larger integer load that is 1) still efficient, +/// 2) safe for the target, and 3) would provide the specified memory +/// location value, then this function returns the size in bytes of the +/// load width to use. If not, this returns zero. +unsigned MemoryDependenceAnalysis:: +getLoadLoadClobberFullWidthSize(const Value *MemLocBase, int64_t MemLocOffs, + unsigned MemLocSize, const LoadInst *LI, + const TargetData &TD) { + // We can only extend non-volatile integer loads. + if (!isa<IntegerType>(LI->getType()) || LI->isVolatile()) return 0; + + // Get the base of this load. + int64_t LIOffs = 0; + const Value *LIBase = + GetPointerBaseWithConstantOffset(LI->getPointerOperand(), LIOffs, TD); + + // If the two pointers are not based on the same pointer, we can't tell that + // they are related. + if (LIBase != MemLocBase) return 0; + + // Okay, the two values are based on the same pointer, but returned as + // no-alias. This happens when we have things like two byte loads at "P+1" + // and "P+3". Check to see if increasing the size of the "LI" load up to its + // alignment (or the largest native integer type) will allow us to load all + // the bits required by MemLoc. + + // If MemLoc is before LI, then no widening of LI will help us out. + if (MemLocOffs < LIOffs) return 0; + + // Get the alignment of the load in bytes. We assume that it is safe to load + // any legal integer up to this size without a problem. For example, if we're + // looking at an i8 load on x86-32 that is known 1024 byte aligned, we can + // widen it up to an i32 load. If it is known 2-byte aligned, we can widen it + // to i16. + unsigned LoadAlign = LI->getAlignment(); + + int64_t MemLocEnd = MemLocOffs+MemLocSize; + + // If no amount of rounding up will let MemLoc fit into LI, then bail out. + if (LIOffs+LoadAlign < MemLocEnd) return 0; + + // This is the size of the load to try. Start with the next larger power of + // two. + unsigned NewLoadByteSize = LI->getType()->getPrimitiveSizeInBits()/8U; + NewLoadByteSize = NextPowerOf2(NewLoadByteSize); + + while (1) { + // If this load size is bigger than our known alignment or would not fit + // into a native integer register, then we fail. + if (NewLoadByteSize > LoadAlign || + !TD.fitsInLegalInteger(NewLoadByteSize*8)) + return 0; + + // If a load of this width would include all of MemLoc, then we succeed. + if (LIOffs+NewLoadByteSize >= MemLocEnd) + return NewLoadByteSize; + + NewLoadByteSize <<= 1; + } + + return 0; } /// getPointerDependencyFrom - Return the instruction on which a memory @@ -229,58 +333,39 @@ MemDepResult MemoryDependenceAnalysis:: getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad, BasicBlock::iterator ScanIt, BasicBlock *BB) { - Value *InvariantTag = 0; + const Value *MemLocBase = 0; + int64_t MemLocOffset = 0; + + unsigned Limit = BlockScanLimit; // Walk backwards through the basic block, looking for dependencies. while (ScanIt != BB->begin()) { + // Limit the amount of scanning we do so we don't end up with quadratic + // running time on extreme testcases. + --Limit; + if (!Limit) + return MemDepResult::getUnknown(); + Instruction *Inst = --ScanIt; - // If we're in an invariant region, no dependencies can be found before - // we pass an invariant-begin marker. - if (InvariantTag == Inst) { - InvariantTag = 0; - continue; - } - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { // Debug intrinsics don't (and can't) cause dependences. 
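The width search in getLoadLoadClobberFullWidthSize is easy to get off by one, so here is a standalone sketch of just that arithmetic. The 8-byte cap stands in for TargetData::fitsInLegalInteger and is an assumption, as is the simplified next-power-of-two step:

#include <cassert>
#include <cstdint>

// All offsets are relative to the common base pointer. Returns the number
// of bytes to load, or 0 if no legal widening covers the queried location.
unsigned widenedLoadSize(int64_t LIOffs, unsigned LoadByteSize,
                         unsigned LoadAlign, int64_t MemLocOffs,
                         unsigned MemLocSize) {
  if (MemLocOffs < LIOffs)
    return 0;                                // widening only extends upward
  int64_t MemLocEnd = MemLocOffs + MemLocSize;
  if (LIOffs + (int64_t)LoadAlign < MemLocEnd)
    return 0;                                // no rounding up can reach it
  // Start with the next power of two strictly greater than the load size.
  unsigned NewSize = 1;
  while (NewSize <= LoadByteSize)
    NewSize <<= 1;
  for (;;) {
    if (NewSize > LoadAlign || NewSize > 8)  // 8 = assumed largest legal int
      return 0;
    if (LIOffs + (int64_t)NewSize >= MemLocEnd)
      return NewSize;                        // this width covers the query
    NewSize <<= 1;
  }
}

int main() {
  // An i8 load at P, known 4-byte aligned, queried for one byte at P+3:
  // widening the load to i32 covers the query.
  assert(widenedLoadSize(0, 1, 4, 3, 1) == 4);
  // One byte at P+8 is out of reach for a 4-byte-aligned i8 load.
  assert(widenedLoadSize(0, 1, 4, 8, 1) == 0);
  return 0;
}
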
if (isa<DbgInfoIntrinsic>(II)) continue; - // If we pass an invariant-end marker, then we've just entered an - // invariant region and can start ignoring dependencies. - if (II->getIntrinsicID() == Intrinsic::invariant_end) { - // FIXME: This only considers queries directly on the invariant-tagged - // pointer, not on query pointers that are indexed off of them. It'd - // be nice to handle that at some point. - AliasAnalysis::AliasResult R = - AA->alias(AliasAnalysis::Location(II->getArgOperand(2)), MemLoc); - if (R == AliasAnalysis::MustAlias) - InvariantTag = II->getArgOperand(0); - - continue; - } - // If we reach a lifetime begin or end marker, then the query ends here // because the value is undefined. if (II->getIntrinsicID() == Intrinsic::lifetime_start) { // FIXME: This only considers queries directly on the invariant-tagged // pointer, not on query pointers that are indexed off of them. It'd - // be nice to handle that at some point. - AliasAnalysis::AliasResult R = - AA->alias(AliasAnalysis::Location(II->getArgOperand(1)), MemLoc); - if (R == AliasAnalysis::MustAlias) + // be nice to handle that at some point (the right approach is to use + // GetPointerBaseWithConstantOffset). + if (AA->isMustAlias(AliasAnalysis::Location(II->getArgOperand(1)), + MemLoc)) return MemDepResult::getDef(II); continue; } } - // If we're querying on a load and we're in an invariant region, we're done - // at this point. Nothing a load depends on can live in an invariant region. - // - // FIXME: this will prevent us from returning load/load must-aliases, so GVN - // won't remove redundant loads. - if (isLoad && InvariantTag) continue; - // Values depend on loads if the pointers are must aliased. This means that // a load depends on another must aliased load from the same value. if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { @@ -288,27 +373,57 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad, // If we found a pointer, check if it could be the same as our pointer. AliasAnalysis::AliasResult R = AA->alias(LoadLoc, MemLoc); - if (R == AliasAnalysis::NoAlias) - continue; - // May-alias loads don't depend on each other without a dependence. - if (isLoad && R != AliasAnalysis::MustAlias) + if (isLoad) { + if (R == AliasAnalysis::NoAlias) { + // If this is an over-aligned integer load (for example, + // "load i8* %P, align 4") see if it would obviously overlap with the + // queried location if widened to a larger load (e.g. if the queried + // location is 1 byte at P+1). If so, return it as a load/load + // clobber result, allowing the client to decide to widen the load if + // it wants to. + if (const IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) + if (LI->getAlignment()*8 > ITy->getPrimitiveSizeInBits() && + isLoadLoadClobberIfExtendedToFullWidth(MemLoc, MemLocBase, + MemLocOffset, LI, TD)) + return MemDepResult::getClobber(Inst); + + continue; + } + + // Must aliased loads are defs of each other. + if (R == AliasAnalysis::MustAlias) + return MemDepResult::getDef(Inst); + +#if 0 // FIXME: Temporarily disabled. GVN is cleverly rewriting loads + // in terms of clobbering loads, but since it does this by looking + // at the clobbering load directly, it doesn't know about any + // phi translation that may have happened along the way. + + // If we have a partial alias, then return this as a clobber for the + // client to handle. 
+ if (R == AliasAnalysis::PartialAlias) + return MemDepResult::getClobber(Inst); +#endif + + // Random may-alias loads don't depend on each other without a + // dependence. + continue; + } + + // Stores don't depend on other no-aliased accesses. + if (R == AliasAnalysis::NoAlias) continue; // Stores don't alias loads from read-only memory. - if (!isLoad && AA->pointsToConstantMemory(LoadLoc)) + if (AA->pointsToConstantMemory(LoadLoc)) continue; - // Stores depend on may and must aliased loads, loads depend on must-alias - // loads. + // Stores depend on may/must aliased loads. return MemDepResult::getDef(Inst); } if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - // There can't be stores to the value we care about inside an - // invariant region. - if (InvariantTag) continue; - // If alias analysis can tell that this store is guaranteed to not modify // the query pointer, ignore it. Use getModRefInfo to handle cases where // the query pointer points to constant memory etc. @@ -341,8 +456,7 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad, (isa<CallInst>(Inst) && extractMallocCall(Inst))) { const Value *AccessPtr = GetUnderlyingObject(MemLoc.Ptr, TD); - if (AccessPtr == Inst || - AA->alias(Inst, 1, AccessPtr, 1) == AliasAnalysis::MustAlias) + if (AccessPtr == Inst || AA->isMustAlias(Inst, AccessPtr)) return MemDepResult::getDef(Inst); continue; } @@ -353,9 +467,6 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad, // If the call has no effect on the queried pointer, just ignore it. continue; case AliasAnalysis::Mod: - // If we're in an invariant region, we can ignore calls that ONLY - // modify the pointer. - if (InvariantTag) continue; return MemDepResult::getClobber(Inst); case AliasAnalysis::Ref: // If the call is known to never store to the pointer, and if this is a @@ -368,11 +479,11 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad, } } - // No dependence found. If this is the entry block of the function, it is a - // clobber, otherwise it is non-local. + // No dependence found. If this is the entry block of the function, it is + // unknown, otherwise it is non-local. if (BB != &BB->getParent()->getEntryBlock()) return MemDepResult::getNonLocal(); - return MemDepResult::getClobber(ScanIt); + return MemDepResult::getUnknown(); } /// getDependency - Return the instruction on which a memory operation @@ -400,12 +511,12 @@ MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) { // Do the scan. if (BasicBlock::iterator(QueryInst) == QueryParent->begin()) { - // No dependence found. If this is the entry block of the function, it is a - // clobber, otherwise it is non-local. + // No dependence found. If this is the entry block of the function, it is + // unknown, otherwise it is non-local. if (QueryParent != &QueryParent->getParent()->getEntryBlock()) LocalCache = MemDepResult::getNonLocal(); else - LocalCache = MemDepResult::getClobber(QueryInst); + LocalCache = MemDepResult::getUnknown(); } else { AliasAnalysis::Location MemLoc; AliasAnalysis::ModRefResult MR = GetLocation(QueryInst, MemLoc, AA); @@ -413,7 +524,7 @@ MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) { // If we can do a pointer scan, make it happen. 
bool isLoad = !(MR & AliasAnalysis::Mod); if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(QueryInst)) - isLoad |= II->getIntrinsicID() == Intrinsic::lifetime_end; + isLoad |= II->getIntrinsicID() == Intrinsic::lifetime_start; LocalCache = getPointerDependencyFrom(MemLoc, isLoad, ScanPos, QueryParent); @@ -424,7 +535,7 @@ MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) { QueryParent); } else // Non-memory instruction. - LocalCache = MemDepResult::getClobber(--BasicBlock::iterator(ScanPos)); + LocalCache = MemDepResult::getUnknown(); } // Remember the result! @@ -558,10 +669,10 @@ MemoryDependenceAnalysis::getNonLocalCallDependency(CallSite QueryCS) { Dep = getCallSiteDependencyFrom(QueryCS, isReadonlyCall,ScanPos, DirtyBB); } else if (DirtyBB != &DirtyBB->getParent()->getEntryBlock()) { // No dependence found. If this is the entry block of the function, it is - // a clobber, otherwise it is non-local. + // a clobber, otherwise it is unknown. Dep = MemDepResult::getNonLocal(); } else { - Dep = MemDepResult::getClobber(ScanPos); + Dep = MemDepResult::getUnknown(); } // If we had a dirty entry for the block, update it. Otherwise, just add @@ -617,7 +728,7 @@ getNonLocalPointerDependency(const AliasAnalysis::Location &Loc, bool isLoad, return; Result.clear(); Result.push_back(NonLocalDepResult(FromBB, - MemDepResult::getClobber(FromBB->begin()), + MemDepResult::getUnknown(), const_cast<Value *>(Loc.Ptr))); } @@ -679,7 +790,7 @@ GetNonLocalInfoForBlock(const AliasAnalysis::Location &Loc, // If the block has a dependency (i.e. it isn't completely transparent to // the value), remember the reverse association because we just added it // to Cache! - if (Dep.isNonLocal()) + if (Dep.isNonLocal() || Dep.isUnknown()) return Dep; // Keep the ReverseNonLocalPtrDeps map up to date so we can efficiently @@ -853,6 +964,9 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer, SmallVector<BasicBlock*, 32> Worklist; Worklist.push_back(StartBB); + // PredList used inside loop. + SmallVector<std::pair<BasicBlock*, PHITransAddr>, 16> PredList; + // Keep track of the entries that we know are sorted. Previously cached // entries will all be sorted. The entries we add we only sort on demand (we // don't insert every element into its sorted position). We know that we @@ -889,22 +1003,29 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer, // the same Pointer. if (!Pointer.NeedsPHITranslationFromBlock(BB)) { SkipFirstBlock = false; + SmallVector<BasicBlock*, 16> NewBlocks; for (BasicBlock **PI = PredCache->GetPreds(BB); *PI; ++PI) { // Verify that we haven't looked at this block yet. std::pair<DenseMap<BasicBlock*,Value*>::iterator, bool> InsertRes = Visited.insert(std::make_pair(*PI, Pointer.getAddr())); if (InsertRes.second) { // First time we've looked at *PI. - Worklist.push_back(*PI); + NewBlocks.push_back(*PI); continue; } // If we have seen this block before, but it was with a different // pointer then we have a phi translation failure and we have to treat // this as a clobber. - if (InsertRes.first->second != Pointer.getAddr()) + if (InsertRes.first->second != Pointer.getAddr()) { + // Make sure to clean up the Visited map before continuing on to + // PredTranslationFailure. 
+ for (unsigned i = 0; i < NewBlocks.size(); i++) + Visited.erase(NewBlocks[i]); goto PredTranslationFailure; + } } + Worklist.append(NewBlocks.begin(), NewBlocks.end()); continue; } @@ -923,13 +1044,15 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer, NumSortedEntries = Cache->size(); } Cache = 0; - + + PredList.clear(); for (BasicBlock **PI = PredCache->GetPreds(BB); *PI; ++PI) { BasicBlock *Pred = *PI; - + PredList.push_back(std::make_pair(Pred, Pointer)); + // Get the PHI translated pointer in this predecessor. This can fail if // not translatable, in which case the getAddr() returns null. - PHITransAddr PredPointer(Pointer); + PHITransAddr &PredPointer = PredList.back().second; PredPointer.PHITranslateValue(BB, Pred, 0); Value *PredPtrVal = PredPointer.getAddr(); @@ -943,6 +1066,9 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer, InsertRes = Visited.insert(std::make_pair(Pred, PredPtrVal)); if (!InsertRes.second) { + // We found the pred; take it off the list of preds to visit. + PredList.pop_back(); + // If the predecessor was visited with PredPtr, then we already did // the analysis and can ignore it. if (InsertRes.first->second == PredPtrVal) @@ -951,18 +1077,49 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer, // Otherwise, the block was previously analyzed with a different // pointer. We can't represent the result of this case, so we just // treat this as a phi translation failure. + + // Make sure to clean up the Visited map before continuing on to + // PredTranslationFailure. + for (unsigned i = 0; i < PredList.size(); i++) + Visited.erase(PredList[i].first); + goto PredTranslationFailure; } - + } + + // Actually process results here; this need to be a separate loop to avoid + // calling getNonLocalPointerDepFromBB for blocks we don't want to return + // any results for. (getNonLocalPointerDepFromBB will modify our + // datastructures in ways the code after the PredTranslationFailure label + // doesn't expect.) + for (unsigned i = 0; i < PredList.size(); i++) { + BasicBlock *Pred = PredList[i].first; + PHITransAddr &PredPointer = PredList[i].second; + Value *PredPtrVal = PredPointer.getAddr(); + + bool CanTranslate = true; // If PHI translation was unable to find an available pointer in this // predecessor, then we have to assume that the pointer is clobbered in // that predecessor. We can still do PRE of the load, which would insert // a computation of the pointer in this predecessor. - if (PredPtrVal == 0) { + if (PredPtrVal == 0) + CanTranslate = false; + + // FIXME: it is entirely possible that PHI translating will end up with + // the same value. Consider PHI translating something like: + // X = phi [x, bb1], [y, bb2]. PHI translating for bb1 doesn't *need* + // to recurse here, pedantically speaking. + + // If getNonLocalPointerDepFromBB fails here, that means the cached + // result conflicted with the Visited list; we have to conservatively + // assume it is unknown, but this also does not block PRE of the load. + if (!CanTranslate || + getNonLocalPointerDepFromBB(PredPointer, + Loc.getWithNewPtr(PredPtrVal), + isLoad, Pred, + Result, Visited)) { // Add the entry to the Result list. 
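The restructuring above is the classic speculate-then-commit shape: record per-predecessor state first, recurse only once the whole list is known good, and scrub the speculative entries before every jump to PredTranslationFailure. In miniature, with hypothetical stand-in types:

#include <map>
#include <vector>

// Visit each predecessor, remembering what we added to Visited so the
// failure path can undo it; mirrors the NewBlocks/PredList cleanups above.
static bool visitPreds(const std::vector<int> &Preds,
                       std::map<int, int> &Visited) {
  std::vector<int> Added;
  for (size_t i = 0; i < Preds.size(); ++i) {
    int P = Preds[i];
    std::pair<std::map<int, int>::iterator, bool> Ins =
        Visited.insert(std::make_pair(P, P /* translated address */));
    if (Ins.second) {              // first time we've looked at P
      Added.push_back(P);
      continue;
    }
    if (Ins.first->second != P) {  // seen before with a different pointer
      for (size_t j = 0; j < Added.size(); ++j)
        Visited.erase(Added[j]);   // clean up before taking the failure path
      return false;                // "goto PredTranslationFailure"
    }
  }
  return true;                     // now safe to process/recurse over Preds
}
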
- NonLocalDepResult Entry(Pred, - MemDepResult::getClobber(Pred->getTerminator()), - PredPtrVal); + NonLocalDepResult Entry(Pred, MemDepResult::getUnknown(), PredPtrVal); Result.push_back(Entry); // Since we had a phi translation failure, the cache for CacheKey won't @@ -974,19 +1131,6 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer, NLPI.Pair = BBSkipFirstBlockPair(); continue; } - - // FIXME: it is entirely possible that PHI translating will end up with - // the same value. Consider PHI translating something like: - // X = phi [x, bb1], [y, bb2]. PHI translating for bb1 doesn't *need* - // to recurse here, pedantically speaking. - - // If we have a problem phi translating, fall through to the code below - // to handle the failure condition. - if (getNonLocalPointerDepFromBB(PredPointer, - Loc.getWithNewPtr(PredPointer.getAddr()), - isLoad, Pred, - Result, Visited)) - goto PredTranslationFailure; } // Refresh the CacheInfo/Cache pointer so that it isn't invalidated. @@ -1003,6 +1147,9 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer, continue; PredTranslationFailure: + // The following code is "failure"; we can't produce a sane translation + // for the given block. It assumes that we haven't modified any of + // our datastructures while processing the current block. if (Cache == 0) { // Refresh the CacheInfo/Cache pointer if it got invalidated. @@ -1017,8 +1164,7 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer, // results from the set". Clear out the indicator for this. CacheInfo->Pair = BBSkipFirstBlockPair(); - // If *nothing* works, mark the pointer as being clobbered by the first - // instruction in this block. + // If *nothing* works, mark the pointer as unknown. // // If this is the magic first block, return this as a clobber of the whole // incoming value. 
Since we can't phi translate to one of the predecessors, @@ -1033,8 +1179,7 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer, assert(I->getResult().isNonLocal() && "Should only be here with transparent block"); - I->setResult(MemDepResult::getClobber(BB->begin())); - ReverseNonLocalPtrDeps[BB->begin()].insert(CacheKey); + I->setResult(MemDepResult::getUnknown()); Result.push_back(NonLocalDepResult(I->getBB(), I->getResult(), Pointer.getAddr())); break; diff --git a/lib/Analysis/PHITransAddr.cpp b/lib/Analysis/PHITransAddr.cpp index 93da5a4..70dcd0d 100644 --- a/lib/Analysis/PHITransAddr.cpp +++ b/lib/Analysis/PHITransAddr.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/PHITransAddr.h" +#include "llvm/Constants.h" #include "llvm/Instructions.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" diff --git a/lib/Analysis/PathNumbering.cpp b/lib/Analysis/PathNumbering.cpp index 5d3f6bb..7c584da 100644 --- a/lib/Analysis/PathNumbering.cpp +++ b/lib/Analysis/PathNumbering.cpp @@ -38,13 +38,10 @@ #include "llvm/Support/TypeBuilder.h" #include "llvm/Support/raw_ostream.h" -#include <map> #include <queue> -#include <set> #include <stack> #include <string> #include <utility> -#include <vector> #include <sstream> using namespace llvm; @@ -286,7 +283,7 @@ void BallLarusDag::calculatePathNumbers() { BallLarusEdge* exitEdge = addEdge(node, getExit(), 0); exitEdge->setType(BallLarusEdge::SPLITEDGE_PHONY); - // Counters to handle the possibilty of a multi-graph + // Counters to handle the possibility of a multi-graph BasicBlock* oldTarget = 0; unsigned duplicateNumber = 0; diff --git a/lib/Analysis/PathProfileVerifier.cpp b/lib/Analysis/PathProfileVerifier.cpp index c549773..0ae734e 100644 --- a/lib/Analysis/PathProfileVerifier.cpp +++ b/lib/Analysis/PathProfileVerifier.cpp @@ -124,7 +124,7 @@ bool PathProfileVerifier::runOnModule (Module &M) { ProfilePathEdgeVector* pev = currentPath->getPathEdges(); DEBUG(dbgs () << "path #" << currentPath->getNumber() << ": " << currentPath->getCount() << "\n"); - // setup the entry edge (normally path profiling doens't care about this) + // setup the entry edge (normally path profiling doesn't care about this) if (currentPath->getFirstBlockInPath() == &F->getEntryBlock()) edgeArray[arrayMap[0][currentPath->getFirstBlockInPath()][0]] += currentPath->getCount(); diff --git a/lib/Analysis/ProfileEstimatorPass.cpp b/lib/Analysis/ProfileEstimatorPass.cpp index 667ee1c..b594e2b 100644 --- a/lib/Analysis/ProfileEstimatorPass.cpp +++ b/lib/Analysis/ProfileEstimatorPass.cpp @@ -140,7 +140,7 @@ void ProfileEstimatorPass::recurseBasicBlock(BasicBlock *BB) { // loop, thus the edge is a backedge, continue and do not check if the // value is valid. if (BBisHeader && BBLoop->contains(*bbi)) { - printEdgeError(edge, "but is backedge, continueing"); + printEdgeError(edge, "but is backedge, continuing"); continue; } // If the edges value is missing (and this is no loop header, and this is diff --git a/lib/Analysis/ProfileInfo.cpp b/lib/Analysis/ProfileInfo.cpp index 36f211e..173de2c 100644 --- a/lib/Analysis/ProfileInfo.cpp +++ b/lib/Analysis/ProfileInfo.cpp @@ -309,9 +309,9 @@ void ProfileInfoT<Function,BasicBlock>:: removeEdge(oldedge); } -/// Replaces all occurences of RmBB in the ProfilingInfo with DestBB. +/// Replaces all occurrences of RmBB in the ProfilingInfo with DestBB. 
/// This checks all edges of the function the blocks reside in and replaces the -/// occurences of RmBB with DestBB. +/// occurrences of RmBB with DestBB. template<> void ProfileInfoT<Function,BasicBlock>:: replaceAllUses(const BasicBlock *RmBB, const BasicBlock *DestBB) { @@ -812,7 +812,7 @@ void ProfileInfoT<Function,BasicBlock>::repair(const Function *F) { } if (iw < 0) continue; - // Check the recieving end of the path if it can handle the flow. + // Check the receiving end of the path if it can handle the flow. double ow = getExecutionCount(Dest); Processed.clear(); for (succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB); diff --git a/lib/Analysis/ProfileInfoLoader.cpp b/lib/Analysis/ProfileInfoLoader.cpp index 25481b2..eaa38da 100644 --- a/lib/Analysis/ProfileInfoLoader.cpp +++ b/lib/Analysis/ProfileInfoLoader.cpp @@ -19,7 +19,6 @@ #include "llvm/Support/raw_ostream.h" #include <cstdio> #include <cstdlib> -#include <map> using namespace llvm; // ByteSwap - Byteswap 'Var' if 'Really' is true. diff --git a/lib/Analysis/RegionPass.cpp b/lib/Analysis/RegionPass.cpp index 3269dcc..80eda79 100644 --- a/lib/Analysis/RegionPass.cpp +++ b/lib/Analysis/RegionPass.cpp @@ -249,7 +249,7 @@ void RegionPass::assignPassManager(PMStack &PMS, assert (!PMS.empty() && "Unable to create Region Pass Manager"); PMDataManager *PMD = PMS.top(); - // [1] Create new Call Graph Pass Manager + // [1] Create new Region Pass Manager RGPM = new RGPassManager(PMD->getDepth() + 1); RGPM->populateInheritedAnalysis(PMS); diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 228974d..025718e 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -1035,6 +1035,93 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op, return S; } +// Get the limit of a recurrence such that incrementing by Step cannot cause +// signed overflow as long as the value of the recurrence within the loop does +// not exceed this limit before incrementing. +static const SCEV *getOverflowLimitForStep(const SCEV *Step, + ICmpInst::Predicate *Pred, + ScalarEvolution *SE) { + unsigned BitWidth = SE->getTypeSizeInBits(Step->getType()); + if (SE->isKnownPositive(Step)) { + *Pred = ICmpInst::ICMP_SLT; + return SE->getConstant(APInt::getSignedMinValue(BitWidth) - + SE->getSignedRange(Step).getSignedMax()); + } + if (SE->isKnownNegative(Step)) { + *Pred = ICmpInst::ICMP_SGT; + return SE->getConstant(APInt::getSignedMaxValue(BitWidth) - + SE->getSignedRange(Step).getSignedMin()); + } + return 0; +} + +// The recurrence AR has been shown to have no signed wrap. Typically, if we can +// prove NSW for AR, then we can just as easily prove NSW for its preincrement +// or postincrement sibling. This allows normalizing a sign extended AddRec as +// such: {sext(Step + Start),+,Step} => {(Step + sext(Start),+,Step} As a +// result, the expression "Step + sext(PreIncAR)" is congruent with +// "sext(PostIncAR)" +static const SCEV *getPreStartForSignExtend(const SCEVAddRecExpr *AR, + const Type *Ty, + ScalarEvolution *SE) { + const Loop *L = AR->getLoop(); + const SCEV *Start = AR->getStart(); + const SCEV *Step = AR->getStepRecurrence(*SE); + + // Check for a simple looking step prior to loop entry. + const SCEVAddExpr *SA = dyn_cast<SCEVAddExpr>(Start); + if (!SA || SA->getNumOperands() != 2 || SA->getOperand(0) != Step) + return 0; + + // This is a postinc AR. Check for overflow on the preinc recurrence using the + // same three conditions that getSignExtendedExpr checks. 
+ + // 1. NSW flags on the step increment. + const SCEV *PreStart = SA->getOperand(1); + const SCEVAddRecExpr *PreAR = dyn_cast<SCEVAddRecExpr>( + SE->getAddRecExpr(PreStart, Step, L, SCEV::FlagAnyWrap)); + + if (PreAR && PreAR->getNoWrapFlags(SCEV::FlagNSW)) + return PreStart; + + // 2. Direct overflow check on the step operation's expression. + unsigned BitWidth = SE->getTypeSizeInBits(AR->getType()); + const Type *WideTy = IntegerType::get(SE->getContext(), BitWidth * 2); + const SCEV *OperandExtendedStart = + SE->getAddExpr(SE->getSignExtendExpr(PreStart, WideTy), + SE->getSignExtendExpr(Step, WideTy)); + if (SE->getSignExtendExpr(Start, WideTy) == OperandExtendedStart) { + // Cache knowledge of PreAR NSW. + if (PreAR) + const_cast<SCEVAddRecExpr *>(PreAR)->setNoWrapFlags(SCEV::FlagNSW); + // FIXME: this optimization needs a unit test + DEBUG(dbgs() << "SCEV: untested prestart overflow check\n"); + return PreStart; + } + + // 3. Loop precondition. + ICmpInst::Predicate Pred; + const SCEV *OverflowLimit = getOverflowLimitForStep(Step, &Pred, SE); + + if (OverflowLimit && + SE->isLoopEntryGuardedByCond(L, Pred, PreStart, OverflowLimit)) { + return PreStart; + } + return 0; +} + +// Get the normalized sign-extended expression for this AddRec's Start. +static const SCEV *getSignExtendAddRecStart(const SCEVAddRecExpr *AR, + const Type *Ty, + ScalarEvolution *SE) { + const SCEV *PreStart = getPreStartForSignExtend(AR, Ty, SE); + if (!PreStart) + return SE->getSignExtendExpr(AR->getStart(), Ty); + + return SE->getAddExpr(SE->getSignExtendExpr(AR->getStepRecurrence(*SE), Ty), + SE->getSignExtendExpr(PreStart, Ty)); +} + const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, const Type *Ty) { assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) && @@ -1097,7 +1184,7 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, // If we have special knowledge that this addrec won't overflow, // we don't need to do any further analysis. if (AR->getNoWrapFlags(SCEV::FlagNSW)) - return getAddRecExpr(getSignExtendExpr(Start, Ty), + return getAddRecExpr(getSignExtendAddRecStart(AR, Ty, this), getSignExtendExpr(Step, Ty), L, SCEV::FlagNSW); @@ -1133,7 +1220,7 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, // Cache knowledge of AR NSW, which is propagated to this AddRec. const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW); // Return the expression with the addrec on the outside. - return getAddRecExpr(getSignExtendExpr(Start, Ty), + return getAddRecExpr(getSignExtendAddRecStart(AR, Ty, this), getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags()); } @@ -1149,7 +1236,7 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, // Cache knowledge of AR NSW, which is propagated to this AddRec. const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW); // Return the expression with the addrec on the outside. - return getAddRecExpr(getSignExtendExpr(Start, Ty), + return getAddRecExpr(getSignExtendAddRecStart(AR, Ty, this), getZeroExtendExpr(Step, Ty), L, AR->getNoWrapFlags()); } @@ -1159,34 +1246,18 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, // the addrec is safe. Also, if the entry is guarded by a comparison // with the start value and the backedge is guarded by a comparison // with the post-inc value, the addrec is safe. 
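The limit returned by getOverflowLimitForStep is computed in wrapping APInt arithmetic, which is worth unpacking: for a step whose signed range tops out at StepMax > 0, SignedMin - StepMax wraps around to SignedMax - StepMax + 1, so the guard "AR s< limit" leaves exactly enough headroom for one more increment. A worked 8-bit instance in plain C++ (the bit width and step range are illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  const int StepMax = 5;           // getSignedRange(Step).getSignedMax()
  const uint8_t Limit = (uint8_t)(INT8_MIN - StepMax); // wraps to 123
  assert((int8_t)Limit == INT8_MAX - StepMax + 1);
  // Whenever the recurrence value is s< Limit, adding any step in
  // [1, StepMax] cannot exceed INT8_MAX, i.e. cannot sign-overflow.
  for (int AR = INT8_MIN; AR < (int8_t)Limit; ++AR)
    for (int Step = 1; Step <= StepMax; ++Step)
      assert(AR + Step <= INT8_MAX);
  return 0;
}
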
- if (isKnownPositive(Step)) { - const SCEV *N = getConstant(APInt::getSignedMinValue(BitWidth) - - getSignedRange(Step).getSignedMax()); - if (isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_SLT, AR, N) || - (isLoopEntryGuardedByCond(L, ICmpInst::ICMP_SLT, Start, N) && - isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_SLT, - AR->getPostIncExpr(*this), N))) { - // Cache knowledge of AR NSW, which is propagated to this AddRec. - const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW); - // Return the expression with the addrec on the outside. - return getAddRecExpr(getSignExtendExpr(Start, Ty), - getSignExtendExpr(Step, Ty), - L, AR->getNoWrapFlags()); - } - } else if (isKnownNegative(Step)) { - const SCEV *N = getConstant(APInt::getSignedMaxValue(BitWidth) - - getSignedRange(Step).getSignedMin()); - if (isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_SGT, AR, N) || - (isLoopEntryGuardedByCond(L, ICmpInst::ICMP_SGT, Start, N) && - isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_SGT, - AR->getPostIncExpr(*this), N))) { - // Cache knowledge of AR NSW, which is propagated to this AddRec. - const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW); - // Return the expression with the addrec on the outside. - return getAddRecExpr(getSignExtendExpr(Start, Ty), - getSignExtendExpr(Step, Ty), - L, AR->getNoWrapFlags()); - } + ICmpInst::Predicate Pred; + const SCEV *OverflowLimit = getOverflowLimitForStep(Step, &Pred, this); + if (OverflowLimit && + (isLoopBackedgeGuardedByCond(L, Pred, AR, OverflowLimit) || + (isLoopEntryGuardedByCond(L, Pred, Start, OverflowLimit) && + isLoopBackedgeGuardedByCond(L, Pred, AR->getPostIncExpr(*this), + OverflowLimit)))) { + // Cache knowledge of AR NSW, then propagate NSW to the wide AddRec. + const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW); + return getAddRecExpr(getSignExtendAddRecStart(AR, Ty, this), + getSignExtendExpr(Step, Ty), + L, AR->getNoWrapFlags()); } } } @@ -1882,7 +1953,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops, // outer mul and the inner addrec are guaranteed to have no overflow. // // No self-wrap cannot be guaranteed after changing the step size, but - // will be infered if either NUW or NSW is true. + // will be inferred if either NUW or NSW is true. Flags = AddRec->getNoWrapFlags(clearFlags(Flags, SCEV::FlagNW)); const SCEV *NewRec = getAddRecExpr(NewOps, AddRecLoop, Flags); @@ -2015,7 +2086,7 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS, } } // (A+B)/C --> (A/C + B/C) if safe and A/C and B/C can be folded. - if (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(LHS)) { + if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(LHS)) { SmallVector<const SCEV *, 4> Operands; for (unsigned i = 0, e = A->getNumOperands(); i != e; ++i) Operands.push_back(getZeroExtendExpr(A->getOperand(i), ExtTy)); @@ -3783,24 +3854,25 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) { // update the value. The temporary CouldNotCompute value tells SCEV // code elsewhere that it shouldn't attempt to request a new // backedge-taken count, which could result in infinite recursion. 
-  std::pair<std::map<const Loop *, BackedgeTakenInfo>::iterator, bool> Pair =
+  std::pair<DenseMap<const Loop *, BackedgeTakenInfo>::iterator, bool> Pair =
     BackedgeTakenCounts.insert(std::make_pair(L, getCouldNotCompute()));
   if (!Pair.second)
     return Pair.first->second;
 
-  BackedgeTakenInfo BECount = ComputeBackedgeTakenCount(L);
-  if (BECount.Exact != getCouldNotCompute()) {
-    assert(isLoopInvariant(BECount.Exact, L) &&
-           isLoopInvariant(BECount.Max, L) &&
+  BackedgeTakenInfo Result = getCouldNotCompute();
+  BackedgeTakenInfo Computed = ComputeBackedgeTakenCount(L);
+  if (Computed.Exact != getCouldNotCompute()) {
+    assert(isLoopInvariant(Computed.Exact, L) &&
+           isLoopInvariant(Computed.Max, L) &&
            "Computed backedge-taken count isn't loop invariant for loop!");
     ++NumTripCountsComputed;
 
     // Update the value in the map.
-    Pair.first->second = BECount;
+    Result = Computed;
   } else {
-    if (BECount.Max != getCouldNotCompute())
+    if (Computed.Max != getCouldNotCompute())
       // Update the value in the map.
-      Pair.first->second = BECount;
+      Result = Computed;
     if (isa<PHINode>(L->getHeader()->begin()))
       // Only count loops that have phi nodes as not being computable.
       ++NumTripCountsNotComputed;
@@ -3811,7 +3883,7 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) {
   // conservative estimates made without the benefit of trip count
   // information. This is similar to the code in forgetLoop, except that
   // it handles SCEVUnknown PHI nodes specially.
-  if (BECount.hasAnyInfo()) {
+  if (Computed.hasAnyInfo()) {
     SmallVector<Instruction *, 16> Worklist;
     PushLoopPHIs(L, Worklist);
@@ -3842,7 +3914,13 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) {
       PushDefUseChildren(I, Worklist);
     }
   }
-  return Pair.first->second;
+
+  // Re-lookup the insert position, since the call to
+  // ComputeBackedgeTakenCount above could result in a
+  // recursive call to getBackedgeTakenInfo (on a different
+  // loop), which would invalidate the iterator computed
+  // earlier.
+  return BackedgeTakenCounts.find(L)->second = Result;
 }
 
 /// forgetLoop - This method should be called by the client when it has
@@ -4426,7 +4504,7 @@ Constant *
 ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN,
                                                    const APInt &BEs,
                                                    const Loop *L) {
-  std::map<PHINode*, Constant*>::const_iterator I =
+  DenseMap<PHINode*, Constant*>::const_iterator I =
     ConstantEvolutionLoopExitValue.find(PN);
   if (I != ConstantEvolutionLoopExitValue.end())
     return I->second;
@@ -4694,9 +4772,15 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) {
         for (++i; i != e; ++i)
           NewOps.push_back(getSCEVAtScope(AddRec->getOperand(i), L));
 
-        AddRec = cast<SCEVAddRecExpr>(
+        const SCEV *FoldedRec =
           getAddRecExpr(NewOps, AddRec->getLoop(),
-                        AddRec->getNoWrapFlags(SCEV::FlagNW)));
+                        AddRec->getNoWrapFlags(SCEV::FlagNW));
+        AddRec = dyn_cast<SCEVAddRecExpr>(FoldedRec);
+        // The addrec may be folded to a nonrecurrence, for example, if the
+        // induction variable is multiplied by zero after constant folding. Go
+        // ahead and return the folded value.
+        if (!AddRec)
+          return FoldedRec;
         break;
       }
diff --git a/lib/Analysis/TypeBasedAliasAnalysis.cpp b/lib/Analysis/TypeBasedAliasAnalysis.cpp
index 40e18ab..0faf139 100644
--- a/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -31,7 +31,7 @@
 //
 // The second field identifies the type's parent node in the tree, or
 // is null or omitted for a root node. A type is considered to alias
-// all of its decendents and all of its ancestors in the tree. 
Also, +// all of its descendants and all of its ancestors in the tree. Also, // a type is considered to alias all types in other trees, so that // bitcode produced from multiple front-ends is handled conservatively. // @@ -59,6 +59,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Passes.h" +#include "llvm/Constants.h" #include "llvm/LLVMContext.h" #include "llvm/Module.h" #include "llvm/Metadata.h" diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index a8117e6..dab5aeb 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -131,8 +131,18 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, } return; } + + if (Argument *A = dyn_cast<Argument>(V)) { + // Get alignment information off byval arguments if specified in the IR. + if (A->hasByValAttr()) + if (unsigned Align = A->getParamAlignment()) + KnownZero = Mask & APInt::getLowBitsSet(BitWidth, + CountTrailingZeros_32(Align)); + return; + } - KnownZero.clearAllBits(); KnownOne.clearAllBits(); // Start out not knowing anything. + // Start out not knowing anything. + KnownZero.clearAllBits(); KnownOne.clearAllBits(); if (Depth == MaxDepth || Mask == 0) return; // Limit search depth. @@ -670,6 +680,10 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); break; } + case Intrinsic::x86_sse42_crc32_64_8: + case Intrinsic::x86_sse42_crc32_64_64: + KnownZero = APInt::getHighBitsSet(64, 32); + break; } } break; @@ -1328,7 +1342,7 @@ static Value *BuildSubAggregate(Value *From, Value* To, const Type *IndexedType, break; } } - // If we succesfully found a value for each of our subaggregates + // If we successfully found a value for each of our subaggregates if (To) return To; } @@ -1757,7 +1771,7 @@ llvm::GetUnderlyingObject(Value *V, const TargetData *TD, unsigned MaxLookup) { } else { // See if InstructionSimplify knows any relevant tricks. if (Instruction *I = dyn_cast<Instruction>(V)) - // TODO: Aquire a DominatorTree and use it. + // TODO: Acquire a DominatorTree and use it. if (Value *Simplified = SimplifyInstruction(I, TD, 0)) { V = Simplified; continue; diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp index 857fa1e..85defca 100644 --- a/lib/AsmParser/LLLexer.cpp +++ b/lib/AsmParser/LLLexer.cpp @@ -308,16 +308,8 @@ lltok::Kind LLLexer::LexAt() { } // Handle GlobalVarName: @[-a-zA-Z$._][-a-zA-Z$._0-9]* - if (isalpha(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' || - CurPtr[0] == '.' || CurPtr[0] == '_') { - ++CurPtr; - while (isalnum(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' || - CurPtr[0] == '.' || CurPtr[0] == '_') - ++CurPtr; - - StrVal.assign(TokStart+1, CurPtr); // Skip @ + if (ReadVarName()) return lltok::GlobalVar; - } // Handle GlobalVarID: @[0-9]+ if (isdigit(CurPtr[0])) { @@ -334,6 +326,39 @@ lltok::Kind LLLexer::LexAt() { return lltok::Error; } +/// ReadString - Read a string until the closing quote. +lltok::Kind LLLexer::ReadString(lltok::Kind kind) { + const char *Start = CurPtr; + while (1) { + int CurChar = getNextChar(); + + if (CurChar == EOF) { + Error("end of file in string constant"); + return lltok::Error; + } + if (CurChar == '"') { + StrVal.assign(Start, CurPtr-1); + UnEscapeLexed(StrVal); + return kind; + } + } +} + +/// ReadVarName - Read the rest of a token containing a variable name. +bool LLLexer::ReadVarName() { + const char *NameStart = CurPtr; + if (isalpha(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' || + CurPtr[0] == '.' 
|| CurPtr[0] == '_') { + ++CurPtr; + while (isalnum(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' || + CurPtr[0] == '.' || CurPtr[0] == '_') + ++CurPtr; + + StrVal.assign(NameStart, CurPtr); + return true; + } + return false; +} /// LexPercent - Lex all tokens that start with a % character: /// LocalVar ::= %\"[^\"]*\" @@ -343,33 +368,12 @@ lltok::Kind LLLexer::LexPercent() { // Handle LocalVarName: %\"[^\"]*\" if (CurPtr[0] == '"') { ++CurPtr; - - while (1) { - int CurChar = getNextChar(); - - if (CurChar == EOF) { - Error("end of file in string constant"); - return lltok::Error; - } - if (CurChar == '"') { - StrVal.assign(TokStart+2, CurPtr-1); - UnEscapeLexed(StrVal); - return lltok::LocalVar; - } - } + return ReadString(lltok::LocalVar); } // Handle LocalVarName: %[-a-zA-Z$._][-a-zA-Z$._0-9]* - if (isalpha(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' || - CurPtr[0] == '.' || CurPtr[0] == '_') { - ++CurPtr; - while (isalnum(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' || - CurPtr[0] == '.' || CurPtr[0] == '_') - ++CurPtr; - - StrVal.assign(TokStart+1, CurPtr); // Skip % + if (ReadVarName()) return lltok::LocalVar; - } // Handle LocalVarID: %[0-9]+ if (isdigit(CurPtr[0])) { @@ -390,38 +394,16 @@ lltok::Kind LLLexer::LexPercent() { /// QuoteLabel "[^"]+": /// StringConstant "[^"]*" lltok::Kind LLLexer::LexQuote() { - while (1) { - int CurChar = getNextChar(); - - if (CurChar == EOF) { - Error("end of file in quoted string"); - return lltok::Error; - } - - if (CurChar != '"') continue; - - if (CurPtr[0] != ':') { - StrVal.assign(TokStart+1, CurPtr-1); - UnEscapeLexed(StrVal); - return lltok::StringConstant; - } + lltok::Kind kind = ReadString(lltok::StringConstant); + if (kind == lltok::Error || kind == lltok::Eof) + return kind; + if (CurPtr[0] == ':') { ++CurPtr; - StrVal.assign(TokStart+1, CurPtr-2); - UnEscapeLexed(StrVal); - return lltok::LabelStr; + kind = lltok::LabelStr; } -} -static bool JustWhitespaceNewLine(const char *&Ptr) { - const char *ThisPtr = Ptr; - while (*ThisPtr == ' ' || *ThisPtr == '\t') - ++ThisPtr; - if (*ThisPtr == '\n' || *ThisPtr == '\r') { - Ptr = ThisPtr; - return true; - } - return false; + return kind; } /// LexExclaim: @@ -429,13 +411,15 @@ static bool JustWhitespaceNewLine(const char *&Ptr) { /// ! lltok::Kind LLLexer::LexExclaim() { // Lex a metadata name as a MetadataVar. - if (isalpha(CurPtr[0])) { + if (isalpha(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' || + CurPtr[0] == '.' || CurPtr[0] == '_' || CurPtr[0] == '\\') { ++CurPtr; while (isalnum(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' || - CurPtr[0] == '.' || CurPtr[0] == '_') + CurPtr[0] == '.' || CurPtr[0] == '_' || CurPtr[0] == '\\') ++CurPtr; StrVal.assign(TokStart+1, CurPtr); // Skip ! + UnEscapeLexed(StrVal); return lltok::MetadataVar; } return lltok::exclaim; @@ -565,6 +549,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(nest); KEYWORD(readnone); KEYWORD(readonly); + KEYWORD(uwtable); KEYWORD(inlinehint); KEYWORD(noinline); @@ -576,6 +561,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(noimplicitfloat); KEYWORD(naked); KEYWORD(hotpatch); + KEYWORD(nonlazybind); KEYWORD(type); KEYWORD(opaque); @@ -604,26 +590,6 @@ lltok::Kind LLLexer::LexIdentifier() { TYPEKEYWORD("x86_mmx", Type::getX86_MMXTy(Context)); #undef TYPEKEYWORD - // Handle special forms for autoupgrading. Drop these in LLVM 3.0. This is - // to avoid conflicting with the sext/zext instructions, below. 
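The ReadString and ReadVarName helpers introduced above fold three copies of the same scanning loop into one place; LexAt and LexPercent now differ only in the token kind they return. The shape of that refactor as a self-contained sketch (simplified character handling; these names are illustrative, not the LLLexer API):

    #include <cctype>
    #include <string>

    // One scanner for the shared "[-a-zA-Z$._][-a-zA-Z$._0-9]*" name shape.
    static bool isNameChar(char C, bool First) {
      if (isalpha((unsigned char)C) || C == '-' || C == '$' ||
          C == '.' || C == '_')
        return true;
      return !First && isdigit((unsigned char)C);
    }

    // Read a name starting at Ptr into Out, advancing Ptr past it.
    // Returns false, leaving Ptr untouched, if no name starts here.
    static bool readVarName(const char *&Ptr, std::string &Out) {
      if (!isNameChar(Ptr[0], true))
        return false;
      const char *Start = Ptr;
      do
        ++Ptr;
      while (isNameChar(Ptr[0], false));
      Out.assign(Start, Ptr);
      return true;
    }

The deletions that follow are a separate cleanup: the pre-3.0 autoupgrade forms (sext/zext/malloc/free accepted as instruction keywords) are dropped outright.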
- if (Len == 4 && !memcmp(StartChar, "sext", 4)) { - // Scan CurPtr ahead, seeing if there is just whitespace before the newline. - if (JustWhitespaceNewLine(CurPtr)) - return lltok::kw_signext; - } else if (Len == 4 && !memcmp(StartChar, "zext", 4)) { - // Scan CurPtr ahead, seeing if there is just whitespace before the newline. - if (JustWhitespaceNewLine(CurPtr)) - return lltok::kw_zeroext; - } else if (Len == 6 && !memcmp(StartChar, "malloc", 6)) { - // FIXME: Remove in LLVM 3.0. - // Autoupgrade malloc instruction. - return lltok::kw_malloc; - } else if (Len == 4 && !memcmp(StartChar, "free", 4)) { - // FIXME: Remove in LLVM 3.0. - // Autoupgrade malloc instruction. - return lltok::kw_free; - } - // Keywords for instructions. #define INSTKEYWORD(STR, Enum) \ if (Len == strlen(#STR) && !memcmp(StartChar, #STR, strlen(#STR))) { \ @@ -695,14 +661,6 @@ lltok::Kind LLLexer::LexIdentifier() { return lltok::kw_cc; } - // If this starts with "call", return it as CALL. This is to support old - // broken .ll files. FIXME: remove this with LLVM 3.0. - if (CurPtr-TokStart > 4 && !memcmp(TokStart, "call", 4)) { - CurPtr = TokStart+4; - UIntVal = Instruction::Call; - return lltok::kw_call; - } - // Finally, if this isn't known, return an error. CurPtr = TokStart+1; return lltok::Error; diff --git a/lib/AsmParser/LLLexer.h b/lib/AsmParser/LLLexer.h index 09ae801..4fe705e 100644 --- a/lib/AsmParser/LLLexer.h +++ b/lib/AsmParser/LLLexer.h @@ -71,6 +71,9 @@ namespace llvm { int getNextChar(); void SkipLineComment(); + lltok::Kind ReadString(lltok::Kind kind); + bool ReadVarName(); + lltok::Kind LexIdentifier(); lltok::Kind LexDigitOrNegative(); lltok::Kind LexPositive(); diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp index 0c3237a..fa1d97d 100644 --- a/lib/AsmParser/LLParser.cpp +++ b/lib/AsmParser/LLParser.cpp @@ -59,24 +59,6 @@ bool LLParser::ValidateEndOfModule() { } - // Update auto-upgraded malloc calls to "malloc". - // FIXME: Remove in LLVM 3.0. - if (MallocF) { - MallocF->setName("malloc"); - // If setName() does not set the name to "malloc", then there is already a - // declaration of "malloc". In that case, iterate over all calls to MallocF - // and get them to call the declared "malloc" instead. - if (MallocF->getName() != "malloc") { - Constant *RealMallocF = M->getFunction("malloc"); - if (RealMallocF->getType() != MallocF->getType()) - RealMallocF = ConstantExpr::getBitCast(RealMallocF, MallocF->getType()); - MallocF->replaceAllUsesWith(RealMallocF); - MallocF->eraseFromParent(); - MallocF = NULL; - } - } - - // If there are entries in ForwardRefBlockAddresses at this point, they are // references after the function was defined. Resolve those now. while (!ForwardRefBlockAddresses.empty()) { @@ -176,7 +158,6 @@ bool LLParser::ParseTopLevelEntities() { switch (Lex.getKind()) { default: return TokError("expected top-level entity"); case lltok::Eof: return false; - //case lltok::kw_define: case lltok::kw_declare: if (ParseDeclare()) return true; break; case lltok::kw_define: if (ParseDefine()) return true; break; case lltok::kw_module: if (ParseModuleAsm()) return true; break; @@ -514,7 +495,7 @@ bool LLParser::ParseMDNodeID(MDNode *&Result) { if (Result) return false; // Otherwise, create MDNode forward reference. 
- MDNode *FwdNode = MDNode::getTemporary(Context, 0, 0); + MDNode *FwdNode = MDNode::getTemporary(Context, ArrayRef<Value*>()); ForwardRefMDNodes[MID] = std::make_pair(FwdNode, Lex.getLoc()); if (NumberedMetadata.size() <= MID) @@ -572,7 +553,7 @@ bool LLParser::ParseStandaloneMetadata() { ParseToken(lltok::rbrace, "expected end of metadata node")) return true; - MDNode *Init = MDNode::get(Context, Elts.data(), Elts.size()); + MDNode *Init = MDNode::get(Context, Elts); // See if this was forward referenced, if so, handle it. std::map<unsigned, std::pair<TrackingVH<MDNode>, LocTy> >::iterator @@ -931,33 +912,23 @@ bool LLParser::ParseOptionalAddrSpace(unsigned &AddrSpace) { /// ParseOptionalAttrs - Parse a potentially empty attribute list. AttrKind /// indicates what kind of attribute list this is: 0: function arg, 1: result, /// 2: function attr. -/// 3: function arg after value: FIXME: REMOVE IN LLVM 3.0 bool LLParser::ParseOptionalAttrs(unsigned &Attrs, unsigned AttrKind) { Attrs = Attribute::None; LocTy AttrLoc = Lex.getLoc(); while (1) { switch (Lex.getKind()) { - case lltok::kw_sext: - case lltok::kw_zext: - // Treat these as signext/zeroext if they occur in the argument list after - // the value, as in "call i8 @foo(i8 10 sext)". If they occur before the - // value, as in "call i8 @foo(i8 sext (" then it is part of a constant - // expr. - // FIXME: REMOVE THIS IN LLVM 3.0 - if (AttrKind == 3) { - if (Lex.getKind() == lltok::kw_sext) - Attrs |= Attribute::SExt; - else - Attrs |= Attribute::ZExt; - break; - } - // FALL THROUGH. default: // End of attributes. if (AttrKind != 2 && (Attrs & Attribute::FunctionOnly)) return Error(AttrLoc, "invalid use of function-only attribute"); - if (AttrKind != 0 && AttrKind != 3 && (Attrs & Attribute::ParameterOnly)) + // As a hack, we allow "align 2" on functions as a synonym for + // "alignstack 2". + if (AttrKind == 2 && + (Attrs & ~(Attribute::FunctionOnly | Attribute::Alignment))) + return Error(AttrLoc, "invalid use of attribute on a function"); + + if (AttrKind != 0 && (Attrs & Attribute::ParameterOnly)) return Error(AttrLoc, "invalid use of parameter-only attribute"); return false; @@ -972,6 +943,7 @@ bool LLParser::ParseOptionalAttrs(unsigned &Attrs, unsigned AttrKind) { case lltok::kw_noreturn: Attrs |= Attribute::NoReturn; break; case lltok::kw_nounwind: Attrs |= Attribute::NoUnwind; break; + case lltok::kw_uwtable: Attrs |= Attribute::UWTable; break; case lltok::kw_noinline: Attrs |= Attribute::NoInline; break; case lltok::kw_readnone: Attrs |= Attribute::ReadNone; break; case lltok::kw_readonly: Attrs |= Attribute::ReadOnly; break; @@ -984,6 +956,7 @@ bool LLParser::ParseOptionalAttrs(unsigned &Attrs, unsigned AttrKind) { case lltok::kw_noimplicitfloat: Attrs |= Attribute::NoImplicitFloat; break; case lltok::kw_naked: Attrs |= Attribute::Naked; break; case lltok::kw_hotpatch: Attrs |= Attribute::Hotpatch; break; + case lltok::kw_nonlazybind: Attrs |= Attribute::NonLazyBind; break; case lltok::kw_alignstack: { unsigned Alignment; @@ -1494,11 +1467,7 @@ bool LLParser::ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList, return true; // Otherwise, handle normal operands. - if (ParseOptionalAttrs(ArgAttrs1, 0) || - ParseValue(ArgTy, V, PFS) || - // FIXME: Should not allow attributes after the argument, remove this - // in LLVM 3.0. 
- ParseOptionalAttrs(ArgAttrs2, 3)) + if (ParseOptionalAttrs(ArgAttrs1, 0) || ParseValue(ArgTy, V, PFS)) return true; ArgList.push_back(ParamInfo(ArgLoc, V, ArgAttrs1|ArgAttrs2)); } @@ -2498,7 +2467,7 @@ bool LLParser::ParseMetadataListValue(ValID &ID, PerFunctionState *PFS) { ParseToken(lltok::rbrace, "expected end of metadata node")) return true; - ID.MDNodeVal = MDNode::get(Context, Elts.data(), Elts.size()); + ID.MDNodeVal = MDNode::get(Context, Elts); ID.Kind = ValID::t_MDNode; return false; } @@ -2761,13 +2730,6 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { // and do semantic checks. std::vector<const Type*> ParamTypeList; SmallVector<AttributeWithIndex, 8> Attrs; - // FIXME : In 3.0, stop accepting zext, sext and inreg as optional function - // attributes. - unsigned ObsoleteFuncAttrs = Attribute::ZExt|Attribute::SExt|Attribute::InReg; - if (FuncAttrs & ObsoleteFuncAttrs) { - RetAttrs |= FuncAttrs & ObsoleteFuncAttrs; - FuncAttrs &= ~ObsoleteFuncAttrs; - } if (RetAttrs != Attribute::None) Attrs.push_back(AttributeWithIndex::get(0, RetAttrs)); @@ -3003,7 +2965,6 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB, case lltok::kw_sub: case lltok::kw_mul: case lltok::kw_shl: { - LocTy ModifierLoc = Lex.getLoc(); bool NUW = EatIfPresent(lltok::kw_nuw); bool NSW = EatIfPresent(lltok::kw_nsw); if (!NUW) NUW = EatIfPresent(lltok::kw_nuw); @@ -3062,8 +3023,6 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB, case lltok::kw_tail: return ParseCall(Inst, PFS, true); // Memory. case lltok::kw_alloca: return ParseAlloc(Inst, PFS); - case lltok::kw_malloc: return ParseAlloc(Inst, PFS, BB, false); - case lltok::kw_free: return ParseFree(Inst, PFS, BB); case lltok::kw_load: return ParseLoad(Inst, PFS, false); case lltok::kw_store: return ParseStore(Inst, PFS, false); case lltok::kw_volatile: @@ -3341,14 +3300,6 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) { Value *Callee; if (ConvertValIDToValue(PFTy, CalleeID, Callee, &PFS)) return true; - // FIXME: In LLVM 3.0, stop accepting zext, sext and inreg as optional - // function attributes. - unsigned ObsoleteFuncAttrs = Attribute::ZExt|Attribute::SExt|Attribute::InReg; - if (FnAttrs & ObsoleteFuncAttrs) { - RetAttrs |= FnAttrs & ObsoleteFuncAttrs; - FnAttrs &= ~ObsoleteFuncAttrs; - } - // Set up the Attributes for the function. SmallVector<AttributeWithIndex, 8> Attrs; if (RetAttrs != Attribute::None) @@ -3686,14 +3637,6 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, Value *Callee; if (ConvertValIDToValue(PFTy, CalleeID, Callee, &PFS)) return true; - // FIXME: In LLVM 3.0, stop accepting zext, sext and inreg as optional - // function attributes. - unsigned ObsoleteFuncAttrs = Attribute::ZExt|Attribute::SExt|Attribute::InReg; - if (FnAttrs & ObsoleteFuncAttrs) { - RetAttrs |= FnAttrs & ObsoleteFuncAttrs; - FnAttrs &= ~ObsoleteFuncAttrs; - } - // Set up the Attributes for the function. SmallVector<AttributeWithIndex, 8> Attrs; if (RetAttrs != Attribute::None) @@ -3743,10 +3686,8 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, //===----------------------------------------------------------------------===// /// ParseAlloc -/// ::= 'malloc' Type (',' TypeAndValue)? (',' OptionalInfo)? /// ::= 'alloca' Type (',' TypeAndValue)? (',' OptionalInfo)? 
-int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS, - BasicBlock* BB, bool isAlloca) { +int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) { PATypeHolder Ty(Type::getVoidTy(Context)); Value *Size = 0; LocTy SizeLoc; @@ -3769,37 +3710,8 @@ int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS, if (Size && !Size->getType()->isIntegerTy()) return Error(SizeLoc, "element count must have integer type"); - if (isAlloca) { - Inst = new AllocaInst(Ty, Size, Alignment); - return AteExtraComma ? InstExtraComma : InstNormal; - } - - // Autoupgrade old malloc instruction to malloc call. - // FIXME: Remove in LLVM 3.0. - if (Size && !Size->getType()->isIntegerTy(32)) - return Error(SizeLoc, "element count must be i32"); - const Type *IntPtrTy = Type::getInt32Ty(Context); - Constant *AllocSize = ConstantExpr::getSizeOf(Ty); - AllocSize = ConstantExpr::getTruncOrBitCast(AllocSize, IntPtrTy); - if (!MallocF) - // Prototype malloc as "void *(int32)". - // This function is renamed as "malloc" in ValidateEndOfModule(). - MallocF = cast<Function>( - M->getOrInsertFunction("", Type::getInt8PtrTy(Context), IntPtrTy, NULL)); - Inst = CallInst::CreateMalloc(BB, IntPtrTy, Ty, AllocSize, Size, MallocF); -return AteExtraComma ? InstExtraComma : InstNormal; -} - -/// ParseFree -/// ::= 'free' TypeAndValue -bool LLParser::ParseFree(Instruction *&Inst, PerFunctionState &PFS, - BasicBlock* BB) { - Value *Val; LocTy Loc; - if (ParseTypeAndValue(Val, Loc, PFS)) return true; - if (!Val->getType()->isPointerTy()) - return Error(Loc, "operand to free must be a pointer"); - Inst = CallInst::CreateFree(Val, BB); - return false; + Inst = new AllocaInst(Ty, Size, Alignment); + return AteExtraComma ? InstExtraComma : InstNormal; } /// ParseLoad diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h index 93e7f77..bbc641c 100644 --- a/lib/AsmParser/LLParser.h +++ b/lib/AsmParser/LLParser.h @@ -131,11 +131,10 @@ namespace llvm { std::map<ValID, std::vector<std::pair<ValID, GlobalValue*> > > ForwardRefBlockAddresses; - Function *MallocF; public: LLParser(MemoryBuffer *F, SourceMgr &SM, SMDiagnostic &Err, Module *m) : Context(m->getContext()), Lex(F, SM, Err, m->getContext()), - M(m), MallocF(NULL) {} + M(m) {} bool Run(); LLVMContext& getContext() { return Context; } @@ -359,9 +358,7 @@ namespace llvm { bool ParseShuffleVector(Instruction *&I, PerFunctionState &PFS); int ParsePHI(Instruction *&I, PerFunctionState &PFS); bool ParseCall(Instruction *&I, PerFunctionState &PFS, bool isTail); - int ParseAlloc(Instruction *&I, PerFunctionState &PFS, - BasicBlock *BB = 0, bool isAlloca = true); - bool ParseFree(Instruction *&I, PerFunctionState &PFS, BasicBlock *BB); + int ParseAlloc(Instruction *&I, PerFunctionState &PFS); int ParseLoad(Instruction *&I, PerFunctionState &PFS, bool isVolatile); int ParseStore(Instruction *&I, PerFunctionState &PFS, bool isVolatile); bool ParseGetResult(Instruction *&I, PerFunctionState &PFS); diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h index 576da19..ea01264 100644 --- a/lib/AsmParser/LLToken.h +++ b/lib/AsmParser/LLToken.h @@ -87,6 +87,7 @@ namespace lltok { kw_nest, kw_readnone, kw_readonly, + kw_uwtable, kw_inlinehint, kw_noinline, @@ -98,6 +99,7 @@ namespace lltok { kw_noimplicitfloat, kw_naked, kw_hotpatch, + kw_nonlazybind, kw_type, kw_opaque, @@ -120,7 +122,7 @@ namespace lltok { kw_ret, kw_br, kw_switch, kw_indirectbr, kw_invoke, kw_unwind, kw_unreachable, - kw_malloc, kw_alloca, kw_free, kw_load, kw_store, 
kw_getelementptr,
+  kw_alloca, kw_load, kw_store, kw_getelementptr,

   kw_extractelement, kw_insertelement, kw_shufflevector, kw_getresult,
   kw_extractvalue, kw_insertvalue, kw_blockaddress,
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 8223f76..bc995ae 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -301,8 +301,7 @@ void BitcodeReaderValueList::ResolveConstantForwardRefs() {
       NewC = ConstantVector::get(NewOps);
     } else {
       assert(isa<ConstantExpr>(UserC) && "Must be a ConstantExpr.");
-      NewC = cast<ConstantExpr>(UserC)->getWithOperands(&NewOps[0],
-                                                        NewOps.size());
+      NewC = cast<ConstantExpr>(UserC)->getWithOperands(NewOps);
     }

     UserC->replaceAllUsesWith(NewC);
@@ -350,7 +349,7 @@ Value *BitcodeReaderMDValueList::getValueFwdRef(unsigned Idx) {
   }

   // Create and return a placeholder, which will later be RAUW'd.
-  Value *V = MDNode::getTemporary(Context, 0, 0);
+  Value *V = MDNode::getTemporary(Context, ArrayRef<Value*>());
   MDValuePtrs[Idx] = V;
   return V;
 }
@@ -844,9 +843,7 @@ bool BitcodeReader::ParseMetadata() {
         else
           Elts.push_back(NULL);
       }
-      Value *V = MDNode::getWhenValsUnresolved(Context,
-                                               Elts.data(), Elts.size(),
-                                               IsFunctionLocal);
+      Value *V = MDNode::getWhenValsUnresolved(Context, Elts, IsFunctionLocal);
       IsFunctionLocal = false;
       MDValueList.AssignValue(V, NextMDValueNo++);
       break;
@@ -1591,8 +1588,18 @@ bool BitcodeReader::ParseBitcodeInto(Module *M) {
   while (!Stream.AtEndOfStream()) {
     unsigned Code = Stream.ReadCode();

-    if (Code != bitc::ENTER_SUBBLOCK)
+    if (Code != bitc::ENTER_SUBBLOCK) {
+
+      // The ranlib in Xcode 4 will align archive members by appending newlines
+      // to the end of them. If the file size is a multiple of 4 but not 8, we
+      // have to read and ignore these final 4 bytes :-(
+      if (Stream.GetAbbrevIDWidth() == 2 && Code == 2 &&
+          Stream.Read(6) == 2 && Stream.Read(24) == 0xa0a0a &&
+          Stream.AtEndOfStream())
+        return false;
+
       return Error("Invalid record at top-level");
+    }

     unsigned BlockID = Stream.ReadSubBlockID();
@@ -1845,7 +1852,6 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
         FunctionBBs[i] = BasicBlock::Create(Context, "", F);
       CurBB = FunctionBBs[0];
       continue;
-
     case bitc::FUNC_CODE_DEBUG_LOC_AGAIN:  // DEBUG_LOC_AGAIN
       // This record indicates that the last instruction is at the same
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index e34137f..bc218b3 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Operator.h"
 #include "llvm/TypeSymbolTable.h"
 #include "llvm/ValueSymbolTable.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
@@ -100,8 +101,6 @@ static unsigned GetEncodedBinaryOpcode(unsigned Opcode) {
   }
 }

-
-
 static void WriteStringRecord(unsigned Code, const std::string &Str,
                               unsigned AbbrevToUse, BitstreamWriter &Stream) {
   SmallVector<unsigned, 64> Vals;
@@ -447,7 +446,6 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
     Vals.clear();
   }

-
   // Emit the alias information.
for (Module::const_alias_iterator AI = M->alias_begin(), E = M->alias_end(); AI != E; ++AI) { @@ -871,8 +869,6 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal, break; } } else if (const BlockAddress *BA = dyn_cast<BlockAddress>(C)) { - assert(BA->getFunction() == BA->getBasicBlock()->getParent() && - "Malformed blockaddress"); Code = bitc::CST_CODE_BLOCKADDRESS; Record.push_back(VE.getTypeID(BA->getFunction()->getType())); Record.push_back(VE.getValueID(BA->getFunction())); @@ -1514,9 +1510,9 @@ static void WriteModule(const Module *M, BitstreamWriter &Stream) { WriteModuleMetadata(M, VE, Stream); // Emit function bodies. - for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) - if (!I->isDeclaration()) - WriteFunction(*I, VE, Stream); + for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F) + if (!F->isDeclaration()) + WriteFunction(*F, VE, Stream); // Emit metadata. WriteModuleMetadataStore(M, Stream); @@ -1548,40 +1544,7 @@ enum { DarwinBCHeaderSize = 5*4 }; -/// isARMTriplet - Return true if the triplet looks like: -/// arm-*, thumb-*, armv[0-9]-*, thumbv[0-9]-*, armv5te-*, or armv6t2-*. -static bool isARMTriplet(const std::string &TT) { - size_t Pos = 0; - size_t Size = TT.size(); - if (Size >= 6 && - TT[0] == 't' && TT[1] == 'h' && TT[2] == 'u' && - TT[3] == 'm' && TT[4] == 'b') - Pos = 5; - else if (Size >= 4 && TT[0] == 'a' && TT[1] == 'r' && TT[2] == 'm') - Pos = 3; - else - return false; - - if (TT[Pos] == '-') - return true; - else if (TT[Pos] == 'v') { - if (Size >= Pos+4 && - TT[Pos+1] == '6' && TT[Pos+2] == 't' && TT[Pos+3] == '2') - return true; - else if (Size >= Pos+4 && - TT[Pos+1] == '5' && TT[Pos+2] == 't' && TT[Pos+3] == 'e') - return true; - } else - return false; - while (++Pos < Size && TT[Pos] != '-') { - if (!isdigit(TT[Pos])) - return false; - } - return true; -} - -static void EmitDarwinBCHeader(BitstreamWriter &Stream, - const std::string &TT) { +static void EmitDarwinBCHeader(BitstreamWriter &Stream, const Triple &TT) { unsigned CPUType = ~0U; // Match x86_64-*, i[3-9]86-*, powerpc-*, powerpc64-*, arm-*, thumb-*, @@ -1595,16 +1558,16 @@ static void EmitDarwinBCHeader(BitstreamWriter &Stream, DARWIN_CPU_TYPE_POWERPC = 18 }; - if (TT.find("x86_64-") == 0) + Triple::ArchType Arch = TT.getArch(); + if (Arch == Triple::x86_64) CPUType = DARWIN_CPU_TYPE_X86 | DARWIN_CPU_ARCH_ABI64; - else if (TT.size() >= 5 && TT[0] == 'i' && TT[2] == '8' && TT[3] == '6' && - TT[4] == '-' && TT[1] - '3' < 6) + else if (Arch == Triple::x86) CPUType = DARWIN_CPU_TYPE_X86; - else if (TT.find("powerpc-") == 0) + else if (Arch == Triple::ppc) CPUType = DARWIN_CPU_TYPE_POWERPC; - else if (TT.find("powerpc64-") == 0) + else if (Arch == Triple::ppc64) CPUType = DARWIN_CPU_TYPE_POWERPC | DARWIN_CPU_ARCH_ABI64; - else if (isARMTriplet(TT)) + else if (Arch == Triple::arm || Arch == Triple::thumb) CPUType = DARWIN_CPU_TYPE_ARM; // Traditional Bitcode starts after header. @@ -1650,11 +1613,9 @@ void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out) { void llvm::WriteBitcodeToStream(const Module *M, BitstreamWriter &Stream) { // If this is darwin or another generic macho target, emit a file header and // trailer if needed. 
- bool isMacho = - M->getTargetTriple().find("-darwin") != std::string::npos || - M->getTargetTriple().find("-macho") != std::string::npos; - if (isMacho) - EmitDarwinBCHeader(Stream, M->getTargetTriple()); + Triple TT(M->getTargetTriple()); + if (TT.isOSDarwin()) + EmitDarwinBCHeader(Stream, TT); // Emit the file header. Stream.Emit((unsigned)'B', 8); @@ -1667,6 +1628,6 @@ void llvm::WriteBitcodeToStream(const Module *M, BitstreamWriter &Stream) { // Emit the module. WriteModule(M, Stream); - if (isMacho) + if (TT.isOSDarwin()) EmitDarwinBCTrailer(Stream, Stream.getBuffer().size()); } diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp index 05078ca..5138c3c 100644 --- a/lib/Bitcode/Writer/ValueEnumerator.cpp +++ b/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -363,7 +363,7 @@ void ValueEnumerator::EnumerateValue(const Value *V) { // Initializers for globals are handled explicitly elsewhere. } else if (isa<ConstantArray>(C) && cast<ConstantArray>(C)->isString()) { // Do not enumerate the initializers for an array of simple characters. - // The initializers just polute the value table, and we emit the strings + // The initializers just pollute the value table, and we emit the strings // specially. } else if (C->getNumOperands()) { // If a constant has operands, enumerate them. This makes sure that if a @@ -423,7 +423,7 @@ void ValueEnumerator::EnumerateOperandType(const Value *V) { // This constant may have operands, make sure to enumerate the types in // them. for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) { - const User *Op = C->getOperand(i); + const Value *Op = C->getOperand(i); // Don't enumerate basic blocks here, this happens as operands to // blockaddress. @@ -452,7 +452,6 @@ void ValueEnumerator::EnumerateAttributes(const AttrListPtr &PAL) { } } - void ValueEnumerator::incorporateFunction(const Function &F) { InstructionCount = 0; NumModuleValues = Values.size(); diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp index b520d8f..c23351b 100644 --- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -16,6 +16,7 @@ #define DEBUG_TYPE "post-RA-sched" #include "AggressiveAntiDepBreaker.h" +#include "RegisterClassInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstr.h" @@ -114,12 +115,13 @@ bool AggressiveAntiDepState::IsLive(unsigned Reg) AggressiveAntiDepBreaker:: AggressiveAntiDepBreaker(MachineFunction& MFi, + const RegisterClassInfo &RCI, TargetSubtarget::RegClassVector& CriticalPathRCs) : AntiDepBreaker(), MF(MFi), MRI(MF.getRegInfo()), TII(MF.getTarget().getInstrInfo()), TRI(MF.getTarget().getRegisterInfo()), - AllocatableSet(TRI->getAllocatableSet(MF)), + RegClassInfo(RCI), State(NULL) { /* Collect a bitset of all registers that are only broken if they are on the critical path. */ @@ -357,7 +359,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI, RegRefs = State->GetRegRefs(); // Handle dead defs by simulating a last-use of the register just - // after the def. A dead def can occur because the def is truely + // after the def. A dead def can occur because the def is truly // dead, or because only a subregister is live at the def. If we // don't do this the dead def will be incorrectly merged into the // previous def. 
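A few hunks up, BitcodeWriter replaces substring tests on the raw triple string (and the hand-rolled isARMTriplet helper) with structural queries on llvm::Triple. Condensed from that change, with the CPU-type constants copied from the patch (a sketch of the idiom, not the full EmitDarwinBCHeader):

    #include "llvm/ADT/Triple.h"
    using namespace llvm;

    // Structural classification replaces tests like TT.find("x86_64-") == 0,
    // which match only one spelling of the vendor/OS suffix and needed a
    // helper to enumerate the arm/thumb/armv5te/... family by hand.
    static unsigned getDarwinCPUType(const Triple &TT) {
      enum {
        DARWIN_CPU_ARCH_ABI64   = 0x01000000,
        DARWIN_CPU_TYPE_X86     = 7,
        DARWIN_CPU_TYPE_ARM     = 12,
        DARWIN_CPU_TYPE_POWERPC = 18
      };
      switch (TT.getArch()) {
      case Triple::x86_64: return DARWIN_CPU_TYPE_X86 | DARWIN_CPU_ARCH_ABI64;
      case Triple::x86:    return DARWIN_CPU_TYPE_X86;
      case Triple::ppc:    return DARWIN_CPU_TYPE_POWERPC;
      case Triple::ppc64:  return DARWIN_CPU_TYPE_POWERPC | DARWIN_CPU_ARCH_ABI64;
      case Triple::arm:
      case Triple::thumb:  return DARWIN_CPU_TYPE_ARM;
      default:             return ~0U;
      }
    }

The same Triple object then answers the Mach-O question directly (TT.isOSDarwin() in WriteBitcodeToStream) instead of searching the string for "-darwin" or "-macho".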
@@ -618,9 +620,8 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters( const TargetRegisterClass *SuperRC = TRI->getMinimalPhysRegClass(SuperReg, MVT::Other); - const TargetRegisterClass::iterator RB = SuperRC->allocation_order_begin(MF); - const TargetRegisterClass::iterator RE = SuperRC->allocation_order_end(MF); - if (RB == RE) { + ArrayRef<unsigned> Order = RegClassInfo.getOrder(SuperRC); + if (Order.empty()) { DEBUG(dbgs() << "\tEmpty Super Regclass!!\n"); return false; } @@ -628,17 +629,17 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters( DEBUG(dbgs() << "\tFind Registers:"); if (RenameOrder.count(SuperRC) == 0) - RenameOrder.insert(RenameOrderType::value_type(SuperRC, RE)); + RenameOrder.insert(RenameOrderType::value_type(SuperRC, Order.size())); - const TargetRegisterClass::iterator OrigR = RenameOrder[SuperRC]; - const TargetRegisterClass::iterator EndR = ((OrigR == RE) ? RB : OrigR); - TargetRegisterClass::iterator R = OrigR; + unsigned OrigR = RenameOrder[SuperRC]; + unsigned EndR = ((OrigR == Order.size()) ? 0 : OrigR); + unsigned R = OrigR; do { - if (R == RB) R = RE; + if (R == 0) R = Order.size(); --R; - const unsigned NewSuperReg = *R; + const unsigned NewSuperReg = Order[R]; // Don't consider non-allocatable registers - if (!AllocatableSet.test(NewSuperReg)) continue; + if (!RegClassInfo.isAllocatable(NewSuperReg)) continue; // Don't replace a register with itself. if (NewSuperReg == SuperReg) continue; @@ -719,7 +720,9 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies( const std::vector<SUnit>& SUnits, MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, - unsigned InsertPosIndex) { + unsigned InsertPosIndex, + DbgValueVector &DbgValues) { + std::vector<unsigned> &KillIndices = State->GetKillIndices(); std::vector<unsigned> &DefIndices = State->GetDefIndices(); std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>& @@ -817,7 +820,7 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies( DEBUG(dbgs() << "\tAntidep reg: " << TRI->getName(AntiDepReg)); assert(AntiDepReg != 0 && "Anti-dependence on reg0?"); - if (!AllocatableSet.test(AntiDepReg)) { + if (!RegClassInfo.isAllocatable(AntiDepReg)) { // Don't break anti-dependencies on non-allocatable registers. DEBUG(dbgs() << " (non-allocatable)\n"); continue; @@ -923,14 +926,10 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies( // sure to update that as well. const SUnit *SU = MISUnitMap[Q->second.Operand->getParent()]; if (!SU) continue; - for (unsigned i = 0, e = SU->DbgInstrList.size() ; i < e ; ++i) { - MachineInstr *DI = SU->DbgInstrList[i]; - assert (DI->getNumOperands()==3 && DI->getOperand(0).isReg() && - DI->getOperand(0).getReg() - && "Non register dbg_value attached to SUnit!"); - if (DI->getOperand(0).getReg() == AntiDepReg) - DI->getOperand(0).setReg(NewReg); - } + for (DbgValueVector::iterator DVI = DbgValues.begin(), + DVE = DbgValues.end(); DVI != DVE; ++DVI) + if (DVI->second == Q->second.Operand->getParent()) + UpdateDbgValue(DVI->first, AntiDepReg, NewReg); } // We just went back in time and modified history; the diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.h b/lib/CodeGen/AggressiveAntiDepBreaker.h index 9d715cc..e43fe65 100644 --- a/lib/CodeGen/AggressiveAntiDepBreaker.h +++ b/lib/CodeGen/AggressiveAntiDepBreaker.h @@ -30,6 +30,8 @@ #include <map> namespace llvm { +class RegisterClassInfo; + /// Class AggressiveAntiDepState /// Contains all the state necessary for anti-dep breaking. 
class AggressiveAntiDepState { @@ -117,11 +119,7 @@ namespace llvm { MachineRegisterInfo &MRI; const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; - - /// AllocatableSet - The set of allocatable registers. - /// We'll be ignoring anti-dependencies on non-allocatable registers, - /// because they may not be safe to break. - const BitVector AllocatableSet; + const RegisterClassInfo &RegClassInfo; /// CriticalPathSet - The set of registers that should only be /// renamed if they are on the critical path. @@ -133,6 +131,7 @@ namespace llvm { public: AggressiveAntiDepBreaker(MachineFunction& MFi, + const RegisterClassInfo &RCI, TargetSubtarget::RegClassVector& CriticalPathRCs); ~AggressiveAntiDepBreaker(); @@ -146,7 +145,8 @@ namespace llvm { unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits, MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, - unsigned InsertPosIndex); + unsigned InsertPosIndex, + DbgValueVector &DbgValues); /// Observe - Update liveness information to account for the current /// instruction, which will not be scheduled. @@ -157,8 +157,8 @@ namespace llvm { void FinishBlock(); private: - typedef std::map<const TargetRegisterClass *, - TargetRegisterClass::const_iterator> RenameOrderType; + /// Keep track of a position in the allocation order for each regclass. + typedef std::map<const TargetRegisterClass *, unsigned> RenameOrderType; /// IsImplicitDefUse - Return true if MO represents a register /// that is both implicitly used and defined in MI diff --git a/lib/CodeGen/AllocationOrder.cpp b/lib/CodeGen/AllocationOrder.cpp index 20c7625..1005f10 100644 --- a/lib/CodeGen/AllocationOrder.cpp +++ b/lib/CodeGen/AllocationOrder.cpp @@ -15,6 +15,7 @@ //===----------------------------------------------------------------------===// #include "AllocationOrder.h" +#include "RegisterClassInfo.h" #include "VirtRegMap.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -23,8 +24,8 @@ using namespace llvm; // Compare VirtRegMap::getRegAllocPref(). AllocationOrder::AllocationOrder(unsigned VirtReg, const VirtRegMap &VRM, - const BitVector &ReservedRegs) - : Pos(0), Reserved(ReservedRegs) { + const RegisterClassInfo &RegClassInfo) + : Begin(0), End(0), Pos(0), RCI(RegClassInfo), OwnedBegin(false) { const TargetRegisterClass *RC = VRM.getRegInfo().getRegClass(VirtReg); std::pair<unsigned, unsigned> HintPair = VRM.getRegInfo().getRegAllocationHint(VirtReg); @@ -36,33 +37,43 @@ AllocationOrder::AllocationOrder(unsigned VirtReg, if (TargetRegisterInfo::isVirtualRegister(Hint)) Hint = VRM.getPhys(Hint); - // The remaining allocation order may depend on the hint. - tie(Begin, End) = VRM.getTargetRegInfo() - .getAllocationOrder(RC, HintPair.first, Hint, VRM.getMachineFunction()); + // The first hint pair component indicates a target-specific hint. + if (HintPair.first) { + const TargetRegisterInfo &TRI = VRM.getTargetRegInfo(); + // The remaining allocation order may depend on the hint. + ArrayRef<unsigned> Order = + TRI.getRawAllocationOrder(RC, HintPair.first, Hint, + VRM.getMachineFunction()); + if (Order.empty()) + return; - // Target-dependent hints require resolution. - if (HintPair.first) - Hint = VRM.getTargetRegInfo().ResolveRegAllocHint(HintPair.first, Hint, - VRM.getMachineFunction()); + // Copy the allocation order with reserved registers removed. 
+ OwnedBegin = true; + unsigned *P = new unsigned[Order.size()]; + Begin = P; + for (unsigned i = 0; i != Order.size(); ++i) + if (!RCI.isReserved(Order[i])) + *P++ = Order[i]; + End = P; + + // Target-dependent hints require resolution. + Hint = TRI.ResolveRegAllocHint(HintPair.first, Hint, + VRM.getMachineFunction()); + } else { + // If there is no hint or just a normal hint, use the cached allocation + // order from RegisterClassInfo. + ArrayRef<unsigned> O = RCI.getOrder(RC); + Begin = O.begin(); + End = O.end(); + } // The hint must be a valid physreg for allocation. if (Hint && (!TargetRegisterInfo::isPhysicalRegister(Hint) || - !RC->contains(Hint) || ReservedRegs.test(Hint))) + !RC->contains(Hint) || RCI.isReserved(Hint))) Hint = 0; } -unsigned AllocationOrder::next() { - // First take the hint. - if (!Pos) { - Pos = Begin; - if (Hint) - return Hint; - } - // Then look at the order from TRI. - while(Pos != End) { - unsigned Reg = *Pos++; - if (Reg != Hint && !Reserved.test(Reg)) - return Reg; - } - return 0; +AllocationOrder::~AllocationOrder() { + if (OwnedBegin) + delete [] Begin; } diff --git a/lib/CodeGen/AllocationOrder.h b/lib/CodeGen/AllocationOrder.h index 61fd8f8..d1e48a1 100644 --- a/lib/CodeGen/AllocationOrder.h +++ b/lib/CodeGen/AllocationOrder.h @@ -19,15 +19,16 @@ namespace llvm { -class BitVector; +class RegisterClassInfo; class VirtRegMap; class AllocationOrder { const unsigned *Begin; const unsigned *End; const unsigned *Pos; - const BitVector &Reserved; + const RegisterClassInfo &RCI; unsigned Hint; + bool OwnedBegin; public: /// AllocationOrder - Create a new AllocationOrder for VirtReg. @@ -37,12 +38,28 @@ public: /// TargetRegisterInfo::getReservedRegs(). AllocationOrder(unsigned VirtReg, const VirtRegMap &VRM, - const BitVector &ReservedRegs); + const RegisterClassInfo &RegClassInfo); + + ~AllocationOrder(); /// next - Return the next physical register in the allocation order, or 0. /// It is safe to call next again after it returned 0. /// It will keep returning 0 until rewind() is called. - unsigned next(); + unsigned next() { + // First take the hint. + if (!Pos) { + Pos = Begin; + if (Hint) + return Hint; + } + // Then look at the order from TRI. + while (Pos != End) { + unsigned Reg = *Pos++; + if (Reg != Hint) + return Reg; + } + return 0; + } /// rewind - Start over from the beginning. void rewind() { Pos = 0; } diff --git a/lib/CodeGen/AntiDepBreaker.h b/lib/CodeGen/AntiDepBreaker.h index 086b757..df47f98 100644 --- a/lib/CodeGen/AntiDepBreaker.h +++ b/lib/CodeGen/AntiDepBreaker.h @@ -30,6 +30,9 @@ namespace llvm { /// anti-dependencies. class AntiDepBreaker { public: + typedef std::vector<std::pair<MachineInstr *, MachineInstr *> > + DbgValueVector; + virtual ~AntiDepBreaker(); /// Start - Initialize anti-dep breaking for a new basic block. @@ -40,9 +43,10 @@ public: /// the number of anti-dependencies broken. /// virtual unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits, - MachineBasicBlock::iterator Begin, - MachineBasicBlock::iterator End, - unsigned InsertPosIndex) =0; + MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + unsigned InsertPosIndex, + DbgValueVector &DbgValues) = 0; /// Observe - Update liveness information to account for the current /// instruction, which will not be scheduled. @@ -52,6 +56,14 @@ public: /// Finish - Finish anti-dep breaking for a basic block. 
virtual void FinishBlock() =0;
+
+  /// UpdateDbgValue - Update DBG_VALUE if the dependency breaker is updating
+  /// another machine instruction to use NewReg.
+  void UpdateDbgValue(MachineInstr *MI, unsigned OldReg, unsigned NewReg) {
+    assert(MI->isDebugValue() && "MI is not DBG_VALUE!");
+    if (MI && MI->getOperand(0).isReg() && MI->getOperand(0).getReg() == OldReg)
+      MI->getOperand(0).setReg(NewReg);
+  }
 };

 }
diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp
index 0db28a6..5861fa4 100644
--- a/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -52,7 +52,7 @@ void ARMException::EndModule() {
 /// being emitted immediately after the function entry point.
 void ARMException::BeginFunction(const MachineFunction *MF) {
   Asm->OutStreamer.EmitFnStart();
-  if (!Asm->MF->getFunction()->doesNotThrow() || UnwindTablesMandatory)
+  if (Asm->MF->getFunction()->needsUnwindTableEntry())
     Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin",
                                                   Asm->getFunctionNumber()));
 }
@@ -60,7 +60,7 @@ void ARMException::BeginFunction(const MachineFunction *MF) {
 /// EndFunction - Gather and emit post-function exception information.
 ///
 void ARMException::EndFunction() {
-  if (Asm->MF->getFunction()->doesNotThrow() && !UnwindTablesMandatory)
+  if (!Asm->MF->getFunction()->needsUnwindTableEntry())
     Asm->OutStreamer.EmitCantUnwind();
   else {
     Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_end",
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 61f5672..b544ff1 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -35,10 +35,12 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetAsmInfo.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Assembly/Writer.h"
 #include "llvm/ADT/SmallString.h"
@@ -191,22 +193,25 @@ bool AsmPrinter::doInitialization(Module &M) {
   if (MAI->doesSupportDebugInformation())
     DD = new DwarfDebug(this, &M);

-  if (MAI->doesSupportExceptionHandling())
-    switch (MAI->getExceptionHandlingType()) {
-    default:
-    case ExceptionHandling::DwarfTable:
-      DE = new DwarfTableException(this);
-      break;
-    case ExceptionHandling::DwarfCFI:
-      DE = new DwarfCFIException(this);
-      break;
-    case ExceptionHandling::ARM:
-      DE = new ARMException(this);
-      break;
-    }
+  switch (MAI->getExceptionHandlingType()) {
+  case ExceptionHandling::None:
+    return false;
+  case ExceptionHandling::SjLj:
+  case ExceptionHandling::DwarfCFI:
+    DE = new DwarfCFIException(this);
+    return false;
+  case ExceptionHandling::ARM:
+    DE = new ARMException(this);
+    return false;
+  case ExceptionHandling::Win64:
+    DE = new Win64Exception(this);
+    return false;
+  }
+#else
+  return false;
 #endif // ANDROID_TARGET_BUILD

-  return false;
+  llvm_unreachable("Unknown exception type.");
 }

 void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const {
@@ -271,7 +276,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
   }

   MCSymbol *GVSym = Mang->getSymbol(GV);
-  EmitVisibility(GVSym, GV->getVisibility());
+  EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());

   if (!GV->hasInitializer())   // External globals require no extra code.
return; @@ -293,12 +298,6 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { if (GVKind.isCommon() || GVKind.isBSSLocal()) { if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it. - if (isVerbose()) { - WriteAsOperand(OutStreamer.GetCommentOS(), GV, - /*PrintType=*/false, GV->getParent()); - OutStreamer.GetCommentOS() << '\n'; - } - // Handle common symbols. if (GVKind.isCommon()) { unsigned Align = 1 << AlignLog; @@ -492,39 +491,11 @@ void AsmPrinter::EmitFunctionEntryLabel() { } -static void EmitDebugLoc(DebugLoc DL, const MachineFunction *MF, - raw_ostream &CommentOS) { - const LLVMContext &Ctx = MF->getFunction()->getContext(); - if (!DL.isUnknown()) { // Print source line info. - DIScope Scope(DL.getScope(Ctx)); - // Omit the directory, because it's likely to be long and uninteresting. - if (Scope.Verify()) - CommentOS << Scope.getFilename(); - else - CommentOS << "<unknown>"; - CommentOS << ':' << DL.getLine(); - if (DL.getCol() != 0) - CommentOS << ':' << DL.getCol(); - DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(DL.getInlinedAt(Ctx)); - if (!InlinedAtDL.isUnknown()) { - CommentOS << "[ "; - EmitDebugLoc(InlinedAtDL, MF, CommentOS); - CommentOS << " ]"; - } - } -} - /// EmitComments - Pretty-print comments for instructions. static void EmitComments(const MachineInstr &MI, raw_ostream &CommentOS) { const MachineFunction *MF = MI.getParent()->getParent(); const TargetMachine &TM = MF->getTarget(); - DebugLoc DL = MI.getDebugLoc(); - if (!DL.isUnknown()) { // Print source line info. - EmitDebugLoc(DL, MF, CommentOS); - CommentOS << '\n'; - } - // Check for spills and reloads int FI; @@ -631,6 +602,45 @@ static bool EmitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) { return true; } +AsmPrinter::CFIMoveType AsmPrinter::needsCFIMoves() { + if (MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI && + MF->getFunction()->needsUnwindTableEntry()) + return CFI_M_EH; + + if (MMI->hasDebugInfo()) + return CFI_M_Debug; + + return CFI_M_None; +} + +bool AsmPrinter::needsSEHMoves() { + return MAI->getExceptionHandlingType() == ExceptionHandling::Win64 && + MF->getFunction()->needsUnwindTableEntry(); +} + +void AsmPrinter::emitPrologLabel(const MachineInstr &MI) { + MCSymbol *Label = MI.getOperand(0).getMCSymbol(); + + if (MAI->getExceptionHandlingType() != ExceptionHandling::DwarfCFI) + return; + + if (needsCFIMoves() == CFI_M_None) + return; + + MachineModuleInfo &MMI = MF->getMMI(); + std::vector<MachineMove> &Moves = MMI.getFrameMoves(); + bool FoundOne = false; + (void)FoundOne; + for (std::vector<MachineMove>::iterator I = Moves.begin(), + E = Moves.end(); I != E; ++I) { + if (I->getLabel() == Label) { + EmitCFIFrameMove(*I); + FoundOne = true; + } + } + assert(FoundOne); +} + /// EmitFunctionBody - This method emits the body and trailer for a /// function. void AsmPrinter::EmitFunctionBody() { @@ -669,6 +679,9 @@ void AsmPrinter::EmitFunctionBody() { switch (II->getOpcode()) { case TargetOpcode::PROLOG_LABEL: + emitPrologLabel(*II); + break; + case TargetOpcode::EH_LABEL: case TargetOpcode::GC_LABEL: OutStreamer.EmitLabel(II->getOperand(0).getMCSymbol()); @@ -689,6 +702,9 @@ void AsmPrinter::EmitFunctionBody() { if (isVerbose()) EmitKill(II, *this); break; default: + if (!TM.hasMCUseLoc()) + MCLineEntry::Make(&OutStreamer, getCurrentSection()); + EmitInstruction(II); break; } @@ -767,6 +783,53 @@ getDebugValueLocation(const MachineInstr *MI) const { return MachineLocation(); } +/// EmitDwarfRegOp - Emit dwarf register operation. 
+void AsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  int Reg = TRI->getDwarfRegNum(MLoc.getReg(), false);
+
+  for (const unsigned *SR = TRI->getSuperRegisters(MLoc.getReg());
+       *SR && Reg < 0; ++SR) {
+    Reg = TRI->getDwarfRegNum(*SR, false);
+    // FIXME: Get the bit range this register uses of the superregister
+    // so that we can produce a DW_OP_bit_piece
+  }
+
+  // FIXME: Handle cases like a super register being encoded as
+  // DW_OP_reg 32 DW_OP_piece 4 DW_OP_reg 33
+
+  // FIXME: We have no reasonable way of handling errors in here. The
+  // caller might be in the middle of a DWARF expression. We should
+  // probably assert that Reg >= 0 once debug info generation is more mature.
+
+  if (int Offset = MLoc.getOffset()) {
+    if (Reg < 32) {
+      OutStreamer.AddComment(
+        dwarf::OperationEncodingString(dwarf::DW_OP_breg0 + Reg));
+      EmitInt8(dwarf::DW_OP_breg0 + Reg);
+    } else {
+      OutStreamer.AddComment("DW_OP_bregx");
+      EmitInt8(dwarf::DW_OP_bregx);
+      OutStreamer.AddComment(Twine(Reg));
+      EmitULEB128(Reg);
+    }
+    EmitSLEB128(Offset);
+  } else {
+    if (Reg < 32) {
+      OutStreamer.AddComment(
+        dwarf::OperationEncodingString(dwarf::DW_OP_reg0 + Reg));
+      EmitInt8(dwarf::DW_OP_reg0 + Reg);
+    } else {
+      OutStreamer.AddComment("DW_OP_regx");
+      EmitInt8(dwarf::DW_OP_regx);
+      OutStreamer.AddComment(Twine(Reg));
+      EmitULEB128(Reg);
+    }
+  }
+
+  // FIXME: Produce a DW_OP_bit_piece if we used a superregister
+}
+
 bool AsmPrinter::doFinalization(Module &M) {
   // Emit global variables.
   for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
@@ -1879,7 +1942,7 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
     return false;

   // The predecessor has to be immediately before this block.
-  const MachineBasicBlock *Pred = *PI;
+  MachineBasicBlock *Pred = *PI;

   if (!Pred->isLayoutSuccessor(MBB))
     return false;
@@ -1888,9 +1951,28 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
   if (Pred->empty())
     return true;

-  // Otherwise, check the last instruction.
-  const MachineInstr &LastInst = Pred->back();
-  return !LastInst.getDesc().isBarrier();
+  // Check the terminators in the previous blocks
+  for (MachineBasicBlock::iterator II = Pred->getFirstTerminator(),
+         IE = Pred->end(); II != IE; ++II) {
+    MachineInstr &MI = *II;
+
+    // If it is not a simple branch, we are in a table somewhere.
+    if (!MI.getDesc().isBranch() || MI.getDesc().isIndirectBranch())
+      return false;
+
+    // If this block is an operand of one of the branches, this is not
+    // a fall through.
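EmitDwarfRegOp above leans on a size optimization in the DWARF expression encoding: register numbers below 32 fit into the single-byte opcodes DW_OP_reg0+Reg and DW_OP_breg0+Reg, while larger numbers need DW_OP_regx or DW_OP_bregx followed by a variable-length operand. A sketch of that operand encoding (standard ULEB128, shown for illustration; this is not LLVM's own EmitULEB128):

    #include <stdint.h>
    #include <vector>

    // ULEB128: seven payload bits per byte, least significant group first;
    // the top bit of each byte flags continuation. Register numbers 0-31
    // avoid the operand entirely because DW_OP_reg0..DW_OP_reg31 encode the
    // register in the opcode itself.
    static void encodeULEB128(uint64_t Value, std::vector<uint8_t> &Out) {
      do {
        uint8_t Byte = Value & 0x7f;
        Value >>= 7;
        if (Value != 0)
          Byte |= 0x80;                 // more bytes follow
        Out.push_back(Byte);
      } while (Value != 0);
    }

The offset operand of the breg forms is the signed variant (SLEB128), which is why the code pairs EmitULEB128 for the register number with EmitSLEB128 for the offset.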
+ for (MachineInstr::mop_iterator OI = MI.operands_begin(), + OE = MI.operands_end(); OI != OE; ++OI) { + const MachineOperand& OP = *OI; + if (OP.isJTI()) + return false; + if (OP.isMBB() && OP.getMBB() == MBB) + return false; + } + } + + return true; } diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp index 9c8184a..dd5b0e2 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp @@ -155,7 +155,7 @@ void AsmPrinter::EmitReference(const MCSymbol *Sym, unsigned Encoding) const { const TargetLoweringObjectFile &TLOF = getObjFileLowering(); const MCExpr *Exp = - TLOF.getExprForDwarfReference(Sym, Mang, MMI, Encoding, OutStreamer); + TLOF.getExprForDwarfReference(Sym, Encoding, OutStreamer); OutStreamer.EmitAbsValue(Exp, GetSizeOfEncodedValue(Encoding)); } @@ -206,108 +206,28 @@ void AsmPrinter::EmitSectionOffset(const MCSymbol *Label, // Dwarf Lowering Routines //===----------------------------------------------------------------------===// - -/// EmitFrameMoves - Emit frame instructions to describe the layout of the -/// frame. -void AsmPrinter::EmitFrameMoves(const std::vector<MachineMove> &Moves, - MCSymbol *BaseLabel, bool isEH) const { - const TargetRegisterInfo *RI = TM.getRegisterInfo(); - - int stackGrowth = TM.getTargetData()->getPointerSize(); - if (TM.getFrameLowering()->getStackGrowthDirection() != - TargetFrameLowering::StackGrowsUp) - stackGrowth *= -1; - - for (unsigned i = 0, N = Moves.size(); i < N; ++i) { - const MachineMove &Move = Moves[i]; - MCSymbol *Label = Move.getLabel(); - // Throw out move if the label is invalid. - if (Label && !Label->isDefined()) continue; // Not emitted, in dead code. - - const MachineLocation &Dst = Move.getDestination(); - const MachineLocation &Src = Move.getSource(); - - // Advance row if new location. - if (BaseLabel && Label) { - MCSymbol *ThisSym = Label; - if (ThisSym != BaseLabel) { - EmitCFAByte(dwarf::DW_CFA_advance_loc4); - EmitLabelDifference(ThisSym, BaseLabel, 4); - BaseLabel = ThisSym; - } - } - - // If advancing cfa. - if (Dst.isReg() && Dst.getReg() == MachineLocation::VirtualFP) { - assert(!Src.isReg() && "Machine move not supported yet."); - - if (Src.getReg() == MachineLocation::VirtualFP) { - EmitCFAByte(dwarf::DW_CFA_def_cfa_offset); - } else { - EmitCFAByte(dwarf::DW_CFA_def_cfa); - EmitULEB128(RI->getDwarfRegNum(Src.getReg(), isEH), "Register"); - } - - EmitULEB128(-Src.getOffset(), "Offset"); - continue; - } - - if (Src.isReg() && Src.getReg() == MachineLocation::VirtualFP) { - assert(Dst.isReg() && "Machine move not supported yet."); - EmitCFAByte(dwarf::DW_CFA_def_cfa_register); - EmitULEB128(RI->getDwarfRegNum(Dst.getReg(), isEH), "Register"); - continue; - } - - unsigned Reg = RI->getDwarfRegNum(Src.getReg(), isEH); - int Offset = Dst.getOffset() / stackGrowth; - - if (Offset < 0) { - EmitCFAByte(dwarf::DW_CFA_offset_extended_sf); - EmitULEB128(Reg, "Reg"); - EmitSLEB128(Offset, "Offset"); - } else if (Reg < 64) { - EmitCFAByte(dwarf::DW_CFA_offset + Reg); - EmitULEB128(Offset, "Offset"); - } else { - EmitCFAByte(dwarf::DW_CFA_offset_extended); - EmitULEB128(Reg, "Reg"); - EmitULEB128(Offset, "Offset"); - } - } -} - -/// EmitFrameMoves - Emit frame instructions to describe the layout of the -/// frame. -void AsmPrinter::EmitCFIFrameMoves(const std::vector<MachineMove> &Moves) const { +/// EmitCFIFrameMove - Emit a frame instruction. 
+void AsmPrinter::EmitCFIFrameMove(const MachineMove &Move) const { const TargetRegisterInfo *RI = TM.getRegisterInfo(); - for (unsigned i = 0, N = Moves.size(); i < N; ++i) { - const MachineMove &Move = Moves[i]; - MCSymbol *Label = Move.getLabel(); - // Throw out move if the label is invalid. - if (Label && !Label->isDefined()) continue; // Not emitted, in dead code. - - const MachineLocation &Dst = Move.getDestination(); - const MachineLocation &Src = Move.getSource(); - - // If advancing cfa. - if (Dst.isReg() && Dst.getReg() == MachineLocation::VirtualFP) { - assert(!Src.isReg() && "Machine move not supported yet."); + const MachineLocation &Dst = Move.getDestination(); + const MachineLocation &Src = Move.getSource(); - if (Src.getReg() == MachineLocation::VirtualFP) { - OutStreamer.EmitCFIDefCfaOffset(-Src.getOffset()); - } else { - assert("Machine move not supported yet"); - // Reg + Offset - } - } else if (Src.isReg() && Src.getReg() == MachineLocation::VirtualFP) { - assert(Dst.isReg() && "Machine move not supported yet."); - OutStreamer.EmitCFIDefCfaRegister(RI->getDwarfRegNum(Dst.getReg(), true)); + // If advancing cfa. + if (Dst.isReg() && Dst.getReg() == MachineLocation::VirtualFP) { + if (Src.getReg() == MachineLocation::VirtualFP) { + OutStreamer.EmitCFIDefCfaOffset(-Src.getOffset()); } else { - assert(!Dst.isReg() && "Machine move not supported yet."); - OutStreamer.EmitCFIOffset(RI->getDwarfRegNum(Src.getReg(), true), - Dst.getOffset()); + // Reg + Offset + OutStreamer.EmitCFIDefCfa(RI->getDwarfRegNum(Src.getReg(), true), + Src.getOffset()); } + } else if (Src.isReg() && Src.getReg() == MachineLocation::VirtualFP) { + assert(Dst.isReg() && "Machine move not supported yet."); + OutStreamer.EmitCFIDefCfaRegister(RI->getDwarfRegNum(Dst.getReg(), true)); + } else { + assert(!Dst.isReg() && "Machine move not supported yet."); + OutStreamer.EmitCFIOffset(RI->getDwarfRegNum(Src.getReg(), true), + Dst.getOffset()); } } diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt index 1377e4d..4da7876 100644 --- a/lib/CodeGen/AsmPrinter/CMakeLists.txt +++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt @@ -5,9 +5,10 @@ add_llvm_library(LLVMAsmPrinter AsmPrinterInlineAsm.cpp DIE.cpp DwarfCFIException.cpp + DwarfCompileUnit.cpp DwarfDebug.cpp DwarfException.cpp - DwarfTableException.cpp OcamlGCPrinter.cpp + Win64Exception.cpp ) diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp index 68be2ee..91b7d08 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp @@ -32,6 +32,7 @@ #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Support/Dwarf.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" @@ -40,92 +41,105 @@ using namespace llvm; DwarfCFIException::DwarfCFIException(AsmPrinter *A) : DwarfException(A), - shouldEmitTable(false), shouldEmitMoves(false), shouldEmitTableModule(false) - {} + shouldEmitPersonality(false), shouldEmitLSDA(false), shouldEmitMoves(false), + moveTypeModule(AsmPrinter::CFI_M_None) {} DwarfCFIException::~DwarfCFIException() {} /// EndModule - Emit all exception information that should come after the /// content. 
void DwarfCFIException::EndModule() { - if (!Asm->MAI->isExceptionHandlingDwarf()) - return; + if (moveTypeModule == AsmPrinter::CFI_M_Debug) + Asm->OutStreamer.EmitCFISections(false, true); - if (!shouldEmitTableModule) + if (!Asm->MAI->isExceptionHandlingDwarf()) return; const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); + unsigned PerEncoding = TLOF.getPersonalityEncoding(); - // Begin eh frame section. - Asm->OutStreamer.SwitchSection(TLOF.getEHFrameSection()); + if ((PerEncoding & 0x70) != dwarf::DW_EH_PE_pcrel) + return; // Emit references to all used personality functions + bool AtLeastOne = false; const std::vector<const Function*> &Personalities = MMI->getPersonalities(); for (size_t i = 0, e = Personalities.size(); i != e; ++i) { - Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("personality", i)); - Asm->EmitReference(Personalities[i], PerEncoding); + if (!Personalities[i]) + continue; + MCSymbol *Sym = Asm->Mang->getSymbol(Personalities[i]); + TLOF.emitPersonalityValue(Asm->OutStreamer, Asm->TM, Sym); + AtLeastOne = true; + } + + if (AtLeastOne && !TLOF.isFunctionEHFrameSymbolPrivate()) { + // This is a temporary hack to keep sections in the same order they + // were before. This lets us produce bit identical outputs while + // transitioning to CFI. + Asm->OutStreamer.SwitchSection(TLOF.getEHFrameSection()); } } /// BeginFunction - Gather pre-function exception information. Assumes it's /// being emitted immediately after the function entry point. void DwarfCFIException::BeginFunction(const MachineFunction *MF) { - shouldEmitTable = shouldEmitMoves = false; + shouldEmitMoves = shouldEmitPersonality = shouldEmitLSDA = false; // If any landing pads survive, we need an EH table. - shouldEmitTable = !MMI->getLandingPads().empty(); + bool hasLandingPads = !MMI->getLandingPads().empty(); // See if we need frame move info. - shouldEmitMoves = - !Asm->MF->getFunction()->doesNotThrow() || UnwindTablesMandatory; + AsmPrinter::CFIMoveType MoveType = Asm->needsCFIMoves(); + if (MoveType == AsmPrinter::CFI_M_EH || + (MoveType == AsmPrinter::CFI_M_Debug && + moveTypeModule == AsmPrinter::CFI_M_None)) + moveTypeModule = MoveType; - if (shouldEmitMoves || shouldEmitTable) - // Assumes in correct section after the entry point. - Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin", - Asm->getFunctionNumber())); + shouldEmitMoves = MoveType != AsmPrinter::CFI_M_None; - shouldEmitTableModule |= shouldEmitTable; + const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); + unsigned PerEncoding = TLOF.getPersonalityEncoding(); + const Function *Per = MMI->getPersonalities()[MMI->getPersonalityIndex()]; - if (shouldEmitMoves) { - const TargetFrameLowering *TFL = Asm->TM.getFrameLowering(); - Asm->OutStreamer.EmitCFIStartProc(); + shouldEmitPersonality = hasLandingPads && + PerEncoding != dwarf::DW_EH_PE_omit && Per; - // Indicate locations of general callee saved registers in frame. - std::vector<MachineMove> Moves; - TFL->getInitialFrameState(Moves); - Asm->EmitCFIFrameMoves(Moves); - Asm->EmitCFIFrameMoves(MMI->getFrameMoves()); - } + unsigned LSDAEncoding = TLOF.getLSDAEncoding(); + shouldEmitLSDA = shouldEmitPersonality && + LSDAEncoding != dwarf::DW_EH_PE_omit; - if (!shouldEmitTable) + if (!shouldEmitPersonality && !shouldEmitMoves) return; - const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); + Asm->OutStreamer.EmitCFIStartProc(); + + // Indicate personality routine, if any. 
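+  // For illustration only (encodings and symbol names vary by target and
+  // personality function): on a typical ELF x86-64 target the streamer calls
+  // in this function end up printing directives such as
+  //   .cfi_startproc
+  //   .cfi_personality 0x9b, DW.ref.__gxx_personality_v0
+  //   .cfi_lsda 0x1b, .Lexception0
+  // where 0x9b is DW_EH_PE_indirect|pcrel|sdata4 and 0x1b is pcrel|sdata4.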
+  if (!shouldEmitPersonality)
+    return;
+
+  const MCSymbol *Sym = TLOF.getCFIPersonalitySymbol(Per, Asm->Mang, MMI);
+  Asm->OutStreamer.EmitCFIPersonality(Sym, PerEncoding);
+
+  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin",
+                                                Asm->getFunctionNumber()));
 
   // Provide LSDA information.
-  unsigned LSDAEncoding = TLOF.getLSDAEncoding();
-  if (LSDAEncoding != dwarf::DW_EH_PE_omit)
-    Asm->OutStreamer.EmitCFILsda(Asm->GetTempSymbol("exception",
-                                                    Asm->getFunctionNumber()),
-                                 LSDAEncoding);
+  if (!shouldEmitLSDA)
+    return;
 
-  // Indicate personality routine, if any.
-  unsigned PerEncoding = TLOF.getPersonalityEncoding();
-  if (PerEncoding != dwarf::DW_EH_PE_omit &&
-      MMI->getPersonalities()[MMI->getPersonalityIndex()])
-    Asm->OutStreamer.EmitCFIPersonality(Asm->GetTempSymbol("personality",
-                                                    MMI->getPersonalityIndex()),
-                                        PerEncoding);
+  Asm->OutStreamer.EmitCFILsda(Asm->GetTempSymbol("exception",
+                                                  Asm->getFunctionNumber()),
+                               LSDAEncoding);
 }
 
 /// EndFunction - Gather and emit post-function exception information.
 ///
 void DwarfCFIException::EndFunction() {
-  if (!shouldEmitMoves && !shouldEmitTable) return;
+  if (!shouldEmitPersonality && !shouldEmitMoves)
+    return;
 
-  if (shouldEmitMoves)
-    Asm->OutStreamer.EmitCFIEndProc();
+  Asm->OutStreamer.EmitCFIEndProc();
 
   Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_end",
                                                 Asm->getFunctionNumber()));
@@ -133,6 +147,6 @@ void DwarfCFIException::EndFunction() {
   // Map all labels and get rid of any dead landing pads.
   MMI->TidyLandingPads();
 
-  if (shouldEmitTable)
+  if (shouldEmitPersonality)
     EmitExceptionTable();
 }
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
new file mode 100644
index 0000000..bff1a35
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -0,0 +1,1054 @@
+//===-- llvm/CodeGen/DwarfCompileUnit.cpp - Dwarf Compile Unit ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing Dwarf compile units.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dwarfdebug"
+
+#include "DwarfCompileUnit.h"
+#include "DwarfDebug.h"
+#include "llvm/Constants.h"
+#include "llvm/Analysis/DIBuilder.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+/// CompileUnit - Compile unit constructor.
+CompileUnit::CompileUnit(unsigned I, DIE *D, AsmPrinter *A, DwarfDebug *DW)
+  : ID(I), CUDie(D), Asm(A), DD(DW), IndexTyDie(0) {
+  DIEIntegerOne = new (DIEValueAllocator) DIEInteger(1);
+}
+
+/// ~CompileUnit - Destructor for compile unit.
+CompileUnit::~CompileUnit() {
+  for (unsigned j = 0, M = DIEBlocks.size(); j < M; ++j)
+    DIEBlocks[j]->~DIEBlock();
+}
+
+/// createDIEEntry - Creates a new DIEEntry to be a proxy for a debug
+/// information entry.
+DIEEntry *CompileUnit::createDIEEntry(DIE *Entry) {
+  DIEEntry *Value = new (DIEValueAllocator) DIEEntry(Entry);
+  return Value;
+}
+
+/// addUInt - Add an unsigned integer attribute data and value.
+///
+void CompileUnit::addUInt(DIE *Die, unsigned Attribute,
+                          unsigned Form, uint64_t Integer) {
+  if (!Form) Form = DIEInteger::BestForm(false, Integer);
+  DIEValue *Value = Integer == 1 ?
+    DIEIntegerOne : new (DIEValueAllocator) DIEInteger(Integer);
+  Die->addValue(Attribute, Form, Value);
+}
+
+/// addSInt - Add a signed integer attribute data and value.
+///
+void CompileUnit::addSInt(DIE *Die, unsigned Attribute,
+                          unsigned Form, int64_t Integer) {
+  if (!Form) Form = DIEInteger::BestForm(true, Integer);
+  DIEValue *Value = new (DIEValueAllocator) DIEInteger(Integer);
+  Die->addValue(Attribute, Form, Value);
+}
+
+/// addString - Add a string attribute data and value. DIEString only
+/// keeps a string reference.
+void CompileUnit::addString(DIE *Die, unsigned Attribute, unsigned Form,
+                            StringRef String) {
+  DIEValue *Value = new (DIEValueAllocator) DIEString(String);
+  Die->addValue(Attribute, Form, Value);
+}
+
+/// addLabel - Add a Dwarf label attribute data and value.
+///
+void CompileUnit::addLabel(DIE *Die, unsigned Attribute, unsigned Form,
+                           const MCSymbol *Label) {
+  DIEValue *Value = new (DIEValueAllocator) DIELabel(Label);
+  Die->addValue(Attribute, Form, Value);
+}
+
+/// addDelta - Add a label delta attribute data and value.
+///
+void CompileUnit::addDelta(DIE *Die, unsigned Attribute, unsigned Form,
+                           const MCSymbol *Hi, const MCSymbol *Lo) {
+  DIEValue *Value = new (DIEValueAllocator) DIEDelta(Hi, Lo);
+  Die->addValue(Attribute, Form, Value);
+}
+
+/// addDIEEntry - Add a DIE attribute data and value.
+///
+void CompileUnit::addDIEEntry(DIE *Die, unsigned Attribute, unsigned Form,
+                              DIE *Entry) {
+  Die->addValue(Attribute, Form, createDIEEntry(Entry));
+}
+
+
+/// addBlock - Add block data.
+///
+void CompileUnit::addBlock(DIE *Die, unsigned Attribute, unsigned Form,
+                           DIEBlock *Block) {
+  Block->ComputeSize(Asm);
+  DIEBlocks.push_back(Block); // Memoize so we can call the destructor later on.
+  Die->addValue(Attribute, Block->BestForm(), Block);
+}
+
+/// addSourceLine - Add location information to specified debug information
+/// entry.
+void CompileUnit::addSourceLine(DIE *Die, DIVariable V) {
+  // Verify variable.
+  if (!V.Verify())
+    return;
+
+  unsigned Line = V.getLineNumber();
+  if (Line == 0)
+    return;
+  unsigned FileID = DD->GetOrCreateSourceID(V.getContext().getFilename(),
+                                            V.getContext().getDirectory());
+  assert(FileID && "Invalid file id");
+  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+}
+
+/// addSourceLine - Add location information to specified debug information
+/// entry.
+void CompileUnit::addSourceLine(DIE *Die, DIGlobalVariable G) {
+  // Verify global variable.
+  if (!G.Verify())
+    return;
+
+  unsigned Line = G.getLineNumber();
+  if (Line == 0)
+    return;
+  unsigned FileID = DD->GetOrCreateSourceID(G.getContext().getFilename(),
+                                            G.getContext().getDirectory());
+  assert(FileID && "Invalid file id");
+  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+}
+
+/// addSourceLine - Add location information to specified debug information
+/// entry.
+void CompileUnit::addSourceLine(DIE *Die, DISubprogram SP) {
+  // Verify subprogram.
+  if (!SP.Verify())
+    return;
+  // If the line number is 0, don't add it.
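+  // (A line number of 0 is LLVM's convention for "no source position", so
+  // emitting DW_AT_decl_file/DW_AT_decl_line for it would be misleading.)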
+ if (SP.getLineNumber() == 0) + return; + + unsigned Line = SP.getLineNumber(); + if (!SP.getContext().Verify()) + return; + unsigned FileID = DD->GetOrCreateSourceID(SP.getFilename(), SP.getDirectory()); + assert(FileID && "Invalid file id"); + addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID); + addUInt(Die, dwarf::DW_AT_decl_line, 0, Line); +} + +/// addSourceLine - Add location information to specified debug information +/// entry. +void CompileUnit::addSourceLine(DIE *Die, DIType Ty) { + // Verify type. + if (!Ty.Verify()) + return; + + unsigned Line = Ty.getLineNumber(); + if (Line == 0 || !Ty.getContext().Verify()) + return; + unsigned FileID = DD->GetOrCreateSourceID(Ty.getFilename(), Ty.getDirectory()); + assert(FileID && "Invalid file id"); + addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID); + addUInt(Die, dwarf::DW_AT_decl_line, 0, Line); +} + +/// addSourceLine - Add location information to specified debug information +/// entry. +void CompileUnit::addSourceLine(DIE *Die, DINameSpace NS) { + // Verify namespace. + if (!NS.Verify()) + return; + + unsigned Line = NS.getLineNumber(); + if (Line == 0) + return; + StringRef FN = NS.getFilename(); + + unsigned FileID = DD->GetOrCreateSourceID(FN, NS.getDirectory()); + assert(FileID && "Invalid file id"); + addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID); + addUInt(Die, dwarf::DW_AT_decl_line, 0, Line); +} + +/// addVariableAddress - Add DW_AT_location attribute for a +/// DbgVariable based on provided MachineLocation. +void CompileUnit::addVariableAddress(DbgVariable *&DV, DIE *Die, + MachineLocation Location) { + if (DV->variableHasComplexAddress()) + addComplexAddress(DV, Die, dwarf::DW_AT_location, Location); + else if (DV->isBlockByrefVariable()) + addBlockByrefAddress(DV, Die, dwarf::DW_AT_location, Location); + else + addAddress(Die, dwarf::DW_AT_location, Location); +} + +/// addRegisterOp - Add register operand. +void CompileUnit::addRegisterOp(DIE *TheDie, unsigned Reg) { + const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo(); + unsigned DWReg = RI->getDwarfRegNum(Reg, false); + if (DWReg < 32) + addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + DWReg); + else { + addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_regx); + addUInt(TheDie, 0, dwarf::DW_FORM_udata, DWReg); + } +} + +/// addRegisterOffset - Add register offset. +void CompileUnit::addRegisterOffset(DIE *TheDie, unsigned Reg, + int64_t Offset) { + const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo(); + unsigned DWReg = RI->getDwarfRegNum(Reg, false); + const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo(); + if (Reg == TRI->getFrameRegister(*Asm->MF)) + // If variable offset is based in frame register then use fbreg. + addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_fbreg); + else if (DWReg < 32) + addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + DWReg); + else { + addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx); + addUInt(TheDie, 0, dwarf::DW_FORM_udata, DWReg); + } + addSInt(TheDie, 0, dwarf::DW_FORM_sdata, Offset); +} + +/// addAddress - Add an address attribute to a die based on the location +/// provided. +void CompileUnit::addAddress(DIE *Die, unsigned Attribute, + const MachineLocation &Location) { + DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); + + if (Location.isReg()) + addRegisterOp(Block, Location.getReg()); + else + addRegisterOffset(Block, Location.getReg(), Location.getOffset()); + + // Now attach the location information to the DIE. 
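+  // A minimal sketch of what the block typically holds (register numbers and
+  // offsets here are hypothetical): a variable kept at [frame register - 8]
+  // comes out of addRegisterOffset above as DW_OP_fbreg followed by the
+  // SLEB128 offset -8, while a variable living entirely in DWARF register 5
+  // is the single opcode DW_OP_reg5 (DW_OP_reg0 + 5) from addRegisterOp.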
+  addBlock(Die, Attribute, 0, Block);
+}
+
+/// addComplexAddress - Start with the address based on the location provided,
+/// and generate the DWARF information necessary to find the actual variable
+/// given the extra address information encoded in the DIVariable, based on
+/// the starting location. Add the DWARF information to the die.
+///
+void CompileUnit::addComplexAddress(DbgVariable *&DV, DIE *Die,
+                                    unsigned Attribute,
+                                    const MachineLocation &Location) {
+  DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
+  unsigned N = DV->getNumAddrElements();
+  unsigned i = 0;
+  if (Location.isReg()) {
+    if (N >= 2 && DV->getAddrElement(0) == DIBuilder::OpPlus) {
+      // If the first address element is OpPlus then emit
+      // DW_OP_breg + Offset instead of DW_OP_reg + Offset.
+      addRegisterOffset(Block, Location.getReg(), DV->getAddrElement(1));
+      i = 2;
+    } else
+      addRegisterOp(Block, Location.getReg());
+  }
+  else
+    addRegisterOffset(Block, Location.getReg(), Location.getOffset());
+
+  for (;i < N; ++i) {
+    uint64_t Element = DV->getAddrElement(i);
+    if (Element == DIBuilder::OpPlus) {
+      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+      addUInt(Block, 0, dwarf::DW_FORM_udata, DV->getAddrElement(++i));
+    } else if (Element == DIBuilder::OpDeref) {
+      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+    } else llvm_unreachable("unknown DIBuilder Opcode");
+  }
+
+  // Now attach the location information to the DIE.
+  addBlock(Die, Attribute, 0, Block);
+}
+
+/* Byref variables, in Blocks, are declared by the programmer as "SomeType
+   VarName;", but the compiler creates a __Block_byref_x_VarName struct, and
+   gives the variable VarName either the struct, or a pointer to the struct, as
+   its type.  This is necessary for various behind-the-scenes things the
+   compiler needs to do with by-reference variables in Blocks.
+
+   However, as far as the original *programmer* is concerned, the variable
+   should still have type 'SomeType', as originally declared.
+
+   The function getBlockByrefType dives into the __Block_byref_x_VarName
+   struct to find the original type of the variable, which is then assigned to
+   the variable's Debug Information Entry as its real type.  So far, so good.
+   However now the debugger will expect the variable VarName to have the type
+   SomeType.  So we need the location attribute for the variable to be an
+   expression that explains to the debugger how to navigate through the
+   pointers and struct to find the actual variable of type SomeType.
+
+   The following function does just that.  We start by getting
+   the "normal" location for the variable. This will be the location
+   of either the struct __Block_byref_x_VarName or the pointer to the
+   struct __Block_byref_x_VarName.
+
+   The struct will look something like:
+
+   struct __Block_byref_x_VarName {
+     ... <various fields>
+     struct __Block_byref_x_VarName *forwarding;
+     ... <various other fields>
+     SomeType VarName;
+     ... <maybe more fields>
+   };
+
+   If we are given the struct directly (as our starting point) we
+   need to tell the debugger to:
+
+   1).  Add the offset of the forwarding field.
+
+   2).  Follow that pointer to get the real __Block_byref_x_VarName
+   struct to use (the real one may have been copied onto the heap).
+
+   3).  Add the offset for the field VarName, to find the actual variable.
+
+   If we started with a pointer to the struct, then we need to
+   dereference that pointer first, before the other steps.
+ Translating this into DWARF ops, we will need to append the following + to the current location description for the variable: + + DW_OP_deref -- optional, if we start with a pointer + DW_OP_plus_uconst <forward_fld_offset> + DW_OP_deref + DW_OP_plus_uconst <varName_fld_offset> + + That is what this function does. */ + +/// addBlockByrefAddress - Start with the address based on the location +/// provided, and generate the DWARF information necessary to find the +/// actual Block variable (navigating the Block struct) based on the +/// starting location. Add the DWARF information to the die. For +/// more information, read large comment just above here. +/// +void CompileUnit::addBlockByrefAddress(DbgVariable *&DV, DIE *Die, + unsigned Attribute, + const MachineLocation &Location) { + DIType Ty = DV->getType(); + DIType TmpTy = Ty; + unsigned Tag = Ty.getTag(); + bool isPointer = false; + + StringRef varName = DV->getName(); + + if (Tag == dwarf::DW_TAG_pointer_type) { + DIDerivedType DTy = DIDerivedType(Ty); + TmpTy = DTy.getTypeDerivedFrom(); + isPointer = true; + } + + DICompositeType blockStruct = DICompositeType(TmpTy); + + // Find the __forwarding field and the variable field in the __Block_byref + // struct. + DIArray Fields = blockStruct.getTypeArray(); + DIDescriptor varField = DIDescriptor(); + DIDescriptor forwardingField = DIDescriptor(); + + for (unsigned i = 0, N = Fields.getNumElements(); i < N; ++i) { + DIDescriptor Element = Fields.getElement(i); + DIDerivedType DT = DIDerivedType(Element); + StringRef fieldName = DT.getName(); + if (fieldName == "__forwarding") + forwardingField = Element; + else if (fieldName == varName) + varField = Element; + } + + // Get the offsets for the forwarding field and the variable field. + unsigned forwardingFieldOffset = + DIDerivedType(forwardingField).getOffsetInBits() >> 3; + unsigned varFieldOffset = + DIDerivedType(varField).getOffsetInBits() >> 3; + + // Decode the original location, and use that as the start of the byref + // variable's location. + const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo(); + unsigned Reg = RI->getDwarfRegNum(Location.getReg(), false); + DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); + + if (Location.isReg()) { + if (Reg < 32) + addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + Reg); + else { + addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_regx); + addUInt(Block, 0, dwarf::DW_FORM_udata, Reg); + } + } else { + if (Reg < 32) + addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + Reg); + else { + addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx); + addUInt(Block, 0, dwarf::DW_FORM_udata, Reg); + } + + addUInt(Block, 0, dwarf::DW_FORM_sdata, Location.getOffset()); + } + + // If we started with a pointer to the __Block_byref... struct, then + // the first thing we need to do is dereference the pointer (DW_OP_deref). + if (isPointer) + addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref); + + // Next add the offset for the '__forwarding' field: + // DW_OP_plus_uconst ForwardingFieldOffset. Note there's no point in + // adding the offset if it's 0. + if (forwardingFieldOffset > 0) { + addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst); + addUInt(Block, 0, dwarf::DW_FORM_udata, forwardingFieldOffset); + } + + // Now dereference the __forwarding field to get to the real __Block_byref + // struct: DW_OP_deref. + addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref); + + // Now that we've got the real __Block_byref... 
struct, add the offset + // for the variable's field to get to the location of the actual variable: + // DW_OP_plus_uconst varFieldOffset. Again, don't add if it's 0. + if (varFieldOffset > 0) { + addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst); + addUInt(Block, 0, dwarf::DW_FORM_udata, varFieldOffset); + } + + // Now attach the location information to the DIE. + addBlock(Die, Attribute, 0, Block); +} + +/// addConstantValue - Add constant value entry in variable DIE. +bool CompileUnit::addConstantValue(DIE *Die, const MachineOperand &MO, + DIType Ty) { + assert (MO.isImm() && "Invalid machine operand!"); + DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); + unsigned form = dwarf::DW_FORM_udata; + switch (Ty.getSizeInBits()) { + case 8: form = dwarf::DW_FORM_data1; break; + case 16: form = dwarf::DW_FORM_data2; break; + case 32: form = dwarf::DW_FORM_data4; break; + case 64: form = dwarf::DW_FORM_data8; break; + default: break; + } + + DIBasicType BTy(Ty); + if (BTy.Verify() && + (BTy.getEncoding() == dwarf::DW_ATE_signed + || BTy.getEncoding() == dwarf::DW_ATE_signed_char)) + addSInt(Block, 0, form, MO.getImm()); + else + addUInt(Block, 0, form, MO.getImm()); + + addBlock(Die, dwarf::DW_AT_const_value, 0, Block); + return true; +} + +/// addConstantFPValue - Add constant value entry in variable DIE. +bool CompileUnit::addConstantFPValue(DIE *Die, const MachineOperand &MO) { + assert (MO.isFPImm() && "Invalid machine operand!"); + DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); + APFloat FPImm = MO.getFPImm()->getValueAPF(); + + // Get the raw data form of the floating point. + const APInt FltVal = FPImm.bitcastToAPInt(); + const char *FltPtr = (const char*)FltVal.getRawData(); + + int NumBytes = FltVal.getBitWidth() / 8; // 8 bits per byte. + bool LittleEndian = Asm->getTargetData().isLittleEndian(); + int Incr = (LittleEndian ? 1 : -1); + int Start = (LittleEndian ? 0 : NumBytes - 1); + int Stop = (LittleEndian ? NumBytes : -1); + + // Output the constant to DWARF one byte at a time. + for (; Start != Stop; Start += Incr) + addUInt(Block, 0, dwarf::DW_FORM_data1, + (unsigned char)0xFF & FltPtr[Start]); + + addBlock(Die, dwarf::DW_AT_const_value, 0, Block); + return true; +} + +/// addConstantValue - Add constant value entry in variable DIE. +bool CompileUnit::addConstantValue(DIE *Die, ConstantInt *CI, + bool Unsigned) { + unsigned CIBitWidth = CI->getBitWidth(); + if (CIBitWidth <= 64) { + unsigned form = 0; + switch (CIBitWidth) { + case 8: form = dwarf::DW_FORM_data1; break; + case 16: form = dwarf::DW_FORM_data2; break; + case 32: form = dwarf::DW_FORM_data4; break; + case 64: form = dwarf::DW_FORM_data8; break; + default: + form = Unsigned ? dwarf::DW_FORM_udata : dwarf::DW_FORM_sdata; + } + if (Unsigned) + addUInt(Die, dwarf::DW_AT_const_value, form, CI->getZExtValue()); + else + addSInt(Die, dwarf::DW_AT_const_value, form, CI->getSExtValue()); + return true; + } + + DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); + + // Get the raw data form of the large APInt. + const APInt Val = CI->getValue(); + const char *Ptr = (const char*)Val.getRawData(); + + int NumBytes = Val.getBitWidth() / 8; // 8 bits per byte. + bool LittleEndian = Asm->getTargetData().isLittleEndian(); + int Incr = (LittleEndian ? 1 : -1); + int Start = (LittleEndian ? 0 : NumBytes - 1); + int Stop = (LittleEndian ? NumBytes : -1); + + // Output the constant to DWARF one byte at a time. 
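+  // Illustrative walk-through of the loop below (the constant is
+  // hypothetical): for a 128-bit value NumBytes is 16; on a little-endian
+  // target Start/Stop/Incr are 0/16/1, so the least significant byte is
+  // emitted first, while on a big-endian target they are 15/-1/-1 and the
+  // most significant byte comes first. Each iteration adds one DW_FORM_data1
+  // value to the block.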
+ for (; Start != Stop; Start += Incr) + addUInt(Block, 0, dwarf::DW_FORM_data1, + (unsigned char)0xFF & Ptr[Start]); + + addBlock(Die, dwarf::DW_AT_const_value, 0, Block); + return true; +} + +/// addTemplateParams - Add template parameters in buffer. +void CompileUnit::addTemplateParams(DIE &Buffer, DIArray TParams) { + // Add template parameters. + for (unsigned i = 0, e = TParams.getNumElements(); i != e; ++i) { + DIDescriptor Element = TParams.getElement(i); + if (Element.isTemplateTypeParameter()) + Buffer.addChild(getOrCreateTemplateTypeParameterDIE( + DITemplateTypeParameter(Element))); + else if (Element.isTemplateValueParameter()) + Buffer.addChild(getOrCreateTemplateValueParameterDIE( + DITemplateValueParameter(Element))); + } + +} +/// addToContextOwner - Add Die into the list of its context owner's children. +void CompileUnit::addToContextOwner(DIE *Die, DIDescriptor Context) { + if (Context.isType()) { + DIE *ContextDIE = getOrCreateTypeDIE(DIType(Context)); + ContextDIE->addChild(Die); + } else if (Context.isNameSpace()) { + DIE *ContextDIE = getOrCreateNameSpace(DINameSpace(Context)); + ContextDIE->addChild(Die); + } else if (Context.isSubprogram()) { + DIE *ContextDIE = DD->createSubprogramDIE(DISubprogram(Context)); + ContextDIE->addChild(Die); + } else if (DIE *ContextDIE = getDIE(Context)) + ContextDIE->addChild(Die); + else + addDie(Die); +} + +/// getOrCreateTypeDIE - Find existing DIE or create new DIE for the +/// given DIType. +DIE *CompileUnit::getOrCreateTypeDIE(DIType Ty) { + DIE *TyDIE = getDIE(Ty); + if (TyDIE) + return TyDIE; + + // Create new type. + TyDIE = new DIE(dwarf::DW_TAG_base_type); + insertDIE(Ty, TyDIE); + if (Ty.isBasicType()) + constructTypeDIE(*TyDIE, DIBasicType(Ty)); + else if (Ty.isCompositeType()) + constructTypeDIE(*TyDIE, DICompositeType(Ty)); + else { + assert(Ty.isDerivedType() && "Unknown kind of DIType"); + constructTypeDIE(*TyDIE, DIDerivedType(Ty)); + } + + addToContextOwner(TyDIE, Ty.getContext()); + return TyDIE; +} + +/// addType - Add a new type attribute to the specified entity. +void CompileUnit::addType(DIE *Entity, DIType Ty) { + if (!Ty.Verify()) + return; + + // Check for pre-existence. + DIEEntry *Entry = getDIEEntry(Ty); + // If it exists then use the existing value. + if (Entry) { + Entity->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, Entry); + return; + } + + // Construct type. + DIE *Buffer = getOrCreateTypeDIE(Ty); + + // Set up proxy. + Entry = createDIEEntry(Buffer); + insertDIEEntry(Ty, Entry); + Entity->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, Entry); + + // If this is a complete composite type then include it in the + // list of global types. + addGlobalType(Ty); +} + +/// addGlobalType - Add a new global type to the compile unit. +/// +void CompileUnit::addGlobalType(DIType Ty) { + DIDescriptor Context = Ty.getContext(); + if (Ty.isCompositeType() && !Ty.getName().empty() && !Ty.isForwardDecl() + && (Context.isCompileUnit() || Context.isFile() || Context.isNameSpace())) + if (DIEEntry *Entry = getDIEEntry(Ty)) + GlobalTypes[Ty.getName()] = Entry->getEntry(); +} + +/// addPubTypes - Add type for pubtypes section. 
+void CompileUnit::addPubTypes(DISubprogram SP) { + DICompositeType SPTy = SP.getType(); + unsigned SPTag = SPTy.getTag(); + if (SPTag != dwarf::DW_TAG_subroutine_type) + return; + + DIArray Args = SPTy.getTypeArray(); + for (unsigned i = 0, e = Args.getNumElements(); i != e; ++i) { + DIType ATy(Args.getElement(i)); + if (!ATy.Verify()) + continue; + addGlobalType(ATy); + } +} + +/// constructTypeDIE - Construct basic type die from DIBasicType. +void CompileUnit::constructTypeDIE(DIE &Buffer, DIBasicType BTy) { + // Get core information. + StringRef Name = BTy.getName(); + Buffer.setTag(dwarf::DW_TAG_base_type); + addUInt(&Buffer, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1, + BTy.getEncoding()); + + // Add name if not anonymous or intermediate type. + if (!Name.empty()) + addString(&Buffer, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name); + uint64_t Size = BTy.getSizeInBits() >> 3; + addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size); +} + +/// constructTypeDIE - Construct derived type die from DIDerivedType. +void CompileUnit::constructTypeDIE(DIE &Buffer, DIDerivedType DTy) { + // Get core information. + StringRef Name = DTy.getName(); + uint64_t Size = DTy.getSizeInBits() >> 3; + unsigned Tag = DTy.getTag(); + + // FIXME - Workaround for templates. + if (Tag == dwarf::DW_TAG_inheritance) Tag = dwarf::DW_TAG_reference_type; + + Buffer.setTag(Tag); + + // Map to main type, void will not have a type. + DIType FromTy = DTy.getTypeDerivedFrom(); + addType(&Buffer, FromTy); + + // Add name if not anonymous or intermediate type. + if (!Name.empty()) + addString(&Buffer, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name); + + // Add size if non-zero (derived types might be zero-sized.) + if (Size) + addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size); + + // Add source line info if available and TyDesc is not a forward declaration. + if (!DTy.isForwardDecl()) + addSourceLine(&Buffer, DTy); +} + +/// constructTypeDIE - Construct type DIE from DICompositeType. +void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { + // Get core information. + StringRef Name = CTy.getName(); + + uint64_t Size = CTy.getSizeInBits() >> 3; + unsigned Tag = CTy.getTag(); + Buffer.setTag(Tag); + + switch (Tag) { + case dwarf::DW_TAG_vector_type: + case dwarf::DW_TAG_array_type: + constructArrayTypeDIE(Buffer, &CTy); + break; + case dwarf::DW_TAG_enumeration_type: { + DIArray Elements = CTy.getTypeArray(); + + // Add enumerators to enumeration type. + for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) { + DIE *ElemDie = NULL; + DIDescriptor Enum(Elements.getElement(i)); + if (Enum.isEnumerator()) { + ElemDie = constructEnumTypeDIE(DIEnumerator(Enum)); + Buffer.addChild(ElemDie); + } + } + } + break; + case dwarf::DW_TAG_subroutine_type: { + // Add return type. + DIArray Elements = CTy.getTypeArray(); + DIDescriptor RTy = Elements.getElement(0); + addType(&Buffer, DIType(RTy)); + + bool isPrototyped = true; + // Add arguments. + for (unsigned i = 1, N = Elements.getNumElements(); i < N; ++i) { + DIDescriptor Ty = Elements.getElement(i); + if (Ty.isUnspecifiedParameter()) { + DIE *Arg = new DIE(dwarf::DW_TAG_unspecified_parameters); + Buffer.addChild(Arg); + isPrototyped = false; + } else { + DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter); + addType(Arg, DIType(Ty)); + Buffer.addChild(Arg); + } + } + // Add prototype flag. 
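+  // Hedged example (the C declarations are hypothetical): a variadic
+  // signature such as 'int printf(const char *, ...)' carries a trailing
+  // unspecified-parameter descriptor, so the loop above emits a
+  // DW_TAG_unspecified_parameters child and clears isPrototyped; a fully
+  // typed 'int f(int)' keeps the flag set and receives DW_AT_prototyped.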
+    if (isPrototyped)
+      addUInt(&Buffer, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1);
+  }
+    break;
+  case dwarf::DW_TAG_structure_type:
+  case dwarf::DW_TAG_union_type:
+  case dwarf::DW_TAG_class_type: {
+    // Add elements to structure type.
+    DIArray Elements = CTy.getTypeArray();
+
+    // A forward-declared struct type may not have its elements available.
+    unsigned N = Elements.getNumElements();
+    if (N == 0)
+      break;
+
+    // Add elements to structure type.
+    for (unsigned i = 0; i < N; ++i) {
+      DIDescriptor Element = Elements.getElement(i);
+      DIE *ElemDie = NULL;
+      if (Element.isSubprogram()) {
+        DISubprogram SP(Element);
+        ElemDie = DD->createSubprogramDIE(DISubprogram(Element));
+        if (SP.isProtected())
+          addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
+                  dwarf::DW_ACCESS_protected);
+        else if (SP.isPrivate())
+          addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
+                  dwarf::DW_ACCESS_private);
+        else
+          addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
+                  dwarf::DW_ACCESS_public);
+        if (SP.isExplicit())
+          addUInt(ElemDie, dwarf::DW_AT_explicit, dwarf::DW_FORM_flag, 1);
+      }
+      else if (Element.isVariable()) {
+        DIVariable DV(Element);
+        ElemDie = new DIE(dwarf::DW_TAG_variable);
+        addString(ElemDie, dwarf::DW_AT_name, dwarf::DW_FORM_string,
+                  DV.getName());
+        addType(ElemDie, DV.getType());
+        addUInt(ElemDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
+        addUInt(ElemDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
+        addSourceLine(ElemDie, DV);
+      } else if (Element.isDerivedType())
+        ElemDie = createMemberDIE(DIDerivedType(Element));
+      else
+        continue;
+      Buffer.addChild(ElemDie);
+    }
+
+    if (CTy.isAppleBlockExtension())
+      addUInt(&Buffer, dwarf::DW_AT_APPLE_block, dwarf::DW_FORM_flag, 1);
+
+    unsigned RLang = CTy.getRunTimeLang();
+    if (RLang)
+      addUInt(&Buffer, dwarf::DW_AT_APPLE_runtime_class,
+              dwarf::DW_FORM_data1, RLang);
+
+    DICompositeType ContainingType = CTy.getContainingType();
+    if (DIDescriptor(ContainingType).isCompositeType())
+      addDIEEntry(&Buffer, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4,
+                  getOrCreateTypeDIE(DIType(ContainingType)));
+    else {
+      DIDescriptor Context = CTy.getContext();
+      addToContextOwner(&Buffer, Context);
+    }
+
+    if (CTy.isObjcClassComplete())
+      addUInt(&Buffer, dwarf::DW_AT_APPLE_objc_complete_type,
+              dwarf::DW_FORM_flag, 1);
+
+    if (Tag == dwarf::DW_TAG_class_type)
+      addTemplateParams(Buffer, CTy.getTemplateParams());
+
+    break;
+  }
+  default:
+    break;
+  }
+
+  // Add name if not anonymous or intermediate type.
+  if (!Name.empty())
+    addString(&Buffer, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
+
+  if (Tag == dwarf::DW_TAG_enumeration_type || Tag == dwarf::DW_TAG_class_type
+      || Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type)
+    {
+    // Add size if non-zero (derived types might be zero-sized.)
+    if (Size)
+      addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
+    else {
+      // Add zero size if it is not a forward declaration.
+      if (CTy.isForwardDecl())
+        addUInt(&Buffer, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
+      else
+        addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, 0);
+    }
+
+    // Add source line info if available.
+    if (!CTy.isForwardDecl())
+      addSourceLine(&Buffer, CTy);
+  }
+}
+
+/// getOrCreateTemplateTypeParameterDIE - Find existing DIE or create new DIE
+/// for the given DITemplateTypeParameter.
+DIE *
+CompileUnit::getOrCreateTemplateTypeParameterDIE(DITemplateTypeParameter TP) {
+  DIE *ParamDIE = getDIE(TP);
+  if (ParamDIE)
+    return ParamDIE;
+
+  ParamDIE = new DIE(dwarf::DW_TAG_template_type_parameter);
+  addType(ParamDIE, TP.getType());
+  addString(ParamDIE, dwarf::DW_AT_name, dwarf::DW_FORM_string, TP.getName());
+  return ParamDIE;
+}
+
+/// getOrCreateTemplateValueParameterDIE - Find existing DIE or create new DIE
+/// for the given DITemplateValueParameter.
+DIE *
+CompileUnit::getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TPV) {
+  DIE *ParamDIE = getDIE(TPV);
+  if (ParamDIE)
+    return ParamDIE;
+
+  ParamDIE = new DIE(dwarf::DW_TAG_template_value_parameter);
+  addType(ParamDIE, TPV.getType());
+  if (!TPV.getName().empty())
+    addString(ParamDIE, dwarf::DW_AT_name, dwarf::DW_FORM_string, TPV.getName());
+  addUInt(ParamDIE, dwarf::DW_AT_const_value, dwarf::DW_FORM_udata,
+          TPV.getValue());
+  return ParamDIE;
+}
+
+/// getOrCreateNameSpace - Create a DIE for DINameSpace.
+DIE *CompileUnit::getOrCreateNameSpace(DINameSpace NS) {
+  DIE *NDie = getDIE(NS);
+  if (NDie)
+    return NDie;
+  NDie = new DIE(dwarf::DW_TAG_namespace);
+  insertDIE(NS, NDie);
+  if (!NS.getName().empty())
+    addString(NDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, NS.getName());
+  addSourceLine(NDie, NS);
+  addToContextOwner(NDie, NS.getContext());
+  return NDie;
+}
+
+/// constructSubrangeDIE - Construct subrange DIE from DISubrange.
+void CompileUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy){
+  DIE *DW_Subrange = new DIE(dwarf::DW_TAG_subrange_type);
+  addDIEEntry(DW_Subrange, dwarf::DW_AT_type, dwarf::DW_FORM_ref4, IndexTy);
+  int64_t L = SR.getLo();
+  int64_t H = SR.getHi();
+
+  // The L value defines the lower bound, which is typically zero for C/C++.
+  // The H value is the upper bound. Values are 64-bit. H - L + 1 is the size
+  // of the array. If L > H, do not emit the DW_AT_lower_bound and
+  // DW_AT_upper_bound attributes. If L and H are both zero, the array has one
+  // element; in that case do not emit the lower bound.
+
+  if (L > H) {
+    Buffer.addChild(DW_Subrange);
+    return;
+  }
+  if (L)
+    addSInt(DW_Subrange, dwarf::DW_AT_lower_bound, 0, L);
+  addSInt(DW_Subrange, dwarf::DW_AT_upper_bound, 0, H);
+  Buffer.addChild(DW_Subrange);
+}
+
+/// constructArrayTypeDIE - Construct array type DIE from DICompositeType.
+void CompileUnit::constructArrayTypeDIE(DIE &Buffer,
+                                        DICompositeType *CTy) {
+  Buffer.setTag(dwarf::DW_TAG_array_type);
+  if (CTy->getTag() == dwarf::DW_TAG_vector_type)
+    addUInt(&Buffer, dwarf::DW_AT_GNU_vector, dwarf::DW_FORM_flag, 1);
+
+  // Emit derived type.
+  addType(&Buffer, CTy->getTypeDerivedFrom());
+  DIArray Elements = CTy->getTypeArray();
+
+  // Get an anonymous type for index type.
+  DIE *IdxTy = getIndexTyDie();
+  if (!IdxTy) {
+    // Construct an anonymous type for index type.
+    IdxTy = new DIE(dwarf::DW_TAG_base_type);
+    addUInt(IdxTy, dwarf::DW_AT_byte_size, 0, sizeof(int32_t));
+    addUInt(IdxTy, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
+            dwarf::DW_ATE_signed);
+    addDie(IdxTy);
+    setIndexTyDie(IdxTy);
+  }
+
+  // Add subranges to array type.
+  for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
+    DIDescriptor Element = Elements.getElement(i);
+    if (Element.getTag() == dwarf::DW_TAG_subrange_type)
+      constructSubrangeDIE(Buffer, DISubrange(Element), IdxTy);
+  }
+}
+
+/// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
+DIE *CompileUnit::constructEnumTypeDIE(DIEnumerator ETy) {
+  DIE *Enumerator = new DIE(dwarf::DW_TAG_enumerator);
+  StringRef Name = ETy.getName();
+  addString(Enumerator, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
+  int64_t Value = ETy.getEnumValue();
+  addSInt(Enumerator, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata, Value);
+  return Enumerator;
+}
+
+/// createMemberDIE - Create new member DIE.
+DIE *CompileUnit::createMemberDIE(DIDerivedType DT) {
+  DIE *MemberDie = new DIE(DT.getTag());
+  StringRef Name = DT.getName();
+  if (!Name.empty())
+    addString(MemberDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
+
+  addType(MemberDie, DT.getTypeDerivedFrom());
+
+  addSourceLine(MemberDie, DT);
+
+  DIEBlock *MemLocationDie = new (DIEValueAllocator) DIEBlock();
+  addUInt(MemLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+
+  uint64_t Size = DT.getSizeInBits();
+  uint64_t FieldSize = DT.getOriginalTypeSize();
+
+  if (Size != FieldSize) {
+    // Handle bitfield.
+    addUInt(MemberDie, dwarf::DW_AT_byte_size, 0, DT.getOriginalTypeSize()>>3);
+    addUInt(MemberDie, dwarf::DW_AT_bit_size, 0, DT.getSizeInBits());
+
+    uint64_t Offset = DT.getOffsetInBits();
+    uint64_t AlignMask = ~(DT.getAlignInBits() - 1);
+    uint64_t HiMark = (Offset + FieldSize) & AlignMask;
+    uint64_t FieldOffset = (HiMark - FieldSize);
+    Offset -= FieldOffset;
+
+    // Maybe we need to work from the other end.
+    if (Asm->getTargetData().isLittleEndian())
+      Offset = FieldSize - (Offset + Size);
+    addUInt(MemberDie, dwarf::DW_AT_bit_offset, 0, Offset);
+
+    // Here DW_AT_data_member_location points to the anonymous
+    // field that includes this bit field.
+    addUInt(MemLocationDie, 0, dwarf::DW_FORM_udata, FieldOffset >> 3);
+
+  } else
+    // This is not a bitfield.
+    addUInt(MemLocationDie, 0, dwarf::DW_FORM_udata, DT.getOffsetInBits() >> 3);
+
+  if (DT.getTag() == dwarf::DW_TAG_inheritance
+      && DT.isVirtual()) {
+
+    // For C++, virtual base classes are not at a fixed offset. Use the
+    // following expression to extract the appropriate offset from the vtable:
+    //   BaseAddr = ObAddr + *((*ObAddr) - Offset)
+
+    DIEBlock *VBaseLocationDie = new (DIEValueAllocator) DIEBlock();
+    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_dup);
+    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_udata, DT.getOffsetInBits());
+    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_minus);
+    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+
+    addBlock(MemberDie, dwarf::DW_AT_data_member_location, 0,
+             VBaseLocationDie);
+  } else
+    addBlock(MemberDie, dwarf::DW_AT_data_member_location, 0, MemLocationDie);
+
+  if (DT.isProtected())
+    addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
+            dwarf::DW_ACCESS_protected);
+  else if (DT.isPrivate())
+    addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
+            dwarf::DW_ACCESS_private);
+  // Otherwise C++ member and base classes are considered public.
+  else if (DT.getCompileUnit().getLanguage() == dwarf::DW_LANG_C_plus_plus)
+    addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
+            dwarf::DW_ACCESS_public);
+  if (DT.isVirtual())
+    addUInt(MemberDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_flag,
+            dwarf::DW_VIRTUALITY_virtual);
+
+  // Objective-C properties.
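+  // Illustrative mapping (the property declaration is hypothetical): for
+  // '@property (nonatomic, retain) NSString *name;' the code below emits
+  // DW_AT_APPLE_property_name "name", the getter/setter selector names
+  // recorded by the frontend, and a DW_AT_APPLE_property_attribute mask of
+  // DW_APPLE_PROPERTY_retain | DW_APPLE_PROPERTY_nonatomic.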
+  StringRef PropertyName = DT.getObjCPropertyName();
+  if (!PropertyName.empty()) {
+    addString(MemberDie, dwarf::DW_AT_APPLE_property_name, dwarf::DW_FORM_string,
+              PropertyName);
+    StringRef GetterName = DT.getObjCPropertyGetterName();
+    if (!GetterName.empty())
+      addString(MemberDie, dwarf::DW_AT_APPLE_property_getter,
+                dwarf::DW_FORM_string, GetterName);
+    StringRef SetterName = DT.getObjCPropertySetterName();
+    if (!SetterName.empty())
+      addString(MemberDie, dwarf::DW_AT_APPLE_property_setter,
+                dwarf::DW_FORM_string, SetterName);
+    unsigned PropertyAttributes = 0;
+    if (DT.isReadOnlyObjCProperty())
+      PropertyAttributes |= dwarf::DW_APPLE_PROPERTY_readonly;
+    if (DT.isReadWriteObjCProperty())
+      PropertyAttributes |= dwarf::DW_APPLE_PROPERTY_readwrite;
+    if (DT.isAssignObjCProperty())
+      PropertyAttributes |= dwarf::DW_APPLE_PROPERTY_assign;
+    if (DT.isRetainObjCProperty())
+      PropertyAttributes |= dwarf::DW_APPLE_PROPERTY_retain;
+    if (DT.isCopyObjCProperty())
+      PropertyAttributes |= dwarf::DW_APPLE_PROPERTY_copy;
+    if (DT.isNonAtomicObjCProperty())
+      PropertyAttributes |= dwarf::DW_APPLE_PROPERTY_nonatomic;
+    if (PropertyAttributes)
+      addUInt(MemberDie, dwarf::DW_AT_APPLE_property_attribute, 0,
+              PropertyAttributes);
+  }
+  return MemberDie;
+}
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
new file mode 100644
index 0000000..60a9b28
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -0,0 +1,280 @@
+//===-- llvm/CodeGen/DwarfCompileUnit.h - Dwarf Compile Unit ---*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing Dwarf compile units.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
+#define CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
+
+#include "DIE.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/OwningPtr.h"
+
+namespace llvm {
+
+class DwarfDebug;
+class MachineLocation;
+class MachineOperand;
+class ConstantInt;
+class DbgVariable;
+
+//===----------------------------------------------------------------------===//
+/// CompileUnit - This dwarf writer support class manages information
+/// associated with a source file.
+class CompileUnit {
+  /// ID - File identifier for source.
+  ///
+  unsigned ID;
+
+  /// CUDie - Compile unit debug information entry.
+  ///
+  const OwningPtr<DIE> CUDie;
+
+  /// Asm - Target of Dwarf emission.
+  AsmPrinter *Asm;
+
+  DwarfDebug *DD;
+
+  /// IndexTyDie - An anonymous type for index type.  Owned by CUDie.
+  DIE *IndexTyDie;
+
+  /// MDNodeToDieMap - Tracks the mapping of unit level debug information
+  /// variables to debug information entries.
+  DenseMap<const MDNode *, DIE *> MDNodeToDieMap;
+
+  /// MDNodeToDIEEntryMap - Tracks the mapping of unit level debug information
+  /// descriptors to debug information entries using a DIEEntry proxy.
+  DenseMap<const MDNode *, DIEEntry *> MDNodeToDIEEntryMap;
+
+  /// Globals - A map of globally visible named entities for this unit.
+  ///
+  StringMap<DIE*> Globals;
+
+  /// GlobalTypes - A map of globally visible types for this unit.
+  ///
+  StringMap<DIE*> GlobalTypes;
+
+  /// DIEBlocks - A list of all the DIEBlocks in use.
+  std::vector<DIEBlock *> DIEBlocks;
+
+public:
+  CompileUnit(unsigned I, DIE *D, AsmPrinter *A, DwarfDebug *DW);
+  ~CompileUnit();
+
+  // Accessors.
+  unsigned getID() const { return ID; }
+  DIE* getCUDie() const { return CUDie.get(); }
+  const StringMap<DIE*> &getGlobals() const { return Globals; }
+  const StringMap<DIE*> &getGlobalTypes() const { return GlobalTypes; }
+
+  /// hasContent - Return true if this compile unit has something to write out.
+  ///
+  bool hasContent() const { return !CUDie->getChildren().empty(); }
+
+  /// addGlobal - Add a new global entity to the compile unit.
+  ///
+  void addGlobal(StringRef Name, DIE *Die) { Globals[Name] = Die; }
+
+  /// addGlobalType - Add a new global type to the compile unit.
+  ///
+  void addGlobalType(DIType Ty);
+
+  /// getDIE - Returns the debug information entry map slot for the
+  /// specified debug variable.
+  DIE *getDIE(const MDNode *N) { return MDNodeToDieMap.lookup(N); }
+
+  DIEBlock *getDIEBlock() {
+    return new (DIEValueAllocator) DIEBlock();
+  }
+
+  /// insertDIE - Insert DIE into the map.
+  void insertDIE(const MDNode *N, DIE *D) {
+    MDNodeToDieMap.insert(std::make_pair(N, D));
+  }
+
+  /// getDIEEntry - Returns the debug information entry for the specified
+  /// debug variable.
+  DIEEntry *getDIEEntry(const MDNode *N) {
+    DenseMap<const MDNode *, DIEEntry *>::iterator I =
+      MDNodeToDIEEntryMap.find(N);
+    if (I == MDNodeToDIEEntryMap.end())
+      return NULL;
+    return I->second;
+  }
+
+  /// insertDIEEntry - Insert debug information entry into the map.
+  void insertDIEEntry(const MDNode *N, DIEEntry *E) {
+    MDNodeToDIEEntryMap.insert(std::make_pair(N, E));
+  }
+
+  /// addDie - Adds or interns the DIE to the compile unit.
+  ///
+  void addDie(DIE *Buffer) {
+    this->CUDie->addChild(Buffer);
+  }
+
+  // getIndexTyDie - Get an anonymous type for index type.
+  DIE *getIndexTyDie() {
+    return IndexTyDie;
+  }
+
+  // setIndexTyDie - Set D as anonymous type for index which can be reused
+  // later.
+  void setIndexTyDie(DIE *D) {
+    IndexTyDie = D;
+  }
+public:
+
+  /// addUInt - Add an unsigned integer attribute data and value.
+  ///
+  void addUInt(DIE *Die, unsigned Attribute, unsigned Form, uint64_t Integer);
+
+  /// addSInt - Add a signed integer attribute data and value.
+  ///
+  void addSInt(DIE *Die, unsigned Attribute, unsigned Form, int64_t Integer);
+
+  /// addString - Add a string attribute data and value.
+  ///
+  void addString(DIE *Die, unsigned Attribute, unsigned Form,
+                 const StringRef Str);
+
+  /// addLabel - Add a Dwarf label attribute data and value.
+  ///
+  void addLabel(DIE *Die, unsigned Attribute, unsigned Form,
+                const MCSymbol *Label);
+
+  /// addDelta - Add a label delta attribute data and value.
+  ///
+  void addDelta(DIE *Die, unsigned Attribute, unsigned Form,
+                const MCSymbol *Hi, const MCSymbol *Lo);
+
+  /// addDIEEntry - Add a DIE attribute data and value.
+  ///
+  void addDIEEntry(DIE *Die, unsigned Attribute, unsigned Form, DIE *Entry);
+
+  /// addBlock - Add block data.
+  ///
+  void addBlock(DIE *Die, unsigned Attribute, unsigned Form, DIEBlock *Block);
+
+  /// addSourceLine - Add location information to specified debug information
+  /// entry.
+  void addSourceLine(DIE *Die, DIVariable V);
+  void addSourceLine(DIE *Die, DIGlobalVariable G);
+  void addSourceLine(DIE *Die, DISubprogram SP);
+  void addSourceLine(DIE *Die, DIType Ty);
+  void addSourceLine(DIE *Die, DINameSpace NS);
+
+  /// addAddress - Add an address attribute to a die based on the location
+  /// provided.
+ void addAddress(DIE *Die, unsigned Attribute, + const MachineLocation &Location); + + /// addConstantValue - Add constant value entry in variable DIE. + bool addConstantValue(DIE *Die, const MachineOperand &MO, DIType Ty); + bool addConstantValue(DIE *Die, ConstantInt *CI, bool Unsigned); + + /// addConstantFPValue - Add constant value entry in variable DIE. + bool addConstantFPValue(DIE *Die, const MachineOperand &MO); + + /// addTemplateParams - Add template parameters in buffer. + void addTemplateParams(DIE &Buffer, DIArray TParams); + + /// addRegisterOp - Add register operand. + void addRegisterOp(DIE *TheDie, unsigned Reg); + + /// addRegisterOffset - Add register offset. + void addRegisterOffset(DIE *TheDie, unsigned Reg, int64_t Offset); + + /// addComplexAddress - Start with the address based on the location provided, + /// and generate the DWARF information necessary to find the actual variable + /// (navigating the extra location information encoded in the type) based on + /// the starting location. Add the DWARF information to the die. + /// + void addComplexAddress(DbgVariable *&DV, DIE *Die, unsigned Attribute, + const MachineLocation &Location); + + // FIXME: Should be reformulated in terms of addComplexAddress. + /// addBlockByrefAddress - Start with the address based on the location + /// provided, and generate the DWARF information necessary to find the + /// actual Block variable (navigating the Block struct) based on the + /// starting location. Add the DWARF information to the die. Obsolete, + /// please use addComplexAddress instead. + /// + void addBlockByrefAddress(DbgVariable *&DV, DIE *Die, unsigned Attribute, + const MachineLocation &Location); + + /// addVariableAddress - Add DW_AT_location attribute for a + /// DbgVariable based on provided MachineLocation. + void addVariableAddress(DbgVariable *&DV, DIE *Die, MachineLocation Location); + + /// addToContextOwner - Add Die into the list of its context owner's children. + void addToContextOwner(DIE *Die, DIDescriptor Context); + + /// addType - Add a new type attribute to the specified entity. + void addType(DIE *Entity, DIType Ty); + + /// getOrCreateNameSpace - Create a DIE for DINameSpace. + DIE *getOrCreateNameSpace(DINameSpace NS); + + /// getOrCreateTypeDIE - Find existing DIE or create new DIE for the + /// given DIType. + DIE *getOrCreateTypeDIE(DIType Ty); + + /// getOrCreateTemplateTypeParameterDIE - Find existing DIE or create new DIE + /// for the given DITemplateTypeParameter. + DIE *getOrCreateTemplateTypeParameterDIE(DITemplateTypeParameter TP); + + /// getOrCreateTemplateValueParameterDIE - Find existing DIE or create new DIE + /// for the given DITemplateValueParameter. + DIE *getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TVP); + + /// createDIEEntry - Creates a new DIEEntry to be a proxy for a debug + /// information entry. + DIEEntry *createDIEEntry(DIE *Entry); + + void addPubTypes(DISubprogram SP); + + /// constructTypeDIE - Construct basic type die from DIBasicType. + void constructTypeDIE(DIE &Buffer, + DIBasicType BTy); + + /// constructTypeDIE - Construct derived type die from DIDerivedType. + void constructTypeDIE(DIE &Buffer, + DIDerivedType DTy); + + /// constructTypeDIE - Construct type DIE from DICompositeType. + void constructTypeDIE(DIE &Buffer, + DICompositeType CTy); + + /// constructSubrangeDIE - Construct subrange DIE from DISubrange. 
+ void constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy); + + /// constructArrayTypeDIE - Construct array type DIE from DICompositeType. + void constructArrayTypeDIE(DIE &Buffer, + DICompositeType *CTy); + + /// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator. + DIE *constructEnumTypeDIE(DIEnumerator ETy); + + /// createMemberDIE - Create new member DIE. + DIE *createMemberDIE(DIDerivedType DT); + +private: + + // DIEValueAllocator - All DIEValues are allocated through this allocator. + BumpPtrAllocator DIEValueAllocator; + DIEInteger *DIEIntegerOne; +}; + +} // end llvm namespace +#endif diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index bad87c1..8845bfa 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -14,6 +14,7 @@ #define DEBUG_TYPE "dwarfdebug" #include "DwarfDebug.h" #include "DIE.h" +#include "DwarfCompileUnit.h" #include "llvm/Constants.h" #include "llvm/Module.h" #include "llvm/Instructions.h" @@ -52,13 +53,9 @@ static cl::opt<bool> DisableDebugInfoPrinting("disable-debug-info-print", cl::desc("Disable debug info printing")); static cl::opt<bool> UnknownLocations("use-unknown-locations", cl::Hidden, - cl::desc("Make an absense of debug location information explicit."), + cl::desc("Make an absence of debug location information explicit."), cl::init(false)); -#ifndef NDEBUG -STATISTIC(BlocksWithoutLineNo, "Number of blocks without any line number"); -#endif - namespace { const char *DWARFGroupName = "DWARF Emission"; const char *DbgTimerName = "DWARF Debug Writer"; @@ -72,189 +69,56 @@ static const unsigned InitAbbreviationsSetSize = 9; // log2(512) namespace llvm { -//===----------------------------------------------------------------------===// -/// CompileUnit - This dwarf writer support class manages information associate -/// with a source file. -class CompileUnit { - /// ID - File identifier for source. - /// - unsigned ID; - - /// Die - Compile unit debug information entry. - /// - const OwningPtr<DIE> CUDie; - - /// IndexTyDie - An anonymous type for index type. Owned by CUDie. - DIE *IndexTyDie; - - /// MDNodeToDieMap - Tracks the mapping of unit level debug informaton - /// variables to debug information entries. - DenseMap<const MDNode *, DIE *> MDNodeToDieMap; - - /// MDNodeToDIEEntryMap - Tracks the mapping of unit level debug informaton - /// descriptors to debug information entries using a DIEEntry proxy. - DenseMap<const MDNode *, DIEEntry *> MDNodeToDIEEntryMap; - - /// Globals - A map of globally visible named entities for this unit. - /// - StringMap<DIE*> Globals; - - /// GlobalTypes - A map of globally visible types for this unit. - /// - StringMap<DIE*> GlobalTypes; - -public: - CompileUnit(unsigned I, DIE *D) - : ID(I), CUDie(D), IndexTyDie(0) {} - - // Accessors. - unsigned getID() const { return ID; } - DIE* getCUDie() const { return CUDie.get(); } - const StringMap<DIE*> &getGlobals() const { return Globals; } - const StringMap<DIE*> &getGlobalTypes() const { return GlobalTypes; } - - /// hasContent - Return true if this compile unit has something to write out. - /// - bool hasContent() const { return !CUDie->getChildren().empty(); } - - /// addGlobal - Add a new global entity to the compile unit. - /// - void addGlobal(StringRef Name, DIE *Die) { Globals[Name] = Die; } - - /// addGlobalType - Add a new global type to the compile unit. 
- /// - void addGlobalType(StringRef Name, DIE *Die) { - GlobalTypes[Name] = Die; - } - - /// getDIE - Returns the debug information entry map slot for the - /// specified debug variable. - DIE *getDIE(const MDNode *N) { return MDNodeToDieMap.lookup(N); } - - /// insertDIE - Insert DIE into the map. - void insertDIE(const MDNode *N, DIE *D) { - MDNodeToDieMap.insert(std::make_pair(N, D)); - } - - /// getDIEEntry - Returns the debug information entry for the speciefied - /// debug variable. - DIEEntry *getDIEEntry(const MDNode *N) { - DenseMap<const MDNode *, DIEEntry *>::iterator I = - MDNodeToDIEEntryMap.find(N); - if (I == MDNodeToDIEEntryMap.end()) - return NULL; - return I->second; - } - - /// insertDIEEntry - Insert debug information entry into the map. - void insertDIEEntry(const MDNode *N, DIEEntry *E) { - MDNodeToDIEEntryMap.insert(std::make_pair(N, E)); - } - - /// addDie - Adds or interns the DIE to the compile unit. - /// - void addDie(DIE *Buffer) { - this->CUDie->addChild(Buffer); - } - - // getIndexTyDie - Get an anonymous type for index type. - DIE *getIndexTyDie() { - return IndexTyDie; - } - - // setIndexTyDie - Set D as anonymous type for index which can be reused - // later. - void setIndexTyDie(DIE *D) { - IndexTyDie = D; - } - -}; - -//===----------------------------------------------------------------------===// -/// DbgVariable - This class is used to track local variable information. -/// -class DbgVariable { - DIVariable Var; // Variable Descriptor. - DIE *TheDIE; // Variable DIE. - unsigned DotDebugLocOffset; // Offset in DotDebugLocEntries. -public: - // AbsVar may be NULL. - DbgVariable(DIVariable V) : Var(V), TheDIE(0), DotDebugLocOffset(~0U) {} - - // Accessors. - DIVariable getVariable() const { return Var; } - void setDIE(DIE *D) { TheDIE = D; } - DIE *getDIE() const { return TheDIE; } - void setDotDebugLocOffset(unsigned O) { DotDebugLocOffset = O; } - unsigned getDotDebugLocOffset() const { return DotDebugLocOffset; } - StringRef getName() const { return Var.getName(); } - unsigned getTag() const { return Var.getTag(); } - bool variableHasComplexAddress() const { - assert(Var.Verify() && "Invalid complex DbgVariable!"); - return Var.hasComplexAddress(); - } - bool isBlockByrefVariable() const { - assert(Var.Verify() && "Invalid complex DbgVariable!"); - return Var.isBlockByrefVariable(); - } - unsigned getNumAddrElements() const { - assert(Var.Verify() && "Invalid complex DbgVariable!"); - return Var.getNumAddrElements(); - } - uint64_t getAddrElement(unsigned i) const { - return Var.getAddrElement(i); - } - DIType getType() const { - DIType Ty = Var.getType(); - // FIXME: isBlockByrefVariable should be reformulated in terms of complex - // addresses instead. - if (Var.isBlockByrefVariable()) { - /* Byref variables, in Blocks, are declared by the programmer as - "SomeType VarName;", but the compiler creates a - __Block_byref_x_VarName struct, and gives the variable VarName - either the struct, or a pointer to the struct, as its type. This - is necessary for various behind-the-scenes things the compiler - needs to do with by-reference variables in blocks. - - However, as far as the original *programmer* is concerned, the - variable should still have type 'SomeType', as originally declared. - - The following function dives into the __Block_byref_x_VarName - struct to find the original type of the variable. This will be - passed back to the code generating the type for the Debug - Information Entry for the variable 'VarName'. 
'VarName' will then - have the original type 'SomeType' in its debug information. - - The original type 'SomeType' will be the type of the field named - 'VarName' inside the __Block_byref_x_VarName struct. - - NOTE: In order for this to not completely fail on the debugger - side, the Debug Information Entry for the variable VarName needs to - have a DW_AT_location that tells the debugger how to unwind through - the pointers and __Block_byref_x_VarName struct to find the actual - value of the variable. The function addBlockByrefType does this. */ - DIType subType = Ty; - unsigned tag = Ty.getTag(); - - if (tag == dwarf::DW_TAG_pointer_type) { - DIDerivedType DTy = DIDerivedType(Ty); - subType = DTy.getTypeDerivedFrom(); - } - - DICompositeType blockStruct = DICompositeType(subType); - DIArray Elements = blockStruct.getTypeArray(); - - for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) { - DIDescriptor Element = Elements.getElement(i); - DIDerivedType DT = DIDerivedType(Element); - if (getName() == DT.getName()) - return (DT.getTypeDerivedFrom()); - } - return Ty; +DIType DbgVariable::getType() const { + DIType Ty = Var.getType(); + // FIXME: isBlockByrefVariable should be reformulated in terms of complex + // addresses instead. + if (Var.isBlockByrefVariable()) { + /* Byref variables, in Blocks, are declared by the programmer as + "SomeType VarName;", but the compiler creates a + __Block_byref_x_VarName struct, and gives the variable VarName + either the struct, or a pointer to the struct, as its type. This + is necessary for various behind-the-scenes things the compiler + needs to do with by-reference variables in blocks. + + However, as far as the original *programmer* is concerned, the + variable should still have type 'SomeType', as originally declared. + + The following function dives into the __Block_byref_x_VarName + struct to find the original type of the variable. This will be + passed back to the code generating the type for the Debug + Information Entry for the variable 'VarName'. 'VarName' will then + have the original type 'SomeType' in its debug information. + + The original type 'SomeType' will be the type of the field named + 'VarName' inside the __Block_byref_x_VarName struct. + + NOTE: In order for this to not completely fail on the debugger + side, the Debug Information Entry for the variable VarName needs to + have a DW_AT_location that tells the debugger how to unwind through + the pointers and __Block_byref_x_VarName struct to find the actual + value of the variable. The function addBlockByrefType does this. 
*/ + DIType subType = Ty; + unsigned tag = Ty.getTag(); + + if (tag == dwarf::DW_TAG_pointer_type) { + DIDerivedType DTy = DIDerivedType(Ty); + subType = DTy.getTypeDerivedFrom(); + } + + DICompositeType blockStruct = DICompositeType(subType); + DIArray Elements = blockStruct.getTypeArray(); + + for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) { + DIDescriptor Element = Elements.getElement(i); + DIDerivedType DT = DIDerivedType(Element); + if (getName() == DT.getName()) + return (DT.getTypeDerivedFrom()); } return Ty; } -}; + return Ty; +} //===----------------------------------------------------------------------===// /// DbgRange - This is used to track range of instructions with identical @@ -392,19 +256,16 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) CurrentFnDbgScope(0), PrevLabel(NULL) { NextStringPoolNumber = 0; - DwarfFrameSectionSym = DwarfInfoSectionSym = DwarfAbbrevSectionSym = 0; + DwarfInfoSectionSym = DwarfAbbrevSectionSym = 0; DwarfStrSectionSym = TextSectionSym = 0; DwarfDebugRangeSectionSym = DwarfDebugLocSectionSym = 0; FunctionBeginSym = FunctionEndSym = 0; - DIEIntegerOne = new (DIEValueAllocator) DIEInteger(1); { NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled); beginModule(M); } } DwarfDebug::~DwarfDebug() { - for (unsigned j = 0, M = DIEBlocks.size(); j < M; ++j) - DIEBlocks[j]->~DIEBlock(); } MCSymbol *DwarfDebug::getStringPoolEntry(StringRef Str) { @@ -439,858 +300,6 @@ void DwarfDebug::assignAbbrevNumber(DIEAbbrev &Abbrev) { } } -/// createDIEEntry - Creates a new DIEEntry to be a proxy for a debug -/// information entry. -DIEEntry *DwarfDebug::createDIEEntry(DIE *Entry) { - DIEEntry *Value = new (DIEValueAllocator) DIEEntry(Entry); - return Value; -} - -/// addUInt - Add an unsigned integer attribute data and value. -/// -void DwarfDebug::addUInt(DIE *Die, unsigned Attribute, - unsigned Form, uint64_t Integer) { - if (!Form) Form = DIEInteger::BestForm(false, Integer); - DIEValue *Value = Integer == 1 ? - DIEIntegerOne : new (DIEValueAllocator) DIEInteger(Integer); - Die->addValue(Attribute, Form, Value); -} - -/// addSInt - Add an signed integer attribute data and value. -/// -void DwarfDebug::addSInt(DIE *Die, unsigned Attribute, - unsigned Form, int64_t Integer) { - if (!Form) Form = DIEInteger::BestForm(true, Integer); - DIEValue *Value = new (DIEValueAllocator) DIEInteger(Integer); - Die->addValue(Attribute, Form, Value); -} - -/// addString - Add a string attribute data and value. DIEString only -/// keeps string reference. -void DwarfDebug::addString(DIE *Die, unsigned Attribute, unsigned Form, - StringRef String) { - DIEValue *Value = new (DIEValueAllocator) DIEString(String); - Die->addValue(Attribute, Form, Value); -} - -/// addLabel - Add a Dwarf label attribute data and value. -/// -void DwarfDebug::addLabel(DIE *Die, unsigned Attribute, unsigned Form, - const MCSymbol *Label) { - DIEValue *Value = new (DIEValueAllocator) DIELabel(Label); - Die->addValue(Attribute, Form, Value); -} - -/// addDelta - Add a label delta attribute data and value. -/// -void DwarfDebug::addDelta(DIE *Die, unsigned Attribute, unsigned Form, - const MCSymbol *Hi, const MCSymbol *Lo) { - DIEValue *Value = new (DIEValueAllocator) DIEDelta(Hi, Lo); - Die->addValue(Attribute, Form, Value); -} - -/// addDIEEntry - Add a DIE attribute data and value. -/// -void DwarfDebug::addDIEEntry(DIE *Die, unsigned Attribute, unsigned Form, - DIE *Entry) { - Die->addValue(Attribute, Form, createDIEEntry(Entry)); -} - - -/// addBlock - Add block data. 
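The attribute helpers above (addUInt, addSInt, addString, addLabel, addDelta, addDIEEntry) all share one shape: pick a DWARF form if the caller passed none, wrap the payload in a pool-allocated DIEValue, and append it to the DIE. A minimal standalone sketch of that shape, using hypothetical stand-in types rather than the LLVM classes:

    #include <cstdint>
    #include <vector>

    // Hypothetical stand-ins for DIE/DIEValue, not the LLVM classes.
    struct Value { unsigned Attribute, Form; uint64_t Payload; };
    struct Die   { std::vector<Value> Values; };

    // Pick the smallest unsigned form that holds Integer, in the spirit
    // of DIEInteger::BestForm (forms abbreviated here to byte widths).
    unsigned bestUnsignedForm(uint64_t Integer) {
      if (Integer <= 0xff)       return 1;  // DW_FORM_data1
      if (Integer <= 0xffff)     return 2;  // DW_FORM_data2
      if (Integer <= 0xffffffff) return 4;  // DW_FORM_data4
      return 8;                             // DW_FORM_data8
    }

    // Shared shape of the add* helpers: default the form, wrap the
    // payload, and append it to the DIE's value list.
    void addUInt(Die &D, unsigned Attribute, unsigned Form, uint64_t Integer) {
      if (!Form) Form = bestUnsignedForm(Integer);
      D.Values.push_back(Value{Attribute, Form, Integer});
    }

Every other helper differs only in the payload it wraps (a string, a symbol, a symbol difference, or a reference to another DIE).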
-/// -void DwarfDebug::addBlock(DIE *Die, unsigned Attribute, unsigned Form, - DIEBlock *Block) { - Block->ComputeSize(Asm); - DIEBlocks.push_back(Block); // Memoize so we can call the destructor later on. - Die->addValue(Attribute, Block->BestForm(), Block); -} - -/// addSourceLine - Add location information to specified debug information -/// entry. -void DwarfDebug::addSourceLine(DIE *Die, DIVariable V) { - // Verify variable. - if (!V.Verify()) - return; - - unsigned Line = V.getLineNumber(); - if (Line == 0) - return; - unsigned FileID = GetOrCreateSourceID(V.getContext().getFilename(), - V.getContext().getDirectory()); - assert(FileID && "Invalid file id"); - addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID); - addUInt(Die, dwarf::DW_AT_decl_line, 0, Line); -} - -/// addSourceLine - Add location information to specified debug information -/// entry. -void DwarfDebug::addSourceLine(DIE *Die, DIGlobalVariable G) { - // Verify global variable. - if (!G.Verify()) - return; - - unsigned Line = G.getLineNumber(); - if (Line == 0) - return; - unsigned FileID = GetOrCreateSourceID(G.getContext().getFilename(), - G.getContext().getDirectory()); - assert(FileID && "Invalid file id"); - addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID); - addUInt(Die, dwarf::DW_AT_decl_line, 0, Line); -} - -/// addSourceLine - Add location information to specified debug information -/// entry. -void DwarfDebug::addSourceLine(DIE *Die, DISubprogram SP) { - // Verify subprogram. - if (!SP.Verify()) - return; - // If the line number is 0, don't add it. - if (SP.getLineNumber() == 0) - return; - - unsigned Line = SP.getLineNumber(); - if (!SP.getContext().Verify()) - return; - unsigned FileID = GetOrCreateSourceID(SP.getFilename(), SP.getDirectory()); - assert(FileID && "Invalid file id"); - addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID); - addUInt(Die, dwarf::DW_AT_decl_line, 0, Line); -} - -/// addSourceLine - Add location information to specified debug information -/// entry. -void DwarfDebug::addSourceLine(DIE *Die, DIType Ty) { - // Verify type. - if (!Ty.Verify()) - return; - - unsigned Line = Ty.getLineNumber(); - if (Line == 0 || !Ty.getContext().Verify()) - return; - unsigned FileID = GetOrCreateSourceID(Ty.getFilename(), Ty.getDirectory()); - assert(FileID && "Invalid file id"); - addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID); - addUInt(Die, dwarf::DW_AT_decl_line, 0, Line); -} - -/// addSourceLine - Add location information to specified debug information -/// entry. -void DwarfDebug::addSourceLine(DIE *Die, DINameSpace NS) { - // Verify namespace. - if (!NS.Verify()) - return; - - unsigned Line = NS.getLineNumber(); - if (Line == 0) - return; - StringRef FN = NS.getFilename(); - - unsigned FileID = GetOrCreateSourceID(FN, NS.getDirectory()); - assert(FileID && "Invalid file id"); - addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID); - addUInt(Die, dwarf::DW_AT_decl_line, 0, Line); -} - -/// addVariableAddress - Add DW_AT_location attribute for a DbgVariable based -/// on provided frame index. 
-void DwarfDebug::addVariableAddress(DbgVariable *&DV, DIE *Die, int64_t FI) { - MachineLocation Location; - unsigned FrameReg; - const TargetFrameLowering *TFI = Asm->TM.getFrameLowering(); - int Offset = TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg); - Location.set(FrameReg, Offset); - - if (DV->variableHasComplexAddress()) - addComplexAddress(DV, Die, dwarf::DW_AT_location, Location); - else if (DV->isBlockByrefVariable()) - addBlockByrefAddress(DV, Die, dwarf::DW_AT_location, Location); - else - addAddress(Die, dwarf::DW_AT_location, Location); -} - -/// addComplexAddress - Start with the address based on the location provided, -/// and generate the DWARF information necessary to find the actual variable -/// given the extra address information encoded in the DIVariable, starting from -/// the starting location. Add the DWARF information to the die. -/// -void DwarfDebug::addComplexAddress(DbgVariable *&DV, DIE *Die, - unsigned Attribute, - const MachineLocation &Location) { - DIType Ty = DV->getType(); - - // Decode the original location, and use that as the start of the byref - // variable's location. - const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo(); - unsigned Reg = RI->getDwarfRegNum(Location.getReg(), false); - DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); - - if (Location.isReg()) { - if (Reg < 32) { - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + Reg); - } else { - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_regx); - addUInt(Block, 0, dwarf::DW_FORM_udata, Reg); - } - } else { - if (Reg < 32) - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + Reg); - else { - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx); - addUInt(Block, 0, dwarf::DW_FORM_udata, Reg); - } - - addUInt(Block, 0, dwarf::DW_FORM_sdata, Location.getOffset()); - } - - for (unsigned i = 0, N = DV->getNumAddrElements(); i < N; ++i) { - uint64_t Element = DV->getAddrElement(i); - - if (Element == DIBuilder::OpPlus) { - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst); - addUInt(Block, 0, dwarf::DW_FORM_udata, DV->getAddrElement(++i)); - } else if (Element == DIBuilder::OpDeref) { - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref); - } else llvm_unreachable("unknown DIBuilder Opcode"); - } - - // Now attach the location information to the DIE. - addBlock(Die, Attribute, 0, Block); -} - -/* Byref variables, in Blocks, are declared by the programmer as "SomeType - VarName;", but the compiler creates a __Block_byref_x_VarName struct, and - gives the variable VarName either the struct, or a pointer to the struct, as - its type. This is necessary for various behind-the-scenes things the - compiler needs to do with by-reference variables in Blocks. - - However, as far as the original *programmer* is concerned, the variable - should still have type 'SomeType', as originally declared. - - The function getBlockByrefType dives into the __Block_byref_x_VarName - struct to find the original type of the variable, which is then assigned to - the variable's Debug Information Entry as its real type. So far, so good. - However now the debugger will expect the variable VarName to have the type - SomeType. So we need the location attribute for the variable to be an - expression that explains to the debugger how to navigate through the - pointers and struct to find the actual variable of type SomeType. - - The following function does just that. We start by getting - the "normal" location for the variable. 
This will be the location - of either the struct __Block_byref_x_VarName or the pointer to the - struct __Block_byref_x_VarName. - - The struct will look something like: - - struct __Block_byref_x_VarName { - ... <various fields> - struct __Block_byref_x_VarName *forwarding; - ... <various other fields> - SomeType VarName; - ... <maybe more fields> - }; - - If we are given the struct directly (as our starting point) we - need to tell the debugger to: - - 1). Add the offset of the forwarding field. - - 2). Follow that pointer to get the real __Block_byref_x_VarName - struct to use (the real one may have been copied onto the heap). - - 3). Add the offset for the field VarName, to find the actual variable. - - If we started with a pointer to the struct, then we need to - dereference that pointer first, before the other steps. - Translating this into DWARF ops, we will need to append the following - to the current location description for the variable: - - DW_OP_deref -- optional, if we start with a pointer - DW_OP_plus_uconst <forward_fld_offset> - DW_OP_deref - DW_OP_plus_uconst <varName_fld_offset> - - That is what this function does. */ - -/// addBlockByrefAddress - Start with the address based on the location -/// provided, and generate the DWARF information necessary to find the -/// actual Block variable (navigating the Block struct) based on the -/// starting location. Add the DWARF information to the die. For -/// more information, read large comment just above here. -/// -void DwarfDebug::addBlockByrefAddress(DbgVariable *&DV, DIE *Die, - unsigned Attribute, - const MachineLocation &Location) { - DIType Ty = DV->getType(); - DIType TmpTy = Ty; - unsigned Tag = Ty.getTag(); - bool isPointer = false; - - StringRef varName = DV->getName(); - - if (Tag == dwarf::DW_TAG_pointer_type) { - DIDerivedType DTy = DIDerivedType(Ty); - TmpTy = DTy.getTypeDerivedFrom(); - isPointer = true; - } - - DICompositeType blockStruct = DICompositeType(TmpTy); - - // Find the __forwarding field and the variable field in the __Block_byref - // struct. - DIArray Fields = blockStruct.getTypeArray(); - DIDescriptor varField = DIDescriptor(); - DIDescriptor forwardingField = DIDescriptor(); - - for (unsigned i = 0, N = Fields.getNumElements(); i < N; ++i) { - DIDescriptor Element = Fields.getElement(i); - DIDerivedType DT = DIDerivedType(Element); - StringRef fieldName = DT.getName(); - if (fieldName == "__forwarding") - forwardingField = Element; - else if (fieldName == varName) - varField = Element; - } - - // Get the offsets for the forwarding field and the variable field. - unsigned forwardingFieldOffset = - DIDerivedType(forwardingField).getOffsetInBits() >> 3; - unsigned varFieldOffset = - DIDerivedType(varField).getOffsetInBits() >> 3; - - // Decode the original location, and use that as the start of the byref - // variable's location. 
- const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo(); - unsigned Reg = RI->getDwarfRegNum(Location.getReg(), false); - DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); - - if (Location.isReg()) { - if (Reg < 32) - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + Reg); - else { - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_regx); - addUInt(Block, 0, dwarf::DW_FORM_udata, Reg); - } - } else { - if (Reg < 32) - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + Reg); - else { - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx); - addUInt(Block, 0, dwarf::DW_FORM_udata, Reg); - } - - addUInt(Block, 0, dwarf::DW_FORM_sdata, Location.getOffset()); - } - - // If we started with a pointer to the __Block_byref... struct, then - // the first thing we need to do is dereference the pointer (DW_OP_deref). - if (isPointer) - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref); - - // Next add the offset for the '__forwarding' field: - // DW_OP_plus_uconst ForwardingFieldOffset. Note there's no point in - // adding the offset if it's 0. - if (forwardingFieldOffset > 0) { - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst); - addUInt(Block, 0, dwarf::DW_FORM_udata, forwardingFieldOffset); - } - - // Now dereference the __forwarding field to get to the real __Block_byref - // struct: DW_OP_deref. - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref); - - // Now that we've got the real __Block_byref... struct, add the offset - // for the variable's field to get to the location of the actual variable: - // DW_OP_plus_uconst varFieldOffset. Again, don't add if it's 0. - if (varFieldOffset > 0) { - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst); - addUInt(Block, 0, dwarf::DW_FORM_udata, varFieldOffset); - } - - // Now attach the location information to the DIE. - addBlock(Die, Attribute, 0, Block); -} - -/// addAddress - Add an address attribute to a die based on the location -/// provided. -void DwarfDebug::addAddress(DIE *Die, unsigned Attribute, - const MachineLocation &Location) { - const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo(); - unsigned Reg = RI->getDwarfRegNum(Location.getReg(), false); - DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); - - if (RI->getFrameRegister(*Asm->MF) == Location.getReg() - && Location.getOffset()) { - // If variable offset is based in frame register then use fbreg. - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_fbreg); - addSInt(Block, 0, dwarf::DW_FORM_sdata, Location.getOffset()); - addBlock(Die, Attribute, 0, Block); - return; - } - - if (Location.isReg()) { - if (Reg < 32) { - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + Reg); - } else { - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_regx); - addUInt(Block, 0, dwarf::DW_FORM_udata, Reg); - } - } else { - if (Reg < 32) { - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + Reg); - } else { - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx); - addUInt(Block, 0, dwarf::DW_FORM_udata, Reg); - } - - addUInt(Block, 0, dwarf::DW_FORM_sdata, Location.getOffset()); - } - - addBlock(Die, Attribute, 0, Block); -} - -/// addRegisterAddress - Add register location entry in variable DIE. 
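The register/memory encoding that addAddress, addComplexAddress, and addBlockByrefAddress all repeat follows one DWARF rule: registers 0-31 get a compact one-byte opcode, higher register numbers use the "x" opcode plus an operand, and memory locations use the breg family with a signed offset appended. A standalone sketch of just that selection logic (encodeLocation is an illustrative name, not an LLVM function; the opcode values are the standard DWARF ones):

    #include <cstdint>
    #include <vector>

    enum { DW_OP_reg0 = 0x50, DW_OP_breg0 = 0x70,
           DW_OP_regx = 0x90, DW_OP_bregx = 0x92 };

    // Registers 0-31 encode as a single byte; anything higher takes the
    // "x" opcode followed by the register number. Memory locations use
    // the breg family and append a signed offset from the register.
    std::vector<uint64_t> encodeLocation(unsigned Reg, bool InReg,
                                         int64_t Offset) {
      std::vector<uint64_t> Ops;
      if (InReg) {
        if (Reg < 32) Ops.push_back(DW_OP_reg0 + Reg);
        else { Ops.push_back(DW_OP_regx); Ops.push_back(Reg); }
      } else {
        if (Reg < 32) Ops.push_back(DW_OP_breg0 + Reg);
        else { Ops.push_back(DW_OP_bregx); Ops.push_back(Reg); }
        Ops.push_back(static_cast<uint64_t>(Offset));
      }
      return Ops;
    }

This is why the Reg < 32 test shows up in each of the location-building functions in this file.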
-bool DwarfDebug::addRegisterAddress(DIE *Die, const MachineOperand &MO) { - assert (MO.isReg() && "Invalid machine operand!"); - if (!MO.getReg()) - return false; - MachineLocation Location; - Location.set(MO.getReg()); - addAddress(Die, dwarf::DW_AT_location, Location); - return true; -} - -/// addConstantValue - Add constant value entry in variable DIE. -bool DwarfDebug::addConstantValue(DIE *Die, const MachineOperand &MO) { - assert (MO.isImm() && "Invalid machine operand!"); - DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); - unsigned Imm = MO.getImm(); - addUInt(Block, 0, dwarf::DW_FORM_udata, Imm); - addBlock(Die, dwarf::DW_AT_const_value, 0, Block); - return true; -} - -/// addConstantFPValue - Add constant value entry in variable DIE. -bool DwarfDebug::addConstantFPValue(DIE *Die, const MachineOperand &MO) { - assert (MO.isFPImm() && "Invalid machine operand!"); - DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); - APFloat FPImm = MO.getFPImm()->getValueAPF(); - - // Get the raw data form of the floating point. - const APInt FltVal = FPImm.bitcastToAPInt(); - const char *FltPtr = (const char*)FltVal.getRawData(); - - int NumBytes = FltVal.getBitWidth() / 8; // 8 bits per byte. - bool LittleEndian = Asm->getTargetData().isLittleEndian(); - int Incr = (LittleEndian ? 1 : -1); - int Start = (LittleEndian ? 0 : NumBytes - 1); - int Stop = (LittleEndian ? NumBytes : -1); - - // Output the constant to DWARF one byte at a time. - for (; Start != Stop; Start += Incr) - addUInt(Block, 0, dwarf::DW_FORM_data1, - (unsigned char)0xFF & FltPtr[Start]); - - addBlock(Die, dwarf::DW_AT_const_value, 0, Block); - return true; -} - -/// addConstantValue - Add constant value entry in variable DIE. -bool DwarfDebug::addConstantValue(DIE *Die, ConstantInt *CI, - bool Unsigned) { - if (CI->getBitWidth() <= 64) { - if (Unsigned) - addUInt(Die, dwarf::DW_AT_const_value, dwarf::DW_FORM_udata, - CI->getZExtValue()); - else - addSInt(Die, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata, - CI->getSExtValue()); - return true; - } - - DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); - - // Get the raw data form of the large APInt. - const APInt Val = CI->getValue(); - const char *Ptr = (const char*)Val.getRawData(); - - int NumBytes = Val.getBitWidth() / 8; // 8 bits per byte. - bool LittleEndian = Asm->getTargetData().isLittleEndian(); - int Incr = (LittleEndian ? 1 : -1); - int Start = (LittleEndian ? 0 : NumBytes - 1); - int Stop = (LittleEndian ? NumBytes : -1); - - // Output the constant to DWARF one byte at a time. - for (; Start != Stop; Start += Incr) - addUInt(Block, 0, dwarf::DW_FORM_data1, - (unsigned char)0xFF & Ptr[Start]); - - addBlock(Die, dwarf::DW_AT_const_value, 0, Block); - return true; -} - -/// addTemplateParams - Add template parameters in buffer. -void DwarfDebug::addTemplateParams(DIE &Buffer, DIArray TParams) { - // Add template parameters. - for (unsigned i = 0, e = TParams.getNumElements(); i != e; ++i) { - DIDescriptor Element = TParams.getElement(i); - if (Element.isTemplateTypeParameter()) - Buffer.addChild(getOrCreateTemplateTypeParameterDIE( - DITemplateTypeParameter(Element))); - else if (Element.isTemplateValueParameter()) - Buffer.addChild(getOrCreateTemplateValueParameterDIE( - DITemplateValueParameter(Element))); - } - -} -/// addToContextOwner - Add Die into the list of its context owner's children. 
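addConstantFPValue and the wide-integer branch of addConstantValue above both emit a constant one DW_FORM_data1 byte at a time, walking the raw data forward on little-endian targets and backward on big-endian ones. A standalone sketch of that Start/Stop/Incr walk (emitConstantBytes is an illustrative name):

    #include <cstdio>

    // Emit the raw bytes of a constant in target byte order, mirroring
    // the loop in addConstantValue/addConstantFPValue: forward for
    // little-endian, reversed for big-endian.
    void emitConstantBytes(const unsigned char *Data, int NumBytes,
                           bool LittleEndian) {
      int Incr  = LittleEndian ? 1 : -1;
      int Start = LittleEndian ? 0 : NumBytes - 1;
      int Stop  = LittleEndian ? NumBytes : -1;
      for (; Start != Stop; Start += Incr)
        std::printf(".byte 0x%02x\n", Data[Start]);  // one data1 each
    }

The half-open walk terminates at Stop, which is one step past the last valid index in either direction.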
-void DwarfDebug::addToContextOwner(DIE *Die, DIDescriptor Context) { - if (Context.isType()) { - DIE *ContextDIE = getOrCreateTypeDIE(DIType(Context)); - ContextDIE->addChild(Die); - } else if (Context.isNameSpace()) { - DIE *ContextDIE = getOrCreateNameSpace(DINameSpace(Context)); - ContextDIE->addChild(Die); - } else if (Context.isSubprogram()) { - DIE *ContextDIE = createSubprogramDIE(DISubprogram(Context)); - ContextDIE->addChild(Die); - } else if (DIE *ContextDIE = getCompileUnit(Context)->getDIE(Context)) - ContextDIE->addChild(Die); - else - getCompileUnit(Context)->addDie(Die); -} - -/// getOrCreateTypeDIE - Find existing DIE or create new DIE for the -/// given DIType. -DIE *DwarfDebug::getOrCreateTypeDIE(DIType Ty) { - CompileUnit *TypeCU = getCompileUnit(Ty); - DIE *TyDIE = TypeCU->getDIE(Ty); - if (TyDIE) - return TyDIE; - - // Create new type. - TyDIE = new DIE(dwarf::DW_TAG_base_type); - TypeCU->insertDIE(Ty, TyDIE); - if (Ty.isBasicType()) - constructTypeDIE(*TyDIE, DIBasicType(Ty)); - else if (Ty.isCompositeType()) - constructTypeDIE(*TyDIE, DICompositeType(Ty)); - else { - assert(Ty.isDerivedType() && "Unknown kind of DIType"); - constructTypeDIE(*TyDIE, DIDerivedType(Ty)); - } - - addToContextOwner(TyDIE, Ty.getContext()); - return TyDIE; -} - -/// addType - Add a new type attribute to the specified entity. -void DwarfDebug::addType(DIE *Entity, DIType Ty) { - if (!Ty.Verify()) - return; - - // Check for pre-existence. - CompileUnit *TypeCU = getCompileUnit(Ty); - DIEEntry *Entry = TypeCU->getDIEEntry(Ty); - // If it exists then use the existing value. - if (Entry) { - Entity->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, Entry); - return; - } - - // Construct type. - DIE *Buffer = getOrCreateTypeDIE(Ty); - - // Set up proxy. - Entry = createDIEEntry(Buffer); - TypeCU->insertDIEEntry(Ty, Entry); - - Entity->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, Entry); -} - -/// constructTypeDIE - Construct basic type die from DIBasicType. -void DwarfDebug::constructTypeDIE(DIE &Buffer, DIBasicType BTy) { - // Get core information. - StringRef Name = BTy.getName(); - Buffer.setTag(dwarf::DW_TAG_base_type); - addUInt(&Buffer, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1, - BTy.getEncoding()); - - // Add name if not anonymous or intermediate type. - if (!Name.empty()) - addString(&Buffer, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name); - uint64_t Size = BTy.getSizeInBits() >> 3; - addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size); -} - -/// constructTypeDIE - Construct derived type die from DIDerivedType. -void DwarfDebug::constructTypeDIE(DIE &Buffer, DIDerivedType DTy) { - // Get core information. - StringRef Name = DTy.getName(); - uint64_t Size = DTy.getSizeInBits() >> 3; - unsigned Tag = DTy.getTag(); - - // FIXME - Workaround for templates. - if (Tag == dwarf::DW_TAG_inheritance) Tag = dwarf::DW_TAG_reference_type; - - Buffer.setTag(Tag); - - // Map to main type, void will not have a type. - DIType FromTy = DTy.getTypeDerivedFrom(); - addType(&Buffer, FromTy); - - // Add name if not anonymous or intermediate type. - if (!Name.empty()) - addString(&Buffer, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name); - - // Add size if non-zero (derived types might be zero-sized.) - if (Size) - addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size); - - // Add source line info if available and TyDesc is not a forward declaration. - if (!DTy.isForwardDecl()) - addSourceLine(&Buffer, DTy); -} - -/// constructTypeDIE - Construct type DIE from DICompositeType. 
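getOrCreateTypeDIE and addType above implement a classic get-or-create cache: look the MDNode up in the per-unit map first, and only build (and memoize) a DIE on a miss, so each metadata node maps to exactly one DIE no matter how many entities reference it. A toy standalone version of the pattern (Die and getOrCreate are illustrative, not LLVM API):

    #include <map>
    #include <string>

    struct Die { std::string Tag; };

    // Look the key up first; build and memoize only on a miss, so every
    // key maps to exactly one Die for the lifetime of the cache.
    Die *getOrCreate(std::map<const void *, Die *> &Cache,
                     const void *Key, const char *Tag) {
      std::map<const void *, Die *>::iterator I = Cache.find(Key);
      if (I != Cache.end())
        return I->second;
      Die *D = new Die();   // leaks in this toy; LLVM pool-allocates
      D->Tag = Tag;
      Cache[Key] = D;
      return D;
    }

addType adds one extra level: it caches a DIEEntry proxy per type as well, so repeated DW_AT_type attributes can share a single ref4 value object.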
-void DwarfDebug::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { - // Get core information. - StringRef Name = CTy.getName(); - - uint64_t Size = CTy.getSizeInBits() >> 3; - unsigned Tag = CTy.getTag(); - Buffer.setTag(Tag); - - switch (Tag) { - case dwarf::DW_TAG_vector_type: - case dwarf::DW_TAG_array_type: - constructArrayTypeDIE(Buffer, &CTy); - break; - case dwarf::DW_TAG_enumeration_type: { - DIArray Elements = CTy.getTypeArray(); - - // Add enumerators to enumeration type. - for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) { - DIE *ElemDie = NULL; - DIDescriptor Enum(Elements.getElement(i)); - if (Enum.isEnumerator()) { - ElemDie = constructEnumTypeDIE(DIEnumerator(Enum)); - Buffer.addChild(ElemDie); - } - } - } - break; - case dwarf::DW_TAG_subroutine_type: { - // Add return type. - DIArray Elements = CTy.getTypeArray(); - DIDescriptor RTy = Elements.getElement(0); - addType(&Buffer, DIType(RTy)); - - bool isPrototyped = true; - // Add arguments. - for (unsigned i = 1, N = Elements.getNumElements(); i < N; ++i) { - DIDescriptor Ty = Elements.getElement(i); - if (Ty.isUnspecifiedParameter()) { - DIE *Arg = new DIE(dwarf::DW_TAG_unspecified_parameters); - Buffer.addChild(Arg); - isPrototyped = false; - } else { - DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter); - addType(Arg, DIType(Ty)); - Buffer.addChild(Arg); - } - } - // Add prototype flag. - if (isPrototyped) - addUInt(&Buffer, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1); - } - break; - case dwarf::DW_TAG_structure_type: - case dwarf::DW_TAG_union_type: - case dwarf::DW_TAG_class_type: { - // Add elements to structure type. - DIArray Elements = CTy.getTypeArray(); - - // A forward struct declared type may not have elements available. - unsigned N = Elements.getNumElements(); - if (N == 0) - break; - - // Add elements to structure type. 
- for (unsigned i = 0; i < N; ++i) { - DIDescriptor Element = Elements.getElement(i); - DIE *ElemDie = NULL; - if (Element.isSubprogram()) { - DISubprogram SP(Element); - ElemDie = createSubprogramDIE(DISubprogram(Element)); - if (SP.isProtected()) - addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag, - dwarf::DW_ACCESS_protected); - else if (SP.isPrivate()) - addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag, - dwarf::DW_ACCESS_private); - else - addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag, - dwarf::DW_ACCESS_public); - if (SP.isExplicit()) - addUInt(ElemDie, dwarf::DW_AT_explicit, dwarf::DW_FORM_flag, 1); - } - else if (Element.isVariable()) { - DIVariable DV(Element); - ElemDie = new DIE(dwarf::DW_TAG_variable); - addString(ElemDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, - DV.getName()); - addType(ElemDie, DV.getType()); - addUInt(ElemDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1); - addUInt(ElemDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1); - addSourceLine(ElemDie, DV); - } else if (Element.isDerivedType()) - ElemDie = createMemberDIE(DIDerivedType(Element)); - else - continue; - Buffer.addChild(ElemDie); - } - - if (CTy.isAppleBlockExtension()) - addUInt(&Buffer, dwarf::DW_AT_APPLE_block, dwarf::DW_FORM_flag, 1); - - unsigned RLang = CTy.getRunTimeLang(); - if (RLang) - addUInt(&Buffer, dwarf::DW_AT_APPLE_runtime_class, - dwarf::DW_FORM_data1, RLang); - - DICompositeType ContainingType = CTy.getContainingType(); - if (DIDescriptor(ContainingType).isCompositeType()) - addDIEEntry(&Buffer, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4, - getOrCreateTypeDIE(DIType(ContainingType))); - else { - DIDescriptor Context = CTy.getContext(); - addToContextOwner(&Buffer, Context); - } - - if (Tag == dwarf::DW_TAG_class_type) - addTemplateParams(Buffer, CTy.getTemplateParams()); - - break; - } - default: - break; - } - - // Add name if not anonymous or intermediate type. - if (!Name.empty()) - addString(&Buffer, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name); - - if (Tag == dwarf::DW_TAG_enumeration_type || Tag == dwarf::DW_TAG_class_type - || Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type) - { - // Add size if non-zero (derived types might be zero-sized.) - if (Size) - addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size); - else { - // Add zero size if it is not a forward declaration. - if (CTy.isForwardDecl()) - addUInt(&Buffer, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1); - else - addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, 0); - } - - // Add source line info if available. - if (!CTy.isForwardDecl()) - addSourceLine(&Buffer, CTy); - } -} - -/// getOrCreateTemplateTypeParameterDIE - Find existing DIE or create new DIE -/// for the given DITemplateTypeParameter. -DIE * -DwarfDebug::getOrCreateTemplateTypeParameterDIE(DITemplateTypeParameter TP) { - CompileUnit *TypeCU = getCompileUnit(TP); - DIE *ParamDIE = TypeCU->getDIE(TP); - if (ParamDIE) - return ParamDIE; - - ParamDIE = new DIE(dwarf::DW_TAG_template_type_parameter); - addType(ParamDIE, TP.getType()); - addString(ParamDIE, dwarf::DW_AT_name, dwarf::DW_FORM_string, TP.getName()); - return ParamDIE; -} - -/// getOrCreateTemplateValueParameterDIE - Find existing DIE or create new DIE -/// for the given DITemplateValueParameter. 
-DIE * -DwarfDebug::getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TPV) { - CompileUnit *TVCU = getCompileUnit(TPV); - DIE *ParamDIE = TVCU->getDIE(TPV); - if (ParamDIE) - return ParamDIE; - - ParamDIE = new DIE(dwarf::DW_TAG_template_value_parameter); - addType(ParamDIE, TPV.getType()); - if (!TPV.getName().empty()) - addString(ParamDIE, dwarf::DW_AT_name, dwarf::DW_FORM_string, TPV.getName()); - addUInt(ParamDIE, dwarf::DW_AT_const_value, dwarf::DW_FORM_udata, - TPV.getValue()); - return ParamDIE; -} - -/// constructSubrangeDIE - Construct subrange DIE from DISubrange. -void DwarfDebug::constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy){ - int64_t L = SR.getLo(); - int64_t H = SR.getHi(); - DIE *DW_Subrange = new DIE(dwarf::DW_TAG_subrange_type); - - addDIEEntry(DW_Subrange, dwarf::DW_AT_type, dwarf::DW_FORM_ref4, IndexTy); - if (L) - addSInt(DW_Subrange, dwarf::DW_AT_lower_bound, 0, L); - addSInt(DW_Subrange, dwarf::DW_AT_upper_bound, 0, H); - - Buffer.addChild(DW_Subrange); -} - -/// constructArrayTypeDIE - Construct array type DIE from DICompositeType. -void DwarfDebug::constructArrayTypeDIE(DIE &Buffer, - DICompositeType *CTy) { - Buffer.setTag(dwarf::DW_TAG_array_type); - if (CTy->getTag() == dwarf::DW_TAG_vector_type) - addUInt(&Buffer, dwarf::DW_AT_GNU_vector, dwarf::DW_FORM_flag, 1); - - // Emit derived type. - addType(&Buffer, CTy->getTypeDerivedFrom()); - DIArray Elements = CTy->getTypeArray(); - - // Get an anonymous type for index type. - CompileUnit *TheCU = getCompileUnit(*CTy); - DIE *IdxTy = TheCU->getIndexTyDie(); - if (!IdxTy) { - // Construct an anonymous type for index type. - IdxTy = new DIE(dwarf::DW_TAG_base_type); - addUInt(IdxTy, dwarf::DW_AT_byte_size, 0, sizeof(int32_t)); - addUInt(IdxTy, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1, - dwarf::DW_ATE_signed); - TheCU->addDie(IdxTy); - TheCU->setIndexTyDie(IdxTy); - } - - // Add subranges to array type. - for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) { - DIDescriptor Element = Elements.getElement(i); - if (Element.getTag() == dwarf::DW_TAG_subrange_type) - constructSubrangeDIE(Buffer, DISubrange(Element), IdxTy); - } -} - -/// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator. -DIE *DwarfDebug::constructEnumTypeDIE(DIEnumerator ETy) { - DIE *Enumerator = new DIE(dwarf::DW_TAG_enumerator); - StringRef Name = ETy.getName(); - addString(Enumerator, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name); - int64_t Value = ETy.getEnumValue(); - addSInt(Enumerator, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata, Value); - return Enumerator; -} - /// getRealLinkageName - If special LLVM prefix that is used to inform the asm /// printer to not emit usual symbol prefix before the symbol name is used then /// return linkage name after skipping this special LLVM prefix. @@ -1301,84 +310,6 @@ static StringRef getRealLinkageName(StringRef LinkageName) { return LinkageName; } -/// createMemberDIE - Create new member DIE. 
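constructArrayTypeDIE above turns each array dimension into a DW_TAG_subrange_type child, omitting the lower bound when it is zero and sharing one anonymous 4-byte signed index type across all dimensions. A toy rendering of the resulting DIE shape under those rules (the Subrange struct and output format are illustrative only):

    #include <cstdio>

    struct Subrange { long Lo, Hi; };

    int main() {
      // e.g. int a[3][5] decomposes into subranges [0,2] and [0,4].
      const Subrange Dims[] = { {0, 2}, {0, 4} };
      std::printf("DW_TAG_array_type\n");
      for (unsigned i = 0; i != sizeof(Dims) / sizeof(Dims[0]); ++i) {
        std::printf("  DW_TAG_subrange_type");
        if (Dims[i].Lo)                  // lower bound omitted when zero
          std::printf(" lower_bound=%ld", Dims[i].Lo);
        std::printf(" upper_bound=%ld\n", Dims[i].Hi);
      }
      return 0;
    }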
-DIE *DwarfDebug::createMemberDIE(DIDerivedType DT) { - DIE *MemberDie = new DIE(DT.getTag()); - StringRef Name = DT.getName(); - if (!Name.empty()) - addString(MemberDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name); - - addType(MemberDie, DT.getTypeDerivedFrom()); - - addSourceLine(MemberDie, DT); - - DIEBlock *MemLocationDie = new (DIEValueAllocator) DIEBlock(); - addUInt(MemLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst); - - uint64_t Size = DT.getSizeInBits(); - uint64_t FieldSize = DT.getOriginalTypeSize(); - - if (Size != FieldSize) { - // Handle bitfield. - addUInt(MemberDie, dwarf::DW_AT_byte_size, 0, DT.getOriginalTypeSize()>>3); - addUInt(MemberDie, dwarf::DW_AT_bit_size, 0, DT.getSizeInBits()); - - uint64_t Offset = DT.getOffsetInBits(); - uint64_t AlignMask = ~(DT.getAlignInBits() - 1); - uint64_t HiMark = (Offset + FieldSize) & AlignMask; - uint64_t FieldOffset = (HiMark - FieldSize); - Offset -= FieldOffset; - - // Maybe we need to work from the other end. - if (Asm->getTargetData().isLittleEndian()) - Offset = FieldSize - (Offset + Size); - addUInt(MemberDie, dwarf::DW_AT_bit_offset, 0, Offset); - - // Here WD_AT_data_member_location points to the anonymous - // field that includes this bit field. - addUInt(MemLocationDie, 0, dwarf::DW_FORM_udata, FieldOffset >> 3); - - } else - // This is not a bitfield. - addUInt(MemLocationDie, 0, dwarf::DW_FORM_udata, DT.getOffsetInBits() >> 3); - - if (DT.getTag() == dwarf::DW_TAG_inheritance - && DT.isVirtual()) { - - // For C++, virtual base classes are not at fixed offset. Use following - // expression to extract appropriate offset from vtable. - // BaseAddr = ObAddr + *((*ObAddr) - Offset) - - DIEBlock *VBaseLocationDie = new (DIEValueAllocator) DIEBlock(); - addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_dup); - addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref); - addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu); - addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_udata, DT.getOffsetInBits()); - addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_minus); - addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref); - addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus); - - addBlock(MemberDie, dwarf::DW_AT_data_member_location, 0, - VBaseLocationDie); - } else - addBlock(MemberDie, dwarf::DW_AT_data_member_location, 0, MemLocationDie); - - if (DT.isProtected()) - addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag, - dwarf::DW_ACCESS_protected); - else if (DT.isPrivate()) - addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag, - dwarf::DW_ACCESS_private); - // Otherwise C++ member and base classes are considered public. - else if (DT.getCompileUnit().getLanguage() == dwarf::DW_LANG_C_plus_plus) - addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag, - dwarf::DW_ACCESS_public); - if (DT.isVirtual()) - addUInt(MemberDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_flag, - dwarf::DW_VIRTUALITY_virtual); - return MemberDie; -} - /// createSubprogramDIE - Create new DIE using SP. DIE *DwarfDebug::createSubprogramDIE(DISubprogram SP) { CompileUnit *SPCU = getCompileUnit(SP); @@ -1387,19 +318,35 @@ DIE *DwarfDebug::createSubprogramDIE(DISubprogram SP) { return SPDie; SPDie = new DIE(dwarf::DW_TAG_subprogram); - // Constructors and operators for anonymous aggregates do not have names. 
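The bitfield branch of createMemberDIE above is the densest arithmetic in this hunk: it finds the aligned storage unit containing the field, points DW_AT_data_member_location at that unit, and expresses the field's position as a DWARF2-style DW_AT_bit_offset, flipped on little-endian targets. A standalone sketch of the same math with a worked example (bitOffset is an illustrative name):

    #include <cstdint>
    #include <cstdio>

    // Find the aligned storage unit holding the field, then express the
    // field's position as a bit offset from that unit (DW_AT_bit_offset
    // counts from the unit's most significant end in DWARF2).
    uint64_t bitOffset(uint64_t Offset, uint64_t Size, uint64_t FieldSize,
                       uint64_t AlignInBits, bool LittleEndian) {
      uint64_t AlignMask   = ~(AlignInBits - 1);
      uint64_t HiMark      = (Offset + FieldSize) & AlignMask;
      uint64_t FieldOffset = HiMark - FieldSize;  // start of storage unit
      Offset -= FieldOffset;                      // bit offset within it
      if (LittleEndian)
        Offset = FieldSize - (Offset + Size);     // count from other end
      return Offset;
    }

    int main() {
      // "int a : 3; int b : 5;" with 32-bit aligned units: b starts at
      // bit 3, so little-endian DW_AT_bit_offset is 32 - (3 + 5) = 24.
      std::printf("%llu\n",
                  (unsigned long long)bitOffset(3, 5, 32, 32, true));
      return 0;
    }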
- if (!SP.getName().empty()) - addString(SPDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, SP.getName()); + + // DW_TAG_inlined_subroutine may refer to this DIE. + SPCU->insertDIE(SP, SPDie); + + // Add to context owner. + SPCU->addToContextOwner(SPDie, SP.getContext()); + + // Add function template parameters. + SPCU->addTemplateParams(*SPDie, SP.getTemplateParams()); StringRef LinkageName = SP.getLinkageName(); if (!LinkageName.empty()) - addString(SPDie, dwarf::DW_AT_MIPS_linkage_name, dwarf::DW_FORM_string, - getRealLinkageName(LinkageName)); + SPCU->addString(SPDie, dwarf::DW_AT_MIPS_linkage_name, dwarf::DW_FORM_string, + getRealLinkageName(LinkageName)); - addSourceLine(SPDie, SP); + // If this DIE is going to refer declaration info using AT_specification + // then there is no need to add other attributes. + if (SP.getFunctionDeclaration().isSubprogram()) + return SPDie; + + // Constructors and operators for anonymous aggregates do not have names. + if (!SP.getName().empty()) + SPCU->addString(SPDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, + SP.getName()); + + SPCU->addSourceLine(SPDie, SP); if (SP.isPrototyped()) - addUInt(SPDie, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1); + SPCU->addUInt(SPDie, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1); // Add Return Type. DICompositeType SPTy = SP.getType(); @@ -1407,24 +354,24 @@ DIE *DwarfDebug::createSubprogramDIE(DISubprogram SP) { unsigned SPTag = SPTy.getTag(); if (Args.getNumElements() == 0 || SPTag != dwarf::DW_TAG_subroutine_type) - addType(SPDie, SPTy); + SPCU->addType(SPDie, SPTy); else - addType(SPDie, DIType(Args.getElement(0))); + SPCU->addType(SPDie, DIType(Args.getElement(0))); unsigned VK = SP.getVirtuality(); if (VK) { - addUInt(SPDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_flag, VK); - DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu); - addUInt(Block, 0, dwarf::DW_FORM_udata, SP.getVirtualIndex()); - addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, 0, Block); + SPCU->addUInt(SPDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_flag, VK); + DIEBlock *Block = SPCU->getDIEBlock(); + SPCU->addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu); + SPCU->addUInt(Block, 0, dwarf::DW_FORM_udata, SP.getVirtualIndex()); + SPCU->addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, 0, Block); ContainingTypeMap.insert(std::make_pair(SPDie, SP.getContainingType())); } if (!SP.isDefinition()) { - addUInt(SPDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1); - + SPCU->addUInt(SPDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1); + // Add arguments. Do not add arguments for subprogram definition. They will // be handled while processing variables. 
DICompositeType SPTy = SP.getType(); @@ -1435,35 +382,26 @@ DIE *DwarfDebug::createSubprogramDIE(DISubprogram SP) { for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) { DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter); DIType ATy = DIType(DIType(Args.getElement(i))); - addType(Arg, ATy); + SPCU->addType(Arg, ATy); if (ATy.isArtificial()) - addUInt(Arg, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1); + SPCU->addUInt(Arg, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1); SPDie->addChild(Arg); } } if (SP.isArtificial()) - addUInt(SPDie, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1); + SPCU->addUInt(SPDie, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1); if (!SP.isLocalToUnit()) - addUInt(SPDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1); + SPCU->addUInt(SPDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1); if (SP.isOptimized()) - addUInt(SPDie, dwarf::DW_AT_APPLE_optimized, dwarf::DW_FORM_flag, 1); + SPCU->addUInt(SPDie, dwarf::DW_AT_APPLE_optimized, dwarf::DW_FORM_flag, 1); if (unsigned isa = Asm->getISAEncoding()) { - addUInt(SPDie, dwarf::DW_AT_APPLE_isa, dwarf::DW_FORM_flag, isa); + SPCU->addUInt(SPDie, dwarf::DW_AT_APPLE_isa, dwarf::DW_FORM_flag, isa); } - // Add function template parameters. - addTemplateParams(*SPDie, SP.getTemplateParams()); - - // DW_TAG_inlined_subroutine may refer to this DIE. - SPCU->insertDIE(SP, SPDie); - - // Add to context owner. - addToContextOwner(SPDie, SP.getContext()); - return SPDie; } @@ -1518,51 +456,57 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(const MDNode *SPNode) { assert(SPDie && "Unable to find subprogram DIE!"); DISubprogram SP(SPNode); - // There is not any need to generate specification DIE for a function - // defined at compile unit level. If a function is defined inside another - // function then gdb prefers the definition at top level and but does not - // expect specification DIE in parent function. So avoid creating - // specification DIE for a function defined inside a function. - if (SP.isDefinition() && !SP.getContext().isCompileUnit() && - !SP.getContext().isFile() && - !isSubprogramContext(SP.getContext())) { - addUInt(SPDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1); - - // Add arguments. - DICompositeType SPTy = SP.getType(); - DIArray Args = SPTy.getTypeArray(); - unsigned SPTag = SPTy.getTag(); - if (SPTag == dwarf::DW_TAG_subroutine_type) - for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) { - DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter); - DIType ATy = DIType(DIType(Args.getElement(i))); - addType(Arg, ATy); - if (ATy.isArtificial()) - addUInt(Arg, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1); - SPDie->addChild(Arg); - } - DIE *SPDeclDie = SPDie; - SPDie = new DIE(dwarf::DW_TAG_subprogram); - addDIEEntry(SPDie, dwarf::DW_AT_specification, dwarf::DW_FORM_ref4, - SPDeclDie); - SPCU->addDie(SPDie); + DISubprogram SPDecl = SP.getFunctionDeclaration(); + if (SPDecl.isSubprogram()) + // Refer function declaration directly. + SPCU->addDIEEntry(SPDie, dwarf::DW_AT_specification, dwarf::DW_FORM_ref4, + createSubprogramDIE(SPDecl)); + else { + // There is not any need to generate specification DIE for a function + // defined at compile unit level. If a function is defined inside another + // function then gdb prefers the definition at top level and but does not + // expect specification DIE in parent function. So avoid creating + // specification DIE for a function defined inside a function. 
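The new getFunctionDeclaration path above changes the shape of the emitted DIEs: when a subprogram carries a declaration, the definition DIE no longer repeats name, line, and type attributes, it just points at the declaration through DW_AT_specification. A toy illustration of that linkage, not the LLVM classes:

    #include <cstdio>

    struct Die {
      const char *Tag;
      const Die *Specification;  // DW_AT_specification; null when absent
    };

    int main() {
      // The declaration DIE owns name/type/line; the definition adds
      // only code addresses and a back-reference to the declaration.
      Die Decl = { "DW_TAG_subprogram (declaration)", 0 };
      Die Defn = { "DW_TAG_subprogram (definition)", &Decl };
      std::printf("%s -> %s\n", Defn.Tag, Defn.Specification->Tag);
      return 0;
    }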
+ if (SP.isDefinition() && !SP.getContext().isCompileUnit() && + !SP.getContext().isFile() && + !isSubprogramContext(SP.getContext())) { + SPCU-> addUInt(SPDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1); + + // Add arguments. + DICompositeType SPTy = SP.getType(); + DIArray Args = SPTy.getTypeArray(); + unsigned SPTag = SPTy.getTag(); + if (SPTag == dwarf::DW_TAG_subroutine_type) + for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) { + DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter); + DIType ATy = DIType(DIType(Args.getElement(i))); + SPCU->addType(Arg, ATy); + if (ATy.isArtificial()) + SPCU->addUInt(Arg, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1); + SPDie->addChild(Arg); + } + DIE *SPDeclDie = SPDie; + SPDie = new DIE(dwarf::DW_TAG_subprogram); + SPCU->addDIEEntry(SPDie, dwarf::DW_AT_specification, dwarf::DW_FORM_ref4, + SPDeclDie); + SPCU->addDie(SPDie); + } } - // Pick up abstract subprogram DIE. if (DIE *AbsSPDIE = AbstractSPDies.lookup(SPNode)) { SPDie = new DIE(dwarf::DW_TAG_subprogram); - addDIEEntry(SPDie, dwarf::DW_AT_abstract_origin, - dwarf::DW_FORM_ref4, AbsSPDIE); + SPCU->addDIEEntry(SPDie, dwarf::DW_AT_abstract_origin, + dwarf::DW_FORM_ref4, AbsSPDIE); SPCU->addDie(SPDie); } - addLabel(SPDie, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, - Asm->GetTempSymbol("func_begin", Asm->getFunctionNumber())); - addLabel(SPDie, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr, - Asm->GetTempSymbol("func_end", Asm->getFunctionNumber())); + SPCU->addLabel(SPDie, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, + Asm->GetTempSymbol("func_begin", Asm->getFunctionNumber())); + SPCU->addLabel(SPDie, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr, + Asm->GetTempSymbol("func_end", Asm->getFunctionNumber())); const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo(); MachineLocation Location(RI->getFrameRegister(*Asm->MF)); - addAddress(SPDie, dwarf::DW_AT_frame_base, Location); + SPCU->addAddress(SPDie, dwarf::DW_AT_frame_base, Location); return SPDie; } @@ -1579,13 +523,14 @@ DIE *DwarfDebug::constructLexicalScopeDIE(DbgScope *Scope) { if (Ranges.empty()) return 0; + CompileUnit *TheCU = getCompileUnit(Scope->getScopeNode()); SmallVector<DbgRange, 4>::const_iterator RI = Ranges.begin(); if (Ranges.size() > 1) { // .debug_range section has not been laid out yet. Emit offset in // .debug_range as a uint, size 4, for now. emitDIE will handle // DW_AT_ranges appropriately. 
- addUInt(ScopeDIE, dwarf::DW_AT_ranges, dwarf::DW_FORM_data4, - DebugRangeSymbols.size() * Asm->getTargetData().getPointerSize()); + TheCU->addUInt(ScopeDIE, dwarf::DW_AT_ranges, dwarf::DW_FORM_data4, + DebugRangeSymbols.size() * Asm->getTargetData().getPointerSize()); for (SmallVector<DbgRange, 4>::const_iterator RI = Ranges.begin(), RE = Ranges.end(); RI != RE; ++RI) { DebugRangeSymbols.push_back(getLabelBeforeInsn(RI->first)); @@ -1604,8 +549,8 @@ DIE *DwarfDebug::constructLexicalScopeDIE(DbgScope *Scope) { assert(Start->isDefined() && "Invalid starting label for an inlined scope!"); assert(End->isDefined() && "Invalid end label for an inlined scope!"); - addLabel(ScopeDIE, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, Start); - addLabel(ScopeDIE, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr, End); + TheCU->addLabel(ScopeDIE, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, Start); + TheCU->addLabel(ScopeDIE, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr, End); return ScopeDIE; } @@ -1639,17 +584,19 @@ DIE *DwarfDebug::constructInlinedScopeDIE(DbgScope *Scope) { if (!Scope->getScopeNode()) return NULL; DIScope DS(Scope->getScopeNode()); - DIE *ScopeDIE = new DIE(dwarf::DW_TAG_inlined_subroutine); - DISubprogram InlinedSP = getDISubprogram(DS); CompileUnit *TheCU = getCompileUnit(InlinedSP); DIE *OriginDIE = TheCU->getDIE(InlinedSP); - assert(OriginDIE && "Unable to find Origin DIE!"); - addDIEEntry(ScopeDIE, dwarf::DW_AT_abstract_origin, - dwarf::DW_FORM_ref4, OriginDIE); + if (!OriginDIE) { + DEBUG(dbgs() << "Unable to find original DIE for inlined subprogram."); + return NULL; + } + DIE *ScopeDIE = new DIE(dwarf::DW_TAG_inlined_subroutine); + TheCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_abstract_origin, + dwarf::DW_FORM_ref4, OriginDIE); - addLabel(ScopeDIE, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, StartLabel); - addLabel(ScopeDIE, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr, EndLabel); + TheCU->addLabel(ScopeDIE, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, StartLabel); + TheCU->addLabel(ScopeDIE, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr, EndLabel); InlinedSubprogramDIEs.insert(OriginDIE); @@ -1665,8 +612,8 @@ DIE *DwarfDebug::constructInlinedScopeDIE(DbgScope *Scope) { I->second.push_back(std::make_pair(StartLabel, ScopeDIE)); DILocation DL(Scope->getInlinedAt()); - addUInt(ScopeDIE, dwarf::DW_AT_call_file, 0, TheCU->getID()); - addUInt(ScopeDIE, dwarf::DW_AT_call_line, 0, DL.getLineNumber()); + TheCU->addUInt(ScopeDIE, dwarf::DW_AT_call_file, 0, TheCU->getID()); + TheCU->addUInt(ScopeDIE, dwarf::DW_AT_call_line, 0, DL.getLineNumber()); return ScopeDIE; } @@ -1695,7 +642,7 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) { // Define variable debug information entry. 
DIE *VariableDie = new DIE(Tag); - + CompileUnit *VariableCU = getCompileUnit(DV->getVariable()); DIE *AbsDIE = NULL; DenseMap<const DbgVariable *, const DbgVariable *>::iterator V2AVI = VarToAbstractVarMap.find(DV); @@ -1703,20 +650,23 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) { AbsDIE = V2AVI->second->getDIE(); if (AbsDIE) - addDIEEntry(VariableDie, dwarf::DW_AT_abstract_origin, - dwarf::DW_FORM_ref4, AbsDIE); + VariableCU->addDIEEntry(VariableDie, dwarf::DW_AT_abstract_origin, + dwarf::DW_FORM_ref4, AbsDIE); else { - addString(VariableDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name); - addSourceLine(VariableDie, DV->getVariable()); + VariableCU->addString(VariableDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, + Name); + VariableCU->addSourceLine(VariableDie, DV->getVariable()); // Add variable type. - addType(VariableDie, DV->getType()); + VariableCU->addType(VariableDie, DV->getType()); } if (Tag == dwarf::DW_TAG_formal_parameter && DV->getType().isArtificial()) - addUInt(VariableDie, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1); + VariableCU->addUInt(VariableDie, dwarf::DW_AT_artificial, + dwarf::DW_FORM_flag, 1); else if (DIVariable(DV->getVariable()).isArtificial()) - addUInt(VariableDie, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1); + VariableCU->addUInt(VariableDie, dwarf::DW_AT_artificial, + dwarf::DW_FORM_flag, 1); if (Scope->isAbstractScope()) { DV->setDIE(VariableDie); @@ -1727,7 +677,7 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) { unsigned Offset = DV->getDotDebugLocOffset(); if (Offset != ~0U) { - addLabel(VariableDie, dwarf::DW_AT_location, dwarf::DW_FORM_data4, + VariableCU->addLabel(VariableDie, dwarf::DW_AT_location, dwarf::DW_FORM_data4, Asm->GetTempSymbol("debug_loc", Offset)); DV->setDIE(VariableDie); UseDotDebugLocEntry.insert(VariableDie); @@ -1747,22 +697,31 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) { const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo(); if (DVInsn->getOperand(1).isImm() && TRI->getFrameRegister(*Asm->MF) == RegOp.getReg()) { - addVariableAddress(DV, VariableDie, DVInsn->getOperand(1).getImm()); - updated = true; - } else - updated = addRegisterAddress(VariableDie, RegOp); + unsigned FrameReg = 0; + const TargetFrameLowering *TFI = Asm->TM.getFrameLowering(); + int Offset = + TFI->getFrameIndexReference(*Asm->MF, + DVInsn->getOperand(1).getImm(), + FrameReg); + MachineLocation Location(FrameReg, Offset); + VariableCU->addVariableAddress(DV, VariableDie, Location); + + } else if (RegOp.getReg()) + VariableCU->addVariableAddress(DV, VariableDie, + MachineLocation(RegOp.getReg())); + updated = true; } else if (DVInsn->getOperand(0).isImm()) - updated = addConstantValue(VariableDie, DVInsn->getOperand(0)); + updated = + VariableCU->addConstantValue(VariableDie, DVInsn->getOperand(0), + DV->getType()); else if (DVInsn->getOperand(0).isFPImm()) updated = - addConstantFPValue(VariableDie, DVInsn->getOperand(0)); + VariableCU->addConstantFPValue(VariableDie, DVInsn->getOperand(0)); } else { - MachineLocation Location = Asm->getDebugValueLocation(DVInsn); - if (Location.getReg()) { - addAddress(VariableDie, dwarf::DW_AT_location, Location); - updated = true; - } + VariableCU->addVariableAddress(DV, VariableDie, + Asm->getDebugValueLocation(DVInsn)); + updated = true; } if (!updated) { // If variableDie is not updated then DBG_VALUE instruction does not @@ -1776,35 +735,20 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) 
{ // .. else use frame index, if available. int FI = 0; - if (findVariableFrameIndex(DV, &FI)) - addVariableAddress(DV, VariableDie, FI); - + if (findVariableFrameIndex(DV, &FI)) { + unsigned FrameReg = 0; + const TargetFrameLowering *TFI = Asm->TM.getFrameLowering(); + int Offset = + TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg); + MachineLocation Location(FrameReg, Offset); + VariableCU->addVariableAddress(DV, VariableDie, Location); + } + DV->setDIE(VariableDie); return VariableDie; } -void DwarfDebug::addPubTypes(DISubprogram SP) { - DICompositeType SPTy = SP.getType(); - unsigned SPTag = SPTy.getTag(); - if (SPTag != dwarf::DW_TAG_subroutine_type) - return; - - DIArray Args = SPTy.getTypeArray(); - for (unsigned i = 0, e = Args.getNumElements(); i != e; ++i) { - DIType ATy(Args.getElement(i)); - if (!ATy.Verify()) - continue; - DICompositeType CATy = getDICompositeType(ATy); - if (DIDescriptor(CATy).Verify() && !CATy.getName().empty() - && !CATy.isForwardDecl()) { - CompileUnit *TheCU = getCompileUnit(CATy); - if (DIEEntry *Entry = TheCU->getDIEEntry(CATy)) - TheCU->addGlobalType(CATy.getName(), Entry->getEntry()); - } - } -} - /// constructScopeDIE - Construct a DIE for this scope. DIE *DwarfDebug::constructScopeDIE(DbgScope *Scope) { if (!Scope || !Scope->getScopeNode()) @@ -1858,7 +802,7 @@ DIE *DwarfDebug::constructScopeDIE(DbgScope *Scope) { ScopeDIE->addChild(*I); if (DS.isSubprogram()) - addPubTypes(DISubprogram(DS)); + getCompileUnit(DS)->addPubTypes(DISubprogram(DS)); return ScopeDIE; } @@ -1875,11 +819,9 @@ unsigned DwarfDebug::GetOrCreateSourceID(StringRef FileName, return GetOrCreateSourceID("<stdin>", StringRef()); // MCStream expects full path name as filename. - if (!DirName.empty() && !FileName.startswith("/")) { - std::string FullPathName(DirName.data()); - if (!DirName.endswith("/")) - FullPathName += "/"; - FullPathName += FileName.data(); + if (!DirName.empty() && !sys::path::is_absolute(FileName)) { + SmallString<128> FullPathName = DirName; + sys::path::append(FullPathName, FileName); // Here FullPathName will be copied into StringMap by GetOrCreateSourceID. return GetOrCreateSourceID(StringRef(FullPathName), StringRef()); } @@ -1897,21 +839,6 @@ unsigned DwarfDebug::GetOrCreateSourceID(StringRef FileName, return SrcId; } -/// getOrCreateNameSpace - Create a DIE for DINameSpace. -DIE *DwarfDebug::getOrCreateNameSpace(DINameSpace NS) { - CompileUnit *TheCU = getCompileUnit(NS); - DIE *NDie = TheCU->getDIE(NS); - if (NDie) - return NDie; - NDie = new DIE(dwarf::DW_TAG_namespace); - TheCU->insertDIE(NS, NDie); - if (!NS.getName().empty()) - addString(NDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, NS.getName()); - addSourceLine(NDie, NS); - addToContextOwner(NDie, NS.getContext()); - return NDie; -} - /// constructCompileUnit - Create new CompileUnit for the given /// metadata node with tag DW_TAG_compile_unit. 
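The GetOrCreateSourceID hunk above replaces hand-rolled "/" concatenation with sys::path::is_absolute and sys::path::append, which also fixes the old startswith("/") test that only understood POSIX paths. A standalone analogue of the corrected behavior for POSIX-style paths (joinPath is an illustrative helper, not LLVM API):

    #include <string>

    // Prepend the compile directory only when the file name is not
    // already absolute, and normalize the trailing separator.
    std::string joinPath(const std::string &Dir, const std::string &File) {
      if (Dir.empty() || (!File.empty() && File[0] == '/'))
        return File;                      // nothing to prepend
      if (Dir[Dir.size() - 1] == '/')
        return Dir + File;
      return Dir + "/" + File;
    }

The real sys::path routines additionally handle platform-specific separators and root names, which is the point of delegating to them.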
void DwarfDebug::constructCompileUnit(const MDNode *N) { @@ -1921,37 +848,37 @@ void DwarfDebug::constructCompileUnit(const MDNode *N) { unsigned ID = GetOrCreateSourceID(FN, Dir); DIE *Die = new DIE(dwarf::DW_TAG_compile_unit); - addString(Die, dwarf::DW_AT_producer, dwarf::DW_FORM_string, - DIUnit.getProducer()); - addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2, - DIUnit.getLanguage()); - addString(Die, dwarf::DW_AT_name, dwarf::DW_FORM_string, FN); + CompileUnit *NewCU = new CompileUnit(ID, Die, Asm, this); + NewCU->addString(Die, dwarf::DW_AT_producer, dwarf::DW_FORM_string, + DIUnit.getProducer()); + NewCU->addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2, + DIUnit.getLanguage()); + NewCU->addString(Die, dwarf::DW_AT_name, dwarf::DW_FORM_string, FN); // Use DW_AT_entry_pc instead of DW_AT_low_pc/DW_AT_high_pc pair. This // simplifies debug range entries. - addUInt(Die, dwarf::DW_AT_entry_pc, dwarf::DW_FORM_addr, 0); + NewCU->addUInt(Die, dwarf::DW_AT_entry_pc, dwarf::DW_FORM_addr, 0); // DW_AT_stmt_list is a offset of line number information for this // compile unit in debug_line section. - if (Asm->MAI->doesDwarfUsesAbsoluteLabelForStmtList()) - addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_addr, - Asm->GetTempSymbol("section_line")); + if(Asm->MAI->doesDwarfRequireRelocationForSectionOffset()) + NewCU->addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, + Asm->GetTempSymbol("section_line")); else - addUInt(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, 0); + NewCU->addUInt(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, 0); if (!Dir.empty()) - addString(Die, dwarf::DW_AT_comp_dir, dwarf::DW_FORM_string, Dir); + NewCU->addString(Die, dwarf::DW_AT_comp_dir, dwarf::DW_FORM_string, Dir); if (DIUnit.isOptimized()) - addUInt(Die, dwarf::DW_AT_APPLE_optimized, dwarf::DW_FORM_flag, 1); + NewCU->addUInt(Die, dwarf::DW_AT_APPLE_optimized, dwarf::DW_FORM_flag, 1); StringRef Flags = DIUnit.getFlags(); if (!Flags.empty()) - addString(Die, dwarf::DW_AT_APPLE_flags, dwarf::DW_FORM_string, Flags); - + NewCU->addString(Die, dwarf::DW_AT_APPLE_flags, dwarf::DW_FORM_string, Flags); + unsigned RVer = DIUnit.getRunTimeVersion(); if (RVer) - addUInt(Die, dwarf::DW_AT_APPLE_major_runtime_vers, + NewCU->addUInt(Die, dwarf::DW_AT_APPLE_major_runtime_vers, dwarf::DW_FORM_data1, RVer); - CompileUnit *NewCU = new CompileUnit(ID, Die); if (!FirstCU) FirstCU = NewCU; CUMap.insert(std::make_pair(N, NewCU)); @@ -2047,38 +974,34 @@ void DwarfDebug::constructGlobalVariableDIE(const MDNode *N) { bool isGlobalVariable = GV.getGlobal() != NULL; // Add name. - addString(VariableDIE, dwarf::DW_AT_name, dwarf::DW_FORM_string, - GV.getDisplayName()); + TheCU->addString(VariableDIE, dwarf::DW_AT_name, dwarf::DW_FORM_string, + GV.getDisplayName()); StringRef LinkageName = GV.getLinkageName(); if (!LinkageName.empty() && isGlobalVariable) - addString(VariableDIE, dwarf::DW_AT_MIPS_linkage_name, dwarf::DW_FORM_string, - getRealLinkageName(LinkageName)); + TheCU->addString(VariableDIE, dwarf::DW_AT_MIPS_linkage_name, + dwarf::DW_FORM_string, + getRealLinkageName(LinkageName)); // Add type. - addType(VariableDIE, GTy); - if (GTy.isCompositeType() && !GTy.getName().empty() - && !GTy.isForwardDecl()) { - DIEEntry *Entry = TheCU->getDIEEntry(GTy); - assert(Entry && "Missing global type!"); - TheCU->addGlobalType(GTy.getName(), Entry->getEntry()); - } + TheCU->addType(VariableDIE, GTy); + // Add scoping info. 
@@ -2047,38 +974,34 @@ void DwarfDebug::constructGlobalVariableDIE(const MDNode *N) {
   bool isGlobalVariable = GV.getGlobal() != NULL;

   // Add name.
-  addString(VariableDIE, dwarf::DW_AT_name, dwarf::DW_FORM_string,
-            GV.getDisplayName());
+  TheCU->addString(VariableDIE, dwarf::DW_AT_name, dwarf::DW_FORM_string,
+                   GV.getDisplayName());
   StringRef LinkageName = GV.getLinkageName();
   if (!LinkageName.empty() && isGlobalVariable)
-    addString(VariableDIE, dwarf::DW_AT_MIPS_linkage_name, dwarf::DW_FORM_string,
-              getRealLinkageName(LinkageName));
+    TheCU->addString(VariableDIE, dwarf::DW_AT_MIPS_linkage_name,
+                     dwarf::DW_FORM_string,
+                     getRealLinkageName(LinkageName));
   // Add type.
-  addType(VariableDIE, GTy);
-  if (GTy.isCompositeType() && !GTy.getName().empty()
-      && !GTy.isForwardDecl()) {
-    DIEEntry *Entry = TheCU->getDIEEntry(GTy);
-    assert(Entry && "Missing global type!");
-    TheCU->addGlobalType(GTy.getName(), Entry->getEntry());
-  }
+  TheCU->addType(VariableDIE, GTy);
+
   // Add scoping info.
   if (!GV.isLocalToUnit()) {
-    addUInt(VariableDIE, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
+    TheCU->addUInt(VariableDIE, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
     // Expose as global.
     TheCU->addGlobal(GV.getName(), VariableDIE);
   }
   // Add line number info.
-  addSourceLine(VariableDIE, GV);
+  TheCU->addSourceLine(VariableDIE, GV);
   // Add to map.
   TheCU->insertDIE(N, VariableDIE);
   // Add to context owner.
   DIDescriptor GVContext = GV.getContext();
-  addToContextOwner(VariableDIE, GVContext);
+  TheCU->addToContextOwner(VariableDIE, GVContext);
   // Add location.
   if (isGlobalVariable) {
     DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
-    addLabel(Block, 0, dwarf::DW_FORM_udata,
+    TheCU->addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
+    TheCU->addLabel(Block, 0, dwarf::DW_FORM_udata,
              Asm->Mang->getSymbol(GV.getGlobal()));
     // Do not create specification DIE if context is either compile unit
     // or a subprogram.
@@ -2086,28 +1009,28 @@ void DwarfDebug::constructGlobalVariableDIE(const MDNode *N) {
         !GVContext.isFile() && !isSubprogramContext(GVContext)) {
       // Create specification DIE.
       DIE *VariableSpecDIE = new DIE(dwarf::DW_TAG_variable);
-      addDIEEntry(VariableSpecDIE, dwarf::DW_AT_specification,
+      TheCU->addDIEEntry(VariableSpecDIE, dwarf::DW_AT_specification,
                   dwarf::DW_FORM_ref4, VariableDIE);
-      addBlock(VariableSpecDIE, dwarf::DW_AT_location, 0, Block);
-      addUInt(VariableDIE, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
+      TheCU->addBlock(VariableSpecDIE, dwarf::DW_AT_location, 0, Block);
+      TheCU->addUInt(VariableDIE, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
       TheCU->addDie(VariableSpecDIE);
     } else {
-      addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
+      TheCU->addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
     }
   } else if (ConstantInt *CI = dyn_cast_or_null<ConstantInt>(GV.getConstant()))
-    addConstantValue(VariableDIE, CI, isUnsignedDIType(GTy));
+    TheCU->addConstantValue(VariableDIE, CI, isUnsignedDIType(GTy));
   else if (const ConstantExpr *CE = getMergedGlobalExpr(N->getOperand(11))) {
     // GV is a merged global.
     DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
-    addLabel(Block, 0, dwarf::DW_FORM_udata,
-             Asm->Mang->getSymbol(cast<GlobalValue>(CE->getOperand(0))));
+    TheCU->addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
+    TheCU->addLabel(Block, 0, dwarf::DW_FORM_udata,
+                    Asm->Mang->getSymbol(cast<GlobalValue>(CE->getOperand(0))));
     ConstantInt *CII = cast<ConstantInt>(CE->getOperand(2));
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
-    addUInt(Block, 0, dwarf::DW_FORM_udata, CII->getZExtValue());
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
-    addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
+    TheCU->addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+    TheCU->addUInt(Block, 0, dwarf::DW_FORM_udata, CII->getZExtValue());
+    TheCU->addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+    TheCU->addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
   }

   return;
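For a merged global, the hunk above emits the location expression DW_OP_addr <sym>, DW_OP_constu <offset>, DW_OP_plus, i.e. the containing symbol's address plus the member's offset inside the merged blob. A standalone sketch that encodes the same expression bytes; the address and offset values are made up for illustration (the real emitter leaves the address to a relocation):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Real DWARF opcode values (llvm::dwarf::DW_OP_*).
    enum { DW_OP_addr = 0x03, DW_OP_constu = 0x10, DW_OP_plus = 0x22 };

    static void emitULEB128(std::vector<uint8_t> &Out, uint64_t V) {
      do {
        uint8_t Byte = V & 0x7f;
        V >>= 7;
        if (V) Byte |= 0x80;  // more bytes follow
        Out.push_back(Byte);
      } while (V);
    }

    int main() {
      std::vector<uint8_t> Expr;
      Expr.push_back(DW_OP_addr);
      uint64_t Addr = 0xDEADBEEF;  // hypothetical symbol address (8-byte target)
      for (int i = 0; i < 8; ++i)
        Expr.push_back(uint8_t((Addr >> (8 * i)) & 0xff));
      Expr.push_back(DW_OP_constu);
      emitULEB128(Expr, 16);       // offset of this global in the merged blob
      Expr.push_back(DW_OP_plus);
      for (size_t i = 0; i < Expr.size(); ++i)
        std::printf("%02x ", Expr[i]);
      std::printf("\n");
      return 0;
    }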
@@ -2133,7 +1056,7 @@ void DwarfDebug::constructSubprogramDIE(const MDNode *N) {
   TheCU->insertDIE(N, SubprogramDie);

   // Add to context owner.
-  addToContextOwner(SubprogramDie, SP.getContext());
+  TheCU->addToContextOwner(SubprogramDie, SP.getContext());

   // Expose as global.
   TheCU->addGlobal(SP.getName(), SubprogramDie);
@@ -2148,52 +1071,80 @@ void DwarfDebug::beginModule(Module *M) {
   if (DisableDebugInfoPrinting)
     return;

-  DebugInfoFinder DbgFinder;
-  DbgFinder.processModule(*M);
-
-  bool HasDebugInfo = false;
+  // If the module has named metadata anchors then use them, otherwise scan
+  // the module using the debug info finder to collect debug info.
+  NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
+  if (CU_Nodes) {
+
+    NamedMDNode *GV_Nodes = M->getNamedMetadata("llvm.dbg.gv");
+    NamedMDNode *SP_Nodes = M->getNamedMetadata("llvm.dbg.sp");
+    if (!GV_Nodes && !SP_Nodes)
+      // If there are no global variables and no functions then there is
+      // no debug info in this module.
+      return;
+
+    for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i)
+      constructCompileUnit(CU_Nodes->getOperand(i));
+
+    if (GV_Nodes)
+      for (unsigned i = 0, e = GV_Nodes->getNumOperands(); i != e; ++i)
+        constructGlobalVariableDIE(GV_Nodes->getOperand(i));
+
+    if (SP_Nodes)
+      for (unsigned i = 0, e = SP_Nodes->getNumOperands(); i != e; ++i)
+        constructSubprogramDIE(SP_Nodes->getOperand(i));
+
+  } else {

-  // Scan all the compile-units to see if there are any marked as the main unit.
-  // if not, we do not generate debug info.
-  for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(),
-       E = DbgFinder.compile_unit_end(); I != E; ++I) {
-    if (DICompileUnit(*I).isMain()) {
-      HasDebugInfo = true;
-      break;
+    DebugInfoFinder DbgFinder;
+    DbgFinder.processModule(*M);
+
+    bool HasDebugInfo = false;
+    // Scan all the compile-units to see if there are any marked as the main unit.
+    // if not, we do not generate debug info.
+    for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(),
+         E = DbgFinder.compile_unit_end(); I != E; ++I) {
+      if (DICompileUnit(*I).isMain()) {
+        HasDebugInfo = true;
+        break;
+      }
     }
+    if (!HasDebugInfo) return;
+
+    // Create all the compile unit DIEs.
+    for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(),
+         E = DbgFinder.compile_unit_end(); I != E; ++I)
+      constructCompileUnit(*I);
+
+    // Create DIEs for each global variable.
+    for (DebugInfoFinder::iterator I = DbgFinder.global_variable_begin(),
+         E = DbgFinder.global_variable_end(); I != E; ++I)
+      constructGlobalVariableDIE(*I);
+
+    // Create DIEs for each subprogram.
+    for (DebugInfoFinder::iterator I = DbgFinder.subprogram_begin(),
+         E = DbgFinder.subprogram_end(); I != E; ++I)
+      constructSubprogramDIE(*I);
   }
-
-  if (!HasDebugInfo) return;
-
+
   // Tell MMI that we have debug info.
   MMI->setDebugInfoAvailability(true);
-
+
   // Emit initial sections.
   EmitSectionLabels();
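beginModule now prefers the llvm.dbg.cu/llvm.dbg.gv/llvm.dbg.sp named-metadata anchors and only falls back to a full DebugInfoFinder scan when they are absent. A hedged sketch of reading those anchors, assuming the 2.9-era header layout (llvm/Module.h at the top level):

    #include "llvm/Metadata.h"
    #include "llvm/Module.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    // Report how many entries each debug-info anchor holds; when an anchor
    // is missing the real code falls back to DebugInfoFinder instead.
    static void listDebugAnchors(Module &M) {
      static const char *const Anchors[] =
        { "llvm.dbg.cu", "llvm.dbg.gv", "llvm.dbg.sp" };
      for (unsigned a = 0; a != 3; ++a) {
        NamedMDNode *NMD = M.getNamedMetadata(Anchors[a]);
        if (!NMD) {
          outs() << Anchors[a] << ": absent\n";
          continue;
        }
        outs() << Anchors[a] << ": " << NMD->getNumOperands() << " entries\n";
      }
    }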
-  // Create all the compile unit DIEs.
-  for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(),
-       E = DbgFinder.compile_unit_end(); I != E; ++I)
-    constructCompileUnit(*I);
-
-  // Create DIEs for each subprogram.
-  for (DebugInfoFinder::iterator I = DbgFinder.subprogram_begin(),
-       E = DbgFinder.subprogram_end(); I != E; ++I)
-    constructSubprogramDIE(*I);
-
-  // Create DIEs for each global variable.
-  for (DebugInfoFinder::iterator I = DbgFinder.global_variable_begin(),
-       E = DbgFinder.global_variable_end(); I != E; ++I)
-    constructGlobalVariableDIE(*I);
-
   //getOrCreateTypeDIE
   if (NamedMDNode *NMD = M->getNamedMetadata("llvm.dbg.enum"))
-    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
-      getOrCreateTypeDIE(DIType(NMD->getOperand(i)));
+    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+      DIType Ty(NMD->getOperand(i));
+      getCompileUnit(Ty)->getOrCreateTypeDIE(Ty);
+    }

   if (NamedMDNode *NMD = M->getNamedMetadata("llvm.dbg.ty"))
-    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
-      getOrCreateTypeDIE(DIType(NMD->getOperand(i)));
+    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+      DIType Ty(NMD->getOperand(i));
+      getCompileUnit(Ty)->getOrCreateTypeDIE(Ty);
+    }

   // Prime section data.
   SectionMap.insert(Asm->getObjFileLowering().getTextSection());
@@ -2244,7 +1195,7 @@ void DwarfDebug::endModule() {
   for (SmallPtrSet<DIE *, 4>::iterator AI = InlinedSubprogramDIEs.begin(),
        AE = InlinedSubprogramDIEs.end(); AI != AE; ++AI) {
     DIE *ISP = *AI;
-    addUInt(ISP, dwarf::DW_AT_inline, 0, dwarf::DW_INL_inlined);
+    FirstCU->addUInt(ISP, dwarf::DW_AT_inline, 0, dwarf::DW_INL_inlined);
   }

   for (DenseMap<DIE *, const MDNode *>::iterator CI = ContainingTypeMap.begin(),
@@ -2254,7 +1205,8 @@ void DwarfDebug::endModule() {
     if (!N) continue;
     DIE *NDie = getCompileUnit(N)->getDIE(N);
     if (!NDie) continue;
-    addDIEEntry(SPDie, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4, NDie);
+    getCompileUnit(N)->addDIEEntry(SPDie, dwarf::DW_AT_containing_type,
+                                   dwarf::DW_FORM_ref4, NDie);
   }

   // Standard sections final addresses.
@@ -2269,14 +1221,6 @@ void DwarfDebug::endModule() {
     Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("section_end", i));
   }

-  // Emit common frame information.
-  emitCommonDebugFrame();
-
-  // Emit function debug frame information
-  for (std::vector<FunctionDebugFrameInfo>::iterator I = DebugFrames.begin(),
-         E = DebugFrames.end(); I != E; ++I)
-    emitFunctionDebugFrame(*I);
-
   // Compute DIE offsets and sizes.
   computeSizeAndOffsets();
@@ -2464,15 +1408,10 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF,
          HI = History.begin(), HE = History.end(); HI != HE; ++HI) {
       const MachineInstr *Begin = *HI;
       assert(Begin->isDebugValue() && "Invalid History entry");
-      MachineLocation MLoc;
-      if (Begin->getNumOperands() == 3) {
-        if (Begin->getOperand(0).isReg() && Begin->getOperand(1).isImm())
-          MLoc.set(Begin->getOperand(0).getReg(), Begin->getOperand(1).getImm());
-      } else
-        MLoc = Asm->getDebugValueLocation(Begin);

-      // FIXME: emitDebugLoc only understands registers.
-      if (!MLoc.getReg())
+      // Check if DBG_VALUE is truncating a range.
+      if (Begin->getNumOperands() > 1 && Begin->getOperand(0).isReg()
+          && !Begin->getOperand(0).getReg())
         continue;

       // Compute the range for a register location.
@@ -2481,7 +1420,7 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF,
       if (HI + 1 == HE)
         // If Begin is the last instruction in History then its value is valid
-        // until the end of the funtion.
+        // until the end of the function.
         SLabel = FunctionEndSym;
       else {
         const MachineInstr *End = HI[1];
@@ -2496,7 +1435,25 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF,
       }

       // The value is valid until the next DBG_VALUE or clobber.
-      DotDebugLocEntries.push_back(DotDebugLocEntry(FLabel, SLabel, MLoc));
+      MachineLocation MLoc;
+      if (Begin->getNumOperands() == 3) {
+        if (Begin->getOperand(0).isReg() && Begin->getOperand(1).isImm()) {
+          MLoc.set(Begin->getOperand(0).getReg(),
+                   Begin->getOperand(1).getImm());
+          DotDebugLocEntries.
+            push_back(DotDebugLocEntry(FLabel, SLabel, MLoc, Var));
+        }
+        // FIXME: Handle isFPImm also.
+        else if (Begin->getOperand(0).isImm()) {
+          DotDebugLocEntries.
+            push_back(DotDebugLocEntry(FLabel, SLabel,
+                                       Begin->getOperand(0).getImm()));
+        }
+      } else {
+        MLoc = Asm->getDebugValueLocation(Begin);
+        DotDebugLocEntries.
+          push_back(DotDebugLocEntry(FLabel, SLabel, MLoc, Var));
+      }
     }
     DotDebugLocEntries.push_back(DotDebugLocEntry());
   }
@@ -2533,12 +1490,17 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
   if (!MI->isDebugValue()) {
     DebugLoc DL = MI->getDebugLoc();
     if (DL != PrevInstLoc && (!DL.isUnknown() || UnknownLocations)) {
+      unsigned Flags = DWARF2_FLAG_IS_STMT;
       PrevInstLoc = DL;
+      if (DL == PrologEndLoc) {
+        Flags |= DWARF2_FLAG_PROLOGUE_END;
+        PrologEndLoc = DebugLoc();
+      }
       if (!DL.isUnknown()) {
         const MDNode *Scope = DL.getScope(Asm->MF->getFunction()->getContext());
-        recordSourceLine(DL.getLine(), DL.getCol(), Scope);
+        recordSourceLine(DL.getLine(), DL.getCol(), Scope, Flags);
       } else
-        recordSourceLine(0, 0, 0);
+        recordSourceLine(0, 0, 0, 0);
     }
   }
@@ -2850,41 +1812,22 @@ void DwarfDebug::identifyScopeMarkers() {
   }
 }

-/// FindFirstDebugLoc - Find the first debug location in the function. This
-/// is intended to be an approximation for the source position of the
-/// beginning of the function.
-static DebugLoc FindFirstDebugLoc(const MachineFunction *MF) {
-  for (MachineFunction::const_iterator I = MF->begin(), E = MF->end();
-       I != E; ++I)
-    for (MachineBasicBlock::const_iterator MBBI = I->begin(), MBBE = I->end();
-         MBBI != MBBE; ++MBBI) {
-      DebugLoc DL = MBBI->getDebugLoc();
-      if (!DL.isUnknown())
-        return DL;
-    }
-  return DebugLoc();
+/// getScopeNode - Get MDNode for DebugLoc's scope.
+static MDNode *getScopeNode(DebugLoc DL, const LLVMContext &Ctx) {
+  if (MDNode *InlinedAt = DL.getInlinedAt(Ctx))
+    return getScopeNode(DebugLoc::getFromDILocation(InlinedAt), Ctx);
+  return DL.getScope(Ctx);
 }

-#ifndef NDEBUG
-/// CheckLineNumbers - Count basicblocks whose instructions do not have any
-/// line number information.
-static void CheckLineNumbers(const MachineFunction *MF) {
-  for (MachineFunction::const_iterator I = MF->begin(), E = MF->end();
-       I != E; ++I) {
-    bool FoundLineNo = false;
-    for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end();
-         II != IE; ++II) {
-      const MachineInstr *MI = II;
-      if (!MI->getDebugLoc().isUnknown()) {
-        FoundLineNo = true;
-        break;
-      }
-    }
-    if (!FoundLineNo && I->size())
-      ++BlocksWithoutLineNo;
-  }
+/// getFnDebugLoc - Walk up the scope chain of given debug loc and find
+/// line number info for the function.
+static DebugLoc getFnDebugLoc(DebugLoc DL, const LLVMContext &Ctx) {
+  const MDNode *Scope = getScopeNode(DL, Ctx);
+  DISubprogram SP = getDISubprogram(Scope);
+  if (SP.Verify())
+    return DebugLoc::get(SP.getLineNumber(), 0, SP);
+  return DebugLoc();
 }
-#endif
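getScopeNode recurses through the inlined-at chain so that line-table entries for inlined code are attributed to the outermost call site. The same walk on a toy location type; Loc and its fields are stand-ins for DebugLoc/MDNode, not LLVM types:

    #include <cstdio>

    // A location that was inlined carries a pointer to the call-site
    // location; following the chain yields the outermost caller's scope.
    struct Loc {
      const char *Scope;    // stands in for the MDNode scope
      const Loc *InlinedAt; // null when the code was not inlined
    };

    static const char *getScopeNode(const Loc &L) {
      if (L.InlinedAt)
        return getScopeNode(*L.InlinedAt); // recurse to the call site
      return L.Scope;
    }

    int main() {
      Loc Outer = { "caller", 0 };
      Loc Inner = { "callee", &Outer };
      std::printf("%s\n", getScopeNode(Inner)); // prints "caller"
      return 0;
    }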
 /// beginFunction - Gather pre-function debug information.  Assumes being
 /// emitted immediately after the function entry point.
@@ -2892,44 +1835,16 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
   if (!MMI->hasDebugInfo()) return;
   if (!extractScopeInformation()) return;

-#ifndef NDEBUG
-  CheckLineNumbers(MF);
-#endif
-
   FunctionBeginSym = Asm->GetTempSymbol("func_begin",
                                         Asm->getFunctionNumber());
   // Assumes in correct section after the entry point.
   Asm->OutStreamer.EmitLabel(FunctionBeginSym);

-  // Emit label for the implicitly defined dbg.stoppoint at the start of the
-  // function.
-  DebugLoc FDL = FindFirstDebugLoc(MF);
-  if (FDL.isUnknown()) return;
-
-  const MDNode *Scope = FDL.getScope(MF->getFunction()->getContext());
-  const MDNode *TheScope = 0;
-
-  DISubprogram SP = getDISubprogram(Scope);
-  unsigned Line, Col;
-  if (SP.Verify()) {
-    Line = SP.getLineNumber();
-    Col = 0;
-    TheScope = SP;
-  } else {
-    Line = FDL.getLine();
-    Col = FDL.getCol();
-    TheScope = Scope;
-  }
-
-  recordSourceLine(Line, Col, TheScope);
-
   assert(UserVariables.empty() && DbgValues.empty() && "Maps weren't cleaned");

   /// ProcessedArgs - Collection of arguments already processed.
   SmallPtrSet<const MDNode *, 8> ProcessedArgs;

-  const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo();
-
   /// LiveUserVar - Map physreg numbers to the MDNode they contain.
   std::vector<const MDNode*> LiveUserVar(TRI->getNumRegs());
@@ -2995,6 +1910,11 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
       if (!MI->isLabel())
         AtBlockEntry = false;

+      // First known non DBG_VALUE location marks beginning of function
+      // body.
+      if (PrologEndLoc.isUnknown() && !MI->getDebugLoc().isUnknown())
+        PrologEndLoc = MI->getDebugLoc();
+
       // Check if the instruction clobbers any registers with debug vars.
       for (MachineInstr::const_mop_iterator MOI = MI->operands_begin(),
              MOE = MI->operands_end(); MOI != MOE; ++MOI) {
@@ -3063,6 +1983,15 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
   PrevInstLoc = DebugLoc();
   PrevLabel = FunctionBeginSym;
+
+  // Record beginning of function.
+  if (!PrologEndLoc.isUnknown()) {
+    DebugLoc FnStartDL = getFnDebugLoc(PrologEndLoc,
+                                       MF->getFunction()->getContext());
+    recordSourceLine(FnStartDL.getLine(), FnStartDL.getCol(),
+                     FnStartDL.getScope(MF->getFunction()->getContext()),
+                     DWARF2_FLAG_IS_STMT);
+  }
 }
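The prologue detection added above is simply "first instruction that is not a DBG_VALUE and has a known location". The same scan over a toy instruction record; Inst is a stand-in type, and line 0 plays the role of an unknown DebugLoc:

    #include <cstdio>

    struct Inst { bool IsDebugValue; int Line; }; // Line 0 == unknown location

    // Return the line of the first real, located instruction: the point
    // the .loc directive will flag with prologue_end. 0 means "no idea".
    static int findPrologEnd(const Inst *I, int N) {
      for (int i = 0; i < N; ++i)
        if (!I[i].IsDebugValue && I[i].Line)
          return I[i].Line;
      return 0;
    }

    int main() {
      Inst Body[] = { { true, 5 }, { false, 0 }, { false, 7 }, { false, 8 } };
      std::printf("prologue ends at line %d\n", findPrologEnd(Body, 4)); // 7
      return 0;
    }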
 /// endFunction - Gather and emit post-function debug information.
@@ -3109,8 +2038,9 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
     DIE *CurFnDIE = constructScopeDIE(CurrentFnDbgScope);

     if (!DisableFramePointerElim(*MF))
-      addUInt(CurFnDIE, dwarf::DW_AT_APPLE_omit_frame_ptr,
-              dwarf::DW_FORM_flag, 1);
+      getCompileUnit(CurrentFnDbgScope->getScopeNode())->addUInt(CurFnDIE,
+                                                 dwarf::DW_AT_APPLE_omit_frame_ptr,
+                                                 dwarf::DW_FORM_flag, 1);

     DebugFrames.push_back(FunctionDebugFrameInfo(Asm->getFunctionNumber(),
@@ -3119,7 +2049,7 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {

   // Clear debug info
   CurrentFnDbgScope = NULL;
-  CurrentFnArguments.clear();
+  DeleteContainerPointers(CurrentFnArguments);
   DbgVariableToFrameIndexMap.clear();
   VarToAbstractVarMap.clear();
   DbgVariableToDbgInstMap.clear();
@@ -3176,7 +2106,8 @@ DbgScope *DwarfDebug::findDbgScope(const MachineInstr *MInsn) {
 /// recordSourceLine - Register a source line with debug info. Returns the
 /// unique label that was emitted and which provides correspondence to
 /// the source line list.
-void DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S){
+void DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S,
+                                  unsigned Flags) {
   StringRef Fn;
   StringRef Dir;
   unsigned Src = 1;
@@ -3204,9 +2135,8 @@ void DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S){

     Src = GetOrCreateSourceID(Fn, Dir);
   }
-
-  Asm->OutStreamer.EmitDwarfLocDirective(Src, Line, Col, DWARF2_FLAG_IS_STMT,
-                                         0, 0);
+  Asm->OutStreamer.EmitDwarfLocDirective(Src, Line, Col, Flags,
+                                         0, 0, Fn);
 }

 //===----------------------------------------------------------------------===//
@@ -3264,17 +2194,15 @@ DwarfDebug::computeSizeAndOffset(DIE *Die, unsigned Offset, bool Last) {
 /// computeSizeAndOffsets - Compute the size and offset of all the DIEs.
 ///
 void DwarfDebug::computeSizeAndOffsets() {
-  unsigned PrevOffset = 0;
   for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
          E = CUMap.end(); I != E; ++I) {
     // Compute size of compile unit header.
-    static unsigned Offset = PrevOffset +
+    unsigned Offset =
       sizeof(int32_t) + // Length of Compilation Unit Info
       sizeof(int16_t) + // DWARF version number
       sizeof(int32_t) + // Offset Into Abbrev. Section
       sizeof(int8_t);   // Pointer Size (in bytes)
     computeSizeAndOffset(I->second->getCUDie(), Offset, true);
-    PrevOffset = Offset;
   }
 }
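The computeSizeAndOffsets change fixes a classic bug: a function-local `static unsigned Offset = ...` is initialized only the first time control reaches it, so every compile unit after the first reused a stale offset. A minimal self-contained demonstration of why the static local misbehaves (names and values are illustrative):

    #include <cstdint>
    #include <cstdio>

    // Mirrors the per-CU header bytes summed in the hunk above.
    static unsigned headerSize() {
      return sizeof(int32_t) + sizeof(int16_t) + sizeof(int32_t) + sizeof(int8_t);
    }

    int main() {
      for (unsigned CU = 0; CU < 3; ++CU) {
        static unsigned OnceOnly = headerSize() + CU; // initialized on pass 0 only!
        unsigned Fresh = headerSize() + CU;           // recomputed every pass
        std::printf("CU %u: static=%u local=%u\n", CU, OnceOnly, Fresh);
      }
      return 0; // static stays 11 while the plain local advances 11, 12, 13
    }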
@@ -3296,11 +2224,6 @@ void DwarfDebug::EmitSectionLabels() {
   const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();

   // Dwarf sections base addresses.
-  if (Asm->MAI->doesDwarfRequireFrameSection()) {
-    DwarfFrameSectionSym =
-      EmitSectionSym(Asm, TLOF.getDwarfFrameSection(), "section_debug_frame");
-  }
-
   DwarfInfoSectionSym =
     EmitSectionSym(Asm, TLOF.getDwarfInfoSection(), "section_info");
   DwarfAbbrevSectionSym =
@@ -3435,8 +2358,7 @@ void DwarfDebug::emitDebugInfo() {
     unsigned ContentSize = Die->getSize() +
       sizeof(int16_t) + // DWARF version number
       sizeof(int32_t) + // Offset Into Abbrev. Section
-      sizeof(int8_t) +  // Pointer Size (in bytes)
-      sizeof(int32_t);  // FIXME - extra pad for gdb bug.
+      sizeof(int8_t);   // Pointer Size (in bytes)

     Asm->OutStreamer.AddComment("Length of Compilation Unit Info");
     Asm->EmitInt32(ContentSize);
@@ -3449,12 +2371,6 @@
     Asm->EmitInt8(Asm->getTargetData().getPointerSize());

     emitDIE(Die);
-    // FIXME - extra padding for gdb bug.
-    Asm->OutStreamer.AddComment("4 extra padding bytes for GDB");
-    Asm->EmitInt8(0);
-    Asm->EmitInt8(0);
-    Asm->EmitInt8(0);
-    Asm->EmitInt8(0);
     Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("info_end", TheCU->getID()));
   }
 }
@@ -3515,91 +2431,6 @@ void DwarfDebug::emitEndOfLineMatrix(unsigned SectionEnd) {
   Asm->EmitInt8(1);
 }

-/// emitCommonDebugFrame - Emit common frame info into a debug frame section.
-///
-void DwarfDebug::emitCommonDebugFrame() {
-  if (!Asm->MAI->doesDwarfRequireFrameSection())
-    return;
-
-  int stackGrowth = Asm->getTargetData().getPointerSize();
-  if (Asm->TM.getFrameLowering()->getStackGrowthDirection() ==
-      TargetFrameLowering::StackGrowsDown)
-    stackGrowth *= -1;
-
-  // Start the dwarf frame section.
-  Asm->OutStreamer.SwitchSection(
-                              Asm->getObjFileLowering().getDwarfFrameSection());
-
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_frame_common"));
-  Asm->OutStreamer.AddComment("Length of Common Information Entry");
-  Asm->EmitLabelDifference(Asm->GetTempSymbol("debug_frame_common_end"),
-                           Asm->GetTempSymbol("debug_frame_common_begin"), 4);
-
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_frame_common_begin"));
-  Asm->OutStreamer.AddComment("CIE Identifier Tag");
-  Asm->EmitInt32((int)dwarf::DW_CIE_ID);
-  Asm->OutStreamer.AddComment("CIE Version");
-  Asm->EmitInt8(dwarf::DW_CIE_VERSION);
-  Asm->OutStreamer.AddComment("CIE Augmentation");
-  Asm->OutStreamer.EmitIntValue(0, 1, /*addrspace*/0); // nul terminator.
-  Asm->EmitULEB128(1, "CIE Code Alignment Factor");
-  Asm->EmitSLEB128(stackGrowth, "CIE Data Alignment Factor");
-  Asm->OutStreamer.AddComment("CIE RA Column");
-  const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
-  const TargetFrameLowering *TFI = Asm->TM.getFrameLowering();
-  Asm->EmitInt8(RI->getDwarfRegNum(RI->getRARegister(), false));
-
-  std::vector<MachineMove> Moves;
-  TFI->getInitialFrameState(Moves);
-
-  Asm->EmitFrameMoves(Moves, 0, false);
-
-  Asm->EmitAlignment(2);
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_frame_common_end"));
-}
-
-/// emitFunctionDebugFrame - Emit per function frame info into a debug frame
-/// section.
-void DwarfDebug::
-emitFunctionDebugFrame(const FunctionDebugFrameInfo &DebugFrameInfo) {
-  if (!Asm->MAI->doesDwarfRequireFrameSection())
-    return;
-
-  // Start the dwarf frame section.
-  Asm->OutStreamer.SwitchSection(
-                              Asm->getObjFileLowering().getDwarfFrameSection());
-
-  Asm->OutStreamer.AddComment("Length of Frame Information Entry");
-  MCSymbol *DebugFrameBegin =
-    Asm->GetTempSymbol("debug_frame_begin", DebugFrameInfo.Number);
-  MCSymbol *DebugFrameEnd =
-    Asm->GetTempSymbol("debug_frame_end", DebugFrameInfo.Number);
-  Asm->EmitLabelDifference(DebugFrameEnd, DebugFrameBegin, 4);
-
-  Asm->OutStreamer.EmitLabel(DebugFrameBegin);
-
-  Asm->OutStreamer.AddComment("FDE CIE offset");
-  Asm->EmitSectionOffset(Asm->GetTempSymbol("debug_frame_common"),
-                         DwarfFrameSectionSym);
-
-  Asm->OutStreamer.AddComment("FDE initial location");
-  MCSymbol *FuncBeginSym =
-    Asm->GetTempSymbol("func_begin", DebugFrameInfo.Number);
-  Asm->OutStreamer.EmitSymbolValue(FuncBeginSym,
-                                   Asm->getTargetData().getPointerSize(),
-                                   0/*AddrSpace*/);
-
-
-  Asm->OutStreamer.AddComment("FDE address range");
-  Asm->EmitLabelDifference(Asm->GetTempSymbol("func_end",DebugFrameInfo.Number),
-                           FuncBeginSym, Asm->getTargetData().getPointerSize());
-
-  Asm->EmitFrameMoves(DebugFrameInfo.Moves, FuncBeginSym, false);
-
-  Asm->EmitAlignment(2);
-  Asm->OutStreamer.EmitLabel(DebugFrameEnd);
-}
-
 /// emitDebugPubNames - Emit visible names into a debug pubnames section.
 ///
 void DwarfDebug::emitDebugPubNames() {
@@ -3760,33 +2591,63 @@ void DwarfDebug::emitDebugLoc() {
       } else {
         Asm->OutStreamer.EmitSymbolValue(Entry.Begin, Size, 0);
         Asm->OutStreamer.EmitSymbolValue(Entry.End, Size, 0);
-        const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
-        unsigned Reg = RI->getDwarfRegNum(Entry.Loc.getReg(), false);
-        if (int Offset =  Entry.Loc.getOffset()) {
-          // If the value is at a certain offset from frame register then
-          // use DW_OP_fbreg.
-          unsigned OffsetSize = Offset ? MCAsmInfo::getSLEB128Size(Offset) : 1;
-          Asm->OutStreamer.AddComment("Loc expr size");
-          Asm->EmitInt16(1 + OffsetSize);
-          Asm->OutStreamer.AddComment(
-            dwarf::OperationEncodingString(dwarf::DW_OP_fbreg));
-          Asm->EmitInt8(dwarf::DW_OP_fbreg);
-          Asm->OutStreamer.AddComment("Offset");
-          Asm->EmitSLEB128(Offset);
-        } else {
-          if (Reg < 32) {
-            Asm->OutStreamer.AddComment("Loc expr size");
-            Asm->EmitInt16(1);
-            Asm->OutStreamer.AddComment(
-              dwarf::OperationEncodingString(dwarf::DW_OP_reg0 + Reg));
-            Asm->EmitInt8(dwarf::DW_OP_reg0 + Reg);
+        DIVariable DV(Entry.Variable);
+        Asm->OutStreamer.AddComment("Loc expr size");
+        MCSymbol *begin = Asm->OutStreamer.getContext().CreateTempSymbol();
+        MCSymbol *end = Asm->OutStreamer.getContext().CreateTempSymbol();
+        Asm->EmitLabelDifference(end, begin, 2);
+        Asm->OutStreamer.EmitLabel(begin);
+        if (Entry.isConstant()) {
+          DIBasicType BTy(DV.getType());
+          if (BTy.Verify() &&
+              (BTy.getEncoding()  == dwarf::DW_ATE_signed
+               || BTy.getEncoding() == dwarf::DW_ATE_signed_char)) {
+            Asm->OutStreamer.AddComment("DW_OP_consts");
+            Asm->EmitInt8(dwarf::DW_OP_consts);
+            Asm->EmitSLEB128(Entry.getConstant());
           } else {
-            Asm->OutStreamer.AddComment("Loc expr size");
-            Asm->EmitInt16(1 + MCAsmInfo::getULEB128Size(Reg));
-            Asm->EmitInt8(dwarf::DW_OP_regx);
-            Asm->EmitULEB128(Reg);
+            Asm->OutStreamer.AddComment("DW_OP_constu");
+            Asm->EmitInt8(dwarf::DW_OP_constu);
+            Asm->EmitULEB128(Entry.getConstant());
           }
+        } else if (DV.hasComplexAddress()) {
+          unsigned N = DV.getNumAddrElements();
+          unsigned i = 0;
+          if (N >= 2 && DV.getAddrElement(0) == DIBuilder::OpPlus) {
+            if (Entry.Loc.getOffset()) {
+              i = 2;
+              Asm->EmitDwarfRegOp(Entry.Loc);
+              Asm->OutStreamer.AddComment("DW_OP_deref");
+              Asm->EmitInt8(dwarf::DW_OP_deref);
+              Asm->OutStreamer.AddComment("DW_OP_plus_uconst");
+              Asm->EmitInt8(dwarf::DW_OP_plus_uconst);
+              Asm->EmitSLEB128(DV.getAddrElement(1));
+            } else {
+              // If first address element is OpPlus then emit
+              // DW_OP_breg + Offset instead of DW_OP_reg + Offset.
+              MachineLocation Loc(Entry.Loc.getReg(), DV.getAddrElement(1));
+              Asm->EmitDwarfRegOp(Loc);
+              i = 2;
+            }
+          } else {
+            Asm->EmitDwarfRegOp(Entry.Loc);
+          }
+
+          // Emit remaining complex address elements.
+          for (; i < N; ++i) {
+            uint64_t Element = DV.getAddrElement(i);
+            if (Element == DIBuilder::OpPlus) {
+              Asm->EmitInt8(dwarf::DW_OP_plus_uconst);
+              Asm->EmitULEB128(DV.getAddrElement(++i));
+            } else if (Element == DIBuilder::OpDeref)
+              Asm->EmitInt8(dwarf::DW_OP_deref);
+            else llvm_unreachable("unknown Opcode found in complex address");
+          }
+        } else {
+          // Regular entry.
+          Asm->EmitDwarfRegOp(Entry.Loc);
         }
+        Asm->OutStreamer.EmitLabel(end);
       }
     }
   }
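The new emitDebugLoc path leans on ULEB128 and SLEB128 payloads for DW_OP_constu and DW_OP_consts. Standalone encoders implementing the standard LEB128 algorithms that EmitULEB128/EmitSLEB128 produce; the buffer handling is my own, and the signed version assumes the usual arithmetic right shift on signed integers:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    static void encodeULEB128(uint64_t V, std::vector<uint8_t> &Out) {
      do {
        uint8_t B = V & 0x7f;
        V >>= 7;
        if (V) B |= 0x80; // continuation bit
        Out.push_back(B);
      } while (V);
    }

    static void encodeSLEB128(int64_t V, std::vector<uint8_t> &Out) {
      bool More;
      do {
        uint8_t B = V & 0x7f;
        V >>= 7; // arithmetic shift keeps the sign
        // Stop once the remaining bits are pure sign extension.
        More = !((V == 0 && !(B & 0x40)) || (V == -1 && (B & 0x40)));
        if (More) B |= 0x80;
        Out.push_back(B);
      } while (More);
    }

    int main() {
      std::vector<uint8_t> Buf;
      encodeSLEB128(-129, Buf); // ff 7e
      encodeULEB128(129, Buf);  // 81 01
      for (size_t i = 0; i < Buf.size(); ++i)
        std::printf("%02x ", Buf[i]);
      std::printf("\n");
      return 0;
    }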
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 4aeefde..abda2e6 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -16,6 +16,7 @@
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/Analysis/DebugInfo.h"
 #include "DIE.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/FoldingSet.h"
@@ -39,21 +40,6 @@ class DIEAbbrev;
 class DIE;
 class DIEBlock;
 class DIEEntry;
-class DIArray;
-class DIEnumerator;
-class DIDescriptor;
-class DIVariable;
-class DIGlobal;
-class DIGlobalVariable;
-class DISubprogram;
-class DIBasicType;
-class DIDerivedType;
-class DIType;
-class DINameSpace;
-class DISubrange;
-class DICompositeType;
-class DITemplateTypeParameter;
-class DITemplateValueParameter;

 //===----------------------------------------------------------------------===//
 /// SrcLineInfo - This class is used to record source line correspondence.
@@ -80,10 +66,21 @@ typedef struct DotDebugLocEntry {
   const MCSymbol *Begin;
   const MCSymbol *End;
   MachineLocation Loc;
+  const MDNode *Variable;
   bool Merged;
-  DotDebugLocEntry() : Begin(0), End(0), Merged(false) {}
-  DotDebugLocEntry(const MCSymbol *B, const MCSymbol *E, MachineLocation &L)
-    : Begin(B), End(E), Loc(L), Merged(false) {}
+  bool Constant;
+  int64_t iConstant;
+  DotDebugLocEntry()
+    : Begin(0), End(0), Variable(0), Merged(false),
+      Constant(false), iConstant(0) {}
+  DotDebugLocEntry(const MCSymbol *B, const MCSymbol *E, MachineLocation &L,
+                   const MDNode *V)
+    : Begin(B), End(E), Loc(L), Variable(V), Merged(false),
+      Constant(false), iConstant(0) {}
+  DotDebugLocEntry(const MCSymbol *B, const MCSymbol *E, int64_t i)
+    : Begin(B), End(E), Variable(0), Merged(false),
+      Constant(true), iConstant(i) {}
+
   /// Empty entries are also used as a trigger to emit temp label. Such
   /// labels are referenced to find the debug_loc offset for a given DIE.
   bool isEmpty() { return Begin == 0 && End == 0; }
@@ -94,8 +91,47 @@ typedef struct DotDebugLocEntry {
     Next->Begin = Begin;
     Merged = true;
   }
+  bool isConstant() { return Constant; }
+  int64_t getConstant() { return iConstant; }
 } DotDebugLocEntry;
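DotDebugLocEntry now describes either a machine location or a constant, remembers which variable it belongs to, and uses a default-constructed entry as the end-of-list marker. A self-contained toy mirroring those three constructors; LocEntry and its fields are simplified stand-ins, not the LLVM types:

    #include <cstdint>
    #include <cstdio>

    struct LocEntry {
      const void *Begin, *End, *Variable; // label/metadata stand-ins
      bool Constant;
      int64_t IConst;
      int Reg, Offset;
      // End-of-list marker, like DotDebugLocEntry().
      LocEntry() : Begin(0), End(0), Variable(0), Constant(false),
                   IConst(0), Reg(0), Offset(0) {}
      // Register+offset location tied to a variable.
      LocEntry(const void *B, const void *E, int R, int O, const void *V)
        : Begin(B), End(E), Variable(V), Constant(false),
          IConst(0), Reg(R), Offset(O) {}
      // Constant value (the new DBG_VALUE-of-immediate case).
      LocEntry(const void *B, const void *E, int64_t I)
        : Begin(B), End(E), Variable(0), Constant(true),
          IConst(I), Reg(0), Offset(0) {}
      bool isEmpty() const { return Begin == 0 && End == 0; }
    };

    int main() {
      int b, e, v; // dummies standing in for symbols/metadata
      LocEntry R(&b, &e, 6 /*e.g. rbp*/, -8, &v);
      LocEntry C(&b, &e, int64_t(42));
      LocEntry End;
      std::printf("reg-entry const=%d, const-entry const=%d, end empty=%d\n",
                  (int)R.Constant, (int)C.Constant, (int)End.isEmpty());
      return 0;
    }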
+//===----------------------------------------------------------------------===//
+/// DbgVariable - This class is used to track local variable information.
+///
+class DbgVariable {
+  DIVariable Var;             // Variable Descriptor.
+  DIE *TheDIE;                // Variable DIE.
+  unsigned DotDebugLocOffset; // Offset in DotDebugLocEntries.
+public:
+  // AbsVar may be NULL.
+  DbgVariable(DIVariable V) : Var(V), TheDIE(0), DotDebugLocOffset(~0U) {}
+
+  // Accessors.
+  DIVariable getVariable()           const { return Var; }
+  void setDIE(DIE *D)                      { TheDIE = D; }
+  DIE *getDIE()                      const { return TheDIE; }
+  void setDotDebugLocOffset(unsigned O)    { DotDebugLocOffset = O; }
+  unsigned getDotDebugLocOffset()    const { return DotDebugLocOffset; }
+  StringRef getName()                const { return Var.getName(); }
+  unsigned getTag()                  const { return Var.getTag(); }
+  bool variableHasComplexAddress()   const {
+    assert(Var.Verify() && "Invalid complex DbgVariable!");
+    return Var.hasComplexAddress();
+  }
+  bool isBlockByrefVariable()        const {
+    assert(Var.Verify() && "Invalid complex DbgVariable!");
+    return Var.isBlockByrefVariable();
+  }
+  unsigned getNumAddrElements()      const {
+    assert(Var.Verify() && "Invalid complex DbgVariable!");
+    return Var.getNumAddrElements();
+  }
+  uint64_t getAddrElement(unsigned i) const {
+    return Var.getAddrElement(i);
+  }
+  DIType getType() const;
+};
+
 class DwarfDebug {
   /// Asm - Target of Dwarf emission.
   AsmPrinter *Asm;
@@ -122,12 +158,6 @@ class DwarfDebug {
   /// id mapped to a unique id.
   StringMap<unsigned> SourceIdMap;

-  /// DIEBlocks - A list of all the DIEBlocks in use.
-  std::vector<DIEBlock *> DIEBlocks;
-
-  // DIEValueAllocator - All DIEValues are allocated through this allocator.
-  BumpPtrAllocator DIEValueAllocator;
-
   /// StringPool - A String->Symbol mapping of strings used by indirect
   /// references.
   StringMap<std::pair<MCSymbol*, unsigned> > StringPool;
@@ -198,8 +228,6 @@ class DwarfDebug {
   /// corresponds to the MDNode mapped with the subprogram DIE.
   DenseMap<DIE *, const MDNode *> ContainingTypeMap;

-  typedef SmallVector<DbgScope *, 2> ScopeVector;
-
   /// InlineInfo - Keep track of inlined functions and their location. This
   /// information is used to populate debug_inlined section.
   typedef std::pair<const MCSymbol *, DIE *> InlineInfoLabels;
@@ -236,6 +264,10 @@ class DwarfDebug {
   DebugLoc PrevInstLoc;
   MCSymbol *PrevLabel;

+  /// PrologEndLoc - This location indicates end of function prologue and
+  /// beginning of function body.
+  DebugLoc PrologEndLoc;
+
   struct FunctionDebugFrameInfo {
     unsigned Number;
     std::vector<MachineMove> Moves;
@@ -246,157 +278,23 @@ class DwarfDebug {

   std::vector<FunctionDebugFrameInfo> DebugFrames;

+  // DIEValueAllocator - All DIEValues are allocated through this allocator.
+  BumpPtrAllocator DIEValueAllocator;
+
   // Section Symbols: these are assembler temporary labels that are emitted at
   // the beginning of each supported dwarf section.  These are used to form
   // section offsets and are created by EmitSectionLabels.
-  MCSymbol *DwarfFrameSectionSym, *DwarfInfoSectionSym, *DwarfAbbrevSectionSym;
+  MCSymbol *DwarfInfoSectionSym, *DwarfAbbrevSectionSym;
   MCSymbol *DwarfStrSectionSym, *TextSectionSym, *DwarfDebugRangeSectionSym;
   MCSymbol *DwarfDebugLocSectionSym;
   MCSymbol *FunctionBeginSym, *FunctionEndSym;

-  DIEInteger *DIEIntegerOne;
-
 private:

   /// assignAbbrevNumber - Define a unique number for the abbreviation.
   ///
   void assignAbbrevNumber(DIEAbbrev &Abbrev);

-  /// createDIEEntry - Creates a new DIEEntry to be a proxy for a debug
-  /// information entry.
-  DIEEntry *createDIEEntry(DIE *Entry);
-
-  /// addUInt - Add an unsigned integer attribute data and value.
-  ///
-  void addUInt(DIE *Die, unsigned Attribute, unsigned Form, uint64_t Integer);
-
-  /// addSInt - Add an signed integer attribute data and value.
-  ///
-  void addSInt(DIE *Die, unsigned Attribute, unsigned Form, int64_t Integer);
-
-  /// addString - Add a string attribute data and value.
-  ///
-  void addString(DIE *Die, unsigned Attribute, unsigned Form,
-                 const StringRef Str);
-
-  /// addLabel - Add a Dwarf label attribute data and value.
-  ///
-  void addLabel(DIE *Die, unsigned Attribute, unsigned Form,
-                const MCSymbol *Label);
-
-  /// addDelta - Add a label delta attribute data and value.
-  ///
-  void addDelta(DIE *Die, unsigned Attribute, unsigned Form,
-                const MCSymbol *Hi, const MCSymbol *Lo);
-
-  /// addDIEEntry - Add a DIE attribute data and value.
-  ///
-  void addDIEEntry(DIE *Die, unsigned Attribute, unsigned Form, DIE *Entry);
-
-  /// addBlock - Add block data.
-  ///
-  void addBlock(DIE *Die, unsigned Attribute, unsigned Form, DIEBlock *Block);
-
-  /// addSourceLine - Add location information to specified debug information
-  /// entry.
-  void addSourceLine(DIE *Die, DIVariable V);
-  void addSourceLine(DIE *Die, DIGlobalVariable G);
-  void addSourceLine(DIE *Die, DISubprogram SP);
-  void addSourceLine(DIE *Die, DIType Ty);
-  void addSourceLine(DIE *Die, DINameSpace NS);
-
-  /// addAddress - Add an address attribute to a die based on the location
-  /// provided.
-  void addAddress(DIE *Die, unsigned Attribute,
-                  const MachineLocation &Location);
-
-  /// addRegisterAddress - Add register location entry in variable DIE.
-  bool addRegisterAddress(DIE *Die, const MachineOperand &MO);
-
-  /// addConstantValue - Add constant value entry in variable DIE.
-  bool addConstantValue(DIE *Die, const MachineOperand &MO);
-  bool addConstantValue(DIE *Die, ConstantInt *CI, bool Unsigned);
-
-  /// addConstantFPValue - Add constant value entry in variable DIE.
-  bool addConstantFPValue(DIE *Die, const MachineOperand &MO);
-
-  /// addTemplateParams - Add template parameters in buffer.
-  void addTemplateParams(DIE &Buffer, DIArray TParams);
-
-  /// addComplexAddress - Start with the address based on the location provided,
-  /// and generate the DWARF information necessary to find the actual variable
-  /// (navigating the extra location information encoded in the type) based on
-  /// the starting location.  Add the DWARF information to the die.
-  ///
-  void addComplexAddress(DbgVariable *&DV, DIE *Die, unsigned Attribute,
-                         const MachineLocation &Location);
-
-  // FIXME: Should be reformulated in terms of addComplexAddress.
-  /// addBlockByrefAddress - Start with the address based on the location
-  /// provided, and generate the DWARF information necessary to find the
-  /// actual Block variable (navigating the Block struct) based on the
-  /// starting location.  Add the DWARF information to the die.  Obsolete,
-  /// please use addComplexAddress instead.
-  ///
-  void addBlockByrefAddress(DbgVariable *&DV, DIE *Die, unsigned Attribute,
-                            const MachineLocation &Location);
-
-  /// addVariableAddress - Add DW_AT_location attribute for a DbgVariable based
-  /// on provided frame index.
-  void addVariableAddress(DbgVariable *&DV, DIE *Die, int64_t FI);
-
-  /// addToContextOwner - Add Die into the list of its context owner's children.
-  void addToContextOwner(DIE *Die, DIDescriptor Context);
-
-  /// addType - Add a new type attribute to the specified entity.
-  void addType(DIE *Entity, DIType Ty);
-
-
-  /// getOrCreateNameSpace - Create a DIE for DINameSpace.
-  DIE *getOrCreateNameSpace(DINameSpace NS);
-
-  /// getOrCreateTypeDIE - Find existing DIE or create new DIE for the
-  /// given DIType.
-  DIE *getOrCreateTypeDIE(DIType Ty);
-
-  /// getOrCreateTemplateTypeParameterDIE - Find existing DIE or create new DIE
-  /// for the given DITemplateTypeParameter.
-  DIE *getOrCreateTemplateTypeParameterDIE(DITemplateTypeParameter TP);
-
-  /// getOrCreateTemplateValueParameterDIE - Find existing DIE or create new DIE
-  /// for the given DITemplateValueParameter.
-  DIE *getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TVP);
-
-  void addPubTypes(DISubprogram SP);
-
-  /// constructTypeDIE - Construct basic type die from DIBasicType.
-  void constructTypeDIE(DIE &Buffer,
-                        DIBasicType BTy);
-
-  /// constructTypeDIE - Construct derived type die from DIDerivedType.
-  void constructTypeDIE(DIE &Buffer,
-                        DIDerivedType DTy);
-
-  /// constructTypeDIE - Construct type DIE from DICompositeType.
-  void constructTypeDIE(DIE &Buffer,
-                        DICompositeType CTy);
-
-  /// constructSubrangeDIE - Construct subrange DIE from DISubrange.
-  void constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy);
-
-  /// constructArrayTypeDIE - Construct array type DIE from DICompositeType.
-  void constructArrayTypeDIE(DIE &Buffer,
-                             DICompositeType *CTy);
-
-  /// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
-  DIE *constructEnumTypeDIE(DIEnumerator ETy);
-
-  /// createMemberDIE - Create new member DIE.
-  DIE *createMemberDIE(DIDerivedType DT);
-
-  /// createSubprogramDIE - Create new DIE using SP.
-  DIE *createSubprogramDIE(DISubprogram SP);
-
   /// getOrCreateDbgScope - Create DbgScope for the scope.
   DbgScope *getOrCreateDbgScope(const MDNode *Scope, const MDNode *InlinedAt);
@@ -455,14 +353,6 @@ private:
   ///
   void emitEndOfLineMatrix(unsigned SectionEnd);

-  /// emitCommonDebugFrame - Emit common frame info into a debug frame section.
-  ///
-  void emitCommonDebugFrame();
-
-  /// emitFunctionDebugFrame - Emit per function frame info into a debug frame
-  /// section.
-  void emitFunctionDebugFrame(const FunctionDebugFrameInfo &DebugFrameInfo);
-
   /// emitDebugPubNames - Emit visible names into a debug pubnames section.
   ///
   void emitDebugPubNames();
@@ -511,11 +401,6 @@ private:
   /// inlining instance.
   void emitDebugInlineInfo();

-  /// GetOrCreateSourceID - Look up the source id with the given directory and
-  /// source file names. If none currently exists, create a new id and insert it
-  /// in the SourceIds map.
-  unsigned GetOrCreateSourceID(StringRef DirName, StringRef FullName);
-
   /// constructCompileUnit - Create new CompileUnit for the given
   /// metadata node with tag DW_TAG_compile_unit.
   void constructCompileUnit(const MDNode *N);
@@ -532,7 +417,8 @@ private:
   /// recordSourceLine - Register a source line with debug info. Returns the
   /// unique label that was emitted and which provides correspondence to
   /// the source line list.
-  void recordSourceLine(unsigned Line, unsigned Col, const MDNode *Scope);
+  void recordSourceLine(unsigned Line, unsigned Col, const MDNode *Scope,
+                        unsigned Flags);

   /// recordVariableFrameIndex - Record a variable's index.
   void recordVariableFrameIndex(const DbgVariable *V, int Index);
@@ -611,6 +497,14 @@ public:

   /// endInstruction - Prcess end of an instruction.
   void endInstruction(const MachineInstr *MI);
+
+  /// GetOrCreateSourceID - Look up the source id with the given directory and
+  /// source file names. If none currently exists, create a new id and insert it
+  /// in the SourceIds map.
+  unsigned GetOrCreateSourceID(StringRef DirName, StringRef FullName);
+
+  /// createSubprogramDIE - Create new DIE using SP.
+  DIE *createSubprogramDIE(DISubprogram SP);
 };
 } // End of namespace llvm
diff --git a/lib/CodeGen/AsmPrinter/DwarfException.h b/lib/CodeGen/AsmPrinter/DwarfException.h
index 06b1de6..b5f86ab 100644
--- a/lib/CodeGen/AsmPrinter/DwarfException.h
+++ b/lib/CodeGen/AsmPrinter/DwarfException.h
@@ -15,6 +15,7 @@
 #define LLVM_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H

 #include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/AsmPrinter.h"
 #include <vector>

 namespace llvm {
@@ -140,17 +141,20 @@ public:
 };

 class DwarfCFIException : public DwarfException {
-  /// shouldEmitTable - Per-function flag to indicate if EH tables should
-  /// be emitted.
-  bool shouldEmitTable;
+  /// shouldEmitPersonality - Per-function flag to indicate if .cfi_personality
+  /// should be emitted.
+  bool shouldEmitPersonality;
+
+  /// shouldEmitLSDA - Per-function flag to indicate if .cfi_lsda
+  /// should be emitted.
+  bool shouldEmitLSDA;

   /// shouldEmitMoves - Per-function flag to indicate if frame moves info
   /// should be emitted.
   bool shouldEmitMoves;

-  /// shouldEmitTableModule - Per-module flag to indicate if EH tables
-  /// should be emitted.
-  bool shouldEmitTableModule;
+  AsmPrinter::CFIMoveType moveTypeModule;
+
 public:
   //===--------------------------------------------------------------------===//
   // Main entry points.
@@ -170,7 +174,7 @@ public:
   virtual void EndFunction();
 };

-class DwarfTableException : public DwarfException {
+class ARMException : public DwarfException {
   /// shouldEmitTable - Per-function flag to indicate if EH tables should
   /// be emitted.
   bool shouldEmitTable;
@@ -182,48 +186,12 @@ class DwarfTableException : public DwarfException {
   /// shouldEmitTableModule - Per-module flag to indicate if EH tables
   /// should be emitted.
   bool shouldEmitTableModule;
-
-  /// shouldEmitMovesModule - Per-module flag to indicate if frame moves
-  /// should be emitted.
-  bool shouldEmitMovesModule;
-
-  struct FunctionEHFrameInfo {
-    MCSymbol *FunctionEHSym;  // L_foo.eh
-    unsigned Number;
-    unsigned PersonalityIndex;
-    bool adjustsStack;
-    bool hasLandingPads;
-    std::vector<MachineMove> Moves;
-    const Function *function;
-
-    FunctionEHFrameInfo(MCSymbol *EHSym, unsigned Num, unsigned P,
-                        bool hC, bool hL,
-                        const std::vector<MachineMove> &M,
-                        const Function *f):
-      FunctionEHSym(EHSym), Number(Num), PersonalityIndex(P),
-      adjustsStack(hC), hasLandingPads(hL), Moves(M), function (f) { }
-  };
-
-  std::vector<FunctionEHFrameInfo> EHFrames;
-
-  /// UsesLSDA - Indicates whether an FDE that uses the CIE at the given index
-  /// uses an LSDA. If so, then we need to encode that information in the CIE's
-  /// augmentation.
-  DenseMap<unsigned, bool> UsesLSDA;
-
-  /// EmitCIE - Emit a Common Information Entry (CIE). This holds information
-  /// that is shared among many Frame Description Entries.  There is at least
-  /// one CIE in every non-empty .debug_frame section.
-  void EmitCIE(const Function *Personality, unsigned Index);
-
-  /// EmitFDE - Emit the Frame Description Entry (FDE) for the function.
-  void EmitFDE(const FunctionEHFrameInfo &EHFrameInfo);
 public:
   //===--------------------------------------------------------------------===//
   // Main entry points.
   //
-  DwarfTableException(AsmPrinter *A);
-  virtual ~DwarfTableException();
+  ARMException(AsmPrinter *A);
+  virtual ~ARMException();

   /// EndModule - Emit all exception information that should come after the
   /// content.
@@ -237,25 +205,25 @@ public:
   virtual void EndFunction();
 };

+class Win64Exception : public DwarfException {
+  /// shouldEmitPersonality - Per-function flag to indicate if personality
+  /// info should be emitted.
+  bool shouldEmitPersonality;

-class ARMException : public DwarfException {
-  /// shouldEmitTable - Per-function flag to indicate if EH tables should
-  /// be emitted.
-  bool shouldEmitTable;
+  /// shouldEmitLSDA - Per-function flag to indicate if the LSDA
+  /// should be emitted.
+  bool shouldEmitLSDA;

   /// shouldEmitMoves - Per-function flag to indicate if frame moves info
   /// should be emitted.
   bool shouldEmitMoves;

-  /// shouldEmitTableModule - Per-module flag to indicate if EH tables
-  /// should be emitted.
-  bool shouldEmitTableModule;
 public:
   //===--------------------------------------------------------------------===//
   // Main entry points.
   //
-  ARMException(AsmPrinter *A);
-  virtual ~ARMException();
+  Win64Exception(AsmPrinter *A);
+  virtual ~Win64Exception();

   /// EndModule - Emit all exception information that should come after the
   /// content.
diff --git a/lib/CodeGen/AsmPrinter/DwarfTableException.cpp b/lib/CodeGen/AsmPrinter/DwarfTableException.cpp
deleted file mode 100644
index 7519011..0000000
--- a/lib/CodeGen/AsmPrinter/DwarfTableException.cpp
+++ /dev/null
@@ -1,349 +0,0 @@
-//===-- CodeGen/AsmPrinter/DwarfTableException.cpp - Dwarf Exception Impl --==//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains support for writing DWARF exception info into asm files.
-// The implementation emits all the necessary tables "by hands".
-//
-//===----------------------------------------------------------------------===//
-
-#include "DwarfException.h"
-#include "llvm/Module.h"
-#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineLocation.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCSection.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/FormattedStream.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/Twine.h"
-using namespace llvm;
-
-DwarfTableException::DwarfTableException(AsmPrinter *A)
-  :  DwarfException(A),
-     shouldEmitTable(false), shouldEmitMoves(false),
-     shouldEmitTableModule(false), shouldEmitMovesModule(false) {}
-
-DwarfTableException::~DwarfTableException() {}
-
-/// EmitCIE - Emit a Common Information Entry (CIE). This holds information that
-/// is shared among many Frame Description Entries.  There is at least one CIE
-/// in every non-empty .debug_frame section.
-void DwarfTableException::EmitCIE(const Function *PersonalityFn, unsigned Index) {
-  // Size and sign of stack growth.
-  int stackGrowth = Asm->getTargetData().getPointerSize();
-  if (Asm->TM.getFrameLowering()->getStackGrowthDirection() ==
-      TargetFrameLowering::StackGrowsDown)
-    stackGrowth *= -1;
-
-  const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
-
-  // Begin eh frame section.
-  Asm->OutStreamer.SwitchSection(TLOF.getEHFrameSection());
-
-  MCSymbol *EHFrameSym;
-  if (TLOF.isFunctionEHFrameSymbolPrivate())
-    EHFrameSym = Asm->GetTempSymbol("EH_frame", Index);
-  else
-    EHFrameSym = Asm->OutContext.GetOrCreateSymbol(Twine("EH_frame") +
-                                                   Twine(Index));
-  Asm->OutStreamer.EmitLabel(EHFrameSym);
-
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("section_eh_frame", Index));
-
-  // Define base labels.
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_frame_common", Index));
-
-  // Define the eh frame length.
-  Asm->OutStreamer.AddComment("Length of Common Information Entry");
-  Asm->EmitLabelDifference(Asm->GetTempSymbol("eh_frame_common_end", Index),
-                           Asm->GetTempSymbol("eh_frame_common_begin", Index),
-                           4);
-
-  // EH frame header.
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_frame_common_begin",Index));
-  Asm->OutStreamer.AddComment("CIE Identifier Tag");
-  Asm->OutStreamer.EmitIntValue(0, 4/*size*/, 0/*addrspace*/);
-  Asm->OutStreamer.AddComment("DW_CIE_VERSION");
-  Asm->OutStreamer.EmitIntValue(dwarf::DW_CIE_VERSION, 1/*size*/, 0/*addr*/);
-
-  // The personality presence indicates that language specific information will
-  // show up in the eh frame.  Find out how we are supposed to lower the
-  // personality function reference:
-
-  unsigned LSDAEncoding = TLOF.getLSDAEncoding();
-  unsigned FDEEncoding = TLOF.getFDEEncoding();
-  unsigned PerEncoding = TLOF.getPersonalityEncoding();
-
-  char Augmentation[6] = { 0 };
-  unsigned AugmentationSize = 0;
-  char *APtr = Augmentation + 1;
-
-  if (PersonalityFn) {
-    // There is a personality function.
-    *APtr++ = 'P';
-    AugmentationSize += 1 + Asm->GetSizeOfEncodedValue(PerEncoding);
-  }
-
-  if (UsesLSDA[Index]) {
-    // An LSDA pointer is in the FDE augmentation.
-    *APtr++ = 'L';
-    ++AugmentationSize;
-  }
-
-  if (FDEEncoding != dwarf::DW_EH_PE_absptr) {
-    // A non-default pointer encoding for the FDE.
-    *APtr++ = 'R';
-    ++AugmentationSize;
-  }
-
-  if (APtr != Augmentation + 1)
-    Augmentation[0] = 'z';
-
-  Asm->OutStreamer.AddComment("CIE Augmentation");
-  Asm->OutStreamer.EmitBytes(StringRef(Augmentation, strlen(Augmentation)+1),0);
-
-  // Round out reader.
-  Asm->EmitULEB128(1, "CIE Code Alignment Factor");
-  Asm->EmitSLEB128(stackGrowth, "CIE Data Alignment Factor");
-  Asm->OutStreamer.AddComment("CIE Return Address Column");
-
-  const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
-  const TargetFrameLowering *TFI = Asm->TM.getFrameLowering();
-  Asm->EmitInt8(RI->getDwarfRegNum(RI->getRARegister(), true));
-
-  if (Augmentation[0]) {
-    Asm->EmitULEB128(AugmentationSize, "Augmentation Size");
-
-    // If there is a personality, we need to indicate the function's location.
-    if (PersonalityFn) {
-      Asm->EmitEncodingByte(PerEncoding, "Personality");
-      Asm->OutStreamer.AddComment("Personality");
-      Asm->EmitReference(PersonalityFn, PerEncoding);
-    }
-    if (UsesLSDA[Index])
-      Asm->EmitEncodingByte(LSDAEncoding, "LSDA");
-    if (FDEEncoding != dwarf::DW_EH_PE_absptr)
-      Asm->EmitEncodingByte(FDEEncoding, "FDE");
-  }
-
-  // Indicate locations of general callee saved registers in frame.
-  std::vector<MachineMove> Moves;
-  TFI->getInitialFrameState(Moves);
-  Asm->EmitFrameMoves(Moves, 0, true);
-
-  // On Darwin the linker honors the alignment of eh_frame, which means it must
-  // be 8-byte on 64-bit targets to match what gcc does.  Otherwise you get
-  // holes which confuse readers of eh_frame.
-  Asm->EmitAlignment(Asm->getTargetData().getPointerSize() == 4 ? 2 : 3);
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_frame_common_end", Index));
-}
-
-/// EmitFDE - Emit the Frame Description Entry (FDE) for the function.
-void DwarfTableException::EmitFDE(const FunctionEHFrameInfo &EHFrameInfo) {
-  assert(!EHFrameInfo.function->hasAvailableExternallyLinkage() &&
-         "Should not emit 'available externally' functions at all");
-
-  const Function *TheFunc = EHFrameInfo.function;
-  const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
-
-  unsigned LSDAEncoding = TLOF.getLSDAEncoding();
-  unsigned FDEEncoding = TLOF.getFDEEncoding();
-
-  Asm->OutStreamer.SwitchSection(TLOF.getEHFrameSection());
-
-  // Externally visible entry into the functions eh frame info. If the
-  // corresponding function is static, this should not be externally visible.
-  if (!TheFunc->hasLocalLinkage() && TLOF.isFunctionEHSymbolGlobal())
-    Asm->OutStreamer.EmitSymbolAttribute(EHFrameInfo.FunctionEHSym,MCSA_Global);
-
-  // If corresponding function is weak definition, this should be too.
-  if (TheFunc->isWeakForLinker() && Asm->MAI->getWeakDefDirective())
-    Asm->OutStreamer.EmitSymbolAttribute(EHFrameInfo.FunctionEHSym,
-                                         MCSA_WeakDefinition);
-
-  // If corresponding function is hidden, this should be too.
-  if (TheFunc->hasHiddenVisibility())
-    if (MCSymbolAttr HiddenAttr = Asm->MAI->getHiddenVisibilityAttr())
-      Asm->OutStreamer.EmitSymbolAttribute(EHFrameInfo.FunctionEHSym,
-                                           HiddenAttr);
-
-  // If there are no calls then you can't unwind.  This may mean we can omit the
-  // EH Frame, but some environments do not handle weak absolute symbols. If
-  // UnwindTablesMandatory is set we cannot do this optimization; the unwind
-  // info is to be available for non-EH uses.
-  if (!EHFrameInfo.adjustsStack && !UnwindTablesMandatory &&
-      (!TheFunc->isWeakForLinker() ||
-       !Asm->MAI->getWeakDefDirective() ||
-       TLOF.getSupportsWeakOmittedEHFrame())) {
-    Asm->OutStreamer.EmitAssignment(EHFrameInfo.FunctionEHSym,
-                                    MCConstantExpr::Create(0, Asm->OutContext));
-    // This name has no connection to the function, so it might get
-    // dead-stripped when the function is not, erroneously.  Prohibit
-    // dead-stripping unconditionally.
-    if (Asm->MAI->hasNoDeadStrip())
-      Asm->OutStreamer.EmitSymbolAttribute(EHFrameInfo.FunctionEHSym,
-                                           MCSA_NoDeadStrip);
-  } else {
-    Asm->OutStreamer.EmitLabel(EHFrameInfo.FunctionEHSym);
-
-    // EH frame header.
-    Asm->OutStreamer.AddComment("Length of Frame Information Entry");
-    Asm->EmitLabelDifference(
-      Asm->GetTempSymbol("eh_frame_end", EHFrameInfo.Number),
-      Asm->GetTempSymbol("eh_frame_begin", EHFrameInfo.Number), 4);
-
-    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_frame_begin",
-                                                  EHFrameInfo.Number));
-
-    Asm->OutStreamer.AddComment("FDE CIE offset");
-    Asm->EmitLabelDifference(
-      Asm->GetTempSymbol("eh_frame_begin", EHFrameInfo.Number),
-      Asm->GetTempSymbol("eh_frame_common",
-                         EHFrameInfo.PersonalityIndex), 4);
-
-    MCSymbol *EHFuncBeginSym =
-      Asm->GetTempSymbol("eh_func_begin", EHFrameInfo.Number);
-
-    Asm->OutStreamer.AddComment("FDE initial location");
-    Asm->EmitReference(EHFuncBeginSym, FDEEncoding);
-
-    Asm->OutStreamer.AddComment("FDE address range");
-    Asm->EmitLabelDifference(Asm->GetTempSymbol("eh_func_end",
-                                                EHFrameInfo.Number),
-                             EHFuncBeginSym,
-                             Asm->GetSizeOfEncodedValue(FDEEncoding));
-
-    // If there is a personality and landing pads then point to the language
-    // specific data area in the exception table.
-    if (MMI->getPersonalities()[0] != NULL) {
-      unsigned Size = Asm->GetSizeOfEncodedValue(LSDAEncoding);
-
-      Asm->EmitULEB128(Size, "Augmentation size");
-      Asm->OutStreamer.AddComment("Language Specific Data Area");
-      if (EHFrameInfo.hasLandingPads)
-        Asm->EmitReference(Asm->GetTempSymbol("exception", EHFrameInfo.Number),
-                           LSDAEncoding);
-      else
-        Asm->OutStreamer.EmitIntValue(0, Size/*size*/, 0/*addrspace*/);
-
-    } else {
-      Asm->EmitULEB128(0, "Augmentation size");
-    }
-
-    // Indicate locations of function specific callee saved registers in frame.
-    Asm->EmitFrameMoves(EHFrameInfo.Moves, EHFuncBeginSym, true);
-
-    // On Darwin the linker honors the alignment of eh_frame, which means it
-    // must be 8-byte on 64-bit targets to match what gcc does.  Otherwise you
-    // get holes which confuse readers of eh_frame.
-    Asm->EmitAlignment(Asm->getTargetData().getPointerSize() == 4 ? 2 : 3);
-    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_frame_end",
-                                                  EHFrameInfo.Number));
-
-    // If the function is marked used, this table should be also.  We cannot
-    // make the mark unconditional in this case, since retaining the table also
-    // retains the function in this case, and there is code around that depends
-    // on unused functions (calling undefined externals) being dead-stripped to
-    // link correctly.  Yes, there really is.
-    if (MMI->isUsedFunction(EHFrameInfo.function))
-      if (Asm->MAI->hasNoDeadStrip())
-        Asm->OutStreamer.EmitSymbolAttribute(EHFrameInfo.FunctionEHSym,
-                                             MCSA_NoDeadStrip);
-  }
-  Asm->OutStreamer.AddBlankLine();
-}
-
-/// EndModule - Emit all exception information that should come after the
-/// content.
-void DwarfTableException::EndModule() {
-  if (!Asm->MAI->isExceptionHandlingDwarf())
-    return;
-
-  if (!shouldEmitMovesModule && !shouldEmitTableModule)
-    return;
-
-  const std::vector<const Function*> &Personalities = MMI->getPersonalities();
-
-  for (unsigned I = 0, E = Personalities.size(); I < E; ++I)
-    EmitCIE(Personalities[I], I);
-
-  for (std::vector<FunctionEHFrameInfo>::iterator
-         I = EHFrames.begin(), E = EHFrames.end(); I != E; ++I)
-    EmitFDE(*I);
-}
-
-/// BeginFunction - Gather pre-function exception information. Assumes it's
-/// being emitted immediately after the function entry point.
-void DwarfTableException::BeginFunction(const MachineFunction *MF) {
-  shouldEmitTable = shouldEmitMoves = false;
-
-  // If any landing pads survive, we need an EH table.
-  shouldEmitTable = !MMI->getLandingPads().empty();
-
-  // See if we need frame move info.
-  shouldEmitMoves =
-    !Asm->MF->getFunction()->doesNotThrow() || UnwindTablesMandatory;
-
-  if (shouldEmitMoves || shouldEmitTable)
-    // Assumes in correct section after the entry point.
-    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin",
-                                                  Asm->getFunctionNumber()));
-
-  shouldEmitTableModule |= shouldEmitTable;
-  shouldEmitMovesModule |= shouldEmitMoves;
-}
-
-/// EndFunction - Gather and emit post-function exception information.
-///
-void DwarfTableException::EndFunction() {
-  if (!shouldEmitMoves && !shouldEmitTable) return;
-
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_end",
-                                                Asm->getFunctionNumber()));
-
-  // Record if this personality index uses a landing pad.
-  bool HasLandingPad = !MMI->getLandingPads().empty();
-  UsesLSDA[MMI->getPersonalityIndex()] |= HasLandingPad;
-
-  // Map all labels and get rid of any dead landing pads.
-  MMI->TidyLandingPads();
-
-  if (HasLandingPad)
-    EmitExceptionTable();
-
-  const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
-  MCSymbol *FunctionEHSym =
-    Asm->GetSymbolWithGlobalValueBase(Asm->MF->getFunction(), ".eh",
-                                      TLOF.isFunctionEHFrameSymbolPrivate());
-
-  // Save EH frame information
-  EHFrames.
-    push_back(FunctionEHFrameInfo(FunctionEHSym,
-                                  Asm->getFunctionNumber(),
-                                  MMI->getPersonalityIndex(),
-                                  Asm->MF->getFrameInfo()->adjustsStack(),
-                                  !MMI->getLandingPads().empty(),
-                                  MMI->getFrameMoves(),
-                                  Asm->MF->getFunction()));
-}
diff --git a/lib/CodeGen/AsmPrinter/Win64Exception.cpp b/lib/CodeGen/AsmPrinter/Win64Exception.cpp
new file mode 100644
index 0000000..c2ad5eb
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/Win64Exception.cpp
@@ -0,0 +1,116 @@
+//===-- CodeGen/AsmPrinter/Win64Exception.cpp - Dwarf Exception Impl ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing Win64 exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfException.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+using namespace llvm;
+
+Win64Exception::Win64Exception(AsmPrinter *A)
+  : DwarfException(A),
+    shouldEmitPersonality(false), shouldEmitLSDA(false), shouldEmitMoves(false)
+    {}
+
+Win64Exception::~Win64Exception() {}
+
+/// EndModule - Emit all exception information that should come after the
+/// content.
+void Win64Exception::EndModule() { +} + +/// BeginFunction - Gather pre-function exception information. Assumes it's +/// being emitted immediately after the function entry point. +void Win64Exception::BeginFunction(const MachineFunction *MF) { + shouldEmitMoves = shouldEmitPersonality = shouldEmitLSDA = false; + + // If any landing pads survive, we need an EH table. + bool hasLandingPads = !MMI->getLandingPads().empty(); + + shouldEmitMoves = Asm->needsSEHMoves(); + + const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); + unsigned PerEncoding = TLOF.getPersonalityEncoding(); + const Function *Per = MMI->getPersonalities()[MMI->getPersonalityIndex()]; + + shouldEmitPersonality = hasLandingPads && + PerEncoding != dwarf::DW_EH_PE_omit && Per; + + unsigned LSDAEncoding = TLOF.getLSDAEncoding(); + shouldEmitLSDA = shouldEmitPersonality && + LSDAEncoding != dwarf::DW_EH_PE_omit; + + if (!shouldEmitPersonality && !shouldEmitMoves) + return; + + Asm->OutStreamer.EmitWin64EHStartProc(Asm->CurrentFnSym); + + if (!shouldEmitPersonality) + return; + + MCSymbol *GCCHandlerSym = + Asm->GetExternalSymbolSymbol("_GCC_specific_handler"); + Asm->OutStreamer.EmitWin64EHHandler(GCCHandlerSym, true, true); + + Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin", + Asm->getFunctionNumber())); +} + +/// EndFunction - Gather and emit post-function exception information. +/// +void Win64Exception::EndFunction() { + if (!shouldEmitPersonality && !shouldEmitMoves) + return; + + Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_end", + Asm->getFunctionNumber())); + + // Map all labels and get rid of any dead landing pads. + MMI->TidyLandingPads(); + + if (shouldEmitPersonality) { + const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); + const Function *Per = MMI->getPersonalities()[MMI->getPersonalityIndex()]; + const MCSymbol *Sym = TLOF.getCFIPersonalitySymbol(Per, Asm->Mang, MMI); + + Asm->OutStreamer.PushSection(); + Asm->OutStreamer.EmitWin64EHHandlerData(); + Asm->OutStreamer.EmitValue(MCSymbolRefExpr::Create(Sym, Asm->OutContext), + 4); + EmitExceptionTable(); + Asm->OutStreamer.PopSection(); + } + Asm->OutStreamer.EmitWin64EHEndProc(); +} diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index 78a8743..d95f77e 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -41,6 +41,7 @@ using namespace llvm; STATISTIC(NumDeadBlocks, "Number of dead blocks removed"); STATISTIC(NumBranchOpts, "Number of branches optimized"); STATISTIC(NumTailMerge , "Number of block tails merged"); +STATISTIC(NumHoist , "Number of times common instructions are hoisted"); static cl::opt<cl::boolOrDefault> FlagEnableTailMerge("enable-tail-merge", cl::init(cl::BOU_UNSET), cl::Hidden); @@ -65,7 +66,7 @@ namespace { public: static char ID; explicit BranchFolderPass(bool defaultEnableTailMerge) - : MachineFunctionPass(ID), BranchFolder(defaultEnableTailMerge) {} + : MachineFunctionPass(ID), BranchFolder(defaultEnableTailMerge, true) {} virtual bool runOnMachineFunction(MachineFunction &MF); virtual const char *getPassName() const { return "Control Flow Optimizer"; } @@ -86,12 +87,14 @@ bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) { } -BranchFolder::BranchFolder(bool defaultEnableTailMerge) { +BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist) { switch (FlagEnableTailMerge) { case cl::BOU_UNSET: EnableTailMerge = defaultEnableTailMerge; break; case cl::BOU_TRUE: EnableTailMerge = true; break; case 
cl::BOU_FALSE: EnableTailMerge = false; break; } + + EnableHoistCommonCode = CommonHoist; } /// RemoveDeadBlock - Remove the specified dead machine basic block from the @@ -105,6 +108,9 @@ void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) { while (!MBB->succ_empty()) MBB->removeSuccessor(MBB->succ_end()-1); + // Avoid matching if this pointer gets reused. + TriedMerging.erase(MBB); + // Remove the block. MF->erase(MBB); } @@ -168,6 +174,8 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF, MachineModuleInfo *mmi) { if (!tii) return false; + TriedMerging.clear(); + TII = tii; TRI = tri; MMI = mmi; @@ -186,9 +194,10 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF, bool MadeChangeThisIteration = true; while (MadeChangeThisIteration) { - MadeChangeThisIteration = false; - MadeChangeThisIteration |= TailMergeBlocks(MF); - MadeChangeThisIteration |= OptimizeBranches(MF); + MadeChangeThisIteration = TailMergeBlocks(MF); + MadeChangeThisIteration |= OptimizeBranches(MF); + if (EnableHoistCommonCode) + MadeChangeThisIteration |= HoistCommonCode(MF); MadeChange |= MadeChangeThisIteration; } @@ -795,14 +804,21 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { // First find blocks with no successors. MergePotentials.clear(); - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); + I != E && MergePotentials.size() < TailMergeThreshold; ++I) { + if (TriedMerging.count(I)) + continue; if (I->succ_empty()) MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(I), I)); } + // If this is a large problem, avoid visiting the same basic blocks + // multiple times. + if (MergePotentials.size() == TailMergeThreshold) + for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i) + TriedMerging.insert(MergePotentials[i].getBlock()); // See if we can do any tail merging on those. - if (MergePotentials.size() < TailMergeThreshold && - MergePotentials.size() >= 2) + if (MergePotentials.size() >= 2) MadeChange |= TryTailMergeBlocks(NULL, NULL); // Look at blocks (IBB) with multiple predecessors (PBB). @@ -826,15 +842,17 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end(); I != E; ++I) { - if (I->pred_size() >= 2 && I->pred_size() < TailMergeThreshold) { + if (I->pred_size() >= 2) { SmallPtrSet<MachineBasicBlock *, 8> UniquePreds; MachineBasicBlock *IBB = I; MachineBasicBlock *PredBB = prior(I); MergePotentials.clear(); for (MachineBasicBlock::pred_iterator P = I->pred_begin(), E2 = I->pred_end(); - P != E2; ++P) { + P != E2 && MergePotentials.size() < TailMergeThreshold; ++P) { MachineBasicBlock *PBB = *P; + if (TriedMerging.count(PBB)) + continue; // Skip blocks that loop to themselves, can't tail merge these. if (PBB == IBB) continue; @@ -887,6 +905,11 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(PBB), *P)); } } + // If this is a large problem, avoid visiting the same basic blocks + // multiple times. + if (MergePotentials.size() == TailMergeThreshold) + for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i) + TriedMerging.insert(MergePotentials[i].getBlock()); if (MergePotentials.size() >= 2) MadeChange |= TryTailMergeBlocks(IBB, PredBB); // Reinsert an unconditional branch if needed. 
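The TriedMerging set added above bounds tail merging on very large functions: candidate blocks are capped at TailMergeThreshold, and a batch is only blacklisted once a full-size batch has actually been tried, since a smaller batch may still change as other transforms rewrite the function. A minimal standalone sketch of that pattern in plain C++ (not the LLVM API; all names here are illustrative):

#include <unordered_set>
#include <vector>

struct Block { int id; };

// Gather up to Threshold candidates, skipping blocks already tried in an
// earlier full batch; mark a batch as tried only when it hit the cap.
static void collectCandidates(const std::vector<Block*> &AllBlocks,
                              std::unordered_set<const Block*> &Tried,
                              std::vector<Block*> &Candidates,
                              unsigned Threshold) {
  for (Block *B : AllBlocks) {
    if (Candidates.size() >= Threshold)
      break;                      // bound the quadratic merging step
    if (Tried.count(B))
      continue;                   // seen in a previous full batch
    Candidates.push_back(B);
  }
  if (Candidates.size() == Threshold)
    Tried.insert(Candidates.begin(), Candidates.end());
}

Note that RemoveDeadBlock above also erases deleted blocks from the set, so a recycled MachineBasicBlock pointer cannot masquerade as an already-tried block.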
@@ -910,7 +933,8 @@ bool BranchFolder::OptimizeBranches(MachineFunction &MF) { // Make sure blocks are numbered in order MF.RenumberBlocks(); - for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ) { + for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end(); + I != E; ) { MachineBasicBlock *MBB = I++; MadeChange |= OptimizeBlock(MBB); @@ -1048,9 +1072,25 @@ ReoptimizeBlock: // AnalyzeBranch. if (PriorCond.empty() && !PriorTBB && MBB->pred_size() == 1 && PrevBB.succ_size() == 1 && - !MBB->hasAddressTaken()) { + !MBB->hasAddressTaken() && !MBB->isLandingPad()) { DEBUG(dbgs() << "\nMerging into block: " << PrevBB << "From MBB: " << *MBB); + // Remove redundant DBG_VALUEs first. + if (PrevBB.begin() != PrevBB.end()) { + MachineBasicBlock::iterator PrevBBIter = PrevBB.end(); + --PrevBBIter; + MachineBasicBlock::iterator MBBIter = MBB->begin(); + // Check if DBG_VALUE at the end of PrevBB is identical to the + // DBG_VALUE at the beginning of MBB. + while (PrevBBIter != PrevBB.begin() && MBBIter != MBB->end() + && PrevBBIter->isDebugValue() && MBBIter->isDebugValue()) { + if (!MBBIter->isIdenticalTo(PrevBBIter)) + break; + MachineInstr *DuplicateDbg = MBBIter; + ++MBBIter; --PrevBBIter; + DuplicateDbg->eraseFromParent(); + } + } PrevBB.splice(PrevBB.end(), MBB, MBB->begin(), MBB->end()); PrevBB.removeSuccessor(PrevBB.succ_begin()); assert(PrevBB.succ_empty()); @@ -1339,3 +1379,282 @@ ReoptimizeBlock: return MadeChange; } + +//===----------------------------------------------------------------------===// +// Hoist Common Code +//===----------------------------------------------------------------------===// + +/// HoistCommonCode - Hoist common instruction sequences at the start of basic +/// blocks to their common predecessor. +bool BranchFolder::HoistCommonCode(MachineFunction &MF) { + bool MadeChange = false; + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ) { + MachineBasicBlock *MBB = I++; + MadeChange |= HoistCommonCodeInSuccs(MBB); + } + + return MadeChange; + } + +/// findFalseBlock - BB has a fallthrough. Find its 'false' successor given +/// its 'true' successor. +static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB, + MachineBasicBlock *TrueBB) { + for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), + E = BB->succ_end(); SI != E; ++SI) { + MachineBasicBlock *SuccBB = *SI; + if (SuccBB != TrueBB) + return SuccBB; + } + return NULL; +} + +/// findHoistingInsertPosAndDeps - Find the location to move common instructions +/// in successors to. The location is usually just before the terminator; +/// however, if the terminator is a conditional branch and its previous +/// instruction is the flag setting instruction, the previous instruction is +/// the preferred location. This function also gathers uses and defs of the +/// instructions from the insertion point to the end of the block. The data is +/// used by HoistCommonCodeInSuccs to ensure safety.
+static MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, + const TargetInstrInfo *TII, + const TargetRegisterInfo *TRI, + SmallSet<unsigned,4> &Uses, + SmallSet<unsigned,4> &Defs) { + MachineBasicBlock::iterator Loc = MBB->getFirstTerminator(); + if (!TII->isUnpredicatedTerminator(Loc)) + return MBB->end(); + + for (unsigned i = 0, e = Loc->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = Loc->getOperand(i); + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + if (MO.isUse()) { + Uses.insert(Reg); + for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS) + Uses.insert(*AS); + } else if (!MO.isDead()) + // Don't try to hoist code in the rare case the terminator defines a + // register that is later used. + return MBB->end(); + } + + if (Uses.empty()) + return Loc; + if (Loc == MBB->begin()) + return MBB->end(); + + // The terminator is probably a conditional branch, try not to separate the + // branch from the condition setting instruction. + MachineBasicBlock::iterator PI = Loc; + --PI; + while (PI != MBB->begin() && PI->isDebugValue()) + --PI; + + bool IsDef = false; + for (unsigned i = 0, e = PI->getNumOperands(); !IsDef && i != e; ++i) { + const MachineOperand &MO = PI->getOperand(i); + if (!MO.isReg() || MO.isUse()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + if (Uses.count(Reg)) + IsDef = true; + } + if (!IsDef) + // The condition setting instruction is not just before the conditional + // branch. + return Loc; + + // Be conservative, don't insert an instruction above something that may have + // side-effects. And since it's potentially bad to separate the flag setting + // instruction from the conditional branch, just abort the optimization + // completely. + // Also avoid moving code above a predicated instruction since it's hard to + // reason about register liveness with predicated instructions. + bool DontMoveAcrossStore = true; + if (!PI->isSafeToMove(TII, 0, DontMoveAcrossStore) || + TII->isPredicated(PI)) + return MBB->end(); + + // Find out what registers are live. Note this routine is ignoring other live + // registers which are only used by instructions in successor blocks. + for (unsigned i = 0, e = PI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = PI->getOperand(i); + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + if (MO.isUse()) { + Uses.insert(Reg); + for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS) + Uses.insert(*AS); + } else { + if (Uses.count(Reg)) { + Uses.erase(Reg); + for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR) + Uses.erase(*SR); // Use getSubRegisters to be conservative + } + Defs.insert(Reg); + for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS) + Defs.insert(*AS); + } + } + + return PI; +} + +/// HoistCommonCodeInSuccs - If the successors of MBB have a common instruction +/// sequence at their start, move the instructions before MBB's terminator +/// if it's legal. +bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { + MachineBasicBlock *TBB = 0, *FBB = 0; + SmallVector<MachineOperand, 4> Cond; + if (TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, true) || !TBB || Cond.empty()) + return false; + + if (!FBB) FBB = findFalseBlock(MBB, TBB); + if (!FBB) + // Malformed bcc? True and false blocks are the same? + return false; + + // Restrict the optimization to cases where MBB is the only predecessor; + // it is an obvious win.
+ if (TBB->pred_size() > 1 || FBB->pred_size() > 1) + return false; + + // Find a suitable position to hoist the common instructions to. Also figure + // out which registers are used or defined by instructions from the insertion + // point to the end of the block. + SmallSet<unsigned, 4> Uses, Defs; + MachineBasicBlock::iterator Loc = + findHoistingInsertPosAndDeps(MBB, TII, TRI, Uses, Defs); + if (Loc == MBB->end()) + return false; + + bool HasDups = false; + SmallVector<unsigned, 4> LocalDefs; + SmallSet<unsigned, 4> LocalDefsSet; + MachineBasicBlock::iterator TIB = TBB->begin(); + MachineBasicBlock::iterator FIB = FBB->begin(); + MachineBasicBlock::iterator TIE = TBB->end(); + MachineBasicBlock::iterator FIE = FBB->end(); + while (TIB != TIE && FIB != FIE) { + // Skip dbg_value instructions. These do not count. + if (TIB->isDebugValue()) { + while (TIB != TIE && TIB->isDebugValue()) + ++TIB; + if (TIB == TIE) + break; + } + if (FIB->isDebugValue()) { + while (FIB != FIE && FIB->isDebugValue()) + ++FIB; + if (FIB == FIE) + break; + } + if (!TIB->isIdenticalTo(FIB, MachineInstr::CheckKillDead)) + break; + + if (TII->isPredicated(TIB)) + // Hard to reason about register liveness with predicated instructions. + break; + + bool IsSafe = true; + for (unsigned i = 0, e = TIB->getNumOperands(); i != e; ++i) { + MachineOperand &MO = TIB->getOperand(i); + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + if (MO.isDef()) { + if (Uses.count(Reg)) { + // Avoid clobbering a register that's used by the instruction at + // the point of insertion. + IsSafe = false; + break; + } + + if (Defs.count(Reg) && !MO.isDead()) { + // Don't hoist the instruction if the def would be clobbered by the + // instruction at the point of insertion. FIXME: This is overly + // conservative. It should be possible to hoist the instructions + // in BB2 in the following example: + // BB1: + // r1, eflag = op1 r2, r3 + // brcc eflag + // + // BB2: + // r1 = op2, ... + // = op3, r1<kill> + IsSafe = false; + break; + } + } else if (!LocalDefsSet.count(Reg)) { + if (Defs.count(Reg)) { + // Use is defined by the instruction at the point of insertion. + IsSafe = false; + break; + } + } + } + if (!IsSafe) + break; + + bool DontMoveAcrossStore = true; + if (!TIB->isSafeToMove(TII, 0, DontMoveAcrossStore)) + break; + + // Track local defs so we can update live-ins. + for (unsigned i = 0, e = TIB->getNumOperands(); i != e; ++i) { + MachineOperand &MO = TIB->getOperand(i); + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + if (MO.isDef()) { + if (!MO.isDead()) { + LocalDefs.push_back(Reg); + LocalDefsSet.insert(Reg); + for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR) + LocalDefsSet.insert(*SR); + } + } else if (MO.isKill() && LocalDefsSet.count(Reg)) { + LocalDefsSet.erase(Reg); + for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR) + LocalDefsSet.erase(*SR); + } + } + + HasDups = true; + ++TIB; + ++FIB; + } + + if (!HasDups) + return false; + + MBB->splice(Loc, TBB, TBB->begin(), TIB); + FBB->erase(FBB->begin(), FIB); + + // Update live-ins.
+ for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) { + unsigned Def = LocalDefs[i]; + if (LocalDefsSet.count(Def)) { + TBB->addLiveIn(Def); + FBB->addLiveIn(Def); + } + } + + ++NumHoist; + return true; +} diff --git a/lib/CodeGen/BranchFolding.h b/lib/CodeGen/BranchFolding.h index 15dfa7f..4ed42c0 100644 --- a/lib/CodeGen/BranchFolding.h +++ b/lib/CodeGen/BranchFolding.h @@ -10,6 +10,7 @@ #ifndef LLVM_CODEGEN_BRANCHFOLDING_HPP #define LLVM_CODEGEN_BRANCHFOLDING_HPP +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include <vector> @@ -19,11 +20,10 @@ namespace llvm { class RegScavenger; class TargetInstrInfo; class TargetRegisterInfo; - template<typename T> class SmallVectorImpl; class BranchFolder { public: - explicit BranchFolder(bool defaultEnableTailMerge); + explicit BranchFolder(bool defaultEnableTailMerge, bool CommonHoist); bool OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii, @@ -48,6 +48,7 @@ namespace llvm { }; typedef std::vector<MergePotentialsElt>::iterator MPIterator; std::vector<MergePotentialsElt> MergePotentials; + SmallPtrSet<const MachineBasicBlock*, 2> TriedMerging; class SameTailElt { MPIterator MPIter; @@ -85,6 +86,7 @@ namespace llvm { std::vector<SameTailElt> SameTails; bool EnableTailMerge; + bool EnableHoistCommonCode; const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; MachineModuleInfo *MMI; @@ -110,6 +112,9 @@ namespace llvm { bool OptimizeBlock(MachineBasicBlock *MBB); void RemoveDeadBlock(MachineBasicBlock *MBB); bool OptimizeImpDefsBlock(MachineBasicBlock *MBB); + + bool HoistCommonCode(MachineFunction &MF); + bool HoistCommonCodeInSuccs(MachineBasicBlock *MBB); }; } diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index 2ca3859..aef4ff2 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -33,6 +33,7 @@ add_llvm_library(LLVMCodeGen LocalStackSlotAllocation.cpp LowerSubregs.cpp MachineBasicBlock.cpp + MachineBranchProbabilityInfo.cpp MachineCSE.cpp MachineDominators.cpp MachineFunction.cpp @@ -67,6 +68,7 @@ add_llvm_library(LLVMCodeGen RegAllocGreedy.cpp RegAllocLinearScan.cpp RegAllocPBQP.cpp + RegisterClassInfo.cpp RegisterCoalescer.cpp RegisterScavenging.cpp RenderMachineFunction.cpp diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp index 86ab2b6..5d722ee 100644 --- a/lib/CodeGen/CalcSpillWeights.cpp +++ b/lib/CodeGen/CalcSpillWeights.cpp @@ -87,8 +87,8 @@ static unsigned copyHint(const MachineInstr *mi, unsigned reg, } void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) { - MachineRegisterInfo &mri = mf_.getRegInfo(); - const TargetRegisterInfo &tri = *mf_.getTarget().getRegisterInfo(); + MachineRegisterInfo &mri = MF.getRegInfo(); + const TargetRegisterInfo &tri = *MF.getTarget().getRegisterInfo(); MachineBasicBlock *mbb = 0; MachineLoop *loop = 0; unsigned loopDepth = 0; @@ -118,7 +118,7 @@ void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) { // Get loop info for mi. if (mi->getParent() != mbb) { mbb = mi->getParent(); - loop = loops_.getLoopFor(mbb); + loop = Loops.getLoopFor(mbb); loopDepth = loop ? loop->getLoopDepth() : 0; isExiting = loop ? loop->isLoopExiting(mbb) : false; } @@ -129,7 +129,7 @@ void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) { weight = LiveIntervals::getSpillWeight(writes, reads, loopDepth); // Give extra weight to what looks like a loop induction variable update. 
- if (writes && isExiting && lis_.isLiveOutOfMBB(li, mbb)) + if (writes && isExiting && LIS.isLiveOutOfMBB(li, mbb)) weight *= 3; totalWeight += weight; @@ -141,9 +141,9 @@ void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) { unsigned hint = copyHint(mi, li.reg, tri, mri); if (!hint) continue; - float hweight = hint_[hint] += weight; + float hweight = Hint[hint] += weight; if (TargetRegisterInfo::isPhysicalRegister(hint)) { - if (hweight > bestPhys && lis_.isAllocatable(hint)) + if (hweight > bestPhys && LIS.isAllocatable(hint)) bestPhys = hweight, hintPhys = hint; } else { if (hweight > bestVirt) @@ -151,7 +151,7 @@ void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) { } } - hint_.clear(); + Hint.clear(); // Always prefer the physreg hint. if (unsigned hint = hintPhys ? hintPhys : hintVirt) { @@ -165,7 +165,7 @@ void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) { return; // Mark li as unspillable if all live ranges are tiny. - if (li.isZeroLength()) { + if (li.isZeroLength(LIS.getSlotIndexes())) { li.markNotSpillable(); return; } @@ -176,7 +176,7 @@ void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) { // FIXME: this gets much more complicated once we support non-trivial // re-materialization. bool isLoad = false; - if (lis_.isReMaterializable(li, 0, isLoad)) { + if (LIS.isReMaterializable(li, 0, isLoad)) { if (isLoad) totalWeight *= 0.9F; else @@ -187,50 +187,29 @@ void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) { } void VirtRegAuxInfo::CalculateRegClass(unsigned reg) { - MachineRegisterInfo &mri = mf_.getRegInfo(); - const TargetRegisterInfo *tri = mf_.getTarget().getRegisterInfo(); - const TargetRegisterClass *orc = mri.getRegClass(reg); - SmallPtrSet<const TargetRegisterClass*,8> rcs; - - for (MachineRegisterInfo::reg_nodbg_iterator I = mri.reg_nodbg_begin(reg), - E = mri.reg_nodbg_end(); I != E; ++I) { - // The targets don't have accurate enough regclass descriptions that we can - // handle subregs. We need something similar to - // TRI::getMatchingSuperRegClass, but returning a super class instead of a - // sub class. - if (I.getOperand().getSubReg()) { - DEBUG(dbgs() << "Cannot handle subregs: " << I.getOperand() << '\n'); - return; - } - if (const TargetRegisterClass *rc = - I->getDesc().getRegClass(I.getOperandNo(), tri)) - rcs.insert(rc); - } + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo(); + const TargetRegisterClass *OldRC = MRI.getRegClass(reg); + const TargetRegisterClass *NewRC = TRI->getLargestLegalSuperClass(OldRC); - // If we found no regclass constraints, just leave reg as is. - // In theory, we could inflate to the largest superclass of reg's existing - // class, but that might not be legal for the current cpu setting. - // This could happen if reg is only used by COPY instructions, so we may need - // to improve on this. - if (rcs.empty()) { + // Stop early if there is no room to grow. + if (NewRC == OldRC) return; - } - // Compute the intersection of all classes in rcs. - // This ought to be independent of iteration order, but if the target register - // classes don't form a proper algebra, it is possible to get different - // results. The solution is to make sure the intersection of any two register - // classes is also a register class or the null set. - const TargetRegisterClass *rc = 0; - for (SmallPtrSet<const TargetRegisterClass*,8>::iterator I = rcs.begin(), - E = rcs.end(); I != E; ++I) { - rc = rc ? 
getCommonSubClass(rc, *I) : *I; - assert(rc && "Incompatible regclass constraints found"); + // Accumulate constraints from all uses. + for (MachineRegisterInfo::reg_nodbg_iterator I = MRI.reg_nodbg_begin(reg), + E = MRI.reg_nodbg_end(); I != E; ++I) { + // TRI doesn't have accurate enough information to model this yet. + if (I.getOperand().getSubReg()) + return; + const TargetRegisterClass *OpRC = + I->getDesc().getRegClass(I.getOperandNo(), TRI); + if (OpRC) + NewRC = getCommonSubClass(NewRC, OpRC); + if (!NewRC || NewRC == OldRC) + return; } - - if (rc == orc) - return; - DEBUG(dbgs() << "Inflating " << orc->getName() << ':' << PrintReg(reg) - << " to " << rc->getName() <<".\n"); - mri.setRegClass(reg, rc); + DEBUG(dbgs() << "Inflating " << OldRC->getName() << ':' << PrintReg(reg) + << " to " << NewRC->getName() <<".\n"); + MRI.setRegClass(reg, NewRC); } diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp index ecd69a0..14eb054 100644 --- a/lib/CodeGen/CallingConvLower.cpp +++ b/lib/CodeGen/CallingConvLower.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -22,19 +23,22 @@ #include "llvm/Target/TargetLowering.h" using namespace llvm; -CCState::CCState(CallingConv::ID CC, bool isVarArg, const TargetMachine &tm, - SmallVector<CCValAssign, 16> &locs, LLVMContext &C) - : CallingConv(CC), IsVarArg(isVarArg), TM(tm), - TRI(*TM.getRegisterInfo()), Locs(locs), Context(C) { +CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf, + const TargetMachine &tm, SmallVector<CCValAssign, 16> &locs, + LLVMContext &C) + : CallingConv(CC), IsVarArg(isVarArg), MF(mf), TM(tm), + TRI(*TM.getRegisterInfo()), Locs(locs), Context(C), + CallOrPrologue(Unknown) { // No stack is used. StackOffset = 0; - + + clearFirstByValReg(); UsedRegs.resize((TRI.getNumRegs()+31)/32); } -// HandleByVal - Allocate a stack slot large enough to pass an argument by -// value. The size and alignment information of the argument is encoded in its -// parameter attribute. +// HandleByVal - Allocate space on the stack large enough to pass an argument +// by value. The size and alignment information of the argument is encoded in +// its parameter attribute. void CCState::HandleByVal(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, int MinSize, int MinAlign, @@ -45,10 +49,11 @@ void CCState::HandleByVal(unsigned ValNo, MVT ValVT, Size = MinSize; if (MinAlign > (int)Align) Align = MinAlign; + if (MF.getFrameInfo()->getMaxAlignment() < Align) + MF.getFrameInfo()->setMaxAlignment(Align); + TM.getTargetLowering()->HandleByVal(this, Size); unsigned Offset = AllocateStack(Size, Align); - addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); - TM.getTargetLowering()->HandleByVal(const_cast<CCState*>(this)); } /// MarkAllocated - Mark a register and all of its aliases as allocated. diff --git a/lib/CodeGen/CodePlacementOpt.cpp b/lib/CodeGen/CodePlacementOpt.cpp index e37356a..270c337 100644 --- a/lib/CodeGen/CodePlacementOpt.cpp +++ b/lib/CodeGen/CodePlacementOpt.cpp @@ -254,7 +254,7 @@ bool CodePlacementOpt::MoveDiscontiguousLoopBlocks(MachineFunction &MF, // Determine a position to move orphaned loop blocks to. 
If TopMBB is not // entered via fallthrough and BotMBB is exited via fallthrough, prepend them - // to the top of the loop to avoid loosing that fallthrough. Otherwise append + // to the top of the loop to avoid losing that fallthrough. Otherwise append // them to the bottom, even if it previously had a fallthrough, on the theory // that it's worth an extra branch to keep the loop contiguous. MachineFunction::iterator InsertPt = diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp index f79598d..4cac453 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -27,12 +27,12 @@ using namespace llvm; CriticalAntiDepBreaker:: -CriticalAntiDepBreaker(MachineFunction& MFi) : +CriticalAntiDepBreaker(MachineFunction& MFi, const RegisterClassInfo &RCI) : AntiDepBreaker(), MF(MFi), MRI(MF.getRegInfo()), TII(MF.getTarget().getInstrInfo()), TRI(MF.getTarget().getRegisterInfo()), - AllocatableSet(TRI->getAllocatableSet(MF)), + RegClassInfo(RCI), Classes(TRI->getNumRegs(), static_cast<const TargetRegisterClass *>(0)), KillIndices(TRI->getNumRegs(), 0), DefIndices(TRI->getNumRegs(), 0) {} @@ -385,11 +385,9 @@ CriticalAntiDepBreaker::findSuitableFreeRegister(RegRefIter RegRefBegin, unsigned LastNewReg, const TargetRegisterClass *RC) { - for (TargetRegisterClass::iterator R = RC->allocation_order_begin(MF), - RE = RC->allocation_order_end(MF); R != RE; ++R) { - unsigned NewReg = *R; - // Don't consider non-allocatable registers - if (!AllocatableSet.test(NewReg)) continue; + ArrayRef<unsigned> Order = RegClassInfo.getOrder(RC); + for (unsigned i = 0; i != Order.size(); ++i) { + unsigned NewReg = Order[i]; // Don't replace a register with itself. if (NewReg == AntiDepReg) continue; // Don't replace a register with one that was recently used to repair @@ -421,7 +419,8 @@ unsigned CriticalAntiDepBreaker:: BreakAntiDependencies(const std::vector<SUnit>& SUnits, MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, - unsigned InsertPosIndex) { + unsigned InsertPosIndex, + DbgValueVector &DbgValues) { // The code below assumes that there is at least one instruction, // so just duck out immediately if the block is empty. if (SUnits.empty()) return 0; @@ -533,7 +532,7 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits, if (Edge->getKind() == SDep::Anti) { AntiDepReg = Edge->getReg(); assert(AntiDepReg != 0 && "Anti-dependence on reg0?"); - if (!AllocatableSet.test(AntiDepReg)) + if (!RegClassInfo.isAllocatable(AntiDepReg)) // Don't break anti-dependencies on non-allocatable registers. AntiDepReg = 0; else if (KeepRegs.count(AntiDepReg)) @@ -628,14 +627,10 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits, // as well. 
const SUnit *SU = MISUnitMap[Q->second->getParent()]; if (!SU) continue; - for (unsigned i = 0, e = SU->DbgInstrList.size() ; i < e ; ++i) { - MachineInstr *DI = SU->DbgInstrList[i]; - assert (DI->getNumOperands()==3 && DI->getOperand(0).isReg() && - DI->getOperand(0).getReg() - && "Non register dbg_value attached to SUnit!"); - if (DI->getOperand(0).getReg() == AntiDepReg) - DI->getOperand(0).setReg(NewReg); - } + for (DbgValueVector::iterator DVI = DbgValues.begin(), + DVE = DbgValues.end(); DVI != DVE; ++DVI) + if (DVI->second == Q->second->getParent()) + UpdateDbgValue(DVI->first, AntiDepReg, NewReg); } // We just went back in time and modified history; the diff --git a/lib/CodeGen/CriticalAntiDepBreaker.h b/lib/CodeGen/CriticalAntiDepBreaker.h index 0daaef2..0710780 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.h +++ b/lib/CodeGen/CriticalAntiDepBreaker.h @@ -17,6 +17,7 @@ #define LLVM_CODEGEN_CRITICALANTIDEPBREAKER_H #include "AntiDepBreaker.h" +#include "RegisterClassInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -27,6 +28,7 @@ #include <map> namespace llvm { +class RegisterClassInfo; class TargetInstrInfo; class TargetRegisterInfo; @@ -35,6 +37,7 @@ class TargetRegisterInfo; MachineRegisterInfo &MRI; const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; + const RegisterClassInfo &RegClassInfo; /// AllocatableSet - The set of allocatable registers. /// We'll be ignoring anti-dependencies on non-allocatable registers, @@ -66,7 +69,7 @@ class TargetRegisterInfo; SmallSet<unsigned, 4> KeepRegs; public: - CriticalAntiDepBreaker(MachineFunction& MFi); + CriticalAntiDepBreaker(MachineFunction& MFi, const RegisterClassInfo&); ~CriticalAntiDepBreaker(); /// Start - Initialize anti-dep breaking for a new basic block. @@ -79,7 +82,8 @@ class TargetRegisterInfo; unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits, MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, - unsigned InsertPosIndex); + unsigned InsertPosIndex, + DbgValueVector &DbgValues); /// Observe - Update liveness information to account for the current /// instruction, which will not be scheduled. diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp index 34b1a39..22c5465 100644 --- a/lib/CodeGen/DwarfEHPrepare.cpp +++ b/lib/CodeGen/DwarfEHPrepare.cpp @@ -30,6 +30,7 @@ using namespace llvm; STATISTIC(NumLandingPadsSplit, "Number of landing pads split"); STATISTIC(NumUnwindsLowered, "Number of unwind instructions lowered"); +STATISTIC(NumResumesLowered, "Number of eh.resume calls lowered"); STATISTIC(NumExceptionValuesMoved, "Number of eh.exception calls moved"); namespace { @@ -63,7 +64,7 @@ namespace { BBSet LandingPads; bool NormalizeLandingPads(); - bool LowerUnwinds(); + bool LowerUnwindsAndResumes(); bool MoveExceptionValueCalls(); Instruction *CreateExceptionValueCall(BasicBlock *BB); @@ -251,10 +252,7 @@ bool DwarfEHPrepare::HandleURoRInvokes() { if (!URoR) { URoR = F->getParent()->getFunction("_Unwind_Resume_or_Rethrow"); - if (!URoR) { - URoR = F->getParent()->getFunction("_Unwind_SjLj_Resume"); - if (!URoR) return CleanupSelectors(CatchAllSels); - } + if (!URoR) return CleanupSelectors(CatchAllSels); } SmallPtrSet<InvokeInst*, 32> URoRInvokes; @@ -480,20 +478,25 @@ bool DwarfEHPrepare::NormalizeLandingPads() { /// rethrowing any previously caught exception. 
This will crash horribly /// at runtime if there is no such exception: using unwind to throw a new /// exception is currently not supported. -bool DwarfEHPrepare::LowerUnwinds() { - SmallVector<TerminatorInst*, 16> UnwindInsts; - - for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { - TerminatorInst *TI = I->getTerminator(); - if (isa<UnwindInst>(TI)) - UnwindInsts.push_back(TI); +bool DwarfEHPrepare::LowerUnwindsAndResumes() { + SmallVector<Instruction*, 16> ResumeInsts; + + for (Function::iterator fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + for (BasicBlock::iterator bi = fi->begin(), be = fi->end(); bi != be; ++bi){ + if (isa<UnwindInst>(bi)) + ResumeInsts.push_back(bi); + else if (CallInst *call = dyn_cast<CallInst>(bi)) + if (Function *fn = dyn_cast<Function>(call->getCalledValue())) + if (fn->getName() == "llvm.eh.resume") + ResumeInsts.push_back(bi); + } } - if (UnwindInsts.empty()) return false; + if (ResumeInsts.empty()) return false; // Find the rewind function if we didn't already. if (!RewindFunction) { - LLVMContext &Ctx = UnwindInsts[0]->getContext(); + LLVMContext &Ctx = ResumeInsts[0]->getContext(); std::vector<const Type*> Params(1, Type::getInt8PtrTy(Ctx)); FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), @@ -504,24 +507,36 @@ bool DwarfEHPrepare::LowerUnwinds() { bool Changed = false; - for (SmallVectorImpl<TerminatorInst*>::iterator - I = UnwindInsts.begin(), E = UnwindInsts.end(); I != E; ++I) { - TerminatorInst *TI = *I; + for (SmallVectorImpl<Instruction*>::iterator + I = ResumeInsts.begin(), E = ResumeInsts.end(); I != E; ++I) { + Instruction *RI = *I; - // Replace the unwind instruction with a call to _Unwind_Resume (or the - // appropriate target equivalent) followed by an UnreachableInst. + // Replace the resuming instruction with a call to _Unwind_Resume (or the + // appropriate target equivalent). + + llvm::Value *ExnValue; + if (isa<UnwindInst>(RI)) + ExnValue = CreateExceptionValueCall(RI->getParent()); + else + ExnValue = cast<CallInst>(RI)->getArgOperand(0); // Create the call... - CallInst *CI = CallInst::Create(RewindFunction, - CreateExceptionValueCall(TI->getParent()), - "", TI); + CallInst *CI = CallInst::Create(RewindFunction, ExnValue, "", RI); CI->setCallingConv(TLI->getLibcallCallingConv(RTLIB::UNWIND_RESUME)); - // ...followed by an UnreachableInst. - new UnreachableInst(TI->getContext(), TI); - // Nuke the unwind instruction. - TI->eraseFromParent(); - ++NumUnwindsLowered; + // ...followed by an UnreachableInst, if it was an unwind. + // Calls to llvm.eh.resume are typically already followed by this. + if (isa<UnwindInst>(RI)) + new UnreachableInst(RI->getContext(), RI); + + if (isa<UnwindInst>(RI)) + ++NumUnwindsLowered; + else + ++NumResumesLowered; + + // Nuke the resume instruction. + RI->eraseFromParent(); + Changed = true; } @@ -657,8 +672,8 @@ bool DwarfEHPrepare::runOnFunction(Function &Fn) { // basic block where an invoke unwind edge ends). Changed |= NormalizeLandingPads(); - // Turn unwind instructions into libcalls. - Changed |= LowerUnwinds(); + // Turn unwind instructions and eh.resume calls into libcalls. + Changed |= LowerUnwindsAndResumes(); // TODO: Move eh.selector calls to landing pads and combine them. diff --git a/lib/CodeGen/ELF.h b/lib/CodeGen/ELF.h index e08feeb..5b63468 100644 --- a/lib/CodeGen/ELF.h +++ b/lib/CodeGen/ELF.h @@ -173,7 +173,7 @@ namespace llvm { unsigned Offset; // sh_offset - Offset from the file start unsigned Size; // sh_size - The section size. 
unsigned Link; // sh_link - Section header table index link. - unsigned Info; // sh_info - Auxillary information. + unsigned Info; // sh_info - Auxiliary information. unsigned Align; // sh_addralign - Alignment of section. unsigned EntSize; // sh_entsize - Size of entries in the section. diff --git a/lib/CodeGen/ELFWriter.cpp b/lib/CodeGen/ELFWriter.cpp index 25c2e02..fa2319b 100644 --- a/lib/CodeGen/ELFWriter.cpp +++ b/lib/CodeGen/ELFWriter.cpp @@ -77,7 +77,7 @@ ELFWriter::ELFWriter(raw_ostream &o, TargetMachine &tm) // Create the object code emitter object for this target. ElfCE = new ELFCodeEmitter(*this); - // Inital number of sections + // Initial number of sections NumSections = 0; } @@ -662,12 +662,16 @@ bool ELFWriter::EmitSpecialLLVMGlobal(const GlobalVariable *GV) { void ELFWriter::EmitXXStructorList(Constant *List, ELFSection &Xtor) { // Should be an array of '{ i32, void ()* }' structs. The first value is the // init priority, which we ignore. + if (List->isNullValue()) return; ConstantArray *InitList = cast<ConstantArray>(List); for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) { + if (InitList->getOperand(i)->isNullValue()) + continue; ConstantStruct *CS = cast<ConstantStruct>(InitList->getOperand(i)); if (CS->getOperand(1)->isNullValue()) - return; // Found a null terminator, exit printing. + continue; + // Emit the function pointer. EmitGlobalConstant(CS->getOperand(1), Xtor); } diff --git a/lib/CodeGen/EdgeBundles.cpp b/lib/CodeGen/EdgeBundles.cpp index aed8bc9..a7aba89 100644 --- a/lib/CodeGen/EdgeBundles.cpp +++ b/lib/CodeGen/EdgeBundles.cpp @@ -39,7 +39,7 @@ void EdgeBundles::getAnalysisUsage(AnalysisUsage &AU) const { bool EdgeBundles::runOnMachineFunction(MachineFunction &mf) { MF = &mf; EC.clear(); - EC.grow(2 * MF->size()); + EC.grow(2 * MF->getNumBlockIDs()); for (MachineFunction::const_iterator I = MF->begin(), E = MF->end(); I != E; ++I) { @@ -53,6 +53,19 @@ bool EdgeBundles::runOnMachineFunction(MachineFunction &mf) { EC.compress(); if (ViewEdgeBundles) view(); + + // Compute the reverse mapping. + Blocks.clear(); + Blocks.resize(getNumBundles()); + + for (unsigned i = 0, e = MF->getNumBlockIDs(); i != e; ++i) { + unsigned b0 = getBundle(i, 0); + unsigned b1 = getBundle(i, 1); + Blocks[b0].push_back(i); + if (b1 != b0) + Blocks[b1].push_back(i); + } + return false; } @@ -82,5 +95,3 @@ raw_ostream &llvm::WriteGraph(raw_ostream &O, const EdgeBundles &G, O << "}\n"; return O; } - - diff --git a/lib/CodeGen/ExpandISelPseudos.cpp b/lib/CodeGen/ExpandISelPseudos.cpp index b5ec303..ebc2fc9 100644 --- a/lib/CodeGen/ExpandISelPseudos.cpp +++ b/lib/CodeGen/ExpandISelPseudos.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// Expand Psuedo-instructions produced by ISel. These are usually to allow +// Expand Pseudo-instructions produced by ISel. These are usually to allow // the expansion to contain control flow, such as a conditional move // implemented with a conditional branch and a phi, or an atomic operation // implemented with a loop.
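The EdgeBundles change above precomputes the inverse of the block-to-bundle mapping, so clients can ask which blocks touch a bundle without rescanning the whole function. A standalone sketch of that inversion in plain C++ (illustrative names, not the LLVM classes; each block contributes its entry bundle and its exit bundle):

#include <utility>
#include <vector>

// Forward map: for each block number, the bundle ids at its two ends.
// Reverse map: for each bundle, the list of block numbers touching it.
static std::vector<std::vector<unsigned>>
invertBundleMap(const std::vector<std::pair<unsigned, unsigned>> &BlockBundles,
                unsigned NumBundles) {
  std::vector<std::vector<unsigned>> Blocks(NumBundles);
  for (unsigned i = 0, e = BlockBundles.size(); i != e; ++i) {
    unsigned b0 = BlockBundles[i].first;
    unsigned b1 = BlockBundles[i].second;
    Blocks[b0].push_back(i);
    if (b1 != b0)        // don't list a block twice in the same bundle
      Blocks[b1].push_back(i);
  }
  return Blocks;
}

The b1 != b0 guard mirrors the diff: a block whose entry and exit fall in the same bundle appears only once in that bundle's list.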
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index db53b04..8b2c981 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -27,7 +27,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" @@ -146,10 +145,6 @@ namespace { : BBI(b), Kind(k), NeedSubsumption(s), NumDups(d), NumDups2(d2) {} }; - /// Roots - Basic blocks that do not have successors. These are the starting - /// points of Graph traversal. - std::vector<MachineBasicBlock*> Roots; - /// BBAnalysis - Results of if-conversion feasibility analysis indexed by /// basic block number. std::vector<BBInfo> BBAnalysis; @@ -270,7 +265,7 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) { if (!TII) return false; // Tail merging tends to expose more if-conversion opportunities. - BranchFolder BF(true); + BranchFolder BF(true, false); bool BFChange = BF.OptimizeFunction(MF, TII, MF.getTarget().getRegisterInfo(), getAnalysisIfAvailable<MachineModuleInfo>()); @@ -287,11 +282,6 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) { MF.RenumberBlocks(); BBAnalysis.resize(MF.getNumBlockIDs()); - // Look for root nodes, i.e. blocks without successors. - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) - if (I->succ_empty()) - Roots.push_back(I); - std::vector<IfcvtToken*> Tokens; MadeChange = false; unsigned NumIfCvts = NumSimple + NumSimpleFalse + NumTriangle + @@ -406,11 +396,10 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) { } Tokens.clear(); - Roots.clear(); BBAnalysis.clear(); if (MadeChange && IfCvtBranchFold) { - BranchFolder BF(false); + BranchFolder BF(false, false); BF.OptimizeFunction(MF, TII, MF.getTarget().getRegisterInfo(), getAnalysisIfAvailable<MachineModuleInfo>()); @@ -924,13 +913,9 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB, /// candidates. void IfConverter::AnalyzeBlocks(MachineFunction &MF, std::vector<IfcvtToken*> &Tokens) { - std::set<MachineBasicBlock*> Visited; - for (unsigned i = 0, e = Roots.size(); i != e; ++i) { - for (idf_ext_iterator<MachineBasicBlock*> I=idf_ext_begin(Roots[i],Visited), - E = idf_ext_end(Roots[i], Visited); I != E; ++I) { - MachineBasicBlock *BB = *I; - AnalyzeBlock(BB, Tokens); - } + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { + MachineBasicBlock *BB = I; + AnalyzeBlock(BB, Tokens); + } // Sort to favor more complex ifcvt scheme.
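Several of these patches lean on LLVM's STATISTIC counters (NumHoist in the BranchFolding change above, and the batch of spiller counters in the InlineSpiller diff that follows). For readers unfamiliar with the mechanism, a minimal sketch; the counter name and DEBUG_TYPE here are illustrative, and the required placement of the DEBUG_TYPE define has shifted across LLVM versions:

#define DEBUG_TYPE "example"          // groups counters in -stats output
#include "llvm/ADT/Statistic.h"

STATISTIC(NumThingsDone, "Number of things done");

static void doThing() {
  // ... perform the transformation ...
  ++NumThingsDone;                    // reported when -stats is passed
}

Each counter prints once per compilation under -stats, keyed by its DEBUG_TYPE and description, and the counters are typically compiled out of builds without assertions, which is why a pass can add a dozen of them at essentially no cost.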
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp index 86f4cfc..19ae333 100644 --- a/lib/CodeGen/InlineSpiller.cpp +++ b/lib/CodeGen/InlineSpiller.cpp @@ -16,6 +16,7 @@ #include "Spiller.h" #include "LiveRangeEdit.h" #include "VirtRegMap.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/LiveStackAnalysis.h" @@ -31,6 +32,18 @@ using namespace llvm; +STATISTIC(NumSpilledRanges, "Number of spilled live ranges"); +STATISTIC(NumSnippets, "Number of snippets included in spills"); +STATISTIC(NumSpills, "Number of spills inserted"); +STATISTIC(NumReloads, "Number of reloads inserted"); +STATISTIC(NumFolded, "Number of folded stack accesses"); +STATISTIC(NumFoldedLoads, "Number of folded loads"); +STATISTIC(NumRemats, "Number of rematerialized defs for spilling"); +STATISTIC(NumOmitReloadSpill, "Number of omitted spills after reloads"); +STATISTIC(NumHoistLocal, "Number of locally hoisted spills"); +STATISTIC(NumHoistGlobal, "Number of globally hoisted spills"); +STATISTIC(NumRedundantSpills, "Number of redundant spills identified"); + namespace { class InlineSpiller : public Spiller { MachineFunctionPass &Pass; @@ -134,9 +147,10 @@ private: bool foldMemoryOperand(MachineBasicBlock::iterator MI, const SmallVectorImpl<unsigned> &Ops, MachineInstr *LoadMI = 0); - void insertReload(LiveInterval &NewLI, MachineBasicBlock::iterator MI); + void insertReload(LiveInterval &NewLI, SlotIndex, + MachineBasicBlock::iterator MI); void insertSpill(LiveInterval &NewLI, const LiveInterval &OldLI, - MachineBasicBlock::iterator MI); + SlotIndex, MachineBasicBlock::iterator MI); void spillAroundUses(unsigned Reg); void spillAll(); @@ -246,10 +260,11 @@ void InlineSpiller::collectRegsToSpill() { if (!isSnippet(SnipLI)) continue; SnippetCopies.insert(MI); - if (!isRegToSpill(SnipReg)) - RegsToSpill.push_back(SnipReg); - + if (isRegToSpill(SnipReg)) + continue; + RegsToSpill.push_back(SnipReg); DEBUG(dbgs() << "\talso spill snippet " << SnipLI << '\n'); + ++NumSnippets; } } @@ -419,8 +434,10 @@ void InlineSpiller::analyzeSiblingValues() { } if (!DefMI && !VNI->isPHIDef()) DefMI = LIS.getInstructionFromIndex(VNI->def); - if (DefMI) - Edit->checkRematerializable(VNI, DefMI, TII, AA); + if (DefMI && Edit->checkRematerializable(VNI, DefMI, TII, AA)) { + DEBUG(dbgs() << "Value " << PrintReg(Reg) << ':' << VNI->id << '@' + << VNI->def << " may remat from " << *DefMI); + } } } } @@ -431,7 +448,7 @@ bool InlineSpiller::hoistSpill(LiveInterval &SpillLI, MachineInstr *CopyMI) { SlotIndex Idx = LIS.getInstructionIndex(CopyMI); VNInfo *VNI = SpillLI.getVNInfoAt(Idx.getDefIndex()); assert(VNI && VNI->def == Idx.getDefIndex() && "Not defined by copy"); - SibValueMap::const_iterator I = SibValues.find(VNI); + SibValueMap::iterator I = SibValues.find(VNI); if (I == SibValues.end()) return false; @@ -441,6 +458,20 @@ bool InlineSpiller::hoistSpill(LiveInterval &SpillLI, MachineInstr *CopyMI) { if (!SVI.AllDefsAreReloads && SVI.SpillVNI == VNI) return false; + // SpillReg may have been deleted by remat and DCE. 
+ if (!LIS.hasInterval(SVI.SpillReg)) { + DEBUG(dbgs() << "Stale interval: " << PrintReg(SVI.SpillReg) << '\n'); + SibValues.erase(I); + return false; + } + + LiveInterval &SibLI = LIS.getInterval(SVI.SpillReg); + if (!SibLI.containsValue(SVI.SpillVNI)) { + DEBUG(dbgs() << "Stale value: " << PrintReg(SVI.SpillReg) << '\n'); + SibValues.erase(I); + return false; + } + // Conservatively extend the stack slot range to the range of the original // value. We may be able to do better with stack slot coloring by being more // careful here. @@ -452,19 +483,22 @@ bool InlineSpiller::hoistSpill(LiveInterval &SpillLI, MachineInstr *CopyMI) { << *StackInt << '\n'); // Already spilled everywhere. - if (SVI.AllDefsAreReloads) + if (SVI.AllDefsAreReloads) { + ++NumOmitReloadSpill; return true; - + } // We are going to spill SVI.SpillVNI immediately after its def, so clear out // any later spills of the same value. - eliminateRedundantSpills(LIS.getInterval(SVI.SpillReg), SVI.SpillVNI); + eliminateRedundantSpills(SibLI, SVI.SpillVNI); MachineBasicBlock *MBB = LIS.getMBBFromIndex(SVI.SpillVNI->def); MachineBasicBlock::iterator MII; if (SVI.SpillVNI->isPHIDef()) MII = MBB->SkipPHIsAndLabels(MBB->begin()); else { - MII = LIS.getInstructionFromIndex(SVI.SpillVNI->def); + MachineInstr *DefMI = LIS.getInstructionFromIndex(SVI.SpillVNI->def); + assert(DefMI && "Defining instruction disappeared"); + MII = DefMI; ++MII; } // Insert spill without kill flag immediately after def. @@ -474,6 +508,11 @@ bool InlineSpiller::hoistSpill(LiveInterval &SpillLI, MachineInstr *CopyMI) { LIS.InsertMachineInstrInMaps(MII); VRM.addSpillSlotUse(StackSlot, MII); DEBUG(dbgs() << "\thoisted: " << SVI.SpillVNI->def << '\t' << *MII); + + if (MBB == CopyMI->getParent()) + ++NumHoistLocal; + else + ++NumHoistGlobal; return true; } @@ -489,8 +528,8 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) { LiveInterval *LI; tie(LI, VNI) = WorkList.pop_back_val(); unsigned Reg = LI->reg; - DEBUG(dbgs() << "Checking redundant spills for " << PrintReg(Reg) << ':' - << VNI->id << '@' << VNI->def << '\n'); + DEBUG(dbgs() << "Checking redundant spills for " + << VNI->id << '@' << VNI->def << " in " << *LI << '\n'); // Regs to spill are taken care of. if (isRegToSpill(Reg)) @@ -528,6 +567,7 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) { // eliminateDeadDefs won't normally remove stores, so switch opcode. MI->setDesc(TII.get(TargetOpcode::KILL)); DeadDefs.push_back(MI); + ++NumRedundantSpills; } } } while (!WorkList.empty()); @@ -623,6 +663,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, if (RM.OrigMI->getDesc().canFoldAsLoad() && foldMemoryOperand(MI, Ops, RM.OrigMI)) { Edit->markRematerialized(RM.ParentVNI); + ++NumFoldedLoads; return true; } @@ -649,6 +690,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, VNInfo *DefVNI = NewLI.getNextValue(DefIdx, 0, LIS.getVNInfoAllocator()); NewLI.addRange(LiveRange(DefIdx, UseIdx.getDefIndex(), DefVNI)); DEBUG(dbgs() << "\tinterval: " << NewLI << '\n'); + ++NumRemats; return true; } @@ -775,14 +817,15 @@ bool InlineSpiller::foldMemoryOperand(MachineBasicBlock::iterator MI, VRM.addSpillSlotUse(StackSlot, FoldMI); MI->eraseFromParent(); DEBUG(dbgs() << "\tfolded: " << *FoldMI); + ++NumFolded; return true; } /// insertReload - Insert a reload of NewLI.reg before MI. 
void InlineSpiller::insertReload(LiveInterval &NewLI, + SlotIndex Idx, MachineBasicBlock::iterator MI) { MachineBasicBlock &MBB = *MI->getParent(); - SlotIndex Idx = LIS.getInstructionIndex(MI).getDefIndex(); TII.loadRegFromStackSlot(MBB, MI, NewLI.reg, StackSlot, MRI.getRegClass(NewLI.reg), &TRI); --MI; // Point to load instruction. @@ -792,19 +835,13 @@ void InlineSpiller::insertReload(LiveInterval &NewLI, VNInfo *LoadVNI = NewLI.getNextValue(LoadIdx, 0, LIS.getVNInfoAllocator()); NewLI.addRange(LiveRange(LoadIdx, Idx, LoadVNI)); + ++NumReloads; } /// insertSpill - Insert a spill of NewLI.reg after MI. void InlineSpiller::insertSpill(LiveInterval &NewLI, const LiveInterval &OldLI, - MachineBasicBlock::iterator MI) { + SlotIndex Idx, MachineBasicBlock::iterator MI) { MachineBasicBlock &MBB = *MI->getParent(); - - // Get the defined value. It could be an early clobber so keep the def index. - SlotIndex Idx = LIS.getInstructionIndex(MI).getDefIndex(); - VNInfo *VNI = OldLI.getVNInfoAt(Idx); - assert(VNI && VNI->def.getDefIndex() == Idx && "Inconsistent VNInfo"); - Idx = VNI->def; - TII.storeRegToStackSlot(MBB, ++MI, NewLI.reg, true, StackSlot, MRI.getRegClass(NewLI.reg), &TRI); --MI; // Point to store instruction. @@ -813,10 +850,12 @@ void InlineSpiller::insertSpill(LiveInterval &NewLI, const LiveInterval &OldLI, DEBUG(dbgs() << "\tspilled: " << StoreIdx << '\t' << *MI); VNInfo *StoreVNI = NewLI.getNextValue(Idx, 0, LIS.getVNInfoAllocator()); NewLI.addRange(LiveRange(Idx, StoreIdx, StoreVNI)); + ++NumSpills; } /// spillAroundUses - insert spill code around each use of Reg. void InlineSpiller::spillAroundUses(unsigned Reg) { + DEBUG(dbgs() << "spillAroundUses " << PrintReg(Reg) << '\n'); LiveInterval &OldLI = LIS.getInterval(Reg); // Iterate over instructions using Reg. @@ -854,9 +893,22 @@ void InlineSpiller::spillAroundUses(unsigned Reg) { SmallVector<unsigned, 8> Ops; tie(Reads, Writes) = MI->readsWritesVirtualRegister(Reg, &Ops); + // Find the slot index where this instruction reads and writes OldLI. + // This is usually the def slot, except for tied early clobbers. + SlotIndex Idx = LIS.getInstructionIndex(MI).getDefIndex(); + if (VNInfo *VNI = OldLI.getVNInfoAt(Idx.getUseIndex())) + if (SlotIndex::isSameInstr(Idx, VNI->def)) + Idx = VNI->def; + // Check for a sibling copy. unsigned SibReg = isFullCopyOf(MI, Reg); if (SibReg && isSibling(SibReg)) { + // This may actually be a copy between snippets. + if (isRegToSpill(SibReg)) { + DEBUG(dbgs() << "Found new snippet copy: " << *MI); + SnippetCopies.insert(MI); + continue; + } if (Writes) { // Hoist the spill of a sib-reg copy. if (hoistSpill(OldLI, MI)) { @@ -867,7 +919,6 @@ void InlineSpiller::spillAroundUses(unsigned Reg) { } } else { // This is a reload for a sib-reg copy. Drop spills downstream. - SlotIndex Idx = LIS.getInstructionIndex(MI).getDefIndex(); LiveInterval &SibLI = LIS.getInterval(SibReg); eliminateRedundantSpills(SibLI, SibLI.getVNInfoAt(Idx)); // The COPY will fold to a reload below. @@ -884,7 +935,7 @@ void InlineSpiller::spillAroundUses(unsigned Reg) { NewLI.markNotSpillable(); if (Reads) - insertReload(NewLI, MI); + insertReload(NewLI, Idx, MI); // Rewrite instruction operands. bool hasLiveDef = false; @@ -899,10 +950,11 @@ void InlineSpiller::spillAroundUses(unsigned Reg) { hasLiveDef = true; } } + DEBUG(dbgs() << "\trewrite: " << Idx << '\t' << *MI); // FIXME: Use a second vreg if instruction has no tied ops. 
if (Writes && hasLiveDef) - insertSpill(NewLI, OldLI, MI); + insertSpill(NewLI, OldLI, Idx, MI); DEBUG(dbgs() << "\tinterval: " << NewLI << '\n'); } @@ -938,13 +990,15 @@ void InlineSpiller::spillAll() { } // Finally delete the SnippetCopies. - for (MachineRegisterInfo::reg_iterator RI = MRI.reg_begin(Edit->getReg()); - MachineInstr *MI = RI.skipInstruction();) { - assert(SnippetCopies.count(MI) && "Remaining use wasn't a snippet copy"); - // FIXME: Do this with a LiveRangeEdit callback. - VRM.RemoveMachineInstrFromMaps(MI); - LIS.RemoveMachineInstrFromMaps(MI); - MI->eraseFromParent(); + for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) { + for (MachineRegisterInfo::reg_iterator RI = MRI.reg_begin(RegsToSpill[i]); + MachineInstr *MI = RI.skipInstruction();) { + assert(SnippetCopies.count(MI) && "Remaining use wasn't a snippet copy"); + // FIXME: Do this with a LiveRangeEdit callback. + VRM.RemoveMachineInstrFromMaps(MI); + LIS.RemoveMachineInstrFromMaps(MI); + MI->eraseFromParent(); + } } // Delete all spilled registers. @@ -953,6 +1007,7 @@ void InlineSpiller::spillAll() { } void InlineSpiller::spill(LiveRangeEdit &edit) { + ++NumSpilledRanges; Edit = &edit; assert(!TargetRegisterInfo::isStackSlot(edit.getReg()) && "Trying to spill a stack slot."); diff --git a/lib/CodeGen/InterferenceCache.cpp b/lib/CodeGen/InterferenceCache.cpp index 0aff128..b1014a9 100644 --- a/lib/CodeGen/InterferenceCache.cpp +++ b/lib/CodeGen/InterferenceCache.cpp @@ -26,7 +26,7 @@ void InterferenceCache::init(MachineFunction *mf, TRI = tri; PhysRegEntries.assign(TRI->getNumRegs(), 0); for (unsigned i = 0; i != CacheEntries; ++i) - Entries[i].clear(indexes); + Entries[i].clear(mf, indexes); } InterferenceCache::Entry *InterferenceCache::get(unsigned PhysReg) { @@ -91,10 +91,6 @@ bool InterferenceCache::Entry::valid(LiveIntervalUnion *LIUArray, } void InterferenceCache::Entry::update(unsigned MBBNum) { - BlockInterference *BI = &Blocks[MBBNum]; - BI->Tag = Tag; - BI->First = BI->Last = SlotIndex(); - SlotIndex Start, Stop; tie(Start, Stop) = Indexes->getMBBRange(MBBNum); @@ -109,23 +105,39 @@ void InterferenceCache::Entry::update(unsigned MBBNum) { PrevPos = Start; } - // Check for first interference. - for (unsigned i = 0, e = Iters.size(); i != e; ++i) { - Iter &I = Iters[i]; - if (!I.valid()) - continue; - SlotIndex StartI = I.start(); - if (StartI >= Stop) - continue; - if (!BI->First.isValid() || StartI < BI->First) - BI->First = StartI; - } + MachineFunction::const_iterator MFI = MF->getBlockNumbered(MBBNum); + BlockInterference *BI = &Blocks[MBBNum]; + for (;;) { + BI->Tag = Tag; + BI->First = BI->Last = SlotIndex(); + + // Check for first interference. + for (unsigned i = 0, e = Iters.size(); i != e; ++i) { + Iter &I = Iters[i]; + if (!I.valid()) + continue; + SlotIndex StartI = I.start(); + if (StartI >= Stop) + continue; + if (!BI->First.isValid() || StartI < BI->First) + BI->First = StartI; + } - // No interference in block. - if (!BI->First.isValid()) - return; + PrevPos = Stop; + if (BI->First.isValid()) + break; + + // No interference in this block? Go ahead and precompute the next block. + if (++MFI == MF->end()) + return; + MBBNum = MFI->getNumber(); + BI = &Blocks[MBBNum]; + if (BI->Tag == Tag) + return; + tie(Start, Stop) = Indexes->getMBBRange(MBBNum); + } - // Check for last interference. + // Check for last interference in block. 
for (unsigned i = 0, e = Iters.size(); i != e; ++i) { Iter &I = Iters[i]; if (!I.valid() || I.start() >= Stop) @@ -140,5 +152,4 @@ void InterferenceCache::Entry::update(unsigned MBBNum) { if (Backup) ++I; } - PrevPos = Stop; } diff --git a/lib/CodeGen/InterferenceCache.h b/lib/CodeGen/InterferenceCache.h index 2e613ae..6c36fa4 100644 --- a/lib/CodeGen/InterferenceCache.h +++ b/lib/CodeGen/InterferenceCache.h @@ -43,6 +43,9 @@ class InterferenceCache { /// change. unsigned Tag; + /// MF - The current function. + MachineFunction *MF; + /// Indexes - Mapping block numbers to SlotIndex ranges. SlotIndexes *Indexes; @@ -67,8 +70,9 @@ class InterferenceCache { public: Entry() : PhysReg(0), Tag(0), Indexes(0) {} - void clear(SlotIndexes *indexes) { + void clear(MachineFunction *mf, SlotIndexes *indexes) { PhysReg = 0; + MF = mf; Indexes = indexes; } diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp index 8c2794a..b98fbed 100644 --- a/lib/CodeGen/LLVMTargetMachine.cpp +++ b/lib/CodeGen/LLVMTargetMachine.cpp @@ -13,6 +13,7 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/PassManager.h" +#include "llvm/Analysis/Passes.h" #include "llvm/Analysis/Verifier.h" #include "llvm/Assembly/PrintModulePass.h" #include "llvm/CodeGen/AsmPrinter.h" @@ -32,7 +33,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FormattedStream.h" -#include "llvm/Support/StandardPasses.h" using namespace llvm; namespace llvm { @@ -149,6 +149,7 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM, MCStreamer *S = getTarget().createAsmStreamer(*Context, Out, getVerboseAsm(), hasMCUseLoc(), + hasMCUseCFI(), InstPrinter, MCE, TAB, ShowMCInst); @@ -291,13 +292,21 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, // Standard LLVM-Level Passes. // Basic AliasAnalysis support. - createStandardAliasAnalysisPasses(&PM); + // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that + // BasicAliasAnalysis wins if they disagree. This is intended to help + // support "obvious" type-punning idioms. + PM.add(createTypeBasedAliasAnalysisPass()); + PM.add(createBasicAliasAnalysisPass()); // Before running any passes, run the verifier to determine if the input // coming from the front-end and/or optimizer is valid. if (!DisableVerify) PM.add(createVerifierPass()); + // Simplify ObjC ARC code. This is done late because it makes re-optimization + // difficult. + PM.add(createObjCARCContractPass()); + // Run loop strength reduction before anything else. if (OptLevel != CodeGenOpt::None && !DisableLSR) { PM.add(createLoopStrengthReducePass(getTargetLowering())); @@ -323,8 +332,8 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, PM.add(createSjLjEHPass(getTargetLowering())); // FALLTHROUGH case ExceptionHandling::DwarfCFI: - case ExceptionHandling::DwarfTable: case ExceptionHandling::ARM: + case ExceptionHandling::Win64: PM.add(createDwarfEHPass(this)); break; case ExceptionHandling::None: diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp index 333d15f..292928f 100644 --- a/lib/CodeGen/LiveDebugVariables.cpp +++ b/lib/CodeGen/LiveDebugVariables.cpp @@ -101,9 +101,13 @@ class UserValue { void insertDebugValue(MachineBasicBlock *MBB, SlotIndex Idx, unsigned LocNo, LiveIntervals &LIS, const TargetInstrInfo &TII); + /// splitLocation - Replace OldLocNo ranges with NewRegs ranges where NewRegs + /// is live. Returns true if any changes were made. 
+ bool splitLocation(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs); + public: /// UserValue - Create a new UserValue. - UserValue(const MDNode *var, unsigned o, DebugLoc L, + UserValue(const MDNode *var, unsigned o, DebugLoc L, LocMap::Allocator &alloc) : variable(var), offset(o), dl(L), leader(this), next(0), locInts(alloc) {} @@ -215,6 +219,10 @@ public: void renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx, const TargetRegisterInfo *TRI); + /// splitRegister - Replace OldReg ranges with NewRegs ranges where NewRegs is + /// live. Returns true if any changes were made. + bool splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs); + /// rewriteLocations - Rewrite virtual register locations according to the /// provided virtual register map. void rewriteLocations(VirtRegMap &VRM, const TargetRegisterInfo &TRI); @@ -228,7 +236,7 @@ public: /// Only first one needs DebugLoc to identify variable's lexical scope /// in source file. DebugLoc findDebugLoc(); - void print(raw_ostream&, const TargetRegisterInfo*); + void print(raw_ostream&, const TargetMachine*); }; } // namespace @@ -290,9 +298,12 @@ public: /// mapVirtReg - Map virtual register to an equivalence class. void mapVirtReg(unsigned VirtReg, UserValue *EC); - /// renameRegister - Replace all references to OldReg wiht NewReg:SubIdx. + /// renameRegister - Replace all references to OldReg with NewReg:SubIdx. void renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx); + /// splitRegister - Replace all references to OldReg with NewRegs. + void splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs); + /// emitDebugValues - Recreate DBG_VALUE instructions from data structures. void emitDebugValues(VirtRegMap *VRM); @@ -300,7 +311,7 @@ public: }; } // namespace -void UserValue::print(raw_ostream &OS, const TargetRegisterInfo *TRI) { +void UserValue::print(raw_ostream &OS, const TargetMachine *TM) { if (const MDString *MDS = dyn_cast<MDString>(variable->getOperand(2))) OS << "!\"" << MDS->getString() << "\"\t"; if (offset) @@ -312,15 +323,17 @@ void UserValue::print(raw_ostream &OS, const TargetRegisterInfo *TRI) { else OS << I.value(); } - for (unsigned i = 0, e = locations.size(); i != e; ++i) - OS << " Loc" << i << '=' << locations[i]; + for (unsigned i = 0, e = locations.size(); i != e; ++i) { + OS << " Loc" << i << '='; + locations[i].print(OS, TM); + } OS << '\n'; } void LDVImpl::print(raw_ostream &OS) { OS << "********** DEBUG VARIABLES **********\n"; for (unsigned i = 0, e = userValues.size(); i != e; ++i) - userValues[i]->print(OS, TRI); + userValues[i]->print(OS, &MF->getTarget()); } void UserValue::coalesceLocation(unsigned LocNo) { @@ -677,6 +690,143 @@ renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx) { static_cast<LDVImpl*>(pImpl)->renameRegister(OldReg, NewReg, SubIdx); } +//===----------------------------------------------------------------------===// +// Live Range Splitting +//===----------------------------------------------------------------------===// + +bool +UserValue::splitLocation(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs) { + DEBUG({ + dbgs() << "Splitting Loc" << OldLocNo << '\t'; + print(dbgs(), 0); + }); + bool DidChange = false; + LocMap::iterator LocMapI; + LocMapI.setMap(locInts); + for (unsigned i = 0; i != NewRegs.size(); ++i) { + LiveInterval *LI = NewRegs[i]; + if (LI->empty()) + continue; + + // Don't allocate the new LocNo until it is needed. + unsigned NewLocNo = ~0u; + + // Iterate over the overlaps between locInts and LI.
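The overlap walk that follows relies on LocMap being an llvm::IntervalMap, whose iterators can trim an entry in place and re-insert the uncovered pieces. A reduced demo of that surgery, under assumptions that differ from the real LocMap (plain unsigned keys and the default closed-interval traits instead of half-open SlotIndex intervals):

    #include "llvm/ADT/IntervalMap.h"
    using namespace llvm;

    typedef IntervalMap<unsigned, unsigned> DemoMap;

    void retagOverlap(DemoMap::Allocator &Alloc) {
      DemoMap Map(Alloc);
      Map.insert(100, 399, 0);          // Loc0 covers [100;399]
      DemoMap::iterator I = Map.find(200);
      unsigned A = I.start(), B = I.stop();
      I.setStartUnchecked(200);         // trim down to the overlap [200;299]
      I.setStopUnchecked(299);
      I.setValue(1);                    // the overlap now maps to Loc1
      I.insert(A, 199, 0);              // re-insert the uncovered prefix...
      ++I;                              // ...and return to the Loc1 entry
      ++I;                              // step past it
      I.insert(300, B, 0);              // re-insert the uncovered suffix
    }

splitLocation below performs exactly these steps, plus the bookkeeping needed to survive coalescing and to advance both iterators in step.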
+ LocMapI.find(LI->beginIndex()); + if (!LocMapI.valid()) + continue; + LiveInterval::iterator LII = LI->advanceTo(LI->begin(), LocMapI.start()); + LiveInterval::iterator LIE = LI->end(); + while (LocMapI.valid() && LII != LIE) { + // At this point, we know that LocMapI.stop() > LII->start. + LII = LI->advanceTo(LII, LocMapI.start()); + if (LII == LIE) + break; + + // Now LII->end > LocMapI.start(). Do we have an overlap? + if (LocMapI.value() == OldLocNo && LII->start < LocMapI.stop()) { + // Overlapping correct location. Allocate NewLocNo now. + if (NewLocNo == ~0u) { + MachineOperand MO = MachineOperand::CreateReg(LI->reg, false); + MO.setSubReg(locations[OldLocNo].getSubReg()); + NewLocNo = getLocationNo(MO); + DidChange = true; + } + + SlotIndex LStart = LocMapI.start(); + SlotIndex LStop = LocMapI.stop(); + + // Trim LocMapI down to the LII overlap. + if (LStart < LII->start) + LocMapI.setStartUnchecked(LII->start); + if (LStop > LII->end) + LocMapI.setStopUnchecked(LII->end); + + // Change the value in the overlap. This may trigger coalescing. + LocMapI.setValue(NewLocNo); + + // Re-insert any removed OldLocNo ranges. + if (LStart < LocMapI.start()) { + LocMapI.insert(LStart, LocMapI.start(), OldLocNo); + ++LocMapI; + assert(LocMapI.valid() && "Unexpected coalescing"); + } + if (LStop > LocMapI.stop()) { + ++LocMapI; + LocMapI.insert(LII->end, LStop, OldLocNo); + --LocMapI; + } + } + + // Advance to the next overlap. + if (LII->end < LocMapI.stop()) { + if (++LII == LIE) + break; + LocMapI.advanceTo(LII->start); + } else { + ++LocMapI; + if (!LocMapI.valid()) + break; + LII = LI->advanceTo(LII, LocMapI.start()); + } + } + } + + // Finally, remove any remaining OldLocNo intervals and OldLocNo itself. + locations.erase(locations.begin() + OldLocNo); + LocMapI.goToBegin(); + while (LocMapI.valid()) { + unsigned v = LocMapI.value(); + if (v == OldLocNo) { + DEBUG(dbgs() << "Erasing [" << LocMapI.start() << ';' + << LocMapI.stop() << ")\n"); + LocMapI.erase(); + } else { + if (v > OldLocNo) + LocMapI.setValueUnchecked(v-1); + ++LocMapI; + } + } + + DEBUG({dbgs() << "Split result: \t"; print(dbgs(), 0);}); + return DidChange; +} + +bool +UserValue::splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) { + bool DidChange = false; + // Split locations referring to OldReg. Iterate backwards so splitLocation can + // safely erase unused locations. + for (unsigned i = locations.size(); i ; --i) { + unsigned LocNo = i-1; + const MachineOperand *Loc = &locations[LocNo]; + if (!Loc->isReg() || Loc->getReg() != OldReg) + continue; + DidChange |= splitLocation(LocNo, NewRegs); + } + return DidChange; +} + +void LDVImpl::splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) { + bool DidChange = false; + for (UserValue *UV = lookupVirtReg(OldReg); UV; UV = UV->getNext()) + DidChange |= UV->splitRegister(OldReg, NewRegs); + + if (!DidChange) + return; + + // Map all of the new virtual registers. + UserValue *UV = lookupVirtReg(OldReg); + for (unsigned i = 0; i != NewRegs.size(); ++i) + mapVirtReg(NewRegs[i]->reg, UV); +} + +void LiveDebugVariables:: +splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) { + if (pImpl) + static_cast<LDVImpl*>(pImpl)->splitRegister(OldReg, NewRegs); +} + void UserValue::rewriteLocations(VirtRegMap &VRM, const TargetRegisterInfo &TRI) { // Iterating over locations in reverse makes it easier to handle coalescing.
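A plausible call site for the new hook, pieced together from other parts of this patch (the regs() accessor is added to LiveRangeEdit.h further down, and the greedy allocator gains a LiveDebugVariables pointer); the surrounding names here are assumed, not code from this revision:

    // Hypothetical splitter epilogue: once a split has produced its new
    // intervals, let the debug-info side remap each user variable onto
    // whichever new range is live. OldReg, LREdit, and DebugVars are assumed.
    DebugVars->splitRegister(OldReg, LREdit.regs());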
@@ -690,6 +840,9 @@ UserValue::rewriteLocations(VirtRegMap &VRM, const TargetRegisterInfo &TRI) { unsigned VirtReg = Loc.getReg(); if (VRM.isAssignedReg(VirtReg) && TargetRegisterInfo::isPhysicalRegister(VRM.getPhys(VirtReg))) { + // This can create a %noreg operand in rare cases when the sub-register + // index is no longer available. That means the user value is in a + // non-existent sub-register, and %noreg is exactly what we want. Loc.substPhysReg(VRM.getPhys(VirtReg), TRI); } else if (VRM.getStackSlot(VirtReg) != VirtRegMap::NO_STACK_SLOT && VRM.isSpillSlotUsed(VRM.getStackSlot(VirtReg))) { @@ -701,7 +854,6 @@ UserValue::rewriteLocations(VirtRegMap &VRM, const TargetRegisterInfo &TRI) { } coalesceLocation(LocNo); } - DEBUG(print(dbgs(), &TRI)); } /// findInsertLocation - Find an iterator for inserting a DBG_VALUE @@ -793,6 +945,7 @@ void LDVImpl::emitDebugValues(VirtRegMap *VRM) { DEBUG(dbgs() << "********** EMITTING LIVE DEBUG VARIABLES **********\n"); const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); for (unsigned i = 0, e = userValues.size(); i != e; ++i) { + DEBUG(userValues[i]->print(dbgs(), &MF->getTarget())); userValues[i]->rewriteLocations(*VRM, *TRI); userValues[i]->emitDebugValues(VRM, *LIS, *TII); } diff --git a/lib/CodeGen/LiveDebugVariables.h b/lib/CodeGen/LiveDebugVariables.h index a6e40a1..3ce3c39 100644 --- a/lib/CodeGen/LiveDebugVariables.h +++ b/lib/CodeGen/LiveDebugVariables.h @@ -21,10 +21,12 @@ #ifndef LLVM_CODEGEN_LIVEDEBUGVARIABLES_H #define LLVM_CODEGEN_LIVEDEBUGVARIABLES_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/CodeGen/MachineFunctionPass.h" namespace llvm { +class LiveInterval; class VirtRegMap; class LiveDebugVariables : public MachineFunctionPass { @@ -42,6 +44,11 @@ public: /// register. void renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx); + /// splitRegister - Move any user variables in OldReg to the live ranges in + /// NewRegs where they are live. Mark the values as unavailable where no new + /// register is live. + void splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs); + /// emitDebugValues - Emit new DBG_VALUE instructions reflecting the changes /// that happened during register allocation. /// @param VRM Rename virtual registers according to map. diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp index c77ae1b..9257191 100644 --- a/lib/CodeGen/LiveIntervalAnalysis.cpp +++ b/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -578,13 +578,6 @@ void LiveIntervals::handleRegisterDef(MachineBasicBlock *MBB, CopyMI = MI; handlePhysicalRegisterDef(MBB, MI, MIIdx, MO, getOrCreateInterval(MO.getReg()), CopyMI); - // Def of a register also defines its sub-registers. - for (const unsigned* AS = tri_->getSubRegisters(MO.getReg()); *AS; ++AS) - // If MI also modifies the sub-register explicitly, avoid processing it - // more than once. Do not pass in TRI here so it checks for exact match. - if (!MI->definesRegister(*AS)) - handlePhysicalRegisterDef(MBB, MI, MIIdx, MO, - getOrCreateInterval(*AS), 0); } } @@ -645,7 +638,7 @@ void LiveIntervals::handleLiveInRegister(MachineBasicBlock *MBB, end = MIIdx.getStoreIndex(); } else { DEBUG(dbgs() << " live through"); - end = baseIndex; + end = getMBBEndIdx(MBB); } } @@ -1514,7 +1507,7 @@ rewriteInstructionsForSpills(const LiveInterval &li, bool TrySplit, // ... // def = ... // = use - // It's better to start a new interval to avoid artifically + // It's better to start a new interval to avoid artificially // extending the new interval.
if (MI->readsWritesVirtualRegister(li.reg) == std::make_pair(false,true)) { diff --git a/lib/CodeGen/LiveIntervalUnion.cpp b/lib/CodeGen/LiveIntervalUnion.cpp index 205f28a..b67f966 100644 --- a/lib/CodeGen/LiveIntervalUnion.cpp +++ b/lib/CodeGen/LiveIntervalUnion.cpp @@ -35,12 +35,20 @@ void LiveIntervalUnion::unify(LiveInterval &VirtReg) { LiveInterval::iterator RegEnd = VirtReg.end(); SegmentIter SegPos = Segments.find(RegPos->start); - for (;;) { + while (SegPos.valid()) { SegPos.insert(RegPos->start, RegPos->end, &VirtReg); if (++RegPos == RegEnd) return; SegPos.advanceTo(RegPos->start); } + + // We have reached the end of Segments, so it is no longer necessary to search + // for the insertion position. + // It is faster to insert the end first. + --RegEnd; + SegPos.insert(RegEnd->start, RegEnd->end, &VirtReg); + for (; RegPos != RegEnd; ++RegPos, ++SegPos) + SegPos.insert(RegPos->start, RegPos->end, &VirtReg); } // Remove a live virtual register's segments from this union. @@ -168,6 +176,7 @@ LiveIntervalUnion::Query::firstInterference() { return FirstInterference; CheckedFirstInterference = true; InterferenceResult &IR = FirstInterference; + IR.LiveUnionI.setMap(LiveUnion->getMap()); // Quickly skip interference check for empty sets. if (VirtReg->empty() || LiveUnion->empty()) { @@ -176,10 +185,10 @@ LiveIntervalUnion::Query::firstInterference() { // VirtReg starts first, perform double binary search. IR.VirtRegI = VirtReg->find(LiveUnion->startIndex()); if (IR.VirtRegI != VirtReg->end()) - IR.LiveUnionI = LiveUnion->find(IR.VirtRegI->start); + IR.LiveUnionI.find(IR.VirtRegI->start); } else { // LiveUnion starts first, perform double binary search. - IR.LiveUnionI = LiveUnion->find(VirtReg->beginIndex()); + IR.LiveUnionI.find(VirtReg->beginIndex()); if (IR.LiveUnionI.valid()) IR.VirtRegI = VirtReg->find(IR.LiveUnionI.start()); else @@ -235,7 +244,7 @@ bool LiveIntervalUnion::Query::isSeenInterference(LiveInterval *VirtReg) const { // // For comments on how to speed it up, see Query::findIntersection(). unsigned LiveIntervalUnion::Query:: -collectInterferingVRegs(unsigned MaxInterferingRegs) { +collectInterferingVRegs(unsigned MaxInterferingRegs, float MaxWeight) { InterferenceResult IR = firstInterference(); LiveInterval::iterator VirtRegEnd = VirtReg->end(); LiveInterval *RecentInterferingVReg = NULL; @@ -277,6 +286,11 @@ collectInterferingVRegs(unsigned MaxInterferingRegs) { // Cache the most recent interfering vreg to bypass isSeenInterference. RecentInterferingVReg = IR.LiveUnionI.value(); ++IR.LiveUnionI; + + // Stop collecting when the max weight is exceeded. + if (RecentInterferingVReg->weight >= MaxWeight) + return InterferingVRegs.size(); + continue; } // VirtRegI may have advanced far beyond LiveUnionI, diff --git a/lib/CodeGen/LiveIntervalUnion.h b/lib/CodeGen/LiveIntervalUnion.h index 0964ecd..c83578e 100644 --- a/lib/CodeGen/LiveIntervalUnion.h +++ b/lib/CodeGen/LiveIntervalUnion.h @@ -95,6 +95,9 @@ public: // Remove a live virtual register's segments from this union. void extract(LiveInterval &VirtReg); + // Remove all inserted virtual registers. + void clear() { Segments.clear(); ++Tag; } + // Print union, using TRI to translate register names void print(raw_ostream &OS, const TargetRegisterInfo *TRI) const; @@ -226,7 +229,8 @@ public: // Count the virtual registers in this union that interfere with this // query's live virtual register, up to maxInterferingRegs. 
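The weight cut-off added to this query (see the updated declaration just below) lets an eviction heuristic abandon a physical register early instead of enumerating every interference. A hypothetical probe in the style of the greedy allocator, with all surrounding names assumed:

    LiveIntervalUnion::Query &Q = query(VirtReg, PhysReg);
    // Stop after ten interferences, or at the first one at least as heavy
    // as VirtReg; in both cases eviction cannot pay off.
    if (Q.collectInterferingVRegs(10, VirtReg.weight) >= 10)
      return false;
    for (unsigned i = 0, e = Q.interferingVRegs().size(); i != e; ++i)
      if (Q.interferingVRegs()[i]->weight >= VirtReg.weight)
        return false;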
- unsigned collectInterferingVRegs(unsigned MaxInterferingRegs = UINT_MAX); + unsigned collectInterferingVRegs(unsigned MaxInterferingRegs = UINT_MAX, + float MaxWeight = HUGE_VALF); // Was this virtual register visited during collectInterferingVRegs? bool isSeenInterference(LiveInterval *VReg) const; diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp index b96575e..052abad 100644 --- a/lib/CodeGen/LiveRangeEdit.cpp +++ b/lib/CodeGen/LiveRangeEdit.cpp @@ -15,6 +15,7 @@ #include "LiveRangeEdit.h" #include "VirtRegMap.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -24,6 +25,10 @@ using namespace llvm; +STATISTIC(NumDCEDeleted, "Number of instructions deleted by DCE"); +STATISTIC(NumDCEFoldedLoads, "Number of single use loads folded after DCE"); +STATISTIC(NumFracRanges, "Number of live ranges fractured by DCE"); + LiveInterval &LiveRangeEdit::createFrom(unsigned OldReg, LiveIntervals &LIS, VirtRegMap &VRM) { @@ -36,14 +41,16 @@ LiveInterval &LiveRangeEdit::createFrom(unsigned OldReg, return LI; } -void LiveRangeEdit::checkRematerializable(VNInfo *VNI, +bool LiveRangeEdit::checkRematerializable(VNInfo *VNI, const MachineInstr *DefMI, const TargetInstrInfo &tii, AliasAnalysis *aa) { assert(DefMI && "Missing instruction"); - if (tii.isTriviallyReMaterializable(DefMI, aa)) - remattable_.insert(VNI); scannedRemattable_ = true; + if (!tii.isTriviallyReMaterializable(DefMI, aa)) + return false; + remattable_.insert(VNI); + return true; } void LiveRangeEdit::scanRemattable(LiveIntervals &lis, @@ -59,6 +66,7 @@ void LiveRangeEdit::scanRemattable(LiveIntervals &lis, continue; checkRematerializable(VNI, DefMI, tii, aa); } + scannedRemattable_ = true; } bool LiveRangeEdit::anyRematerializable(LiveIntervals &lis, @@ -137,11 +145,13 @@ SlotIndex LiveRangeEdit::rematerializeAt(MachineBasicBlock &MBB, const Remat &RM, LiveIntervals &lis, const TargetInstrInfo &tii, - const TargetRegisterInfo &tri) { + const TargetRegisterInfo &tri, + bool Late) { assert(RM.OrigMI && "Invalid remat"); tii.reMaterialize(MBB, MI, DestReg, 0, RM.OrigMI, tri); rematted_.insert(RM.ParentVNI); - return lis.InsertMachineInstrInMaps(--MI).getDefIndex(); + return lis.getSlotIndexes()->insertMachineInstrInMaps(--MI, Late) + .getDefIndex(); } void LiveRangeEdit::eraseVirtReg(unsigned Reg, LiveIntervals &LIS) { @@ -194,6 +204,7 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI, UseMI->eraseFromParent(); DefMI->addRegisterDead(LI->reg, 0); Dead.push_back(DefMI); + ++NumDCEFoldedLoads; return true; } @@ -203,6 +214,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, SetVector<LiveInterval*, SmallVector<LiveInterval*, 8>, SmallPtrSet<LiveInterval*, 8> > ToShrink; + MachineRegisterInfo &MRI = VRM.getRegInfo(); for (;;) { // Erase all dead defs. @@ -236,8 +248,13 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, continue; LiveInterval &LI = LIS.getInterval(Reg); - // Shrink read registers. - if (MI->readsVirtualRegister(Reg)) + // Shrink read registers, unless it is likely to be expensive and + // unlikely to change anything. We typically don't want to shrink the + // PIC base register that has lots of uses everywhere. + // Always shrink COPY uses that probably come from live range splitting. 
+ if (MI->readsVirtualRegister(Reg) && + (MI->isCopy() || MOI->isDef() || MRI.hasOneNonDBGUse(Reg) || + LI.killedAt(Idx))) ToShrink.insert(&LI); // Remove defined value. @@ -258,6 +275,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, delegate_->LRE_WillEraseInstruction(MI); LIS.RemoveMachineInstrFromMaps(MI); MI->eraseFromParent(); + ++NumDCEDeleted; } if (ToShrink.empty()) @@ -266,7 +284,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, // Shrink just one live interval. Then delete new dead defs. LiveInterval *LI = ToShrink.back(); ToShrink.pop_back(); - if (foldAsLoad(LI, Dead, VRM.getRegInfo(), LIS, TII)) + if (foldAsLoad(LI, Dead, MRI, LIS, TII)) continue; if (delegate_) delegate_->LRE_WillShrinkVirtReg(LI->reg); @@ -279,6 +297,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, unsigned NumComp = ConEQ.Classify(LI); if (NumComp <= 1) continue; + ++NumFracRanges; DEBUG(dbgs() << NumComp << " components: " << *LI << '\n'); SmallVector<LiveInterval*, 8> Dups(1, LI); for (unsigned i = 1; i != NumComp; ++i) { @@ -286,7 +305,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, if (delegate_) delegate_->LRE_DidCloneVirtReg(Dups.back()->reg, LI->reg); } - ConEQ.Distribute(&Dups[0], VRM.getRegInfo()); + ConEQ.Distribute(&Dups[0], MRI); } } diff --git a/lib/CodeGen/LiveRangeEdit.h b/lib/CodeGen/LiveRangeEdit.h index 54a2c83..db6740c 100644 --- a/lib/CodeGen/LiveRangeEdit.h +++ b/lib/CodeGen/LiveRangeEdit.h @@ -18,8 +18,9 @@ #ifndef LLVM_CODEGEN_LIVERANGEEDIT_H #define LLVM_CODEGEN_LIVERANGEEDIT_H -#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/CodeGen/LiveInterval.h" namespace llvm { @@ -113,6 +114,10 @@ public: bool empty() const { return size() == 0; } LiveInterval *get(unsigned idx) const { return newRegs_[idx+firstNew_]; } + ArrayRef<LiveInterval*> regs() const { + return ArrayRef<LiveInterval*>(newRegs_).slice(firstNew_); + } + /// FIXME: Temporary accessors until we can get rid of /// LiveIntervals::AddIntervalsForSpills SmallVectorImpl<LiveInterval*> *getNewVRegs() { return &newRegs_; } @@ -137,7 +142,7 @@ public: /// checkRematerializable - Manually add VNI to the list of rematerializable /// values if DefMI may be rematerializable. - void checkRematerializable(VNInfo *VNI, const MachineInstr *DefMI, + bool checkRematerializable(VNInfo *VNI, const MachineInstr *DefMI, const TargetInstrInfo&, AliasAnalysis*); /// Remat - Information needed to rematerialize at a specific location. @@ -165,7 +170,8 @@ public: const Remat &RM, LiveIntervals&, const TargetInstrInfo&, - const TargetRegisterInfo&); + const TargetRegisterInfo&, + bool Late = false); /// markRematerialized - explicitly mark a value as rematerialized after doing /// it manually. diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index ccbff0a..613f0c4 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -61,7 +61,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const MachineBasicBlock &MBB) { return OS; } -/// addNodeToList (MBB) - When an MBB is added to an MF, we need to update the +/// addNodeToList (MBB) - When an MBB is added to an MF, we need to update the /// parent pointer of the MBB, the MBB numbering, and any instructions in the /// MBB to be on the right operand list for registers. 
/// @@ -93,7 +93,7 @@ void ilist_traits<MachineBasicBlock>::removeNodeFromList(MachineBasicBlock *N) { void ilist_traits<MachineInstr>::addNodeToList(MachineInstr *N) { assert(N->getParent() == 0 && "machine instruction already in a basic block"); N->setParent(Parent); - + // Add the instruction's register operands to their corresponding // use/def lists. MachineFunction *MF = Parent->getParent(); @@ -110,7 +110,7 @@ void ilist_traits<MachineInstr>::removeNodeFromList(MachineInstr *N) { // Remove from the use/def lists. N->RemoveRegOperandsFromUseLists(); - + N->setParent(0); LeakDetector::addGarbageObject(N); @@ -339,32 +339,70 @@ void MachineBasicBlock::updateTerminator() { } } -void MachineBasicBlock::addSuccessor(MachineBasicBlock *succ) { - Successors.push_back(succ); - succ->addPredecessor(this); -} +void MachineBasicBlock::addSuccessor(MachineBasicBlock *succ, uint32_t weight) { + + // If we see a non-zero value for the first time, it means we actually use the + // Weights list, so we fill all Weights with 0's. + if (weight != 0 && Weights.empty()) + Weights.resize(Successors.size()); + + if (weight != 0 || !Weights.empty()) + Weights.push_back(weight); + + Successors.push_back(succ); + succ->addPredecessor(this); + } void MachineBasicBlock::removeSuccessor(MachineBasicBlock *succ) { succ->removePredecessor(this); succ_iterator I = std::find(Successors.begin(), Successors.end(), succ); assert(I != Successors.end() && "Not a current successor!"); + + // If the Weights list is empty, it means we don't use it (disabled optimization). + if (!Weights.empty()) { + weight_iterator WI = getWeightIterator(I); + Weights.erase(WI); + } + Successors.erase(I); } -MachineBasicBlock::succ_iterator +MachineBasicBlock::succ_iterator MachineBasicBlock::removeSuccessor(succ_iterator I) { assert(I != Successors.end() && "Not a current successor!"); + + // If the Weights list is empty, it means we don't use it (disabled optimization). + if (!Weights.empty()) { + weight_iterator WI = getWeightIterator(I); + Weights.erase(WI); + } + (*I)->removePredecessor(this); return Successors.erase(I); } +void MachineBasicBlock::replaceSuccessor(MachineBasicBlock *Old, + MachineBasicBlock *New) { + uint32_t weight = 0; + succ_iterator SI = std::find(Successors.begin(), Successors.end(), Old); + + // If the Weights list is empty, it means we don't use it (disabled optimization). + if (!Weights.empty()) { + weight_iterator WI = getWeightIterator(SI); + weight = *WI; + } + + // Update the successor information. + removeSuccessor(SI); + addSuccessor(New, weight); +} + void MachineBasicBlock::addPredecessor(MachineBasicBlock *pred) { Predecessors.push_back(pred); } void MachineBasicBlock::removePredecessor(MachineBasicBlock *pred) { - std::vector<MachineBasicBlock *>::iterator I = - std::find(Predecessors.begin(), Predecessors.end(), pred); + pred_iterator I = std::find(Predecessors.begin(), Predecessors.end(), pred); assert(I != Predecessors.end() && "Pred is not a predecessor of this block!"); Predecessors.erase(I); } @@ -372,10 +410,17 @@ void MachineBasicBlock::removePredecessor(MachineBasicBlock *pred) { void MachineBasicBlock::transferSuccessors(MachineBasicBlock *fromMBB) { if (this == fromMBB) return; - + while (!fromMBB->succ_empty()) { MachineBasicBlock *Succ = *fromMBB->succ_begin(); - addSuccessor(Succ); + uint32_t weight = 0; + + + // If the Weights list is empty, it means we don't use it (disabled optimization).
+ if (!fromMBB->Weights.empty()) + weight = *fromMBB->Weights.begin(); + + addSuccessor(Succ, weight); fromMBB->removeSuccessor(Succ); } } @@ -384,7 +429,7 @@ void MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *fromMBB) { if (this == fromMBB) return; - + while (!fromMBB->succ_empty()) { MachineBasicBlock *Succ = *fromMBB->succ_begin(); addSuccessor(Succ); @@ -402,8 +447,7 @@ MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *fromMBB) { } bool MachineBasicBlock::isSuccessor(const MachineBasicBlock *MBB) const { - std::vector<MachineBasicBlock *>::const_iterator I = - std::find(Successors.begin(), Successors.end(), MBB); + const_succ_iterator I = std::find(Successors.begin(), Successors.end(), MBB); return I != Successors.end(); } @@ -487,6 +531,30 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { << " -- BB#" << NMBB->getNumber() << " -- BB#" << Succ->getNumber() << '\n'); + // On some targets like Mips, branches may kill virtual registers. Make sure + // that LiveVariables is properly updated after updateTerminator replaces the + // terminators. + LiveVariables *LV = P->getAnalysisIfAvailable<LiveVariables>(); + + // Collect a list of virtual registers killed by the terminators. + SmallVector<unsigned, 4> KilledRegs; + if (LV) + for (iterator I = getFirstTerminator(), E = end(); I != E; ++I) { + MachineInstr *MI = I; + for (MachineInstr::mop_iterator OI = MI->operands_begin(), + OE = MI->operands_end(); OI != OE; ++OI) { + if (!OI->isReg() || !OI->isUse() || !OI->isKill() || OI->isUndef()) + continue; + unsigned Reg = OI->getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg) && + LV->getVarInfo(Reg).removeKill(MI)) { + KilledRegs.push_back(Reg); + DEBUG(dbgs() << "Removing terminator kill: " << *MI); + OI->setIsKill(false); + } + } + } + ReplaceUsesOfBlockWith(Succ, NMBB); updateTerminator(); @@ -504,9 +572,22 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { if (i->getOperand(ni+1).getMBB() == this) i->getOperand(ni+1).setMBB(NMBB); - if (LiveVariables *LV = - P->getAnalysisIfAvailable<LiveVariables>()) + // Update LiveVariables. + if (LV) { + // Restore kills of virtual registers that were killed by the terminators. + while (!KilledRegs.empty()) { + unsigned Reg = KilledRegs.pop_back_val(); + for (iterator I = end(), E = begin(); I != E;) { + if (!(--I)->addRegisterKilled(Reg, NULL, /* addIfNotFound= */ false)) + continue; + LV->getVarInfo(Reg).Kills.push_back(I); + DEBUG(dbgs() << "Restored terminator kill: " << *I); + break; + } + } + // Update relevant live-through information. LV->addNewBlock(NMBB, this, Succ); + } if (MachineDominatorTree *MDT = P->getAnalysisIfAvailable<MachineDominatorTree>()) { @@ -602,15 +683,14 @@ void MachineBasicBlock::ReplaceUsesOfBlockWith(MachineBasicBlock *Old, } // Update the successor information. - removeSuccessor(Old); - addSuccessor(New); + replaceSuccessor(Old, New); } /// CorrectExtraCFGEdges - Various pieces of code can cause excess edges in the /// CFG to be inserted. If we have proven that MBB can only branch to DestA and /// DestB, remove any other MBB successors from the CFG. DestA and DestB can be /// null. -/// +/// /// Besides DestA and DestB, retain other edges leading to LandingPads /// (currently there can be only one; we don't check or require that here). /// Note it is possible that DestA and/or DestB are LandingPads. 
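Together with the accessors in the next hunk, the weight list gives clients such as the new MachineBranchProbabilityInfo a uniform view of edge weights. A usage sketch with illustrative values (the block names are assumptions):

    // The first non-zero weight switches the block from the implicit
    // all-equal scheme to an explicit Weights list, kept index-aligned
    // with Successors.
    MBB->addSuccessor(TakenMBB, 124);
    MBB->addSuccessor(FallThruMBB, 4);
    uint32_t W = MBB->getSuccWeight(TakenMBB);   // 124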
@@ -685,6 +765,23 @@ MachineBasicBlock::findDebugLoc(MachineBasicBlock::iterator &MBBI) { return DL; } +/// getSuccWeight - Return the weight of the edge from this block to succ. +/// +uint32_t MachineBasicBlock::getSuccWeight(MachineBasicBlock *succ) { + succ_iterator I = std::find(Successors.begin(), Successors.end(), succ); + return *getWeightIterator(I); +} + +/// getWeightIterator - Return the weight iterator corresponding to the +/// successor iterator I. +MachineBasicBlock::weight_iterator MachineBasicBlock:: +getWeightIterator(MachineBasicBlock::succ_iterator I) { + assert(Weights.size() == Successors.size() && "Async weight list!"); + size_t index = std::distance(Successors.begin(), I); + assert(index < Weights.size() && "Not a current successor!"); + return Weights.begin() + index; +} + void llvm::WriteAsOperand(raw_ostream &OS, const MachineBasicBlock *MBB, bool t) { OS << "BB#" << MBB->getNumber(); diff --git a/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/lib/CodeGen/MachineBranchProbabilityInfo.cpp new file mode 100644 index 0000000..c13fa6b --- /dev/null +++ b/lib/CodeGen/MachineBranchProbabilityInfo.cpp @@ -0,0 +1,113 @@ +//===- MachineBranchProbabilityInfo.cpp - Machine Branch Probability Info -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This analysis uses probability info stored in Machine Basic Blocks. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Instructions.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +INITIALIZE_PASS_BEGIN(MachineBranchProbabilityInfo, "machine-branch-prob", + "Machine Branch Probability Analysis", false, true) +INITIALIZE_PASS_END(MachineBranchProbabilityInfo, "machine-branch-prob", + "Machine Branch Probability Analysis", false, true) + +char MachineBranchProbabilityInfo::ID = 0; + +uint32_t MachineBranchProbabilityInfo:: +getSumForBlock(MachineBasicBlock *MBB) const { + uint32_t Sum = 0; + + for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), + E = MBB->succ_end(); I != E; ++I) { + MachineBasicBlock *Succ = *I; + uint32_t Weight = getEdgeWeight(MBB, Succ); + uint32_t PrevSum = Sum; + + Sum += Weight; + assert(Sum > PrevSum); (void) PrevSum; + } + + return Sum; +} + +uint32_t +MachineBranchProbabilityInfo::getEdgeWeight(MachineBasicBlock *Src, + MachineBasicBlock *Dst) const { + uint32_t Weight = Src->getSuccWeight(Dst); + if (!Weight) + return DEFAULT_WEIGHT; + return Weight; +} + +bool MachineBranchProbabilityInfo::isEdgeHot(MachineBasicBlock *Src, + MachineBasicBlock *Dst) const { + // Hot probability is at least 4/5 = 80% + uint32_t Weight = getEdgeWeight(Src, Dst); + uint32_t Sum = getSumForBlock(Src); + + // FIXME: Implement BranchProbability::compare then change this code to + // compare this BranchProbability against a static "hot" BranchProbability.
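Until that comparison API exists, the test below cross-multiplies in 64 bits, an exact and division-free way of asking whether Weight/Sum exceeds 4/5. As a standalone helper (illustrative only, not part of this file), the check is:

    static bool isHotEdge(uint32_t Weight, uint32_t Sum) {
      // Weight/Sum > 4/5  <=>  5*Weight > 4*Sum; widening to 64 bits keeps
      // the products exact for any uint32_t inputs. E.g. Weight = 124 out of
      // Sum = 128 gives 620 > 512, so the edge is hot.
      return (uint64_t)Weight * 5 > (uint64_t)Sum * 4;
    }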
+ return (uint64_t)Weight * 5 > (uint64_t)Sum * 4; +} + +MachineBasicBlock * +MachineBranchProbabilityInfo::getHotSucc(MachineBasicBlock *MBB) const { + uint32_t Sum = 0; + uint32_t MaxWeight = 0; + MachineBasicBlock *MaxSucc = 0; + + for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), + E = MBB->succ_end(); I != E; ++I) { + MachineBasicBlock *Succ = *I; + uint32_t Weight = getEdgeWeight(MBB, Succ); + uint32_t PrevSum = Sum; + + Sum += Weight; + assert(Sum > PrevSum); (void) PrevSum; + + if (Weight > MaxWeight) { + MaxWeight = Weight; + MaxSucc = Succ; + } + } + + // FIXME: Use BranchProbability::compare. + if ((uint64_t)MaxWeight * 5 >= (uint64_t)Sum * 4) + return MaxSucc; + + return 0; +} + +BranchProbability +MachineBranchProbabilityInfo::getEdgeProbability(MachineBasicBlock *Src, + MachineBasicBlock *Dst) const { + uint32_t N = getEdgeWeight(Src, Dst); + uint32_t D = getSumForBlock(Src); + + return BranchProbability(N, D); +} + +raw_ostream &MachineBranchProbabilityInfo:: +printEdgeProbability(raw_ostream &OS, MachineBasicBlock *Src, + MachineBasicBlock *Dst) const { + + const BranchProbability Prob = getEdgeProbability(Src, Dst); + OS << "edge MBB#" << Src->getNumber() << " -> MBB#" << Dst->getNumber() + << " probability is " << Prob + << (isEdgeHot(Src, Dst) ? " [HOT edge]\n" : "\n"); + + return OS; +} diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp index 07a7d27..f97ccf6 100644 --- a/lib/CodeGen/MachineCSE.cpp +++ b/lib/CodeGen/MachineCSE.cpp @@ -365,6 +365,8 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { if (!FoundCSE) { // Look for trivial copy coalescing opportunities. if (PerformTrivialCoalescing(MI, MBB)) { + Changed = true; + // After coalescing MI itself may become a copy. if (MI->isCopyLike()) continue; @@ -379,10 +381,11 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { if (NewMI) { Commuted = true; FoundCSE = VNT.count(NewMI); - if (NewMI != MI) + if (NewMI != MI) { // New instruction. It doesn't need to be kept. NewMI->eraseFromParent(); - else if (!FoundCSE) + Changed = true; + } else if (!FoundCSE) // MI was changed but it didn't help, commute it back! (void)TII->commuteInstruction(MI); } @@ -450,6 +453,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { ++NumPhysCSEs; if (Commuted) ++NumCommutes; + Changed = true; } else { DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n"); VNT.insert(MI, CurrVN++); diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index d81e4a1..50750a5 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -65,7 +65,11 @@ MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM, FrameInfo->setMaxAlignment(Attribute::getStackAlignmentFromAttrs( Fn->getAttributes().getFnAttributes())); ConstantPool = new (Allocator) MachineConstantPool(TM.getTargetData()); - Alignment = TM.getTargetLowering()->getFunctionAlignment(F); + Alignment = TM.getTargetLowering()->getMinFunctionAlignment(); + // FIXME: Shouldn't use pref alignment if explicit alignment is set on Fn. 
+ if (!Fn->hasFnAttr(Attribute::OptimizeForSize)) + Alignment = std::max(Alignment, + TM.getTargetLowering()->getPrefFunctionAlignment()); FunctionNumber = FunctionNum; JumpTableInfo = 0; } @@ -300,31 +304,19 @@ void MachineFunction::print(raw_ostream &OS, SlotIndexes *Indexes) const { OS << "Function Live Ins: "; for (MachineRegisterInfo::livein_iterator I = RegInfo->livein_begin(), E = RegInfo->livein_end(); I != E; ++I) { - if (TRI) - OS << "%" << TRI->getName(I->first); - else - OS << " %physreg" << I->first; - + OS << PrintReg(I->first, TRI); if (I->second) - OS << " in reg%" << I->second; - + OS << " in " << PrintReg(I->second, TRI); if (llvm::next(I) != E) OS << ", "; } OS << '\n'; } if (RegInfo && !RegInfo->liveout_empty()) { - OS << "Function Live Outs: "; + OS << "Function Live Outs:"; for (MachineRegisterInfo::liveout_iterator - I = RegInfo->liveout_begin(), E = RegInfo->liveout_end(); I != E; ++I){ - if (TRI) - OS << '%' << TRI->getName(*I); - else - OS << "%physreg" << *I; - - if (llvm::next(I) != E) - OS << " "; - } + I = RegInfo->liveout_begin(), E = RegInfo->liveout_end(); I != E; ++I) + OS << ' ' << PrintReg(*I, TRI); OS << '\n'; } diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp index 0d137eb..36b0b83 100644 --- a/lib/CodeGen/MachineInstr.cpp +++ b/lib/CodeGen/MachineInstr.cpp @@ -125,7 +125,8 @@ void MachineOperand::substPhysReg(unsigned Reg, const TargetRegisterInfo &TRI) { assert(TargetRegisterInfo::isPhysicalRegister(Reg)); if (getSubReg()) { Reg = TRI.getSubReg(Reg, getSubReg()); - assert(Reg && "Invalid SubReg for physical register"); + // Note that getSubReg() may return 0 if the sub-register doesn't exist. + // That won't happen in legal code. setSubReg(0); } setReg(Reg); @@ -441,6 +442,10 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const MachineMemOperand &MMO) { OS << ")"; } + // Print nontemporal info. + if (MMO.isNonTemporal()) + OS << "(nontemporal)"; + return OS; } @@ -759,19 +764,35 @@ bool MachineInstr::isIdenticalTo(const MachineInstr *Other, for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { const MachineOperand &MO = getOperand(i); const MachineOperand &OMO = Other->getOperand(i); + if (!MO.isReg()) { + if (!MO.isIdenticalTo(OMO)) + return false; + continue; + } + // Clients may or may not want to ignore defs when testing for equality. // For example, machine CSE pass only cares about finding common // subexpressions, so it's safe to ignore virtual register defs. 
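For instance, a CSE-style client would compare two instructions while ignoring which virtual registers they define (hypothetical call; the enumerators come from MachineInstr):

    if (MI->isIdenticalTo(OtherMI, MachineInstr::IgnoreVRegDefs)) {
      // Same opcode and operands up to the defined virtual registers:
      // the two instructions compute the same value.
    }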
- if (Check != CheckDefs && MO.isReg() && MO.isDef()) { + if (MO.isDef()) { if (Check == IgnoreDefs) continue; - // Check == IgnoreVRegDefs - if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) || - TargetRegisterInfo::isPhysicalRegister(OMO.getReg())) - if (MO.getReg() != OMO.getReg()) + else if (Check == IgnoreVRegDefs) { + if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) || + TargetRegisterInfo::isPhysicalRegister(OMO.getReg())) + if (MO.getReg() != OMO.getReg()) + return false; + } else { + if (!MO.isIdenticalTo(OMO)) return false; - } else if (!MO.isIdenticalTo(OMO)) - return false; + if (Check == CheckKillDead && MO.isDead() != OMO.isDead()) + return false; + } + } else { + if (!MO.isIdenticalTo(OMO)) + return false; + if (Check == CheckKillDead && MO.isKill() != OMO.isKill()) + return false; + } } return true; } diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp index 1c0f6ad..b315702 100644 --- a/lib/CodeGen/MachineLICM.cpp +++ b/lib/CodeGen/MachineLICM.cpp @@ -39,7 +39,6 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" - using namespace llvm; STATISTIC(NumHoisted, @@ -169,6 +168,10 @@ namespace { /// bool IsLoopInvariantInst(MachineInstr &I); + /// HasAnyPHIUse - Return true if the specified register is used by any + /// phi node. + bool HasAnyPHIUse(unsigned Reg) const; + /// HasHighOperandLatency - Compute operand latency between a def of 'Reg' /// and a use in the current loop, return true if the target considered /// it 'high'. @@ -758,18 +761,25 @@ bool MachineLICM::IsLoopInvariantInst(MachineInstr &I) { } -/// HasPHIUses - Return true if the specified register has any PHI use. -static bool HasPHIUses(unsigned Reg, MachineRegisterInfo *MRI) { +/// HasAnyPHIUse - Return true if the specified register is used by any +/// phi node. +bool MachineLICM::HasAnyPHIUse(unsigned Reg) const { for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(Reg), UE = MRI->use_end(); UI != UE; ++UI) { MachineInstr *UseMI = &*UI; if (UseMI->isPHI()) return true; + // Look past copies as well. + if (UseMI->isCopy()) { + unsigned Def = UseMI->getOperand(0).getReg(); + if (TargetRegisterInfo::isVirtualRegister(Def) && + HasAnyPHIUse(Def)) + return true; + } } return false; } - /// HasHighOperandLatency - Compute operand latency between a def of 'Reg' /// and a use in the current loop, return true if the target considered /// it 'high'. @@ -976,14 +986,13 @@ bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) { return false; } - // If result(s) of this instruction is used by PHIs, then don't hoist it. - // The presence of joins makes it difficult for current register allocator - // implementation to perform remat. + // If the result(s) of this instruction are used by PHIs outside of the loop, + // then don't hoist it, because hoisting it will introduce an extra copy.
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI.getOperand(i); if (!MO.isReg() || !MO.isDef()) continue; - if (HasPHIUses(MO.getReg(), MRI)) + if (HasAnyPHIUse(MO.getReg())) return false; } diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp index 7244d5f..08ff5bb 100644 --- a/lib/CodeGen/MachineRegisterInfo.cpp +++ b/lib/CodeGen/MachineRegisterInfo.cpp @@ -79,6 +79,8 @@ MachineRegisterInfo::constrainRegClass(unsigned Reg, unsigned MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass){ assert(RegClass && "Cannot create register without RegClass!"); + assert(RegClass->isAllocatable() && + "Virtual register RegClass must be allocatable."); // New virtual register number. unsigned Reg = TargetRegisterInfo::index2VirtReg(getNumVirtRegs()); diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp index 8a93a24..916dff7 100644 --- a/lib/CodeGen/MachineSink.cpp +++ b/lib/CodeGen/MachineSink.cpp @@ -265,8 +265,11 @@ bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) { if (MI->isDebugValue()) continue; - if (PerformTrivialForwardCoalescing(MI, &MBB)) + bool Joined = PerformTrivialForwardCoalescing(MI, &MBB); + if (Joined) { + MadeChange = true; continue; + } if (SinkInstruction(MI, SawStore)) ++NumSunk, MadeChange = true; diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index f95f411..471463b 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -23,6 +23,7 @@ // the verifier errors. //===----------------------------------------------------------------------===// +#include "llvm/Instructions.h" #include "llvm/Function.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/LiveVariables.h" @@ -32,6 +33,7 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetInstrInfo.h" @@ -394,7 +396,13 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { if ((*I)->isLandingPad()) LandingPadSuccs.insert(*I); } - if (LandingPadSuccs.size() > 1) + + const MCAsmInfo *AsmInfo = TM->getMCAsmInfo(); + const BasicBlock *BB = MBB->getBasicBlock(); + if (LandingPadSuccs.size() > 1 && + !(AsmInfo && + AsmInfo->getExceptionHandlingType() == ExceptionHandling::SjLj && + BB && isa<SwitchInst>(BB->getTerminator()))) report("MBB has more than one landing pad successor", MBB); // Call AnalyzeBranch. If it succeeds, there several more conditions to check. @@ -402,11 +410,6 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { SmallVector<MachineOperand, 4> Cond; if (!TII->AnalyzeBranch(*const_cast<MachineBasicBlock *>(MBB), TBB, FBB, Cond)) { - // If the block branches directly to a landing pad successor, pretend that - // the landing pad is a normal block. - LandingPadSuccs.erase(TBB); - LandingPadSuccs.erase(FBB); - // Ok, AnalyzeBranch thinks it knows what's going on with this block. Let's // check whether its answers match up with reality. 
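The checks that follow validate AnalyzeBranch's documented result shapes; restated compactly (a sketch of the contract, not verifier code):

    MachineBasicBlock *TBB = 0, *FBB = 0;
    SmallVector<MachineOperand, 4> Cond;
    if (!TII->AnalyzeBranch(*MBB, TBB, FBB, Cond)) {
      // !TBB && !FBB && Cond.empty(): block falls through.
      //  TBB && !FBB && Cond.empty(): unconditional branch to TBB.
      //  TBB && !FBB && !Cond.empty(): conditional branch to TBB, otherwise
      //                                fall through to the layout successor.
      //  TBB &&  FBB && !Cond.empty(): conditional branch to TBB, else FBB.
    }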
if (!TBB && !FBB) { @@ -741,7 +744,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { RC = SRC; } if (const TargetRegisterClass *DRC = TOI.getRegClass(TRI)) { - if (RC != DRC && !RC->hasSuperClass(DRC)) { + if (!RC->hasSuperClassEq(DRC)) { report("Illegal virtual register for instruction", MO, MONum); *OS << "Expected a " << DRC->getName() << " register, but got a " << RC->getName() << " register\n"; diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp index 9fd5b0e..af65f13 100644 --- a/lib/CodeGen/PHIElimination.cpp +++ b/lib/CodeGen/PHIElimination.cpp @@ -32,7 +32,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include <algorithm> -#include <map> using namespace llvm; static cl::opt<bool> diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp index 3489db2..315aedd 100644 --- a/lib/CodeGen/Passes.cpp +++ b/lib/CodeGen/Passes.cpp @@ -55,6 +55,11 @@ FunctionPass *llvm::createRegisterAllocator(CodeGenOpt::Level OptLevel) { RegisterRegAlloc::setDefault(RegAlloc); } + // This forces linking of the linear scan register allocator, + // so -regalloc=linearscan still works in clang. + if (Ctor == createLinearScanRegisterAllocator) + return createLinearScanRegisterAllocator(); + if (Ctor != createDefaultRegisterAllocator) return Ctor(); @@ -63,6 +68,6 @@ FunctionPass *llvm::createRegisterAllocator(CodeGenOpt::Level OptLevel) { case CodeGenOpt::None: return createFastRegisterAllocator(); default: - return createLinearScanRegisterAllocator(); + return createGreedyRegisterAllocator(); } } diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp index 60c24b7..982a2a5 100644 --- a/lib/CodeGen/PostRASchedulerList.cpp +++ b/lib/CodeGen/PostRASchedulerList.cpp @@ -22,6 +22,7 @@ #include "AntiDepBreaker.h" #include "AggressiveAntiDepBreaker.h" #include "CriticalAntiDepBreaker.h" +#include "RegisterClassInfo.h" #include "ScheduleDAGInstrs.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/LatencyPriorityQueue.h" @@ -80,6 +81,7 @@ namespace { class PostRAScheduler : public MachineFunctionPass { AliasAnalysis *AA; const TargetInstrInfo *TII; + RegisterClassInfo RegClassInfo; CodeGenOpt::Level OptLevel; public: @@ -135,7 +137,8 @@ namespace { public: SchedulePostRATDList( MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT, - AliasAnalysis *AA, TargetSubtarget::AntiDepBreakMode AntiDepMode, + AliasAnalysis *AA, const RegisterClassInfo&, + TargetSubtarget::AntiDepBreakMode AntiDepMode, SmallVectorImpl<TargetRegisterClass*> &CriticalPathRCs); ~SchedulePostRATDList(); @@ -179,7 +182,8 @@ namespace { SchedulePostRATDList::SchedulePostRATDList( MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT, - AliasAnalysis *AA, TargetSubtarget::AntiDepBreakMode AntiDepMode, + AliasAnalysis *AA, const RegisterClassInfo &RCI, + TargetSubtarget::AntiDepBreakMode AntiDepMode, SmallVectorImpl<TargetRegisterClass*> &CriticalPathRCs) : ScheduleDAGInstrs(MF, MLI, MDT), Topo(SUnits), AA(AA), KillIndices(TRI->getNumRegs()) @@ -190,9 +194,9 @@ SchedulePostRATDList::SchedulePostRATDList( TM.getInstrInfo()->CreateTargetPostRAHazardRecognizer(InstrItins, this); AntiDepBreak = ((AntiDepMode == TargetSubtarget::ANTIDEP_ALL) ? - (AntiDepBreaker *)new AggressiveAntiDepBreaker(MF, CriticalPathRCs) : + (AntiDepBreaker *)new AggressiveAntiDepBreaker(MF, RCI, CriticalPathRCs) : ((AntiDepMode == TargetSubtarget::ANTIDEP_CRITICAL) ? 
- (AntiDepBreaker *)new CriticalAntiDepBreaker(MF) : NULL)); + (AntiDepBreaker *)new CriticalAntiDepBreaker(MF, RCI) : NULL)); } SchedulePostRATDList::~SchedulePostRATDList() { @@ -205,6 +209,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>(); AliasAnalysis *AA = &getAnalysis<AliasAnalysis>(); + RegClassInfo.runOnMachineFunction(Fn); // Check for explicit enable/disable of post-ra scheduling. TargetSubtarget::AntiDepBreakMode AntiDepMode = TargetSubtarget::ANTIDEP_NONE; @@ -230,7 +235,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { DEBUG(dbgs() << "PostRAScheduler\n"); - SchedulePostRATDList Scheduler(Fn, MLI, MDT, AA, AntiDepMode, + SchedulePostRATDList Scheduler(Fn, MLI, MDT, AA, RegClassInfo, AntiDepMode, CriticalPathRCs); // Loop over all of the basic blocks @@ -304,7 +309,7 @@ void SchedulePostRATDList::Schedule() { if (AntiDepBreak != NULL) { unsigned Broken = AntiDepBreak->BreakAntiDependencies(SUnits, Begin, InsertPos, - InsertPosIndex); + InsertPosIndex, DbgValues); if (Broken != 0) { // We made changes. Update the dependency graph. @@ -540,10 +545,16 @@ void SchedulePostRATDList::ReleaseSucc(SUnit *SU, SDep *SuccEdge) { #endif --SuccSU->NumPredsLeft; - // Compute how many cycles it will be before this actually becomes - // available. This is the max of the start time of all predecessors plus - // their latencies. - SuccSU->setDepthToAtLeast(SU->getDepth() + SuccEdge->getLatency()); + // Standard scheduler algorithms will recompute the depth of the successor + // here as follows: + // SuccSU->setDepthToAtLeast(SU->getDepth() + SuccEdge->getLatency()); + // + // However, we lazily compute node depth instead. Note that + // ScheduleNodeTopDown has already updated the depth of this node, which causes + // all descendants to be marked dirty. Setting the successor depth explicitly + // here would cause depth to be recomputed for all its ancestors. If the + // successor is not yet ready (because of a transitively redundant edge), then + // this causes depth computation to be quadratic in the size of the DAG. // If all the node's predecessors are scheduled, this node is ready // to be scheduled. Ignore the special ExitSU node. @@ -655,6 +666,12 @@ void SchedulePostRATDList::ListScheduleTopDown() { ScheduleNodeTopDown(FoundSUnit, CurCycle); HazardRec->EmitInstruction(FoundSUnit); CycleHasInsts = true; + if (HazardRec->atIssueLimit()) { + DEBUG(dbgs() << "*** Max instructions per cycle " << CurCycle << '\n'); + HazardRec->AdvanceCycle(); + ++CurCycle; + CycleHasInsts = false; + } } else { if (CycleHasInsts) { DEBUG(dbgs() << "*** Finished cycle " << CurCycle << '\n'); diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index 92e25e1..f1f3c99 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ b/lib/CodeGen/PrologEpilogInserter.cpp @@ -337,7 +337,7 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) { --BeforeI; // Restore all registers immediately before the return and any - // terminators that preceed it. + // terminators that precede it. if (!TFI->restoreCalleeSavedRegisters(*MBB, I, CSI, TRI)) { for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); @@ -437,7 +437,7 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) { --BeforeI; // Restore all registers immediately before the return and any - // terminators that preceed it.
+ // terminators that precede it. for (unsigned i = 0, e = blockCSI.size(); i != e; ++i) { unsigned Reg = blockCSI[i].getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); diff --git a/lib/CodeGen/README.txt b/lib/CodeGen/README.txt index b655dda..7f75f65 100644 --- a/lib/CodeGen/README.txt +++ b/lib/CodeGen/README.txt @@ -26,7 +26,7 @@ and then "merge" mul and mov: sxth r3, r3 mla r4, r3, lr, r4 -It also increase the likelyhood the store may become dead. +It also increases the likelihood the store may become dead. //===---------------------------------------------------------------------===// @@ -162,7 +162,7 @@ synthesize the various copy insertion/inspection methods in TargetInstrInfo. //===---------------------------------------------------------------------===// -Stack coloring improvments: +Stack coloring improvements: 1. Do proper LiveStackAnalysis on all stack objects including those which are not spill slots. diff --git a/lib/CodeGen/RegAllocBase.h b/lib/CodeGen/RegAllocBase.h index f431d5a..0316421 100644 --- a/lib/CodeGen/RegAllocBase.h +++ b/lib/CodeGen/RegAllocBase.h @@ -39,6 +39,7 @@ #include "llvm/ADT/OwningPtr.h" #include "LiveIntervalUnion.h" +#include "RegisterClassInfo.h" namespace llvm { @@ -91,6 +92,7 @@ protected: MachineRegisterInfo *MRI; VirtRegMap *VRM; LiveIntervals *LIS; + RegisterClassInfo RegClassInfo; LiveUnionArray PhysReg2LiveUnion; // Current queries, one per physreg. They must be reinitialized each time we @@ -113,6 +115,10 @@ protected: return Queries[PhysReg]; } + // Invalidate all cached information about virtual registers - live ranges may + // have changed. + void invalidateVirtRegs() { ++UserTag; } + // The top-level driver. The output is a VirtRegMap that is updated with // physical register assignments. // diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp index 0e218a7..1d77b29 100644 --- a/lib/CodeGen/RegAllocBasic.cpp +++ b/lib/CodeGen/RegAllocBasic.cpp @@ -13,10 +13,10 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "regalloc" +#include "RegAllocBase.h" #include "LiveDebugVariables.h" #include "LiveIntervalUnion.h" #include "LiveRangeEdit.h" -#include "RegAllocBase.h" #include "RenderMachineFunction.h" #include "Spiller.h" #include "VirtRegMap.h" @@ -85,7 +85,6 @@ class RABasic : public MachineFunctionPass, public RegAllocBase { // context MachineFunction *MF; - BitVector ReservedRegs; // analyses LiveStacks *LS; @@ -235,9 +234,14 @@ void RegAllocBase::init(VirtRegMap &vrm, LiveIntervals &lis) { MRI = &vrm.getRegInfo(); VRM = &vrm; LIS = &lis; - PhysReg2LiveUnion.init(UnionAllocator, TRI->getNumRegs()); - // Cache an interferece query for each physical reg - Queries.reset(new LiveIntervalUnion::Query[PhysReg2LiveUnion.numRegs()]); + RegClassInfo.runOnMachineFunction(vrm.getMachineFunction()); + + const unsigned NumRegs = TRI->getNumRegs(); + if (NumRegs != PhysReg2LiveUnion.numRegs()) { + PhysReg2LiveUnion.init(UnionAllocator, NumRegs); + // Cache an interference query for each physical reg + Queries.reset(new LiveIntervalUnion::Query[PhysReg2LiveUnion.numRegs()]); + } } void RegAllocBase::LiveUnionArray::clear() { @@ -251,13 +255,15 @@ void RegAllocBase::LiveUnionArray::clear() { } void RegAllocBase::releaseMemory() { - PhysReg2LiveUnion.clear(); + for (unsigned r = 0, e = PhysReg2LiveUnion.numRegs(); r != e; ++r) + PhysReg2LiveUnion[r].clear(); } // Visit all the live registers.
If they are already assigned to a physical // register, unify them with the corresponding LiveIntervalUnion, otherwise push // them on the priority queue for later assignment. void RegAllocBase::seedLiveRegs() { + NamedRegionTimer T("Seed Live Regs", TimerGroupName, TimePassesIsEnabled); for (LiveIntervals::iterator I = LIS->begin(), E = LIS->end(); I != E; ++I) { unsigned RegNum = I->first; LiveInterval &VirtReg = *I->second; @@ -273,6 +279,7 @@ void RegAllocBase::assign(LiveInterval &VirtReg, unsigned PhysReg) { << " to " << PrintReg(PhysReg, TRI) << '\n'); assert(!VRM->hasPhys(VirtReg.reg) && "Duplicate VirtReg assignment"); VRM->assignVirt2Phys(VirtReg.reg, PhysReg); + MRI->setPhysRegUsed(PhysReg); PhysReg2LiveUnion[PhysReg].unify(VirtReg); ++NumAssigned; } @@ -303,7 +310,7 @@ void RegAllocBase::allocatePhysRegs() { } // Invalidate all interference queries, live ranges could have changed. - ++UserTag; + invalidateVirtRegs(); // selectOrSplit requests the allocator to return an available physical // register if possible and populate a list of new live intervals that @@ -315,6 +322,23 @@ void RegAllocBase::allocatePhysRegs() { VirtRegVec SplitVRegs; unsigned AvailablePhysReg = selectOrSplit(*VirtReg, SplitVRegs); + if (AvailablePhysReg == ~0u) { + // selectOrSplit failed to find a register! + std::string msg; + raw_string_ostream Msg(msg); + Msg << "Ran out of registers during register allocation!" + "\nCannot allocate: " << *VirtReg; + for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(VirtReg->reg); + MachineInstr *MI = I.skipInstruction();) { + if (!MI->isInlineAsm()) + continue; + Msg << "\nPlease check your inline asm statement for " + "invalid constraints:\n"; + MI->print(Msg, &VRM->getMachineFunction().getTarget()); + } + report_fatal_error(Msg.str()); + } + if (AvailablePhysReg) assign(*VirtReg, AvailablePhysReg); @@ -404,29 +428,31 @@ RegAllocBase::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg, // Add newly allocated physical registers to the MBB live in sets. void RegAllocBase::addMBBLiveIns(MachineFunction *MF) { NamedRegionTimer T("MBB Live Ins", TimerGroupName, TimePassesIsEnabled); - typedef SmallVector<MachineBasicBlock*, 8> MBBVec; - MBBVec liveInMBBs; - MachineBasicBlock &entryMBB = *MF->begin(); + SlotIndexes *Indexes = LIS->getSlotIndexes(); + if (MF->size() <= 1) + return; + LiveIntervalUnion::SegmentIter SI; for (unsigned PhysReg = 0; PhysReg < PhysReg2LiveUnion.numRegs(); ++PhysReg) { LiveIntervalUnion &LiveUnion = PhysReg2LiveUnion[PhysReg]; if (LiveUnion.empty()) continue; - for (LiveIntervalUnion::SegmentIter SI = LiveUnion.begin(); SI.valid(); - ++SI) { - - // Find the set of basic blocks which this range is live into... - liveInMBBs.clear(); - if (!LIS->findLiveInMBBs(SI.start(), SI.stop(), liveInMBBs)) continue; - - // And add the physreg for this interval to their live-in sets. 
- for (MBBVec::iterator I = liveInMBBs.begin(), E = liveInMBBs.end(); - I != E; ++I) { - MachineBasicBlock *MBB = *I; - if (MBB == &entryMBB) continue; - if (MBB->isLiveIn(PhysReg)) continue; - MBB->addLiveIn(PhysReg); - } + MachineFunction::iterator MBB = llvm::next(MF->begin()); + MachineFunction::iterator MFE = MF->end(); + SlotIndex Start, Stop; + tie(Start, Stop) = Indexes->getMBBRange(MBB); + SI.setMap(LiveUnion.getMap()); + SI.find(Start); + while (SI.valid()) { + if (SI.start() <= Start) { + if (!MBB->isLiveIn(PhysReg)) + MBB->addLiveIn(PhysReg); + } else if (SI.start() > Stop) + MBB = Indexes->getMBBFromIndex(SI.start().getPrevIndex()); + if (++MBB == MFE) + break; + tie(Start, Stop) = Indexes->getMBBRange(MBB); + SI.advanceTo(Start); } } } @@ -454,14 +480,11 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg, SmallVector<unsigned, 8> PhysRegSpillCands; // Check for an available register in this class. - const TargetRegisterClass *TRC = MRI->getRegClass(VirtReg.reg); - - for (TargetRegisterClass::iterator I = TRC->allocation_order_begin(*MF), - E = TRC->allocation_order_end(*MF); - I != E; ++I) { - + ArrayRef<unsigned> Order = + RegClassInfo.getOrder(MRI->getRegClass(VirtReg.reg)); + for (ArrayRef<unsigned>::iterator I = Order.begin(), E = Order.end(); I != E; + ++I) { unsigned PhysReg = *I; - if (ReservedRegs.test(PhysReg)) continue; // Check interference and as a side effect, initialize queries for this // VirtReg and its aliases. @@ -490,8 +513,11 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg, // Tell the caller to allocate to this newly freed physical register. return *PhysRegI; } + // No other spill candidates were found, so spill the current VirtReg. DEBUG(dbgs() << "spilling: " << VirtReg << '\n'); + if (!VirtReg.isSpillable()) + return ~0u; LiveRangeEdit LRE(VirtReg, SplitVRegs); spiller().spill(LRE); @@ -509,9 +535,6 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) { DEBUG(RMF = &getAnalysis<RenderMachineFunction>()); RegAllocBase::init(getAnalysis<VirtRegMap>(), getAnalysis<LiveIntervals>()); - - ReservedRegs = TRI->getReservedRegs(*MF); - SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM)); allocatePhysRegs(); diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp index 15036e3..65ebdf8 100644 --- a/lib/CodeGen/RegAllocFast.cpp +++ b/lib/CodeGen/RegAllocFast.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "regalloc" +#include "RegisterClassInfo.h" #include "llvm/BasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -58,6 +59,7 @@ namespace { MachineRegisterInfo *MRI; const TargetRegisterInfo *TRI; const TargetInstrInfo *TII; + RegisterClassInfo RegClassInfo; // Basic block currently being allocated. MachineBasicBlock *MBB; @@ -97,7 +99,7 @@ namespace { // immediately without checking aliases. regFree, - // A reserved register has been assigned expolicitly (e.g., setting up a + // A reserved register has been assigned explicitly (e.g., setting up a // call parameter), and it remains reserved until it is used. regReserved @@ -113,9 +115,6 @@ namespace { // instruction, and so cannot be allocated. BitVector UsedInInstr; - // Allocatable - vector of allocatable physical registers. - BitVector Allocatable; - // SkippedInstrs - Descriptors of instructions whose clobber list was // ignored because all registers were spilled.
It is still necessary to // mark all the clobbered registers as used by the function. @@ -396,7 +395,6 @@ void RAFast::definePhysReg(MachineInstr *MI, unsigned PhysReg, PhysRegState[PhysReg] = NewState; for (const unsigned *AS = TRI->getAliasSet(PhysReg); unsigned Alias = *AS; ++AS) { - UsedInInstr.set(Alias); switch (unsigned VirtReg = PhysRegState[Alias]) { case regDisabled: break; @@ -420,20 +418,25 @@ void RAFast::definePhysReg(MachineInstr *MI, unsigned PhysReg, // can be allocated directly. // Returns spillImpossible when PhysReg or an alias can't be spilled. unsigned RAFast::calcSpillCost(unsigned PhysReg) const { - if (UsedInInstr.test(PhysReg)) + if (UsedInInstr.test(PhysReg)) { + DEBUG(dbgs() << "PhysReg: " << PhysReg << " is already used in instr.\n"); return spillImpossible; + } switch (unsigned VirtReg = PhysRegState[PhysReg]) { case regDisabled: break; case regFree: return 0; case regReserved: + DEBUG(dbgs() << "VirtReg: " << VirtReg << " corresponding to PhysReg: " + << PhysReg << " is reserved already.\n"); return spillImpossible; default: return LiveVirtRegs.lookup(VirtReg).Dirty ? spillDirty : spillClean; } - // This is a disabled register, add up const of aliases. + // This is a disabled register, add up cost of aliases. + DEBUG(dbgs() << "\tRegister: " << PhysReg << " is disabled.\n"); unsigned Cost = 0; for (const unsigned *AS = TRI->getAliasSet(PhysReg); unsigned Alias = *AS; ++AS) { @@ -479,30 +482,26 @@ void RAFast::allocVirtReg(MachineInstr *MI, LiveRegEntry &LRE, unsigned Hint) { // Ignore invalid hints. if (Hint && (!TargetRegisterInfo::isPhysicalRegister(Hint) || - !RC->contains(Hint) || !Allocatable.test(Hint))) + !RC->contains(Hint) || !RegClassInfo.isAllocatable(Hint))) Hint = 0; // Take hint when possible. if (Hint) { - switch(calcSpillCost(Hint)) { - default: - definePhysReg(MI, Hint, regFree); - // Fall through. - case 0: + // Ignore the hint if we would have to spill a dirty register. + unsigned Cost = calcSpillCost(Hint); + if (Cost < spillDirty) { + if (Cost) + definePhysReg(MI, Hint, regFree); return assignVirtToPhysReg(LRE, Hint); - case spillImpossible: - break; } } - TargetRegisterClass::iterator AOB = RC->allocation_order_begin(*MF); - TargetRegisterClass::iterator AOE = RC->allocation_order_end(*MF); + ArrayRef<unsigned> AO = RegClassInfo.getOrder(RC); // First try to find a completely free register. - for (TargetRegisterClass::iterator I = AOB; I != AOE; ++I) { + for (ArrayRef<unsigned>::iterator I = AO.begin(), E = AO.end(); I != E; ++I) { unsigned PhysReg = *I; - if (PhysRegState[PhysReg] == regFree && !UsedInInstr.test(PhysReg) && - Allocatable.test(PhysReg)) + if (PhysRegState[PhysReg] == regFree && !UsedInInstr.test(PhysReg)) return assignVirtToPhysReg(LRE, PhysReg); } @@ -510,10 +509,11 @@ void RAFast::allocVirtReg(MachineInstr *MI, LiveRegEntry &LRE, unsigned Hint) { << RC->getName() << "\n"); unsigned BestReg = 0, BestCost = spillImpossible; - for (TargetRegisterClass::iterator I = AOB; I != AOE; ++I) { - if (!Allocatable.test(*I)) - continue; + for (ArrayRef<unsigned>::iterator I = AO.begin(), E = AO.end(); I != E; ++I) { unsigned Cost = calcSpillCost(*I); + DEBUG(dbgs() << "\tRegister: " << *I << "\n"); + DEBUG(dbgs() << "\tCost: " << Cost << "\n"); + DEBUG(dbgs() << "\tBestCost: " << BestCost << "\n"); // Cost is 0 when all aliases are already disabled. 
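// [Editor's aside] The cost model used by calcSpillCost() above, restated as
// a small standalone function (the constants mirror the ones in RegAllocFast;
// the PhysState enum and RegState struct are illustrative stand-ins, and the
// alias summation for disabled registers is elided):
#include <cassert>

enum { spillClean = 1, spillDirty = 100, spillImpossible = ~0u };
enum PhysState { regDisabled, regFree, regReserved, regAssigned };

struct RegState {
  PhysState State;
  bool UsedInInstr; // already used by the current instruction
  bool Dirty;       // the assigned virtual register has been modified
};

static unsigned spillCost(const RegState &R) {
  if (R.UsedInInstr)
    return spillImpossible;     // can't spill within the same instruction
  switch (R.State) {
  case regFree:     return 0;   // no spill needed at all
  case regReserved: return spillImpossible;
  case regAssigned: return R.Dirty ? spillDirty : spillClean;
  case regDisabled: return 0;   // the real code sums the costs of all aliases
  }
  return spillImpossible;
}

int main() {
  RegState Free = {regFree, false, false};
  RegState DirtyReg = {regAssigned, false, true};
  assert(spillCost(Free) == 0);
  assert(spillCost(DirtyReg) == spillDirty);
}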
if (Cost == 0) return assignVirtToPhysReg(LRE, *I); @@ -722,9 +722,8 @@ void RAFast::handleThroughOperands(MachineInstr *MI, if (!MO.isReg() || (MO.isDef() && !MO.isEarlyClobber())) continue; unsigned Reg = MO.getReg(); if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; + DEBUG(dbgs() << "\tSetting reg " << Reg << " as used in instr\n"); UsedInInstr.set(Reg); - for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS) - UsedInInstr.set(*AS); } // Also mark PartialDefs as used to avoid reallocation. @@ -764,7 +763,7 @@ void RAFast::AllocateBasicBlock() { // Add live-in registers as live. for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(), E = MBB->livein_end(); I != E; ++I) - if (Allocatable.test(*I)) + if (RegClassInfo.isAllocatable(*I)) definePhysReg(MII, *I, regReserved); SmallVector<unsigned, 8> VirtDead; @@ -895,7 +894,7 @@ void RAFast::AllocateBasicBlock() { } continue; } - if (!Allocatable.test(Reg)) continue; + if (!RegClassInfo.isAllocatable(Reg)) continue; if (MO.isUse()) { usePhysReg(MO); } else if (MO.isEarlyClobber()) { @@ -984,7 +983,7 @@ void RAFast::AllocateBasicBlock() { unsigned Reg = MO.getReg(); if (TargetRegisterInfo::isPhysicalRegister(Reg)) { - if (!Allocatable.test(Reg)) continue; + if (!RegClassInfo.isAllocatable(Reg)) continue; definePhysReg(MI, Reg, (MO.isImplicit() || MO.isDead()) ? regFree : regReserved); continue; @@ -1040,9 +1039,8 @@ bool RAFast::runOnMachineFunction(MachineFunction &Fn) { TM = &Fn.getTarget(); TRI = TM->getRegisterInfo(); TII = TM->getInstrInfo(); - + RegClassInfo.runOnMachineFunction(Fn); UsedInInstr.resize(TRI->getNumRegs()); - Allocatable = TRI->getAllocatableSet(*MF); // initialize the virtual->physical register map to have a 'null' // mapping for all virtual registers diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp index 889bca3..8d06325 100644 --- a/lib/CodeGen/RegAllocGreedy.cpp +++ b/lib/CodeGen/RegAllocGreedy.cpp @@ -62,7 +62,6 @@ class RAGreedy : public MachineFunctionPass, // context MachineFunction *MF; - BitVector ReservedRegs; // analyses SlotIndexes *Indexes; @@ -72,6 +71,7 @@ class RAGreedy : public MachineFunctionPass, MachineLoopRanges *LoopRanges; EdgeBundles *Bundles; SpillPlacement *SpillPlacer; + LiveDebugVariables *DebugVars; // state std::auto_ptr<Spiller> SpillerInstance; @@ -94,12 +94,13 @@ class RAGreedy : public MachineFunctionPass, RS_New, ///< Never seen before. RS_First, ///< First time in the queue. RS_Second, ///< Second time in the queue. - RS_Region, ///< Produced by region splitting. - RS_Block, ///< Produced by per-block splitting. + RS_Global, ///< Produced by global splitting. RS_Local, ///< Produced by local splitting. RS_Spill ///< Produced by spilling. }; + static const char *const StageName[]; + IndexedMap<unsigned char, VirtReg2IndexFunctor> LRStage; LiveRangeStage getStage(const LiveInterval &VirtReg) const { @@ -116,6 +117,15 @@ class RAGreedy : public MachineFunctionPass, } } + // Eviction. Sometimes an assigned live range can be evicted without + // conditions, but other times it must be split after being evicted to avoid + // infinite loops. + enum CanEvict { + CE_Never, ///< Can never evict. + CE_Always, ///< Can always evict. + CE_WithSplit ///< Can evict only if range is also split or spilled. + }; + // splitting state. std::auto_ptr<SplitAnalysis> SA; std::auto_ptr<SplitEditor> SE; @@ -126,14 +136,17 @@ class RAGreedy : public MachineFunctionPass, /// All basic blocks where the current register has uses. 
SmallVector<SpillPlacement::BlockConstraint, 8> SplitConstraints; - /// All basic blocks where the current register is live-through and - /// interference free. - SmallVector<unsigned, 8> TransparentBlocks; - /// Global live range splitting candidate info. struct GlobalSplitCandidate { unsigned PhysReg; BitVector LiveBundles; + SmallVector<unsigned, 8> ActiveBlocks; + + void reset(unsigned Reg) { + PhysReg = Reg; + LiveBundles.clear(); + ActiveBlocks.clear(); + } }; /// Candidate info for each PhysReg in AllocationOrder. @@ -141,10 +154,6 @@ class RAGreedy : public MachineFunctionPass, /// class. SmallVector<GlobalSplitCandidate, 32> GlobalCand; - /// For every instruction in SA->UseSlots, store the previous non-copy - /// instruction. - SmallVector<SlotIndex, 8> PrevSlot; - public: RAGreedy(); @@ -173,18 +182,21 @@ private: void LRE_WillShrinkVirtReg(unsigned); void LRE_DidCloneVirtReg(unsigned, unsigned); - bool addSplitConstraints(unsigned, float&); - float calcGlobalSplitCost(unsigned, const BitVector&); - void splitAroundRegion(LiveInterval&, unsigned, const BitVector&, + float calcSpillCost(); + bool addSplitConstraints(InterferenceCache::Cursor, float&); + void addThroughConstraints(InterferenceCache::Cursor, ArrayRef<unsigned>); + void growRegion(GlobalSplitCandidate &Cand, InterferenceCache::Cursor); + float calcGlobalSplitCost(GlobalSplitCandidate&, InterferenceCache::Cursor); + void splitAroundRegion(LiveInterval&, GlobalSplitCandidate&, SmallVectorImpl<LiveInterval*>&); void calcGapWeights(unsigned, SmallVectorImpl<float>&); - SlotIndex getPrevMappedIndex(const MachineInstr*); - void calcPrevSlots(); - unsigned nextSplitPoint(unsigned); + CanEvict canEvict(LiveInterval &A, LiveInterval &B); bool canEvictInterference(LiveInterval&, unsigned, float&); + unsigned tryAssign(LiveInterval&, AllocationOrder&, + SmallVectorImpl<LiveInterval*>&); unsigned tryEvict(LiveInterval&, AllocationOrder&, - SmallVectorImpl<LiveInterval*>&); + SmallVectorImpl<LiveInterval*>&, unsigned = ~0u); unsigned tryRegionSplit(LiveInterval&, AllocationOrder&, SmallVectorImpl<LiveInterval*>&); unsigned tryLocalSplit(LiveInterval&, AllocationOrder&, @@ -196,6 +208,22 @@ private: char RAGreedy::ID = 0; +#ifndef NDEBUG +const char *const RAGreedy::StageName[] = { + "RS_New", + "RS_First", + "RS_Second", + "RS_Global", + "RS_Local", + "RS_Spill" +}; +#endif + +// Hysteresis to use when comparing floats. +// This helps stabilize decisions based on float comparisons. +const float Hysteresis = 0.98f; + + FunctionPass* llvm::createGreedyRegisterAllocator() { return new RAGreedy(); } @@ -287,6 +315,7 @@ void RAGreedy::LRE_DidCloneVirtReg(unsigned New, unsigned Old) { void RAGreedy::releaseMemory() { SpillerInstance.reset(0); LRStage.clear(); + GlobalCand.clear(); RegAllocBase::releaseMemory(); } @@ -329,28 +358,85 @@ LiveInterval *RAGreedy::dequeue() { return LI; } + +//===----------------------------------------------------------------------===// +// Direct Assignment +//===----------------------------------------------------------------------===// + +/// tryAssign - Try to assign VirtReg to an available register. +unsigned RAGreedy::tryAssign(LiveInterval &VirtReg, + AllocationOrder &Order, + SmallVectorImpl<LiveInterval*> &NewVRegs) { + Order.rewind(); + unsigned PhysReg; + while ((PhysReg = Order.next())) + if (!checkPhysRegInterference(VirtReg, PhysReg)) + break; + if (!PhysReg || Order.isHint(PhysReg)) + return PhysReg; + + // PhysReg is available. Try to evict interference from a cheaper alternative.
+ unsigned Cost = TRI->getCostPerUse(PhysReg); + + // Most registers have 0 additional cost. + if (!Cost) + return PhysReg; + + DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " is available at cost " << Cost + << '\n'); + unsigned CheapReg = tryEvict(VirtReg, Order, NewVRegs, Cost); + return CheapReg ? CheapReg : PhysReg; +} + + //===----------------------------------------------------------------------===// // Interference eviction //===----------------------------------------------------------------------===// +/// canEvict - determine if A can evict the assigned live range B. The eviction +/// policy defined by this function together with the allocation order defined +/// by enqueue() decides which registers ultimately end up being split and +/// spilled. +/// +/// This function must define a non-circular relation when it returns CE_Always, +/// otherwise infinite eviction loops are possible. When evicting a <= RS_Second +/// range, it is possible to return CE_WithSplit which forces the evicted +/// register to be split or spilled before it can evict anything again. That +/// guarantees progress. +RAGreedy::CanEvict RAGreedy::canEvict(LiveInterval &A, LiveInterval &B) { + return A.weight > B.weight ? CE_Always : CE_Never; +} + /// canEvictInterference - Return true if all interferences between VirtReg and PhysReg can -/// be evicted. Set maxWeight to the maximal spill weight of an interference. +/// be evicted. +/// Return false if any interference is heavier than MaxWeight. +/// On return, set MaxWeight to the maximal spill weight of an interference. bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg, float &MaxWeight) { float Weight = 0; for (const unsigned *AliasI = TRI->getOverlaps(PhysReg); *AliasI; ++AliasI) { LiveIntervalUnion::Query &Q = query(VirtReg, *AliasI); - // If there is 10 or more interferences, chances are one is smaller. - if (Q.collectInterferingVRegs(10) >= 10) + // If there are 10 or more interferences, chances are one is heavier. + if (Q.collectInterferingVRegs(10, MaxWeight) >= 10) return false; - // Check if any interfering live range is heavier than VirtReg. - for (unsigned i = 0, e = Q.interferingVRegs().size(); i != e; ++i) { - LiveInterval *Intf = Q.interferingVRegs()[i]; + // Check if any interfering live range is heavier than MaxWeight. + for (unsigned i = Q.interferingVRegs().size(); i; --i) { + LiveInterval *Intf = Q.interferingVRegs()[i - 1]; if (TargetRegisterInfo::isPhysicalRegister(Intf->reg)) return false; - if (Intf->weight >= VirtReg.weight) + if (Intf->weight >= MaxWeight) return false; + switch (canEvict(VirtReg, *Intf)) { + case CE_Always: + break; + case CE_Never: + return false; + case CE_WithSplit: + if (getStage(*Intf) > RS_Second) + return false; + break; + } Weight = std::max(Weight, Intf->weight); } } @@ -364,21 +450,28 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg, /// @return Physreg to assign VirtReg, or 0. unsigned RAGreedy::tryEvict(LiveInterval &VirtReg, AllocationOrder &Order, - SmallVectorImpl<LiveInterval*> &NewVRegs){ + SmallVectorImpl<LiveInterval*> &NewVRegs, + unsigned CostPerUseLimit) { NamedRegionTimer T("Evict", TimerGroupName, TimePassesIsEnabled); // Keep track of the lightest single interference seen so far.
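// [Editor's aside] Why canEvict() above insists on a strict order: when
// eviction is only allowed if the evictor is strictly heavier, every chain of
// evictions has strictly decreasing weight, so a live range can never evict
// one of its (transitive) evictors and allocation must terminate. A
// standalone model (LiveRange is a stand-in type, not the LLVM class):
#include <cassert>

struct LiveRange { float weight; };

// Mirrors: return A.weight > B.weight ? CE_Always : CE_Never;
static bool canEvict(const LiveRange &A, const LiveRange &B) {
  return A.weight > B.weight;
}

int main() {
  LiveRange X = {2.0f}, Y = {1.0f};
  assert(canEvict(X, Y));   // heavier evicts lighter...
  assert(!canEvict(Y, X));  // ...but never the other way around,
  assert(!canEvict(X, X));  // and never itself: the relation is a strict
                            // partial order, hence acyclic.
}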
- float BestWeight = 0; + float BestWeight = HUGE_VALF; unsigned BestPhys = 0; Order.rewind(); while (unsigned PhysReg = Order.next()) { - float Weight = 0; + if (TRI->getCostPerUse(PhysReg) >= CostPerUseLimit) + continue; + // The first use of a register in a function has cost 1. + if (CostPerUseLimit == 1 && !MRI->isPhysRegUsed(PhysReg)) + continue; + + float Weight = BestWeight; if (!canEvictInterference(VirtReg, PhysReg, Weight)) continue; // This is an eviction candidate. - DEBUG(dbgs() << "max " << PrintReg(PhysReg, TRI) << " interference = " + DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " interference = " << Weight << '\n'); if (BestPhys && Weight >= BestWeight) continue; @@ -403,6 +496,11 @@ unsigned RAGreedy::tryEvict(LiveInterval &VirtReg, unassign(*Intf, VRM->getPhys(Intf->reg)); ++NumEvicted; NewVRegs.push_back(Intf); + // Prevent looping by forcing the evicted ranges to be split before they + // can evict anything else. + if (getStage(*Intf) < RS_Second && + canEvict(VirtReg, *Intf) == CE_WithSplit) + LRStage[Intf->reg] = RS_Second; } } return BestPhys; @@ -417,9 +515,9 @@ unsigned RAGreedy::tryEvict(LiveInterval &VirtReg, /// interference pattern in Physreg and its aliases. Add the constraints to /// SpillPlacement and return the static cost of this split in Cost, assuming /// that all preferences in SplitConstraints are met. -/// If it is evident that no bundles will be live, abort early and return false. -bool RAGreedy::addSplitConstraints(unsigned PhysReg, float &Cost) { - InterferenceCache::Cursor Intf(IntfCache, PhysReg); +/// Return false if there are no bundles with positive bias. +bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf, + float &Cost) { ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks(); // Reset interference dependent info. @@ -446,7 +544,7 @@ bool RAGreedy::addSplitConstraints(unsigned PhysReg, float &Cost) { BC.Entry = SpillPlacement::MustSpill, ++Ins; else if (Intf.first() < BI.FirstUse) BC.Entry = SpillPlacement::PrefSpill, ++Ins; - else if (Intf.first() < (BI.LiveThrough ? BI.LastUse : BI.Kill)) + else if (Intf.first() < BI.LastUse) ++Ins; } @@ -456,7 +554,7 @@ bool RAGreedy::addSplitConstraints(unsigned PhysReg, float &Cost) { BC.Exit = SpillPlacement::MustSpill, ++Ins; else if (Intf.last() > BI.LastUse) BC.Exit = SpillPlacement::PrefSpill, ++Ins; - else if (Intf.last() > (BI.LiveThrough ? BI.FirstUse : BI.Def)) + else if (Intf.last() > BI.FirstUse) ++Ins; } @@ -464,35 +562,41 @@ bool RAGreedy::addSplitConstraints(unsigned PhysReg, float &Cost) { if (Ins) StaticCost += Ins * SpillPlacer->getBlockFrequency(BC.Number); } + Cost = StaticCost; // Add constraints for use-blocks. Note that these are the only constraints // that may add a positive bias, it is downhill from here. SpillPlacer->addConstraints(SplitConstraints); - if (SpillPlacer->getPositiveNodes() == 0) - return false; + return SpillPlacer->scanActiveBundles(); +} - Cost = StaticCost; - // Now handle the live-through blocks without uses. These can only add - // negative bias, so we can abort whenever there are no more positive nodes. - // Compute constraints for a group of 8 blocks at a time. +/// addThroughConstraints - Add constraints and links to SpillPlacer from the +/// live-through blocks in Blocks. 
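// [Editor's aside] tryEvict() above seeds BestWeight with HUGE_VALF and then
// feeds the current best back into the interference check as a cap, so
// candidates that cannot beat the best seen so far are rejected early. The
// same pattern in standalone form (Candidate and maxInterference are
// illustrative stand-ins, not LLVM API):
#include <cfloat>
#include <cstdio>
#include <vector>

struct Candidate { unsigned Reg; std::vector<float> InterferenceWeights; };

// Returns false if any interference weight reaches the cap; otherwise lowers
// Cap to the heaviest interference found (mirroring canEvictInterference).
static bool maxInterference(const Candidate &C, float &Cap) {
  float Max = 0;
  for (float W : C.InterferenceWeights) {
    if (W >= Cap)
      return false;
    Max = Max > W ? Max : W;
  }
  Cap = Max;
  return true;
}

int main() {
  std::vector<Candidate> Cands = {{1, {0.5f, 2.0f}}, {2, {0.7f}}, {3, {1.0f}}};
  float Best = FLT_MAX; // stands in for HUGE_VALF
  unsigned BestReg = 0;
  for (const Candidate &C : Cands) {
    float W = Best;               // cap = current best
    if (!maxInterference(C, W))
      continue;                   // cannot beat the best candidate so far
    Best = W;
    BestReg = C.Reg;
  }
  std::printf("evict interference of %g from reg %u\n", Best, BestReg);
}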
+void RAGreedy::addThroughConstraints(InterferenceCache::Cursor Intf, + ArrayRef<unsigned> Blocks) { const unsigned GroupSize = 8; SpillPlacement::BlockConstraint BCS[GroupSize]; - unsigned B = 0; - TransparentBlocks.clear(); + unsigned TBS[GroupSize]; + unsigned B = 0, T = 0; - ArrayRef<unsigned> ThroughBlocks = SA->getThroughBlocks(); - for (unsigned i = 0; i != ThroughBlocks.size(); ++i) { - unsigned Number = ThroughBlocks[i]; - assert(B < GroupSize && "Array overflow"); - BCS[B].Number = Number; + for (unsigned i = 0; i != Blocks.size(); ++i) { + unsigned Number = Blocks[i]; Intf.moveToBlock(Number); if (!Intf.hasInterference()) { - TransparentBlocks.push_back(Number); + assert(T < GroupSize && "Array overflow"); + TBS[T] = Number; + if (++T == GroupSize) { + SpillPlacer->addLinks(ArrayRef<unsigned>(TBS, T)); + T = 0; + } continue; } + assert(B < GroupSize && "Array overflow"); + BCS[B].Number = Number; + // Interference for the live-in value. if (Intf.first() <= Indexes->getMBBStartIdx(Number)) BCS[B].Entry = SpillPlacement::MustSpill; @@ -509,30 +613,94 @@ bool RAGreedy::addSplitConstraints(unsigned PhysReg, float &Cost) { ArrayRef<SpillPlacement::BlockConstraint> Array(BCS, B); SpillPlacer->addConstraints(Array); B = 0; - // Abort early when all hope is lost. - if (SpillPlacer->getPositiveNodes() == 0) - return false; } } ArrayRef<SpillPlacement::BlockConstraint> Array(BCS, B); SpillPlacer->addConstraints(Array); - if (SpillPlacer->getPositiveNodes() == 0) - return false; + SpillPlacer->addLinks(ArrayRef<unsigned>(TBS, T)); +} - // There is still some positive bias. Add all the links. - SpillPlacer->addLinks(TransparentBlocks); - return true; +void RAGreedy::growRegion(GlobalSplitCandidate &Cand, + InterferenceCache::Cursor Intf) { + // Keep track of through blocks that have not been added to SpillPlacer. + BitVector Todo = SA->getThroughBlocks(); + SmallVectorImpl<unsigned> &ActiveBlocks = Cand.ActiveBlocks; + unsigned AddedTo = 0; +#ifndef NDEBUG + unsigned Visited = 0; +#endif + + for (;;) { + ArrayRef<unsigned> NewBundles = SpillPlacer->getRecentPositive(); + if (NewBundles.empty()) + break; + // Find new through blocks in the periphery of PrefRegBundles. + for (int i = 0, e = NewBundles.size(); i != e; ++i) { + unsigned Bundle = NewBundles[i]; + // Look at all blocks connected to Bundle in the full graph. + ArrayRef<unsigned> Blocks = Bundles->getBlocks(Bundle); + for (ArrayRef<unsigned>::iterator I = Blocks.begin(), E = Blocks.end(); + I != E; ++I) { + unsigned Block = *I; + if (!Todo.test(Block)) + continue; + Todo.reset(Block); + // This is a new through block. Add it to SpillPlacer later. + ActiveBlocks.push_back(Block); +#ifndef NDEBUG + ++Visited; +#endif + } + } + // Any new blocks to add? + if (ActiveBlocks.size() > AddedTo) { + ArrayRef<unsigned> Add(&ActiveBlocks[AddedTo], + ActiveBlocks.size() - AddedTo); + addThroughConstraints(Intf, Add); + AddedTo = ActiveBlocks.size(); + } + // Perhaps iterating can enable more bundles? + SpillPlacer->iterate(); + } + DEBUG(dbgs() << ", v=" << Visited); } +/// calcSpillCost - Compute how expensive it would be to split the live range in +/// SA around all use blocks instead of forming bundle regions. 
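// [Editor's aside] addThroughConstraints() above hands blocks to the spill
// placer in fixed-size batches of GroupSize = 8, flushing whenever a buffer
// fills and once more at the end. The buffering pattern in isolation
// (addLinks is a stand-in for the SpillPlacer call):
#include <cstdio>

static void addLinks(const unsigned *Blocks, unsigned N) {
  std::printf("flushing %u blocks\n", N);
}

static void batchBlocks(const unsigned *Blocks, unsigned Size) {
  const unsigned GroupSize = 8;
  unsigned TBS[GroupSize];
  unsigned T = 0;
  for (unsigned i = 0; i != Size; ++i) {
    TBS[T] = Blocks[i];
    if (++T == GroupSize) { // buffer full: flush a complete group
      addLinks(TBS, T);
      T = 0;
    }
  }
  addLinks(TBS, T);         // flush the partial final group (may be empty)
}

int main() {
  unsigned Blocks[19];
  for (unsigned i = 0; i != 19; ++i)
    Blocks[i] = i;
  batchBlocks(Blocks, 19);  // prints 8, 8, then 3
}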
+float RAGreedy::calcSpillCost() { + float Cost = 0; + const LiveInterval &LI = SA->getParent(); + ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks(); + for (unsigned i = 0; i != UseBlocks.size(); ++i) { + const SplitAnalysis::BlockInfo &BI = UseBlocks[i]; + unsigned Number = BI.MBB->getNumber(); + // We normally only need one spill instruction - a load or a store. + Cost += SpillPlacer->getBlockFrequency(Number); + + // Unless the value is redefined in the block. + if (BI.LiveIn && BI.LiveOut) { + SlotIndex Start, Stop; + tie(Start, Stop) = Indexes->getMBBRange(Number); + LiveInterval::const_iterator I = LI.find(Start); + assert(I != LI.end() && "Expected live-in value"); + // Is there a different live-out value? If so, we need an extra spill + // instruction. + if (I->end < Stop) + Cost += SpillPlacer->getBlockFrequency(Number); + } + } + return Cost; +} /// calcGlobalSplitCost - Return the global split cost of following the split /// pattern in LiveBundles. This cost should be added to the local cost of the /// interference pattern in SplitConstraints. /// -float RAGreedy::calcGlobalSplitCost(unsigned PhysReg, - const BitVector &LiveBundles) { +float RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand, + InterferenceCache::Cursor Intf) { float GlobalCost = 0; + const BitVector &LiveBundles = Cand.LiveBundles; ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks(); for (unsigned i = 0; i != UseBlocks.size(); ++i) { const SplitAnalysis::BlockInfo &BI = UseBlocks[i]; @@ -549,11 +717,8 @@ float RAGreedy::calcGlobalSplitCost(unsigned PhysReg, GlobalCost += Ins * SpillPlacer->getBlockFrequency(BC.Number); } - InterferenceCache::Cursor Intf(IntfCache, PhysReg); - ArrayRef<unsigned> ThroughBlocks = SA->getThroughBlocks(); - SplitConstraints.resize(UseBlocks.size() + ThroughBlocks.size()); - for (unsigned i = 0; i != ThroughBlocks.size(); ++i) { - unsigned Number = ThroughBlocks[i]; + for (unsigned i = 0, e = Cand.ActiveBlocks.size(); i != e; ++i) { + unsigned Number = Cand.ActiveBlocks[i]; bool RegIn = LiveBundles[Bundles->getBundle(Number, 0)]; bool RegOut = LiveBundles[Bundles->getBundle(Number, 1)]; if (!RegIn && !RegOut) @@ -578,23 +743,25 @@ float RAGreedy::calcGlobalSplitCost(unsigned PhysReg, /// avoiding interference. The 'stack' interval is the complement constructed by /// SplitEditor. It will contain the rest. /// -void RAGreedy::splitAroundRegion(LiveInterval &VirtReg, unsigned PhysReg, - const BitVector &LiveBundles, +void RAGreedy::splitAroundRegion(LiveInterval &VirtReg, + GlobalSplitCandidate &Cand, SmallVectorImpl<LiveInterval*> &NewVRegs) { + const BitVector &LiveBundles = Cand.LiveBundles; + DEBUG({ - dbgs() << "Splitting around region for " << PrintReg(PhysReg, TRI) + dbgs() << "Splitting around region for " << PrintReg(Cand.PhysReg, TRI) << " with bundles"; for (int i = LiveBundles.find_first(); i>=0; i = LiveBundles.find_next(i)) dbgs() << " EB#" << i; dbgs() << ".\n"; }); - InterferenceCache::Cursor Intf(IntfCache, PhysReg); + InterferenceCache::Cursor Intf(IntfCache, Cand.PhysReg); LiveRangeEdit LREdit(VirtReg, NewVRegs, this); SE->reset(LREdit); // Create the main cross-block interval. - SE->openIntv(); + const unsigned MainIntv = SE->openIntv(); // First add all defs that are live out of a block. 
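// [Editor's aside] The cost model of calcSpillCost() above: one reload or
// store per block with uses, plus one more when a live-in value is redefined
// before the block's live-out. A standalone restatement (UseBlock and the
// frequency values are illustrative, not LLVM types):
#include <cstdio>

struct UseBlock {
  float Freq;     // execution frequency of the block
  bool LiveIn;    // value live on entry
  bool LiveOut;   // value live on exit
  bool Redefined; // a different value is live-out than was live-in
};

static float estimateSpillCost(const UseBlock *Blocks, unsigned N) {
  float Cost = 0;
  for (unsigned i = 0; i != N; ++i) {
    Cost += Blocks[i].Freq;   // normally one spill instruction per use block
    if (Blocks[i].LiveIn && Blocks[i].LiveOut && Blocks[i].Redefined)
      Cost += Blocks[i].Freq; // a second one when the value is redefined
  }
  return Cost;
}

int main() {
  UseBlock Blocks[] = {{1.0f, true, true, false}, {4.0f, true, true, true}};
  std::printf("spill cost = %g\n", estimateSpillCost(Blocks, 2)); // 1+4+4 = 9
}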
ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks(); @@ -603,6 +770,14 @@ void RAGreedy::splitAroundRegion(LiveInterval &VirtReg, unsigned PhysReg, bool RegIn = LiveBundles[Bundles->getBundle(BI.MBB->getNumber(), 0)]; bool RegOut = LiveBundles[Bundles->getBundle(BI.MBB->getNumber(), 1)]; + // Create separate intervals for isolated blocks with multiple uses. + if (!RegIn && !RegOut && BI.FirstUse != BI.LastUse) { + DEBUG(dbgs() << "BB#" << BI.MBB->getNumber() << " isolated.\n"); + SE->splitSingleBlock(BI); + SE->selectIntv(MainIntv); + continue; + } + // Should the register be live out? if (!BI.LiveOut || !RegOut) continue; @@ -628,7 +803,7 @@ void RAGreedy::splitAroundRegion(LiveInterval &VirtReg, unsigned PhysReg, DEBUG(dbgs() << ", no interference"); if (!BI.LiveThrough) { DEBUG(dbgs() << ", not live-through.\n"); - SE->useIntv(SE->enterIntvBefore(BI.Def), Stop); + SE->useIntv(SE->enterIntvBefore(BI.FirstUse), Stop); continue; } if (!RegIn) { @@ -645,10 +820,10 @@ void RAGreedy::splitAroundRegion(LiveInterval &VirtReg, unsigned PhysReg, // Block has interference. DEBUG(dbgs() << ", interference to " << Intf.last()); - if (!BI.LiveThrough && Intf.last() <= BI.Def) { + if (!BI.LiveThrough && Intf.last() <= BI.FirstUse) { // The interference doesn't reach the outgoing segment. - DEBUG(dbgs() << " doesn't affect def from " << BI.Def << '\n'); - SE->useIntv(BI.Def, Stop); + DEBUG(dbgs() << " doesn't affect def from " << BI.FirstUse << '\n'); + SE->useIntv(BI.FirstUse, Stop); continue; } @@ -704,7 +879,7 @@ void RAGreedy::splitAroundRegion(LiveInterval &VirtReg, unsigned PhysReg, DEBUG(dbgs() << ", no interference"); if (!BI.LiveThrough) { DEBUG(dbgs() << ", killed in block.\n"); - SE->useIntv(Start, SE->leaveIntvAfter(BI.Kill)); + SE->useIntv(Start, SE->leaveIntvAfter(BI.LastUse)); continue; } if (!RegOut) { @@ -737,10 +912,10 @@ void RAGreedy::splitAroundRegion(LiveInterval &VirtReg, unsigned PhysReg, // Block has interference. DEBUG(dbgs() << ", interference from " << Intf.first()); - if (!BI.LiveThrough && Intf.first() >= BI.Kill) { + if (!BI.LiveThrough && Intf.first() >= BI.LastUse) { // The interference doesn't reach the outgoing segment. - DEBUG(dbgs() << " doesn't affect kill at " << BI.Kill << '\n'); - SE->useIntv(Start, BI.Kill); + DEBUG(dbgs() << " doesn't affect kill at " << BI.LastUse << '\n'); + SE->useIntv(Start, BI.LastUse); continue; } @@ -766,9 +941,8 @@ void RAGreedy::splitAroundRegion(LiveInterval &VirtReg, unsigned PhysReg, } // Handle live-through blocks. - ArrayRef<unsigned> ThroughBlocks = SA->getThroughBlocks(); - for (unsigned i = 0; i != ThroughBlocks.size(); ++i) { - unsigned Number = ThroughBlocks[i]; + for (unsigned i = 0, e = Cand.ActiveBlocks.size(); i != e; ++i) { + unsigned Number = Cand.ActiveBlocks[i]; bool RegIn = LiveBundles[Bundles->getBundle(Number, 0)]; bool RegOut = LiveBundles[Bundles->getBundle(Number, 1)]; DEBUG(dbgs() << "Live through BB#" << Number << '\n'); @@ -787,70 +961,113 @@ void RAGreedy::splitAroundRegion(LiveInterval &VirtReg, unsigned PhysReg, SE->enterIntvAtEnd(*MBB); } - SE->closeIntv(); - - // FIXME: Should we be more aggressive about splitting the stack region into - // per-block segments? The current approach allows the stack region to - // separate into connected components. Some components may be allocatable. 
- SE->finish(); ++NumGlobalSplits; + SmallVector<unsigned, 8> IntvMap; + SE->finish(&IntvMap); + DebugVars->splitRegister(VirtReg.reg, LREdit.regs()); + + LRStage.resize(MRI->getNumVirtRegs()); + unsigned OrigBlocks = SA->getNumLiveBlocks(); + + // Sort out the new intervals created by splitting. We get four kinds: + // - Remainder intervals should not be split again. + // - Candidate intervals can be assigned to Cand.PhysReg. + // - Block-local splits are candidates for local splitting. + // - DCE leftovers should go back on the queue. + for (unsigned i = 0, e = LREdit.size(); i != e; ++i) { + unsigned Reg = LREdit.get(i)->reg; + + // Ignore old intervals from DCE. + if (LRStage[Reg] != RS_New) + continue; + + // Remainder interval. Don't try splitting again, spill if it doesn't + // allocate. + if (IntvMap[i] == 0) { + LRStage[Reg] = RS_Global; + continue; + } + + // Main interval. Allow repeated splitting as long as the number of live + // blocks is strictly decreasing. + if (IntvMap[i] == MainIntv) { + if (SA->countLiveBlocks(LREdit.get(i)) >= OrigBlocks) { + DEBUG(dbgs() << "Main interval covers the same " << OrigBlocks + << " blocks as original.\n"); + // Don't allow repeated splitting as a safeguard against looping. + LRStage[Reg] = RS_Global; + } + continue; + } + + // Other intervals are treated as new. This includes local intervals created + // for blocks with multiple uses, and anything created by DCE. + } + if (VerifyEnabled) MF->verify(this, "After splitting live range around region"); } unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order, SmallVectorImpl<LiveInterval*> &NewVRegs) { - BitVector LiveBundles, BestBundles; - float BestCost = 0; - unsigned BestReg = 0; + float BestCost = Hysteresis * calcSpillCost(); + DEBUG(dbgs() << "Cost of isolating all blocks = " << BestCost << '\n'); + const unsigned NoCand = ~0u; + unsigned BestCand = NoCand; Order.rewind(); for (unsigned Cand = 0; unsigned PhysReg = Order.next(); ++Cand) { if (GlobalCand.size() <= Cand) GlobalCand.resize(Cand+1); - GlobalCand[Cand].PhysReg = PhysReg; + GlobalCand[Cand].reset(PhysReg); - SpillPlacer->prepare(LiveBundles); + SpillPlacer->prepare(GlobalCand[Cand].LiveBundles); float Cost; - if (!addSplitConstraints(PhysReg, Cost)) { - DEBUG(dbgs() << PrintReg(PhysReg, TRI) << "\tno positive bias\n"); + InterferenceCache::Cursor Intf(IntfCache, PhysReg); + if (!addSplitConstraints(Intf, Cost)) { + DEBUG(dbgs() << PrintReg(PhysReg, TRI) << "\tno positive bundles\n"); continue; } - DEBUG(dbgs() << PrintReg(PhysReg, TRI) << "\tbiased = " - << SpillPlacer->getPositiveNodes() << ", static = " << Cost); + DEBUG(dbgs() << PrintReg(PhysReg, TRI) << "\tstatic = " << Cost); - if (BestReg && Cost >= BestCost) { - DEBUG(dbgs() << " worse than " << PrintReg(BestReg, TRI) << '\n'); + if (Cost >= BestCost) { + DEBUG({ + if (BestCand == NoCand) + dbgs() << " worse than no bundles\n"; + else + dbgs() << " worse than " + << PrintReg(GlobalCand[BestCand].PhysReg, TRI) << '\n'; + }); continue; } + growRegion(GlobalCand[Cand], Intf); SpillPlacer->finish(); // No live bundles, defer to splitSingleBlocks().
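// [Editor's aside] The classification above keyed on IntvMap: interval 0 is
// the remainder (never re-split), the main interval may be split again only
// while its live-block count keeps shrinking, and everything else stays
// RS_New. A standalone restatement (the struct and stage values are
// simplified stand-ins):
#include <cstdio>
#include <vector>

enum Stage { RS_New, RS_Global };

struct NewRange { unsigned IntvIdx; unsigned LiveBlocks; Stage S; };

static void classify(std::vector<NewRange> &Ranges, unsigned MainIntv,
                     unsigned OrigBlocks) {
  for (NewRange &R : Ranges) {
    if (R.IntvIdx == 0) {            // remainder: spill next time, don't split
      R.S = RS_Global;
    } else if (R.IntvIdx == MainIntv && R.LiveBlocks >= OrigBlocks) {
      R.S = RS_Global;               // no progress: forbid repeated splitting
    }                                // otherwise stay RS_New and compete again
  }
}

int main() {
  std::vector<NewRange> Ranges = {{0, 5, RS_New}, {1, 5, RS_New}, {2, 1, RS_New}};
  classify(Ranges, /*MainIntv=*/1, /*OrigBlocks=*/5);
  for (const NewRange &R : Ranges)
    std::printf("interval %u -> %s\n", R.IntvIdx,
                R.S == RS_Global ? "RS_Global" : "RS_New");
}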
- if (!LiveBundles.any()) { + if (!GlobalCand[Cand].LiveBundles.any()) { DEBUG(dbgs() << " no bundles.\n"); continue; } - Cost += calcGlobalSplitCost(PhysReg, LiveBundles); + Cost += calcGlobalSplitCost(GlobalCand[Cand], Intf); DEBUG({ dbgs() << ", total = " << Cost << " with bundles"; - for (int i = LiveBundles.find_first(); i>=0; i = LiveBundles.find_next(i)) + for (int i = GlobalCand[Cand].LiveBundles.find_first(); i>=0; + i = GlobalCand[Cand].LiveBundles.find_next(i)) dbgs() << " EB#" << i; dbgs() << ".\n"; }); - if (!BestReg || Cost < BestCost) { - BestReg = PhysReg; - BestCost = 0.98f * Cost; // Prevent rounding effects. - BestBundles.swap(LiveBundles); + if (Cost < BestCost) { + BestCand = Cand; + BestCost = Hysteresis * Cost; // Prevent rounding effects. } } - if (!BestReg) + if (BestCand == NoCand) return 0; - splitAroundRegion(VirtReg, BestReg, BestBundles, NewVRegs); - setStage(NewVRegs.begin(), NewVRegs.end(), RS_Region); + splitAroundRegion(VirtReg, GlobalCand[BestCand], NewVRegs); return 0; } @@ -913,47 +1130,6 @@ void RAGreedy::calcGapWeights(unsigned PhysReg, } } -/// getPrevMappedIndex - Return the slot index of the last non-copy instruction -/// before MI that has a slot index. If MI is the first mapped instruction in -/// its block, return the block start index instead. -/// -SlotIndex RAGreedy::getPrevMappedIndex(const MachineInstr *MI) { - assert(MI && "Missing MachineInstr"); - const MachineBasicBlock *MBB = MI->getParent(); - MachineBasicBlock::const_iterator B = MBB->begin(), I = MI; - while (I != B) - if (!(--I)->isDebugValue() && !I->isCopy()) - return Indexes->getInstructionIndex(I); - return Indexes->getMBBStartIdx(MBB); -} - -/// calcPrevSlots - Fill in the PrevSlot array with the index of the previous -/// real non-copy instruction for each instruction in SA->UseSlots. -/// -void RAGreedy::calcPrevSlots() { - const SmallVectorImpl<SlotIndex> &Uses = SA->UseSlots; - PrevSlot.clear(); - PrevSlot.reserve(Uses.size()); - for (unsigned i = 0, e = Uses.size(); i != e; ++i) { - const MachineInstr *MI = Indexes->getInstructionFromIndex(Uses[i]); - PrevSlot.push_back(getPrevMappedIndex(MI).getDefIndex()); - } -} - -/// nextSplitPoint - Find the next index into SA->UseSlots > i such that it may -/// be beneficial to split before UseSlots[i]. -/// -/// 0 is always a valid split point -unsigned RAGreedy::nextSplitPoint(unsigned i) { - const SmallVectorImpl<SlotIndex> &Uses = SA->UseSlots; - const unsigned Size = Uses.size(); - assert(i != Size && "No split points after the end"); - // Allow split before i when Uses[i] is not adjacent to the previous use. - while (++i != Size && PrevSlot[i].getBaseIndex() <= Uses[i-1].getBaseIndex()) - ; - return i; -} - /// tryLocalSplit - Try to split VirtReg into smaller intervals inside its only /// basic block. /// @@ -981,11 +1157,27 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, dbgs() << '\n'; }); - // For every use, find the previous mapped non-copy instruction. - // We use this to detect valid split points, and to estimate new interval - // sizes. - calcPrevSlots(); + // Since we allow local split results to be split again, there is a risk of + // creating infinite loops. It is tempting to require that the new live + // ranges have fewer instructions than the original. That would guarantee + // convergence, but it is too strict. A live range with 3 instructions can be + // split 2+3 (including the COPY), and we want to allow that. + // + // Instead we use these rules: + // + // 1.
Allow any split for ranges with getStage() < RS_Local. (Except for the + // noop split, of course). + // 2. Require progress be made for ranges with getStage() >= RS_Local. All + // the new ranges must have fewer instructions than before the split. + // 3. New ranges with the same number of instructions are marked RS_Local, + // smaller ranges are marked RS_New. + // + // These rules allow a 3 -> 2+3 split once, which we need. They also prevent + // excessive splitting and infinite loops. + // + bool ProgressRequired = getStage(VirtReg) >= RS_Local; + // Best split candidate. unsigned BestBefore = NumGaps; unsigned BestAfter = 0; float BestDiff = 0; @@ -1003,13 +1195,11 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, // The new spill weight must be larger than any gap interference. // We will split before Uses[SplitBefore] and after Uses[SplitAfter]. - unsigned SplitBefore = 0, SplitAfter = nextSplitPoint(1) - 1; + unsigned SplitBefore = 0, SplitAfter = 1; // MaxGap should always be max(GapWeight[SplitBefore..SplitAfter-1]). // It is the spill weight that needs to be evicted. float MaxGap = GapWeight[0]; - for (unsigned i = 1; i != SplitAfter; ++i) - MaxGap = std::max(MaxGap, GapWeight[i]); for (;;) { // Live before/after split? @@ -1027,41 +1217,31 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, } // Should the interval be extended or shrunk? bool Shrink = true; - if (MaxGap < HUGE_VALF) { - // Estimate the new spill weight. - // - // Each instruction reads and writes the register, except the first - // instr doesn't read when !FirstLive, and the last instr doesn't write - // when !LastLive. - // - // We will be inserting copies before and after, so the total number of - // reads and writes is 2 * EstUses. - // - const unsigned EstUses = 2*(SplitAfter - SplitBefore) + - 2*(LiveBefore + LiveAfter); - // Try to guess the size of the new interval. This should be trivial, - // but the slot index of an inserted copy can be a lot smaller than the - // instruction it is inserted before if there are many dead indexes - // between them. - // - // We measure the distance from the instruction before SplitBefore to - // get a conservative estimate. - // - // The final distance can still be different if inserting copies - // triggers a slot index renumbering. + // How many gaps would the new range have? + unsigned NewGaps = LiveBefore + SplitAfter - SplitBefore + LiveAfter; + + // Legally, without causing looping? + bool Legal = !ProgressRequired || NewGaps < NumGaps; + + if (Legal && MaxGap < HUGE_VALF) { + // Estimate the new spill weight. Each instruction reads or writes the + // register. Conservatively assume there are no read-modify-write + // instructions. // - const float EstWeight = normalizeSpillWeight(blockFreq * EstUses, - PrevSlot[SplitBefore].distance(Uses[SplitAfter])); + // Try to guess the size of the new interval. + const float EstWeight = normalizeSpillWeight(blockFreq * (NewGaps + 1), + Uses[SplitBefore].distance(Uses[SplitAfter]) + + (LiveBefore + LiveAfter)*SlotIndex::InstrDist); // Would this split be possible to allocate? // Never allocate all gaps, we wouldn't be making progress. 
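// [Editor's aside] The progress rule above in isolation: a candidate local
// split covering uses [SplitBefore, SplitAfter] produces a range with
// NewGaps = LiveBefore + (SplitAfter - SplitBefore) + LiveAfter gaps, and when
// progress is required the split is only legal if NewGaps < NumGaps.
// (Standalone arithmetic; the names mirror the variables above.)
#include <cstdio>

static bool legalSplit(unsigned NumGaps, unsigned SplitBefore,
                       unsigned SplitAfter, bool LiveIn, bool LiveOut,
                       bool ProgressRequired) {
  // A copy lands before the range when the value is live ahead of the split
  // point, and after it when it is live beyond; each copy adds one gap.
  bool LiveBefore = SplitBefore != 0 || LiveIn;
  bool LiveAfter = SplitAfter != NumGaps || LiveOut;
  unsigned NewGaps = LiveBefore + SplitAfter - SplitBefore + LiveAfter;
  return !ProgressRequired || NewGaps < NumGaps;
}

int main() {
  // 4 gaps (5 uses), live-in and live-out, progress required:
  // splitting around uses 1..2 yields 1 + (2-1) + 1 = 3 gaps < 4: legal.
  std::printf("%d\n", legalSplit(4, 1, 2, true, true, true)); // 1
  // Covering everything (0..4) yields 1 + (4-0) + 1 = 6 gaps: no progress.
  std::printf("%d\n", legalSplit(4, 0, 4, true, true, true)); // 0
}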
- float Diff = EstWeight - MaxGap; - DEBUG(dbgs() << " w=" << EstWeight << " d=" << Diff); - if (Diff > 0) { + DEBUG(dbgs() << " w=" << EstWeight); + if (EstWeight * Hysteresis >= MaxGap) { Shrink = false; + float Diff = EstWeight - MaxGap; if (Diff > BestDiff) { DEBUG(dbgs() << " (best)"); - BestDiff = Diff; + BestDiff = Hysteresis * Diff; BestBefore = SplitBefore; BestAfter = SplitAfter; } @@ -1070,8 +1250,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, // Try to shrink. if (Shrink) { - SplitBefore = nextSplitPoint(SplitBefore); - if (SplitBefore < SplitAfter) { + if (++SplitBefore < SplitAfter) { DEBUG(dbgs() << " shrink\n"); // Recompute the max when necessary. if (GapWeight[SplitBefore - 1] >= MaxGap) { @@ -1091,10 +1270,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, } DEBUG(dbgs() << " extend\n"); - for (unsigned e = nextSplitPoint(SplitAfter + 1) - 1; - SplitAfter != e; ++SplitAfter) - MaxGap = std::max(MaxGap, GapWeight[SplitAfter]); - continue; + MaxGap = std::max(MaxGap, GapWeight[SplitAfter++]); } } @@ -1113,9 +1289,27 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, SlotIndex SegStart = SE->enterIntvBefore(Uses[BestBefore]); SlotIndex SegStop = SE->leaveIntvAfter(Uses[BestAfter]); SE->useIntv(SegStart, SegStop); - SE->closeIntv(); - SE->finish(); - setStage(NewVRegs.begin(), NewVRegs.end(), RS_Local); + SmallVector<unsigned, 8> IntvMap; + SE->finish(&IntvMap); + DebugVars->splitRegister(VirtReg.reg, LREdit.regs()); + + // If the new range has the same number of instructions as before, mark it as + // RS_Local so the next split will be forced to make progress. Otherwise, + // leave the new intervals as RS_New so they can compete. + bool LiveBefore = BestBefore != 0 || BI.LiveIn; + bool LiveAfter = BestAfter != NumGaps || BI.LiveOut; + unsigned NewGaps = LiveBefore + BestAfter - BestBefore + LiveAfter; + if (NewGaps >= NumGaps) { + DEBUG(dbgs() << "Tagging non-progress ranges: "); + assert(!ProgressRequired && "Didn't make progress when it was required."); + LRStage.resize(MRI->getNumVirtRegs()); + for (unsigned i = 0, e = IntvMap.size(); i != e; ++i) + if (IntvMap[i] == 1) { + LRStage[LREdit.get(i)->reg] = RS_Local; + DEBUG(dbgs() << PrintReg(LREdit.get(i)->reg)); + } + DEBUG(dbgs() << '\n'); + } ++NumLocalSplits; return 0; @@ -1141,30 +1335,36 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order, // Don't iterate global splitting. // Move straight to spilling if this range was produced by a global split. - LiveRangeStage Stage = getStage(VirtReg); - if (Stage >= RS_Block) + if (getStage(VirtReg) >= RS_Global) return 0; SA->analyze(&VirtReg); - // First try to split around a region spanning multiple blocks. - if (Stage < RS_Region) { - unsigned PhysReg = tryRegionSplit(VirtReg, Order, NewVRegs); - if (PhysReg || !NewVRegs.empty()) + // FIXME: SplitAnalysis may repair broken live ranges coming from the + // coalescer. That may cause the range to become allocatable which means that + // tryRegionSplit won't be making progress. This check should be replaced with + // an assertion when the coalescer is fixed. + if (SA->didRepairRange()) { + // VirtReg has changed, so all cached queries are invalid. + invalidateVirtRegs(); + if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs)) return PhysReg; } + // First try to split around a region spanning multiple blocks. 
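// [Editor's aside] The Hysteresis = 0.98f trick used above (BestCost and
// BestDiff are stored pre-scaled): a challenger must beat the incumbent by
// about 2%, so tiny floating-point differences cannot make the allocator
// oscillate between nearly identical candidates. Standalone form:
#include <cstdio>

static const float Hysteresis = 0.98f;

struct Best { float Cost; bool Valid; };

static bool challenge(Best &B, float Cost) {
  if (B.Valid && Cost >= B.Cost) // must be clearly cheaper to win
    return false;
  B.Cost = Hysteresis * Cost;    // store with a 2% margin in its favor
  B.Valid = true;
  return true;
}

int main() {
  Best B = {0, false};
  std::printf("%d\n", challenge(B, 100.0f)); // 1: first candidate wins
  std::printf("%d\n", challenge(B, 99.0f));  // 0: 99 >= 0.98 * 100 = 98
  std::printf("%d\n", challenge(B, 97.0f));  // 1: clearly better than 98
}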
+ unsigned PhysReg = tryRegionSplit(VirtReg, Order, NewVRegs); + if (PhysReg || !NewVRegs.empty()) + return PhysReg; + // Then isolate blocks with multiple uses. - if (Stage < RS_Block) { - SplitAnalysis::BlockPtrSet Blocks; - if (SA->getMultiUseBlocks(Blocks)) { - LiveRangeEdit LREdit(VirtReg, NewVRegs, this); - SE->reset(LREdit); - SE->splitSingleBlocks(Blocks); - setStage(NewVRegs.begin(), NewVRegs.end(), RS_Block); - if (VerifyEnabled) - MF->verify(this, "After splitting live range around basic blocks"); - } + SplitAnalysis::BlockPtrSet Blocks; + if (SA->getMultiUseBlocks(Blocks)) { + LiveRangeEdit LREdit(VirtReg, NewVRegs, this); + SE->reset(LREdit); + SE->splitSingleBlocks(Blocks); + setStage(NewVRegs.begin(), NewVRegs.end(), RS_Global); + if (VerifyEnabled) + MF->verify(this, "After splitting live range around basic blocks"); } // Don't assign any physregs. @@ -1179,21 +1379,25 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order, unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg, SmallVectorImpl<LiveInterval*> &NewVRegs) { // First try assigning a free register. - AllocationOrder Order(VirtReg.reg, *VRM, ReservedRegs); - while (unsigned PhysReg = Order.next()) { - if (!checkPhysRegInterference(VirtReg, PhysReg)) - return PhysReg; - } - - if (unsigned PhysReg = tryEvict(VirtReg, Order, NewVRegs)) + AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo); + if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs)) return PhysReg; + LiveRangeStage Stage = getStage(VirtReg); + DEBUG(dbgs() << StageName[Stage] << '\n'); + + // Try to evict a less worthy live range, but only for ranges from the primary + // queue. The RS_Second ranges already failed to do this, and they should not + // get a second chance until they have been split. + if (Stage != RS_Second) + if (unsigned PhysReg = tryEvict(VirtReg, Order, NewVRegs)) + return PhysReg; + assert(NewVRegs.empty() && "Cannot append to existing NewVRegs"); // The first time we see a live range, don't try to split or spill. // Wait until the second time, when all smaller ranges have been allocated. // This gives a better picture of the interference to split around. - LiveRangeStage Stage = getStage(VirtReg); if (Stage == RS_First) { LRStage[VirtReg.reg] = RS_Second; DEBUG(dbgs() << "wait for second round\n"); @@ -1201,7 +1405,10 @@ unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg, return 0; } - assert(Stage < RS_Spill && "Cannot allocate after spilling"); + // If we couldn't allocate a register from spilling, there is probably some + // invalid inline assembly. The base class will report it. + if (Stage >= RS_Spill) + return ~0u; // Try splitting VirtReg or interferences.
unsigned PhysReg = trySplit(VirtReg, Order, NewVRegs); @@ -1234,12 +1441,12 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { RegAllocBase::init(getAnalysis<VirtRegMap>(), getAnalysis<LiveIntervals>()); Indexes = &getAnalysis<SlotIndexes>(); DomTree = &getAnalysis<MachineDominatorTree>(); - ReservedRegs = TRI->getReservedRegs(*MF); SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM)); Loops = &getAnalysis<MachineLoopInfo>(); LoopRanges = &getAnalysis<MachineLoopRanges>(); Bundles = &getAnalysis<EdgeBundles>(); SpillPlacer = &getAnalysis<SpillPlacement>(); + DebugVars = &getAnalysis<LiveDebugVariables>(); SA.reset(new SplitAnalysis(*VRM, *LIS, *Loops)); SE.reset(new SplitEditor(*SA, *LIS, *VRM, *DomTree)); @@ -1258,7 +1465,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { } // Write out new DBG_VALUE instructions. - getAnalysis<LiveDebugVariables>().emitDebugValues(VRM); + DebugVars->emitDebugValues(VRM); // The pass output is in VirtRegMap. Release all the transient data. releaseMemory(); diff --git a/lib/CodeGen/RegAllocLinearScan.cpp b/lib/CodeGen/RegAllocLinearScan.cpp index ef78949..0818034 100644 --- a/lib/CodeGen/RegAllocLinearScan.cpp +++ b/lib/CodeGen/RegAllocLinearScan.cpp @@ -16,6 +16,7 @@ #include "LiveRangeEdit.h" #include "VirtRegMap.h" #include "VirtRegRewriter.h" +#include "RegisterClassInfo.h" #include "Spiller.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Function.h" @@ -40,7 +41,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> -#include <set> #include <queue> #include <memory> #include <cmath> @@ -67,6 +67,11 @@ TrivCoalesceEnds("trivial-coalesce-ends", cl::desc("Attempt trivial coalescing of interval ends"), cl::init(false), cl::Hidden); +static cl::opt<bool> +AvoidWAWHazard("avoid-waw-hazard", + cl::desc("Avoid write-write hazards for some register classes"), + cl::init(false), cl::Hidden); + static RegisterRegAlloc linearscanRegAlloc("linearscan", "linear scan register allocator", createLinearScanRegisterAllocator); @@ -110,6 +115,7 @@ namespace { if (NumRecentlyUsedRegs > 0) RecentRegs.resize(NumRecentlyUsedRegs, 0); RecentNext = RecentRegs.begin(); + avoidWAW_ = 0; } typedef std::pair<LiveInterval*, LiveInterval::iterator> IntervalPtr; @@ -143,6 +149,7 @@ namespace { BitVector reservedRegs_; LiveIntervals* li_; MachineLoopInfo *loopInfo; + RegisterClassInfo RegClassInfo; /// handled_ - Intervals are added to the handled_ set in the order of their /// start value. This is used for backtracking. @@ -180,6 +187,9 @@ namespace { SmallVector<unsigned, 4> RecentRegs; SmallVector<unsigned, 4>::iterator RecentNext; + // Last write-after-write register written. + unsigned avoidWAW_; + // Record that we just picked this register. void recordRecentlyUsed(unsigned reg) { assert(reg != 0 && "Recently used register is NOREG!"); @@ -227,8 +237,8 @@ namespace { // Determine if we skip this register due to its being recently used. bool isRecentlyUsed(unsigned reg) const { - return std::find(RecentRegs.begin(), RecentRegs.end(), reg) != - RecentRegs.end(); + return reg == avoidWAW_ || + std::find(RecentRegs.begin(), RecentRegs.end(), reg) != RecentRegs.end(); } private: @@ -358,13 +368,10 @@ namespace { /// getFirstNonReservedPhysReg - return the first non-reserved physical /// register in the register class.
unsigned getFirstNonReservedPhysReg(const TargetRegisterClass *RC) { - TargetRegisterClass::iterator aoe = RC->allocation_order_end(*mf_); - TargetRegisterClass::iterator i = RC->allocation_order_begin(*mf_); - while (i != aoe && reservedRegs_.test(*i)) - ++i; - assert(i != aoe && "All registers reserved?!"); - return *i; - } + ArrayRef<unsigned> O = RegClassInfo.getOrder(RC); + assert(!O.empty() && "All registers reserved?!"); + return O.front(); + } void ComputeRelatedRegClasses(); @@ -516,6 +523,7 @@ bool RALinScan::runOnMachineFunction(MachineFunction &fn) { reservedRegs_ = tri_->getReservedRegs(fn); li_ = &getAnalysis<LiveIntervals>(); loopInfo = &getAnalysis<MachineLoopInfo>(); + RegClassInfo.runOnMachineFunction(fn); // We don't run the coalescer here because we have no reason to // interact with it. If the coalescer requires interaction, it @@ -792,7 +800,7 @@ void RALinScan::updateSpillWeights(std::vector<float> &Weights, // register class we are trying to allocate. Then add the weight to all // sub-registers of the super-register even if they are not aliases. // e.g. allocating for GR32, bh is not used, updating bl spill weight. - // bl should get the same spill weight otherwise it will be choosen + // bl should get the same spill weight; otherwise it will be chosen // as a spill candidate since spilling bh doesn't make ebx available. for (unsigned i = 0, e = Supers.size(); i != e; ++i) { for (const unsigned *sr = tri_->getSubRegisters(Supers[i]); *sr; ++sr) @@ -1116,6 +1124,12 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { active_.push_back(std::make_pair(cur, cur->begin())); handled_.push_back(cur); + // Remember physReg for avoiding a write-after-write hazard in the next + // instruction. + if (AvoidWAWHazard && + tri_->avoidWriteAfterWrite(mri_->getRegClass(cur->reg))) + avoidWAW_ = physReg; + // "Upgrade" the physical register since it has been allocated. UpgradeRegister(physReg); if (LiveInterval *NextReloadLI = hasNextReloadInterval(cur)) { @@ -1152,14 +1166,11 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { bool Found = false; std::vector<std::pair<unsigned,float> > RegsWeights; + ArrayRef<unsigned> Order = RegClassInfo.getOrder(RC); if (!minReg || SpillWeights[minReg] == HUGE_VALF) - for (TargetRegisterClass::iterator i = RC->allocation_order_begin(*mf_), - e = RC->allocation_order_end(*mf_); i != e; ++i) { - unsigned reg = *i; + for (unsigned i = 0; i != Order.size(); ++i) { + unsigned reg = Order[i]; float regWeight = SpillWeights[reg]; - // Don't even consider reserved regs. - if (reservedRegs_.test(reg)) - continue; // Skip recently allocated registers and reserved registers. if (minWeight > regWeight && !isRecentlyUsed(reg)) Found = true; @@ -1168,11 +1179,8 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { // If we didn't find a register that is spillable, try aliases? if (!Found) { - for (TargetRegisterClass::iterator i = RC->allocation_order_begin(*mf_), - e = RC->allocation_order_end(*mf_); i != e; ++i) { - unsigned reg = *i; - if (reservedRegs_.test(reg)) - continue; + for (unsigned i = 0; i != Order.size(); ++i) { + unsigned reg = Order[i]; // No need to worry about whether the alias register size < regsize of RC. // We are going to spill all registers that alias it anyway.
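// [Editor's aside] The -avoid-waw-hazard logic above: after assigning a
// register of a class with write-after-write stalls, remember it and treat it
// as "recently used" for the very next allocation, so back-to-back writes pick
// different registers when possible. A standalone model of that filter:
#include <cstdio>

struct Allocator {
  unsigned avoidWAW = 0; // last register written, 0 = none

  bool isRecentlyUsed(unsigned Reg) const { return Reg == avoidWAW; }

  // Pick the first register not blocked by the WAW filter, then remember it.
  unsigned allocate(const unsigned *Order, unsigned N) {
    for (unsigned i = 0; i != N; ++i)
      if (!isRecentlyUsed(Order[i]))
        return avoidWAW = Order[i];
    return 0;
  }
};

int main() {
  Allocator A;
  unsigned Order[] = {1, 2, 3};
  std::printf("%u\n", A.allocate(Order, 3)); // 1
  std::printf("%u\n", A.allocate(Order, 3)); // 2: reg 1 was just written
  std::printf("%u\n", A.allocate(Order, 3)); // 1 again: reg 2 is now avoided
}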
for (const unsigned* as = tri_->getAliasSet(reg); *as; ++as) @@ -1432,13 +1440,13 @@ unsigned RALinScan::getFreePhysReg(LiveInterval* cur, if (TargetRegisterInfo::isVirtualRegister(physReg) && vrm_->hasPhys(physReg)) physReg = vrm_->getPhys(physReg); - TargetRegisterClass::iterator I, E; - tie(I, E) = tri_->getAllocationOrder(RC, Hint.first, physReg, *mf_); - assert(I != E && "No allocatable register in this register class!"); + ArrayRef<unsigned> Order = tri_->getRawAllocationOrder(RC, Hint.first, + physReg, *mf_); + assert(!Order.empty() && "No allocatable register in this register class!"); // Scan for the first available register. - for (; I != E; ++I) { - unsigned Reg = *I; + for (unsigned i = 0; i != Order.size(); ++i) { + unsigned Reg = Order[i]; // Ignore "downgraded" registers. if (SkipDGRegs && DowngradedRegs.count(Reg)) continue; @@ -1446,7 +1454,7 @@ unsigned RALinScan::getFreePhysReg(LiveInterval* cur, if (reservedRegs_.test(Reg)) continue; // Skip recently allocated registers. - if (isRegAvail(Reg) && !isRecentlyUsed(Reg)) { + if (isRegAvail(Reg) && (!SkipDGRegs || !isRecentlyUsed(Reg))) { FreeReg = Reg; if (FreeReg < inactiveCounts.size()) FreeRegInactiveCount = inactiveCounts[FreeReg]; @@ -1468,8 +1476,8 @@ unsigned RALinScan::getFreePhysReg(LiveInterval* cur, // inactive count. Alkis found that this reduced register pressure very // slightly on X86 (in rev 1.94 of this file), though this should probably be // reevaluated now. - for (; I != E; ++I) { - unsigned Reg = *I; + for (unsigned i = 0; i != Order.size(); ++i) { + unsigned Reg = Order[i]; // Ignore "downgraded" registers. if (SkipDGRegs && DowngradedRegs.count(Reg)) continue; @@ -1477,7 +1485,8 @@ unsigned RALinScan::getFreePhysReg(LiveInterval* cur, if (reservedRegs_.test(Reg)) continue; if (isRegAvail(Reg) && Reg < inactiveCounts.size() && - FreeRegInactiveCount < inactiveCounts[Reg] && !isRecentlyUsed(Reg)) { + FreeRegInactiveCount < inactiveCounts[Reg] && + (!SkipDGRegs || !isRecentlyUsed(Reg))) { FreeReg = Reg; FreeRegInactiveCount = inactiveCounts[Reg]; if (FreeRegInactiveCount == MaxInactiveCount) @@ -1528,12 +1537,10 @@ unsigned RALinScan::getFreePhysReg(LiveInterval *cur) { return Preference; } - if (!DowngradedRegs.empty()) { - unsigned FreeReg = getFreePhysReg(cur, RC, MaxInactiveCount, inactiveCounts, - true); - if (FreeReg) - return FreeReg; - } + unsigned FreeReg = getFreePhysReg(cur, RC, MaxInactiveCount, inactiveCounts, + true); + if (FreeReg) + return FreeReg; return getFreePhysReg(cur, RC, MaxInactiveCount, inactiveCounts, false); } diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp index 1e1f1e0..605507f 100644 --- a/lib/CodeGen/RegAllocPBQP.cpp +++ b/lib/CodeGen/RegAllocPBQP.cpp @@ -222,10 +222,9 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilder::build(MachineFunction *mf, // Compute an initial allowed set for the current vreg. 
typedef std::vector<unsigned> VRAllowed; VRAllowed vrAllowed; - for (TargetRegisterClass::iterator aoItr = trc->allocation_order_begin(*mf), - aoEnd = trc->allocation_order_end(*mf); - aoItr != aoEnd; ++aoItr) { - unsigned preg = *aoItr; + ArrayRef<unsigned> rawOrder = trc->getRawAllocationOrder(*mf); + for (unsigned i = 0; i != rawOrder.size(); ++i) { + unsigned preg = rawOrder[i]; if (!reservedRegs.test(preg)) { vrAllowed.push_back(preg); } @@ -581,7 +580,7 @@ void RegAllocPBQP::finalizeAlloc() const { if (physReg == 0) { const TargetRegisterClass *liRC = mri->getRegClass(li->reg); - physReg = *liRC->allocation_order_begin(*mf); + physReg = liRC->getRawAllocationOrder(*mf).front(); } vrm->assignVirt2Phys(li->reg, physReg); diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp new file mode 100644 index 0000000..5a77e47 --- /dev/null +++ b/lib/CodeGen/RegisterClassInfo.cpp @@ -0,0 +1,112 @@ +//===-- RegisterClassInfo.cpp - Dynamic Register Class Info ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the RegisterClassInfo class which provides dynamic +// information about target register classes. Callee saved and reserved +// registers depend on calling conventions and other dynamic information, so +// some things cannot be determined statically. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "regalloc" +#include "RegisterClassInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/Target/TargetMachine.h" + +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +RegisterClassInfo::RegisterClassInfo() : Tag(0), MF(0), TRI(0), CalleeSaved(0) +{} + +void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) { + bool Update = false; + MF = &mf; + + // Allocate new array the first time we see a new target. + if (MF->getTarget().getRegisterInfo() != TRI) { + TRI = MF->getTarget().getRegisterInfo(); + RegClass.reset(new RCInfo[TRI->getNumRegClasses()]); + Update = true; + } + + // Does this MF have different CSRs? + const unsigned *CSR = TRI->getCalleeSavedRegs(MF); + if (Update || CSR != CalleeSaved) { + // Build a CSRNum map. Every CSR alias gets an entry pointing to the last + // overlapping CSR. + CSRNum.clear(); + CSRNum.resize(TRI->getNumRegs(), 0); + for (unsigned N = 0; unsigned Reg = CSR[N]; ++N) + for (const unsigned *AS = TRI->getOverlaps(Reg); + unsigned Alias = *AS; ++AS) + CSRNum[Alias] = N + 1; // 0 means no CSR, 1 means CalleeSaved[0], ... + Update = true; + } + CalleeSaved = CSR; + + // Different reserved registers? + BitVector RR = TRI->getReservedRegs(*MF); + if (RR != Reserved) + Update = true; + Reserved = RR; + + // Invalidate cached information from previous function. + if (Update) + ++Tag; +} + +/// compute - Compute the preferred allocation order for RC with reserved +/// registers filtered out. Volatile registers come first, followed by CSR +/// aliases ordered according to the CSR order specified by the target. +void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { + RCInfo &RCI = RegClass[RC->getID()]; + + // Raw register count, including all reserved regs.
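// [Editor's aside] The CSRNum map built above turns "does PhysReg overlap a
// callee-saved register?" into a single array lookup: every register that
// overlaps a CSR stores that CSR's 1-based index. A standalone model with a
// toy overlap relation (the register numbering and aliasing here are invented
// purely for illustration):
#include <cstdio>
#include <vector>

int main() {
  const unsigned NumRegs = 8;
  const unsigned CSR[] = {2, 5, 0}; // 0-terminated callee-saved list
  std::vector<unsigned char> CSRNum(NumRegs, 0);
  for (unsigned N = 0; unsigned Reg = CSR[N]; ++N) {
    // Toy aliasing: a register overlaps itself, and an even register also
    // overlaps the next odd one (think sub/super register pairs).
    CSRNum[Reg] = N + 1;            // 0 means no CSR, 1 means CSR[0], ...
    if (Reg % 2 == 0)
      CSRNum[Reg + 1] = N + 1;
  }
  for (unsigned R = 0; R != NumRegs; ++R)
    if (CSRNum[R])
      std::printf("reg %u overlaps CSR[%u] = %u\n", R, CSRNum[R] - 1,
                  CSR[CSRNum[R] - 1]);
}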
+ unsigned NumRegs = RC->getNumRegs(); + + if (!RCI.Order) + RCI.Order.reset(new unsigned[NumRegs]); + + unsigned N = 0; + SmallVector<unsigned, 16> CSRAlias; + + // FIXME: Once targets reserve registers instead of removing them from the + // allocation order, we can simply use begin/end here. + ArrayRef<unsigned> RawOrder = RC->getRawAllocationOrder(*MF); + for (unsigned i = 0; i != RawOrder.size(); ++i) { + unsigned PhysReg = RawOrder[i]; + // Remove reserved registers from the allocation order. + if (Reserved.test(PhysReg)) + continue; + if (CSRNum[PhysReg]) + // PhysReg aliases a CSR, save it for later. + CSRAlias.push_back(PhysReg); + else + RCI.Order[N++] = PhysReg; + } + RCI.NumRegs = N + CSRAlias.size(); + assert (RCI.NumRegs <= NumRegs && "Allocation order larger than regclass"); + + // CSR aliases go after the volatile registers, preserve the target's order. + std::copy(CSRAlias.begin(), CSRAlias.end(), &RCI.Order[N]); + + DEBUG({ + dbgs() << "AllocationOrder(" << RC->getName() << ") = ["; + for (unsigned I = 0; I != RCI.NumRegs; ++I) + dbgs() << ' ' << PrintReg(RCI.Order[I], TRI); + dbgs() << " ]\n"; + }); + + // RCI is now up-to-date. + RCI.Tag = Tag; +} + diff --git a/lib/CodeGen/RegisterClassInfo.h b/lib/CodeGen/RegisterClassInfo.h new file mode 100644 index 0000000..6f7d9c9 --- /dev/null +++ b/lib/CodeGen/RegisterClassInfo.h @@ -0,0 +1,121 @@ +//===-- RegisterClassInfo.h - Dynamic Register Class Info -*- C++ -*-------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the RegisterClassInfo class which provides dynamic +// information about target register classes. Callee saved and reserved +// registers depend on calling conventions and other dynamic information, so +// some things cannot be determined statically. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_REGISTERCLASSINFO_H +#define LLVM_CODEGEN_REGISTERCLASSINFO_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/Target/TargetRegisterInfo.h" + +namespace llvm { + +class RegisterClassInfo { + struct RCInfo { + unsigned Tag; + unsigned NumRegs; + OwningArrayPtr<unsigned> Order; + + RCInfo() : Tag(0), NumRegs(0) {} + operator ArrayRef<unsigned>() const { + return ArrayRef<unsigned>(Order.get(), NumRegs); + } + }; + + // Brief cached information for each register class. + OwningArrayPtr<RCInfo> RegClass; + + // Tag changes whenever cached information needs to be recomputed. An RCInfo + // entry is valid when its tag matches. + unsigned Tag; + + const MachineFunction *MF; + const TargetRegisterInfo *TRI; + + // Callee saved registers of last MF. Assumed to be valid until the next + // runOnMachineFunction() call. + const unsigned *CalleeSaved; + + // Map register number to CalleeSaved index + 1. + SmallVector<uint8_t, 4> CSRNum; + + // Reserved registers in the current MF. + BitVector Reserved; + + // Compute all information about RC. + void compute(const TargetRegisterClass *RC) const; + + // Return an up-to-date RCInfo for RC.
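// [Editor's aside] The ordering policy of compute() above, in standalone
// form: drop reserved registers, keep volatile registers first in raw order,
// and append CSR aliases at the end, preserving their relative order. (Plain
// vectors stand in for the raw allocation order and the bitmaps.)
#include <cstdio>
#include <vector>

static std::vector<unsigned> computeOrder(const std::vector<unsigned> &Raw,
                                          const std::vector<bool> &Reserved,
                                          const std::vector<bool> &AliasesCSR) {
  std::vector<unsigned> Order, CSRAlias;
  for (unsigned Reg : Raw) {
    if (Reserved[Reg])
      continue;                // reserved registers never get allocated
    if (AliasesCSR[Reg])
      CSRAlias.push_back(Reg); // save CSR aliases for the back of the order
    else
      Order.push_back(Reg);    // volatile registers are preferred
  }
  Order.insert(Order.end(), CSRAlias.begin(), CSRAlias.end());
  return Order;
}

int main() {
  std::vector<unsigned> Raw = {1, 2, 3, 4, 5};
  std::vector<bool> Reserved = {0, 0, 0, 1, 0, 0};   // reg 3 is reserved
  std::vector<bool> AliasesCSR = {0, 0, 1, 0, 0, 1}; // regs 2 and 5 alias CSRs
  for (unsigned Reg : computeOrder(Raw, Reserved, AliasesCSR))
    std::printf("%u ", Reg); // prints: 1 4 2 5
  std::printf("\n");
}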
+ const RCInfo &get(const TargetRegisterClass *RC) const { + const RCInfo &RCI = RegClass[RC->getID()]; + if (Tag != RCI.Tag) + compute(RC); + return RCI; + } + +public: + RegisterClassInfo(); + + /// runOnFunction - Prepare to answer questions about MF. This must be called + /// before any other methods are used. + void runOnMachineFunction(const MachineFunction &MF); + + /// getNumAllocatableRegs - Returns the number of actually allocatable + /// registers in RC in the current function. + unsigned getNumAllocatableRegs(const TargetRegisterClass *RC) const { + return get(RC).NumRegs; + } + + /// getOrder - Returns the preferred allocation order for RC. The order + /// contains no reserved registers, and registers that alias callee saved + /// registers come last. + ArrayRef<unsigned> getOrder(const TargetRegisterClass *RC) const { + return get(RC); + } + + /// getLastCalleeSavedAlias - Returns the last callee saved register that + /// overlaps PhysReg, or 0 if Reg doesn't overlap a CSR. + unsigned getLastCalleeSavedAlias(unsigned PhysReg) const { + assert(TargetRegisterInfo::isPhysicalRegister(PhysReg)); + if (unsigned N = CSRNum[PhysReg]) + return CalleeSaved[N-1]; + return 0; + } + + /// isReserved - Returns true when PhysReg is a reserved register. + /// + /// Reserved registers may belong to an allocatable register class, but the + /// target has explicitly requested that they are not used. + /// + bool isReserved(unsigned PhysReg) const { + return Reserved.test(PhysReg); + } + + /// isAllocatable - Returns true when PhysReg belongs to an allocatable + /// register class and it hasn't been reserved. + /// + /// Allocatable registers may show up in the allocation order of some virtual + /// register, so a register allocator needs to track its liveness and + /// availability. + bool isAllocatable(unsigned PhysReg) const { + return TRI->get(PhysReg).inAllocatableClass && !isReserved(PhysReg); + } +}; +} // end namespace llvm + +#endif + diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp index ebfe533..9e9a145 100644 --- a/lib/CodeGen/RegisterScavenging.cpp +++ b/lib/CodeGen/RegisterScavenging.cpp @@ -154,13 +154,16 @@ void RegScavenger::forward() { BitVector DeadRegs(NumPhysRegs); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg() || MO.isUndef()) + if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (!Reg || isReserved(Reg)) continue; if (MO.isUse()) { + // Ignore undef uses. + if (MO.isUndef()) + continue; // Two-address operands implicitly kill. if (!isPred && (MO.isKill() || MI->isRegTiedToDefOperand(i))) addRegWithSubRegs(KillRegs, Reg); @@ -178,12 +181,14 @@ void RegScavenger::forward() { // Verify uses and defs. for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg() || MO.isUndef()) + if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (!Reg || isReserved(Reg)) continue; if (MO.isUse()) { + if (MO.isUndef()) + continue; if (!isUsed(Reg)) { // Check if it's partial live: e.g. 
         // D0 = insert_subreg D0<undef>, S0
diff --git a/lib/CodeGen/RenderMachineFunction.cpp b/lib/CodeGen/RenderMachineFunction.cpp
index cbfd5a2..8b02ec4 100644
--- a/lib/CodeGen/RenderMachineFunction.cpp
+++ b/lib/CodeGen/RenderMachineFunction.cpp
@@ -47,7 +47,7 @@ outputFileSuffix("rmf-file-suffix",
 static cl::opt<std::string>
 machineFuncsToRender("rmf-funcs",
-                     cl::desc("Coma seperated list of functions to render"
+                     cl::desc("Comma separated list of functions to render"
                               ", or \"*\"."),
                      cl::init(""),
                      cl::Hidden);
@@ -434,8 +434,7 @@ namespace llvm {
            rcEnd = tri->regclass_end();
          rcItr != rcEnd; ++rcItr) {
       const TargetRegisterClass *trc = *rcItr;
-      unsigned capacity = std::distance(trc->allocation_order_begin(*mf),
-                                        trc->allocation_order_end(*mf));
+      unsigned capacity = trc->getRawAllocationOrder(*mf).size();
 
       if (capacity != 0)
         capacityMap[trc] = capacity;
@@ -482,8 +481,7 @@ namespace llvm {
            rcItr != rcEnd; ++rcItr) {
         const TargetRegisterClass *trc = *rcItr;
-        if (trc->allocation_order_begin(*mf) ==
-            trc->allocation_order_end(*mf))
+        if (trc->getRawAllocationOrder(*mf).empty())
           continue;
 
         unsigned worstAtI = getWorst(li->reg, trc);
diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp
index 3388889..f328493 100644
--- a/lib/CodeGen/ScheduleDAG.cpp
+++ b/lib/CodeGen/ScheduleDAG.cpp
@@ -19,17 +19,27 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include <climits>
 using namespace llvm;
 
+#ifndef NDEBUG
+cl::opt<bool> StressSchedOpt(
+  "stress-sched", cl::Hidden, cl::init(false),
+  cl::desc("Stress test instruction scheduling"));
+#endif
+
 ScheduleDAG::ScheduleDAG(MachineFunction &mf)
   : TM(mf.getTarget()),
     TII(TM.getInstrInfo()),
     TRI(TM.getRegisterInfo()),
     MF(mf), MRI(mf.getRegInfo()),
     EntrySU(), ExitSU() {
+#ifndef NDEBUG
+  StressSched = StressSchedOpt;
+#endif
 }
 
 ScheduleDAG::~ScheduleDAG() {}
@@ -307,6 +317,8 @@ void SUnit::dumpAll(const ScheduleDAG *G) const {
       if (I->isArtificial())
         dbgs() << " *";
       dbgs() << ": Latency=" << I->getLatency();
+      if (I->isAssignedRegDep())
+        dbgs() << " Reg=" << G->TRI->getName(I->getReg());
       dbgs() << "\n";
     }
   }
@@ -472,7 +484,7 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {
 #endif
 }
 
-/// AddPred - Updates the topological ordering to accomodate an edge
+/// AddPred - Updates the topological ordering to accommodate an edge
 /// to be added from SUnit X to SUnit Y.
 void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) {
   int UpperBound, LowerBound;
@@ -490,7 +502,7 @@ void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) {
   }
 }
 
-/// RemovePred - Updates the topological ordering to accomodate an
+/// RemovePred - Updates the topological ordering to accommodate
 /// an edge to be removed from the specified node N from the predecessors
 /// of the current node M.
 void ScheduleDAGTopologicalSort::RemovePred(SUnit *M, SUnit *N) {
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index f17023e..2363df4 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -35,8 +35,9 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
                                      const MachineDominatorTree &mdt)
   : ScheduleDAG(mf), MLI(mli), MDT(mdt), MFI(mf.getFrameInfo()),
     InstrItins(mf.getTarget().getInstrItineraryData()),
-    Defs(TRI->getNumRegs()), Uses(TRI->getNumRegs()), LoopRegs(MLI, MDT) {
-  DbgValueVec.clear();
+    Defs(TRI->getNumRegs()), Uses(TRI->getNumRegs()),
+    LoopRegs(MLI, MDT), FirstDbgValue(0) {
+  DbgValues.clear();
 }
 
 /// Run - perform scheduling.
@@ -120,7 +121,7 @@ static const Value *getUnderlyingObjectForInstr(const MachineInstr *MI,
       // such aliases.
       if (PSV->isAliased(MFI))
         return 0;
-
+
       MayAlias = PSV->mayAlias(MFI);
       return V;
     }
@@ -174,7 +175,7 @@ void ScheduleDAGInstrs::AddSchedBarrierDeps() {
   for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
        SE = BB->succ_end(); SI != SE; ++SI)
     for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(),
-         E = (*SI)->livein_end(); I != E; ++I) {
+           E = (*SI)->livein_end(); I != E; ++I) {
       unsigned Reg = *I;
       if (Seen.insert(Reg))
         Uses[Reg].push_back(&ExitSU);
@@ -200,11 +201,6 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
   std::map<const Value *, SUnit *> AliasMemDefs, NonAliasMemDefs;
   std::map<const Value *, std::vector<SUnit *> > AliasMemUses, NonAliasMemUses;
 
-  // Keep track of dangling debug references to registers.
-  std::vector<std::pair<MachineInstr*, unsigned> >
-    DanglingDebugValue(TRI->getNumRegs(),
-    std::make_pair(static_cast<MachineInstr*>(0), 0));
-
   // Check to see if the scheduler cares about latencies.
   bool UnitLatencies = ForceUnitLatencies();
 
@@ -214,26 +210,32 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
 
   // Remove any stale debug info; sometimes BuildSchedGraph is called again
   // without emitting the info from the previous call.
-  DbgValueVec.clear();
+  DbgValues.clear();
+  FirstDbgValue = NULL;
 
   // Model data dependencies between instructions being scheduled and the
   // ExitSU.
   AddSchedBarrierDeps();
 
+  for (int i = 0, e = TRI->getNumRegs(); i != e; ++i) {
+    assert(Defs[i].empty() && "Only BuildGraph should push/pop Defs");
+  }
+
   // Walk the list of instructions, from bottom moving up.
+  MachineInstr *PrevMI = NULL;
   for (MachineBasicBlock::iterator MII = InsertPos, MIE = Begin;
       MII != MIE; --MII) {
    MachineInstr *MI = prior(MII);
-    // DBG_VALUE does not have SUnit's built, so just remember these for later
-    // reinsertion.
+    if (MI && PrevMI) {
+      DbgValues.push_back(std::make_pair(PrevMI, MI));
+      PrevMI = NULL;
+    }
+
     if (MI->isDebugValue()) {
-      if (MI->getNumOperands()==3 && MI->getOperand(0).isReg() &&
-          MI->getOperand(0).getReg())
-        DanglingDebugValue[MI->getOperand(0).getReg()] =
-          std::make_pair(MI, DbgValueVec.size());
-      DbgValueVec.push_back(MI);
+      PrevMI = MI;
       continue;
     }
+
     const TargetInstrDesc &TID = MI->getDesc();
     assert(!TID.isTerminator() && !MI->isLabel() &&
            "Cannot schedule terminators or labels!");
@@ -257,13 +259,8 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
 
       assert(TRI->isPhysicalRegister(Reg) && "Virtual register encountered!");
 
-      if (MO.isDef() && DanglingDebugValue[Reg].first!=0) {
-        SU->DbgInstrList.push_back(DanglingDebugValue[Reg].first);
-        DbgValueVec[DanglingDebugValue[Reg].second] = 0;
-        DanglingDebugValue[Reg] = std::make_pair((MachineInstr*)0, 0);
-      }
-
       std::vector<SUnit *> &UseList = Uses[Reg];
+      // Defs are pushed in the order they are visited and never reordered.
       std::vector<SUnit *> &DefList = Defs[Reg];
       // Optionally add output and anti dependencies. For anti
       // dependencies we use a latency of 0 because for a multi-issue
@@ -283,9 +280,9 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
           DefSU->addPred(SDep(SU, Kind, AOLatency, /*Reg=*/Reg));
       }
       for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
-        std::vector<SUnit *> &DefList = Defs[*Alias];
-        for (unsigned i = 0, e = DefList.size(); i != e; ++i) {
-          SUnit *DefSU = DefList[i];
+        std::vector<SUnit *> &MemDefList = Defs[*Alias];
+        for (unsigned i = 0, e = MemDefList.size(); i != e; ++i) {
+          SUnit *DefSU = MemDefList[i];
           if (DefSU == &ExitSU)
             continue;
           if (DefSU != SU &&
@@ -371,7 +368,7 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
           // will be overlapped by work done outside the current
           // scheduling region.
           Latency -= std::min(Latency, Count);
-          // Add the artifical edge.
+          // Add the artificial edge.
           ExitSU.addPred(SDep(SU, SDep::Order, Latency,
                               /*Reg=*/0, /*isNormalMemory=*/false,
                               /*isMustAlias=*/false,
@@ -393,6 +390,16 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
         UseList.clear();
         if (!MO.isDead())
           DefList.clear();
+
+        // Calls will not be reordered because of chain dependencies (see
+        // below). Since call operands are dead, calls may continue to be added
+        // to the DefList making dependence checking quadratic in the size of
+        // the block. Instead, we leave only one call at the back of the
+        // DefList.
+        if (SU->isCall) {
+          while (!DefList.empty() && DefList.back()->isCall)
+            DefList.pop_back();
+        }
         DefList.push_back(SU);
       } else {
         UseList.push_back(SU);
@@ -411,11 +418,11 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
 #define STORE_LOAD_LATENCY 1
     unsigned TrueMemOrderLatency = 0;
     if (TID.isCall() || MI->hasUnmodeledSideEffects() ||
-        (MI->hasVolatileMemoryRef() &&
+        (MI->hasVolatileMemoryRef() &&
          (!TID.mayLoad() || !MI->isInvariantLoad(AA)))) {
       // Be conservative with these and add dependencies on all memory
       // references, even those that are known to not alias.
-      for (std::map<const Value *, SUnit *>::iterator I =
+      for (std::map<const Value *, SUnit *>::iterator I =
            NonAliasMemDefs.begin(), E = NonAliasMemDefs.end(); I != E; ++I) {
         I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
       }
@@ -458,9 +465,9 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
       // A store to a specific PseudoSourceValue. Add precise dependencies.
       // Record the def in MemDefs, first adding a dep if there is
      // an existing def.
-      std::map<const Value *, SUnit *>::iterator I =
+      std::map<const Value *, SUnit *>::iterator I =
        ((MayAlias) ? AliasMemDefs.find(V) : NonAliasMemDefs.find(V));
-      std::map<const Value *, SUnit *>::iterator IE =
+      std::map<const Value *, SUnit *>::iterator IE =
        ((MayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
       if (I != IE) {
         I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0, /*Reg=*/0,
@@ -513,39 +520,41 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
     if (MI->isInvariantLoad(AA)) {
       // Invariant load, no chain dependencies needed!
     } else {
-      if (const Value *V =
+      if (const Value *V =
           getUnderlyingObjectForInstr(MI, MFI, MayAlias)) {
         // A load from a specific PseudoSourceValue. Add precise dependencies.
-        std::map<const Value *, SUnit *>::iterator I =
+        std::map<const Value *, SUnit *>::iterator I =
           ((MayAlias) ? AliasMemDefs.find(V) : NonAliasMemDefs.find(V));
-        std::map<const Value *, SUnit *>::iterator IE =
+        std::map<const Value *, SUnit *>::iterator IE =
           ((MayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
         if (I != IE)
           I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0, /*Reg=*/0,
                                   /*isNormalMemory=*/true));
         if (MayAlias)
           AliasMemUses[V].push_back(SU);
-        else
+        else
           NonAliasMemUses[V].push_back(SU);
       } else {
         // A load with no underlying object. Depend on all
         // potentially aliasing stores.
-        for (std::map<const Value *, SUnit *>::iterator I =
+        for (std::map<const Value *, SUnit *>::iterator I =
               AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I)
           I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
-
+
         PendingLoads.push_back(SU);
         MayAlias = true;
       }
-
+
       // Add dependencies on alias and barrier chains, if needed.
       if (MayAlias && AliasChain)
         AliasChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
       if (BarrierChain)
         BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
-    }
+    }
    }
  }
 
+  if (PrevMI)
+    FirstDbgValue = PrevMI;
 
   for (int i = 0, e = TRI->getNumRegs(); i != e; ++i) {
     Defs[i].clear();
@@ -572,11 +581,11 @@ void ScheduleDAGInstrs::ComputeLatency(SUnit *SU) {
   }
 }
 
-void ScheduleDAGInstrs::ComputeOperandLatency(SUnit *Def, SUnit *Use,
+void ScheduleDAGInstrs::ComputeOperandLatency(SUnit *Def, SUnit *Use,
                                               SDep& dep) const {
   if (!InstrItins || InstrItins->isEmpty())
     return;
-
+
   // For a data dependency with a known register...
   if ((dep.getKind() != SDep::Data) || (dep.getReg() == 0))
     return;
@@ -655,39 +664,33 @@ MachineBasicBlock *ScheduleDAGInstrs::EmitSchedule() {
     BB->remove(I);
   }
 
-  // First reinsert any remaining debug_values; these are either constants,
-  // or refer to live-in registers. The beginning of the block is the right
-  // place for the latter. The former might reasonably be placed elsewhere
-  // using some kind of ordering algorithm, but right now it doesn't matter.
-  for (int i = DbgValueVec.size()-1; i>=0; --i)
-    if (DbgValueVec[i])
-      BB->insert(InsertPos, DbgValueVec[i]);
+  // If first instruction was a DBG_VALUE then put it back.
+  if (FirstDbgValue)
+    BB->insert(InsertPos, FirstDbgValue);
 
   // Then re-insert them according to the given schedule.
   for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
-    SUnit *SU = Sequence[i];
-    if (!SU) {
+    if (SUnit *SU = Sequence[i])
+      BB->insert(InsertPos, SU->getInstr());
+    else
       // Null SUnit* is a noop.
       EmitNoop();
-      continue;
-    }
-
-    BB->insert(InsertPos, SU->getInstr());
-    for (unsigned i = 0, e = SU->DbgInstrList.size() ; i < e ; ++i)
-      BB->insert(InsertPos, SU->DbgInstrList[i]);
   }
 
   // Update the Begin iterator, as the first instruction in the block
   // may have been scheduled later.
-  if (!DbgValueVec.empty()) {
-    for (int i = DbgValueVec.size()-1; i>=0; --i)
-      if (DbgValueVec[i]!=0) {
-        Begin = DbgValueVec[DbgValueVec.size()-1];
-        break;
-      }
-  } else if (!Sequence.empty())
+  if (!Sequence.empty())
     Begin = Sequence[0]->getInstr();
 
-  DbgValueVec.clear();
+  // Reinsert any remaining debug_values.
+  for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator
+         DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
+    std::pair<MachineInstr *, MachineInstr *> P = *prior(DI);
+    MachineInstr *DbgValue = P.first;
+    MachineInstr *OrigPrivMI = P.second;
+    BB->insertAfter(OrigPrivMI, DbgValue);
+  }
+  DbgValues.clear();
+  FirstDbgValue = NULL;
 
   return BB;
 }
diff --git a/lib/CodeGen/ScheduleDAGInstrs.h b/lib/CodeGen/ScheduleDAGInstrs.h
index c878287..8a4ea85 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.h
+++ b/lib/CodeGen/ScheduleDAGInstrs.h
@@ -110,10 +110,6 @@ namespace llvm {
     std::vector<std::vector<SUnit *> > Defs;
     std::vector<std::vector<SUnit *> > Uses;
 
-    /// DbgValueVec - Remember DBG_VALUEs that refer to a particular
-    /// register.
-    std::vector<MachineInstr *>DbgValueVec;
-
     /// PendingLoads - Remember where unknown loads are after the most recent
     /// unknown store, as we iterate. As with Defs and Uses, this is here
     /// to minimize construction/destruction.
@@ -128,6 +124,14 @@ namespace llvm {
     ///
     SmallSet<unsigned, 8> LoopLiveInRegs;
 
+  protected:
+
+    /// DbgValues - Remember instruction that precedes DBG_VALUE.
+    typedef std::vector<std::pair<MachineInstr *, MachineInstr *> >
+      DbgValueVector;
+    DbgValueVector DbgValues;
+    MachineInstr *FirstDbgValue;
+
   public:
     MachineBasicBlock::iterator Begin;    // The beginning of the range to
                                           // be scheduled. The range extends
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a9afec8..4ac590a 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -138,6 +138,10 @@ namespace {
     SDValue PromoteExtend(SDValue Op);
     bool PromoteLoad(SDValue Op);
 
+    void ExtendSetCCUses(SmallVector<SDNode*, 4> SetCCs,
+                         SDValue Trunc, SDValue ExtLoad, DebugLoc DL,
+                         ISD::NodeType ExtType);
+
     /// combine - call the node-specific routine that knows how to fold each
     /// particular type of node. If that doesn't do anything, try the
     /// target-specific DAG combines.
@@ -165,6 +169,8 @@ namespace {
     SDValue visitMULHS(SDNode *N);
     SDValue visitSMUL_LOHI(SDNode *N);
     SDValue visitUMUL_LOHI(SDNode *N);
+    SDValue visitSMULO(SDNode *N);
+    SDValue visitUMULO(SDNode *N);
     SDValue visitSDIVREM(SDNode *N);
     SDValue visitUDIVREM(SDNode *N);
     SDValue visitAND(SDNode *N);
@@ -529,7 +535,8 @@ SDValue DAGCombiner::ReassociateOps(unsigned Opc, DebugLoc DL,
                                cast<ConstantSDNode>(N0.getOperand(1)),
                                cast<ConstantSDNode>(N1));
       return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
-    } else if (N0.hasOneUse()) {
+    }
+    if (N0.hasOneUse()) {
      // reassoc.
(op (op x, c1), y) -> (op (op x, y), c1) iff x+c1 has one use SDValue OpNode = DAG.getNode(Opc, N0.getDebugLoc(), VT, N0.getOperand(0), N1); @@ -546,7 +553,8 @@ SDValue DAGCombiner::ReassociateOps(unsigned Opc, DebugLoc DL, cast<ConstantSDNode>(N1.getOperand(1)), cast<ConstantSDNode>(N0)); return DAG.getNode(Opc, DL, VT, N1.getOperand(0), OpNode); - } else if (N1.hasOneUse()) { + } + if (N1.hasOneUse()) { // reassoc. (op y, (op x, c1)) -> (op (op x, y), c1) iff x+c1 has one use SDValue OpNode = DAG.getNode(Opc, N0.getDebugLoc(), VT, N1.getOperand(0), N0); @@ -990,6 +998,9 @@ void DAGCombiner::Run(CombineLevel AtLevel) { dbgs() << "\nWith: "; RV.getNode()->dump(&DAG); dbgs() << '\n'); + + // Transfer debug value. + DAG.TransferDbgValues(SDValue(N, 0), RV); WorkListRemover DeadNodes(*this); if (N->getNumValues() == RV.getNode()->getNumValues()) DAG.ReplaceAllUsesWith(N, RV.getNode(), &DeadNodes); @@ -1045,6 +1056,8 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::MULHS: return visitMULHS(N); case ISD::SMUL_LOHI: return visitSMUL_LOHI(N); case ISD::UMUL_LOHI: return visitUMUL_LOHI(N); + case ISD::SMULO: return visitSMULO(N); + case ISD::UMULO: return visitUMULO(N); case ISD::SDIVREM: return visitSDIVREM(N); case ISD::UDIVREM: return visitUDIVREM(N); case ISD::AND: return visitAND(N); @@ -1566,7 +1579,8 @@ static SDValue tryFoldToZero(DebugLoc DL, const TargetLowering &TLI, EVT VT, SelectionDAG &DAG, bool LegalOperations) { if (!VT.isVector()) { return DAG.getConstant(0, VT); - } else if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) { + } + if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) { // Produce a vector of zeros. SDValue El = DAG.getConstant(0, VT.getVectorElementType()); std::vector<SDValue> Ops(VT.getVectorNumElements(), El); @@ -2174,6 +2188,26 @@ SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitSMULO(SDNode *N) { + // (smulo x, 2) -> (saddo x, x) + if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) + if (C2->getAPIntValue() == 2) + return DAG.getNode(ISD::SADDO, N->getDebugLoc(), N->getVTList(), + N->getOperand(0), N->getOperand(0)); + + return SDValue(); +} + +SDValue DAGCombiner::visitUMULO(SDNode *N) { + // (umulo x, 2) -> (uaddo x, x) + if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) + if (C2->getAPIntValue() == 2) + return DAG.getNode(ISD::UADDO, N->getDebugLoc(), N->getVTList(), + N->getOperand(0), N->getOperand(0)); + + return SDValue(); +} + SDValue DAGCombiner::visitSDIVREM(SDNode *N) { SDValue Res = SimplifyNodeWithTwoResults(N, ISD::SDIV, ISD::SREM); if (Res.getNode()) return Res; @@ -3000,6 +3034,9 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { // fold (shl x, 0) -> x if (N1C && N1C->isNullValue()) return N0; + // fold (shl undef, x) -> 0 + if (N0.getOpcode() == ISD::UNDEF) + return DAG.getConstant(0, VT); // if (shl x, c) is known to be zero, return 0 if (DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnesValue(OpSizeInBits))) @@ -3062,26 +3099,27 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { } } - // fold (shl (srl x, c1), c2) -> (shl (and x, (shl -1, c1)), (sub c2, c1)) or - // (srl (and x, (shl -1, c1)), (sub c1, c2)) + // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or + // (and (srl x, (sub c1, c2), MASK) if (N1C && N0.getOpcode() == ISD::SRL && N0.getOperand(1).getOpcode() == ISD::Constant) { uint64_t c1 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue(); if (c1 < VT.getSizeInBits()) { uint64_t c2 = 
N1C->getZExtValue(); - SDValue HiBitsMask = - DAG.getConstant(APInt::getHighBitsSet(VT.getSizeInBits(), - VT.getSizeInBits() - c1), - VT); - SDValue Mask = DAG.getNode(ISD::AND, N0.getDebugLoc(), VT, - N0.getOperand(0), - HiBitsMask); - if (c2 > c1) - return DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, Mask, - DAG.getConstant(c2-c1, N1.getValueType())); - else - return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, Mask, - DAG.getConstant(c1-c2, N1.getValueType())); + APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(), + VT.getSizeInBits() - c1); + SDValue Shift; + if (c2 > c1) { + Mask = Mask.shl(c2-c1); + Shift = DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, N0.getOperand(0), + DAG.getConstant(c2-c1, N1.getValueType())); + } else { + Mask = Mask.lshr(c1-c2); + Shift = DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0.getOperand(0), + DAG.getConstant(c1-c2, N1.getValueType())); + } + return DAG.getNode(ISD::AND, N0.getDebugLoc(), VT, Shift, + DAG.getConstant(Mask, VT)); } } // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1)) @@ -3323,8 +3361,10 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { return DAG.getUNDEF(VT); if (!LegalTypes || TLI.isTypeDesirableForOp(ISD::SRL, SmallVT)) { + uint64_t ShiftAmt = N1C->getZExtValue(); SDValue SmallShift = DAG.getNode(ISD::SRL, N0.getDebugLoc(), SmallVT, - N0.getOperand(0), N1); + N0.getOperand(0), + DAG.getConstant(ShiftAmt, getShiftAmountTy(SmallVT))); AddToWorkList(SmallShift.getNode()); return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, SmallShift); } @@ -3663,6 +3703,28 @@ static bool ExtendUsesToFormExtLoad(SDNode *N, SDValue N0, return true; } +void DAGCombiner::ExtendSetCCUses(SmallVector<SDNode*, 4> SetCCs, + SDValue Trunc, SDValue ExtLoad, DebugLoc DL, + ISD::NodeType ExtType) { + // Extend SetCC uses if necessary. + for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) { + SDNode *SetCC = SetCCs[i]; + SmallVector<SDValue, 4> Ops; + + for (unsigned j = 0; j != 2; ++j) { + SDValue SOp = SetCC->getOperand(j); + if (SOp == Trunc) + Ops.push_back(ExtLoad); + else + Ops.push_back(DAG.getNode(ExtType, DL, ExtLoad->getValueType(0), SOp)); + } + + Ops.push_back(SetCC->getOperand(2)); + CombineTo(SetCC, DAG.getNode(ISD::SETCC, DL, SetCC->getValueType(0), + &Ops[0], Ops.size())); + } +} + SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -3751,27 +3813,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(), N0.getValueType(), ExtLoad); CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); - - // Extend SetCC uses if necessary. - for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) { - SDNode *SetCC = SetCCs[i]; - SmallVector<SDValue, 4> Ops; - - for (unsigned j = 0; j != 2; ++j) { - SDValue SOp = SetCC->getOperand(j); - if (SOp == Trunc) - Ops.push_back(ExtLoad); - else - Ops.push_back(DAG.getNode(ISD::SIGN_EXTEND, - N->getDebugLoc(), VT, SOp)); - } - - Ops.push_back(SetCC->getOperand(2)); - CombineTo(SetCC, DAG.getNode(ISD::SETCC, N->getDebugLoc(), - SetCC->getValueType(0), - &Ops[0], Ops.size())); - } - + ExtendSetCCUses(SetCCs, Trunc, ExtLoad, N->getDebugLoc(), + ISD::SIGN_EXTEND); return SDValue(N, 0); // Return N so it doesn't get rechecked! 
} } @@ -3799,6 +3842,45 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { } } + // fold (sext (and/or/xor (load x), cst)) -> + // (and/or/xor (sextload x), (sext cst)) + if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR || + N0.getOpcode() == ISD::XOR) && + isa<LoadSDNode>(N0.getOperand(0)) && + N0.getOperand(1).getOpcode() == ISD::Constant && + TLI.isLoadExtLegal(ISD::SEXTLOAD, N0.getValueType()) && + (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) { + LoadSDNode *LN0 = cast<LoadSDNode>(N0.getOperand(0)); + if (LN0->getExtensionType() != ISD::ZEXTLOAD) { + bool DoXform = true; + SmallVector<SDNode*, 4> SetCCs; + if (!N0.hasOneUse()) + DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0), ISD::SIGN_EXTEND, + SetCCs, TLI); + if (DoXform) { + SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, LN0->getDebugLoc(), VT, + LN0->getChain(), LN0->getBasePtr(), + LN0->getPointerInfo(), + LN0->getMemoryVT(), + LN0->isVolatile(), + LN0->isNonTemporal(), + LN0->getAlignment()); + APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); + Mask = Mask.sext(VT.getSizeInBits()); + SDValue And = DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT, + ExtLoad, DAG.getConstant(Mask, VT)); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, + N0.getOperand(0).getDebugLoc(), + N0.getOperand(0).getValueType(), ExtLoad); + CombineTo(N, And); + CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1)); + ExtendSetCCUses(SetCCs, Trunc, ExtLoad, N->getDebugLoc(), + ISD::SIGN_EXTEND); + return SDValue(N, 0); // Return N so it doesn't get rechecked! + } + } + } + if (N0.getOpcode() == ISD::SETCC) { // sext(setcc) -> sext_in_reg(vsetcc) for vectors. // Only do this before legalize for now. @@ -3882,7 +3964,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { // CombineTo deleted the truncate, if needed, but not what's under it. AddToWorkList(oye); } - return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, NarrowLoad); + return SDValue(N, 0); // Return N so it doesn't get rechecked! } } @@ -3957,27 +4039,48 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { N0.getValueType(), ExtLoad); CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); - // Extend SetCC uses if necessary. - for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) { - SDNode *SetCC = SetCCs[i]; - SmallVector<SDValue, 4> Ops; - - for (unsigned j = 0; j != 2; ++j) { - SDValue SOp = SetCC->getOperand(j); - if (SOp == Trunc) - Ops.push_back(ExtLoad); - else - Ops.push_back(DAG.getNode(ISD::ZERO_EXTEND, - N->getDebugLoc(), VT, SOp)); - } + ExtendSetCCUses(SetCCs, Trunc, ExtLoad, N->getDebugLoc(), + ISD::ZERO_EXTEND); + return SDValue(N, 0); // Return N so it doesn't get rechecked! 
+ } + } - Ops.push_back(SetCC->getOperand(2)); - CombineTo(SetCC, DAG.getNode(ISD::SETCC, N->getDebugLoc(), - SetCC->getValueType(0), - &Ops[0], Ops.size())); + // fold (zext (and/or/xor (load x), cst)) -> + // (and/or/xor (zextload x), (zext cst)) + if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR || + N0.getOpcode() == ISD::XOR) && + isa<LoadSDNode>(N0.getOperand(0)) && + N0.getOperand(1).getOpcode() == ISD::Constant && + TLI.isLoadExtLegal(ISD::ZEXTLOAD, N0.getValueType()) && + (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) { + LoadSDNode *LN0 = cast<LoadSDNode>(N0.getOperand(0)); + if (LN0->getExtensionType() != ISD::SEXTLOAD) { + bool DoXform = true; + SmallVector<SDNode*, 4> SetCCs; + if (!N0.hasOneUse()) + DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0), ISD::ZERO_EXTEND, + SetCCs, TLI); + if (DoXform) { + SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, LN0->getDebugLoc(), VT, + LN0->getChain(), LN0->getBasePtr(), + LN0->getPointerInfo(), + LN0->getMemoryVT(), + LN0->isVolatile(), + LN0->isNonTemporal(), + LN0->getAlignment()); + APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); + Mask = Mask.zext(VT.getSizeInBits()); + SDValue And = DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT, + ExtLoad, DAG.getConstant(Mask, VT)); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, + N0.getOperand(0).getDebugLoc(), + N0.getOperand(0).getValueType(), ExtLoad); + CombineTo(N, And); + CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1)); + ExtendSetCCUses(SetCCs, Trunc, ExtLoad, N->getDebugLoc(), + ISD::ZERO_EXTEND); + return SDValue(N, 0); // Return N so it doesn't get rechecked! } - - return SDValue(N, 0); // Return N so it doesn't get rechecked! } } @@ -4012,7 +4115,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { EVT EltVT = VT.getVectorElementType(); SmallVector<SDValue,8> OneOps(VT.getVectorNumElements(), DAG.getConstant(1, EltVT)); - if (VT.getSizeInBits() == N0VT.getSizeInBits()) { + if (VT.getSizeInBits() == N0VT.getSizeInBits()) // We know that the # elements of the results is the same as the // # elements of the compare (and the # elements of the compare result // for that matter). Check to see that they are the same size. 
If so, @@ -4024,25 +4127,24 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { cast<CondCodeSDNode>(N0.getOperand(2))->get()), DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT, &OneOps[0], OneOps.size())); - } else { - // If the desired elements are smaller or larger than the source - // elements we can use a matching integer vector type and then - // truncate/sign extend - EVT MatchingElementType = - EVT::getIntegerVT(*DAG.getContext(), - N0VT.getScalarType().getSizeInBits()); - EVT MatchingVectorType = - EVT::getVectorVT(*DAG.getContext(), MatchingElementType, - N0VT.getVectorNumElements()); - SDValue VsetCC = - DAG.getVSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0), - N0.getOperand(1), - cast<CondCodeSDNode>(N0.getOperand(2))->get()); - return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, - DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT), - DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT, - &OneOps[0], OneOps.size())); - } + + // If the desired elements are smaller or larger than the source + // elements we can use a matching integer vector type and then + // truncate/sign extend + EVT MatchingElementType = + EVT::getIntegerVT(*DAG.getContext(), + N0VT.getScalarType().getSizeInBits()); + EVT MatchingVectorType = + EVT::getVectorVT(*DAG.getContext(), MatchingElementType, + N0VT.getVectorNumElements()); + SDValue VsetCC = + DAG.getVSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0), + N0.getOperand(1), + cast<CondCodeSDNode>(N0.getOperand(2))->get()); + return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, + DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT), + DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT, + &OneOps[0], OneOps.size())); } // zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc @@ -4110,7 +4212,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { // CombineTo deleted the truncate, if needed, but not what's under it. AddToWorkList(oye); } - return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, NarrowLoad); + return SDValue(N, 0); // Return N so it doesn't get rechecked! } } @@ -4166,27 +4268,8 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(), N0.getValueType(), ExtLoad); CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); - - // Extend SetCC uses if necessary. - for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) { - SDNode *SetCC = SetCCs[i]; - SmallVector<SDValue, 4> Ops; - - for (unsigned j = 0; j != 2; ++j) { - SDValue SOp = SetCC->getOperand(j); - if (SOp == Trunc) - Ops.push_back(ExtLoad); - else - Ops.push_back(DAG.getNode(ISD::ANY_EXTEND, - N->getDebugLoc(), VT, SOp)); - } - - Ops.push_back(SetCC->getOperand(2)); - CombineTo(SetCC, DAG.getNode(ISD::SETCC, N->getDebugLoc(), - SetCC->getValueType(0), - &Ops[0], Ops.size())); - } - + ExtendSetCCUses(SetCCs, Trunc, ExtLoad, N->getDebugLoc(), + ISD::ANY_EXTEND); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } @@ -6265,6 +6348,10 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { ST->isNonTemporal(), OrigAlign); } + // Turn 'store undef, Ptr' -> nothing. 
+ if (Value.getOpcode() == ISD::UNDEF && ST->isUnindexed()) + return Chain; + // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr' if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Value)) { // NOTE: If the original store is volatile, this transform must not increase @@ -6298,8 +6385,10 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { return DAG.getStore(Chain, N->getDebugLoc(), Tmp, Ptr, ST->getPointerInfo(), ST->isVolatile(), ST->isNonTemporal(), ST->getAlignment()); - } else if (!ST->isVolatile() && - TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { + } + + if (!ST->isVolatile() && + TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { // Many FP stores are not made apparent until after legalize, e.g. for // argument passing. Since this is so common, custom legalize the // 64-bit integer store into two 32-bit stores. @@ -6393,8 +6482,9 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { // "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8" SDValue Shorter = GetDemandedBits(Value, - APInt::getLowBitsSet(Value.getValueSizeInBits(), - ST->getMemoryVT().getSizeInBits())); + APInt::getLowBitsSet( + Value.getValueType().getScalarType().getSizeInBits(), + ST->getMemoryVT().getScalarType().getSizeInBits())); AddToWorkList(Value.getNode()); if (Shorter.getNode()) return DAG.getTruncStore(Chain, N->getDebugLoc(), Shorter, @@ -6486,18 +6576,18 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { // (vextract (scalar_to_vector val, 0) -> val SDValue InVec = N->getOperand(0); - if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { - // Check if the result type doesn't match the inserted element type. A - // SCALAR_TO_VECTOR may truncate the inserted element and the - // EXTRACT_VECTOR_ELT may widen the extracted vector. - SDValue InOp = InVec.getOperand(0); - EVT NVT = N->getValueType(0); - if (InOp.getValueType() != NVT) { - assert(InOp.getValueType().isInteger() && NVT.isInteger()); - return DAG.getSExtOrTrunc(InOp, InVec.getDebugLoc(), NVT); - } - return InOp; - } + if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { + // Check if the result type doesn't match the inserted element type. A + // SCALAR_TO_VECTOR may truncate the inserted element and the + // EXTRACT_VECTOR_ELT may widen the extracted vector. + SDValue InOp = InVec.getOperand(0); + EVT NVT = N->getValueType(0); + if (InOp.getValueType() != NVT) { + assert(InOp.getValueType().isInteger() && NVT.isInteger()); + return DAG.getSExtOrTrunc(InOp, InVec.getDebugLoc(), NVT); + } + return InOp; + } // Perform only after legalization to ensure build_vector / vector_shuffle // optimizations have already been done. @@ -6558,7 +6648,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { } } - if (!LN0 || !LN0->hasOneUse() || LN0->isVolatile()) + if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile()) return SDValue(); // If Idx was -1 above, Elt is going to be -1, so just return undef. 
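[Annotation, not part of the patch.] The visitSHL change earlier in this file rewrites (shl (srl x, c1), c2) as a single shift of |c2 - c1| bits followed by an AND, instead of the old AND-then-shift sequence. A standalone sketch in plain C++, using no LLVM API, that brute-force checks the identity the new code relies on for 64-bit logical shifts; all names are illustrative:

#include <cassert>
#include <cstdint>

// Reference form: the pre-combine shape (shl (srl x, c1), c2).
static uint64_t reference(uint64_t x, unsigned c1, unsigned c2) {
  return (x >> c1) << c2;
}

// Combined form: one shift by |c2 - c1| plus an AND with the mask
// APInt::getHighBitsSet(64, 64 - c1), shifted the same direction.
static uint64_t combined(uint64_t x, unsigned c1, unsigned c2) {
  uint64_t mask = ~0ULL << c1;   // high 64 - c1 bits set
  if (c2 > c1)
    return (x << (c2 - c1)) & (mask << (c2 - c1));
  return (x >> (c1 - c2)) & (mask >> (c1 - c2));
}

int main() {
  uint64_t x = 0x123456789abcdef0ULL;
  for (unsigned c1 = 1; c1 < 64; ++c1)
    for (unsigned c2 = 1; c2 < 64; ++c2) {
      assert(reference(x, c1, c2) == combined(x, c1, c2));
      // Cheap LCG step to vary the test value.
      x = x * 6364136223846793005ULL + 1442695040888963407ULL;
    }
  return 0;
}

Shifting the mask together with the value is what lets the combine emit one shift instead of two.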
@@ -7497,18 +7587,17 @@ bool DAGCombiner::FindAliasInfo(SDNode *N, SrcValueAlign = LD->getOriginalAlignment(); TBAAInfo = LD->getTBAAInfo(); return true; - } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + } + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { Ptr = ST->getBasePtr(); Size = ST->getMemoryVT().getSizeInBits() >> 3; SrcValue = ST->getSrcValue(); SrcValueOffset = ST->getSrcValueOffset(); SrcValueAlign = ST->getOriginalAlignment(); TBAAInfo = ST->getTBAAInfo(); - } else { - llvm_unreachable("FindAliasInfo expected a memory operand"); + return false; } - - return false; + llvm_unreachable("FindAliasInfo expected a memory operand"); } /// GatherAllAliases - Walk up chain skipping non-aliasing memory nodes, @@ -7621,13 +7710,13 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) { // Accumulate all the aliases to this node. GatherAllAliases(N, OldChain, Aliases); - if (Aliases.size() == 0) { - // If no operands then chain to entry token. + // If no operands then chain to entry token. + if (Aliases.size() == 0) return DAG.getEntryNode(); - } else if (Aliases.size() == 1) { - // If a single operand then chain to it. We don't need to revisit it. + + // If a single operand then chain to it. We don't need to revisit it. + if (Aliases.size() == 1) return Aliases[0]; - } // Construct a custom tailored token factor. return DAG.getNode(ISD::TokenFactor, N->getDebugLoc(), MVT::Other, diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index ea8ace3..797f174 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -43,6 +43,8 @@ #include "llvm/GlobalVariable.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" +#include "llvm/Operator.h" +#include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -109,8 +111,8 @@ unsigned FastISel::getRegForValue(const Value *V) { // of whether FastISel can handle them. MVT VT = RealVT.getSimpleVT(); if (!TLI.isTypeLegal(VT)) { - // Promote MVT::i1 to a legal type though, because it's common and easy. - if (VT == MVT::i1) + // Handle integer promotions, though, because they're common and easy. + if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16) VT = TLI.getTypeToTransformTo(V->getContext(), VT).getSimpleVT(); else return 0; @@ -121,10 +123,9 @@ unsigned FastISel::getRegForValue(const Value *V) { // only locally. This is because Instructions already have the SSA // def-dominates-use requirement enforced. DenseMap<const Value *, unsigned>::iterator I = FuncInfo.ValueMap.find(V); - if (I != FuncInfo.ValueMap.end()) { - unsigned Reg = I->second; - return Reg; - } + if (I != FuncInfo.ValueMap.end()) + return I->second; + unsigned Reg = LocalValueMap[V]; if (Reg != 0) return Reg; @@ -164,8 +165,12 @@ unsigned FastISel::materializeRegForValue(const Value *V, MVT VT) { Reg = getRegForValue(Constant::getNullValue(TD.getIntPtrType(V->getContext()))); } else if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) { - // Try to emit the constant directly. - Reg = FastEmit_f(VT, VT, ISD::ConstantFP, CF); + if (CF->isNullValue()) { + Reg = TargetMaterializeFloatZero(CF); + } else { + // Try to emit the constant directly. + Reg = FastEmit_f(VT, VT, ISD::ConstantFP, CF); + } if (!Reg) { // Try to emit the constant by using an integer constant with a cast. 
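[Annotation, not part of the patch.] The FastISel changes below (see SelectBinaryOp and FastEmit_ri_) strength-reduce mul x, 2^k to shl x, k, udiv x, 2^k to srl x, k, and exact sdiv x, 2^k to sra x, k. A standalone spot-check of those identities in plain C++, not LLVM code; the sdiv case only holds when the division is exact, which is what the isExact() guard enforces, and signed right shift is assumed to be arithmetic, as it is on mainstream targets:

#include <cassert>
#include <cstdint>

int main() {
  for (int64_t v = -1000; v <= 1000; ++v) {
    for (unsigned k = 0; k < 8; ++k) {
      int64_t p = int64_t(1) << k;
      // mul x, 2^k == shl x, k; both wrap identically modulo 2^64.
      assert(uint64_t(v) * uint64_t(p) == (uint64_t(v) << k));
      // udiv x, 2^k == srl x, k.
      assert(uint64_t(v) / uint64_t(p) == (uint64_t(v) >> k));
      // sdiv exact x, 2^k == sra x, k; valid only because p divides x.
      int64_t x = v * p; // make the division exact by construction
      assert(x / p == (x >> k));
    }
  }
  return 0;
}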
@@ -230,10 +235,10 @@ unsigned FastISel::lookUpRegForValue(const Value *V) { /// NOTE: This is only necessary because we might select a block that uses /// a value before we select the block that defines the value. It might be /// possible to fix this by selecting blocks in reverse postorder. -unsigned FastISel::UpdateValueMap(const Value *I, unsigned Reg) { +void FastISel::UpdateValueMap(const Value *I, unsigned Reg, unsigned NumRegs) { if (!isa<Instruction>(I)) { LocalValueMap[I] = Reg; - return Reg; + return; } unsigned &AssignedReg = FuncInfo.ValueMap[I]; @@ -242,12 +247,11 @@ unsigned FastISel::UpdateValueMap(const Value *I, unsigned Reg) { AssignedReg = Reg; else if (Reg != AssignedReg) { // Arrange for uses of AssignedReg to be replaced by uses of Reg. - FuncInfo.RegFixups[AssignedReg] = Reg; + for (unsigned i = 0; i < NumRegs; i++) + FuncInfo.RegFixups[AssignedReg+i] = Reg+i; AssignedReg = Reg; } - - return AssignedReg; } std::pair<unsigned, bool> FastISel::getRegForGEPIndex(const Value *Idx) { @@ -330,23 +334,51 @@ bool FastISel::SelectBinaryOp(const User *I, unsigned ISDOpcode) { return false; } + // Check if the first operand is a constant, and handle it as "ri". At -O0, + // we don't have anything that canonicalizes operand order. + if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(0))) + if (isa<Instruction>(I) && cast<Instruction>(I)->isCommutative()) { + unsigned Op1 = getRegForValue(I->getOperand(1)); + if (Op1 == 0) return false; + + bool Op1IsKill = hasTrivialKill(I->getOperand(1)); + + unsigned ResultReg = FastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op1, + Op1IsKill, CI->getZExtValue(), + VT.getSimpleVT()); + if (ResultReg == 0) return false; + + // We successfully emitted code for the given LLVM Instruction. + UpdateValueMap(I, ResultReg); + return true; + } + + unsigned Op0 = getRegForValue(I->getOperand(0)); - if (Op0 == 0) - // Unhandled operand. Halt "fast" selection and bail. + if (Op0 == 0) // Unhandled operand. Halt "fast" selection and bail. return false; bool Op0IsKill = hasTrivialKill(I->getOperand(0)); // Check if the second operand is a constant and handle it appropriately. if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) { - unsigned ResultReg = FastEmit_ri(VT.getSimpleVT(), VT.getSimpleVT(), - ISDOpcode, Op0, Op0IsKill, - CI->getZExtValue()); - if (ResultReg != 0) { - // We successfully emitted code for the given LLVM Instruction. - UpdateValueMap(I, ResultReg); - return true; + uint64_t Imm = CI->getZExtValue(); + + // Transform "sdiv exact X, 8" -> "sra X, 3". + if (ISDOpcode == ISD::SDIV && isa<BinaryOperator>(I) && + cast<BinaryOperator>(I)->isExact() && + isPowerOf2_64(Imm)) { + Imm = Log2_64(Imm); + ISDOpcode = ISD::SRA; } + + unsigned ResultReg = FastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op0, + Op0IsKill, Imm, VT.getSimpleVT()); + if (ResultReg == 0) return false; + + // We successfully emitted code for the given LLVM Instruction. + UpdateValueMap(I, ResultReg); + return true; } // Check if the second operand is a constant float. @@ -454,15 +486,35 @@ bool FastISel::SelectGetElementPtr(const User *I) { } bool FastISel::SelectCall(const User *I) { - const Function *F = cast<CallInst>(I)->getCalledFunction(); + const CallInst *Call = cast<CallInst>(I); + + // Handle simple inline asms. + if (const InlineAsm *IA = dyn_cast<InlineAsm>(Call->getArgOperand(0))) { + // Don't attempt to handle constraints. 
+ if (!IA->getConstraintString().empty()) + return false; + + unsigned ExtraInfo = 0; + if (IA->hasSideEffects()) + ExtraInfo |= InlineAsm::Extra_HasSideEffects; + if (IA->isAlignStack()) + ExtraInfo |= InlineAsm::Extra_IsAlignStack; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::INLINEASM)) + .addExternalSymbol(IA->getAsmString().c_str()) + .addImm(ExtraInfo); + return true; + } + + const Function *F = Call->getCalledFunction(); if (!F) return false; // Handle selected intrinsic function calls. - unsigned IID = F->getIntrinsicID(); - switch (IID) { + switch (F->getIntrinsicID()) { default: break; case Intrinsic::dbg_declare: { - const DbgDeclareInst *DI = cast<DbgDeclareInst>(I); + const DbgDeclareInst *DI = cast<DbgDeclareInst>(Call); if (!DIVariable(DI->getVariable()).Verify() || !FuncInfo.MF->getMMI().hasDebugInfo()) return true; @@ -494,7 +546,7 @@ bool FastISel::SelectCall(const User *I) { } case Intrinsic::dbg_value: { // This form of DBG_VALUE is target-independent. - const DbgValueInst *DI = cast<DbgValueInst>(I); + const DbgValueInst *DI = cast<DbgValueInst>(Call); const TargetInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE); const Value *V = DI->getValue(); if (!V) { @@ -523,65 +575,68 @@ bool FastISel::SelectCall(const User *I) { return true; } case Intrinsic::eh_exception: { - EVT VT = TLI.getValueType(I->getType()); - switch (TLI.getOperationAction(ISD::EXCEPTIONADDR, VT)) { - default: break; - case TargetLowering::Expand: { - assert(FuncInfo.MBB->isLandingPad() && - "Call to eh.exception not in landing pad!"); - unsigned Reg = TLI.getExceptionAddressRegister(); - const TargetRegisterClass *RC = TLI.getRegClassFor(VT); - unsigned ResultReg = createResultReg(RC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), - ResultReg).addReg(Reg); - UpdateValueMap(I, ResultReg); - return true; - } - } - break; + EVT VT = TLI.getValueType(Call->getType()); + if (TLI.getOperationAction(ISD::EXCEPTIONADDR, VT)!=TargetLowering::Expand) + break; + + assert(FuncInfo.MBB->isLandingPad() && + "Call to eh.exception not in landing pad!"); + unsigned Reg = TLI.getExceptionAddressRegister(); + const TargetRegisterClass *RC = TLI.getRegClassFor(VT); + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(Reg); + UpdateValueMap(Call, ResultReg); + return true; } case Intrinsic::eh_selector: { - EVT VT = TLI.getValueType(I->getType()); - switch (TLI.getOperationAction(ISD::EHSELECTION, VT)) { - default: break; - case TargetLowering::Expand: { - if (FuncInfo.MBB->isLandingPad()) - AddCatchInfo(*cast<CallInst>(I), &FuncInfo.MF->getMMI(), FuncInfo.MBB); - else { + EVT VT = TLI.getValueType(Call->getType()); + if (TLI.getOperationAction(ISD::EHSELECTION, VT) != TargetLowering::Expand) + break; + if (FuncInfo.MBB->isLandingPad()) + AddCatchInfo(*Call, &FuncInfo.MF->getMMI(), FuncInfo.MBB); + else { #ifndef NDEBUG - FuncInfo.CatchInfoLost.insert(cast<CallInst>(I)); + FuncInfo.CatchInfoLost.insert(Call); #endif - // FIXME: Mark exception selector register as live in. Hack for PR1508. - unsigned Reg = TLI.getExceptionSelectorRegister(); - if (Reg) FuncInfo.MBB->addLiveIn(Reg); - } - + // FIXME: Mark exception selector register as live in. Hack for PR1508. 
unsigned Reg = TLI.getExceptionSelectorRegister(); - EVT SrcVT = TLI.getPointerTy(); - const TargetRegisterClass *RC = TLI.getRegClassFor(SrcVT); - unsigned ResultReg = createResultReg(RC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), - ResultReg).addReg(Reg); - - bool ResultRegIsKill = hasTrivialKill(I); - - // Cast the register to the type of the selector. - if (SrcVT.bitsGT(MVT::i32)) - ResultReg = FastEmit_r(SrcVT.getSimpleVT(), MVT::i32, ISD::TRUNCATE, - ResultReg, ResultRegIsKill); - else if (SrcVT.bitsLT(MVT::i32)) - ResultReg = FastEmit_r(SrcVT.getSimpleVT(), MVT::i32, - ISD::SIGN_EXTEND, ResultReg, ResultRegIsKill); - if (ResultReg == 0) - // Unhandled operand. Halt "fast" selection and bail. - return false; + if (Reg) FuncInfo.MBB->addLiveIn(Reg); + } - UpdateValueMap(I, ResultReg); + unsigned Reg = TLI.getExceptionSelectorRegister(); + EVT SrcVT = TLI.getPointerTy(); + const TargetRegisterClass *RC = TLI.getRegClassFor(SrcVT); + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(Reg); + + bool ResultRegIsKill = hasTrivialKill(Call); + + // Cast the register to the type of the selector. + if (SrcVT.bitsGT(MVT::i32)) + ResultReg = FastEmit_r(SrcVT.getSimpleVT(), MVT::i32, ISD::TRUNCATE, + ResultReg, ResultRegIsKill); + else if (SrcVT.bitsLT(MVT::i32)) + ResultReg = FastEmit_r(SrcVT.getSimpleVT(), MVT::i32, + ISD::SIGN_EXTEND, ResultReg, ResultRegIsKill); + if (ResultReg == 0) + // Unhandled operand. Halt "fast" selection and bail. + return false; - return true; - } - } - break; + UpdateValueMap(Call, ResultReg); + + return true; + } + case Intrinsic::objectsize: { + ConstantInt *CI = cast<ConstantInt>(Call->getArgOperand(1)); + unsigned long long Res = CI->isZero() ? -1ULL : 0; + Constant *ResCI = ConstantInt::get(Call->getType(), Res); + unsigned ResultReg = getRegForValue(ResCI); + if (ResultReg == 0) + return false; + UpdateValueMap(Call, ResultReg); + return true; } } @@ -598,21 +653,13 @@ bool FastISel::SelectCast(const User *I, unsigned Opcode) { // Unhandled type. Halt "fast" selection and bail. return false; - // Check if the destination type is legal. Or as a special case, - // it may be i1 if we're doing a truncate because that's - // easy and somewhat common. + // Check if the destination type is legal. if (!TLI.isTypeLegal(DstVT)) - if (DstVT != MVT::i1 || Opcode != ISD::TRUNCATE) - // Unhandled type. Halt "fast" selection and bail. - return false; + return false; - // Check if the source operand is legal. Or as a special case, - // it may be i1 if we're doing zero-extension because that's - // easy and somewhat common. + // Check if the source operand is legal. if (!TLI.isTypeLegal(SrcVT)) - if (SrcVT != MVT::i1 || Opcode != ISD::ZERO_EXTEND) - // Unhandled type. Halt "fast" selection and bail. - return false; + return false; unsigned InputReg = getRegForValue(I->getOperand(0)); if (!InputReg) @@ -621,18 +668,6 @@ bool FastISel::SelectCast(const User *I, unsigned Opcode) { bool InputRegIsKill = hasTrivialKill(I->getOperand(0)); - // If the operand is i1, arrange for the high bits in the register to be zero. - if (SrcVT == MVT::i1) { - SrcVT = TLI.getTypeToTransformTo(I->getContext(), SrcVT); - InputReg = FastEmitZExtFromI1(SrcVT.getSimpleVT(), InputReg, InputRegIsKill); - if (!InputReg) - return false; - InputRegIsKill = true; - } - // If the result is i1, truncate to the target's type for i1 first. 
- if (DstVT == MVT::i1) - DstVT = TLI.getTypeToTransformTo(I->getContext(), DstVT); - unsigned ResultReg = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opcode, @@ -784,6 +819,47 @@ FastISel::SelectFNeg(const User *I) { } bool +FastISel::SelectExtractValue(const User *U) { + const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(U); + if (!EVI) + return false; + + // Make sure we only try to handle extracts with a legal result. But also + // allow i1 because it's easy. + EVT RealVT = TLI.getValueType(EVI->getType(), /*AllowUnknown=*/true); + if (!RealVT.isSimple()) + return false; + MVT VT = RealVT.getSimpleVT(); + if (!TLI.isTypeLegal(VT) && VT != MVT::i1) + return false; + + const Value *Op0 = EVI->getOperand(0); + const Type *AggTy = Op0->getType(); + + // Get the base result register. + unsigned ResultReg; + DenseMap<const Value *, unsigned>::iterator I = FuncInfo.ValueMap.find(Op0); + if (I != FuncInfo.ValueMap.end()) + ResultReg = I->second; + else if (isa<Instruction>(Op0)) + ResultReg = FuncInfo.InitializeRegForValue(Op0); + else + return false; // fast-isel can't handle aggregate constants at the moment + + // Get the actual result register, which is an offset from the base register. + unsigned VTIndex = ComputeLinearIndex(AggTy, EVI->idx_begin(), EVI->idx_end()); + + SmallVector<EVT, 4> AggValueVTs; + ComputeValueVTs(TLI, AggTy, AggValueVTs); + + for (unsigned i = 0; i < VTIndex; i++) + ResultReg += TLI.getNumRegisters(FuncInfo.Fn->getContext(), AggValueVTs[i]); + + UpdateValueMap(EVI, ResultReg); + return true; +} + +bool FastISel::SelectOperator(const User *I, unsigned Opcode) { switch (Opcode) { case Instruction::Add: @@ -887,6 +963,9 @@ FastISel::SelectOperator(const User *I, unsigned Opcode) { return true; } + case Instruction::ExtractValue: + return SelectExtractValue(I); + case Instruction::PHI: llvm_unreachable("FastISel shouldn't visit PHI nodes!"); @@ -966,59 +1045,33 @@ unsigned FastISel::FastEmit_rri(MVT, MVT, unsigned FastISel::FastEmit_ri_(MVT VT, unsigned Opcode, unsigned Op0, bool Op0IsKill, uint64_t Imm, MVT ImmType) { + // If this is a multiply by a power of two, emit this as a shift left. + if (Opcode == ISD::MUL && isPowerOf2_64(Imm)) { + Opcode = ISD::SHL; + Imm = Log2_64(Imm); + } else if (Opcode == ISD::UDIV && isPowerOf2_64(Imm)) { + // div x, 8 -> srl x, 3 + Opcode = ISD::SRL; + Imm = Log2_64(Imm); + } + + // Horrible hack (to be removed), check to make sure shift amounts are + // in-range. + if ((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) && + Imm >= VT.getSizeInBits()) + return 0; + // First check if immediate type is legal. If not, we can't use the ri form. unsigned ResultReg = FastEmit_ri(VT, VT, Opcode, Op0, Op0IsKill, Imm); if (ResultReg != 0) return ResultReg; unsigned MaterialReg = FastEmit_i(ImmType, ImmType, ISD::Constant, Imm); - if (MaterialReg == 0) - return 0; - return FastEmit_rr(VT, VT, Opcode, - Op0, Op0IsKill, - MaterialReg, /*Kill=*/true); -} - -/// FastEmit_rf_ - This method is a wrapper of FastEmit_ri. It first tries -/// to emit an instruction with a floating-point immediate operand using -/// FastEmit_rf. If that fails, it materializes the immediate into a register -/// and try FastEmit_rr instead. -unsigned FastISel::FastEmit_rf_(MVT VT, unsigned Opcode, - unsigned Op0, bool Op0IsKill, - const ConstantFP *FPImm, MVT ImmType) { - // First check if immediate type is legal. If not, we can't use the rf form. 
- unsigned ResultReg = FastEmit_rf(VT, VT, Opcode, Op0, Op0IsKill, FPImm); - if (ResultReg != 0) - return ResultReg; - - // Materialize the constant in a register. - unsigned MaterialReg = FastEmit_f(ImmType, ImmType, ISD::ConstantFP, FPImm); if (MaterialReg == 0) { - // If the target doesn't have a way to directly enter a floating-point - // value into a register, use an alternate approach. - // TODO: The current approach only supports floating-point constants - // that can be constructed by conversion from integer values. This should - // be replaced by code that creates a load from a constant-pool entry, - // which will require some target-specific work. - const APFloat &Flt = FPImm->getValueAPF(); - EVT IntVT = TLI.getPointerTy(); - - uint64_t x[2]; - uint32_t IntBitWidth = IntVT.getSizeInBits(); - bool isExact; - (void) Flt.convertToInteger(x, IntBitWidth, /*isSigned=*/true, - APFloat::rmTowardZero, &isExact); - if (!isExact) - return 0; - APInt IntVal(IntBitWidth, 2, x); - - unsigned IntegerReg = FastEmit_i(IntVT.getSimpleVT(), IntVT.getSimpleVT(), - ISD::Constant, IntVal.getZExtValue()); - if (IntegerReg == 0) - return 0; - MaterialReg = FastEmit_r(IntVT.getSimpleVT(), VT, - ISD::SINT_TO_FP, IntegerReg, /*Kill=*/true); - if (MaterialReg == 0) - return 0; + // This is a bit ugly/slow, but failing here means falling out of + // fast-isel, which would be very slow. + const IntegerType *ITy = IntegerType::get(FuncInfo.Fn->getContext(), + VT.getSizeInBits()); + MaterialReg = getRegForValue(ConstantInt::get(ITy, Imm)); } return FastEmit_rr(VT, VT, Opcode, Op0, Op0IsKill, @@ -1078,6 +1131,30 @@ unsigned FastISel::FastEmitInst_rr(unsigned MachineInstOpcode, return ResultReg; } +unsigned FastISel::FastEmitInst_rrr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill, + unsigned Op2, bool Op2IsKill) { + unsigned ResultReg = createResultReg(RC); + const TargetInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addReg(Op1, Op1IsKill * RegState::Kill) + .addReg(Op2, Op2IsKill * RegState::Kill); + else { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addReg(Op1, Op1IsKill * RegState::Kill) + .addReg(Op2, Op2IsKill * RegState::Kill); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(II.ImplicitDefs[0]); + } + return ResultReg; +} + unsigned FastISel::FastEmitInst_ri(unsigned MachineInstOpcode, const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill, @@ -1183,6 +1260,23 @@ unsigned FastISel::FastEmitInst_i(unsigned MachineInstOpcode, return ResultReg; } +unsigned FastISel::FastEmitInst_ii(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + uint64_t Imm1, uint64_t Imm2) { + unsigned ResultReg = createResultReg(RC); + const TargetInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addImm(Imm1).addImm(Imm2); + else { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II).addImm(Imm1).addImm(Imm2); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(II.ImplicitDefs[0]); + } + return ResultReg; +} + unsigned FastISel::FastEmitInst_extractsubreg(MVT RetVT, unsigned Op0, bool Op0IsKill, uint32_t Idx) { @@ -1238,7 +1332,7 @@ bool 
FastISel::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { // Only handle legal types. Two interesting things to note here. First, // by bailing out early, we may leave behind some dead instructions, // since SelectionDAG's HandlePHINodesInSuccessorBlocks will insert its - // own moves. Second, this check is necessary becuase FastISel doesn't + // own moves. Second, this check is necessary because FastISel doesn't // use CreateRegs to create registers, so it always creates // exactly one register for each non-void instruction. EVT VT = TLI.getValueType(PN->getType(), /*AllowUnknown=*/true); diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index d8a5770..d518b5d 100644 --- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -54,25 +54,6 @@ static bool isUsedOutsideOfDefiningBlock(const Instruction *I) { return false; } -/// isOnlyUsedInEntryBlock - If the specified argument is only used in the -/// entry block, return true. This includes arguments used by switches, since -/// the switch may expand into multiple basic blocks. -static bool isOnlyUsedInEntryBlock(const Argument *A, bool EnableFastISel) { - // With FastISel active, we may be splitting blocks, so force creation - // of virtual registers for all non-dead arguments. - if (EnableFastISel) - return A->use_empty(); - - const BasicBlock *Entry = A->getParent()->begin(); - for (Value::const_use_iterator UI = A->use_begin(), E = A->use_end(); - UI != E; ++UI) { - const User *U = *UI; - if (cast<Instruction>(U)->getParent() != Entry || isa<SwitchInst>(U)) - return false; // Use not in entry block. - } - return true; -} - FunctionLoweringInfo::FunctionLoweringInfo(const TargetLowering &tli) : TLI(tli) { } @@ -86,16 +67,10 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf) { SmallVector<ISD::OutputArg, 4> Outs; GetReturnInfo(Fn->getReturnType(), Fn->getAttributes().getRetAttributes(), Outs, TLI); - CanLowerReturn = TLI.CanLowerReturn(Fn->getCallingConv(), Fn->isVarArg(), + CanLowerReturn = TLI.CanLowerReturn(Fn->getCallingConv(), *MF, + Fn->isVarArg(), Outs, Fn->getContext()); - // Create a vreg for each argument register that is not dead and is used - // outside of the entry block for the function. - for (Function::const_arg_iterator AI = Fn->arg_begin(), E = Fn->arg_end(); - AI != E; ++AI) - if (!isOnlyUsedInEntryBlock(AI, EnableFastISel)) - InitializeRegForValue(AI); - // Initialize the mapping of values to registers. This is only set up for // instruction values that are used outside of the block that defines // them. @@ -181,6 +156,10 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf) { const PHINode *PN = dyn_cast<PHINode>(I); ++I) { if (PN->use_empty()) continue; + // Skip empty types + if (PN->getType()->isEmptyTy()) + continue; + DebugLoc DL = PN->getDebugLoc(); unsigned PHIReg = ValueMap[PN]; assert(PHIReg && "PHI node does not have an assigned virtual register!"); @@ -343,7 +322,7 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) { APInt Zero(BitWidth, 0); DestLOI.KnownZero = Zero; DestLOI.KnownOne = Zero; - return; + return; } if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) { @@ -375,18 +354,18 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) { /// setByValArgumentFrameIndex - Record frame index for the byval /// argument. This overrides previous frame index entry for this argument, /// if any. 
-void FunctionLoweringInfo::setByValArgumentFrameIndex(const Argument *A, +void FunctionLoweringInfo::setByValArgumentFrameIndex(const Argument *A, int FI) { assert (A->hasByValAttr() && "Argument does not have byval attribute!"); ByValArgFrameIndexMap[A] = FI; } - + /// getByValArgumentFrameIndex - Get frame index for the byval argument. /// If the argument does not have any assigned frame index then 0 is /// returned. int FunctionLoweringInfo::getByValArgumentFrameIndex(const Argument *A) { assert (A->hasByValAttr() && "Argument does not have byval attribute!"); - DenseMap<const Argument *, int>::iterator I = + DenseMap<const Argument *, int>::iterator I = ByValArgFrameIndexMap.find(A); if (I != ByValArgFrameIndexMap.end()) return I->second; diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index e309def..2a65d65 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -76,6 +76,12 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, // the CopyToReg'd destination register instead of creating a new vreg. bool MatchReg = true; const TargetRegisterClass *UseRC = NULL; + EVT VT = Node->getValueType(ResNo); + + // Stick to the preferred register classes for legal types. + if (TLI->isTypeLegal(VT)) + UseRC = TLI->getRegClassFor(VT); + if (!IsClone && !IsCloned) for (SDNode::use_iterator UI = Node->use_begin(), E = Node->use_end(); UI != E; ++UI) { @@ -121,10 +127,9 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, break; } - EVT VT = Node->getValueType(ResNo); const TargetRegisterClass *SrcRC = 0, *DstRC = 0; SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT); - + // Figure out the register class to create for the destreg. 
if (VRBase) { DstRC = MRI->getRegClass(VRBase); @@ -283,7 +288,7 @@ InstrEmitter::AddRegisterOperand(MachineInstr *MI, SDValue Op, DstRC = II->OpInfo[IIOpNum].getRegClass(TRI); assert((DstRC || (TID.isVariadic() && IIOpNum >= TID.getNumOperands())) && "Don't have operand info for this instruction!"); - if (DstRC && SrcRC != DstRC && !SrcRC->hasSuperClass(DstRC)) { + if (DstRC && !SrcRC->hasSuperClassEq(DstRC)) { unsigned NewVReg = MRI->createVirtualRegister(DstRC); BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(), TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg); @@ -543,17 +548,18 @@ InstrEmitter::EmitCopyToRegClassNode(SDNode *Node, void InstrEmitter::EmitRegSequence(SDNode *Node, DenseMap<SDValue, unsigned> &VRBaseMap, bool IsClone, bool IsCloned) { - const TargetRegisterClass *RC = TLI->getRegClassFor(Node->getValueType(0)); + unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue(); + const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx); unsigned NewVReg = MRI->createVirtualRegister(RC); MachineInstr *MI = BuildMI(*MF, Node->getDebugLoc(), TII->get(TargetOpcode::REG_SEQUENCE), NewVReg); unsigned NumOps = Node->getNumOperands(); - assert((NumOps & 1) == 0 && - "REG_SEQUENCE must have an even number of operands!"); + assert((NumOps & 1) == 1 && + "REG_SEQUENCE must have an odd number of operands!"); const TargetInstrDesc &II = TII->get(TargetOpcode::REG_SEQUENCE); - for (unsigned i = 0; i != NumOps; ++i) { + for (unsigned i = 1; i != NumOps; ++i) { SDValue Op = Node->getOperand(i); - if (i & 1) { + if ((i & 1) == 0) { unsigned SubIdx = cast<ConstantSDNode>(Op)->getZExtValue(); unsigned SubReg = getVR(Node->getOperand(i-1), VRBaseMap); const TargetRegisterClass *TRC = MRI->getRegClass(SubReg); diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index b837261..4b6d3ef 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -14,23 +14,16 @@ #include "llvm/Analysis/DebugInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" #include "llvm/CallingConv.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" -#include "llvm/Function.h" -#include "llvm/GlobalVariable.h" #include "llvm/LLVMContext.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -57,19 +50,13 @@ class SelectionDAGLegalize { const TargetMachine &TM; const TargetLowering &TLI; SelectionDAG &DAG; - CodeGenOpt::Level OptLevel; // Libcall insertion helpers. - /// LastCALLSEQ_END - This keeps track of the CALLSEQ_END node that has been + /// LastCALLSEQ - This keeps track of the CALLSEQ_END node that has been /// legalized. We use this to ensure that calls are properly serialized /// against each other, including inserted libcalls. 
- SDValue LastCALLSEQ_END; - - /// IsLegalizingCall - This member is used *only* for purposes of providing - /// helpful assertions that a libcall isn't created while another call is - /// being legalized (which could lead to non-serialized call sequences). - bool IsLegalizingCall; + SmallVector<SDValue, 8> LastCALLSEQ; enum LegalizeAction { Legal, // The target natively supports this operation. @@ -98,13 +85,13 @@ class SelectionDAGLegalize { } public: - SelectionDAGLegalize(SelectionDAG &DAG, CodeGenOpt::Level ol); + explicit SelectionDAGLegalize(SelectionDAG &DAG); /// getTypeAction - Return how we should legalize values of this type, either /// it is already legal or we need to expand it into multiple registers of /// smaller integer type, or we need to promote it to a larger type. LegalizeAction getTypeAction(EVT VT) const { - return (LegalizeAction)ValueTypeActions.getTypeAction(VT); + return (LegalizeAction)TLI.getTypeAction(*DAG.getContext(), VT); } /// isTypeLegal - Return true if this type is legal on this target. @@ -147,6 +134,9 @@ private: DebugLoc dl); SDValue ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned); + SDValue ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, const SDValue *Ops, + unsigned NumOps, bool isSigned, DebugLoc dl); + std::pair<SDValue, SDValue> ExpandChainLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned); SDValue ExpandFPLibCall(SDNode *Node, RTLIB::Libcall Call_F32, @@ -158,7 +148,7 @@ private: RTLIB::Libcall Call_I32, RTLIB::Libcall Call_I64, RTLIB::Libcall Call_I128); - SDValue ExpandDivRemLibCall(SDNode *Node, bool isSigned, bool isDIV); + void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results); SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, DebugLoc dl); SDValue ExpandBUILD_VECTOR(SDNode *Node); @@ -184,6 +174,15 @@ private: void ExpandNode(SDNode *Node, SmallVectorImpl<SDValue> &Results); void PromoteNode(SDNode *Node, SmallVectorImpl<SDValue> &Results); + + SDValue getLastCALLSEQ() { return LastCALLSEQ.back(); } + void setLastCALLSEQ(const SDValue s) { LastCALLSEQ.back() = s; } + void pushLastCALLSEQ(SDValue s) { + LastCALLSEQ.push_back(s); + } + void popLastCALLSEQ() { + LastCALLSEQ.pop_back(); + } }; } @@ -219,18 +218,16 @@ SelectionDAGLegalize::ShuffleWithNarrowerEltType(EVT NVT, EVT VT, DebugLoc dl, return DAG.getVectorShuffle(NVT, dl, N1, N2, &NewMask[0]); } -SelectionDAGLegalize::SelectionDAGLegalize(SelectionDAG &dag, - CodeGenOpt::Level ol) +SelectionDAGLegalize::SelectionDAGLegalize(SelectionDAG &dag) : TM(dag.getTarget()), TLI(dag.getTargetLoweringInfo()), - DAG(dag), OptLevel(ol), + DAG(dag), ValueTypeActions(TLI.getValueTypeActions()) { assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE && "Too many value types for ValueTypeActions to hold!"); } void SelectionDAGLegalize::LegalizeDAG() { - LastCALLSEQ_END = DAG.getEntryNode(); - IsLegalizingCall = false; + pushLastCALLSEQ(DAG.getEntryNode()); // The legalize process is inherently a bottom-up recursive process (users // legalize their uses before themselves). Given infinite stack space, we @@ -258,14 +255,15 @@ void SelectionDAGLegalize::LegalizeDAG() { /// FindCallEndFromCallStart - Given a chained node that is part of a call /// sequence, find the CALLSEQ_END node that terminates the call sequence. static SDNode *FindCallEndFromCallStart(SDNode *Node, int depth = 0) { - // Nested CALLSEQ_START/END constructs aren't yet legal, - // but we can DTRT and handle them correctly here. 
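With nesting now tolerated, FindCallEndFromCallStart threads an explicit depth through the walk so inner CALLSEQ_START/CALLSEQ_END pairs are skipped. A standalone sketch of the same matching over a flat token sequence (illustrative types; the real code walks DAG uses, not an array):

#include <vector>

enum Tok { CALLSEQ_START, CALLSEQ_END, OTHER };

// Index of the CALLSEQ_END matching the CALLSEQ_START at Begin, or -1 if
// the sequence is unbalanced; inner pairs raise and lower the depth.
static int findMatchingEnd(const std::vector<Tok> &Seq, unsigned Begin) {
  int Depth = 0;
  for (unsigned i = Begin; i < Seq.size(); ++i) {
    if (Seq[i] == CALLSEQ_START)
      ++Depth;
    else if (Seq[i] == CALLSEQ_END && --Depth == 0)
      return (int)i;
  }
  return -1;
}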
+ int next_depth = depth; if (Node->getOpcode() == ISD::CALLSEQ_START) - depth++; - else if (Node->getOpcode() == ISD::CALLSEQ_END) { - depth--; - if (depth == 0) + next_depth = depth + 1; + if (Node->getOpcode() == ISD::CALLSEQ_END) { + assert(depth > 0 && "negative depth!"); + if (depth == 1) return Node; + else + next_depth = depth - 1; } if (Node->use_empty()) return 0; // No CallSeqEnd @@ -296,7 +294,7 @@ static SDNode *FindCallEndFromCallStart(SDNode *Node, int depth = 0) { SDNode *User = *UI; for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) if (User->getOperand(i) == TheChain) - if (SDNode *Result = FindCallEndFromCallStart(User, depth)) + if (SDNode *Result = FindCallEndFromCallStart(User, next_depth)) return Result; } return 0; @@ -317,6 +315,7 @@ static SDNode *FindCallStartFromCallEnd(SDNode *Node) { case ISD::CALLSEQ_START: if (!nested) return Node; + Node = Node->getOperand(0).getNode(); nested--; break; case ISD::CALLSEQ_END: @@ -324,7 +323,7 @@ static SDNode *FindCallStartFromCallEnd(SDNode *Node) { break; } } - return 0; + return (Node->getOpcode() == ISD::CALLSEQ_START) ? Node : 0; } /// LegalizeAllNodesNotLeadingTo - Recursively walk the uses of N, looking to @@ -433,68 +432,67 @@ SDValue ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG, SDValue Result = DAG.getNode(ISD::BITCAST, dl, intVT, Val); return DAG.getStore(Chain, dl, Result, Ptr, ST->getPointerInfo(), ST->isVolatile(), ST->isNonTemporal(), Alignment); - } else { - // Do a (aligned) store to a stack slot, then copy from the stack slot - // to the final destination using (unaligned) integer loads and stores. - EVT StoredVT = ST->getMemoryVT(); - EVT RegVT = - TLI.getRegisterType(*DAG.getContext(), - EVT::getIntegerVT(*DAG.getContext(), - StoredVT.getSizeInBits())); - unsigned StoredBytes = StoredVT.getSizeInBits() / 8; - unsigned RegBytes = RegVT.getSizeInBits() / 8; - unsigned NumRegs = (StoredBytes + RegBytes - 1) / RegBytes; - - // Make sure the stack slot is also aligned for the register type. - SDValue StackPtr = DAG.CreateStackTemporary(StoredVT, RegVT); - - // Perform the original store, only redirected to the stack slot. - SDValue Store = DAG.getTruncStore(Chain, dl, - Val, StackPtr, MachinePointerInfo(), - StoredVT, false, false, 0); - SDValue Increment = DAG.getConstant(RegBytes, TLI.getPointerTy()); - SmallVector<SDValue, 8> Stores; - unsigned Offset = 0; - - // Do all but one copies using the full register width. - for (unsigned i = 1; i < NumRegs; i++) { - // Load one integer register's worth from the stack slot. - SDValue Load = DAG.getLoad(RegVT, dl, Store, StackPtr, - MachinePointerInfo(), - false, false, 0); - // Store it to the final location. Remember the store. - Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, Ptr, - ST->getPointerInfo().getWithOffset(Offset), - ST->isVolatile(), ST->isNonTemporal(), - MinAlign(ST->getAlignment(), Offset))); - // Increment the pointers. - Offset += RegBytes; - StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr, - Increment); - Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); - } + } + // Do a (aligned) store to a stack slot, then copy from the stack slot + // to the final destination using (unaligned) integer loads and stores. 
+ EVT StoredVT = ST->getMemoryVT(); + EVT RegVT = + TLI.getRegisterType(*DAG.getContext(), + EVT::getIntegerVT(*DAG.getContext(), + StoredVT.getSizeInBits())); + unsigned StoredBytes = StoredVT.getSizeInBits() / 8; + unsigned RegBytes = RegVT.getSizeInBits() / 8; + unsigned NumRegs = (StoredBytes + RegBytes - 1) / RegBytes; - // The last store may be partial. Do a truncating store. On big-endian - // machines this requires an extending load from the stack slot to ensure - // that the bits are in the right place. - EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), - 8 * (StoredBytes - Offset)); + // Make sure the stack slot is also aligned for the register type. + SDValue StackPtr = DAG.CreateStackTemporary(StoredVT, RegVT); - // Load from the stack slot. - SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Store, StackPtr, - MachinePointerInfo(), - MemVT, false, false, 0); + // Perform the original store, only redirected to the stack slot. + SDValue Store = DAG.getTruncStore(Chain, dl, + Val, StackPtr, MachinePointerInfo(), + StoredVT, false, false, 0); + SDValue Increment = DAG.getConstant(RegBytes, TLI.getPointerTy()); + SmallVector<SDValue, 8> Stores; + unsigned Offset = 0; - Stores.push_back(DAG.getTruncStore(Load.getValue(1), dl, Load, Ptr, - ST->getPointerInfo() - .getWithOffset(Offset), - MemVT, ST->isVolatile(), - ST->isNonTemporal(), - MinAlign(ST->getAlignment(), Offset))); - // The order of the stores doesn't matter - say it with a TokenFactor. - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Stores[0], - Stores.size()); + // Do all but one of the copies using the full register width. + for (unsigned i = 1; i < NumRegs; i++) { + // Load one integer register's worth from the stack slot. + SDValue Load = DAG.getLoad(RegVT, dl, Store, StackPtr, + MachinePointerInfo(), + false, false, 0); + // Store it to the final location. Remember the store. + Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, Ptr, + ST->getPointerInfo().getWithOffset(Offset), + ST->isVolatile(), ST->isNonTemporal(), + MinAlign(ST->getAlignment(), Offset))); + // Increment the pointers. + Offset += RegBytes; + StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr, + Increment); + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); } + + // The last store may be partial. Do a truncating store. On big-endian + // machines this requires an extending load from the stack slot to ensure + // that the bits are in the right place. + EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), + 8 * (StoredBytes - Offset)); + + // Load from the stack slot. + SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Store, StackPtr, + MachinePointerInfo(), + MemVT, false, false, 0); + + Stores.push_back(DAG.getTruncStore(Load.getValue(1), dl, Load, Ptr, + ST->getPointerInfo() + .getWithOffset(Offset), + MemVT, ST->isVolatile(), + ST->isNonTemporal(), + MinAlign(ST->getAlignment(), Offset))); + // The order of the stores doesn't matter - say it with a TokenFactor.
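The copy loop above is sized by a rounding-up division, so any remainder bytes are left for the final truncating store. A quick standalone check of that arithmetic (values illustrative):

#include <cassert>

// NumRegs = ceil(StoredBytes / RegBytes), as computed above.
static unsigned numRegs(unsigned StoredBytes, unsigned RegBytes) {
  return (StoredBytes + RegBytes - 1) / RegBytes;
}

int main() {
  assert(numRegs(6, 4) == 2);  // one full 4-byte copy, then a 2-byte tail
  assert(numRegs(4, 4) == 1);  // loop body never runs; one full-width store
  return 0;
}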
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Stores[0], + Stores.size()); } assert(ST->getMemoryVT().isInteger() && !ST->getMemoryVT().isVector() && @@ -941,11 +939,12 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { case ISD::BR_JT: case ISD::BR_CC: case ISD::BRCOND: - // Branches tweak the chain to include LastCALLSEQ_END + assert(LastCALLSEQ.size() == 1 && "branch inside CALLSEQ_BEGIN/END?"); + // Branches tweak the chain to include LastCALLSEQ Ops[0] = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ops[0], - LastCALLSEQ_END); + getLastCALLSEQ()); Ops[0] = LegalizeOp(Ops[0]); - LastCALLSEQ_END = DAG.getEntryNode(); + setLastCALLSEQ(DAG.getEntryNode()); break; case ISD::SHL: case ISD::SRL: @@ -1034,6 +1033,7 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { break; case ISD::CALLSEQ_START: { SDNode *CallEnd = FindCallEndFromCallStart(Node); + assert(CallEnd && "didn't find CALLSEQ_END!"); // Recursively Legalize all of the inputs of the call end that do not lead // to this call start. This ensures that any libcalls that need be inserted @@ -1050,9 +1050,9 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { // Merge in the last call to ensure that this call starts after the last // call ended. - if (LastCALLSEQ_END.getOpcode() != ISD::EntryToken) { + if (getLastCALLSEQ().getOpcode() != ISD::EntryToken) { Tmp1 = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - Tmp1, LastCALLSEQ_END); + Tmp1, getLastCALLSEQ()); Tmp1 = LegalizeOp(Tmp1); } @@ -1073,25 +1073,29 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { // sequence have been legalized, legalize the call itself. During this // process, no libcalls can/will be inserted, guaranteeing that no calls // can overlap. - assert(!IsLegalizingCall && "Inconsistent sequentialization of calls!"); // Note that we are selecting this call! - LastCALLSEQ_END = SDValue(CallEnd, 0); - IsLegalizingCall = true; + setLastCALLSEQ(SDValue(CallEnd, 0)); // Legalize the call, starting from the CALLSEQ_END. - LegalizeOp(LastCALLSEQ_END); - assert(!IsLegalizingCall && "CALLSEQ_END should have cleared this!"); + LegalizeOp(getLastCALLSEQ()); return Result; } case ISD::CALLSEQ_END: - // If the CALLSEQ_START node hasn't been legalized first, legalize it. This - // will cause this node to be legalized as well as handling libcalls right. - if (LastCALLSEQ_END.getNode() != Node) { - LegalizeOp(SDValue(FindCallStartFromCallEnd(Node), 0)); - DenseMap<SDValue, SDValue>::iterator I = LegalizedNodes.find(Op); - assert(I != LegalizedNodes.end() && - "Legalizing the call start should have legalized this node!"); - return I->second; + { + SDNode *myCALLSEQ_BEGIN = FindCallStartFromCallEnd(Node); + + // If the CALLSEQ_START node hasn't been legalized first, legalize it. + // This will cause this node to be legalized as well as handling libcalls + // right. + if (getLastCALLSEQ().getNode() != Node) { + LegalizeOp(SDValue(myCALLSEQ_BEGIN, 0)); + DenseMap<SDValue, SDValue>::iterator I = LegalizedNodes.find(Op); + assert(I != LegalizedNodes.end() && + "Legalizing the call start should have legalized this node!"); + return I->second; + } + + pushLastCALLSEQ(SDValue(myCALLSEQ_BEGIN, 0)); } // Otherwise, the call start has been legalized and everything is going @@ -1119,9 +1123,8 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { Result.getResNo()); } } - assert(IsLegalizingCall && "Call sequence imbalance between start/end?"); // This finishes up call legalization. 
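Here the single LastCALLSEQ_END value and the IsLegalizingCall flag give way to a stack, one slot per active call sequence. A rough standalone model of the invariant (hypothetical types; the real elements are SDValues):

#include <cassert>
#include <vector>

struct CallSeqStack {
  std::vector<int> Seq;                       // stack of CALLSEQ markers
  CallSeqStack() : Seq(1, 0) {}               // slot 0 ~ DAG.getEntryNode() sentinel
  int  last() const { return Seq.back(); }    // getLastCALLSEQ()
  void set(int V)   { Seq.back() = V; }       // setLastCALLSEQ()
  void push(int V)  { Seq.push_back(V); }     // pushLastCALLSEQ() at CALLSEQ_END
  void pop() {                                // popLastCALLSEQ() when the call is done
    Seq.pop_back();
    assert(!Seq.empty() && "unbalanced CALLSEQ push/pop");
  }
};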
- IsLegalizingCall = false; + popLastCALLSEQ(); // If the CALLSEQ_END node has a flag, remember that we legalized it. AddLegalizedOperand(SDValue(Node, 0), Result.getValue(0)); @@ -1371,6 +1374,91 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { Tmp2 = LegalizeOp(Load.getValue(1)); break; } + + // If this is a promoted vector load, and the vector element types are + // legal, then scalarize it. + if (ExtType == ISD::EXTLOAD && SrcVT.isVector() && + isTypeLegal(Node->getValueType(0).getScalarType())) { + SmallVector<SDValue, 8> LoadVals; + SmallVector<SDValue, 8> LoadChains; + unsigned NumElem = SrcVT.getVectorNumElements(); + unsigned Stride = SrcVT.getScalarType().getSizeInBits()/8; + + for (unsigned Idx=0; Idx<NumElem; Idx++) { + Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, + DAG.getIntPtrConstant(Stride)); + SDValue ScalarLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, + Node->getValueType(0).getScalarType(), + Tmp1, Tmp2, LD->getPointerInfo().getWithOffset(Idx * Stride), + SrcVT.getScalarType(), + LD->isVolatile(), LD->isNonTemporal(), + LD->getAlignment()); + + LoadVals.push_back(ScalarLoad.getValue(0)); + LoadChains.push_back(ScalarLoad.getValue(1)); + } + Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &LoadChains[0], LoadChains.size()); + SDValue ValRes = DAG.getNode(ISD::BUILD_VECTOR, dl, + Node->getValueType(0), &LoadVals[0], LoadVals.size()); + + Tmp1 = LegalizeOp(ValRes); // Relegalize new nodes. + Tmp2 = LegalizeOp(Result.getValue(0)); // Relegalize new nodes. + break; + } + + // If this is a promoted vector load, and the vector element types are + // illegal, create the promoted vector from bitcasted segments. + if (ExtType == ISD::EXTLOAD && SrcVT.isVector()) { + EVT MemElemTy = Node->getValueType(0).getScalarType(); + EVT SrcSclrTy = SrcVT.getScalarType(); + unsigned SizeRatio = + (MemElemTy.getSizeInBits() / SrcSclrTy.getSizeInBits()); + + SmallVector<SDValue, 8> LoadVals; + SmallVector<SDValue, 8> LoadChains; + unsigned NumElem = SrcVT.getVectorNumElements(); + unsigned Stride = SrcVT.getScalarType().getSizeInBits()/8; + + for (unsigned Idx=0; Idx<NumElem; Idx++) { + Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, + DAG.getIntPtrConstant(Stride)); + SDValue ScalarLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, + SrcVT.getScalarType(), + Tmp1, Tmp2, LD->getPointerInfo().getWithOffset(Idx * Stride), + SrcVT.getScalarType(), + LD->isVolatile(), LD->isNonTemporal(), + LD->getAlignment()); + if (TLI.isBigEndian()) { + // MSB (which is garbage, comes first) + LoadVals.push_back(ScalarLoad.getValue(0)); + for (unsigned i = 0; i<SizeRatio-1; ++i) + LoadVals.push_back(DAG.getUNDEF(SrcVT.getScalarType())); + } else { + // LSB (which is data, comes first) + for (unsigned i = 0; i<SizeRatio-1; ++i) + LoadVals.push_back(DAG.getUNDEF(SrcVT.getScalarType())); + LoadVals.push_back(ScalarLoad.getValue(0)); + } + LoadChains.push_back(ScalarLoad.getValue(1)); + } + + Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &LoadChains[0], LoadChains.size()); + EVT TempWideVector = EVT::getVectorVT(*DAG.getContext(), + SrcVT.getScalarType(), NumElem*SizeRatio); + SDValue ValRes = DAG.getNode(ISD::BUILD_VECTOR, dl, + TempWideVector, &LoadVals[0], LoadVals.size()); + + // Cast to the correct type + ValRes = DAG.getNode(ISD::BITCAST, dl, Node->getValueType(0), ValRes); + + Tmp1 = LegalizeOp(ValRes); // Relegalize new nodes. + Tmp2 = LegalizeOp(Result.getValue(0)); // Relegalize new nodes. + break; + + } + // FIXME: This does not work for vectors on most targets. 
Sign- and // zero-extend operations are currently folded into extending loads, // whether they are legal or not, and then we end up here without any @@ -1546,6 +1634,88 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { Result = TLI.LowerOperation(Result, DAG); break; case Expand: + + EVT WideScalarVT = Tmp3.getValueType().getScalarType(); + EVT NarrowScalarVT = StVT.getScalarType(); + + // The Store type is illegal, must scalarize the vector store. + SmallVector<SDValue, 8> Stores; + bool ScalarLegal = isTypeLegal(WideScalarVT); + if (!isTypeLegal(StVT) && StVT.isVector() && ScalarLegal) { + unsigned NumElem = StVT.getVectorNumElements(); + + unsigned ScalarSize = StVT.getScalarType().getSizeInBits(); + // Round odd types to the next pow of two. + if (!isPowerOf2_32(ScalarSize)) + ScalarSize = NextPowerOf2(ScalarSize); + // Types smaller than 8 bits are promoted to 8 bits. + ScalarSize = std::max<unsigned>(ScalarSize, 8); + // Store stride + unsigned Stride = ScalarSize/8; + assert(isPowerOf2_32(Stride) && "Stride must be a power of two"); + + for (unsigned Idx=0; Idx<NumElem; Idx++) { + SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + WideScalarVT, Tmp3, DAG.getIntPtrConstant(Idx)); + + + EVT NVT = EVT::getIntegerVT(*DAG.getContext(), ScalarSize); + + Ex = DAG.getNode(ISD::TRUNCATE, dl, NVT, Ex); + Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, + DAG.getIntPtrConstant(Stride)); + SDValue Store = DAG.getStore(Tmp1, dl, Ex, Tmp2, + ST->getPointerInfo().getWithOffset(Idx*Stride), + isVolatile, isNonTemporal, Alignment); + Stores.push_back(Store); + } + Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &Stores[0], Stores.size()); + break; + } + + // The Store type is illegal, must scalarize the vector store. + // However, the scalar type is illegal. Must bitcast the result + // and store it in smaller parts. + if (!isTypeLegal(StVT) && StVT.isVector()) { + unsigned WideNumElem = StVT.getVectorNumElements(); + unsigned Stride = NarrowScalarVT.getSizeInBits()/8; + + unsigned SizeRatio = + (WideScalarVT.getSizeInBits() / NarrowScalarVT.getSizeInBits()); + + EVT CastValueVT = EVT::getVectorVT(*DAG.getContext(), NarrowScalarVT, + SizeRatio*WideNumElem); + + // Cast the wide elem vector to wider vec with smaller elem type. + // Example <2 x i64> -> <4 x i32> + Tmp3 = DAG.getNode(ISD::BITCAST, dl, CastValueVT, Tmp3); + + for (unsigned Idx=0; Idx<WideNumElem*SizeRatio; Idx++) { + // Extract element i + SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + NarrowScalarVT, Tmp3, DAG.getIntPtrConstant(Idx)); + // bump pointer. + Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, + DAG.getIntPtrConstant(Stride)); + + // Store this element if it is: + // - First element on big endian, or + // - Last element on little endian + if (( TLI.isBigEndian() && (Idx%SizeRatio == 0)) || + ((!TLI.isBigEndian() && (Idx%SizeRatio == SizeRatio-1)))) { + SDValue Store = DAG.getStore(Tmp1, dl, Ex, Tmp2, + ST->getPointerInfo().getWithOffset(Idx*Stride), + isVolatile, isNonTemporal, Alignment); + Stores.push_back(Store); + } + } + Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &Stores[0], Stores.size()); + break; + } + + // TRUNCSTORE:i16 i32 -> STORE i16 assert(isTypeLegal(StVT) && "Do not know how to expand this store!"); Tmp3 = DAG.getNode(ISD::TRUNCATE, dl, StVT, Tmp3); @@ -2007,7 +2177,6 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) { // and leave the Hi part unset.
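For the scalarized store above, the byte stride comes from rounding the element size up to a power of two and to at least one byte. A standalone reimplementation of just that sizing step (illustrative; the patch itself uses isPowerOf2_32/NextPowerOf2):

#include <cassert>

static unsigned roundUpPow2(unsigned X) {
  unsigned P = 1;
  while (P < X) P <<= 1;
  return P;
}

// Byte stride for one scalar element of ScalarBits bits.
static unsigned storeStrideBytes(unsigned ScalarBits) {
  unsigned Bits = roundUpPow2(ScalarBits);  // round odd sizes up
  if (Bits < 8) Bits = 8;                   // sub-byte types become whole bytes
  return Bits / 8;
}

int main() {
  assert(storeStrideBytes(1)  == 1);  // i1  -> 1 byte
  assert(storeStrideBytes(17) == 4);  // i17 -> 32 bits -> 4 bytes
  assert(storeStrideBytes(32) == 4);  // i32 -> 4 bytes
  return 0;
}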
SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned) { - assert(!IsLegalizingCall && "Cannot overlap legalization of calls!"); // The input chain to this libcall is the entry node of the function. // Legalizing the call will automatically add the previous call to the // dependence. @@ -2043,9 +2212,43 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, return DAG.getRoot(); // Legalize the call sequence, starting with the chain. This will advance + // the LastCALLSEQ to the legalized version of the CALLSEQ_END node that + // was added by LowerCallTo (guaranteeing proper serialization of calls). + LegalizeOp(CallInfo.second); + return CallInfo.first; +} + +/// ExpandLibCall - Generate a libcall taking the given operands as arguments +/// and returning a result of type RetVT. +SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, + const SDValue *Ops, unsigned NumOps, + bool isSigned, DebugLoc dl) { + TargetLowering::ArgListTy Args; + Args.reserve(NumOps); + + TargetLowering::ArgListEntry Entry; + for (unsigned i = 0; i != NumOps; ++i) { + Entry.Node = Ops[i]; + Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); + Entry.isSExt = isSigned; + Entry.isZExt = !isSigned; + Args.push_back(Entry); + } + SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), + TLI.getPointerTy()); + + const Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); + std::pair<SDValue,SDValue> CallInfo = + TLI.LowerCallTo(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false, + false, 0, TLI.getLibcallCallingConv(LC), false, + /*isReturnValueUsed=*/true, + Callee, Args, DAG, dl); + + // Legalize the call sequence, starting with the chain. This will advance // the LastCALLSEQ_END to the legalized version of the CALLSEQ_END node that // was added by LowerCallTo (guaranteeing proper serialization of calls). LegalizeOp(CallInfo.second); + return CallInfo.first; } @@ -2055,7 +2258,6 @@ std::pair<SDValue, SDValue> SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned) { - assert(!IsLegalizingCall && "Cannot overlap legalization of calls!"); SDValue InChain = Node->getOperand(0); TargetLowering::ArgListTy Args; @@ -2081,7 +2283,7 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC, Callee, Args, DAG, Node->getDebugLoc()); // Legalize the call sequence, starting with the chain. This will advance - // the LastCALLSEQ_END to the legalized version of the CALLSEQ_END node that + // the LastCALLSEQ to the legalized version of the CALLSEQ_END node that // was added by LowerCallTo (guaranteeing proper serialization of calls). LegalizeOp(CallInfo.second); return CallInfo; @@ -2121,10 +2323,9 @@ SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned, return ExpandLibCall(LC, Node, isSigned); } -/// ExpandDivRemLibCall - Issue libcalls to __{u}divmod to compute div / rem -/// pairs. -SDValue SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, bool isSigned, - bool isDIV) { +/// isDivRemLibcallAvailable - Return true if divmod libcall is available. +static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned, + const TargetLowering &TLI) { RTLIB::Libcall LC; switch (Node->getValueType(0).getSimpleVT().SimpleTy) { default: assert(0 && "Unexpected request for libcall!"); @@ -2135,17 +2336,18 @@ SDValue SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, bool isSigned, case MVT::i128: LC= isSigned ? 
RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break; } - if (!TLI.getLibcallName(LC)) - return SDValue(); + return TLI.getLibcallName(LC) != 0; +} - // Only issue divrem libcall if both quotient and remainder are needed. +/// UseDivRem - Only issue divrem libcall if both quotient and remainder are +/// needed. +static bool UseDivRem(SDNode *Node, bool isSigned, bool isDIV) { unsigned OtherOpcode = 0; - if (isSigned) { + if (isSigned) OtherOpcode = isDIV ? ISD::SREM : ISD::SDIV; - } else { + else OtherOpcode = isDIV ? ISD::UREM : ISD::UDIV; - } - SDNode *OtherNode = 0; + SDValue Op0 = Node->getOperand(0); SDValue Op1 = Node->getOperand(1); for (SDNode::use_iterator UI = Op0.getNode()->use_begin(), @@ -2155,32 +2357,28 @@ SDValue SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, bool isSigned, continue; if (User->getOpcode() == OtherOpcode && User->getOperand(0) == Op0 && - User->getOperand(1) == Op1) { - OtherNode = User; - break; - } + User->getOperand(1) == Op1) + return true; } - if (!OtherNode) - return SDValue(); + return false; +} - // If the libcall is already generated, no need to issue it again. - DenseMap<SDValue, SDValue>::iterator I - = LegalizedNodes.find(SDValue(OtherNode,0)); - if (I != LegalizedNodes.end()) { - OtherNode = I->second.getNode(); - SDNode *Chain = OtherNode->getOperand(0).getNode(); - for (SDNode::use_iterator UI = Chain->use_begin(), UE = Chain->use_end(); - UI != UE; ++UI) { - SDNode *User = *UI; - if (User == OtherNode) - continue; - if (isDIV) { - assert(User->getOpcode() == ISD::CopyFromReg); - } else { - assert(User->getOpcode() == ISD::LOAD); - } - return SDValue(User, 0); - } +/// ExpandDivRemLibCall - Issue libcalls to __{u}divmod to compute div / rem +/// pairs. +void +SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, + SmallVectorImpl<SDValue> &Results) { + unsigned Opcode = Node->getOpcode(); + bool isSigned = Opcode == ISD::SDIVREM; + + RTLIB::Libcall LC; + switch (Node->getValueType(0).getSimpleVT().SimpleTy) { + default: assert(0 && "Unexpected request for libcall!"); + case MVT::i8: LC= isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; + case MVT::i16: LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; + case MVT::i32: LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; + case MVT::i64: LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; + case MVT::i128: LC= isSigned ? RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break; } // The input chain to this libcall is the entry node of the function. @@ -2221,14 +2419,15 @@ SDValue SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, bool isSigned, /*isReturnValueUsed=*/true, Callee, Args, DAG, dl); // Legalize the call sequence, starting with the chain. This will advance - // the LastCALLSEQ_END to the legalized version of the CALLSEQ_END node that + // the LastCALLSEQ to the legalized version of the CALLSEQ_END node that // was added by LowerCallTo (guaranteeing proper serialization of calls). LegalizeOp(CallInfo.second); // Remainder is loaded back from the stack frame. - SDValue Rem = DAG.getLoad(RetVT, dl, LastCALLSEQ_END, FIPtr, + SDValue Rem = DAG.getLoad(RetVT, dl, getLastCALLSEQ(), FIPtr, MachinePointerInfo(), false, false, 0); - return isDIV ? 
CallInfo.first : Rem; + Results.push_back(CallInfo.first); + Results.push_back(Rem); } /// ExpandLegalINT_TO_FP - This function is responsible for legalizing a @@ -2878,7 +3077,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node, } case ISD::FP_ROUND_INREG: { // The only way we can lower this is to turn it into a TRUNCSTORE, - // EXTLOAD pair, targetting a temporary location (a stack slot). + // EXTLOAD pair, targeting a temporary location (a stack slot). // NOTE: there is a choice here between constantly creating new stack // slots and always reusing the same one. We currently always create @@ -3204,28 +3403,25 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node, unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM; Tmp2 = Node->getOperand(0); Tmp3 = Node->getOperand(1); - if (TLI.isOperationLegalOrCustom(DivRemOpc, VT)) { + if (TLI.isOperationLegalOrCustom(DivRemOpc, VT) || + (isDivRemLibcallAvailable(Node, isSigned, TLI) && + UseDivRem(Node, isSigned, false))) { Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Tmp2, Tmp3).getValue(1); } else if (TLI.isOperationLegalOrCustom(DivOpc, VT)) { // X % Y -> X-X/Y*Y Tmp1 = DAG.getNode(DivOpc, dl, VT, Tmp2, Tmp3); Tmp1 = DAG.getNode(ISD::MUL, dl, VT, Tmp1, Tmp3); Tmp1 = DAG.getNode(ISD::SUB, dl, VT, Tmp2, Tmp1); - } else if (isSigned) { - Tmp1 = ExpandDivRemLibCall(Node, true, false); - if (!Tmp1.getNode()) - Tmp1 = ExpandIntLibCall(Node, true, - RTLIB::SREM_I8, - RTLIB::SREM_I16, RTLIB::SREM_I32, - RTLIB::SREM_I64, RTLIB::SREM_I128); - } else { - Tmp1 = ExpandDivRemLibCall(Node, false, false); - if (!Tmp1.getNode()) - Tmp1 = ExpandIntLibCall(Node, false, - RTLIB::UREM_I8, - RTLIB::UREM_I16, RTLIB::UREM_I32, - RTLIB::UREM_I64, RTLIB::UREM_I128); - } + } else if (isSigned) + Tmp1 = ExpandIntLibCall(Node, true, + RTLIB::SREM_I8, + RTLIB::SREM_I16, RTLIB::SREM_I32, + RTLIB::SREM_I64, RTLIB::SREM_I128); + else + Tmp1 = ExpandIntLibCall(Node, false, + RTLIB::UREM_I8, + RTLIB::UREM_I16, RTLIB::UREM_I32, + RTLIB::UREM_I64, RTLIB::UREM_I128); Results.push_back(Tmp1); break; } @@ -3235,26 +3431,21 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node, unsigned DivRemOpc = isSigned ? 
ISD::SDIVREM : ISD::UDIVREM; EVT VT = Node->getValueType(0); SDVTList VTs = DAG.getVTList(VT, VT); - if (TLI.isOperationLegalOrCustom(DivRemOpc, VT)) + if (TLI.isOperationLegalOrCustom(DivRemOpc, VT) || + (isDivRemLibcallAvailable(Node, isSigned, TLI) && + UseDivRem(Node, isSigned, true))) Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Node->getOperand(0), Node->getOperand(1)); - else if (isSigned) { - Tmp1 = ExpandDivRemLibCall(Node, true, true); - if (!Tmp1.getNode()) { - Tmp1 = ExpandIntLibCall(Node, true, - RTLIB::SDIV_I8, - RTLIB::SDIV_I16, RTLIB::SDIV_I32, - RTLIB::SDIV_I64, RTLIB::SDIV_I128); - } - } else { - Tmp1 = ExpandDivRemLibCall(Node, false, true); - if (!Tmp1.getNode()) { - Tmp1 = ExpandIntLibCall(Node, false, - RTLIB::UDIV_I8, - RTLIB::UDIV_I16, RTLIB::UDIV_I32, - RTLIB::UDIV_I64, RTLIB::UDIV_I128); - } - } + else if (isSigned) + Tmp1 = ExpandIntLibCall(Node, true, + RTLIB::SDIV_I8, + RTLIB::SDIV_I16, RTLIB::SDIV_I32, + RTLIB::SDIV_I64, RTLIB::SDIV_I128); + else + Tmp1 = ExpandIntLibCall(Node, false, + RTLIB::UDIV_I8, + RTLIB::UDIV_I16, RTLIB::UDIV_I32, + RTLIB::UDIV_I64, RTLIB::UDIV_I128); Results.push_back(Tmp1); break; } @@ -3271,6 +3462,11 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node, Results.push_back(Tmp1.getValue(1)); break; } + case ISD::SDIVREM: + case ISD::UDIVREM: + // Expand into divrem libcall + ExpandDivRemLibCall(Node, Results); + break; case ISD::MUL: { EVT VT = Node->getValueType(0); SDVTList VTs = DAG.getVTList(VT, VT); @@ -3355,6 +3551,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node, case ISD::UMULO: case ISD::SMULO: { EVT VT = Node->getValueType(0); + EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() * 2); SDValue LHS = Node->getOperand(0); SDValue RHS = Node->getOperand(1); SDValue BottomHalf; @@ -3372,7 +3569,6 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node, TopHalf = BottomHalf.getValue(1); } else if (TLI.isTypeLegal(EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() * 2))) { - EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() * 2); LHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, LHS); RHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, RHS); Tmp1 = DAG.getNode(ISD::MUL, dl, WideVT, LHS, RHS); @@ -3385,7 +3581,6 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node, // have a libcall big enough. // Also, we can fall back to a division in some cases, but that's a big // performance hit in the general case. - EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() * 2); RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (WideVT == MVT::i16) LC = RTLIB::MUL_I16; @@ -3396,15 +3591,27 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node, else if (WideVT == MVT::i128) LC = RTLIB::MUL_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Cannot expand this operation!"); - LHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, LHS); - RHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, RHS); - SDValue Ret = ExpandLibCall(LC, Node, isSigned); - BottomHalf = DAG.getNode(ISD::TRUNCATE, dl, VT, Ret); - TopHalf = DAG.getNode(ISD::SRL, dl, Ret.getValueType(), Ret, - DAG.getConstant(VT.getSizeInBits(), TLI.getPointerTy())); - TopHalf = DAG.getNode(ISD::TRUNCATE, dl, VT, TopHalf); + // The high part is obtained by SRA'ing all but one of the bits of low + // part. 
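Why an arithmetic shift by LoSize-1 yields the high half: for a signed value, the top word of its doubled-width extension is just the sign bit replicated. A standalone check (assumes arithmetic right shift on signed types, which is what ISD::SRA guarantees):

#include <cassert>
#include <cstdint>

int main() {
  int32_t Lo = -5;                 // 0xFFFFFFFB
  int32_t Hi = Lo >> 31;           // all sign bits: 0xFFFFFFFF
  uint64_t Wide = ((uint64_t)(uint32_t)Hi << 32) | (uint32_t)Lo;
  assert((int64_t)Wide == -5);     // (Hi,Lo) is the sign extension of Lo
  return 0;
}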
+ unsigned LoSize = VT.getSizeInBits(); + SDValue HiLHS = DAG.getNode(ISD::SRA, dl, VT, LHS, + DAG.getConstant(LoSize-1, TLI.getPointerTy())); + SDValue HiRHS = DAG.getNode(ISD::SRA, dl, VT, RHS, + DAG.getConstant(LoSize-1, TLI.getPointerTy())); + + // Here we're passing the 2 arguments explicitly as 4 arguments that are + // pre-lowered to the correct types. This all depends upon WideVT not + // being a legal type for the architecture and thus has to be split to + // two arguments. + SDValue Args[] = { LHS, HiLHS, RHS, HiRHS }; + SDValue Ret = ExpandLibCall(LC, WideVT, Args, 4, isSigned, dl); + BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Ret, + DAG.getIntPtrConstant(0)); + TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Ret, + DAG.getIntPtrConstant(1)); } + if (isSigned) { Tmp1 = DAG.getConstant(VT.getSizeInBits() - 1, TLI.getShiftAmountTy(BottomHalf.getValueType())); @@ -3486,9 +3693,13 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node, Tmp2.getOperand(0), Tmp2.getOperand(1), Node->getOperand(2)); } else { + // We test only the i1 bit. Skip the AND if UNDEF. + Tmp3 = (Tmp2.getOpcode() == ISD::UNDEF) ? Tmp2 : + DAG.getNode(ISD::AND, dl, Tmp2.getValueType(), Tmp2, + DAG.getConstant(1, Tmp2.getValueType())); Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other, Tmp1, - DAG.getCondCode(ISD::SETNE), Tmp2, - DAG.getConstant(0, Tmp2.getValueType()), + DAG.getCondCode(ISD::SETNE), Tmp3, + DAG.getConstant(0, Tmp3.getValueType()), Node->getOperand(2)); } Results.push_back(Tmp1); @@ -3539,7 +3750,8 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node, LegalizeSetCCCondCode(TLI.getSetCCResultType(Tmp2.getValueType()), Tmp2, Tmp3, Tmp4, dl); - LastCALLSEQ_END = DAG.getEntryNode(); + assert(LastCALLSEQ.size() == 1 && "branch inside CALLSEQ_BEGIN/END?"); + setLastCALLSEQ(DAG.getEntryNode()); assert(!Tmp3.getNode() && "Can't legalize BR_CC with legal condition!"); Tmp3 = DAG.getConstant(0, Tmp2.getValueType()); @@ -3697,9 +3909,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node, // SelectionDAG::Legalize - This is the entry point for the file. // -void SelectionDAG::Legalize(CodeGenOpt::Level OptLevel) { +void SelectionDAG::Legalize() { /// run - This is the main entry point to this class.
/// - SelectionDAGLegalize(*this, OptLevel).LegalizeDAG(); + SelectionDAGLegalize(*this).LegalizeDAG(); } - diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 935aab0..da75b8a 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -73,6 +73,17 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::UNDEF: Res = PromoteIntRes_UNDEF(N); break; case ISD::VAARG: Res = PromoteIntRes_VAARG(N); break; + case ISD::EXTRACT_SUBVECTOR: + Res = PromoteIntRes_EXTRACT_SUBVECTOR(N); break; + case ISD::VECTOR_SHUFFLE: + Res = PromoteIntRes_VECTOR_SHUFFLE(N); break; + case ISD::INSERT_VECTOR_ELT: + Res = PromoteIntRes_INSERT_VECTOR_ELT(N); break; + case ISD::BUILD_VECTOR: + Res = PromoteIntRes_BUILD_VECTOR(N); break; + case ISD::SCALAR_TO_VECTOR: + Res = PromoteIntRes_SCALAR_TO_VECTOR(N); break; + case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: Res = PromoteIntRes_INT_EXTEND(N); break; @@ -174,24 +185,30 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) { default: assert(false && "Unknown type action!"); break; - case Legal: + case TargetLowering::TypeLegal: break; - case PromoteInteger: + case TargetLowering::TypePromoteInteger: if (NOutVT.bitsEq(NInVT)) // The input promotes to the same size. Convert the promoted value. return DAG.getNode(ISD::BITCAST, dl, NOutVT, GetPromotedInteger(InOp)); + if (NInVT.isVector()) + // Promote vector element via memory load/store. + return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, + CreateStackStoreLoad(InOp, OutVT)); break; - case SoftenFloat: + case TargetLowering::TypeSoftenFloat: // Promote the integer operand by hand. return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, GetSoftenedFloat(InOp)); - case ExpandInteger: - case ExpandFloat: + case TargetLowering::TypeExpandInteger: + case TargetLowering::TypeExpandFloat: break; - case ScalarizeVector: + case TargetLowering::TypeScalarizeVector: // Convert the element to an integer and promote it by hand. - return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, - BitConvertToInteger(GetScalarizedVector(InOp))); - case SplitVector: { + if (!NOutVT.isVector()) + return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, + BitConvertToInteger(GetScalarizedVector(InOp))); + break; + case TargetLowering::TypeSplitVector: { // For example, i32 = BITCAST v2i16 on alpha. Convert the split // pieces of the input into integers and reassemble in the final type. SDValue Lo, Hi; @@ -208,7 +225,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) { JoinIntegers(Lo, Hi)); return DAG.getNode(ISD::BITCAST, dl, NOutVT, InOp); } - case WidenVector: + case TargetLowering::TypeWidenVector: if (OutVT.bitsEq(NInVT)) // The input is widened to the same size. Convert to the widened value. 
return DAG.getNode(ISD::BITCAST, dl, OutVT, GetWidenedVector(InOp)); @@ -342,7 +359,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_INT_EXTEND(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); DebugLoc dl = N->getDebugLoc(); - if (getTypeAction(N->getOperand(0).getValueType()) == PromoteInteger) { + if (getTypeAction(N->getOperand(0).getValueType()) + == TargetLowering::TypePromoteInteger) { SDValue Res = GetPromotedInteger(N->getOperand(0)); assert(Res.getValueType().bitsLE(NVT) && "Extension doesn't make sense!"); @@ -507,11 +525,11 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) { switch (getTypeAction(N->getOperand(0).getValueType())) { default: llvm_unreachable("Unknown type action!"); - case Legal: - case ExpandInteger: + case TargetLowering::TypeLegal: + case TargetLowering::TypeExpandInteger: Res = N->getOperand(0); break; - case PromoteInteger: + case TargetLowering::TypePromoteInteger: Res = GetPromotedInteger(N->getOperand(0)); break; } @@ -557,9 +575,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) { DebugLoc DL = N->getDebugLoc(); EVT SmallVT = LHS.getValueType(); - // To determine if the result overflowed in a larger type, we extend the input - // to the larger type, do the multiply, then check the high bits of the result - // to see if the overflow happened. + // To determine if the result overflowed in a larger type, we extend the + // input to the larger type, do the multiply, then check the high bits of + // the result to see if the overflow happened. if (N->getOpcode() == ISD::SMULO) { LHS = SExtPromotedInteger(LHS); RHS = SExtPromotedInteger(RHS); @@ -569,8 +587,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) { } SDValue Mul = DAG.getNode(ISD::MUL, DL, LHS.getValueType(), LHS, RHS); - // Overflow occurred iff the high part of the result does not zero/sign-extend - // the low part. + // Overflow occurred iff the high part of the result does not + // zero/sign-extend the low part. SDValue Overflow; if (N->getOpcode() == ISD::UMULO) { // Unsigned overflow occurred iff the high part is non-zero. @@ -672,6 +690,8 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::BRCOND: Res = PromoteIntOp_BRCOND(N, OpNo); break; case ISD::BUILD_PAIR: Res = PromoteIntOp_BUILD_PAIR(N); break; case ISD::BUILD_VECTOR: Res = PromoteIntOp_BUILD_VECTOR(N); break; + case ISD::CONCAT_VECTORS: Res = PromoteIntOp_CONCAT_VECTORS(N); break; + case ISD::EXTRACT_VECTOR_ELT: Res = PromoteIntOp_EXTRACT_VECTOR_ELT(N); break; case ISD::CONVERT_RNDSAT: Res = PromoteIntOp_CONVERT_RNDSAT(N); break; case ISD::INSERT_VECTOR_ELT: @@ -952,7 +972,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ZERO_EXTEND(SDNode *N) { DebugLoc dl = N->getDebugLoc(); SDValue Op = GetPromotedInteger(N->getOperand(0)); Op = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Op); - return DAG.getZeroExtendInReg(Op, dl, N->getOperand(0).getValueType()); + return DAG.getZeroExtendInReg(Op, dl, + N->getOperand(0).getValueType().getScalarType()); } @@ -1513,7 +1534,8 @@ void DAGTypeLegalizer::ExpandIntRes_ANY_EXTEND(SDNode *N, } else { // For example, extension of an i48 to an i64. The operand type necessarily // promotes to the result type, so will end up being expanded too. 
- assert(getTypeAction(Op.getValueType()) == PromoteInteger && + assert(getTypeAction(Op.getValueType()) == + TargetLowering::TypePromoteInteger && "Only know how to promote this result!"); SDValue Res = GetPromotedInteger(Op); assert(Res.getValueType() == N->getValueType(0) && @@ -2030,7 +2052,8 @@ void DAGTypeLegalizer::ExpandIntRes_SIGN_EXTEND(SDNode *N, } else { // For example, extension of an i48 to an i64. The operand type necessarily // promotes to the result type, so will end up being expanded too. - assert(getTypeAction(Op.getValueType()) == PromoteInteger && + assert(getTypeAction(Op.getValueType()) == + TargetLowering::TypePromoteInteger && "Only know how to promote this result!"); SDValue Res = GetPromotedInteger(Op); assert(Res.getValueType() == N->getValueType(0) && @@ -2178,7 +2201,8 @@ void DAGTypeLegalizer::ExpandIntRes_ZERO_EXTEND(SDNode *N, } else { // For example, extension of an i48 to an i64. The operand type necessarily // promotes to the result type, so will end up being expanded too. - assert(getTypeAction(Op.getValueType()) == PromoteInteger && + assert(getTypeAction(Op.getValueType()) == + TargetLowering::TypePromoteInteger && "Only know how to promote this result!"); SDValue Res = GetPromotedInteger(Op); assert(Res.getValueType() == N->getValueType(0) && @@ -2613,3 +2637,158 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) { "Don't know how to expand this UINT_TO_FP!"); return MakeLibCall(LC, DstVT, &Op, 1, true, dl); } + +SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) { + SDValue InOp0 = N->getOperand(0); + EVT InVT = InOp0.getValueType(); + EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT); + + EVT OutVT = N->getValueType(0); + EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); + assert(NOutVT.isVector() && "This type must be promoted to a vector type"); + unsigned OutNumElems = N->getValueType(0).getVectorNumElements(); + EVT NOutVTElem = NOutVT.getVectorElementType(); + + DebugLoc dl = N->getDebugLoc(); + SDValue BaseIdx = N->getOperand(1); + + SmallVector<SDValue, 8> Ops; + for (unsigned i = 0; i != OutNumElems; ++i) { + + // Extract the element from the original vector. + SDValue Index = DAG.getNode(ISD::ADD, dl, BaseIdx.getValueType(), + BaseIdx, DAG.getIntPtrConstant(i)); + SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + InVT.getVectorElementType(), N->getOperand(0), Index); + + SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, Ext); + // Insert the converted element to the new vector. 
+ Ops.push_back(Op); + } + + return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, &Ops[0], Ops.size()); +} + + +SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SHUFFLE(SDNode *N) { + + ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N); + EVT VT = N->getValueType(0); + DebugLoc dl = N->getDebugLoc(); + + unsigned NumElts = VT.getVectorNumElements(); + SmallVector<int, 8> NewMask; + for (unsigned i = 0; i != NumElts; ++i) { + NewMask.push_back(SV->getMaskElt(i)); + } + + SDValue V0 = GetPromotedInteger(N->getOperand(0)); + SDValue V1 = GetPromotedInteger(N->getOperand(1)); + EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + + return DAG.getVectorShuffle(OutVT, dl, V0,V1, &NewMask[0]); +} + + +SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_VECTOR(SDNode *N) { + + SDValue InOp0 = N->getOperand(0); + EVT InVT = InOp0.getValueType(); + EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT); + + EVT OutVT = N->getValueType(0); + EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); + assert(NOutVT.isVector() && "This type must be promoted to a vector type"); + unsigned NumElems = N->getNumOperands(); + EVT NOutVTElem = NOutVT.getVectorElementType(); + + DebugLoc dl = N->getDebugLoc(); + + SmallVector<SDValue, 8> Ops; + for (unsigned i = 0; i != NumElems; ++i) { + SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(i)); + Ops.push_back(Op); + } + + return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, &Ops[0], Ops.size()); +} + +SDValue DAGTypeLegalizer::PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N) { + + DebugLoc dl = N->getDebugLoc(); + + SDValue InOp0 = N->getOperand(0); + EVT InVT = InOp0.getValueType(); + EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT); + assert(!InVT.isVector() && "Input must be a scalar"); + + EVT OutVT = N->getValueType(0); + EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); + assert(NOutVT.isVector() && "This type must be promoted to a vector type"); + EVT NOutVTElem = NOutVT.getVectorElementType(); + + SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(0)); + + return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NOutVT, Op); +} + +SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) { + + SDValue InOp0 = N->getOperand(0); + EVT InVT = InOp0.getValueType(); + EVT InElVT = InVT.getVectorElementType(); + EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT); + + EVT OutVT = N->getValueType(0); + EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); + assert(NOutVT.isVector() && "This type must be promoted to a vector type"); + + EVT NOutVTElem = NOutVT.getVectorElementType(); + + DebugLoc dl = N->getDebugLoc(); + + SDValue ConvertedVector = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, InOp0); + + SDValue ConvElem = DAG.getNode(ISD::ANY_EXTEND, dl, + NOutVTElem, N->getOperand(1)); + return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,NOutVT, + ConvertedVector, ConvElem, N->getOperand(2)); +} + +SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) { + DebugLoc dl = N->getDebugLoc(); + SDValue V0 = GetPromotedInteger(N->getOperand(0)); + SDValue V1 = N->getOperand(1); + SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + V0->getValueType(0).getScalarType(), V0, V1); + + return DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), Ext); + +} + +SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) { + + DebugLoc dl = N->getDebugLoc(); + + EVT RetSclrTy = N->getValueType(0).getVectorElementType(); + + SmallVector<SDValue, 8> NewOps; +
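The loop below flattens the promoted operands back down: every element is extracted, truncated to the result's element type, and collected into one vector. The same shape on concrete types (standalone sketch; e.g. two v2i16 operands promoted to v2i32 becoming one v4i16):

#include <cstdint>
#include <vector>

static std::vector<uint16_t>
concatTruncate(const std::vector<std::vector<uint32_t> > &Promoted) {
  std::vector<uint16_t> Out;
  for (size_t v = 0; v < Promoted.size(); ++v)        // each incoming vector
    for (size_t i = 0; i < Promoted[v].size(); ++i)   // extract each element
      Out.push_back((uint16_t)Promoted[v][i]);        // truncate to result type
  return Out;
}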
// For each incoming vector + for (unsigned VecIdx = 0, E = N->getNumOperands(); VecIdx!= E; ++VecIdx) { + SDValue Incoming = GetPromotedInteger(N->getOperand(VecIdx)); + EVT SclrTy = Incoming->getValueType(0).getVectorElementType(); + unsigned NumElem = Incoming->getValueType(0).getVectorNumElements(); + + for (unsigned i=0; i<NumElem; ++i) { + // Extract element from incoming vector + SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SclrTy, + Incoming, DAG.getIntPtrConstant(i)); + SDValue Tr = DAG.getNode(ISD::TRUNCATE, dl, RetSclrTy, Ex); + NewOps.push_back(Tr); + } + } + + return DAG.getNode(ISD::BUILD_VECTOR, dl, N->getValueType(0), + &NewOps[0], NewOps.size()); + } diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index cedda7e..ba658b0 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -224,38 +224,38 @@ bool DAGTypeLegalizer::run() { switch (getTypeAction(ResultVT)) { default: assert(false && "Unknown action!"); - case Legal: + case TargetLowering::TypeLegal: break; // The following calls must take care of *all* of the node's results, // not just the illegal result they were passed (this includes results // with a legal type). Results can be remapped using ReplaceValueWith, // or their promoted/expanded/etc values registered in PromotedIntegers, // ExpandedIntegers etc. - case PromoteInteger: + case TargetLowering::TypePromoteInteger: PromoteIntegerResult(N, i); Changed = true; goto NodeDone; - case ExpandInteger: + case TargetLowering::TypeExpandInteger: ExpandIntegerResult(N, i); Changed = true; goto NodeDone; - case SoftenFloat: + case TargetLowering::TypeSoftenFloat: SoftenFloatResult(N, i); Changed = true; goto NodeDone; - case ExpandFloat: + case TargetLowering::TypeExpandFloat: ExpandFloatResult(N, i); Changed = true; goto NodeDone; - case ScalarizeVector: + case TargetLowering::TypeScalarizeVector: ScalarizeVectorResult(N, i); Changed = true; goto NodeDone; - case SplitVector: + case TargetLowering::TypeSplitVector: SplitVectorResult(N, i); Changed = true; goto NodeDone; - case WidenVector: + case TargetLowering::TypeWidenVector: WidenVectorResult(N, i); Changed = true; goto NodeDone; @@ -277,36 +277,36 @@ ScanOperands: switch (getTypeAction(OpVT)) { default: assert(false && "Unknown action!"); - case Legal: + case TargetLowering::TypeLegal: continue; // The following calls must either replace all of the node's results // using ReplaceValueWith, and return "false"; or update the node's // operands in place, and return "true". 
- case PromoteInteger: + case TargetLowering::TypePromoteInteger: NeedsReanalyzing = PromoteIntegerOperand(N, i); Changed = true; break; - case ExpandInteger: + case TargetLowering::TypeExpandInteger: NeedsReanalyzing = ExpandIntegerOperand(N, i); Changed = true; break; - case SoftenFloat: + case TargetLowering::TypeSoftenFloat: NeedsReanalyzing = SoftenFloatOperand(N, i); Changed = true; break; - case ExpandFloat: + case TargetLowering::TypeExpandFloat: NeedsReanalyzing = ExpandFloatOperand(N, i); Changed = true; break; - case ScalarizeVector: + case TargetLowering::TypeScalarizeVector: NeedsReanalyzing = ScalarizeVectorOperand(N, i); Changed = true; break; - case SplitVector: + case TargetLowering::TypeSplitVector: NeedsReanalyzing = SplitVectorOperand(N, i); Changed = true; break; - case WidenVector: + case TargetLowering::TypeWidenVector: NeedsReanalyzing = WidenVectorOperand(N, i); Changed = true; break; diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 5409b88..06dc40f 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -57,16 +57,6 @@ public: // 1+ - This is a node which has this many unprocessed operands. }; private: - enum LegalizeAction { - Legal, // The target natively supports this type. - PromoteInteger, // Replace this integer type with a larger one. - ExpandInteger, // Split this integer type into two of half the size. - SoftenFloat, // Convert this float type to a same size integer type. - ExpandFloat, // Split this float type into two of half the size. - ScalarizeVector, // Replace this one-element vector with its element type. - SplitVector, // Split this vector type into two of half the size. - WidenVector // This vector type should be widened into a larger vector. - }; /// ValueTypeActions - This is a bitvector that contains two bits for each /// simple value type, where the two bits correspond to the LegalizeAction @@ -74,41 +64,13 @@ private: TargetLowering::ValueTypeActionImpl ValueTypeActions; /// getTypeAction - Return how we should legalize values of this type. - LegalizeAction getTypeAction(EVT VT) const { - switch (ValueTypeActions.getTypeAction(VT)) { - default: - assert(false && "Unknown legalize action!"); - case TargetLowering::Legal: - return Legal; - case TargetLowering::Promote: - // Promote can mean - // 1) For integers, use a larger integer type (e.g. i8 -> i32). - // 2) For vectors, use a wider vector type (e.g. v3i32 -> v4i32). - if (!VT.isVector()) - return PromoteInteger; - return WidenVector; - case TargetLowering::Expand: - // Expand can mean - // 1) split scalar in half, 2) convert a float to an integer, - // 3) scalarize a single-element vector, 4) split a vector in two. - if (!VT.isVector()) { - if (VT.isInteger()) - return ExpandInteger; - if (VT.getSizeInBits() == - TLI.getTypeToTransformTo(*DAG.getContext(), VT).getSizeInBits()) - return SoftenFloat; - return ExpandFloat; - } - - if (VT.getVectorNumElements() == 1) - return ScalarizeVector; - return SplitVector; - } + TargetLowering::LegalizeTypeAction getTypeAction(EVT VT) const { + return TLI.getTypeAction(*DAG.getContext(), VT); } /// isTypeLegal - Return true if this type is legal on this target. bool isTypeLegal(EVT VT) const { - return ValueTypeActions.getTypeAction(VT) == TargetLowering::Legal; + return TLI.getTypeAction(*DAG.getContext(), VT) == TargetLowering::TypeLegal; } /// IgnoreNodeResults - Pretend all of this node's results are legal. 
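A minimal sketch of how client code dispatches on the consolidated enum after
this change (hypothetical snippet; getTypeAction here is the DAGTypeLegalizer
wrapper above, which simply forwards to TLI):

    switch (getTypeAction(VT)) {
    case TargetLowering::TypeLegal:
      break;                              // nothing to do
    case TargetLowering::TypePromoteInteger:
      Op = GetPromotedInteger(Op);        // use the promoted value
      break;
    case TargetLowering::TypeWidenVector:
      Op = GetWidenedVector(Op);          // use the widened vector
      break;
    default:
      break;
    }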
@@ -239,7 +201,7 @@ private: EVT OldVT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); Op = GetPromotedInteger(Op); - return DAG.getZeroExtendInReg(Op, dl, OldVT); + return DAG.getZeroExtendInReg(Op, dl, OldVT.getScalarType()); } // Integer Result Promotion. @@ -248,6 +210,11 @@ private: SDValue PromoteIntRes_AssertZext(SDNode *N); SDValue PromoteIntRes_Atomic1(AtomicSDNode *N); SDValue PromoteIntRes_Atomic2(AtomicSDNode *N); + SDValue PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N); + SDValue PromoteIntRes_VECTOR_SHUFFLE(SDNode *N); + SDValue PromoteIntRes_BUILD_VECTOR(SDNode *N); + SDValue PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N); + SDValue PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N); SDValue PromoteIntRes_BITCAST(SDNode *N); SDValue PromoteIntRes_BSWAP(SDNode *N); SDValue PromoteIntRes_BUILD_PAIR(SDNode *N); @@ -289,6 +256,9 @@ private: SDValue PromoteIntOp_BUILD_VECTOR(SDNode *N); SDValue PromoteIntOp_CONVERT_RNDSAT(SDNode *N); SDValue PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, unsigned OpNo); + SDValue PromoteIntOp_EXTRACT_ELEMENT(SDNode *N); + SDValue PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N); + SDValue PromoteIntOp_CONCAT_VECTORS(SDNode *N); SDValue PromoteIntOp_MEMBARRIER(SDNode *N); SDValue PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N); SDValue PromoteIntOp_SELECT(SDNode *N, unsigned OpNo); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index a75ae87..85ea6b6 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -43,36 +43,36 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) { switch (getTypeAction(InVT)) { default: assert(false && "Unknown type action!"); - case Legal: - case PromoteInteger: + case TargetLowering::TypeLegal: + case TargetLowering::TypePromoteInteger: break; - case SoftenFloat: + case TargetLowering::TypeSoftenFloat: // Convert the integer operand instead. SplitInteger(GetSoftenedFloat(InOp), Lo, Hi); Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo); Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi); return; - case ExpandInteger: - case ExpandFloat: + case TargetLowering::TypeExpandInteger: + case TargetLowering::TypeExpandFloat: // Convert the expanded pieces of the input. GetExpandedOp(InOp, Lo, Hi); Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo); Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi); return; - case SplitVector: + case TargetLowering::TypeSplitVector: GetSplitVector(InOp, Lo, Hi); if (TLI.isBigEndian()) std::swap(Lo, Hi); Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo); Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi); return; - case ScalarizeVector: + case TargetLowering::TypeScalarizeVector: // Convert the element instead. 
SplitInteger(BitConvertToInteger(GetScalarizedVector(InOp)), Lo, Hi); Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo); Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi); return; - case WidenVector: { + case TargetLowering::TypeWidenVector: { assert(!(InVT.getVectorNumElements() & 1) && "Unsupported BITCAST"); InOp = GetWidenedVector(InOp); EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(), diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 0b4dd35..b5698f9 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -526,13 +526,13 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, switch (getTypeAction(InVT)) { default: assert(false && "Unknown type action!"); - case Legal: - case PromoteInteger: - case SoftenFloat: - case ScalarizeVector: + case TargetLowering::TypeLegal: + case TargetLowering::TypePromoteInteger: + case TargetLowering::TypeSoftenFloat: + case TargetLowering::TypeScalarizeVector: break; - case ExpandInteger: - case ExpandFloat: + case TargetLowering::TypeExpandInteger: + case TargetLowering::TypeExpandFloat: // A scalar to vector conversion, where the scalar needs expansion. // If the vector is being split in two then we can just convert the // expanded pieces. @@ -545,7 +545,7 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, return; } break; - case SplitVector: + case TargetLowering::TypeSplitVector: // If the input is a vector that needs to be split, convert each split // piece of the input now. GetSplitVector(InOp, Lo, Hi); @@ -774,7 +774,7 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, EVT InVT = N->getOperand(0).getValueType(); switch (getTypeAction(InVT)) { default: llvm_unreachable("Unexpected type action!"); - case Legal: { + case TargetLowering::TypeLegal: { EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(), LoVT.getVectorNumElements()); Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, N->getOperand(0), @@ -783,10 +783,21 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, DAG.getIntPtrConstant(InNVT.getVectorNumElements())); break; } - case SplitVector: + case TargetLowering::TypePromoteInteger: { + SDValue InOp = GetPromotedInteger(N->getOperand(0)); + EVT InNVT = EVT::getVectorVT(*DAG.getContext(), + InOp.getValueType().getVectorElementType(), + LoVT.getVectorNumElements()); + Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp, + DAG.getIntPtrConstant(0)); + Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp, + DAG.getIntPtrConstant(InNVT.getVectorNumElements())); + break; + } + case TargetLowering::TypeSplitVector: GetSplitVector(N->getOperand(0), Lo, Hi); break; - case WidenVector: { + case TargetLowering::TypeWidenVector: { // If the result needs to be split and the input needs to be widened, // the two types must have different lengths. Use the widened result // and extract from it to do the split. 
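The new TargetLowering::TypePromoteInteger case above handles a split result
whose input vector is integer-promoted. Illustrative shapes, assuming a target
that promotes v8i8 to v8i16 and splits v8i32:

    // sext <8 x i8> %v to <8 x i32>, where %v was promoted to v8i16:
    //   InOp = GetPromotedInteger(%v)          ; v8i16
    //   Lo   = extract_subvector InOp, 0       ; v4i16
    //   Hi   = extract_subvector InOp, 4       ; v4i16
    // Each half then feeds one v4i32 half of the split result.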
@@ -1439,7 +1450,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { unsigned Opcode = N->getOpcode(); unsigned InVTNumElts = InVT.getVectorNumElements(); - if (getTypeAction(InVT) == WidenVector) { + if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) { InOp = GetWidenedVector(N->getOperand(0)); InVT = InOp.getValueType(); InVTNumElts = InVT.getVectorNumElements(); @@ -1515,7 +1526,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Shift(SDNode *N) { SDValue ShOp = N->getOperand(1); EVT ShVT = ShOp.getValueType(); - if (getTypeAction(ShVT) == WidenVector) { + if (getTypeAction(ShVT) == TargetLowering::TypeWidenVector) { ShOp = GetWidenedVector(ShOp); ShVT = ShOp.getValueType(); } @@ -1557,9 +1568,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) { default: assert(false && "Unknown type action!"); break; - case Legal: + case TargetLowering::TypeLegal: break; - case PromoteInteger: + case TargetLowering::TypePromoteInteger: // If the InOp is promoted to the same size, convert it. Otherwise, // fall out of the switch and widen the promoted input. InOp = GetPromotedInteger(InOp); @@ -1567,13 +1578,13 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) { if (WidenVT.bitsEq(InVT)) return DAG.getNode(ISD::BITCAST, dl, WidenVT, InOp); break; - case SoftenFloat: - case ExpandInteger: - case ExpandFloat: - case ScalarizeVector: - case SplitVector: + case TargetLowering::TypeSoftenFloat: + case TargetLowering::TypeExpandInteger: + case TargetLowering::TypeExpandFloat: + case TargetLowering::TypeScalarizeVector: + case TargetLowering::TypeSplitVector: break; - case WidenVector: + case TargetLowering::TypeWidenVector: // If the InOp is widened to the same size, convert it. Otherwise, fall // out of the switch and widen the widened input. InOp = GetWidenedVector(InOp); @@ -1653,7 +1664,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { unsigned NumOperands = N->getNumOperands(); bool InputWidened = false; // Indicates we need to widen the input. - if (getTypeAction(InVT) != WidenVector) { + if (getTypeAction(InVT) != TargetLowering::TypeWidenVector) { if (WidenVT.getVectorNumElements() % InVT.getVectorNumElements() == 0) { // Add undef vectors to widen to correct length. 
unsigned NumConcat = WidenVT.getVectorNumElements() / @@ -1732,7 +1743,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) { ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode(); unsigned InVTNumElts = InVT.getVectorNumElements(); - if (getTypeAction(InVT) == WidenVector) { + if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) { InOp = GetWidenedVector(InOp); InVT = InOp.getValueType(); InVTNumElts = InVT.getVectorNumElements(); @@ -1800,7 +1811,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) { SDValue Idx = N->getOperand(1); DebugLoc dl = N->getDebugLoc(); - if (getTypeAction(InOp.getValueType()) == WidenVector) + if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector) InOp = GetWidenedVector(InOp); EVT InVT = InOp.getValueType(); @@ -1882,7 +1893,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) { EVT CondEltVT = CondVT.getVectorElementType(); EVT CondWidenVT = EVT::getVectorVT(*DAG.getContext(), CondEltVT, WidenNumElts); - if (getTypeAction(CondVT) == WidenVector) + if (getTypeAction(CondVT) == TargetLowering::TypeWidenVector) Cond1 = GetWidenedVector(Cond1); if (Cond1.getValueType() != CondWidenVT) @@ -2026,7 +2037,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) { DebugLoc dl = N->getDebugLoc(); unsigned NumElts = VT.getVectorNumElements(); SDValue InOp = N->getOperand(0); - if (getTypeAction(InOp.getValueType()) == WidenVector) + if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector) InOp = GetWidenedVector(InOp); EVT InVT = InOp.getValueType(); EVT InEltVT = InVT.getVectorElementType(); @@ -2081,7 +2092,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) { unsigned NumOperands = N->getNumOperands(); for (unsigned i=0; i < NumOperands; ++i) { SDValue InOp = N->getOperand(i); - if (getTypeAction(InOp.getValueType()) == WidenVector) + if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector) InOp = GetWidenedVector(InOp); for (unsigned j=0; j < NumInElts; ++j) Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, @@ -2153,6 +2164,7 @@ static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI, if (MemVT.getSizeInBits() <= WidenEltWidth) break; if (TLI.isTypeLegal(MemVT) && (WidenWidth % MemVTWidth) == 0 && + isPowerOf2_32(WidenWidth / MemVTWidth) && (MemVTWidth <= Width || (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) { RetVT = MemVT; @@ -2168,6 +2180,7 @@ static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI, unsigned MemVTWidth = MemVT.getSizeInBits(); if (TLI.isTypeLegal(MemVT) && WidenEltVT == MemVT.getVectorElementType() && (WidenWidth % MemVTWidth) == 0 && + isPowerOf2_32(WidenWidth / MemVTWidth) && (MemVTWidth <= Width || (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) { if (RetVT.getSizeInBits() < MemVTWidth || MemVT == WidenVT) diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index b2e9c15..f09b381 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -71,6 +71,7 @@ static cl::opt<bool> DisableSchedCycles( cl::desc("Disable cycle-level precision during preRA scheduling")); // Temporary sched=list-ilp flags until the heuristics are robust. +// Some options are also available under sched=list-hybrid. 
static cl::opt<bool> DisableSchedRegPressure( "disable-sched-reg-pressure", cl::Hidden, cl::init(false), cl::desc("Disable regpressure priority in sched=list-ilp")); @@ -80,6 +81,9 @@ static cl::opt<bool> DisableSchedLiveUses( static cl::opt<bool> DisableSchedVRegCycle( "disable-sched-vrcycle", cl::Hidden, cl::init(false), cl::desc("Disable virtual register cycle interference checks")); +static cl::opt<bool> DisableSchedPhysRegJoin( + "disable-sched-physreg-join", cl::Hidden, cl::init(false), + cl::desc("Disable physreg def-use affinity")); static cl::opt<bool> DisableSchedStalls( "disable-sched-stalls", cl::Hidden, cl::init(true), cl::desc("Disable no-stall priority in sched=list-ilp")); @@ -102,11 +106,11 @@ static cl::opt<unsigned> AvgIPC( #ifndef NDEBUG namespace { // For sched=list-ilp, Count the number of times each factor comes into play. - enum { FactPressureDiff, FactRegUses, FactHeight, FactDepth, FactStatic, - FactOther, NumFactors }; + enum { FactPressureDiff, FactRegUses, FactStall, FactHeight, FactDepth, + FactStatic, FactOther, NumFactors }; } static const char *FactorName[NumFactors] = -{"PressureDiff", "RegUses", "Height", "Depth","Static", "Other"}; +{"PressureDiff", "RegUses", "Stall", "Height", "Depth","Static", "Other"}; static int FactorCount[NumFactors]; #endif //!NDEBUG @@ -272,6 +276,33 @@ private: }; } // end anonymous namespace +/// GetCostForDef - Looks up the register class and cost for a given definition. +/// Typically this just means looking up the representative register class, +/// but for untyped values (MVT::untyped) it means inspecting the node's +/// opcode to determine what register class is being generated. +static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos, + const TargetLowering *TLI, + const TargetInstrInfo *TII, + const TargetRegisterInfo *TRI, + unsigned &RegClass, unsigned &Cost) { + EVT VT = RegDefPos.GetValue(); + + // Special handling for untyped values. These values can only come from + // the expansion of custom DAG-to-DAG patterns. + if (VT == MVT::untyped) { + unsigned Opcode = RegDefPos.GetNode()->getMachineOpcode(); + unsigned Idx = RegDefPos.GetIdx(); + const TargetInstrDesc Desc = TII->get(Opcode); + const TargetRegisterClass *RC = Desc.getRegClass(Idx, TRI); + RegClass = RC->getID(); + // FIXME: Cost arbitrarily set to 1 because there doesn't seem to be a + // better way to determine it. + Cost = 1; + } else { + RegClass = TLI->getRepRegClassFor(VT)->getID(); + Cost = TLI->getRepRegClassCostFor(VT); + } +} /// Schedule - Schedule the DAG using list scheduling. void ScheduleDAGRRList::Schedule() { @@ -463,6 +494,13 @@ void ScheduleDAGRRList::AdvancePastStalls(SUnit *SU) { if (DisableSchedCycles) return; + // FIXME: Nodes such as CopyFromReg probably should not advance the current + // cycle. Otherwise, we can wrongly mask real stalls. If the non-machine node + // has predecessors the cycle will be advanced when they are scheduled. + // But given the crude nature of modeling latency though such nodes, we + // currently need to treat these nodes like real instructions. + // if (!SU->getNode() || !SU->getNode()->isMachineOpcode()) return; + unsigned ReadyCycle = isBottomUp ? SU->getHeight() : SU->getDepth(); // Bump CurCycle to account for latency. We assume the latency of other @@ -533,6 +571,8 @@ void ScheduleDAGRRList::EmitNode(SUnit *SU) { } } +static void resetVRegCycle(SUnit *SU); + /// ScheduleNodeBottomUp - Add the node to the schedule. Decrement the pending /// count of its predecessors. 
If a predecessor pending count is zero, add it to /// the Available queue. @@ -542,7 +582,8 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) { #ifndef NDEBUG if (CurCycle < SU->getHeight()) - DEBUG(dbgs() << " Height [" << SU->getHeight() << "] pipeline stall!\n"); + DEBUG(dbgs() << " Height [" << SU->getHeight() + << "] pipeline stall!\n"); #endif // FIXME: Do not modify node height. It may interfere with @@ -559,7 +600,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) { AvailableQueue->ScheduledNode(SU); // If HazardRec is disabled, and each inst counts as one cycle, then - // advance CurCycle before ReleasePredecessors to avoid useles pushed to + // advance CurCycle before ReleasePredecessors to avoid useless pushes to // PendingQueue for schedulers that implement HasReadyFilter. if (!HazardRec->isEnabled() && AvgIPC < 2) AdvanceToCycle(CurCycle + 1); @@ -580,20 +621,25 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) { } } + resetVRegCycle(SU); + SU->isScheduled = true; // Conditions under which the scheduler should eagerly advance the cycle: // (1) No available instructions // (2) All pipelines full, so available instructions must have hazards. // - // If HazardRec is disabled, the cycle was advanced earlier. + // If HazardRec is disabled, the cycle was pre-advanced before calling + // ReleasePredecessors. In that case, IssueCount should remain 0. // // Check AvailableQueue after ReleasePredecessors in case of zero latency. - ++IssueCount; - if ((HazardRec->isEnabled() && HazardRec->atIssueLimit()) - || (!HazardRec->isEnabled() && AvgIPC > 1 && IssueCount == AvgIPC) - || AvailableQueue->empty()) - AdvanceToCycle(CurCycle + 1); + if (HazardRec->isEnabled() || AvgIPC > 1) { + if (SU->getNode() && SU->getNode()->isMachineOpcode()) + ++IssueCount; + if ((HazardRec->isEnabled() && HazardRec->atIssueLimit()) + || (!HazardRec->isEnabled() && IssueCount == AvgIPC)) + AdvanceToCycle(CurCycle + 1); + } } /// CapturePred - This does the opposite of ReleasePred. Since SU is being @@ -989,14 +1035,15 @@ static void CheckForLiveRegDef(SUnit *SU, unsigned Reg, for (const unsigned *AliasI = TRI->getOverlaps(Reg); *AliasI; ++AliasI) { // Check if Ref is live. - if (!LiveRegDefs[Reg]) continue; + if (!LiveRegDefs[*AliasI]) continue; // Allow multiple uses of the same def. - if (LiveRegDefs[Reg] == SU) continue; + if (LiveRegDefs[*AliasI] == SU) continue; // Add Reg to the set of interfering live regs. - if (RegAdded.insert(Reg)) - LRegs.push_back(Reg); + if (RegAdded.insert(*AliasI)) { + LRegs.push_back(*AliasI); + } } } @@ -1220,7 +1267,7 @@ void ScheduleDAGRRList::ListScheduleBottomUp() { // priority. If it is not ready put it back. Schedule the node. Sequence.reserve(SUnits.size()); while (!AvailableQueue->empty()) { - DEBUG(dbgs() << "\n*** Examining Available\n"; + DEBUG(dbgs() << "\nExamining Available:\n"; AvailableQueue->dump(this)); // Pick the best node to schedule taking all constraints into @@ -1349,6 +1396,21 @@ struct queue_sort : public std::binary_function<SUnit*, SUnit*, bool> { bool isReady(SUnit* SU, unsigned CurCycle) const { return true; } }; +#ifndef NDEBUG +template<class SF> +struct reverse_sort : public queue_sort { + SF &SortFunc; + reverse_sort(SF &sf) : SortFunc(sf) {} + reverse_sort(const reverse_sort &RHS) : SortFunc(RHS.SortFunc) {} + + bool operator()(SUnit* left, SUnit* right) const { + // reverse left/right rather than simply !SortFunc(left, right) + // to expose different paths in the comparison logic. 
+ return SortFunc(right, left); + } +}; +#endif // NDEBUG + /// bu_ls_rr_sort - Priority function for bottom up register pressure // reduction scheduler. struct bu_ls_rr_sort : public queue_sort { @@ -1549,20 +1611,33 @@ protected: }; template<class SF> -class RegReductionPriorityQueue : public RegReductionPQBase { - static SUnit *popFromQueue(std::vector<SUnit*> &Q, SF &Picker) { - std::vector<SUnit *>::iterator Best = Q.begin(); - for (std::vector<SUnit *>::iterator I = llvm::next(Q.begin()), - E = Q.end(); I != E; ++I) - if (Picker(*Best, *I)) - Best = I; - SUnit *V = *Best; - if (Best != prior(Q.end())) - std::swap(*Best, Q.back()); - Q.pop_back(); - return V; +static SUnit *popFromQueueImpl(std::vector<SUnit*> &Q, SF &Picker) { + std::vector<SUnit *>::iterator Best = Q.begin(); + for (std::vector<SUnit *>::iterator I = llvm::next(Q.begin()), + E = Q.end(); I != E; ++I) + if (Picker(*Best, *I)) + Best = I; + SUnit *V = *Best; + if (Best != prior(Q.end())) + std::swap(*Best, Q.back()); + Q.pop_back(); + return V; +} + +template<class SF> +SUnit *popFromQueue(std::vector<SUnit*> &Q, SF &Picker, ScheduleDAG *DAG) { +#ifndef NDEBUG + if (DAG->StressSched) { + reverse_sort<SF> RPicker(Picker); + return popFromQueueImpl(Q, RPicker); } +#endif + (void)DAG; + return popFromQueueImpl(Q, Picker); +} +template<class SF> +class RegReductionPriorityQueue : public RegReductionPQBase { SF Picker; public: @@ -1583,7 +1658,7 @@ public: SUnit *pop() { if (Queue.empty()) return NULL; - SUnit *V = popFromQueue(Queue, Picker); + SUnit *V = popFromQueue(Queue, Picker, scheduleDAG); V->NodeQueueId = 0; return V; } @@ -1593,7 +1668,7 @@ public: std::vector<SUnit*> DumpQueue = Queue; SF DumpPicker = Picker; while (!DumpQueue.empty()) { - SUnit *SU = popFromQueue(DumpQueue, DumpPicker); + SUnit *SU = popFromQueue(DumpQueue, DumpPicker, scheduleDAG); if (isBottomUp()) dbgs() << "Height " << SU->getHeight() << ": "; else @@ -1623,6 +1698,20 @@ ILPBURRPriorityQueue; // Static Node Priority for Register Pressure Reduction //===----------------------------------------------------------------------===// +// Check for special nodes that bypass scheduling heuristics. +// Currently this pushes TokenFactor nodes down, but may be used for other +// pseudo-ops as well. +// +// Return -1 to schedule right above left, 1 for left above right. +// Return 0 if no bias exists. +static int checkSpecialNodes(const SUnit *left, const SUnit *right) { + bool LSchedLow = left->isScheduleLow; + bool RSchedLow = right->isScheduleLow; + if (LSchedLow != RSchedLow) + return LSchedLow < RSchedLow ? 1 : -1; + return 0; +} + /// CalcNodeSethiUllmanNumber - Compute Sethi Ullman number. /// Smaller number is the higher priority. static unsigned @@ -1661,17 +1750,6 @@ void RegReductionPQBase::CalculateSethiUllmanNumbers() { CalcNodeSethiUllmanNumber(&(*SUnits)[i], SethiUllmanNumbers); } -void RegReductionPQBase::initNodes(std::vector<SUnit> &sunits) { - SUnits = &sunits; - // Add pseudo dependency edges for two-address nodes. - AddPseudoTwoAddrDeps(); - // Reroute edges to nodes with multiple uses. - if (!TracksRegPressure) - PrescheduleNodesWithMultipleUses(); - // Calculate node priorities. 
- CalculateSethiUllmanNumbers(); -} - void RegReductionPQBase::addNode(const SUnit *SU) { unsigned SUSize = SethiUllmanNumbers.size(); if (SUnits->size() > SUSize) @@ -1710,7 +1788,17 @@ unsigned RegReductionPQBase::getNodePriority(const SUnit *SU) const { // If SU does not have a register def, schedule it close to its uses // because it does not lengthen any live ranges. return 0; +#if 1 return SethiUllmanNumbers[SU->NodeNum]; +#else + unsigned Priority = SethiUllmanNumbers[SU->NodeNum]; + if (SU->isCallOp) { + // FIXME: This assumes all of the defs are used as call operands. + int NP = (int)Priority - SU->getNode()->getNumValues(); + return (NP > 0) ? NP : 0; + } + return Priority; +#endif } //===----------------------------------------------------------------------===// @@ -1746,8 +1834,10 @@ bool RegReductionPQBase::HighRegPressure(const SUnit *SU) const { for (ScheduleDAGSDNodes::RegDefIter RegDefPos(PredSU, scheduleDAG); RegDefPos.IsValid(); RegDefPos.Advance()) { EVT VT = RegDefPos.GetValue(); - unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); - unsigned Cost = TLI->getRepRegClassCostFor(VT); + + unsigned RCId, Cost; + GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost); + if ((RegPressure[RCId] + Cost) >= RegLimit[RCId]) return true; } @@ -1858,9 +1948,10 @@ void RegReductionPQBase::ScheduledNode(SUnit *SU) { RegDefPos.IsValid(); RegDefPos.Advance(), --SkipRegDefs) { if (SkipRegDefs) continue; - EVT VT = RegDefPos.GetValue(); - unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); - RegPressure[RCId] += TLI->getRepRegClassCostFor(VT); + + unsigned RCId, Cost; + GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost); + RegPressure[RCId] += Cost; break; } } @@ -1873,16 +1964,16 @@ void RegReductionPQBase::ScheduledNode(SUnit *SU) { RegDefPos.IsValid(); RegDefPos.Advance(), --SkipRegDefs) { if (SkipRegDefs > 0) continue; - EVT VT = RegDefPos.GetValue(); - unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); - if (RegPressure[RCId] < TLI->getRepRegClassCostFor(VT)) { + unsigned RCId, Cost; + GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost); + if (RegPressure[RCId] < Cost) { // Register pressure tracking is imprecise. This can happen. But we try // hard not to let it happen because it likely results in poor scheduling. DEBUG(dbgs() << " SU(" << SU->NodeNum << ") has too many regdefs\n"); RegPressure[RCId] = 0; } else { - RegPressure[RCId] -= TLI->getRepRegClassCostFor(VT); + RegPressure[RCId] -= Cost; } } dumpRegPressure(); @@ -2008,7 +2099,29 @@ static unsigned calcMaxScratches(const SUnit *SU) { return Scratches; } -/// hasOnlyLiveOutUse - Return true if SU has a single value successor that is a +/// hasOnlyLiveInOpers - Return true if SU has only value predecessors that are +/// CopyFromReg from a virtual register. +static bool hasOnlyLiveInOpers(const SUnit *SU) { + bool RetVal = false; + for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); + I != E; ++I) { + if (I->isCtrl()) continue; + const SUnit *PredSU = I->getSUnit(); + if (PredSU->getNode() && + PredSU->getNode()->getOpcode() == ISD::CopyFromReg) { + unsigned Reg = + cast<RegisterSDNode>(PredSU->getNode()->getOperand(1))->getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + RetVal = true; + continue; + } + } + return false; + } + return RetVal; +} + +/// hasOnlyLiveOutUses - Return true if SU has only value successors that are /// CopyToReg to a virtual register. This SU def is probably a liveout and /// it has no other use. It should be scheduled closer to the terminator. 
 static bool hasOnlyLiveOutUses(const SUnit *SU) {
@@ -2030,62 +2143,71 @@ static bool hasOnlyLiveOutUses(const SUnit *SU) {
   return RetVal;
 }
 
-/// UnitsSharePred - Return true if the two scheduling units share a common
-/// data predecessor.
-static bool UnitsSharePred(const SUnit *left, const SUnit *right) {
-  SmallSet<const SUnit*, 4> Preds;
-  for (SUnit::const_pred_iterator I = left->Preds.begin(),E = left->Preds.end();
+// Set isVRegCycle for a node with only live in opers and live out uses. Also
+// set isVRegCycle for its CopyFromReg operands.
+//
+// This is only relevant for single-block loops, in which case the VRegCycle
+// node is likely an induction variable in which the operand and target virtual
+// registers should be coalesced (e.g. pre/post increment values). Setting the
+// isVRegCycle flag helps the scheduler prioritize other uses of the same
+// CopyFromReg so that this node becomes the virtual register "kill". This
+// avoids interference between the values live in and out of the block and
+// eliminates a copy inside the loop.
+static void initVRegCycle(SUnit *SU) {
+  if (DisableSchedVRegCycle)
+    return;
+
+  if (!hasOnlyLiveInOpers(SU) || !hasOnlyLiveOutUses(SU))
+    return;
+
+  DEBUG(dbgs() << "VRegCycle: SU(" << SU->NodeNum << ")\n");
+
+  SU->isVRegCycle = true;
+
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
        I != E; ++I) {
-    if (I->isCtrl()) continue;  // ignore chain preds
-    Preds.insert(I->getSUnit());
+    if (I->isCtrl()) continue;
+    I->getSUnit()->isVRegCycle = true;
   }
-  for (SUnit::const_pred_iterator I = right->Preds.begin(),E = right->Preds.end();
+}
+
+// After scheduling the definition of a VRegCycle, clear the isVRegCycle flag
+// of CopyFromReg operands. We should no longer penalize other uses of this
+// VReg.
+static void resetVRegCycle(SUnit *SU) {
+  if (!SU->isVRegCycle)
+    return;
+
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
        I != E; ++I) {
     if (I->isCtrl()) continue;  // ignore chain preds
-    if (Preds.count(I->getSUnit()))
-      return true;
+    SUnit *PredSU = I->getSUnit();
+    if (PredSU->isVRegCycle) {
+      assert(PredSU->getNode()->getOpcode() == ISD::CopyFromReg &&
+             "VRegCycle def must be CopyFromReg");
+      PredSU->isVRegCycle = false;
+    }
   }
-  return false;
 }
 
-// Return true if the virtual register defined by VRCycleSU may interfere with
-// VRUseSU.
-//
-// Note: We may consider two SU's that use the same value live into a loop as
-// interferng even though the value is not an induction variable. This is an
-// unfortunate consequence of scheduling on the selection DAG.
-static bool checkVRegCycleInterference(const SUnit *VRCycleSU,
-                                       const SUnit *VRUseSU) {
-  for (SUnit::const_pred_iterator I = VRCycleSU->Preds.begin(),
-       E = VRCycleSU->Preds.end(); I != E; ++I) {
+// Return true if this SUnit uses a CopyFromReg node marked as a VRegCycle.
+// This means a node that defines the VRegCycle has not been scheduled yet.
+static bool hasVRegCycleUse(const SUnit *SU) {
+  // If this SU also defines the VReg, don't hoist it as a "use".
+  if (SU->isVRegCycle)
+    return false;
+
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
     if (I->isCtrl()) continue;  // ignore chain preds
-    SDNode *InNode = I->getSUnit()->getNode();
-    if (!InNode || InNode->getOpcode() != ISD::CopyFromReg)
-      continue;
-    for (SUnit::const_pred_iterator II = VRUseSU->Preds.begin(),
-         EE = VRUseSU->Preds.end(); II != EE; ++II) {
-      if (II->getSUnit() == I->getSUnit())
-        return true;
+    if (I->getSUnit()->isVRegCycle &&
+        I->getSUnit()->getNode()->getOpcode() == ISD::CopyFromReg) {
+      DEBUG(dbgs() << "  VReg cycle use: SU (" << SU->NodeNum << ")\n");
+      return true;
     }
   }
   return false;
 }
 
-// Compare the VRegCycle properties of the nodes.
-// Return -1 if left has higher priority, 1 if right has higher priority.
-// Return 0 if priority is equivalent.
-static int BUCompareVRegCycle(const SUnit *left, const SUnit *right) {
-  if (left->isVRegCycle && !right->isVRegCycle) {
-    if (checkVRegCycleInterference(left, right))
-      return -1;
-  }
-  else if (!left->isVRegCycle && right->isVRegCycle) {
-    if (checkVRegCycleInterference(right, left))
-      return 1;
-  }
-  return 0;
-}
-
 // Check for either a dependence (latency) or resource (hazard) stall.
 //
 // Note: The ScheduleHazardRecognizer interface requires a non-const SU.
@@ -2101,23 +2223,12 @@ static bool BUHasStall(SUnit *SU, int Height, RegReductionPQBase *SPQ) {
 // Return 0 if latency-based priority is equivalent.
 static int BUCompareLatency(SUnit *left, SUnit *right, bool checkPref,
                             RegReductionPQBase *SPQ) {
-  // If the two nodes share an operand and one of them has a single
-  // use that is a live out copy, favor the one that is live out. Otherwise
-  // it will be difficult to eliminate the copy if the instruction is a
-  // loop induction variable update. e.g.
-  // BB:
-  // sub r1, r3, #1
-  // str r0, [r2, r3]
-  // mov r3, r1
-  // cmp
-  // bne BB
-  bool SharePred = UnitsSharePred(left, right);
-  // FIXME: Only adjust if BB is a loop back edge.
-  // FIXME: What's the cost of a copy?
-  int LBonus = (SharePred && hasOnlyLiveOutUses(left)) ? 1 : 0;
-  int RBonus = (SharePred && hasOnlyLiveOutUses(right)) ? 1 : 0;
-  int LHeight = (int)left->getHeight() - LBonus;
-  int RHeight = (int)right->getHeight() - RBonus;
+  // Scheduling an instruction that uses a VReg whose postincrement has not yet
+  // been scheduled will induce a copy. Model this as an extra cycle of latency.
+  int LPenalty = hasVRegCycleUse(left) ? 1 : 0;
+  int RPenalty = hasVRegCycleUse(right) ? 1 : 0;
+  int LHeight = (int)left->getHeight() + LPenalty;
+  int RHeight = (int)right->getHeight() + RPenalty;
 
   bool LStall = (!checkPref || left->SchedulingPref == Sched::Latency) &&
     BUHasStall(left, LHeight, SPQ);
@@ -2128,48 +2239,102 @@ static int BUCompareLatency(SUnit *left, SUnit *right, bool checkPref,
   // If scheduling either one of the node will cause a pipeline stall, sort
   // them according to their height.
   if (LStall) {
-    if (!RStall)
+    if (!RStall) {
+      DEBUG(++FactorCount[FactStall]);
       return 1;
-    if (LHeight != RHeight)
+    }
+    if (LHeight != RHeight) {
+      DEBUG(++FactorCount[FactStall]);
       return LHeight > RHeight ? 1 : -1;
-  } else if (RStall)
+    }
+  } else if (RStall) {
+    DEBUG(++FactorCount[FactStall]);
       return -1;
+  }
 
   // If either node is scheduling for latency, sort them by height/depth
   // and latency.
   if (!checkPref || (left->SchedulingPref == Sched::Latency ||
                      right->SchedulingPref == Sched::Latency)) {
     if (DisableSchedCycles) {
-      if (LHeight != RHeight)
+      if (LHeight != RHeight) {
+        DEBUG(++FactorCount[FactHeight]);
         return LHeight > RHeight ? 1 : -1;
+      }
     } else {
       // If neither instruction stalls (!LStall && !RStall) then
       // its height is already covered so only its depth matters. We also reach
       // this if both stall but have the same height.
-      unsigned LDepth = left->getDepth();
-      unsigned RDepth = right->getDepth();
+      int LDepth = left->getDepth() - LPenalty;
+      int RDepth = right->getDepth() - RPenalty;
       if (LDepth != RDepth) {
+        DEBUG(++FactorCount[FactDepth]);
         DEBUG(dbgs() << "  Comparing latency of SU (" << left->NodeNum
               << ") depth " << LDepth << " vs SU (" << right->NodeNum
               << ") depth " << RDepth << "\n");
         return LDepth < RDepth ? 1 : -1;
       }
     }
-    if (left->Latency != right->Latency)
+    if (left->Latency != right->Latency) {
+      DEBUG(++FactorCount[FactOther]);
       return left->Latency > right->Latency ? 1 : -1;
+    }
   }
   return 0;
 }
 
 static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
+  // Schedule physical register definitions close to their use. This is
+  // motivated by microarchitectures that can fuse cmp+jump macro-ops. But as
+  // long as shortening physreg live ranges is generally good, we can defer
+  // creating a subtarget hook.
+  if (!DisableSchedPhysRegJoin) {
+    bool LHasPhysReg = left->hasPhysRegDefs;
+    bool RHasPhysReg = right->hasPhysRegDefs;
+    if (LHasPhysReg != RHasPhysReg) {
+      DEBUG(++FactorCount[FactRegUses]);
+      #ifndef NDEBUG
+      const char *PhysRegMsg[] = {" has no physreg", " defines a physreg"};
+      #endif
+      DEBUG(dbgs() << "  SU (" << left->NodeNum << ") "
+            << PhysRegMsg[LHasPhysReg] << " SU(" << right->NodeNum << ") "
+            << PhysRegMsg[RHasPhysReg] << "\n");
+      return LHasPhysReg < RHasPhysReg;
+    }
+  }
+
+  // Prioritize by Sethi-Ullman number and push CopyToReg nodes down.
   unsigned LPriority = SPQ->getNodePriority(left);
   unsigned RPriority = SPQ->getNodePriority(right);
+
+  // Be really careful about hoisting call operands above previous calls.
+  // Only allow it if it would reduce register pressure.
+  if (left->isCall && right->isCallOp) {
+    unsigned RNumVals = right->getNode()->getNumValues();
+    RPriority = (RPriority > RNumVals) ? (RPriority - RNumVals) : 0;
+  }
+  if (right->isCall && left->isCallOp) {
+    unsigned LNumVals = left->getNode()->getNumValues();
+    LPriority = (LPriority > LNumVals) ? (LPriority - LNumVals) : 0;
+  }
+
   if (LPriority != RPriority) {
     DEBUG(++FactorCount[FactStatic]);
     return LPriority > RPriority;
   }
-  DEBUG(++FactorCount[FactOther]);
+
+  // If one or both of the nodes are calls and their Sethi-Ullman numbers are
+  // the same, then keep source order.
+  if (left->isCall || right->isCall) {
+    unsigned LOrder = SPQ->getNodeOrdering(left);
+    unsigned ROrder = SPQ->getNodeOrdering(right);
+
+    // Prefer the node with the lower non-zero source order number; an order
+    // of zero means no ordering information is available.
+    if ((LOrder || ROrder) && LOrder != ROrder)
+      return LOrder != 0 && (LOrder < ROrder || ROrder == 0);
+  }
 
   // Try schedule def + use closer when Sethi-Ullman numbers are the same.
   // e.g.
@@ -2190,40 +2355,62 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
   // This creates more short live intervals.
unsigned LDist = closestSucc(left); unsigned RDist = closestSucc(right); - if (LDist != RDist) + if (LDist != RDist) { + DEBUG(++FactorCount[FactOther]); return LDist < RDist; + } // How many registers becomes live when the node is scheduled. unsigned LScratch = calcMaxScratches(left); unsigned RScratch = calcMaxScratches(right); - if (LScratch != RScratch) + if (LScratch != RScratch) { + DEBUG(++FactorCount[FactOther]); return LScratch > RScratch; + } - if (!DisableSchedCycles) { + // Comparing latency against a call makes little sense unless the node + // is register pressure-neutral. + if ((left->isCall && RPriority > 0) || (right->isCall && LPriority > 0)) + return (left->NodeQueueId > right->NodeQueueId); + + // Do not compare latencies when one or both of the nodes are calls. + if (!DisableSchedCycles && + !(left->isCall || right->isCall)) { int result = BUCompareLatency(left, right, false /*checkPref*/, SPQ); if (result != 0) return result > 0; } else { - if (left->getHeight() != right->getHeight()) + if (left->getHeight() != right->getHeight()) { + DEBUG(++FactorCount[FactHeight]); return left->getHeight() > right->getHeight(); + } - if (left->getDepth() != right->getDepth()) + if (left->getDepth() != right->getDepth()) { + DEBUG(++FactorCount[FactDepth]); return left->getDepth() < right->getDepth(); + } } assert(left->NodeQueueId && right->NodeQueueId && "NodeQueueId cannot be zero"); + DEBUG(++FactorCount[FactOther]); return (left->NodeQueueId > right->NodeQueueId); } // Bottom up bool bu_ls_rr_sort::operator()(SUnit *left, SUnit *right) const { + if (int res = checkSpecialNodes(left, right)) + return res > 0; + return BURRSort(left, right, SPQ); } // Source order, otherwise bottom up. bool src_ls_rr_sort::operator()(SUnit *left, SUnit *right) const { + if (int res = checkSpecialNodes(left, right)) + return res > 0; + unsigned LOrder = SPQ->getNodeOrdering(left); unsigned ROrder = SPQ->getNodeOrdering(right); @@ -2255,6 +2442,9 @@ bool hybrid_ls_rr_sort::isReady(SUnit *SU, unsigned CurCycle) const { // Return true if right should be scheduled with higher priority than left. bool hybrid_ls_rr_sort::operator()(SUnit *left, SUnit *right) const { + if (int res = checkSpecialNodes(left, right)) + return res > 0; + if (left->isCall || right->isCall) // No way to compute latency of calls. return BURRSort(left, right, SPQ); @@ -2264,24 +2454,22 @@ bool hybrid_ls_rr_sort::operator()(SUnit *left, SUnit *right) const { // Avoid causing spills. If register pressure is high, schedule for // register pressure reduction. if (LHigh && !RHigh) { + DEBUG(++FactorCount[FactPressureDiff]); DEBUG(dbgs() << " pressure SU(" << left->NodeNum << ") > SU(" << right->NodeNum << ")\n"); return true; } else if (!LHigh && RHigh) { + DEBUG(++FactorCount[FactPressureDiff]); DEBUG(dbgs() << " pressure SU(" << right->NodeNum << ") > SU(" << left->NodeNum << ")\n"); return false; } - int result = 0; - if (!DisableSchedVRegCycle) { - result = BUCompareVRegCycle(left, right); - } - if (result == 0 && !LHigh && !RHigh) { - result = BUCompareLatency(left, right, true /*checkPref*/, SPQ); + if (!LHigh && !RHigh) { + int result = BUCompareLatency(left, right, true /*checkPref*/, SPQ); + if (result != 0) + return result > 0; } - if (result != 0) - return result > 0; return BURRSort(left, right, SPQ); } @@ -2322,6 +2510,9 @@ static bool canEnableCoalescing(SUnit *SU) { // list-ilp is currently an experimental scheduler that allows various // heuristics to be enabled prior to the normal register reduction logic. 
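+// Like the other comparators, this one first orders the special nodes flagged
+// by checkSpecialNodes() (currently TokenFactor, marked isScheduleLow) before
+// applying any of the ILP heuristics.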
bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const { + if (int res = checkSpecialNodes(left, right)) + return res > 0; + if (left->isCall || right->isCall) // No way to compute latency of calls. return BURRSort(left, right, SPQ); @@ -2347,12 +2538,6 @@ bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const { if (RReduce && !LReduce) return true; } - if (!DisableSchedVRegCycle) { - int result = BUCompareVRegCycle(left, right); - if (result != 0) - return result > 0; - } - if (!DisableSchedLiveUses && (LLiveUses != RLiveUses)) { DEBUG(dbgs() << "Live uses SU(" << left->NodeNum << "): " << LLiveUses << " != SU(" << right->NodeNum << "): " << RLiveUses << "\n"); @@ -2391,6 +2576,24 @@ bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const { return BURRSort(left, right, SPQ); } +void RegReductionPQBase::initNodes(std::vector<SUnit> &sunits) { + SUnits = &sunits; + // Add pseudo dependency edges for two-address nodes. + AddPseudoTwoAddrDeps(); + // Reroute edges to nodes with multiple uses. + if (!TracksRegPressure) + PrescheduleNodesWithMultipleUses(); + // Calculate node priorities. + CalculateSethiUllmanNumbers(); + + // For single block loops, mark nodes that look like canonical IV increments. + if (scheduleDAG->BB->isSuccessor(scheduleDAG->BB)) { + for (unsigned i = 0, e = sunits.size(); i != e; ++i) { + initVRegCycle(&sunits[i]); + } + } +} + //===----------------------------------------------------------------------===// // Preschedule for Register Pressure //===----------------------------------------------------------------------===// @@ -2668,6 +2871,9 @@ static unsigned LimitedSumOfUnscheduledPredsOfSuccs(const SUnit *SU, // Top down bool td_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const { + if (int res = checkSpecialNodes(left, right)) + return res < 0; + unsigned LPriority = SPQ->getNodePriority(left); unsigned RPriority = SPQ->getNodePriority(right); bool LIsTarget = left->getNode() && left->getNode()->isMachineOpcode(); diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 24a1937..0d656ef 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -83,10 +83,13 @@ SUnit *ScheduleDAGSDNodes::Clone(SUnit *Old) { SU->Latency = Old->Latency; SU->isVRegCycle = Old->isVRegCycle; SU->isCall = Old->isCall; + SU->isCallOp = Old->isCallOp; SU->isTwoAddress = Old->isTwoAddress; SU->isCommutable = Old->isCommutable; SU->hasPhysRegDefs = Old->hasPhysRegDefs; SU->hasPhysRegClobbers = Old->hasPhysRegClobbers; + SU->isScheduleHigh = Old->isScheduleHigh; + SU->isScheduleLow = Old->isScheduleLow; SU->SchedulingPref = Old->SchedulingPref; Old->isCloned = true; return SU; @@ -283,6 +286,7 @@ void ScheduleDAGSDNodes::BuildSchedUnits() { Worklist.push_back(DAG->getRoot().getNode()); Visited.insert(DAG->getRoot().getNode()); + SmallVector<SUnit*, 8> CallSUnits; while (!Worklist.empty()) { SDNode *NI = Worklist.pop_back_val(); @@ -335,6 +339,15 @@ void ScheduleDAGSDNodes::BuildSchedUnits() { if (!HasGlueUse) break; } + if (NodeSUnit->isCall) + CallSUnits.push_back(NodeSUnit); + + // Schedule zero-latency TokenFactor below any nodes that may increase the + // schedule height. Otherwise, ancestors of the TokenFactor may appear to + // have false stalls. 
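+    // (isScheduleLow is consumed by checkSpecialNodes() in the list
+    // schedulers' priority comparators.)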
+    if (NI->getOpcode() == ISD::TokenFactor)
+      NodeSUnit->isScheduleLow = true;
+
     // If there are glue operands involved, N is now the bottom-most node
     // of the sequence of nodes that are glued together.
     // Update the SUnit.
@@ -342,16 +355,26 @@ void ScheduleDAGSDNodes::BuildSchedUnits() {
     assert(N->getNodeId() == -1 && "Node already inserted!");
     N->setNodeId(NodeSUnit->NodeNum);
 
-    // Set isVRegCycle if the node operands are live into and value is live out
-    // of a single block loop.
-    InitVRegCycleFlag(NodeSUnit);
-
     // Compute NumRegDefsLeft. This must be done before AddSchedEdges.
     InitNumRegDefsLeft(NodeSUnit);
 
     // Assign the Latency field of NodeSUnit using target-provided information.
     ComputeLatency(NodeSUnit);
   }
+
+  // Find all call operands.
+  while (!CallSUnits.empty()) {
+    SUnit *SU = CallSUnits.pop_back_val();
+    for (const SDNode *SUNode = SU->getNode(); SUNode;
+         SUNode = SUNode->getGluedNode()) {
+      if (SUNode->getOpcode() != ISD::CopyToReg)
+        continue;
+      SDNode *SrcN = SUNode->getOperand(2).getNode();
+      if (isPassiveNode(SrcN)) continue;   // Not scheduled.
+      SUnit *SrcSU = &SUnits[SrcN->getNodeId()];
+      SrcSU->isCallOp = true;
+    }
+  }
 }
 
 void ScheduleDAGSDNodes::AddSchedEdges() {
@@ -412,11 +435,15 @@ void ScheduleDAGSDNodes::AddSchedEdges() {
       // it requires a cross class copy (cost < 0). That means we are only
       // treating "expensive to copy" register dependency as physical register
       // dependency. This may change in the future though.
-      if (Cost >= 0)
+      if (Cost >= 0 && !StressSched)
        PhysReg = 0;
 
       // If this is a ctrl dep, latency is 1.
       unsigned OpLatency = isChain ? 1 : OpSU->Latency;
+      // Special-case TokenFactor chains as zero-latency.
+      if (isChain && OpN->getOpcode() == ISD::TokenFactor)
+        OpLatency = 0;
+
       const SDep &dep = SDep(OpSU, isChain ? SDep::Order : SDep::Data,
                              OpLatency, PhysReg);
       if (!isChain && !UnitLatencies) {
@@ -512,47 +539,6 @@ void ScheduleDAGSDNodes::RegDefIter::Advance() {
   }
 }
 
-// Set isVRegCycle if this node's single use is CopyToReg and its only active
-// data operands are CopyFromReg.
-//
-// This is only relevant for single-block loops, in which case the VRegCycle
-// node is likely an induction variable in which the operand and target virtual
-// registers should be coalesced (e.g. pre/post increment values). Setting the
-// isVRegCycle flag helps the scheduler prioritize other uses of the same
-// CopyFromReg so that this node becomes the virtual register "kill". This
-// avoids interference between the values live in and out of the block and
-// eliminates a copy inside the loop.
-void ScheduleDAGSDNodes::InitVRegCycleFlag(SUnit *SU) { - if (!BB->isSuccessor(BB)) - return; - - SDNode *N = SU->getNode(); - if (N->getGluedNode()) - return; - - if (!N->hasOneUse() || N->use_begin()->getOpcode() != ISD::CopyToReg) - return; - - bool FoundLiveIn = false; - for (SDNode::op_iterator OI = N->op_begin(), E = N->op_end(); OI != E; ++OI) { - EVT OpVT = OI->getValueType(); - assert(OpVT != MVT::Glue && "Glued nodes should be in same sunit!"); - - if (OpVT == MVT::Other) - continue; // ignore chain operands - - if (isPassiveNode(OI->getNode())) - continue; // ignore constants and such - - if (OI->getNode()->getOpcode() != ISD::CopyFromReg) - return; - - FoundLiveIn = true; - } - if (FoundLiveIn) - SU->isVRegCycle = true; -} - void ScheduleDAGSDNodes::InitNumRegDefsLeft(SUnit *SU) { assert(SU->NumRegDefsLeft == 0 && "expect a new node"); for (RegDefIter I(SU, this); I.IsValid(); I.Advance()) { @@ -562,6 +548,16 @@ void ScheduleDAGSDNodes::InitNumRegDefsLeft(SUnit *SU) { } void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) { + SDNode *N = SU->getNode(); + + // TokenFactor operands are considered zero latency, and some schedulers + // (e.g. Top-Down list) may rely on the fact that operand latency is nonzero + // whenever node latency is nonzero. + if (N && N->getOpcode() == ISD::TokenFactor) { + SU->Latency = 0; + return; + } + // Check to see if the scheduler cares about latencies. if (ForceUnitLatencies()) { SU->Latency = 1; @@ -569,7 +565,6 @@ void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) { } if (!InstrItins || InstrItins->isEmpty()) { - SDNode *N = SU->getNode(); if (N && N->isMachineOpcode() && TII->isHighLatencyDef(N->getMachineOpcode())) SU->Latency = HighLatencyCycles; @@ -641,7 +636,7 @@ namespace { }; } -/// ProcessSDDbgValues - Process SDDbgValues assoicated with this node. +/// ProcessSDDbgValues - Process SDDbgValues associated with this node. static void ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter, SmallVector<std::pair<unsigned, MachineInstr*>, 32> &Orders, diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h index b5f68f3..3ad2bd6 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h @@ -135,6 +135,14 @@ namespace llvm { return ValueType; } + const SDNode *GetNode() const { + return Node; + } + + unsigned GetIdx() const { + return DefIdx; + } + void Advance(); private: void InitNodeNumDefs(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index c2711c8..68eeb60 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2050,14 +2050,15 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, break; default: - // Allow the target to implement this method for its nodes. - if (Op.getOpcode() >= ISD::BUILTIN_OP_END) { + if (Op.getOpcode() < ISD::BUILTIN_OP_END) + break; + // Fallthrough case ISD::INTRINSIC_WO_CHAIN: case ISD::INTRINSIC_W_CHAIN: case ISD::INTRINSIC_VOID: - TLI.computeMaskedBitsForTargetNode(Op, Mask, KnownZero, KnownOne, *this, - Depth); - } + // Allow the target to implement this method for its nodes. + TLI.computeMaskedBitsForTargetNode(Op, Mask, KnownZero, KnownOne, *this, + Depth); return; } } @@ -2322,6 +2323,13 @@ bool SelectionDAG::isKnownNeverZero(SDValue Op) const { return !C->isZero(); // TODO: Recognize more cases here. 
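+  // An OR with a constant, non-zero operand can never produce zero.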
+ switch (Op.getOpcode()) { + default: break; + case ISD::OR: + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) + return !C->isNullValue(); + break; + } return false; } @@ -2339,16 +2347,6 @@ bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const { return false; } -bool SelectionDAG::isVerifiedDebugInfoDesc(SDValue Op) const { - GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op); - if (!GA) return false; - if (GA->getOffset() != 0) return false; - const GlobalVariable *GV = dyn_cast<GlobalVariable>(GA->getGlobal()); - if (!GV) return false; - return MF->getMMI().hasDebugInfo(); -} - - /// getNode - Gets or creates the specified node. /// SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT) { @@ -6304,7 +6302,7 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) { Operands[j] = getNode(ISD::EXTRACT_VECTOR_ELT, dl, OperandEltVT, Operand, - getConstant(i, MVT::i32)); + getConstant(i, TLI.getPointerTy())); } else { // A scalar operand; just use it as is. Operands[j] = Operand; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 24c325e..d79a5ae 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -280,12 +280,35 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, DebugLoc DL, } // Vector/Vector bitcast. - return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); + if (ValueVT.getSizeInBits() == PartVT.getSizeInBits()) + return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); + + assert(PartVT.getVectorNumElements() == ValueVT.getVectorNumElements() && + "Cannot handle this kind of promotion"); + // Promoted vector extract + bool Smaller = ValueVT.bitsLE(PartVT); + return DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND), + DL, ValueVT, Val); + } - assert(ValueVT.getVectorElementType() == PartVT && - ValueVT.getVectorNumElements() == 1 && + // Trivial bitcast if the types are the same size and the destination + // vector type is legal. + if (PartVT.getSizeInBits() == ValueVT.getSizeInBits() && + TLI.isTypeLegal(ValueVT)) + return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); + + // Handle cases such as i8 -> <1 x i1> + assert(ValueVT.getVectorNumElements() == 1 && "Only trivial scalar-to-vector conversions should get here!"); + + if (ValueVT.getVectorNumElements() == 1 && + ValueVT.getVectorElementType() != PartVT) { + bool Smaller = ValueVT.bitsLE(PartVT); + Val = DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND), + DL, ValueVT.getScalarType(), Val); + } + return DAG.getNode(ISD::BUILD_VECTOR, DL, ValueVT, Val); } @@ -426,7 +449,7 @@ static void getCopyToPartsVector(SelectionDAG &DAG, DebugLoc DL, // Bitconvert vector->vector case. Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); } else if (PartVT.isVector() && - PartVT.getVectorElementType() == ValueVT.getVectorElementType()&& + PartVT.getVectorElementType() == ValueVT.getVectorElementType() && PartVT.getVectorNumElements() > ValueVT.getVectorNumElements()) { EVT ElementVT = PartVT.getVectorElementType(); // Vector widening case, e.g. <2 x float> -> <4 x float>. 
Shuffle in
@@ -446,13 +469,33 @@ static void getCopyToPartsVector(SelectionDAG &DAG, DebugLoc DL,
     //SDValue UndefElts = DAG.getUNDEF(VectorTy);
     //Val = DAG.getNode(ISD::CONCAT_VECTORS, DL, PartVT, Val, UndefElts);
-  } else {
+  } else if (PartVT.isVector() &&
+             PartVT.getVectorElementType().bitsGE(
+               ValueVT.getVectorElementType()) &&
+             PartVT.getVectorNumElements() == ValueVT.getVectorNumElements()) {
+
+    // Promoted vector extract
+    unsigned NumElts = ValueVT.getVectorNumElements();
+    SmallVector<SDValue, 8> NewOps;
+    for (unsigned i = 0; i < NumElts; ++i) {
+      SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+                                ValueVT.getScalarType(), Val,
+                                DAG.getIntPtrConstant(i));
+      SDValue Cast = DAG.getNode(ISD::ANY_EXTEND,
+                                 DL, PartVT.getScalarType(), Ext);
+      NewOps.push_back(Cast);
+    }
+    Val = DAG.getNode(ISD::BUILD_VECTOR, DL, PartVT,
+                      &NewOps[0], NewOps.size());
+  } else {
     // Vector -> scalar conversion.
-    assert(ValueVT.getVectorElementType() == PartVT &&
-           ValueVT.getVectorNumElements() == 1 &&
+    assert(ValueVT.getVectorNumElements() == 1 &&
           "Only trivial vector-to-scalar conversions should get here!");
     Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
                       PartVT, Val, DAG.getIntPtrConstant(0));
+
+    bool Smaller = ValueVT.bitsLE(PartVT);
+    Val = DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND),
+                      DL, PartVT, Val);
   }
 
   Parts[0] = Val;
@@ -783,11 +826,20 @@ void SelectionDAGBuilder::clear() {
   UnusedArgNodeMap.clear();
   PendingLoads.clear();
   PendingExports.clear();
-  DanglingDebugInfoMap.clear();
   CurDebugLoc = DebugLoc();
   HasTailCall = false;
 }
 
+/// clearDanglingDebugInfo - Clear the dangling debug information map. This
+/// function is separated from clear() so that debug information that is
+/// dangling in a basic block can be properly resolved in a different basic
+/// block. This allows the SelectionDAG to resolve dangling debug information
+/// attached to PHI nodes.
+void SelectionDAGBuilder::clearDanglingDebugInfo() {
+  DanglingDebugInfoMap.clear();
+}
+
 /// getRoot - Return the current virtual root of the Selection DAG,
 /// flushing any PendingLoad items. This must be done before emitting
 /// a store or any other node that may need to be ordered after any
@@ -1175,6 +1227,10 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
 /// created for it, emit nodes to copy the value into the virtual
 /// registers.
 void SelectionDAGBuilder::CopyToExportRegsIfNeeded(const Value *V) {
+  // Skip empty types
+  if (V->getType()->isEmptyTy())
+    return;
+
   DenseMap<const Value *, unsigned>::iterator VMI = FuncInfo.ValueMap.find(V);
   if (VMI != FuncInfo.ValueMap.end()) {
     assert(!V->use_empty() && "Unused value assigned virtual registers!");
@@ -1223,6 +1279,24 @@ bool SelectionDAGBuilder::isExportableFromCurrentBlock(const Value *V,
   return true;
 }
 
+/// Return the branch probability calculated by BranchProbabilityInfo for the
+/// corresponding IR blocks.
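+/// A weight of zero is returned when no BranchProbabilityInfo is available.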
+uint32_t SelectionDAGBuilder::getEdgeWeight(MachineBasicBlock *Src, + MachineBasicBlock *Dst) { + BranchProbabilityInfo *BPI = FuncInfo.BPI; + if (!BPI) + return 0; + BasicBlock *SrcBB = const_cast<BasicBlock*>(Src->getBasicBlock()); + BasicBlock *DstBB = const_cast<BasicBlock*>(Dst->getBasicBlock()); + return BPI->getEdgeWeight(SrcBB, DstBB); +} + +void SelectionDAGBuilder::addSuccessorWithWeight(MachineBasicBlock *Src, + MachineBasicBlock *Dst) { + uint32_t weight = getEdgeWeight(Src, Dst); + Src->addSuccessor(Dst, weight); +} + + static bool InBlock(const Value *V, const BasicBlock *BB) { if (const Instruction *I = dyn_cast<Instruction>(V)) return I->getParent() == BB; @@ -1492,8 +1566,8 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB, } // Update successor info - SwitchBB->addSuccessor(CB.TrueBB); - SwitchBB->addSuccessor(CB.FalseBB); + addSuccessorWithWeight(SwitchBB, CB.TrueBB); + addSuccessorWithWeight(SwitchBB, CB.FalseBB); // Set NextBlock to be the MBB immediately after the current one, if any. // This is used to avoid emitting unnecessary branches to the next block. @@ -1637,8 +1711,8 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B, MachineBasicBlock* MBB = B.Cases[0].ThisBB; - SwitchBB->addSuccessor(B.Default); - SwitchBB->addSuccessor(MBB); + addSuccessorWithWeight(SwitchBB, B.Default); + addSuccessorWithWeight(SwitchBB, MBB); SDValue BrRange = DAG.getNode(ISD::BRCOND, getCurDebugLoc(), MVT::Other, CopyTo, RangeCmp, @@ -1683,8 +1757,8 @@ void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB, ISD::SETNE); } - SwitchBB->addSuccessor(B.TargetBB); - SwitchBB->addSuccessor(NextMBB); + addSuccessorWithWeight(SwitchBB, B.TargetBB); + addSuccessorWithWeight(SwitchBB, NextMBB); SDValue BrAnd = DAG.getNode(ISD::BRCOND, getCurDebugLoc(), MVT::Other, getControlRoot(), @@ -1924,8 +1998,9 @@ bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec& CR, // table. MachineBasicBlock *JumpTableBB = CurMF->CreateMachineBasicBlock(LLVMBB); CurMF->insert(BBI, JumpTableBB); - CR.CaseBB->addSuccessor(Default); - CR.CaseBB->addSuccessor(JumpTableBB); + + addSuccessorWithWeight(CR.CaseBB, Default); + addSuccessorWithWeight(CR.CaseBB, JumpTableBB); // Build a vector of destination BBs, corresponding to each target // of the jump table. If the value of the jump table slot corresponds to @@ -1952,7 +2027,7 @@ bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec& CR, E = DestBBs.end(); I != E; ++I) { if (!SuccsHandled[(*I)->getNumber()]) { SuccsHandled[(*I)->getNumber()] = true; - JumpTableBB->addSuccessor(*I); + addSuccessorWithWeight(JumpTableBB, *I); } } @@ -2019,9 +2094,13 @@ bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR, APInt Range = ComputeRange(LEnd, RBegin); assert((Range - 2ULL).isNonNegative() && "Invalid case distance"); - double LDensity = (double)LSize.roundToDouble() / + // Use volatile double here to avoid excess precision issues on some hosts, + // e.g. that use 80-bit X87 registers. 
+    volatile double LDensity =
+      (double)LSize.roundToDouble() /
                          (LEnd - First + 1ULL).roundToDouble();
-    double RDensity = (double)RSize.roundToDouble() /
+    volatile double RDensity =
+      (double)RSize.roundToDouble() /
                          (Last - RBegin + 1ULL).roundToDouble();
     double Metric = Range.logBase2()*(LDensity+RDensity);
     // Should always split in some non-trivial place
@@ -2367,8 +2446,10 @@ void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) {
     succs.push_back(I.getSuccessor(i));
   array_pod_sort(succs.begin(), succs.end());
   succs.erase(std::unique(succs.begin(), succs.end()), succs.end());
-  for (unsigned i = 0, e = succs.size(); i != e; ++i)
-    IndirectBrMBB->addSuccessor(FuncInfo.MBBMap[succs[i]]);
+  for (unsigned i = 0, e = succs.size(); i != e; ++i) {
+    MachineBasicBlock *Succ = FuncInfo.MBBMap[succs[i]];
+    addSuccessorWithWeight(IndirectBrMBB, Succ);
+  }
   DAG.setRoot(DAG.getNode(ISD::BRIND, getCurDebugLoc(),
                           MVT::Other, getControlRoot(),
@@ -2806,16 +2887,18 @@ void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) {
   SmallVector<SDValue, 4> Values(NumAggValues);
   SDValue Agg = getValue(Op0);
-  SDValue Val = getValue(Op1);
   unsigned i = 0;
   // Copy the beginning value(s) from the original aggregate.
   for (; i != LinearIndex; ++i)
     Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
                 SDValue(Agg.getNode(), Agg.getResNo() + i);
   // Copy values from the inserted value(s).
-  for (; i != LinearIndex + NumValValues; ++i)
-    Values[i] = FromUndef ? DAG.getUNDEF(AggValueVTs[i]) :
-                SDValue(Val.getNode(), Val.getResNo() + i - LinearIndex);
+  if (NumValValues) {
+    SDValue Val = getValue(Op1);
+    for (; i != LinearIndex + NumValValues; ++i)
+      Values[i] = FromUndef ? DAG.getUNDEF(AggValueVTs[i]) :
+                  SDValue(Val.getNode(), Val.getResNo() + i - LinearIndex);
+  }
   // Copy remaining value(s) from the original aggregate.
   for (; i != NumAggValues; ++i)
     Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
@@ -2838,6 +2921,13 @@ void SelectionDAGBuilder::visitExtractValue(const ExtractValueInst &I) {
   ComputeValueVTs(TLI, ValTy, ValValueVTs);
   unsigned NumValValues = ValValueVTs.size();
+
+  // Ignore an extractvalue that produces an empty object
+  if (!NumValValues) {
+    setValue(&I, DAG.getUNDEF(MVT(MVT::Other)));
+    return;
+  }
+
   SmallVector<SDValue, 4> Values(NumValValues);
   SDValue Agg = getValue(Op0);
@@ -4009,6 +4099,24 @@ static SDValue ExpandPowI(DebugLoc DL, SDValue LHS, SDValue RHS,
   return DAG.getNode(ISD::FPOWI, DL, LHS.getValueType(), LHS, RHS);
 }
+// getTruncatedArgReg - Find underlying register used for a truncated
+// argument.
+static unsigned getTruncatedArgReg(const SDValue &N) {
+  if (N.getOpcode() != ISD::TRUNCATE)
+    return 0;
+
+  const SDValue &Ext = N.getOperand(0);
+  if (Ext.getOpcode() == ISD::AssertZext || Ext.getOpcode() == ISD::AssertSext) {
+    const SDValue &CFR = Ext.getOperand(0);
+    if (CFR.getOpcode() == ISD::CopyFromReg)
+      return cast<RegisterSDNode>(CFR.getOperand(1))->getReg();
+    else
+      if (CFR.getOpcode() == ISD::TRUNCATE)
+        return getTruncatedArgReg(CFR);
+  }
+  return 0;
+}
+
 /// EmitFuncArgumentDbgValue - If the DbgValueInst is a dbg_value of a function
 /// argument, create the corresponding DBG_VALUE machine instruction for it now.
 /// At the end of instruction selection, they will be inserted to the entry BB.
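The volatile qualifiers on LDensity and RDensity above guard against a host quirk: on machines whose x87 FPU evaluates double arithmetic in 80-bit registers, the same expression can round differently depending on whether an intermediate is spilled to memory, so two "equal" densities may compare unequal. A minimal standalone sketch of the hazard, assuming an x87 host built without SSE math (illustration only, not code from this commit):

#include <cstdio>

int main() {
  double x = 1.0, y = 3.0;
  double a = x / y;           // may be kept in an 80-bit x87 register
  volatile double b = x / y;  // forced through a 64-bit memory slot
  // On such a host `a` can carry excess precision while `b` was rounded,
  // so this can print 0; forcing both values through volatile doubles,
  // as the patch does, makes the comparison round both sides first.
  std::printf("equal: %d\n", (int)(a == b));
  return 0;
}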
@@ -4029,10 +4137,6 @@ SelectionDAGBuilder::EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable, if (DV.isInlinedFnArgument(MF.getFunction())) return false; - MachineBasicBlock *MBB = FuncInfo.MBB; - if (MBB != &MF.front()) - return false; - unsigned Reg = 0; if (Arg->hasByValAttr()) { // Byval arguments' frame index is recorded during argument lowering. @@ -4044,9 +4148,12 @@ SelectionDAGBuilder::EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable, Reg = 0; } - if (N.getNode() && N.getOpcode() == ISD::CopyFromReg) { - Reg = cast<RegisterSDNode>(N.getOperand(1))->getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (N.getNode()) { + if (N.getOpcode() == ISD::CopyFromReg) + Reg = cast<RegisterSDNode>(N.getOperand(1))->getReg(); + else + Reg = getTruncatedArgReg(N); + if (Reg && TargetRegisterInfo::isVirtualRegister(Reg)) { MachineRegisterInfo &RegInfo = MF.getRegInfo(); unsigned PR = RegInfo.getLiveInPhysReg(Reg); if (PR) @@ -4208,9 +4315,9 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDV = DAG.getDbgValue(Variable, FINode->getIndex(), 0, dl, SDNodeOrder); else { - // Can't do anything with other non-AI cases yet. This might be a - // parameter of a callee function that got inlined, for example. - DEBUG(dbgs() << "Dropping debug info for " << DI); + // Address is an argument, so try to emit its dbg value using + // virtual register info from the FuncInfo.ValueMap. + EmitFuncArgumentDbgValue(Address, Variable, 0, N); return 0; } } else if (AI) @@ -4403,7 +4510,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::eh_sjlj_dispatch_setup: { DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_DISPATCHSETUP, dl, MVT::Other, - getRoot())); + getRoot(), getValue(I.getArgOperand(0)))); return 0; } @@ -4672,9 +4779,22 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::flt_rounds: setValue(&I, DAG.getNode(ISD::FLT_ROUNDS_, dl, MVT::i32)); return 0; - case Intrinsic::trap: - DAG.setRoot(DAG.getNode(ISD::TRAP, dl,MVT::Other, getRoot())); + case Intrinsic::trap: { + StringRef TrapFuncName = getTrapFunctionName(); + if (TrapFuncName.empty()) { + DAG.setRoot(DAG.getNode(ISD::TRAP, dl,MVT::Other, getRoot())); + return 0; + } + TargetLowering::ArgListTy Args; + std::pair<SDValue, SDValue> Result = + TLI.LowerCallTo(getRoot(), I.getType(), + false, false, false, false, 0, CallingConv::C, + /*isTailCall=*/false, /*isReturnValueUsed=*/true, + DAG.getExternalSymbol(TrapFuncName.data(), TLI.getPointerTy()), + Args, DAG, getCurDebugLoc()); + DAG.setRoot(Result.second); return 0; + } case Intrinsic::uadd_with_overflow: return implVisitAluOverflow(I, ISD::UADDO); case Intrinsic::sadd_with_overflow: @@ -4689,15 +4809,16 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return implVisitAluOverflow(I, ISD::SMULO); case Intrinsic::prefetch: { - SDValue Ops[4]; + SDValue Ops[5]; unsigned rw = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); Ops[0] = getRoot(); Ops[1] = getValue(I.getArgOperand(0)); Ops[2] = getValue(I.getArgOperand(1)); Ops[3] = getValue(I.getArgOperand(2)); + Ops[4] = getValue(I.getArgOperand(3)); DAG.setRoot(DAG.getMemIntrinsicNode(ISD::PREFETCH, dl, DAG.getVTList(MVT::Other), - &Ops[0], 4, + &Ops[0], 5, EVT::getIntegerVT(*Context, 8), MachinePointerInfo(I.getArgOperand(0)), 0, /* align */ @@ -4784,7 +4905,9 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, Outs, TLI, &Offsets); bool 
CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(), - FTy->isVarArg(), Outs, FTy->getContext()); + DAG.getMachineFunction(), + FTy->isVarArg(), Outs, + FTy->getContext()); SDValue DemoteStackSlot; int DemoteStackIdx = -100; @@ -4814,8 +4937,14 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i) { - SDValue ArgNode = getValue(*i); - Entry.Node = ArgNode; Entry.Ty = (*i)->getType(); + const Value *V = *i; + + // Skip empty types + if (V->getType()->isEmptyTy()) + continue; + + SDValue ArgNode = getValue(V); + Entry.Node = ArgNode; Entry.Ty = V->getType(); unsigned attrInd = i - CS.arg_begin() + 1; Entry.isSExt = CS.paramHasAttr(attrInd, Attribute::SExt); @@ -5255,6 +5384,7 @@ public: const llvm::Type *OpTy = CallOperandVal->getType(); + // FIXME: code duplicated from TargetLowering::ParseConstraints(). // If this is an indirect operand, the operand is a pointer to the // accessed type. if (isIndirect) { @@ -5264,6 +5394,11 @@ public: OpTy = PtrTy->getElementType(); } + // Look for vector wrapped in a struct. e.g. { <16 x i8> }. + if (const StructType *STy = dyn_cast<StructType>(OpTy)) + if (STy->getNumElements() == 1) + OpTy = STy->getElementType(0); + // If OpTy is not a single value, it may be a struct/union that we // can tile with integers. if (!OpTy->isSingleValueType() && OpTy->isSized()) { @@ -5315,6 +5450,8 @@ isAllocatableRegister(unsigned Reg, MachineFunction &MF, EVT ThisVT = MVT::Other; const TargetRegisterClass *RC = *RCI; + if (!RC->isAllocatable()) + continue; // If none of the value types for this register class are valid, we // can't use it. For example, 64-bit reg classes on 32-bit targets. for (TargetRegisterClass::vt_iterator I = RC->vt_begin(), E = RC->vt_end(); @@ -5336,15 +5473,14 @@ isAllocatableRegister(unsigned Reg, MachineFunction &MF, // frame pointer in functions that need it (due to them not being taken // out of allocation, because a variable sized allocation hasn't been seen // yet). This is a slight code pessimization, but should still work. - for (TargetRegisterClass::iterator I = RC->allocation_order_begin(MF), - E = RC->allocation_order_end(MF); I != E; ++I) - if (*I == Reg) { - // We found a matching register class. Keep looking at others in case - // we find one with larger registers that this physreg is also in. - FoundRC = RC; - FoundVT = ThisVT; - break; - } + ArrayRef<unsigned> RawOrder = RC->getRawAllocationOrder(MF); + if (std::find(RawOrder.begin(), RawOrder.end(), Reg) != RawOrder.end()) { + // We found a matching register class. Keep looking at others in case + // we find one with larger registers that this physreg is also in. + FoundRC = RC; + FoundVT = ThisVT; + break; + } } return FoundRC; } @@ -5491,9 +5627,15 @@ static void GetRegistersForValue(SelectionDAG &DAG, OpInfo.ConstraintVT); const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo(); + BitVector Reserved = TRI->getReservedRegs(MF); unsigned NumAllocated = 0; for (unsigned i = 0, e = RegClassRegs.size(); i != e; ++i) { unsigned Reg = RegClassRegs[i]; + // Filter out the reserved registers, but note that reserved registers are + // not fully determined at this point. We may still decide we need a frame + // pointer. + if (Reserved.test(Reg)) + continue; // See if this register is available. if ((isOutReg && OutputRegs.count(Reg)) || // Already used. (isInReg && InputRegs.count(Reg))) { // Already used. 
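The reserved-register filter added to GetRegistersForValue above has a simple shape: walk the constraint's candidate registers and drop any the target has marked reserved (frame pointer, stack pointer, and the like). A minimal sketch, with a std::bitset standing in for the BitVector returned by TRI->getReservedRegs(MF) and toy register numbers (assumptions for illustration, not LLVM API):

#include <bitset>
#include <cstdio>
#include <vector>

int main() {
  // Stand-in for the reserved set: registers the allocator must not hand out.
  std::bitset<32> Reserved;
  Reserved.set(6);  // e.g. a frame-pointer register, reserved conservatively

  // Stand-in for RegClassRegs: the candidate registers for this constraint.
  std::vector<unsigned> RegClassRegs = {6, 3, 5, 7};

  for (unsigned Reg : RegClassRegs) {
    if (Reserved.test(Reg))
      continue;  // skip reserved registers, as the patch does
    std::printf("usable candidate: r%u\n", Reg);
  }
  return 0;
}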
@@ -5542,7 +5684,9 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { std::set<unsigned> OutputRegs, InputRegs; - TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(CS); + TargetLowering::AsmOperandInfoVector + TargetConstraints = TLI.ParseConstraints(CS); + bool hasMemory = false; unsigned ArgNo = 0; // ArgNo - The argument of the CallInst. @@ -5601,7 +5745,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { hasMemory = true; else { for (unsigned j = 0, ee = OpInfo.Codes.size(); j != ee; ++j) { - TargetLowering::ConstraintType CType = TLI.getConstraintType(OpInfo.Codes[j]); + TargetLowering::ConstraintType + CType = TLI.getConstraintType(OpInfo.Codes[j]); if (CType == TargetLowering::C_Memory) { hasMemory = true; break; @@ -5651,12 +5796,17 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { // need to to provide an address for the memory input. if (OpInfo.ConstraintType == TargetLowering::C_Memory && !OpInfo.isIndirect) { - assert((OpInfo.isMultipleAlternative || (OpInfo.Type == InlineAsm::isInput)) && + assert((OpInfo.isMultipleAlternative || + (OpInfo.Type == InlineAsm::isInput)) && "Can only indirectify direct input operands!"); // Memory operands really want the address of the value. If we don't have // an indirect input, put it in the constpool if we can, otherwise spill // it to a stack slot. + // TODO: This isn't quite right. We need to handle these according to + // the addressing mode that the constraint wants. Also, this may take + // an additional register for the computation and we don't want that + // either. // If the operand is a float, integer, or vector constant, spill to a // constant pool entry to get its address. @@ -5858,7 +6008,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { if (OpInfo.ConstraintType == TargetLowering::C_Other) { std::vector<SDValue> Ops; - TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode[0], + TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode, Ops, DAG); if (Ops.empty()) report_fatal_error("Invalid operand for inline asm constraint '" + @@ -6067,14 +6217,15 @@ TargetLowering::LowerCallTo(SDValue Chain, const Type *RetTy, Flags.setByVal(); const PointerType *Ty = cast<PointerType>(Args[i].Ty); const Type *ElementTy = Ty->getElementType(); - unsigned FrameAlign = getByValTypeAlignment(ElementTy); - unsigned FrameSize = getTargetData()->getTypeAllocSize(ElementTy); + Flags.setByValSize(getTargetData()->getTypeAllocSize(ElementTy)); // For ByVal, alignment should come from FE. BE will guess if this // info is not there but there are cases it cannot get right. + unsigned FrameAlign; if (Args[i].Alignment) FrameAlign = Args[i].Alignment; + else + FrameAlign = getByValTypeAlignment(ElementTy); Flags.setByValAlign(FrameAlign); - Flags.setByValSize(FrameSize); } if (Args[i].isNest) Flags.setNest(); @@ -6180,7 +6331,7 @@ TargetLowering::LowerCallTo(SDValue Chain, const Type *RetTy, // For a function returning void, there is no return value. We can't create // such a node, so we just return a null return value in that case. In - // that case, nothing will actualy look at the value. + // that case, nothing will actually look at the value. 
if (ReturnValues.empty()) return std::make_pair(SDValue(), Chain); @@ -6219,6 +6370,25 @@ SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) { #include "llvm/CodeGen/SelectionDAGISel.h" +/// isOnlyUsedInEntryBlock - If the specified argument is only used in the +/// entry block, return true. This includes arguments used by switches, since +/// the switch may expand into multiple basic blocks. +static bool isOnlyUsedInEntryBlock(const Argument *A) { + // With FastISel active, we may be splitting blocks, so force creation + // of virtual registers for all non-dead arguments. + if (EnableFastISel) + return A->use_empty(); + + const BasicBlock *Entry = A->getParent()->begin(); + for (Value::const_use_iterator UI = A->use_begin(), E = A->use_end(); + UI != E; ++UI) { + const User *U = *UI; + if (cast<Instruction>(U)->getParent() != Entry || isa<SwitchInst>(U)) + return false; // Use not in entry block. + } + return true; +} + void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) { // If this is the entry block, emit arguments. const Function &F = *LLVMBB->getParent(); @@ -6273,14 +6443,15 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) { Flags.setByVal(); const PointerType *Ty = cast<PointerType>(I->getType()); const Type *ElementTy = Ty->getElementType(); - unsigned FrameAlign = TLI.getByValTypeAlignment(ElementTy); - unsigned FrameSize = TD->getTypeAllocSize(ElementTy); + Flags.setByValSize(TD->getTypeAllocSize(ElementTy)); // For ByVal, alignment should be passed from FE. BE will guess if // this info is not there but there are cases it cannot get right. + unsigned FrameAlign; if (F.getParamAlignment(Idx)) FrameAlign = F.getParamAlignment(Idx); + else + FrameAlign = TLI.getByValTypeAlignment(ElementTy); Flags.setByValAlign(FrameAlign); - Flags.setByValSize(FrameSize); } if (F.paramHasAttr(Idx, Attribute::Nest)) Flags.setNest(); @@ -6362,8 +6533,8 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) { if (I->use_empty() && NumValues) SDB->setUnusedArgValue(I, InVals[i]); - for (unsigned Value = 0; Value != NumValues; ++Value) { - EVT VT = ValueVTs[Value]; + for (unsigned Val = 0; Val != NumValues; ++Val) { + EVT VT = ValueVTs[Val]; EVT PartVT = TLI.getRegisterType(*CurDAG->getContext(), VT); unsigned NumParts = TLI.getNumRegisters(*CurDAG->getContext(), VT); @@ -6382,21 +6553,35 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) { i += NumParts; } + // We don't need to do anything else for unused arguments. + if (ArgValues.empty()) + continue; + // Note down frame index for byval arguments. - if (I->hasByValAttr() && !ArgValues.empty()) + if (I->hasByValAttr()) if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode())) FuncInfo->setByValArgumentFrameIndex(I, FI->getIndex()); - if (!I->use_empty()) { - SDValue Res; - if (!ArgValues.empty()) - Res = DAG.getMergeValues(&ArgValues[0], NumValues, - SDB->getCurDebugLoc()); - SDB->setValue(I, Res); - - // If this argument is live outside of the entry block, insert a copy from - // whereever we got it to the vreg that other BB's will reference it as. + SDValue Res = DAG.getMergeValues(&ArgValues[0], NumValues, + SDB->getCurDebugLoc()); + SDB->setValue(I, Res); + + // If this argument is live outside of the entry block, insert a copy from + // wherever we got it to the vreg that other BB's will reference it as. + if (!EnableFastISel && Res.getOpcode() == ISD::CopyFromReg) { + // If we can, though, try to skip creating an unnecessary vreg. 
+        // FIXME: This isn't very clean... it would be nice to make this more
+        // general.  It's also subtly incompatible with the hacks FastISel
+        // uses with vregs.
+        unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
+        if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+          FuncInfo->ValueMap[I] = Reg;
+          continue;
+        }
+      }
+      if (!isOnlyUsedInEntryBlock(I)) {
+        FuncInfo->InitializeRegForValue(I);
         SDB->CopyToExportRegsIfNeeded(I);
       }
     }
@@ -6442,6 +6627,10 @@ SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
       // Ignore dead phi's.
       if (PN->use_empty()) continue;
+      // Skip empty types
+      if (PN->getType()->isEmptyTy())
+        continue;
+
       unsigned Reg;
       const Value *PHIOp = PN->getIncomingValueForBlock(LLVMBB);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index f546612..a1ca891 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -23,7 +23,6 @@
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/ErrorHandling.h"
 #include <vector>
-#include <set>
 namespace llvm {
@@ -333,6 +332,14 @@ public:
   /// consumed.
   void clear();
+  /// clearDanglingDebugInfo - Clear the dangling debug information
+  /// map. This function is separated from the clear so that debug
+  /// information that is dangling in a basic block can be properly
+  /// resolved in a different basic block. This allows the
+  /// SelectionDAG to resolve dangling debug information attached
+  /// to PHI nodes.
+  void clearDanglingDebugInfo();
+
   /// getRoot - Return the current virtual root of the Selection DAG,
   /// flushing any PendingLoad items. This must be done before emitting
   /// a store or any other node that may need to be ordered after any
@@ -427,6 +434,9 @@ private:
                                 const Value* SV,
                                 MachineBasicBlock* Default,
                                 MachineBasicBlock *SwitchBB);
+
+  uint32_t getEdgeWeight(MachineBasicBlock *Src, MachineBasicBlock *Dst);
+  void addSuccessorWithWeight(MachineBasicBlock *Src, MachineBasicBlock *Dst);
 public:
   void visitSwitchCase(CaseBlock &CB,
                        MachineBasicBlock *SwitchBB);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 4b6066e..dc8044b 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -17,6 +17,7 @@
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/DebugInfo.h"
 #include "llvm/Constants.h"
 #include "llvm/Function.h"
@@ -55,17 +56,11 @@ using namespace llvm;
 STATISTIC(NumFastIselFailures, "Number of instructions fast isel failed on");
+STATISTIC(NumFastIselSuccess, "Number of instructions fast isel selected");
 STATISTIC(NumFastIselBlocks, "Number of blocks selected entirely by fast isel");
 STATISTIC(NumDAGBlocks, "Number of blocks selected using DAG");
 STATISTIC(NumDAGIselRetries,"Number of times dag isel has to try another path");
-#ifndef NDEBUG
-STATISTIC(NumBBWithOutOfOrderLineInfo,
-          "Number of blocks with out of order line number info");
-STATISTIC(NumMBBWithOutOfOrderLineInfo,
-          "Number of machine blocks with out of order line number info");
-#endif
-
 static cl::opt<bool>
 EnableFastISelVerbose("fast-isel-verbose", cl::Hidden,
           cl::desc("Enable verbose messages in the \"fast\" "
@@ -74,6 +69,11 @@ static cl::opt<bool>
 EnableFastISelAbort("fast-isel-abort", cl::Hidden,
           cl::desc("Enable abort calls when 
\"fast\" instruction fails")); +static cl::opt<bool> +UseMBPI("use-mbpi", + cl::desc("use Machine Branch Probability Info"), + cl::init(true), cl::Hidden); + #ifndef NDEBUG static cl::opt<bool> ViewDAGCombine1("view-dag-combine1-dags", cl::Hidden, @@ -192,6 +192,7 @@ SelectionDAGISel::SelectionDAGISel(const TargetMachine &tm, DAGSize(0) { initializeGCModuleInfoPass(*PassRegistry::getPassRegistry()); initializeAliasAnalysisAnalysisGroup(*PassRegistry::getPassRegistry()); + initializeBranchProbabilityInfoPass(*PassRegistry::getPassRegistry()); } SelectionDAGISel::~SelectionDAGISel() { @@ -205,43 +206,11 @@ void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<AliasAnalysis>(); AU.addRequired<GCModuleInfo>(); AU.addPreserved<GCModuleInfo>(); + if (UseMBPI && OptLevel != CodeGenOpt::None) + AU.addRequired<BranchProbabilityInfo>(); MachineFunctionPass::getAnalysisUsage(AU); } -/// FunctionCallsSetJmp - Return true if the function has a call to setjmp or -/// other function that gcc recognizes as "returning twice". This is used to -/// limit code-gen optimizations on the machine function. -/// -/// FIXME: Remove after <rdar://problem/8031714> is fixed. -static bool FunctionCallsSetJmp(const Function *F) { - const Module *M = F->getParent(); - static const char *ReturnsTwiceFns[] = { - "_setjmp", - "setjmp", - "sigsetjmp", - "setjmp_syscall", - "savectx", - "qsetjmp", - "vfork", - "getcontext" - }; -#define NUM_RETURNS_TWICE_FNS sizeof(ReturnsTwiceFns) / sizeof(const char *) - - for (unsigned I = 0; I < NUM_RETURNS_TWICE_FNS; ++I) - if (const Function *Callee = M->getFunction(ReturnsTwiceFns[I])) { - if (!Callee->use_empty()) - for (Value::const_use_iterator - I = Callee->use_begin(), E = Callee->use_end(); - I != E; ++I) - if (const CallInst *CI = dyn_cast<CallInst>(*I)) - if (CI->getParent()->getParent() == F) - return true; - } - - return false; -#undef NUM_RETURNS_TWICE_FNS -} - /// SplitCriticalSideEffectEdges - Look for critical edges with a PHI value that /// may trap on it. In this case we have to split the edge so that the path /// through the predecessor block that doesn't go to the phi block doesn't @@ -302,6 +271,12 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { CurDAG->init(*MF); FuncInfo->set(Fn, *MF); + + if (UseMBPI && OptLevel != CodeGenOpt::None) + FuncInfo->BPI = &getAnalysis<BranchProbabilityInfo>(); + else + FuncInfo->BPI = 0; + SDB->init(GFI, *AA); SelectAllBasicBlocks(Fn); @@ -392,7 +367,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { } // Determine if there is a call to setjmp in the machine function. - MF->setCallsSetJmp(FunctionCallsSetJmp(&Fn)); + MF->setCallsSetJmp(Fn.callsFunctionThatReturnsTwice()); // Replace forward-declared registers with the registers containing // the desired value. @@ -421,10 +396,9 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { return true; } -void -SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin, - BasicBlock::const_iterator End, - bool &HadTailCall) { +void SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin, + BasicBlock::const_iterator End, + bool &HadTailCall) { // Lower all of the non-terminator instructions. If a call is emitted // as a tail call, cease emitting nodes for this block. Terminators // are handled below. @@ -438,7 +412,6 @@ SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin, // Final step, emit the lowered DAG as machine code. 
CodeGenAndEmitDAG(); - return; } void SelectionDAGISel::ComputeLiveOutVRegInfo() { @@ -572,7 +545,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { { NamedRegionTimer T("DAG Legalization", GroupName, TimePassesIsEnabled); - CurDAG->Legalize(OptLevel); + CurDAG->Legalize(); } DEBUG(dbgs() << "Legalized selection DAG: BB#" << BlockNumber @@ -748,16 +721,49 @@ void SelectionDAGISel::PrepareEHLandingPad() { - +/// TryToFoldFastISelLoad - We're checking to see if we can fold the specified +/// load into the specified FoldInst. Note that we could have a sequence where +/// multiple LLVM IR instructions are folded into the same machineinstr. For +/// example we could have: +/// A: x = load i32 *P +/// B: y = icmp A, 42 +/// C: br y, ... +/// +/// In this scenario, LI is "A", and FoldInst is "C". We know about "B" (and +/// any other folded instructions) because it is between A and C. +/// +/// If we succeed in folding the load into the operation, return true. +/// bool SelectionDAGISel::TryToFoldFastISelLoad(const LoadInst *LI, + const Instruction *FoldInst, FastISel *FastIS) { + // We know that the load has a single use, but don't know what it is. If it + // isn't one of the folded instructions, then we can't succeed here. Handle + // this by scanning the single-use users of the load until we get to FoldInst. + unsigned MaxUsers = 6; // Don't scan down huge single-use chains of instrs. + + const Instruction *TheUser = LI->use_back(); + while (TheUser != FoldInst && // Scan up until we find FoldInst. + // Stay in the right block. + TheUser->getParent() == FoldInst->getParent() && + --MaxUsers) { // Don't scan too far. + // If there are multiple or no uses of this instruction, then bail out. + if (!TheUser->hasOneUse()) + return false; + + TheUser = TheUser->use_back(); + } + // Don't try to fold volatile loads. Target has to deal with alignment // constraints. if (LI->isVolatile()) return false; - // Figure out which vreg this is going into. + // Figure out which vreg this is going into. If there is no assigned vreg yet + // then there actually was no reference to it. Perhaps the load is referenced + // by a dead instruction. unsigned LoadReg = FastIS->getRegForValue(LI); - assert(LoadReg && "Load isn't already assigned a vreg? "); + if (LoadReg == 0) + return false; // Check to see what the uses of this vreg are. If it has no uses, or more // than one use (at the machine instr level) then we can't fold it. @@ -788,48 +794,17 @@ bool SelectionDAGISel::TryToFoldFastISelLoad(const LoadInst *LI, return FastIS->TryToFoldLoad(User, RI.getOperandNo(), LI); } -#ifndef NDEBUG -/// CheckLineNumbers - Check if basic block instructions follow source order -/// or not. -static void CheckLineNumbers(const BasicBlock *BB) { - unsigned Line = 0; - unsigned Col = 0; - for (BasicBlock::const_iterator BI = BB->begin(), - BE = BB->end(); BI != BE; ++BI) { - const DebugLoc DL = BI->getDebugLoc(); - if (DL.isUnknown()) continue; - unsigned L = DL.getLine(); - unsigned C = DL.getCol(); - if (L < Line || (L == Line && C < Col)) { - ++NumBBWithOutOfOrderLineInfo; - return; - } - Line = L; - Col = C; - } +/// isFoldedOrDeadInstruction - Return true if the specified instruction is +/// side-effect free and is either dead or folded into a generated instruction. +/// Return false if it needs to be emitted. +static bool isFoldedOrDeadInstruction(const Instruction *I, + FunctionLoweringInfo *FuncInfo) { + return !I->mayWriteToMemory() && // Side-effecting instructions aren't folded. 
+ !isa<TerminatorInst>(I) && // Terminators aren't folded. + !isa<DbgInfoIntrinsic>(I) && // Debug instructions aren't folded. + !FuncInfo->isExportedInst(I); // Exported instrs must be computed. } -/// CheckLineNumbers - Check if machine basic block instructions follow source -/// order or not. -static void CheckLineNumbers(const MachineBasicBlock *MBB) { - unsigned Line = 0; - unsigned Col = 0; - for (MachineBasicBlock::const_iterator MBI = MBB->begin(), - MBE = MBB->end(); MBI != MBE; ++MBI) { - const DebugLoc DL = MBI->getDebugLoc(); - if (DL.isUnknown()) continue; - unsigned L = DL.getLine(); - unsigned C = DL.getCol(); - if (L < Line || (L == Line && C < Col)) { - ++NumMBBWithOutOfOrderLineInfo; - return; - } - Line = L; - Col = C; - } -} -#endif - void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // Initialize the Fast-ISel state, if needed. FastISel *FastIS = 0; @@ -841,9 +816,6 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { for (ReversePostOrderTraversal<const Function*>::rpo_iterator I = RPOT.begin(), E = RPOT.end(); I != E; ++I) { const BasicBlock *LLVMBB = *I; -#ifndef NDEBUG - CheckLineNumbers(LLVMBB); -#endif if (OptLevel != CodeGenOpt::None) { bool AllPredsVisited = true; @@ -856,15 +828,13 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { } if (AllPredsVisited) { - for (BasicBlock::const_iterator I = LLVMBB->begin(), E = LLVMBB->end(); - I != E && isa<PHINode>(I); ++I) { + for (BasicBlock::const_iterator I = LLVMBB->begin(); + isa<PHINode>(I); ++I) FuncInfo->ComputePHILiveOutRegInfo(cast<PHINode>(I)); - } } else { - for (BasicBlock::const_iterator I = LLVMBB->begin(), E = LLVMBB->end(); - I != E && isa<PHINode>(I); ++I) { + for (BasicBlock::const_iterator I = LLVMBB->begin(); + isa<PHINode>(I); ++I) FuncInfo->InvalidatePHILiveOutRegInfo(cast<PHINode>(I)); - } } FuncInfo->VisitedBBs.insert(LLVMBB); @@ -912,10 +882,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { const Instruction *Inst = llvm::prior(BI); // If we no longer require this instruction, skip it. - if (!Inst->mayWriteToMemory() && - !isa<TerminatorInst>(Inst) && - !isa<DbgInfoIntrinsic>(Inst) && - !FuncInfo->isExportedInst(Inst)) + if (isFoldedOrDeadInstruction(Inst, FuncInfo)) continue; // Bottom-up: reset the insert pos at the top, after any local-value @@ -924,16 +891,21 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // Try to select the instruction with FastISel. if (FastIS->SelectInstruction(Inst)) { - // If fast isel succeeded, check to see if there is a single-use - // non-volatile load right before the selected instruction, and see if - // the load is used by the instruction. If so, try to fold it. - const Instruction *BeforeInst = 0; - if (Inst != Begin) - BeforeInst = llvm::prior(llvm::prior(BI)); - if (BeforeInst && isa<LoadInst>(BeforeInst) && - BeforeInst->hasOneUse() && *BeforeInst->use_begin() == Inst && - TryToFoldFastISelLoad(cast<LoadInst>(BeforeInst), FastIS)) - --BI; // If we succeeded, don't re-select the load. + ++NumFastIselSuccess; + // If fast isel succeeded, skip over all the folded instructions, and + // then see if there is a load right before the selected instructions. + // Try to fold the load if so. 
+ const Instruction *BeforeInst = Inst; + while (BeforeInst != Begin) { + BeforeInst = llvm::prior(BasicBlock::const_iterator(BeforeInst)); + if (!isFoldedOrDeadInstruction(BeforeInst, FuncInfo)) + break; + } + if (BeforeInst != Inst && isa<LoadInst>(BeforeInst) && + BeforeInst->hasOneUse() && + TryToFoldFastISelLoad(cast<LoadInst>(BeforeInst), Inst, FastIS)) + // If we succeeded, don't re-select the load. + BI = llvm::next(BasicBlock::const_iterator(BeforeInst)); continue; } @@ -963,9 +935,14 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { continue; } - // Otherwise, give up on FastISel for the rest of the block. - // For now, be a little lenient about non-branch terminators. - if (!isa<TerminatorInst>(Inst) || isa<BranchInst>(Inst)) { + if (isa<TerminatorInst>(Inst) && !isa<BranchInst>(Inst)) { + // Don't abort, and use a different message for terminator misses. + ++NumFastIselFailures; + if (EnableFastISelVerbose || EnableFastISelAbort) { + dbgs() << "FastISel missed terminator: "; + Inst->dump(); + } + } else { ++NumFastIselFailures; if (EnableFastISelVerbose || EnableFastISelAbort) { dbgs() << "FastISel miss: "; @@ -987,22 +964,20 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { else ++NumFastIselBlocks; - // Run SelectionDAG instruction selection on the remainder of the block - // not handled by FastISel. If FastISel is not run, this is the entire - // block. - bool HadTailCall; - SelectBasicBlock(Begin, BI, HadTailCall); + if (Begin != BI) { + // Run SelectionDAG instruction selection on the remainder of the block + // not handled by FastISel. If FastISel is not run, this is the entire + // block. + bool HadTailCall; + SelectBasicBlock(Begin, BI, HadTailCall); + } FinishBasicBlock(); FuncInfo->PHINodesToUpdate.clear(); } delete FastIS; -#ifndef NDEBUG - for (MachineFunction::const_iterator MBI = MF->begin(), MBE = MF->end(); - MBI != MBE; ++MBI) - CheckLineNumbers(MBI); -#endif + SDB->clearDanglingDebugInfo(); } void @@ -2634,11 +2609,45 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable, // instructions that access memory and for ComplexPatterns that match // loads. if (EmitNodeInfo & OPFL_MemRefs) { + // Only attach load or store memory operands if the generated + // instruction may load or store. 
+ const TargetInstrDesc &TID = TM.getInstrInfo()->get(TargetOpc); + bool mayLoad = TID.mayLoad(); + bool mayStore = TID.mayStore(); + + unsigned NumMemRefs = 0; + for (SmallVector<MachineMemOperand*, 2>::const_iterator I = + MatchedMemRefs.begin(), E = MatchedMemRefs.end(); I != E; ++I) { + if ((*I)->isLoad()) { + if (mayLoad) + ++NumMemRefs; + } else if ((*I)->isStore()) { + if (mayStore) + ++NumMemRefs; + } else { + ++NumMemRefs; + } + } + MachineSDNode::mmo_iterator MemRefs = - MF->allocateMemRefsArray(MatchedMemRefs.size()); - std::copy(MatchedMemRefs.begin(), MatchedMemRefs.end(), MemRefs); + MF->allocateMemRefsArray(NumMemRefs); + + MachineSDNode::mmo_iterator MemRefsPos = MemRefs; + for (SmallVector<MachineMemOperand*, 2>::const_iterator I = + MatchedMemRefs.begin(), E = MatchedMemRefs.end(); I != E; ++I) { + if ((*I)->isLoad()) { + if (mayLoad) + *MemRefsPos++ = *I; + } else if ((*I)->isStore()) { + if (mayStore) + *MemRefsPos++ = *I; + } else { + *MemRefsPos++ = *I; + } + } + cast<MachineSDNode>(Res) - ->setMemRefs(MemRefs, MemRefs + MatchedMemRefs.size()); + ->setMemRefs(MemRefs, MemRefs + NumMemRefs); } DEBUG(errs() << " " diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 4b0822d..efbfaa4 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -26,11 +26,19 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include <cctype> using namespace llvm; +/// We are in the process of implementing a new TypeLegalization action +/// - the promotion of vector elements. This feature is disabled by default +/// and only enabled using this flag. +static cl::opt<bool> +AllowPromoteIntElem("promote-elements", cl::Hidden, + cl::desc("Allow promotion of integer vector element types")); + namespace llvm { TLSModel::Model getTLSModel(const GlobalValue *GV, Reloc::Model reloc) { bool isLocal = GV->hasLocalLinkage(); @@ -528,7 +536,8 @@ static void InitCmpLibcallCCs(ISD::CondCode *CCs) { /// NOTE: The constructor takes ownership of TLOF. TargetLowering::TargetLowering(const TargetMachine &tm, const TargetLoweringObjectFile *tlof) - : TM(tm), TD(TM.getTargetData()), TLOF(*tlof) { + : TM(tm), TD(TM.getTargetData()), TLOF(*tlof), + mayPromoteElements(AllowPromoteIntElem) { // All operations default to being supported. memset(OpActions, 0, sizeof(OpActions)); memset(LoadExtActions, 0, sizeof(LoadExtActions)); @@ -596,6 +605,8 @@ TargetLowering::TargetLowering(const TargetMachine &tm, SchedPreferenceInfo = Sched::Latency; JumpBufSize = 0; JumpBufAlignment = 0; + MinFunctionAlignment = 0; + PrefFunctionAlignment = 0; PrefLoopAlignment = 0; MinStackArgumentAlignment = 1; ShouldFoldAtomicFences = false; @@ -662,10 +673,16 @@ static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT, NewVT = EltTy; IntermediateVT = NewVT; + unsigned NewVTSize = NewVT.getSizeInBits(); + + // Convert sizes such as i33 to i64. + if (!isPowerOf2_32(NewVTSize)) + NewVTSize = NextPowerOf2(NewVTSize); + EVT DestVT = TLI->getRegisterType(NewVT); RegisterVT = DestVT; if (EVT(DestVT).bitsLT(NewVT)) // Value is expanded, e.g. i64 -> i16. 
- return NumVectorRegs*(NewVT.getSizeInBits()/DestVT.getSizeInBits()); + return NumVectorRegs*(NewVTSize/DestVT.getSizeInBits()); // Otherwise, promotion or legal types use the same number of registers as // the vector decimated to the appropriate level. @@ -747,7 +764,7 @@ void TargetLowering::computeRegisterProperties() { NumRegistersForVT[ExpandedReg] = 2*NumRegistersForVT[ExpandedReg-1]; RegisterTypeForVT[ExpandedReg] = (MVT::SimpleValueType)LargestIntReg; TransformToType[ExpandedReg] = (MVT::SimpleValueType)(ExpandedReg - 1); - ValueTypeActions.setTypeAction(ExpandedVT, Expand); + ValueTypeActions.setTypeAction(ExpandedVT, TypeExpandInteger); } // Inspect all of the ValueType's smaller than the largest integer @@ -761,7 +778,7 @@ void TargetLowering::computeRegisterProperties() { } else { RegisterTypeForVT[IntReg] = TransformToType[IntReg] = (MVT::SimpleValueType)LegalIntReg; - ValueTypeActions.setTypeAction(IVT, Promote); + ValueTypeActions.setTypeAction(IVT, TypePromoteInteger); } } @@ -770,7 +787,7 @@ void TargetLowering::computeRegisterProperties() { NumRegistersForVT[MVT::ppcf128] = 2*NumRegistersForVT[MVT::f64]; RegisterTypeForVT[MVT::ppcf128] = MVT::f64; TransformToType[MVT::ppcf128] = MVT::f64; - ValueTypeActions.setTypeAction(MVT::ppcf128, Expand); + ValueTypeActions.setTypeAction(MVT::ppcf128, TypeExpandFloat); } // Decide how to handle f64. If the target does not have native f64 support, @@ -779,7 +796,7 @@ void TargetLowering::computeRegisterProperties() { NumRegistersForVT[MVT::f64] = NumRegistersForVT[MVT::i64]; RegisterTypeForVT[MVT::f64] = RegisterTypeForVT[MVT::i64]; TransformToType[MVT::f64] = MVT::i64; - ValueTypeActions.setTypeAction(MVT::f64, Expand); + ValueTypeActions.setTypeAction(MVT::f64, TypeSoftenFloat); } // Decide how to handle f32. If the target does not have native support for @@ -789,12 +806,12 @@ void TargetLowering::computeRegisterProperties() { NumRegistersForVT[MVT::f32] = NumRegistersForVT[MVT::f64]; RegisterTypeForVT[MVT::f32] = RegisterTypeForVT[MVT::f64]; TransformToType[MVT::f32] = MVT::f64; - ValueTypeActions.setTypeAction(MVT::f32, Promote); + ValueTypeActions.setTypeAction(MVT::f32, TypePromoteInteger); } else { NumRegistersForVT[MVT::f32] = NumRegistersForVT[MVT::i32]; RegisterTypeForVT[MVT::f32] = RegisterTypeForVT[MVT::i32]; TransformToType[MVT::f32] = MVT::i32; - ValueTypeActions.setTypeAction(MVT::f32, Expand); + ValueTypeActions.setTypeAction(MVT::f32, TypeSoftenFloat); } } @@ -810,6 +827,30 @@ void TargetLowering::computeRegisterProperties() { unsigned NElts = VT.getVectorNumElements(); if (NElts != 1) { bool IsLegalWiderType = false; + // If we allow the promotion of vector elements using a flag, + // then return TypePromoteInteger on vector elements. + // First try to promote the elements of integer vectors. If no legal + // promotion was found, fallback to the widen-vector method. + if (mayPromoteElements) + for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) { + EVT SVT = (MVT::SimpleValueType)nVT; + // Promote vectors of integers to vectors with the same number + // of elements, with a wider element type. 
+ if (SVT.getVectorElementType().getSizeInBits() > EltVT.getSizeInBits() + && SVT.getVectorNumElements() == NElts && + isTypeLegal(SVT) && SVT.getScalarType().isInteger()) { + TransformToType[i] = SVT; + RegisterTypeForVT[i] = SVT; + NumRegistersForVT[i] = 1; + ValueTypeActions.setTypeAction(VT, TypePromoteInteger); + IsLegalWiderType = true; + break; + } + } + + if (IsLegalWiderType) continue; + + // Try to widen the vector. for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) { EVT SVT = (MVT::SimpleValueType)nVT; if (SVT.getVectorElementType() == EltVT && @@ -818,7 +859,7 @@ void TargetLowering::computeRegisterProperties() { TransformToType[i] = SVT; RegisterTypeForVT[i] = SVT; NumRegistersForVT[i] = 1; - ValueTypeActions.setTypeAction(VT, Promote); + ValueTypeActions.setTypeAction(VT, TypeWidenVector); IsLegalWiderType = true; break; } @@ -838,10 +879,12 @@ void TargetLowering::computeRegisterProperties() { if (NVT == VT) { // Type is already a power of 2. The default action is to split. TransformToType[i] = MVT::Other; - ValueTypeActions.setTypeAction(VT, Expand); + unsigned NumElts = VT.getVectorNumElements(); + ValueTypeActions.setTypeAction(VT, + NumElts > 1 ? TypeSplitVector : TypeScalarizeVector); } else { TransformToType[i] = NVT; - ValueTypeActions.setTypeAction(VT, Promote); + ValueTypeActions.setTypeAction(VT, TypeWidenVector); } } @@ -890,7 +933,7 @@ unsigned TargetLowering::getVectorTypeBreakdown(LLVMContext &Context, EVT VT, // If there is a wider vector type with the same element type as this one, // we should widen to that legal vector type. This handles things like // <2 x float> -> <4 x float>. - if (NumElts != 1 && getTypeAction(VT) == Promote) { + if (NumElts != 1 && getTypeAction(Context, VT) == TypeWidenVector) { RegisterVT = getTypeToTransformTo(Context, VT); if (isTypeLegal(RegisterVT)) { IntermediateVT = RegisterVT; @@ -928,8 +971,14 @@ unsigned TargetLowering::getVectorTypeBreakdown(LLVMContext &Context, EVT VT, EVT DestVT = getRegisterType(Context, NewVT); RegisterVT = DestVT; + unsigned NewVTSize = NewVT.getSizeInBits(); + + // Convert sizes such as i33 to i64. + if (!isPowerOf2_32(NewVTSize)) + NewVTSize = NextPowerOf2(NewVTSize); + if (DestVT.bitsLT(NewVT)) // Value is expanded, e.g. i64 -> i16. - return NumVectorRegs*(NewVT.getSizeInBits()/DestVT.getSizeInBits()); + return NumVectorRegs*(NewVTSize/DestVT.getSizeInBits()); // Otherwise, promotion or legal types use the same number of registers as // the vector decimated to the appropriate level. @@ -1678,6 +1727,13 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(In.getOperand(1)); if (!ShAmt) break; + SDValue Shift = In.getOperand(1); + if (TLO.LegalTypes()) { + uint64_t ShVal = ShAmt->getZExtValue(); + Shift = + TLO.DAG.getConstant(ShVal, getShiftAmountTy(Op.getValueType())); + } + APInt HighBits = APInt::getHighBitsSet(OperandBitWidth, OperandBitWidth - BitWidth); HighBits = HighBits.lshr(ShAmt->getZExtValue()).trunc(BitWidth); @@ -1691,7 +1747,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, Op.getValueType(), NewTrunc, - In.getOperand(1))); + Shift)); } break; } @@ -1716,26 +1772,28 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, break; } case ISD::BITCAST: -#if 0 - // If this is an FP->Int bitcast and if the sign bit is the only thing that - // is demanded, turn this into a FGETSIGN. 
- if (NewMask == EVT::getIntegerVTSignBit(Op.getValueType()) && - MVT::isFloatingPoint(Op.getOperand(0).getValueType()) && - !MVT::isVector(Op.getOperand(0).getValueType())) { - // Only do this xform if FGETSIGN is valid or if before legalize. - if (!TLO.AfterLegalize || - isOperationLegal(ISD::FGETSIGN, Op.getValueType())) { + // If this is an FP->Int bitcast and if the sign bit is the only + // thing demanded, turn this into a FGETSIGN. + if (!Op.getOperand(0).getValueType().isVector() && + NewMask == APInt::getSignBit(Op.getValueType().getSizeInBits()) && + Op.getOperand(0).getValueType().isFloatingPoint()) { + bool OpVTLegal = isOperationLegalOrCustom(ISD::FGETSIGN, Op.getValueType()); + bool i32Legal = isOperationLegalOrCustom(ISD::FGETSIGN, MVT::i32); + if ((OpVTLegal || i32Legal) && Op.getValueType().isSimple()) { + EVT Ty = OpVTLegal ? Op.getValueType() : MVT::i32; // Make a FGETSIGN + SHL to move the sign bit into the appropriate // place. We expect the SHL to be eliminated by other optimizations. - SDValue Sign = TLO.DAG.getNode(ISD::FGETSIGN, Op.getValueType(), - Op.getOperand(0)); + SDValue Sign = TLO.DAG.getNode(ISD::FGETSIGN, dl, Ty, Op.getOperand(0)); + unsigned OpVTSizeInBits = Op.getValueType().getSizeInBits(); + if (!OpVTLegal && OpVTSizeInBits > 32) + Sign = TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, Op.getValueType(), Sign); unsigned ShVal = Op.getValueType().getSizeInBits()-1; - SDValue ShAmt = TLO.DAG.getConstant(ShVal, getShiftAmountTy()); - return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, Op.getValueType(), + SDValue ShAmt = TLO.DAG.getConstant(ShVal, Op.getValueType()); + return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl, + Op.getValueType(), Sign, ShAmt)); } } -#endif break; case ISD::ADD: case ISD::MUL: @@ -1842,7 +1900,6 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, bool foldBooleans, DAGCombinerInfo &DCI, DebugLoc dl) const { SelectionDAG &DAG = DCI.DAG; - LLVMContext &Context = *DAG.getContext(); // These setcc operations always fold. switch (Cond) { @@ -1853,12 +1910,11 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, case ISD::SETTRUE2: return DAG.getConstant(1, VT); } - if (isa<ConstantSDNode>(N0.getNode())) { - // Ensure that the constant occurs on the RHS, and fold constant - // comparisons. + // Ensure that the constant occurs on the RHS, and fold constant + // comparisons. + if (isa<ConstantSDNode>(N0.getNode())) return DAG.getSetCC(dl, VT, N1, N0, ISD::getSetCCSwappedOperands(Cond)); - } - + if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) { const APInt &C1 = N1C->getAPIntValue(); @@ -1911,6 +1967,42 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, // TODO: (ctpop x) == 1 -> x && (x & x-1) == 0 iff ctpop is illegal. 
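The ctpop TODO above leans on the classic single-bit test: for unsigned x, popcount(x) == 1 exactly when x != 0 and (x & (x-1)) == 0. A quick standalone check of that identity (illustration only, not part of the commit):

#include <cassert>

// x & (x - 1) clears the lowest set bit, so the result is zero exactly
// when x had at most one bit set; the x != 0 test excludes zero itself.
static bool hasSingleBit(unsigned x) { return x != 0 && (x & (x - 1)) == 0; }

int main() {
  assert(hasSingleBit(8u));    // 0b1000: exactly one bit set
  assert(!hasSingleBit(6u));   // 0b0110: two bits set
  assert(!hasSingleBit(0u));   // zero: no bits set
  return 0;
}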
 }
+  // (zext x) == C --> x == (trunc C)
+  if (DCI.isBeforeLegalize() && N0->hasOneUse() &&
+      (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
+    unsigned MinBits = N0.getValueSizeInBits();
+    SDValue PreZExt;
+    if (N0->getOpcode() == ISD::ZERO_EXTEND) {
+      // ZExt
+      MinBits = N0->getOperand(0).getValueSizeInBits();
+      PreZExt = N0->getOperand(0);
+    } else if (N0->getOpcode() == ISD::AND) {
+      // DAGCombine turns costly ZExts into ANDs
+      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0->getOperand(1)))
+        if ((C->getAPIntValue()+1).isPowerOf2()) {
+          MinBits = C->getAPIntValue().countTrailingOnes();
+          PreZExt = N0->getOperand(0);
+        }
+    } else if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(N0)) {
+      // ZEXTLOAD
+      if (LN0->getExtensionType() == ISD::ZEXTLOAD) {
+        MinBits = LN0->getMemoryVT().getSizeInBits();
+        PreZExt = N0;
+      }
+    }
+
+    // Make sure we're not losing bits from the constant.
+    if (MinBits < C1.getBitWidth() && MinBits > C1.getActiveBits()) {
+      EVT MinVT = EVT::getIntegerVT(*DAG.getContext(), MinBits);
+      if (isTypeDesirableForOp(ISD::SETCC, MinVT)) {
+        // Will get folded away.
+        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MinVT, PreZExt);
+        SDValue C = DAG.getConstant(C1.trunc(MinBits), MinVT);
+        return DAG.getSetCC(dl, VT, Trunc, C, Cond);
+      }
+    }
+  }
+
 // If the LHS is '(and load, const)', the RHS is 0,
 // the test is for equality or unsigned, and all 1 bits of the const are
 // in the same partial word, see if we can shorten the load.
@@ -1949,7 +2041,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
         }
       }
       if (bestWidth) {
-        EVT newVT = EVT::getIntegerVT(Context, bestWidth);
+        EVT newVT = EVT::getIntegerVT(*DAG.getContext(), bestWidth);
         if (newVT.isRound()) {
           EVT PtrType = Lod->getOperand(1).getValueType();
           SDValue Ptr = Lod->getBasePtr();
@@ -2578,9 +2670,13 @@ const char *TargetLowering::LowerXConstraint(EVT ConstraintVT) const{
 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
 /// vector.  If it is invalid, don't add anything to Ops.
 void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
-                                                  char ConstraintLetter,
+                                                  std::string &Constraint,
                                                   std::vector<SDValue> &Ops,
                                                   SelectionDAG &DAG) const {
+
+  if (Constraint.length() > 1) return;
+
+  char ConstraintLetter = Constraint[0];
   switch (ConstraintLetter) {
   default: break;
   case 'X':     // Allows any operand; labels (basic block) use this.
@@ -2769,6 +2865,12 @@ TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints(
         report_fatal_error("Indirect operand for inline asm not a pointer!");
       OpTy = PtrTy->getElementType();
     }
+
+    // Look for vector wrapped in a struct. e.g. { <16 x i8> }.
+    if (const StructType *STy = dyn_cast<StructType>(OpTy))
+      if (STy->getNumElements() == 1)
+        OpTy = STy->getElementType(0);
+
     // If OpTy is not a single value, it may be a struct/union that we
     // can tile with integers.
if (!OpTy->isSingleValueType() && OpTy->isSized()) { @@ -3013,7 +3115,7 @@ static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo, assert(OpInfo.Codes[i].size() == 1 && "Unhandled multi-letter 'other' constraint"); std::vector<SDValue> ResultOps; - TLI.LowerAsmOperandForConstraint(Op, OpInfo.Codes[i][0], + TLI.LowerAsmOperandForConstraint(Op, OpInfo.Codes[i], ResultOps, *DAG); if (!ResultOps.empty()) { BestType = CType; diff --git a/lib/CodeGen/ShrinkWrapping.cpp b/lib/CodeGen/ShrinkWrapping.cpp index 7b5bca4..160f38f 100644 --- a/lib/CodeGen/ShrinkWrapping.cpp +++ b/lib/CodeGen/ShrinkWrapping.cpp @@ -277,7 +277,7 @@ void PEI::calculateAnticAvail(MachineFunction &Fn) { // Initialize data flow sets. clearAnticAvailSets(); - // Calulate Antic{In,Out} and Avail{In,Out} iteratively on the MCFG. + // Calculate Antic{In,Out} and Avail{In,Out} iteratively on the MCFG. bool changed = true; unsigned iterations = 0; while (changed) { diff --git a/lib/CodeGen/SimpleRegisterCoalescing.cpp b/lib/CodeGen/SimpleRegisterCoalescing.cpp index c621726..221bec5 100644 --- a/lib/CodeGen/SimpleRegisterCoalescing.cpp +++ b/lib/CodeGen/SimpleRegisterCoalescing.cpp @@ -47,7 +47,6 @@ STATISTIC(numExtends , "Number of copies extended"); STATISTIC(NumReMats , "Number of instructions re-materialized"); STATISTIC(numPeep , "Number of identity moves eliminated after coalescing"); STATISTIC(numAborts , "Number of times interval joining aborted"); -STATISTIC(numDeadValNo, "Number of valno def marked dead"); char SimpleRegisterCoalescing::ID = 0; static cl::opt<bool> @@ -61,9 +60,9 @@ DisableCrossClassJoin("disable-cross-class-join", cl::init(false), cl::Hidden); static cl::opt<bool> -DisablePhysicalJoin("disable-physical-join", - cl::desc("Avoid coalescing physical register copies"), - cl::init(false), cl::Hidden); +EnablePhysicalJoin("join-physregs", + cl::desc("Join physical register copies"), + cl::init(false), cl::Hidden); static cl::opt<bool> VerifyCoalescing("verify-coalescing", @@ -208,15 +207,14 @@ bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(const CoalescerPair &CP, if (ValLR+1 != BLR) return false; // If a live interval is a physical register, conservatively check if any - // of its sub-registers is overlapping the live interval of the virtual - // register. If so, do not coalesce. - if (TargetRegisterInfo::isPhysicalRegister(IntB.reg) && - *tri_->getSubRegisters(IntB.reg)) { - for (const unsigned* SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR) - if (li_->hasInterval(*SR) && IntA.overlaps(li_->getInterval(*SR))) { + // of its aliases is overlapping the live interval of the virtual register. + // If so, do not coalesce. + if (TargetRegisterInfo::isPhysicalRegister(IntB.reg)) { + for (const unsigned *AS = tri_->getAliasSet(IntB.reg); *AS; ++AS) + if (li_->hasInterval(*AS) && IntA.overlaps(li_->getInterval(*AS))) { DEBUG({ - dbgs() << "\t\tInterfere with sub-register "; - li_->getInterval(*SR).print(dbgs(), tri_); + dbgs() << "\t\tInterfere with alias "; + li_->getInterval(*AS).print(dbgs(), tri_); }); return false; } @@ -254,7 +252,12 @@ bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(const CoalescerPair &CP, // Okay, merge "B1" into the same value number as "B0". if (BValNo != ValLR->valno) { + // If B1 is killed by a PHI, then the merged live range must also be killed + // by the same PHI, as B0 and B1 can not overlap. 
+ bool HasPHIKill = BValNo->hasPHIKill(); IntB.MergeValueNumberInto(BValNo, ValLR->valno); + if (HasPHIKill) + ValLR->valno->setHasPHIKill(true); } DEBUG({ dbgs() << " result = "; @@ -273,7 +276,7 @@ bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(const CoalescerPair &CP, // merge, find the last use and trim the live range. That will also add the // isKill marker. if (ALR->end == CopyIdx) - TrimLiveIntervalToLastUse(CopyUseIdx, CopyMI->getParent(), IntA, ALR); + li_->shrinkToUses(&IntA); ++numExtends; return true; @@ -427,6 +430,10 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(const CoalescerPair &CP, MachineInstr *NewMI = tii_->commuteInstruction(DefMI); if (!NewMI) return false; + if (TargetRegisterInfo::isVirtualRegister(IntA.reg) && + TargetRegisterInfo::isVirtualRegister(IntB.reg) && + !mri_->constrainRegClass(IntB.reg, mri_->getRegClass(IntA.reg))) + return false; if (NewMI != DefMI) { li_->ReplaceMachineInstrInMaps(DefMI, NewMI); MBB->insert(DefMI, NewMI); @@ -504,98 +511,6 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(const CoalescerPair &CP, return true; } -/// isSameOrFallThroughBB - Return true if MBB == SuccMBB or MBB simply -/// fallthoughs to SuccMBB. -static bool isSameOrFallThroughBB(MachineBasicBlock *MBB, - MachineBasicBlock *SuccMBB, - const TargetInstrInfo *tii_) { - if (MBB == SuccMBB) - return true; - MachineBasicBlock *TBB = 0, *FBB = 0; - SmallVector<MachineOperand, 4> Cond; - return !tii_->AnalyzeBranch(*MBB, TBB, FBB, Cond) && !TBB && !FBB && - MBB->isSuccessor(SuccMBB); -} - -/// removeRange - Wrapper for LiveInterval::removeRange. This removes a range -/// from a physical register live interval as well as from the live intervals -/// of its sub-registers. -static void removeRange(LiveInterval &li, - SlotIndex Start, SlotIndex End, - LiveIntervals *li_, const TargetRegisterInfo *tri_) { - li.removeRange(Start, End, true); - if (TargetRegisterInfo::isPhysicalRegister(li.reg)) { - for (const unsigned* SR = tri_->getSubRegisters(li.reg); *SR; ++SR) { - if (!li_->hasInterval(*SR)) - continue; - LiveInterval &sli = li_->getInterval(*SR); - SlotIndex RemoveStart = Start; - SlotIndex RemoveEnd = Start; - - while (RemoveEnd != End) { - LiveInterval::iterator LR = sli.FindLiveRangeContaining(RemoveStart); - if (LR == sli.end()) - break; - RemoveEnd = (LR->end < End) ? LR->end : End; - sli.removeRange(RemoveStart, RemoveEnd, true); - RemoveStart = RemoveEnd; - } - } - } -} - -/// TrimLiveIntervalToLastUse - If there is a last use in the same basic block -/// as the copy instruction, trim the live interval to the last use and return -/// true. -bool -SimpleRegisterCoalescing::TrimLiveIntervalToLastUse(SlotIndex CopyIdx, - MachineBasicBlock *CopyMBB, - LiveInterval &li, - const LiveRange *LR) { - SlotIndex MBBStart = li_->getMBBStartIdx(CopyMBB); - SlotIndex LastUseIdx; - MachineOperand *LastUse = - lastRegisterUse(LR->start, CopyIdx.getPrevSlot(), li.reg, LastUseIdx); - if (LastUse) { - MachineInstr *LastUseMI = LastUse->getParent(); - if (!isSameOrFallThroughBB(LastUseMI->getParent(), CopyMBB, tii_)) { - // r1024 = op - // ... - // BB1: - // = r1024 - // - // BB2: - // r1025<dead> = r1024<kill> - if (MBBStart < LR->end) - removeRange(li, MBBStart, LR->end, li_, tri_); - return true; - } - - // There are uses before the copy, just shorten the live range to the end - // of last use. 
- LastUse->setIsKill(); - removeRange(li, LastUseIdx.getDefIndex(), LR->end, li_, tri_); - if (LastUseMI->isCopy()) { - MachineOperand &DefMO = LastUseMI->getOperand(0); - if (DefMO.getReg() == li.reg && !DefMO.getSubReg()) - DefMO.setIsDead(); - } - return true; - } - - // Is it livein? - if (LR->start <= MBBStart && LR->end > MBBStart) { - if (LR->start == li_->getZeroIndex()) { - assert(TargetRegisterInfo::isPhysicalRegister(li.reg)); - // Live-in to the function but dead. Remove it from entry live-in set. - mf_->begin()->removeLiveIn(li.reg); - } - // FIXME: Shorten intervals in BBs that reaches this BB. - } - - return false; -} - /// ReMaterializeTrivialDef - If the source of a copy is defined by a trivial /// computation, replace the copy by rematerialize the definition. bool SimpleRegisterCoalescing::ReMaterializeTrivialDef(LiveInterval &SrcInt, @@ -782,26 +697,6 @@ static bool removeIntervalIfEmpty(LiveInterval &li, LiveIntervals *li_, return false; } -/// ShortenDeadCopyLiveRange - Shorten a live range defined by a dead copy. -/// Return true if live interval is removed. -bool SimpleRegisterCoalescing::ShortenDeadCopyLiveRange(LiveInterval &li, - MachineInstr *CopyMI) { - SlotIndex CopyIdx = li_->getInstructionIndex(CopyMI); - LiveInterval::iterator MLR = - li.FindLiveRangeContaining(CopyIdx.getDefIndex()); - if (MLR == li.end()) - return false; // Already removed by ShortenDeadCopySrcLiveRange. - SlotIndex RemoveStart = MLR->start; - SlotIndex RemoveEnd = MLR->end; - SlotIndex DefIdx = CopyIdx.getDefIndex(); - // Remove the liverange that's defined by this. - if (RemoveStart == DefIdx && RemoveEnd == DefIdx.getStoreIndex()) { - removeRange(li, RemoveStart, RemoveEnd, li_, tri_); - return removeIntervalIfEmpty(li, li_, tri_); - } - return false; -} - /// RemoveDeadDef - If a def of a live interval is now determined dead, remove /// the val# it defines. If the live interval becomes empty, remove it as well. bool SimpleRegisterCoalescing::RemoveDeadDef(LiveInterval &li, @@ -835,84 +730,6 @@ void SimpleRegisterCoalescing::RemoveCopyFlag(unsigned DstReg, } } -/// PropagateDeadness - Propagate the dead marker to the instruction which -/// defines the val#. -static void PropagateDeadness(LiveInterval &li, MachineInstr *CopyMI, - SlotIndex &LRStart, LiveIntervals *li_, - const TargetRegisterInfo* tri_) { - MachineInstr *DefMI = - li_->getInstructionFromIndex(LRStart.getDefIndex()); - if (DefMI && DefMI != CopyMI) { - int DeadIdx = DefMI->findRegisterDefOperandIdx(li.reg); - if (DeadIdx != -1) - DefMI->getOperand(DeadIdx).setIsDead(); - else - DefMI->addOperand(MachineOperand::CreateReg(li.reg, - /*def*/true, /*implicit*/true, /*kill*/false, /*dead*/true)); - LRStart = LRStart.getNextSlot(); - } -} - -/// ShortenDeadCopySrcLiveRange - Shorten a live range as it's artificially -/// extended by a dead copy. Mark the last use (if any) of the val# as kill as -/// ends the live range there. If there isn't another use, then this live range -/// is dead. Return true if live interval is removed. -bool -SimpleRegisterCoalescing::ShortenDeadCopySrcLiveRange(LiveInterval &li, - MachineInstr *CopyMI) { - SlotIndex CopyIdx = li_->getInstructionIndex(CopyMI); - if (CopyIdx == SlotIndex()) { - // FIXME: special case: function live in. It can be a general case if the - // first instruction index starts at > 0 value. - assert(TargetRegisterInfo::isPhysicalRegister(li.reg)); - // Live-in to the function but dead. Remove it from entry live-in set. 
- if (mf_->begin()->isLiveIn(li.reg)) - mf_->begin()->removeLiveIn(li.reg); - if (const LiveRange *LR = li.getLiveRangeContaining(CopyIdx)) - removeRange(li, LR->start, LR->end, li_, tri_); - return removeIntervalIfEmpty(li, li_, tri_); - } - - LiveInterval::iterator LR = - li.FindLiveRangeContaining(CopyIdx.getPrevIndex().getStoreIndex()); - if (LR == li.end()) - // Livein but defined by a phi. - return false; - - SlotIndex RemoveStart = LR->start; - SlotIndex RemoveEnd = CopyIdx.getStoreIndex(); - if (LR->end > RemoveEnd) - // More uses past this copy? Nothing to do. - return false; - - // If there is a last use in the same bb, we can't remove the live range. - // Shorten the live interval and return. - MachineBasicBlock *CopyMBB = CopyMI->getParent(); - if (TrimLiveIntervalToLastUse(CopyIdx, CopyMBB, li, LR)) - return false; - - // There are other kills of the val#. Nothing to do. - if (!li.isOnlyLROfValNo(LR)) - return false; - - MachineBasicBlock *StartMBB = li_->getMBBFromIndex(RemoveStart); - if (!isSameOrFallThroughBB(StartMBB, CopyMBB, tii_)) - // If the live range starts in another mbb and the copy mbb is not a fall - // through mbb, then we can only cut the range from the beginning of the - // copy mbb. - RemoveStart = li_->getMBBStartIdx(CopyMBB).getNextIndex().getBaseIndex(); - - if (LR->valno->def == RemoveStart) { - // If the def MI defines the val# and this copy is the only kill of the - // val#, then propagate the dead marker. - PropagateDeadness(li, CopyMI, RemoveStart, li_, tri_); - ++numDeadValNo; - } - - removeRange(li, RemoveStart, RemoveEnd, li_, tri_); - return removeIntervalIfEmpty(li, li_, tri_); -} - /// shouldJoinPhys - Return true if a copy involving a physreg should be joined. /// We need to be careful about coalescing a source physical register with a /// virtual register. Once the coalescing is done, it cannot be broken and these @@ -928,7 +745,7 @@ bool SimpleRegisterCoalescing::shouldJoinPhys(CoalescerPair &CP) { if (!Allocatable && CP.isFlipped() && JoinVInt.containsOneValue()) return true; - if (DisablePhysicalJoin) { + if (!EnablePhysicalJoin) { DEBUG(dbgs() << "\tPhysreg joins disabled.\n"); return false; } @@ -955,7 +772,7 @@ bool SimpleRegisterCoalescing::shouldJoinPhys(CoalescerPair &CP) { // CodeGen/X86/phys_subreg_coalesce-3.ll needs it. if (!CP.isPartial()) { const TargetRegisterClass *RC = mri_->getRegClass(CP.getSrcReg()); - unsigned Threshold = allocatableRCRegs_[RC].count() * 2; + unsigned Threshold = RegClassInfo.getNumAllocatableRegs(RC) * 2; unsigned Length = li_->getApproximateInstructionCount(JoinVInt); if (Length > Threshold) { ++numAborts; @@ -974,7 +791,7 @@ SimpleRegisterCoalescing::isWinToJoinCrossClass(unsigned SrcReg, const TargetRegisterClass *SrcRC, const TargetRegisterClass *DstRC, const TargetRegisterClass *NewRC) { - unsigned NewRCCount = allocatableRCRegs_[NewRC].count(); + unsigned NewRCCount = RegClassInfo.getNumAllocatableRegs(NewRC); // This heuristics is good enough in practice, but it's obviously not *right*. // 4 is a magic number that works well enough for x86, ARM, etc. It filter // out all but the most restrictive register classes. 
@@ -988,8 +805,14 @@ SimpleRegisterCoalescing::isWinToJoinCrossClass(unsigned SrcReg, LiveInterval &DstInt = li_->getInterval(DstReg); unsigned SrcSize = li_->getApproximateInstructionCount(SrcInt); unsigned DstSize = li_->getApproximateInstructionCount(DstInt); - if (SrcSize <= NewRCCount && DstSize <= NewRCCount) + + // Coalesce aggressively if the intervals are small compared to the number of + // registers in the new class. The number 4 is fairly arbitrary, chosen to be + // less aggressive than the 8 used for the whole function size. + const unsigned ThresSize = 4 * NewRCCount; + if (SrcSize <= ThresSize && DstSize <= ThresSize) return true; + // Estimate *register use density*. If it doubles or more, abort. unsigned SrcUses = std::distance(mri_->use_nodbg_begin(SrcReg), mri_->use_nodbg_end()); @@ -997,13 +820,13 @@ SimpleRegisterCoalescing::isWinToJoinCrossClass(unsigned SrcReg, mri_->use_nodbg_end()); unsigned NewUses = SrcUses + DstUses; unsigned NewSize = SrcSize + DstSize; - if (SrcRC != NewRC && SrcSize > NewRCCount) { - unsigned SrcRCCount = allocatableRCRegs_[SrcRC].count(); + if (SrcRC != NewRC && SrcSize > ThresSize) { + unsigned SrcRCCount = RegClassInfo.getNumAllocatableRegs(SrcRC); if (NewUses*SrcSize*SrcRCCount > 2*SrcUses*NewSize*NewRCCount) return false; } - if (DstRC != NewRC && DstSize > NewRCCount) { - unsigned DstRCCount = allocatableRCRegs_[DstRC].count(); + if (DstRC != NewRC && DstSize > ThresSize) { + unsigned DstRCCount = RegClassInfo.getNumAllocatableRegs(DstRC); if (NewUses*DstSize*DstRCCount > 2*DstUses*NewSize*NewRCCount) return false; } @@ -1033,6 +856,7 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) { // If they are already joined we continue. if (CP.getSrcReg() == CP.getDstReg()) { + markAsJoined(CopyMI); DEBUG(dbgs() << "\tCopy already coalesced.\n"); return false; // Not coalescable. } @@ -1552,81 +1376,6 @@ void SimpleRegisterCoalescing::joinIntervals() { } } -/// Return true if the two specified registers belong to different register -/// classes. The registers may be either phys or virt regs. -bool -SimpleRegisterCoalescing::differingRegisterClasses(unsigned RegA, - unsigned RegB) const { - // Get the register classes for the first reg. - if (TargetRegisterInfo::isPhysicalRegister(RegA)) { - assert(TargetRegisterInfo::isVirtualRegister(RegB) && - "Shouldn't consider two physregs!"); - return !mri_->getRegClass(RegB)->contains(RegA); - } - - // Compare against the regclass for the second reg. - const TargetRegisterClass *RegClassA = mri_->getRegClass(RegA); - if (TargetRegisterInfo::isVirtualRegister(RegB)) { - const TargetRegisterClass *RegClassB = mri_->getRegClass(RegB); - return RegClassA != RegClassB; - } - return !RegClassA->contains(RegB); -} - -/// lastRegisterUse - Returns the last (non-debug) use of the specific register -/// between cycles Start and End or NULL if there are no uses. 
-MachineOperand * -SimpleRegisterCoalescing::lastRegisterUse(SlotIndex Start, - SlotIndex End, - unsigned Reg, - SlotIndex &UseIdx) const{ - UseIdx = SlotIndex(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { - MachineOperand *LastUse = NULL; - for (MachineRegisterInfo::use_nodbg_iterator I = mri_->use_nodbg_begin(Reg), - E = mri_->use_nodbg_end(); I != E; ++I) { - MachineOperand &Use = I.getOperand(); - MachineInstr *UseMI = Use.getParent(); - if (UseMI->isIdentityCopy()) - continue; - SlotIndex Idx = li_->getInstructionIndex(UseMI); - if (Idx >= Start && Idx < End && (!UseIdx.isValid() || Idx >= UseIdx)) { - LastUse = &Use; - UseIdx = Idx.getUseIndex(); - } - } - return LastUse; - } - - SlotIndex s = Start; - SlotIndex e = End.getPrevSlot().getBaseIndex(); - while (e >= s) { - // Skip deleted instructions - MachineInstr *MI = li_->getInstructionFromIndex(e); - while (e != SlotIndex() && e.getPrevIndex() >= s && !MI) { - e = e.getPrevIndex(); - MI = li_->getInstructionFromIndex(e); - } - if (e < s || MI == NULL) - return NULL; - - // Ignore identity copies. - if (!MI->isIdentityCopy()) - for (unsigned i = 0, NumOps = MI->getNumOperands(); i != NumOps; ++i) { - MachineOperand &Use = MI->getOperand(i); - if (Use.isReg() && Use.isUse() && Use.getReg() && - tri_->regsOverlap(Use.getReg(), Reg)) { - UseIdx = e.getUseIndex(); - return &Use; - } - } - - e = e.getPrevIndex(); - } - - return NULL; -} - void SimpleRegisterCoalescing::releaseMemory() { JoinedCopies.clear(); ReMatCopies.clear(); @@ -1651,10 +1400,7 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) { if (VerifyCoalescing) mf_->verify(this, "Before register coalescing"); - for (TargetRegisterInfo::regclass_iterator I = tri_->regclass_begin(), - E = tri_->regclass_end(); I != E; ++I) - allocatableRCRegs_.insert(std::make_pair(*I, - tri_->getAllocatableSet(fn, *I))); + RegClassInfo.runOnMachineFunction(fn); // Join (coalesce) intervals if requested. if (EnableJoining) { @@ -1691,13 +1437,11 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) { // or else the scavenger may complain. LowerSubregs will // delete them later. DoDelete = false; - + if (MI->allDefsAreDead()) { - if (li_->hasInterval(SrcReg)) { - LiveInterval &li = li_->getInterval(SrcReg); - if (!ShortenDeadCopySrcLiveRange(li, MI)) - ShortenDeadCopyLiveRange(li, MI); - } + if (TargetRegisterInfo::isVirtualRegister(SrcReg) && + li_->hasInterval(SrcReg)) + li_->shrinkToUses(&li_->getInterval(SrcReg)); DoDelete = true; } if (!DoDelete) { @@ -1749,24 +1493,6 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) { DeadDefs.clear(); } - // If the move will be an identity move delete it - if (MI->isIdentityCopy()) { - unsigned SrcReg = MI->getOperand(1).getReg(); - if (li_->hasInterval(SrcReg)) { - LiveInterval &RegInt = li_->getInterval(SrcReg); - // If def of this move instruction is dead, remove its live range - // from the destination register's live interval. - if (MI->allDefsAreDead()) { - if (!ShortenDeadCopySrcLiveRange(RegInt, MI)) - ShortenDeadCopyLiveRange(RegInt, MI); - } - } - li_->RemoveMachineInstrFromMaps(MI); - mii = mbbi->erase(mii); - ++numPeep; - continue; - } - ++mii; // Check for now unnecessary kill flags. 
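The isWinToJoinCrossClass() change above makes small intervals always cheap to coalesce (anything up to ThresSize = 4 * NewRCCount instructions) and only applies the use-density test to larger ones. A minimal standalone sketch of that check, assuming the interval sizes and use counts have already been pulled out of LiveIntervals and MachineRegisterInfo; the free function and its plain integer parameters are illustrative, not the pass's API, and the SrcRC != NewRC class-identity checks are elided:

#include <cstdio>

// Sketch of the cross-class join heuristic: join when both intervals are
// small relative to the new register class, otherwise reject joins that
// would at least double the estimated register-use density.
static bool isWinToJoinCrossClass(unsigned SrcSize, unsigned DstSize,
                                  unsigned SrcUses, unsigned DstUses,
                                  unsigned SrcRCCount, unsigned DstRCCount,
                                  unsigned NewRCCount) {
  // Small intervals are always worth joining, even into a small class.
  const unsigned ThresSize = 4 * NewRCCount;
  if (SrcSize <= ThresSize && DstSize <= ThresSize)
    return true;

  // Density estimate: uses per instruction, weighted by class size.
  const unsigned NewUses = SrcUses + DstUses;
  const unsigned NewSize = SrcSize + DstSize;
  if (SrcSize > ThresSize &&
      NewUses * SrcSize * SrcRCCount > 2 * SrcUses * NewSize * NewRCCount)
    return false;
  if (DstSize > ThresSize &&
      NewUses * DstSize * DstRCCount > 2 * DstUses * NewSize * NewRCCount)
    return false;
  return true;
}

int main() {
  // A 200-instruction interval with only 4 uses moving from a 16-register
  // class into an 8-register class: the density check rejects the join.
  std::printf("%d\n", isWinToJoinCrossClass(200, 10, 4, 8, 16, 16, 8));
  return 0;
}

The point of the two-tier test is that ThresSize scales with the target class, so a class with many allocatable registers tolerates longer intervals before the more expensive density comparison kicks in.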
diff --git a/lib/CodeGen/SimpleRegisterCoalescing.h b/lib/CodeGen/SimpleRegisterCoalescing.h index 65cf542..92f6c64 100644 --- a/lib/CodeGen/SimpleRegisterCoalescing.h +++ b/lib/CodeGen/SimpleRegisterCoalescing.h @@ -17,7 +17,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/RegisterCoalescer.h" -#include "llvm/ADT/BitVector.h" +#include "RegisterClassInfo.h" namespace llvm { class SimpleRegisterCoalescing; @@ -47,8 +47,7 @@ namespace llvm { LiveDebugVariables *ldv_; const MachineLoopInfo* loopInfo; AliasAnalysis *AA; - - DenseMap<const TargetRegisterClass*, BitVector> allocatableRCRegs_; + RegisterClassInfo RegClassInfo; /// JoinedCopies - Keep track of copies eliminated due to coalescing. /// @@ -103,10 +102,6 @@ namespace llvm { /// use this information below to update aliases. bool JoinIntervals(CoalescerPair &CP); - /// Return true if the two specified registers belong to different register - /// classes. The registers may be either phys or virt regs. - bool differingRegisterClasses(unsigned RegA, unsigned RegB) const; - /// AdjustCopiesBackFrom - We found a non-trivially-coalescable copy. If /// the source value number is defined by a copy from the destination reg /// see if we can merge these two destination reg valno# into a single @@ -124,13 +119,6 @@ namespace llvm { /// can transform the copy into a noop by commuting the definition. bool RemoveCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI); - /// TrimLiveIntervalToLastUse - If there is a last use in the same basic - /// block as the copy instruction, trim the ive interval to the last use - /// and return true. - bool TrimLiveIntervalToLastUse(SlotIndex CopyIdx, - MachineBasicBlock *CopyMBB, - LiveInterval &li, const LiveRange *LR); - /// ReMaterializeTrivialDef - If the source of a copy is defined by a trivial /// computation, replace the copy by rematerialize the definition. /// If PreserveSrcInt is true, make sure SrcInt is valid after the call. @@ -156,16 +144,6 @@ namespace llvm { /// subregister. void UpdateRegDefsUses(const CoalescerPair &CP); - /// ShortenDeadCopyLiveRange - Shorten a live range defined by a dead copy. - /// Return true if live interval is removed. - bool ShortenDeadCopyLiveRange(LiveInterval &li, MachineInstr *CopyMI); - - /// ShortenDeadCopyLiveRange - Shorten a live range as it's artificially - /// extended by a dead copy. Mark the last use (if any) of the val# as kill - /// as ends the live range there. If there isn't another use, then this - /// live range is dead. Return true if live interval is removed. - bool ShortenDeadCopySrcLiveRange(LiveInterval &li, MachineInstr *CopyMI); - /// RemoveDeadDef - If a def of a live interval is now determined dead, /// remove the val# it defines. If the live interval becomes empty, remove /// it as well. @@ -175,11 +153,6 @@ namespace llvm { /// VNInfo copy flag for DstReg and all aliases. void RemoveCopyFlag(unsigned DstReg, const MachineInstr *CopyMI); - /// lastRegisterUse - Returns the last use of the specific register between - /// cycles Start and End or NULL if there are no uses. - MachineOperand *lastRegisterUse(SlotIndex Start, SlotIndex End, - unsigned Reg, SlotIndex &LastUseIdx) const; - /// markAsJoined - Remember that CopyMI has already been joined. 
void markAsJoined(MachineInstr *CopyMI); }; diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp index aaa8568..92970e4 100644 --- a/lib/CodeGen/SjLjEHPrepare.cpp +++ b/lib/CodeGen/SjLjEHPrepare.cpp @@ -443,16 +443,17 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) { BasicBlock::Create(F.getContext(), "eh.sjlj.setjmp.catch", &F); // Insert a load of the callsite in the dispatch block, and a switch on its - // value. By default, we go to a block that just does an unwind (which is the - // correct action for a standard call). - BasicBlock *UnwindBlock = - BasicBlock::Create(F.getContext(), "unwindbb", &F); - Unwinds.push_back(new UnwindInst(F.getContext(), UnwindBlock)); + // value. By default, we issue a trap statement. + BasicBlock *TrapBlock = + BasicBlock::Create(F.getContext(), "trapbb", &F); + CallInst::Create(Intrinsic::getDeclaration(F.getParent(), Intrinsic::trap), + "", TrapBlock); + new UnreachableInst(F.getContext(), TrapBlock); Value *DispatchLoad = new LoadInst(CallSite, "invoke.num", true, DispatchBlock); SwitchInst *DispatchSwitch = - SwitchInst::Create(DispatchLoad, UnwindBlock, Invokes.size(), + SwitchInst::Create(DispatchLoad, TrapBlock, Invokes.size(), DispatchBlock); // Split the entry block to insert the conditional branch for the setjmp. BasicBlock *ContBlock = EntryBB->splitBasicBlock(EntryBB->getTerminator(), @@ -519,7 +520,7 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) { // Add a call to dispatch_setup after the setjmp call. This is expanded to any // target-specific setup that needs to be done. - CallInst::Create(DispatchSetupFn, "", EntryBB->getTerminator()); + CallInst::Create(DispatchSetupFn, DispatchVal, "", EntryBB->getTerminator()); // check the return value of the setjmp. non-zero goes to dispatcher. Value *IsNormal = new ICmpInst(EntryBB->getTerminator(), @@ -561,7 +562,7 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) { // Replace all unwinds with a branch to the unwind handler. // ??? Should this ever happen with sjlj exceptions? for (unsigned i = 0, e = Unwinds.size(); i != e; ++i) { - BranchInst::Create(UnwindBlock, Unwinds[i]); + BranchInst::Create(TrapBlock, Unwinds[i]); Unwinds[i]->eraseFromParent(); } diff --git a/lib/CodeGen/SpillPlacement.cpp b/lib/CodeGen/SpillPlacement.cpp index cab18a1..6949618 100644 --- a/lib/CodeGen/SpillPlacement.cpp +++ b/lib/CodeGen/SpillPlacement.cpp @@ -135,13 +135,10 @@ struct SpillPlacement::Node { /// addBias - Bias this node from an ingoing[0] or outgoing[1] link. /// Return the change to the total number of positive biases. - int addBias(float w, bool out) { + void addBias(float w, bool out) { // Normalize w relative to all connected blocks from that direction. w *= Scale[out]; - int Before = Bias > 0; Bias += w; - int After = Bias > 0; - return After - Before; } /// update - Recompute Value from Bias and Links. Return true when node @@ -230,14 +227,14 @@ void SpillPlacement::addConstraints(ArrayRef<BlockConstraint> LiveBlocks) { if (I->Entry != DontCare) { unsigned ib = bundles->getBundle(I->Number, 0); activate(ib); - PositiveNodes += nodes[ib].addBias(Freq * Bias[I->Entry], 1); + nodes[ib].addBias(Freq * Bias[I->Entry], 1); } // Live-out from block? 
if (I->Exit != DontCare) { unsigned ob = bundles->getBundle(I->Number, 1); activate(ob); - PositiveNodes += nodes[ob].addBias(Freq * Bias[I->Exit], 0); + nodes[ob].addBias(Freq * Bias[I->Exit], 0); } } } @@ -254,16 +251,42 @@ void SpillPlacement::addLinks(ArrayRef<unsigned> Links) { continue; activate(ib); activate(ob); + if (nodes[ib].Links.empty() && !nodes[ib].mustSpill()) + Linked.push_back(ib); + if (nodes[ob].Links.empty() && !nodes[ob].mustSpill()) + Linked.push_back(ob); float Freq = getBlockFrequency(Number); nodes[ib].addLink(ob, Freq, 1); nodes[ob].addLink(ib, Freq, 0); } } +bool SpillPlacement::scanActiveBundles() { + Linked.clear(); + RecentPositive.clear(); + for (int n = ActiveNodes->find_first(); n>=0; n = ActiveNodes->find_next(n)) { + nodes[n].update(nodes); + // A node that must spill, or a node without any links is not going to + // change its value ever again, so exclude it from iterations. + if (nodes[n].mustSpill()) + continue; + if (!nodes[n].Links.empty()) + Linked.push_back(n); + if (nodes[n].preferReg()) + RecentPositive.push_back(n); + } + return !RecentPositive.empty(); +} + /// iterate - Repeatedly update the Hopfield nodes until stability or the /// maximum number of iterations is reached. /// @param Linked - Numbers of linked nodes that need updating. -void SpillPlacement::iterate(const SmallVectorImpl<unsigned> &Linked) { +void SpillPlacement::iterate() { + // First update the recently positive nodes. They have likely received new + // negative bias that will turn them off. + while (!RecentPositive.empty()) + nodes[RecentPositive.pop_back_val()].update(nodes); + if (Linked.empty()) return; @@ -279,10 +302,13 @@ void SpillPlacement::iterate(const SmallVectorImpl<unsigned> &Linked) { for (SmallVectorImpl<unsigned>::const_reverse_iterator I = llvm::next(Linked.rbegin()), E = Linked.rend(); I != E; ++I) { unsigned n = *I; - bool C = nodes[n].update(nodes); - Changed |= C; + if (nodes[n].update(nodes)) { + Changed = true; + if (nodes[n].preferReg()) + RecentPositive.push_back(n); + } } - if (!Changed) + if (!Changed || !RecentPositive.empty()) return; // Scan forwards, skipping the first node which was just updated. @@ -290,38 +316,29 @@ void SpillPlacement::iterate(const SmallVectorImpl<unsigned> &Linked) { for (SmallVectorImpl<unsigned>::const_iterator I = llvm::next(Linked.begin()), E = Linked.end(); I != E; ++I) { unsigned n = *I; - bool C = nodes[n].update(nodes); - Changed |= C; + if (nodes[n].update(nodes)) { + Changed = true; + if (nodes[n].preferReg()) + RecentPositive.push_back(n); + } } - if (!Changed) + if (!Changed || !RecentPositive.empty()) return; } } void SpillPlacement::prepare(BitVector &RegBundles) { + Linked.clear(); + RecentPositive.clear(); // Reuse RegBundles as our ActiveNodes vector. ActiveNodes = &RegBundles; ActiveNodes->clear(); ActiveNodes->resize(bundles->getNumBundles()); - PositiveNodes = 0; } bool SpillPlacement::finish() { assert(ActiveNodes && "Call prepare() first"); - // Update all active nodes, and find the ones that are actually linked to - // something so their value may change when iterating. - SmallVector<unsigned, 8> Linked; - for (int n = ActiveNodes->find_first(); n>=0; n = ActiveNodes->find_next(n)) { - nodes[n].update(nodes); - // A node that must spill, or a node without any links is not going to - // change its value ever again, so exclude it from iterations. - if (!nodes[n].Links.empty() && !nodes[n].mustSpill()) - Linked.push_back(n); - } - - // Iterate the network to convergence. 
-  iterate(Linked);
 
   // Write preferences back to ActiveNodes.
   bool Perfect = true;
diff --git a/lib/CodeGen/SpillPlacement.h b/lib/CodeGen/SpillPlacement.h
index 46e64e6..6952ad8 100644
--- a/lib/CodeGen/SpillPlacement.h
+++ b/lib/CodeGen/SpillPlacement.h
@@ -49,8 +49,12 @@ class SpillPlacement : public MachineFunctionPass {
   // caller.
   BitVector *ActiveNodes;
 
-  // The number of active nodes with a positive bias.
-  unsigned PositiveNodes;
+  // Nodes with active links. Populated by scanActiveBundles.
+  SmallVector<unsigned, 8> Linked;
+
+  // Nodes that went positive during the last call to scanActiveBundles or
+  // iterate.
+  SmallVector<unsigned, 8> RecentPositive;
 
   // Block frequencies are computed once. Indexed by block number.
   SmallVector<float, 4> BlockFrequency;
@@ -95,9 +99,20 @@ public:
   /// addLinks - Add transparent blocks with the given numbers.
   void addLinks(ArrayRef<unsigned> Links);
 
-  /// getPositiveNodes - Return the total number of graph nodes with a positive
-  /// bias after adding constraints.
-  unsigned getPositiveNodes() const { return PositiveNodes; }
+  /// scanActiveBundles - Perform an initial scan of all bundles activated by
+  /// addConstraints and addLinks, updating their state. Add all the bundles
+  /// that now prefer a register to RecentPositive.
+  /// Prepare internal data structures for iterate.
+  /// Return true if there are any positive nodes.
+  bool scanActiveBundles();
+
+  /// iterate - Update the network iteratively until convergence, or new bundles
+  /// are found.
+  void iterate();
+
+  /// getRecentPositive - Return an array of bundles that became positive during
+  /// the previous call to scanActiveBundles or iterate.
+  ArrayRef<unsigned> getRecentPositive() { return RecentPositive; }
 
   /// finish - Compute the optimal spill code placement given the
   /// constraints. No MustSpill constraints will be violated, and the smallest
@@ -120,7 +135,6 @@ private:
   virtual void releaseMemory();
 
   void activate(unsigned);
-  void iterate(const SmallVectorImpl<unsigned>&);
 };
 
 } // end namespace llvm
diff --git a/lib/CodeGen/Spiller.cpp b/lib/CodeGen/Spiller.cpp
index b89139f..b6bbcd7 100644
--- a/lib/CodeGen/Spiller.cpp
+++ b/lib/CodeGen/Spiller.cpp
@@ -25,7 +25,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include <set>
 
 using namespace llvm;
 
diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp
index 201e9b1..bf27cc8 100644
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp
@@ -30,6 +30,9 @@ using namespace llvm;
 
 STATISTIC(NumFinished, "Number of splits finished");
 STATISTIC(NumSimple, "Number of splits that were simple");
+STATISTIC(NumCopies, "Number of copies inserted for splitting");
+STATISTIC(NumRemats, "Number of rematerialized defs for splitting");
+STATISTIC(NumRepairs, "Number of invalid live ranges repaired");
 
 //===----------------------------------------------------------------------===//
 //                                 Split Analysis
 //===----------------------------------------------------------------------===//
 
@@ -51,6 +54,7 @@ void SplitAnalysis::clear() {
   UseBlocks.clear();
   ThroughBlocks.clear();
   CurLI = 0;
+  DidRepairRange = false;
 }
 
 SlotIndex SplitAnalysis::computeLastSplitPoint(unsigned Num) {
@@ -119,6 +123,8 @@ void SplitAnalysis::analyzeUses() {
   if (!calcLiveBlockInfo()) {
     // FIXME: calcLiveBlockInfo found inconsistencies in the live range.
     // I am looking at you, SimpleRegisterCoalescing!
+    DidRepairRange = true;
+    ++NumRepairs;
    DEBUG(dbgs() << "*** Fixing inconsistent live interval!
***\n"); const_cast<LiveIntervals&>(LIS) .shrinkToUses(const_cast<LiveInterval*>(CurLI)); @@ -132,12 +138,14 @@ void SplitAnalysis::analyzeUses() { DEBUG(dbgs() << "Analyze counted " << UseSlots.size() << " instrs in " << UseBlocks.size() << " blocks, through " - << ThroughBlocks.size() << " blocks.\n"); + << NumThroughBlocks << " blocks.\n"); } /// calcLiveBlockInfo - Fill the LiveBlocks array with information about blocks /// where CurLI is live. bool SplitAnalysis::calcLiveBlockInfo() { + ThroughBlocks.resize(MF.getNumBlockIDs()); + NumThroughBlocks = NumGapBlocks = 0; if (CurLI->empty()) return true; @@ -156,54 +164,63 @@ bool SplitAnalysis::calcLiveBlockInfo() { SlotIndex Start, Stop; tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(BI.MBB); - // LVI is the first live segment overlapping MBB. - BI.LiveIn = LVI->start <= Start; - if (!BI.LiveIn) - BI.Def = LVI->start; - - // Find the first and last uses in the block. - bool Uses = UseI != UseE && *UseI < Stop; - if (Uses) { + // If the block contains no uses, the range must be live through. At one + // point, SimpleRegisterCoalescing could create dangling ranges that ended + // mid-block. + if (UseI == UseE || *UseI >= Stop) { + ++NumThroughBlocks; + ThroughBlocks.set(BI.MBB->getNumber()); + // The range shouldn't end mid-block if there are no uses. This shouldn't + // happen. + if (LVI->end < Stop) + return false; + } else { + // This block has uses. Find the first and last uses in the block. BI.FirstUse = *UseI; assert(BI.FirstUse >= Start); do ++UseI; while (UseI != UseE && *UseI < Stop); BI.LastUse = UseI[-1]; assert(BI.LastUse < Stop); - } - // Look for gaps in the live range. - bool hasGap = false; - BI.LiveOut = true; - while (LVI->end < Stop) { - SlotIndex LastStop = LVI->end; - if (++LVI == LVE || LVI->start >= Stop) { - BI.Kill = LastStop; - BI.LiveOut = false; - break; - } - if (LastStop < LVI->start) { - hasGap = true; - BI.Kill = LastStop; - BI.Def = LVI->start; + // LVI is the first live segment overlapping MBB. + BI.LiveIn = LVI->start <= Start; + + // Look for gaps in the live range. + BI.LiveOut = true; + while (LVI->end < Stop) { + SlotIndex LastStop = LVI->end; + if (++LVI == LVE || LVI->start >= Stop) { + BI.LiveOut = false; + BI.LastUse = LastStop; + break; + } + if (LastStop < LVI->start) { + // There is a gap in the live range. Create duplicate entries for the + // live-in snippet and the live-out snippet. + ++NumGapBlocks; + + // Push the Live-in part. + BI.LiveThrough = false; + BI.LiveOut = false; + UseBlocks.push_back(BI); + UseBlocks.back().LastUse = LastStop; + + // Set up BI for the live-out part. + BI.LiveIn = false; + BI.LiveOut = true; + BI.FirstUse = LVI->start; + } } - } - // Don't set LiveThrough when the block has a gap. - BI.LiveThrough = !hasGap && BI.LiveIn && BI.LiveOut; - if (Uses) + // Don't set LiveThrough when the block has a gap. + BI.LiveThrough = BI.LiveIn && BI.LiveOut; UseBlocks.push_back(BI); - else - ThroughBlocks.push_back(BI.MBB->getNumber()); - // FIXME: This should never happen. The live range stops or starts without a - // corresponding use. An earlier pass did something wrong. - if (!BI.LiveThrough && !Uses) - return false; - - // LVI is now at LVE or LVI->end >= Stop. - if (LVI == LVE) - break; + // LVI is now at LVE or LVI->end >= Stop. + if (LVI == LVE) + break; + } // Live segment ends exactly at Stop. Move to the next segment. 
if (LVI->end == Stop && ++LVI == LVE) @@ -215,9 +232,34 @@ bool SplitAnalysis::calcLiveBlockInfo() { else MFI = LIS.getMBBFromIndex(LVI->start); } + + assert(getNumLiveBlocks() == countLiveBlocks(CurLI) && "Bad block count"); return true; } +unsigned SplitAnalysis::countLiveBlocks(const LiveInterval *cli) const { + if (cli->empty()) + return 0; + LiveInterval *li = const_cast<LiveInterval*>(cli); + LiveInterval::iterator LVI = li->begin(); + LiveInterval::iterator LVE = li->end(); + unsigned Count = 0; + + // Loop over basic blocks where li is live. + MachineFunction::const_iterator MFI = LIS.getMBBFromIndex(LVI->start); + SlotIndex Stop = LIS.getMBBEndIdx(MFI); + for (;;) { + ++Count; + LVI = li->advanceTo(LVI, Stop); + if (LVI == LVE) + return Count; + do { + ++MFI; + Stop = LIS.getMBBEndIdx(MFI); + } while (Stop <= LVI->start); + } +} + bool SplitAnalysis::isOriginalEndpoint(SlotIndex Idx) const { unsigned OrigReg = VRM.getOriginal(CurLI->reg); const LiveInterval &Orig = LIS.getInterval(OrigReg); @@ -348,8 +390,33 @@ void SplitEditor::extendRange(unsigned RegIdx, SlotIndex Idx) { // Now for the fun part. We know that ParentVNI potentially has multiple defs, // and we may need to create even more phi-defs to preserve VNInfo SSA form. // Perform a search for all predecessor blocks where we know the dominating - // VNInfo. Insert phi-def VNInfos along the path back to IdxMBB. + // VNInfo. + VNInfo *VNI = findReachingDefs(LI, IdxMBB, Idx.getNextSlot()); + // When there were multiple different values, we may need new PHIs. + if (!VNI) + return updateSSA(); + + // Poor man's SSA update for the single-value case. + LiveOutPair LOP(VNI, MDT[LIS.getMBBFromIndex(VNI->def)]); + for (SmallVectorImpl<LiveInBlock>::iterator I = LiveInBlocks.begin(), + E = LiveInBlocks.end(); I != E; ++I) { + MachineBasicBlock *MBB = I->DomNode->getBlock(); + SlotIndex Start = LIS.getMBBStartIdx(MBB); + if (I->Kill.isValid()) + LI->addRange(LiveRange(Start, I->Kill, VNI)); + else { + LiveOutCache[MBB] = LOP; + LI->addRange(LiveRange(Start, LIS.getMBBEndIdx(MBB), VNI)); + } + } +} + +/// findReachingDefs - Search the CFG for known live-out values. +/// Add required live-in blocks to LiveInBlocks. +VNInfo *SplitEditor::findReachingDefs(LiveInterval *LI, + MachineBasicBlock *KillMBB, + SlotIndex Kill) { // Initialize the live-out cache the first time it is needed. if (LiveOutSeen.empty()) { unsigned N = VRM.getMachineFunction().getNumBlockIDs(); @@ -358,16 +425,15 @@ void SplitEditor::extendRange(unsigned RegIdx, SlotIndex Idx) { } // Blocks where LI should be live-in. - SmallVector<MachineDomTreeNode*, 16> LiveIn; - LiveIn.push_back(MDT[IdxMBB]); + SmallVector<MachineBasicBlock*, 16> WorkList(1, KillMBB); // Remember if we have seen more than one value. bool UniqueVNI = true; - VNInfo *IdxVNI = 0; + VNInfo *TheVNI = 0; // Using LiveOutCache as a visited set, perform a BFS for all reaching defs. - for (unsigned i = 0; i != LiveIn.size(); ++i) { - MachineBasicBlock *MBB = LiveIn[i]->getBlock(); + for (unsigned i = 0; i != WorkList.size(); ++i) { + MachineBasicBlock *MBB = WorkList[i]; assert(!MBB->pred_empty() && "Value live-in to entry block?"); for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(), PE = MBB->pred_end(); PI != PE; ++PI) { @@ -377,9 +443,9 @@ void SplitEditor::extendRange(unsigned RegIdx, SlotIndex Idx) { // Is this a known live-out block? 
if (LiveOutSeen.test(Pred->getNumber())) { if (VNInfo *VNI = LOP.first) { - if (IdxVNI && IdxVNI != VNI) + if (TheVNI && TheVNI != VNI) UniqueVNI = false; - IdxVNI = VNI; + TheVNI = VNI; } continue; } @@ -395,64 +461,50 @@ void SplitEditor::extendRange(unsigned RegIdx, SlotIndex Idx) { LOP.first = VNI; if (VNI) { LOP.second = MDT[LIS.getMBBFromIndex(VNI->def)]; - if (IdxVNI && IdxVNI != VNI) + if (TheVNI && TheVNI != VNI) UniqueVNI = false; - IdxVNI = VNI; + TheVNI = VNI; continue; } LOP.second = 0; // No, we need a live-in value for Pred as well - if (Pred != IdxMBB) - LiveIn.push_back(MDT[Pred]); + if (Pred != KillMBB) + WorkList.push_back(Pred); else - UniqueVNI = false; // Loopback to IdxMBB, ask updateSSA() for help. + // Loopback to KillMBB, so value is really live through. + Kill = SlotIndex(); } } - // We may need to add phi-def values to preserve the SSA form. - if (UniqueVNI) { - LiveOutPair LOP(IdxVNI, MDT[LIS.getMBBFromIndex(IdxVNI->def)]); - // Update LiveOutCache, but skip IdxMBB at LiveIn[0]. - for (unsigned i = 1, e = LiveIn.size(); i != e; ++i) - LiveOutCache[LiveIn[i]->getBlock()] = LOP; - } else - IdxVNI = updateSSA(RegIdx, LiveIn, Idx, IdxMBB); - - // Since we went through the trouble of a full BFS visiting all reaching defs, - // the values in LiveIn are now accurate. No more phi-defs are needed - // for these blocks, so we can color the live ranges. - for (unsigned i = 0, e = LiveIn.size(); i != e; ++i) { - MachineBasicBlock *MBB = LiveIn[i]->getBlock(); - SlotIndex Start = LIS.getMBBStartIdx(MBB); - VNInfo *VNI = LiveOutCache[MBB].first; + // Transfer WorkList to LiveInBlocks in reverse order. + // This ordering works best with updateSSA(). + LiveInBlocks.clear(); + LiveInBlocks.reserve(WorkList.size()); + while(!WorkList.empty()) + LiveInBlocks.push_back(MDT[WorkList.pop_back_val()]); - // Anything in LiveIn other than IdxMBB is live-through. - // In IdxMBB, we should stop at Idx unless the same value is live-out. - if (MBB == IdxMBB && IdxVNI != VNI) - LI->addRange(LiveRange(Start, Idx.getNextSlot(), IdxVNI)); - else - LI->addRange(LiveRange(Start, LIS.getMBBEndIdx(MBB), VNI)); - } + // The kill block may not be live-through. + assert(LiveInBlocks.back().DomNode->getBlock() == KillMBB); + LiveInBlocks.back().Kill = Kill; + + return UniqueVNI ? TheVNI : 0; } -VNInfo *SplitEditor::updateSSA(unsigned RegIdx, - SmallVectorImpl<MachineDomTreeNode*> &LiveIn, - SlotIndex Idx, - const MachineBasicBlock *IdxMBB) { +void SplitEditor::updateSSA() { // This is essentially the same iterative algorithm that SSAUpdater uses, // except we already have a dominator tree, so we don't have to recompute it. - LiveInterval *LI = Edit->get(RegIdx); - VNInfo *IdxVNI = 0; unsigned Changes; do { Changes = 0; // Propagate live-out values down the dominator tree, inserting phi-defs - // when necessary. Since LiveIn was created by a BFS, going backwards makes - // it more likely for us to visit immediate dominators before their - // children. - for (unsigned i = LiveIn.size(); i; --i) { - MachineDomTreeNode *Node = LiveIn[i-1]; + // when necessary. + for (SmallVectorImpl<LiveInBlock>::iterator I = LiveInBlocks.begin(), + E = LiveInBlocks.end(); I != E; ++I) { + MachineDomTreeNode *Node = I->DomNode; + // Skip block if the live-in value has already been determined. 
+ if (!Node) + continue; MachineBasicBlock *MBB = Node->getBlock(); MachineDomTreeNode *IDom = Node->getIDom(); LiveOutPair IDomValue; @@ -461,9 +513,9 @@ VNInfo *SplitEditor::updateSSA(unsigned RegIdx, // This is probably an unreachable block that has survived somehow. bool needPHI = !IDom || !LiveOutSeen.test(IDom->getBlock()->getNumber()); - // IDom dominates all of our predecessors, but it may not be the immediate - // dominator. Check if any of them have live-out values that are properly - // dominated by IDom. If so, we need a phi-def here. + // IDom dominates all of our predecessors, but it may not be their + // immediate dominator. Check if any of them have live-out values that are + // properly dominated by IDom. If so, we need a phi-def here. if (!needPHI) { IDomValue = LiveOutCache[IDom->getBlock()]; for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(), @@ -481,40 +533,35 @@ VNInfo *SplitEditor::updateSSA(unsigned RegIdx, } } + // The value may be live-through even if Kill is set, as can happen when + // we are called from extendRange. In that case LiveOutSeen is true, and + // LiveOutCache indicates a foreign or missing value. + LiveOutPair &LOP = LiveOutCache[MBB]; + // Create a phi-def if required. if (needPHI) { ++Changes; SlotIndex Start = LIS.getMBBStartIdx(MBB); + unsigned RegIdx = RegAssign.lookup(Start); + LiveInterval *LI = Edit->get(RegIdx); VNInfo *VNI = LI->getNextValue(Start, 0, LIS.getVNInfoAllocator()); VNI->setIsPHIDef(true); - // We no longer need LI to be live-in. - LiveIn.erase(LiveIn.begin()+(i-1)); - // Blocks in LiveIn are either IdxMBB, or have a value live-through. - if (MBB == IdxMBB) - IdxVNI = VNI; - // Check if we need to update live-out info. - LiveOutPair &LOP = LiveOutCache[MBB]; - if (LOP.second == Node || !LiveOutSeen.test(MBB->getNumber())) { - // We already have a live-out defined in MBB, so this must be IdxMBB. - assert(MBB == IdxMBB && "Adding phi-def to known live-out"); - LI->addRange(LiveRange(Start, Idx.getNextSlot(), VNI)); - } else { - // This phi-def is also live-out, so color the whole block. + I->Value = VNI; + // This block is done, we know the final value. + I->DomNode = 0; + if (I->Kill.isValid()) + LI->addRange(LiveRange(Start, I->Kill, VNI)); + else { LI->addRange(LiveRange(Start, LIS.getMBBEndIdx(MBB), VNI)); LOP = LiveOutPair(VNI, Node); } } else if (IDomValue.first) { - // No phi-def here. Remember incoming value for IdxMBB. - if (MBB == IdxMBB) { - IdxVNI = IDomValue.first; - // IdxMBB need not be live-out. - if (!LiveOutSeen.test(MBB->getNumber())) - continue; - } - assert(LiveOutSeen.test(MBB->getNumber()) && "Expected live-out block"); + // No phi-def here. Remember incoming value. + I->Value = IDomValue.first; + if (I->Kill.isValid()) + continue; // Propagate IDomValue if needed: // MBB is live-out and doesn't define its own value. - LiveOutPair &LOP = LiveOutCache[MBB]; if (LOP.second != Node && LOP.first != IDomValue.first) { ++Changes; LOP = IDomValue; @@ -523,8 +570,20 @@ VNInfo *SplitEditor::updateSSA(unsigned RegIdx, } } while (Changes); - assert(IdxVNI && "Didn't find value for Idx"); - return IdxVNI; + // The values in LiveInBlocks are now accurate. No more phi-defs are needed + // for these blocks, so we can color the live ranges. 
+ for (SmallVectorImpl<LiveInBlock>::iterator I = LiveInBlocks.begin(), + E = LiveInBlocks.end(); I != E; ++I) { + if (!I->DomNode) + continue; + assert(I->Value && "No live-in value found"); + MachineBasicBlock *MBB = I->DomNode->getBlock(); + SlotIndex Start = LIS.getMBBStartIdx(MBB); + unsigned RegIdx = RegAssign.lookup(Start); + LiveInterval *LI = Edit->get(RegIdx); + LI->addRange(LiveRange(Start, I->Kill.isValid() ? + I->Kill : LIS.getMBBEndIdx(MBB), I->Value)); + } } VNInfo *SplitEditor::defFromParent(unsigned RegIdx, @@ -536,15 +595,22 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx, SlotIndex Def; LiveInterval *LI = Edit->get(RegIdx); + // We may be trying to avoid interference that ends at a deleted instruction, + // so always begin RegIdx 0 early and all others late. + bool Late = RegIdx != 0; + // Attempt cheap-as-a-copy rematerialization. LiveRangeEdit::Remat RM(ParentVNI); if (Edit->canRematerializeAt(RM, UseIdx, true, LIS)) { - Def = Edit->rematerializeAt(MBB, I, LI->reg, RM, LIS, TII, TRI); + Def = Edit->rematerializeAt(MBB, I, LI->reg, RM, LIS, TII, TRI, Late); + ++NumRemats; } else { // Can't remat, just insert a copy from parent. CopyMI = BuildMI(MBB, I, DebugLoc(), TII.get(TargetOpcode::COPY), LI->reg) .addReg(Edit->getReg()); - Def = LIS.InsertMachineInstrInMaps(CopyMI).getDefIndex(); + Def = LIS.getSlotIndexes()->insertMachineInstrInMaps(CopyMI, Late) + .getDefIndex(); + ++NumCopies; } // Define the value in Reg. @@ -554,9 +620,7 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx, } /// Create a new virtual register and live interval. -void SplitEditor::openIntv() { - assert(!OpenIdx && "Previous LI not closed before openIntv"); - +unsigned SplitEditor::openIntv() { // Create the complement as index 0. if (Edit->empty()) Edit->create(LIS, VRM); @@ -564,6 +628,13 @@ void SplitEditor::openIntv() { // Create the open interval. OpenIdx = Edit->size(); Edit->create(LIS, VRM); + return OpenIdx; +} + +void SplitEditor::selectIntv(unsigned Idx) { + assert(Idx != 0 && "Cannot select the complement interval"); + assert(Idx < Edit->size() && "Can only select previously opened interval"); + OpenIdx = Idx; } SlotIndex SplitEditor::enterIntvBefore(SlotIndex Idx) { @@ -686,18 +757,11 @@ void SplitEditor::overlapIntv(SlotIndex Start, SlotIndex End) { DEBUG(dump()); } -/// closeIntv - Indicate that we are done editing the currently open -/// LiveInterval, and ranges can be trimmed. -void SplitEditor::closeIntv() { - assert(OpenIdx && "openIntv not called before closeIntv"); - OpenIdx = 0; -} - -/// transferSimpleValues - Transfer all simply defined values to the new live -/// ranges. -/// Values that were rematerialized or that have multiple defs are left alone. -bool SplitEditor::transferSimpleValues() { +/// transferValues - Transfer all possible values to the new live ranges. +/// Values that were rematerialized are left alone, they need extendRange(). +bool SplitEditor::transferValues() { bool Skipped = false; + LiveInBlocks.clear(); RegAssignMap::const_iterator AssignI = RegAssign.begin(); for (LiveInterval::const_iterator ParentI = Edit->getParent().begin(), ParentE = Edit->getParent().end(); ParentI != ParentE; ++ParentI) { @@ -721,16 +785,98 @@ bool SplitEditor::transferSimpleValues() { RegIdx = 0; End = std::min(End, AssignI.start()); } + + // The interval [Start;End) is continuously mapped to RegIdx, ParentVNI. 
DEBUG(dbgs() << " [" << Start << ';' << End << ")=" << RegIdx); + LiveInterval *LI = Edit->get(RegIdx); + + // Check for a simply defined value that can be blitted directly. if (VNInfo *VNI = Values.lookup(std::make_pair(RegIdx, ParentVNI->id))) { DEBUG(dbgs() << ':' << VNI->id); - Edit->get(RegIdx)->addRange(LiveRange(Start, End, VNI)); - } else + LI->addRange(LiveRange(Start, End, VNI)); + Start = End; + continue; + } + + // Skip rematerialized values, we need to use extendRange() and + // extendPHIKillRanges() to completely recompute the live ranges. + if (Edit->didRematerialize(ParentVNI)) { + DEBUG(dbgs() << "(remat)"); Skipped = true; + Start = End; + continue; + } + + // Initialize the live-out cache the first time it is needed. + if (LiveOutSeen.empty()) { + unsigned N = VRM.getMachineFunction().getNumBlockIDs(); + LiveOutSeen.resize(N); + LiveOutCache.resize(N); + } + + // This value has multiple defs in RegIdx, but it wasn't rematerialized, + // so the live range is accurate. Add live-in blocks in [Start;End) to the + // LiveInBlocks. + MachineFunction::iterator MBB = LIS.getMBBFromIndex(Start); + SlotIndex BlockStart, BlockEnd; + tie(BlockStart, BlockEnd) = LIS.getSlotIndexes()->getMBBRange(MBB); + + // The first block may be live-in, or it may have its own def. + if (Start != BlockStart) { + VNInfo *VNI = LI->extendInBlock(BlockStart, + std::min(BlockEnd, End).getPrevSlot()); + assert(VNI && "Missing def for complex mapped value"); + DEBUG(dbgs() << ':' << VNI->id << "*BB#" << MBB->getNumber()); + // MBB has its own def. Is it also live-out? + if (BlockEnd <= End) { + LiveOutSeen.set(MBB->getNumber()); + LiveOutCache[MBB] = LiveOutPair(VNI, MDT[MBB]); + } + // Skip to the next block for live-in. + ++MBB; + BlockStart = BlockEnd; + } + + // Handle the live-in blocks covered by [Start;End). + assert(Start <= BlockStart && "Expected live-in block"); + while (BlockStart < End) { + DEBUG(dbgs() << ">BB#" << MBB->getNumber()); + BlockEnd = LIS.getMBBEndIdx(MBB); + if (BlockStart == ParentVNI->def) { + // This block has the def of a parent PHI, so it isn't live-in. + assert(ParentVNI->isPHIDef() && "Non-phi defined at block start?"); + VNInfo *VNI = LI->extendInBlock(BlockStart, + std::min(BlockEnd, End).getPrevSlot()); + assert(VNI && "Missing def for complex mapped parent PHI"); + if (End >= BlockEnd) { + // Live-out as well. + LiveOutSeen.set(MBB->getNumber()); + LiveOutCache[MBB] = LiveOutPair(VNI, MDT[MBB]); + } + } else { + // This block needs a live-in value. + LiveInBlocks.push_back(MDT[MBB]); + // The last block covered may not be live-out. + if (End < BlockEnd) + LiveInBlocks.back().Kill = End; + else { + // Live-out, but we need updateSSA to tell us the value. 
+ LiveOutSeen.set(MBB->getNumber()); + LiveOutCache[MBB] = LiveOutPair((VNInfo*)0, + (MachineDomTreeNode*)0); + } + } + BlockStart = BlockEnd; + ++MBB; + } Start = End; } while (Start != ParentI->end); DEBUG(dbgs() << '\n'); } + + if (!LiveInBlocks.empty()) + updateSSA(); + return Skipped; } @@ -835,8 +981,7 @@ void SplitEditor::deleteRematVictims() { Edit->eliminateDeadDefs(Dead, LIS, VRM, TII); } -void SplitEditor::finish() { - assert(OpenIdx == 0 && "Previous LI not closed before rewrite"); +void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) { ++NumFinished; // At this point, the live intervals in Edit contain VNInfos corresponding to @@ -866,24 +1011,31 @@ void SplitEditor::finish() { assert((*I)->hasAtLeastOneValue() && "Split interval has no value"); #endif - // Transfer the simply mapped values, check if any are complex. - bool Complex = transferSimpleValues(); - if (Complex) + // Transfer the simply mapped values, check if any are skipped. + bool Skipped = transferValues(); + if (Skipped) extendPHIKillRanges(); else ++NumSimple; // Rewrite virtual registers, possibly extending ranges. - rewriteAssigned(Complex); + rewriteAssigned(Skipped); // Delete defs that were rematted everywhere. - if (Complex) + if (Skipped) deleteRematVictims(); // Get rid of unused values and set phi-kill flags. for (LiveRangeEdit::iterator I = Edit->begin(), E = Edit->end(); I != E; ++I) (*I)->RenumberValues(LIS); + // Provide a reverse mapping from original indices to Edit ranges. + if (LRMap) { + LRMap->clear(); + for (unsigned i = 0, e = Edit->size(); i != e; ++i) + LRMap->push_back(i); + } + // Now check if any registers were separated into multiple components. ConnectedVNInfoEqClasses ConEQ(LIS); for (unsigned i = 0, e = Edit->size(); i != e; ++i) { @@ -895,13 +1047,18 @@ void SplitEditor::finish() { DEBUG(dbgs() << " " << NumComp << " components: " << *li << '\n'); SmallVector<LiveInterval*, 8> dups; dups.push_back(li); - for (unsigned i = 1; i != NumComp; ++i) + for (unsigned j = 1; j != NumComp; ++j) dups.push_back(&Edit->create(LIS, VRM)); ConEQ.Distribute(&dups[0], MRI); + // The new intervals all map back to i. + if (LRMap) + LRMap->resize(Edit->size(), i); } // Calculate spill weight and allocation hints for new intervals. Edit->calculateRegClassAndHint(VRM.getMachineFunction(), LIS, SA.Loops); + + assert(!LRMap || LRMap->size() == Edit->size()); } @@ -925,6 +1082,21 @@ bool SplitAnalysis::getMultiUseBlocks(BlockPtrSet &Blocks) { return !Blocks.empty(); } +void SplitEditor::splitSingleBlock(const SplitAnalysis::BlockInfo &BI) { + openIntv(); + SlotIndex LastSplitPoint = SA.getLastSplitPoint(BI.MBB->getNumber()); + SlotIndex SegStart = enterIntvBefore(std::min(BI.FirstUse, + LastSplitPoint)); + if (!BI.LiveOut || BI.LastUse < LastSplitPoint) { + useIntv(SegStart, leaveIntvAfter(BI.LastUse)); + } else { + // The last use is after the last valid split point. + SlotIndex SegStop = leaveIntvBefore(LastSplitPoint); + useIntv(SegStart, SegStop); + overlapIntv(SegStop, BI.LastUse); + } +} + /// splitSingleBlocks - Split CurLI into a separate live interval inside each /// basic block in Blocks. 
void SplitEditor::splitSingleBlocks(const SplitAnalysis::BlockPtrSet &Blocks) { @@ -932,22 +1104,8 @@ void SplitEditor::splitSingleBlocks(const SplitAnalysis::BlockPtrSet &Blocks) { ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA.getUseBlocks(); for (unsigned i = 0; i != UseBlocks.size(); ++i) { const SplitAnalysis::BlockInfo &BI = UseBlocks[i]; - if (!Blocks.count(BI.MBB)) - continue; - - openIntv(); - SlotIndex LastSplitPoint = SA.getLastSplitPoint(BI.MBB->getNumber()); - SlotIndex SegStart = enterIntvBefore(std::min(BI.FirstUse, - LastSplitPoint)); - if (!BI.LiveOut || BI.LastUse < LastSplitPoint) { - useIntv(SegStart, leaveIntvAfter(BI.LastUse)); - } else { - // The last use is after the last valid split point. - SlotIndex SegStop = leaveIntvBefore(LastSplitPoint); - useIntv(SegStart, SegStop); - overlapIntv(SegStop, BI.LastUse); - } - closeIntv(); + if (Blocks.count(BI.MBB)) + splitSingleBlock(BI); } finish(); } diff --git a/lib/CodeGen/SplitKit.h b/lib/CodeGen/SplitKit.h index 20ac8a1..7174c0b 100644 --- a/lib/CodeGen/SplitKit.h +++ b/lib/CodeGen/SplitKit.h @@ -12,6 +12,9 @@ // //===----------------------------------------------------------------------===// +#ifndef LLVM_CODEGEN_SPLITKIT_H +#define LLVM_CODEGEN_SPLITKIT_H + #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" @@ -60,17 +63,22 @@ public: /// 1. | o---x | Internal to block. Variable is only live in this block. /// 2. |---x | Live-in, kill. /// 3. | o---| Def, live-out. - /// 4. |---x o---| Live-in, kill, def, live-out. + /// 4. |---x o---| Live-in, kill, def, live-out. Counted by NumGapBlocks. /// 5. |---o---o---| Live-through with uses or defs. - /// 6. |-----------| Live-through without uses. Transparent. + /// 6. |-----------| Live-through without uses. Counted by NumThroughBlocks. + /// + /// Two BlockInfo entries are created for template 4. One for the live-in + /// segment, and one for the live-out segment. These entries look as if the + /// block were split in the middle where the live range isn't live. + /// + /// Live-through blocks without any uses don't get BlockInfo entries. They + /// are simply listed in ThroughBlocks instead. /// struct BlockInfo { MachineBasicBlock *MBB; SlotIndex FirstUse; ///< First instr using current reg. SlotIndex LastUse; ///< Last instr using current reg. - SlotIndex Kill; ///< Interval end point inside block. - SlotIndex Def; ///< Interval start point inside block. - bool LiveThrough; ///< Live in whole block (Templ 5. or 6. above). + bool LiveThrough; ///< Live in whole block (Templ 5. above). bool LiveIn; ///< Current reg is live in. bool LiveOut; ///< Current reg is live out. }; @@ -88,8 +96,18 @@ private: /// UseBlocks - Blocks where CurLI has uses. SmallVector<BlockInfo, 8> UseBlocks; + /// NumGapBlocks - Number of duplicate entries in UseBlocks for blocks where + /// the live range has a gap. + unsigned NumGapBlocks; + /// ThroughBlocks - Block numbers where CurLI is live through without uses. - SmallVector<unsigned, 8> ThroughBlocks; + BitVector ThroughBlocks; + + /// NumThroughBlocks - Number of live-through blocks. + unsigned NumThroughBlocks; + + /// DidRepairRange - analyze was forced to shrinkToUses(). + bool DidRepairRange; SlotIndex computeLastSplitPoint(unsigned Num); @@ -107,6 +125,11 @@ public: /// split. void analyze(const LiveInterval *li); + /// didRepairRange() - Returns true if CurLI was invalid and has been repaired + /// by analyze(). 
This really shouldn't happen, but sometimes the coalescer + /// can create live ranges that end in mid-air. + bool didRepairRange() const { return DidRepairRange; } + /// clear - clear all data structures so SplitAnalysis is ready to analyze a /// new interval. void clear(); @@ -133,11 +156,26 @@ public: /// getUseBlocks - Return an array of BlockInfo objects for the basic blocks /// where CurLI has uses. - ArrayRef<BlockInfo> getUseBlocks() { return UseBlocks; } + ArrayRef<BlockInfo> getUseBlocks() const { return UseBlocks; } + + /// getNumThroughBlocks - Return the number of through blocks. + unsigned getNumThroughBlocks() const { return NumThroughBlocks; } + + /// isThroughBlock - Return true if CurLI is live through MBB without uses. + bool isThroughBlock(unsigned MBB) const { return ThroughBlocks.test(MBB); } + + /// getThroughBlocks - Return the set of through blocks. + const BitVector &getThroughBlocks() const { return ThroughBlocks; } + + /// getNumLiveBlocks - Return the number of blocks where CurLI is live. + unsigned getNumLiveBlocks() const { + return getUseBlocks().size() - NumGapBlocks + getNumThroughBlocks(); + } - /// getThroughBlocks - Return an array of block numbers where CurLI is live - /// through without uses. - ArrayRef<unsigned> getThroughBlocks() { return ThroughBlocks; } + /// countLiveBlocks - Return the number of blocks where li is live. This is + /// guaranteed to return the same number as getNumLiveBlocks() after calling + /// analyze(li). + unsigned countLiveBlocks(const LiveInterval *li) const; typedef SmallPtrSet<const MachineBasicBlock*, 16> BlockPtrSet; @@ -223,6 +261,30 @@ class SplitEditor { // entry in LiveOutCache. BitVector LiveOutSeen; + /// LiveInBlock - Info for updateSSA() about a block where a register is + /// live-in. + /// The updateSSA caller provides DomNode and Kill inside MBB, updateSSA() + /// adds the computed live-in value. + struct LiveInBlock { + // Dominator tree node for the block. + // Cleared by updateSSA when the final value has been determined. + MachineDomTreeNode *DomNode; + + // Live-in value filled in by updateSSA once it is known. + VNInfo *Value; + + // Position in block where the live-in range ends, or SlotIndex() if the + // range passes through the block. + SlotIndex Kill; + + LiveInBlock(MachineDomTreeNode *node) : DomNode(node), Value(0) {} + }; + + /// LiveInBlocks - List of live-in blocks used by findReachingDefs() and + /// updateSSA(). This list is usually empty, it exists here to avoid frequent + /// reallocations. + SmallVector<LiveInBlock, 16> LiveInBlocks; + /// defValue - define a value in RegIdx from ParentVNI at Idx. /// Idx does not have to be ParentVNI->def, but it must be contained within /// ParentVNI's live range in ParentLI. The new value is added to the value @@ -246,17 +308,22 @@ class SplitEditor { /// Insert PHIDefs as needed to preserve SSA form. void extendRange(unsigned RegIdx, SlotIndex Idx); - /// updateSSA - Insert PHIDefs as necessary and update LiveOutCache such that - /// Edit.get(RegIdx) is live-in to all the blocks in LiveIn. - /// Return the value that is eventually live-in to IdxMBB. - VNInfo *updateSSA(unsigned RegIdx, - SmallVectorImpl<MachineDomTreeNode*> &LiveIn, - SlotIndex Idx, - const MachineBasicBlock *IdxMBB); + /// findReachingDefs - Starting from MBB, add blocks to LiveInBlocks until all + /// reaching defs for LI are found. + /// @param LI Live interval whose value is needed. + /// @param MBB Block where LI should be live-in. + /// @param Kill Kill point in MBB. 
+  /// @return Unique value seen, or NULL.
+  VNInfo *findReachingDefs(LiveInterval *LI, MachineBasicBlock *MBB,
+                           SlotIndex Kill);
 
-  /// transferSimpleValues - Transfer simply defined values to the new ranges.
-  /// Return true if any complex ranges were skipped.
-  bool transferSimpleValues();
+  /// updateSSA - Compute and insert PHIDefs such that all blocks in
+  /// LiveInBlocks get a known live-in value. Add live ranges to the blocks.
+  void updateSSA();
+
+  /// transferValues - Transfer values to the new ranges.
+  /// Return true if any ranges were skipped.
+  bool transferValues();
 
   /// extendPHIKillRanges - Extend the ranges of all values killed by original
   /// parent PHIDefs.
@@ -278,7 +345,15 @@ public:
   void reset(LiveRangeEdit&);
 
   /// Create a new virtual register and live interval.
-  void openIntv();
+  /// Return the interval index, starting from 1. Interval index 0 is the
+  /// implicit complement interval.
+  unsigned openIntv();
+
+  /// currentIntv - Return the current interval index.
+  unsigned currentIntv() const { return OpenIdx; }
+
+  /// selectIntv - Select a previously opened interval index.
+  void selectIntv(unsigned Idx);
 
   /// enterIntvBefore - Enter the open interval before the instruction at Idx.
   /// If the parent interval is not live before Idx, a COPY is not inserted.
@@ -321,22 +396,28 @@ public:
   ///
   void overlapIntv(SlotIndex Start, SlotIndex End);
 
-  /// closeIntv - Indicate that we are done editing the currently open
-  /// LiveInterval, and ranges can be trimmed.
-  void closeIntv();
-
   /// finish - after all the new live ranges have been created, compute the
   /// remaining live range, and rewrite instructions to use the new registers.
-  void finish();
+  /// @param LRMap When not null, this vector will map each live range in Edit
+  /// back to the indices returned by openIntv.
+  /// There may be extra indices created by dead code elimination.
+  void finish(SmallVectorImpl<unsigned> *LRMap = 0);
 
   /// dump - print the current interval mapping to dbgs().
   void dump() const;
 
   // ===--- High level methods ---===
 
+  /// splitSingleBlock - Split CurLI into a separate live interval around the
+  /// uses in a single block. This is intended to be used as part of a larger
+  /// split, and doesn't call finish().
+  void splitSingleBlock(const SplitAnalysis::BlockInfo &BI);
+
   /// splitSingleBlocks - Split CurLI into a separate live interval inside each
   /// basic block in Blocks.
   void splitSingleBlocks(const SplitAnalysis::BlockPtrSet &Blocks);
 };
 
 }
+
+#endif
diff --git a/lib/CodeGen/StrongPHIElimination.cpp b/lib/CodeGen/StrongPHIElimination.cpp
index ec7829e..227eb47 100644
--- a/lib/CodeGen/StrongPHIElimination.cpp
+++ b/lib/CodeGen/StrongPHIElimination.cpp
@@ -587,7 +587,7 @@ StrongPHIElimination::SplitInterferencesForBasicBlock(
   }
 
   // We now walk the PHIs in successor blocks and check for interferences. This
-  // is necesary because the use of a PHI's operands are logically contained in
+  // is necessary because the uses of a PHI's operands are logically contained in
   // the predecessor block. The def of a PHI's destination register is processed
   // along with the other defs in a basic block.
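The findReachingDefs()/updateSSA() split documented above first runs a plain breadth-first search over predecessors; only when that search sees two distinct live-out values does the iterative phi placement in updateSSA() have to run. A toy model of the BFS half, using integer value numbers and adjacency lists instead of LLVM's MachineBasicBlock/VNInfo machinery (all names here are illustrative, and the kill-slot bookkeeping is omitted):

#include <cstddef>
#include <cstdio>
#include <vector>

struct Block {
  std::vector<int> preds; // predecessor block numbers
  int liveOutValue = -1;  // known live-out value number, -1 if unknown
};

// Walk predecessors breadth-first from useBlock, recording every block that
// still needs a live-in value. Returns the unique reaching value number, or
// -1 when several distinct values reach useBlock and phi-defs are required.
static int findReachingDefs(const std::vector<Block> &cfg, int useBlock,
                            std::vector<int> &liveInBlocks) {
  std::vector<bool> visited(cfg.size(), false);
  std::vector<int> workList(1, useBlock);
  visited[useBlock] = true;
  int theValue = -1;
  bool unique = true;
  for (std::size_t i = 0; i != workList.size(); ++i) {
    for (int p : cfg[workList[i]].preds) {
      if (cfg[p].liveOutValue >= 0) {
        // Known live-out value; remember it and note conflicts.
        if (theValue >= 0 && theValue != cfg[p].liveOutValue)
          unique = false;
        theValue = cfg[p].liveOutValue;
      } else if (!visited[p]) {
        // This predecessor needs a live-in value as well.
        visited[p] = true;
        workList.push_back(p);
        liveInBlocks.push_back(p);
      }
    }
  }
  return unique ? theValue : -1;
}

int main() {
  // Diamond CFG: block 0 defines value 7; blocks 1 and 2 pass it through
  // to block 3, so the single-value fast path applies and no phi is needed.
  std::vector<Block> cfg(4);
  cfg[0].liveOutValue = 7;
  cfg[1].preds.push_back(0);
  cfg[2].preds.push_back(0);
  cfg[3].preds.push_back(1);
  cfg[3].preds.push_back(2);
  std::vector<int> liveIn;
  std::printf("reaching value: %d\n", findReachingDefs(cfg, 3, liveIn));
  return 0;
}

In the single-value case the caller can blit live ranges for every block collected in liveInBlocks directly, which is the "poor man's SSA update" extendRange() performs; the LiveInBlock list is only handed to updateSSA() when the search saw conflicting values.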
diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp index 04d3d31..90cb72f 100644 --- a/lib/CodeGen/TailDuplication.cpp +++ b/lib/CodeGen/TailDuplication.cpp @@ -34,6 +34,7 @@ STATISTIC(NumTails , "Number of tails duplicated"); STATISTIC(NumTailDups , "Number of tail duplicated blocks"); STATISTIC(NumInstrDups , "Additional instructions due to tail duplication"); STATISTIC(NumDeadBlocks, "Number of dead blocks removed"); +STATISTIC(NumAddedPHIs , "Number of phis added"); // Heuristic for tail duplication. static cl::opt<unsigned> @@ -80,16 +81,21 @@ namespace { void ProcessPHI(MachineInstr *MI, MachineBasicBlock *TailBB, MachineBasicBlock *PredBB, DenseMap<unsigned, unsigned> &LocalVRMap, - SmallVector<std::pair<unsigned,unsigned>, 4> &Copies); + SmallVector<std::pair<unsigned,unsigned>, 4> &Copies, + const DenseSet<unsigned> &UsedByPhi, + bool Remove); void DuplicateInstruction(MachineInstr *MI, MachineBasicBlock *TailBB, MachineBasicBlock *PredBB, MachineFunction &MF, - DenseMap<unsigned, unsigned> &LocalVRMap); + DenseMap<unsigned, unsigned> &LocalVRMap, + const DenseSet<unsigned> &UsedByPhi); void UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead, SmallVector<MachineBasicBlock*, 8> &TDBBs, SmallSetVector<MachineBasicBlock*, 8> &Succs); bool TailDuplicateBlocks(MachineFunction &MF); + bool shouldTailDuplicate(const MachineFunction &MF, + MachineBasicBlock &TailBB); bool TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF, SmallVector<MachineBasicBlock*, 8> &TDBBs, SmallVector<MachineInstr*, 16> &Copies); @@ -146,11 +152,11 @@ static void VerifyPHIs(MachineFunction &MF, bool CheckExtra) { for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) { MachineBasicBlock *PHIBB = MI->getOperand(i+1).getMBB(); if (CheckExtra && !Preds.count(PHIBB)) { - // This is not a hard error. dbgs() << "Warning: malformed PHI in BB#" << MBB->getNumber() << ": " << *MI; dbgs() << " extra input from predecessor BB#" << PHIBB->getNumber() << '\n'; + llvm_unreachable(0); } if (PHIBB->getNumber() < 0) { dbgs() << "Malformed PHI in BB#" << MBB->getNumber() << ": " << *MI; @@ -183,10 +189,6 @@ bool TailDuplicatePass::TailDuplicateBlocks(MachineFunction &MF) { if (NumTails == TailDupLimit) break; - // Only duplicate blocks that end with unconditional branches. - if (MBB->canFallThrough()) - continue; - // Save the successors list. SmallSetVector<MachineBasicBlock*, 8> Succs(MBB->succ_begin(), MBB->succ_end()); @@ -240,7 +242,7 @@ bool TailDuplicatePass::TailDuplicateBlocks(MachineFunction &MF) { MachineOperand &UseMO = UI.getOperand(); MachineInstr *UseMI = &*UI; ++UI; - if (UseMI->getParent() == DefBB) + if (UseMI->getParent() == DefBB && !UseMI->isPHI()) continue; SSAUpdate.RewriteUse(UseMO); } @@ -271,6 +273,7 @@ bool TailDuplicatePass::TailDuplicateBlocks(MachineFunction &MF) { MadeChange = true; } } + NumAddedPHIs += NewPHIs.size(); return MadeChange; } @@ -293,6 +296,24 @@ static unsigned getPHISrcRegOpIdx(MachineInstr *MI, MachineBasicBlock *SrcBB) { return 0; } + +// Remember which registers are used by phis in this block. This is +// used to determine which registers are liveout while modifying the +// block (which is why we need to copy the information). 
+static void getRegsUsedByPHIs(const MachineBasicBlock &BB, + DenseSet<unsigned> *UsedByPhi) { + for (MachineBasicBlock::const_iterator I = BB.begin(), E = BB.end(); + I != E; ++I) { + const MachineInstr &MI = *I; + if (!MI.isPHI()) + break; + for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { + unsigned SrcReg = MI.getOperand(i).getReg(); + UsedByPhi->insert(SrcReg); + } + } +} + /// AddSSAUpdateEntry - Add a definition and source virtual registers pair for /// SSA update. void TailDuplicatePass::AddSSAUpdateEntry(unsigned OrigReg, unsigned NewReg, @@ -315,7 +336,9 @@ void TailDuplicatePass::ProcessPHI(MachineInstr *MI, MachineBasicBlock *TailBB, MachineBasicBlock *PredBB, DenseMap<unsigned, unsigned> &LocalVRMap, - SmallVector<std::pair<unsigned,unsigned>, 4> &Copies) { + SmallVector<std::pair<unsigned,unsigned>, 4> &Copies, + const DenseSet<unsigned> &RegsUsedByPhi, + bool Remove) { unsigned DefReg = MI->getOperand(0).getReg(); unsigned SrcOpIdx = getPHISrcRegOpIdx(MI, PredBB); assert(SrcOpIdx && "Unable to find matching PHI source?"); @@ -327,9 +350,12 @@ void TailDuplicatePass::ProcessPHI(MachineInstr *MI, // available value liveout of the block. unsigned NewDef = MRI->createVirtualRegister(RC); Copies.push_back(std::make_pair(NewDef, SrcReg)); - if (isDefLiveOut(DefReg, TailBB, MRI)) + if (isDefLiveOut(DefReg, TailBB, MRI) || RegsUsedByPhi.count(DefReg)) AddSSAUpdateEntry(DefReg, NewDef, PredBB); + if (!Remove) + return; + // Remove PredBB from the PHI node. MI->RemoveOperand(SrcOpIdx+1); MI->RemoveOperand(SrcOpIdx); @@ -343,7 +369,8 @@ void TailDuplicatePass::DuplicateInstruction(MachineInstr *MI, MachineBasicBlock *TailBB, MachineBasicBlock *PredBB, MachineFunction &MF, - DenseMap<unsigned, unsigned> &LocalVRMap) { + DenseMap<unsigned, unsigned> &LocalVRMap, + const DenseSet<unsigned> &UsedByPhi) { MachineInstr *NewMI = TII->duplicate(MI, MF); for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) { MachineOperand &MO = NewMI->getOperand(i); @@ -357,7 +384,7 @@ void TailDuplicatePass::DuplicateInstruction(MachineInstr *MI, unsigned NewReg = MRI->createVirtualRegister(RC); MO.setReg(NewReg); LocalVRMap.insert(std::make_pair(Reg, NewReg)); - if (isDefLiveOut(Reg, TailBB, MRI)) + if (isDefLiveOut(Reg, TailBB, MRI) || UsedByPhi.count(Reg)) AddSSAUpdateEntry(Reg, NewReg, PredBB); } else { DenseMap<unsigned, unsigned>::iterator VI = LocalVRMap.find(Reg); @@ -416,6 +443,13 @@ TailDuplicatePass::UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead, // This register is defined in the tail block. for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) { MachineBasicBlock *SrcBB = LI->second[j].first; + // If we didn't duplicate a bb into a particular predecessor, we + // might still have added an entry to SSAUpdateVals to correctly + // recompute SSA. In that case, avoid adding a dummy extra argument to + // this PHI. + if (!SrcBB->isSuccessor(SuccBB)) + continue; + unsigned SrcReg = LI->second[j].second; if (Idx != 0) { II->getOperand(Idx).setReg(SrcReg); @@ -448,14 +482,19 @@ TailDuplicatePass::UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead, } } -/// TailDuplicate - If it is profitable, duplicate TailBB's contents in each -/// of its predecessors. +/// shouldTailDuplicate - Determine if it is profitable to duplicate this block.
bool -TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF, - SmallVector<MachineBasicBlock*, 8> &TDBBs, - SmallVector<MachineInstr*, 16> &Copies) { - // Set the limit on the number of instructions to duplicate, with a default - // of one less than the tail-merge threshold. When optimizing for size, +TailDuplicatePass::shouldTailDuplicate(const MachineFunction &MF, + MachineBasicBlock &TailBB) { + // Only duplicate blocks that end with unconditional branches. + if (TailBB.canFallThrough()) + return false; + + // Don't try to tail-duplicate single-block loops. + if (TailBB.isSuccessor(&TailBB)) + return false; + + // Set the limit on the cost to duplicate. When optimizing for size, // duplicate only one, because one branch instruction can be eliminated to // compensate for the duplication. unsigned MaxDuplicateCount; @@ -465,49 +504,56 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF, else MaxDuplicateCount = TailDuplicateSize; - if (PreRegAlloc) { - if (TailBB->empty()) - return false; - const TargetInstrDesc &TID = TailBB->back().getDesc(); - // Pre-regalloc tail duplication hurts compile time and doesn't help - // much except for indirect branches and returns. - if (!TID.isIndirectBranch() && !TID.isReturn()) - return false; - // If the target has hardware branch prediction that can handle indirect - // branches, duplicating them can often make them predictable when there - // are common paths through the code. The limit needs to be high enough - // to allow undoing the effects of tail merging and other optimizations - // that rearrange the predecessors of the indirect branch. - MaxDuplicateCount = 20; - } + // If the target has hardware branch prediction that can handle indirect + // branches, duplicating them can often make them predictable when there + // are common paths through the code. The limit needs to be high enough + // to allow undoing the effects of tail merging and other optimizations + // that rearrange the predecessors of the indirect branch. - // Don't try to tail-duplicate single-block loops. - if (TailBB->isSuccessor(TailBB)) - return false; + if (PreRegAlloc && !TailBB.empty()) { + const TargetInstrDesc &TID = TailBB.back().getDesc(); + if (TID.isIndirectBranch()) + MaxDuplicateCount = 20; + } // Check the instructions in the block to determine whether tail-duplication // is invalid or unlikely to be profitable. unsigned InstrCount = 0; - bool HasCall = false; - for (MachineBasicBlock::iterator I = TailBB->begin(); - I != TailBB->end(); ++I) { + for (MachineBasicBlock::const_iterator I = TailBB.begin(); I != TailBB.end(); + ++I) { // Non-duplicable things shouldn't be tail-duplicated. - if (I->getDesc().isNotDuplicable()) return false; + if (I->getDesc().isNotDuplicable()) + return false; + // Do not duplicate 'return' instructions if this is a pre-regalloc run. // A return may expand into a lot more instructions (e.g. reload of callee // saved registers) after PEI. - if (PreRegAlloc && I->getDesc().isReturn()) return false; - // Don't duplicate more than the threshold. - if (InstrCount == MaxDuplicateCount) return false; - // Remember if we saw a call. - if (I->getDesc().isCall()) HasCall = true; + if (PreRegAlloc && I->getDesc().isReturn()) + return false; + + // Avoid duplicating calls before register allocation. Calls present a + // barrier to register allocation, so duplicating them may end up increasing + // spills.
+ if (PreRegAlloc && I->getDesc().isCall()) + return false; + if (!I->isPHI() && !I->isDebugValue()) InstrCount += 1; + + if (InstrCount > MaxDuplicateCount) + return false; } - // Don't tail-duplicate calls before register allocation. Calls presents a - // barrier to register allocation so duplicating them may end up increasing - // spills. - if (InstrCount > 1 && (PreRegAlloc && HasCall)) + + return true; +} + +/// TailDuplicate - If it is profitable, duplicate TailBB's contents in each +/// of its predecessors. +bool +TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF, + SmallVector<MachineBasicBlock*, 8> &TDBBs, + SmallVector<MachineInstr*, 16> &Copies) { + if (!shouldTailDuplicate(MF, *TailBB)) return false; DEBUG(dbgs() << "\n*** Tail-duplicating BB#" << TailBB->getNumber() << '\n'); @@ -518,13 +564,17 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF, bool Changed = false; SmallSetVector<MachineBasicBlock*, 8> Preds(TailBB->pred_begin(), TailBB->pred_end()); + DenseSet<unsigned> UsedByPhi; + getRegsUsedByPHIs(*TailBB, &UsedByPhi); for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(), PE = Preds.end(); PI != PE; ++PI) { MachineBasicBlock *PredBB = *PI; assert(TailBB != PredBB && "Single-block loop should have been rejected earlier!"); - if (PredBB->succ_size() > 1) continue; + // EH edges are ignored by AnalyzeBranch. + if (PredBB->succ_size() > 1) + continue; MachineBasicBlock *PredTBB, *PredFBB; SmallVector<MachineOperand, 4> PredCond; @@ -532,9 +582,6 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF, continue; if (!PredCond.empty()) continue; - // EH edges are ignored by AnalyzeBranch. - if (PredBB->succ_size() != 1) - continue; // Don't duplicate into a fall-through predecessor (at least for now). if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough()) continue; @@ -557,11 +604,11 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF, if (MI->isPHI()) { // Replace the uses of the def of the PHI with the register coming // from PredBB. - ProcessPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos); + ProcessPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos, UsedByPhi, true); } else { // Replace def of virtual registers with new registers, and update // uses with PHI source register or the new registers. - DuplicateInstruction(MI, TailBB, PredBB, MF, LocalVRMap); + DuplicateInstruction(MI, TailBB, PredBB, MF, LocalVRMap, UsedByPhi); } } MachineBasicBlock::iterator Loc = PredBB->getFirstTerminator(); @@ -570,6 +617,10 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF, TII->get(TargetOpcode::COPY), CopyInfos[i].first).addReg(CopyInfos[i].second)); } + + // Simplify + TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true); + NumInstrDups += TailBB->size() - 1; // subtract one for removed branch // Update the CFG. @@ -590,12 +641,11 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF, MachineBasicBlock *PrevBB = prior(MachineFunction::iterator(TailBB)); MachineBasicBlock *PriorTBB = 0, *PriorFBB = 0; SmallVector<MachineOperand, 4> PriorCond; - bool PriorUnAnalyzable = - TII->AnalyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true); // This has to check PrevBB->succ_size() because EH edges are ignored by // AnalyzeBranch. 
- if (!PriorUnAnalyzable && PriorCond.empty() && !PriorTBB && - TailBB->pred_size() == 1 && PrevBB->succ_size() == 1 && + if (PrevBB->succ_size() == 1 && + !TII->AnalyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true) && + PriorCond.empty() && !PriorTBB && TailBB->pred_size() == 1 && !TailBB->hasAddressTaken()) { DEBUG(dbgs() << "\nMerging into block: " << *PrevBB << "From MBB: " << *TailBB); @@ -608,7 +658,7 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF, // Replace the uses of the def of the PHI with the register coming // from PredBB. MachineInstr *MI = &*I++; - ProcessPHI(MI, TailBB, PrevBB, LocalVRMap, CopyInfos); + ProcessPHI(MI, TailBB, PrevBB, LocalVRMap, CopyInfos, UsedByPhi, true); if (MI->getParent()) MI->eraseFromParent(); } @@ -618,7 +668,7 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF, // Replace def of virtual registers with new registers, and update // uses with PHI source register or the new registers. MachineInstr *MI = &*I++; - DuplicateInstruction(MI, TailBB, PrevBB, MF, LocalVRMap); + DuplicateInstruction(MI, TailBB, PrevBB, MF, LocalVRMap, UsedByPhi); MI->eraseFromParent(); } MachineBasicBlock::iterator Loc = PrevBB->getFirstTerminator(); @@ -639,6 +689,57 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF, Changed = true; } + // If this is after register allocation, there are no phis to fix. + if (!PreRegAlloc) + return Changed; + + // If we made no changes so far, we are safe. + if (!Changed) + return Changed; + + + // Handle the nasty case in which we duplicated a block that is part of a loop + // into some but not all of its predecessors. For example: + // 1 -> 2 <-> 3 | + // \ | + // \---> rest | + // if we duplicate 2 into 1 but not into 3, we end up with + // 12 -> 3 <-> 2 -> rest | + // \ / | + // \----->-----/ | + // If there was a "var = phi(1, 3)" in 2, it has to be ultimately replaced + // with a phi in 3 (which now dominates 2). + // What we do here is introduce a copy in 3 of the register defined by the + // phi, just like when we are duplicating 2 into 3, but we don't copy any + // real instructions or remove the 3 -> 2 edge from the phi in 2. + for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(), + PE = Preds.end(); PI != PE; ++PI) { + MachineBasicBlock *PredBB = *PI; + if (std::find(TDBBs.begin(), TDBBs.end(), PredBB) != TDBBs.end()) + continue; + + // EH edges are ignored by AnalyzeBranch. + if (PredBB->succ_size() != 1) + continue; + + DenseMap<unsigned, unsigned> LocalVRMap; + SmallVector<std::pair<unsigned,unsigned>, 4> CopyInfos; + MachineBasicBlock::iterator I = TailBB->begin(); + // Process PHI instructions first. + while (I != TailBB->end() && I->isPHI()) { + // Replace the uses of the def of the PHI with the register coming + // from PredBB. + MachineInstr *MI = &*I++; + ProcessPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos, UsedByPhi, false); + } + MachineBasicBlock::iterator Loc = PredBB->getFirstTerminator(); + for (unsigned i = 0, e = CopyInfos.size(); i != e; ++i) { + Copies.push_back(BuildMI(*PredBB, Loc, DebugLoc(), + TII->get(TargetOpcode::COPY), + CopyInfos[i].first).addReg(CopyInfos[i].second)); + } + } + return Changed; } @@ -655,4 +756,3 @@ void TailDuplicatePass::RemoveDeadBlock(MachineBasicBlock *MBB) { // Remove the block.
MBB->eraseFromParent(); } - diff --git a/lib/CodeGen/TargetInstrInfoImpl.cpp b/lib/CodeGen/TargetInstrInfoImpl.cpp index 15340a3..34e2b33 100644 --- a/lib/CodeGen/TargetInstrInfoImpl.cpp +++ b/lib/CodeGen/TargetInstrInfoImpl.cpp @@ -212,8 +212,7 @@ static const TargetRegisterClass *canFoldCopy(const MachineInstr *MI, if (TargetRegisterInfo::isPhysicalRegister(LiveOp.getReg())) return RC->contains(LiveOp.getReg()) ? RC : 0; - const TargetRegisterClass *LiveRC = MRI.getRegClass(LiveReg); - if (RC == LiveRC || RC->hasSubClass(LiveRC)) + if (RC->hasSubClassEq(MRI.getRegClass(LiveReg))) return RC; // FIXME: Allow folding when register classes are memory compatible. @@ -388,11 +387,6 @@ isReallyTriviallyReMaterializableGeneric(const MachineInstr *MI, if (MO.isDef() != (i == 0)) return false; - // For the def, it should be the only def of that register. - if (MO.isDef() && (llvm::next(MRI.def_begin(Reg)) != MRI.def_end() || - MRI.isLiveIn(Reg))) - return false; - // Don't allow any virtual-register uses. Rematting an instruction with // virtual register uses would length the live ranges of the uses, which // is not necessarily a good idea, certainly not "trivial". diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index f332d12..2da1bd4 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -23,6 +23,7 @@ #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionCOFF.h" +#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Target/Mangler.h" #include "llvm/Target/TargetData.h" @@ -176,12 +177,59 @@ const MCSection *TargetLoweringObjectFileELF::getEHFrameSection() const { SectionKind::getDataRel()); } +MCSymbol * +TargetLoweringObjectFileELF::getCFIPersonalitySymbol(const GlobalValue *GV, + Mangler *Mang, + MachineModuleInfo *MMI) const { + unsigned Encoding = getPersonalityEncoding(); + switch (Encoding & 0x70) { + default: + report_fatal_error("We do not support this DWARF encoding yet!"); + case dwarf::DW_EH_PE_absptr: + return Mang->getSymbol(GV); + break; + case dwarf::DW_EH_PE_pcrel: { + return getContext().GetOrCreateSymbol(StringRef("DW.ref.") + + Mang->getSymbol(GV)->getName()); + break; + } + } +} + +void TargetLoweringObjectFileELF::emitPersonalityValue(MCStreamer &Streamer, + const TargetMachine &TM, + const MCSymbol *Sym) const { + SmallString<64> NameData("DW.ref."); + NameData += Sym->getName(); + MCSymbol *Label = getContext().GetOrCreateSymbol(NameData); + Streamer.EmitSymbolAttribute(Label, MCSA_Hidden); + Streamer.EmitSymbolAttribute(Label, MCSA_Weak); + StringRef Prefix = ".data."; + NameData.insert(NameData.begin(), Prefix.begin(), Prefix.end()); + unsigned Flags = ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::SHF_GROUP; + const MCSection *Sec = getContext().getELFSection(NameData, + ELF::SHT_PROGBITS, + Flags, + SectionKind::getDataRel(), + 0, Label->getName()); + Streamer.SwitchSection(Sec); + Streamer.EmitValueToAlignment(8); + Streamer.EmitSymbolAttribute(Label, MCSA_ELF_TypeObject); + const MCExpr *E = MCConstantExpr::Create(8, getContext()); + Streamer.EmitELFSize(Label, E); + Streamer.EmitLabel(Label); + + unsigned Size = TM.getTargetData()->getPointerSize(); + Streamer.EmitSymbolValue(Sym, Size); +} + static SectionKind getELFKindForNamedSection(StringRef Name, SectionKind K) { - // FIXME: Why is this here? Codegen is should not be in the business - // of figuring section flags. 
If the user wrote section(".eh_frame"), - we should just pass that to MC which will defer to the assembly - or use its default if producing an object file. + // N.B.: The defaults used here are not the same ones used in MC. + // We follow gcc, MC follows gas. For example, given ".section .eh_frame", + // both gas and MC will produce a section with no flags. Given + // section(".eh_frame") gcc will produce + // .section .eh_frame,"a",@progbits if (Name.empty() || Name[0] != '.') return K; // Some lame default implementation based on some magic section names. @@ -207,9 +255,6 @@ getELFKindForNamedSection(StringRef Name, SectionKind K) { Name.startswith(".llvm.linkonce.tb.")) return SectionKind::getThreadBSS(); - if (Name == ".eh_frame") - return SectionKind::getDataRel(); - return K; } @@ -424,8 +469,7 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, } return TargetLoweringObjectFile:: - getExprForDwarfReference(SSym, Mang, MMI, - Encoding & ~dwarf::DW_EH_PE_indirect, Streamer); + getExprForDwarfReference(SSym, Encoding & ~dwarf::DW_EH_PE_indirect, Streamer); } return TargetLoweringObjectFile:: @@ -438,26 +482,13 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, void TargetLoweringObjectFileMachO::Initialize(MCContext &Ctx, const TargetMachine &TM) { - // _foo.eh symbols are currently always exported so that the linker knows - // about them. This is not necessary on 10.6 and later, but it - // doesn't hurt anything. - // FIXME: I need to get this from Triple. - IsFunctionEHSymbolGlobal = true; IsFunctionEHFrameSymbolPrivate = false; SupportsWeakOmittedEHFrame = false; + // .comm doesn't support alignment before Leopard. Triple T(((LLVMTargetMachine&)TM).getTargetTriple()); - if (T.getOS() == Triple::Darwin) { - switch (T.getDarwinMajorNumber()) { - case 7: // 10.3 Panther. - case 8: // 10.4 Tiger. - CommDirectiveSupportsAlignment = false; - break; - case 9: // 10.5 Leopard. - case 10: // 10.6 SnowLeopard. - break; - } - } + if (T.isMacOSX() && T.isMacOSXVersionLT(10, 5)) + CommDirectiveSupportsAlignment = false; TargetLoweringObjectFile::Initialize(Ctx, TM); @@ -803,14 +834,36 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, } return TargetLoweringObjectFile:: - getExprForDwarfReference(SSym, Mang, MMI, - Encoding & ~dwarf::DW_EH_PE_indirect, Streamer); + getExprForDwarfReference(SSym, Encoding & ~dwarf::DW_EH_PE_indirect, Streamer); } return TargetLoweringObjectFile:: getExprForDwarfGlobalReference(GV, Mang, MMI, Encoding, Streamer); } +MCSymbol *TargetLoweringObjectFileMachO:: +getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang, + MachineModuleInfo *MMI) const { + // The mach-o version of this method defaults to returning a stub reference. + MachineModuleInfoMachO &MachOMMI = + MMI->getObjFileInfo<MachineModuleInfoMachO>(); + + SmallString<128> Name; + Mang->getNameWithPrefix(Name, GV, true); + Name += "$non_lazy_ptr"; + + // Add information about the stub reference to MachOMMI so that the stub + // gets emitted by the asmprinter.
+ MCSymbol *SSym = getContext().GetOrCreateSymbol(Name.str()); + MachineModuleInfoImpl::StubValueTy &StubSym = MachOMMI.getGVStubEntry(SSym); + if (StubSym.getPointer() == 0) { + MCSymbol *Sym = Mang->getSymbol(GV); + StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage()); + } + + return SSym; +} + unsigned TargetLoweringObjectFileMachO::getPersonalityEncoding() const { return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4; } @@ -819,7 +872,7 @@ unsigned TargetLoweringObjectFileMachO::getLSDAEncoding() const { return DW_EH_PE_pcrel; } -unsigned TargetLoweringObjectFileMachO::getFDEEncoding() const { +unsigned TargetLoweringObjectFileMachO::getFDEEncoding(bool CFI) const { return DW_EH_PE_pcrel; } @@ -934,6 +987,20 @@ void TargetLoweringObjectFileCOFF::Initialize(MCContext &Ctx, getContext().getCOFFSection(".drectve", COFF::IMAGE_SCN_LNK_INFO, SectionKind::getMetadata()); + + PDataSection = + getContext().getCOFFSection(".pdata", + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_MEM_WRITE, + SectionKind::getDataRel()); + + XDataSection = + getContext().getCOFFSection(".xdata", + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_MEM_WRITE, + SectionKind::getDataRel()); } const MCSection *TargetLoweringObjectFileCOFF::getEHFrameSection() const { @@ -944,6 +1011,28 @@ const MCSection *TargetLoweringObjectFileCOFF::getEHFrameSection() const { SectionKind::getDataRel()); } +const MCSection *TargetLoweringObjectFileCOFF::getWin64EHFuncTableSection( + StringRef suffix) const { + if (suffix == "") + return PDataSection; + return getContext().getCOFFSection((".pdata"+suffix).str(), + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_MEM_WRITE, + SectionKind::getDataRel()); +} + +const MCSection *TargetLoweringObjectFileCOFF::getWin64EHTableSection( + StringRef suffix) const { + if (suffix == "") + return XDataSection; + return getContext().getCOFFSection((".xdata"+suffix).str(), + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_MEM_WRITE, + SectionKind::getDataRel()); +} + static unsigned getCOFFSectionFlags(SectionKind K) { diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index 52ea872..f54d879 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -1125,6 +1125,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { break; // The tied operands have been eliminated. } + bool IsEarlyClobber = false; bool RemovedKillFlag = false; bool AllUsesCopied = true; unsigned LastCopiedReg = 0; @@ -1132,7 +1133,11 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { for (unsigned tpi = 0, tpe = TiedPairs.size(); tpi != tpe; ++tpi) { unsigned SrcIdx = TiedPairs[tpi].first; unsigned DstIdx = TiedPairs[tpi].second; - unsigned regA = mi->getOperand(DstIdx).getReg(); + + const MachineOperand &DstMO = mi->getOperand(DstIdx); + unsigned regA = DstMO.getReg(); + IsEarlyClobber |= DstMO.isEarlyClobber(); + // Grab regB from the instruction because it may have changed if the // instruction was commuted. regB = mi->getOperand(SrcIdx).getReg(); @@ -1196,15 +1201,17 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { } if (AllUsesCopied) { - // Replace other (un-tied) uses of regB with LastCopiedReg. 
- for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) { - MachineOperand &MO = mi->getOperand(i); - if (MO.isReg() && MO.getReg() == regB && MO.isUse()) { - if (MO.isKill()) { - MO.setIsKill(false); - RemovedKillFlag = true; + if (!IsEarlyClobber) { + // Replace other (un-tied) uses of regB with LastCopiedReg. + for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) { + MachineOperand &MO = mi->getOperand(i); + if (MO.isReg() && MO.getReg() == regB && MO.isUse()) { + if (MO.isKill()) { + MO.setIsKill(false); + RemovedKillFlag = true; + } + MO.setReg(LastCopiedReg); } - MO.setReg(LastCopiedReg); } } diff --git a/lib/CodeGen/UnreachableBlockElim.cpp b/lib/CodeGen/UnreachableBlockElim.cpp index 48d8ab1..52693f0 100644 --- a/lib/CodeGen/UnreachableBlockElim.cpp +++ b/lib/CodeGen/UnreachableBlockElim.cpp @@ -196,8 +196,11 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { temp->eraseFromParent(); ModifiedPHI = true; - if (Input != Output) - F.getRegInfo().replaceRegWith(Output, Input); + if (Input != Output) { + MachineRegisterInfo &MRI = F.getRegInfo(); + MRI.constrainRegClass(Input, MRI.getRegClass(Output)); + MRI.replaceRegWith(Output, Input); + } continue; } diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp index 7a7ea69..7557979 100644 --- a/lib/CodeGen/VirtRegMap.cpp +++ b/lib/CodeGen/VirtRegMap.cpp @@ -42,6 +42,7 @@ using namespace llvm; STATISTIC(NumSpills , "Number of register spills"); +STATISTIC(NumIdCopies, "Number of identity moves eliminated after rewriting"); //===----------------------------------------------------------------------===// // VirtRegMap implementation @@ -260,6 +261,8 @@ void VirtRegMap::rewrite(SlotIndexes *Indexes) { << "********** Function: " << MF->getFunction()->getName() << '\n'); DEBUG(dump()); + SmallVector<unsigned, 8> SuperDeads; + SmallVector<unsigned, 8> SuperDefs; SmallVector<unsigned, 8> SuperKills; for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end(); @@ -283,12 +286,13 @@ void VirtRegMap::rewrite(SlotIndexes *Indexes) { if (MO.getSubReg()) { // A virtual register kill refers to the whole register, so we may // have to add <imp-use,kill> operands for the super-register. - if (MO.isUse() && MO.isKill() && !MO.isUndef()) - SuperKills.push_back(PhysReg); - - // We don't have to deal with sub-register defs because - // LiveIntervalAnalysis already added the necessary <imp-def> - // operands. + if (MO.isUse()) { + if (MO.isKill() && !MO.isUndef()) + SuperKills.push_back(PhysReg); + } else if (MO.isDead()) + SuperDeads.push_back(PhysReg); + else + SuperDefs.push_back(PhysReg); // PhysReg operands cannot have subregister indexes. PhysReg = TRI->getSubReg(PhysReg, MO.getSubReg()); @@ -305,10 +309,17 @@ void VirtRegMap::rewrite(SlotIndexes *Indexes) { while (!SuperKills.empty()) MI->addRegisterKilled(SuperKills.pop_back_val(), TRI, true); + while (!SuperDeads.empty()) + MI->addRegisterDead(SuperDeads.pop_back_val(), TRI, true); + + while (!SuperDefs.empty()) + MI->addRegisterDefined(SuperDefs.pop_back_val(), TRI); + DEBUG(dbgs() << "> " << *MI); // Finally, remove any identity copies. 
if (MI->isIdentityCopy()) { + ++NumIdCopies; if (MI->getNumOperands() == 2) { DEBUG(dbgs() << "Deleting identity copy.\n"); RemoveMachineInstrFromMaps(MI); diff --git a/lib/CodeGen/VirtRegRewriter.cpp b/lib/CodeGen/VirtRegRewriter.cpp index 67be1b0..1850658 100644 --- a/lib/CodeGen/VirtRegRewriter.cpp +++ b/lib/CodeGen/VirtRegRewriter.cpp @@ -32,7 +32,7 @@ STATISTIC(NumCommutes, "Number of instructions commuted"); STATISTIC(NumDRM , "Number of re-materializable defs elided"); STATISTIC(NumStores , "Number of stores added"); STATISTIC(NumPSpills , "Number of physical register spills"); -STATISTIC(NumOmitted , "Number of reloads omited"); +STATISTIC(NumOmitted , "Number of reloads omitted"); STATISTIC(NumAvoided , "Number of reloads deemed unnecessary"); STATISTIC(NumCopified, "Number of available reloads turned into copies"); STATISTIC(NumReMats , "Number of re-materialization"); @@ -669,7 +669,7 @@ static void UpdateKills(MachineInstr &MI, const TargetRegisterInfo* TRI, } } -/// ReMaterialize - Re-materialize definition for Reg targetting DestReg. +/// ReMaterialize - Re-materialize definition for Reg targeting DestReg. /// static void ReMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MII, diff --git a/lib/ExecutionEngine/CMakeLists.txt b/lib/ExecutionEngine/CMakeLists.txt index 8bff265..58caae8 100644 --- a/lib/ExecutionEngine/CMakeLists.txt +++ b/lib/ExecutionEngine/CMakeLists.txt @@ -1,6 +1,7 @@ add_llvm_library(LLVMExecutionEngine ExecutionEngine.cpp ExecutionEngineBindings.cpp + TargetSelect.cpp ) add_subdirectory(Interpreter) diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp index 13e07ac..7652090 100644 --- a/lib/ExecutionEngine/ExecutionEngine.cpp +++ b/lib/ExecutionEngine/ExecutionEngine.cpp @@ -29,6 +29,7 @@ #include "llvm/Support/DynamicLibrary.h" #include "llvm/Support/Host.h" #include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" #include <cmath> #include <cstring> using namespace llvm; @@ -42,20 +43,14 @@ ExecutionEngine *(*ExecutionEngine::JITCtor)( JITMemoryManager *JMM, CodeGenOpt::Level OptLevel, bool GVsWithCode, - CodeModel::Model CMM, - StringRef MArch, - StringRef MCPU, - const SmallVectorImpl<std::string>& MAttrs) = 0; + TargetMachine *TM) = 0; ExecutionEngine *(*ExecutionEngine::MCJITCtor)( Module *M, std::string *ErrorStr, JITMemoryManager *JMM, CodeGenOpt::Level OptLevel, bool GVsWithCode, - CodeModel::Model CMM, - StringRef MArch, - StringRef MCPU, - const SmallVectorImpl<std::string>& MAttrs) = 0; + TargetMachine *TM) = 0; ExecutionEngine *(*ExecutionEngine::InterpCtor)(Module *M, std::string *ErrorStr) = 0; @@ -313,13 +308,17 @@ void ExecutionEngine::runStaticConstructorsDestructors(Module *module, // Should be an array of '{ i32, void ()* }' structs. The first value is // the init priority, which we ignore. + if (isa<ConstantAggregateZero>(GV->getInitializer())) + return; ConstantArray *InitList = cast<ConstantArray>(GV->getInitializer()); for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) { + if (isa<ConstantAggregateZero>(InitList->getOperand(i))) + continue; ConstantStruct *CS = cast<ConstantStruct>(InitList->getOperand(i)); Constant *FP = CS->getOperand(1); if (FP->isNullValue()) - break; // Found a null terminator, exit. + continue; // Found a sentinel value, ignore. // Strip off constant expression casts.
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(FP)) @@ -415,6 +414,35 @@ ExecutionEngine *ExecutionEngine::create(Module *M, .create(); } +/// createJIT - This is the factory method for creating a JIT for the current +/// machine; it does not fall back to the interpreter. This takes ownership +/// of the module. +ExecutionEngine *ExecutionEngine::createJIT(Module *M, + std::string *ErrorStr, + JITMemoryManager *JMM, + CodeGenOpt::Level OptLevel, + bool GVsWithCode, + CodeModel::Model CMM) { + if (ExecutionEngine::JITCtor == 0) { + if (ErrorStr) + *ErrorStr = "JIT has not been linked in."; + return 0; + } + + // Use the defaults for extra parameters. Users can use EngineBuilder to + // set them. + StringRef MArch = ""; + StringRef MCPU = ""; + SmallVector<std::string, 1> MAttrs; + + TargetMachine *TM = + EngineBuilder::selectTarget(M, MArch, MCPU, MAttrs, ErrorStr); + if (!TM || (ErrorStr && ErrorStr->length() > 0)) return 0; + TM->setCodeModel(CMM); + + return ExecutionEngine::JITCtor(M, ErrorStr, JMM, OptLevel, GVsWithCode, TM); +} + ExecutionEngine *EngineBuilder::create() { // Make sure we can resolve symbols in the program as well. The zero arg // to the function tells DynamicLibrary to load the program, not a library. @@ -437,18 +465,21 @@ ExecutionEngine *EngineBuilder::create() { // Unless the interpreter was explicitly selected or the JIT is not linked, // try making a JIT. if (WhichEngine & EngineKind::JIT) { - if (UseMCJIT && ExecutionEngine::MCJITCtor) { - ExecutionEngine *EE = - ExecutionEngine::MCJITCtor(M, ErrorStr, JMM, OptLevel, - AllocateGVsWithCode, CMModel, - MArch, MCPU, MAttrs); - if (EE) return EE; - } else if (ExecutionEngine::JITCtor) { - ExecutionEngine *EE = - ExecutionEngine::JITCtor(M, ErrorStr, JMM, OptLevel, - AllocateGVsWithCode, CMModel, - MArch, MCPU, MAttrs); - if (EE) return EE; + if (TargetMachine *TM = + EngineBuilder::selectTarget(M, MArch, MCPU, MAttrs, ErrorStr)) { + TM->setCodeModel(CMModel); + + if (UseMCJIT && ExecutionEngine::MCJITCtor) { + ExecutionEngine *EE = + ExecutionEngine::MCJITCtor(M, ErrorStr, JMM, OptLevel, + AllocateGVsWithCode, TM); + if (EE) return EE; + } else if (ExecutionEngine::JITCtor) { + ExecutionEngine *EE = + ExecutionEngine::JITCtor(M, ErrorStr, JMM, OptLevel, + AllocateGVsWithCode, TM); + if (EE) return EE; + } } } @@ -835,7 +866,7 @@ void ExecutionEngine::StoreValueToMemory(const GenericValue &Val, case Type::PointerTyID: // Ensure 64 bit target pointers are fully initialized on 32 bit hosts. if (StoreBytes != sizeof(PointerTy)) - memset(Ptr, 0, StoreBytes); + memset(&(Ptr->PointerVal), 0, StoreBytes); *((PointerTy*)Ptr) = Val.PointerVal; break; diff --git a/lib/ExecutionEngine/JIT/CMakeLists.txt b/lib/ExecutionEngine/JIT/CMakeLists.txt index 42020d6..cefb0ae 100644 --- a/lib/ExecutionEngine/JIT/CMakeLists.txt +++ b/lib/ExecutionEngine/JIT/CMakeLists.txt @@ -9,5 +9,4 @@ add_llvm_library(LLVMJIT JITEmitter.cpp JITMemoryManager.cpp OProfileJITEventListener.cpp - TargetSelect.cpp ) diff --git a/lib/ExecutionEngine/JIT/JIT.cpp b/lib/ExecutionEngine/JIT/JIT.cpp index 56121c1..8fceaf2 100644 --- a/lib/ExecutionEngine/JIT/JIT.cpp +++ b/lib/ExecutionEngine/JIT/JIT.cpp @@ -203,39 +203,18 @@ void DarwinRegisterFrame(void* FrameBegin) { /// createJIT - This is the factory method for creating a JIT for the current /// machine, it does not fall back to the interpreter. This takes ownership /// of the module.
-ExecutionEngine *ExecutionEngine::createJIT(Module *M, - std::string *ErrorStr, - JITMemoryManager *JMM, - CodeGenOpt::Level OptLevel, - bool GVsWithCode, - CodeModel::Model CMM) { - // Use the defaults for extra parameters. Users can use EngineBuilder to - // set them. - StringRef MArch = ""; - StringRef MCPU = ""; - SmallVector<std::string, 1> MAttrs; - return JIT::createJIT(M, ErrorStr, JMM, OptLevel, GVsWithCode, CMM, - MArch, MCPU, MAttrs); -} - ExecutionEngine *JIT::createJIT(Module *M, std::string *ErrorStr, JITMemoryManager *JMM, CodeGenOpt::Level OptLevel, bool GVsWithCode, - CodeModel::Model CMM, - StringRef MArch, - StringRef MCPU, - const SmallVectorImpl<std::string>& MAttrs) { + TargetMachine *TM) { // Try to register the program as a source of symbols to resolve against. + // + // FIXME: Don't do this here. sys::DynamicLibrary::LoadLibraryPermanently(0, NULL); - // Pick a target either via -march or by guessing the native arch. - TargetMachine *TM = JIT::selectTarget(M, MArch, MCPU, MAttrs, ErrorStr); - if (!TM || (ErrorStr && ErrorStr->length() > 0)) return 0; - TM->setCodeModel(CMM); - - // If the target supports JIT code generation, create a the JIT. + // If the target supports JIT code generation, create the JIT. if (TargetJITInfo *TJ = TM->getJITInfo()) { return new JIT(M, *TM, *TJ, JMM, OptLevel, GVsWithCode); } else { @@ -666,7 +645,7 @@ void JIT::jitTheFunction(Function *F, const MutexGuard &locked) { } /// getPointerToFunction - This method is used to get the address of the -/// specified function, compiling it if neccesary. +/// specified function, compiling it if necessary. /// void *JIT::getPointerToFunction(Function *F) { diff --git a/lib/ExecutionEngine/JIT/JIT.h b/lib/ExecutionEngine/JIT/JIT.h index b576c16..b879fc3 100644 --- a/lib/ExecutionEngine/JIT/JIT.h +++ b/lib/ExecutionEngine/JIT/JIT.h @@ -181,23 +181,12 @@ public: /// JITCodeEmitter *getCodeEmitter() const { return JCE; } - /// selectTarget - Pick a target either via -march or by guessing the native - /// arch. Add any CPU features specified via -mcpu or -mattr. - static TargetMachine *selectTarget(Module *M, - StringRef MArch, - StringRef MCPU, - const SmallVectorImpl<std::string>& MAttrs, - std::string *Err); - static ExecutionEngine *createJIT(Module *M, std::string *ErrorStr, JITMemoryManager *JMM, CodeGenOpt::Level OptLevel, bool GVsWithCode, - CodeModel::Model CMM, - StringRef MArch, - StringRef MCPU, - const SmallVectorImpl<std::string>& MAttrs); + TargetMachine *TM); // Run the JIT on F and return information about the generated code void runJITOnFunction(Function *F, MachineCodeInfo *MCI = 0); diff --git a/lib/ExecutionEngine/JIT/JITDebugRegisterer.cpp b/lib/ExecutionEngine/JIT/JITDebugRegisterer.cpp index 3b5acb7..e71c20b 100644 --- a/lib/ExecutionEngine/JIT/JITDebugRegisterer.cpp +++ b/lib/ExecutionEngine/JIT/JITDebugRegisterer.cpp @@ -27,7 +27,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Support/Mutex.h" #include <string> -#include <vector> namespace llvm { @@ -143,7 +142,7 @@ void JITDebugRegisterer::RegisterFunction(const Function *F, DebugInfo &I) { // Add a mapping from F to the entry and buffer, so we can delete this // info later. - FnMap[F] = std::make_pair<std::string, jit_code_entry*>(Buffer, JITCodeEntry); + FnMap[F] = std::make_pair(Buffer, JITCodeEntry); // Acquire the lock and do the registration. 
{ diff --git a/lib/ExecutionEngine/JIT/JITEmitter.cpp b/lib/ExecutionEngine/JIT/JITEmitter.cpp index 3b4e750..d046b8a 100644 --- a/lib/ExecutionEngine/JIT/JITEmitter.cpp +++ b/lib/ExecutionEngine/JIT/JITEmitter.cpp @@ -128,7 +128,7 @@ namespace { return GlobalToIndirectSymMap; } - pair<void *, Function *> LookupFunctionFromCallSite( + std::pair<void *, Function *> LookupFunctionFromCallSite( const MutexGuard &locked, void *CallSite) const { assert(locked.holds(TheJIT->lock)); @@ -646,7 +646,7 @@ void *JITResolver::JITCompilerFn(void *Stub) { // The address given to us for the stub may not be exactly right, it might // be a little bit after the stub. As such, use upper_bound to find it. - pair<void*, Function*> I = + std::pair<void*, Function*> I = JR->state.LookupFunctionFromCallSite(locked, Stub); F = I.second; ActualPtr = I.first; diff --git a/lib/ExecutionEngine/MCJIT/CMakeLists.txt b/lib/ExecutionEngine/MCJIT/CMakeLists.txt index 6553079..38fdffa 100644 --- a/lib/ExecutionEngine/MCJIT/CMakeLists.txt +++ b/lib/ExecutionEngine/MCJIT/CMakeLists.txt @@ -1,5 +1,4 @@ add_llvm_library(LLVMMCJIT MCJIT.cpp - TargetSelect.cpp Intercept.cpp ) diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp index 148e0d9..4475f4d 100644 --- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp +++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp @@ -1,4 +1,4 @@ -//===-- MCJIT.cpp - MC-based Just-in-Time Compiler --------------------------===// +//===-- MCJIT.cpp - MC-based Just-in-Time Compiler ------------------------===// // // The LLVM Compiler Infrastructure // @@ -38,27 +38,15 @@ ExecutionEngine *MCJIT::createJIT(Module *M, JITMemoryManager *JMM, CodeGenOpt::Level OptLevel, bool GVsWithCode, - CodeModel::Model CMM, - StringRef MArch, - StringRef MCPU, - const SmallVectorImpl<std::string>& MAttrs) { + TargetMachine *TM) { // Try to register the program as a source of symbols to resolve against. // // FIXME: Don't do this here. sys::DynamicLibrary::LoadLibraryPermanently(0, NULL); - // Pick a target either via -march or by guessing the native arch. - // - // FIXME: This should be lifted out of here, it isn't something which should - // be part of the JIT policy, rather the burden for this selection should be - // pushed to clients. - TargetMachine *TM = MCJIT::selectTarget(M, MArch, MCPU, MAttrs, ErrorStr); - if (!TM || (ErrorStr && ErrorStr->length() > 0)) return 0; - TM->setCodeModel(CMM); - // If the target supports JIT code generation, create the JIT. if (TargetJITInfo *TJ = TM->getJITInfo()) - return new MCJIT(M, TM, *TJ, new MCJITMemoryManager(JMM), OptLevel, + return new MCJIT(M, TM, *TJ, new MCJITMemoryManager(JMM, M), OptLevel, GVsWithCode); if (ErrorStr) @@ -93,6 +81,8 @@ MCJIT::MCJIT(Module *m, TargetMachine *tm, TargetJITInfo &tji, Buffer.size())); if (Dyld.loadObject(MB)) report_fatal_error(Dyld.getErrorString()); + // Resolve any relocations. + Dyld.resolveRelocations(); } MCJIT::~MCJIT() { @@ -112,8 +102,12 @@ void *MCJIT::getPointerToFunction(Function *F) { return Addr; } - Twine Name = TM->getMCAsmInfo()->getGlobalPrefix() + F->getName(); - return (void*)Dyld.getSymbolAddress(Name.str()); + // FIXME: Should we be using the mangler for this? Probably. 
+ StringRef BaseName = F->getName(); + if (BaseName[0] == '\1') + return (void*)Dyld.getSymbolAddress(BaseName.substr(1)); + return (void*)Dyld.getSymbolAddress((TM->getMCAsmInfo()->getGlobalPrefix() + + BaseName).str()); } void *MCJIT::recompileAndRelinkFunction(Function *F) { diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.h b/lib/ExecutionEngine/MCJIT/MCJIT.h index 1b50766..b64c21a 100644 --- a/lib/ExecutionEngine/MCJIT/MCJIT.h +++ b/lib/ExecutionEngine/MCJIT/MCJIT.h @@ -76,22 +76,12 @@ public: MCJITCtor = createJIT; } - // FIXME: This routine is scheduled for termination. Do not use it. - static TargetMachine *selectTarget(Module *M, - StringRef MArch, - StringRef MCPU, - const SmallVectorImpl<std::string>& MAttrs, - std::string *Err); - static ExecutionEngine *createJIT(Module *M, std::string *ErrorStr, JITMemoryManager *JMM, CodeGenOpt::Level OptLevel, bool GVsWithCode, - CodeModel::Model CMM, - StringRef MArch, - StringRef MCPU, - const SmallVectorImpl<std::string>& MAttrs); + TargetMachine *TM); // @} }; diff --git a/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h b/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h index 0108ecc..40bc031 100644 --- a/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h +++ b/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h @@ -26,13 +26,21 @@ class MCJITMemoryManager : public RTDyldMemoryManager { // FIXME: Multiple modules. Module *M; public: - MCJITMemoryManager(JITMemoryManager *jmm) : JMM(jmm) {} + MCJITMemoryManager(JITMemoryManager *jmm, Module *m) : JMM(jmm), M(m) {} // Allocate ActualSize bytes, or more, for the named function. Return // a pointer to the allocated memory and update Size to reflect how much // memory was acutally allocated. uint8_t *startFunctionBody(const char *Name, uintptr_t &Size) { + // FIXME: This should really reference the MCAsmInfo to get the global + // prefix. + if (Name[0] == '_') ++Name; Function *F = M->getFunction(Name); + // Some ObjC names have a prefixed \01 in the IR. If we failed to find + // the symbol and it's of the ObjC conventions (starts with "-"), try + // prepending a \01 and see if we can find it that way. + if (!F && Name[0] == '-') + F = M->getFunction((Twine("\1") + Name).str()); assert(F && "No matching function in JIT IR Module!"); return JMM->startFunctionBody(F, Size); } @@ -41,7 +49,15 @@ public: // memory was actually used. void endFunctionBody(const char *Name, uint8_t *FunctionStart, uint8_t *FunctionEnd) { + // FIXME: This should really reference the MCAsmInfo to get the global + // prefix. + if (Name[0] == '_') ++Name; Function *F = M->getFunction(Name); + // Some ObjC names have a prefixed \01 in the IR. If we failed to find + // the symbol and it's of the ObjC conventions (starts with "-"), try + // prepending a \01 and see if we can find it that way. + if (!F && Name[0] == '-') + F = M->getFunction((Twine("\1") + Name).str()); assert(F && "No matching function in JIT IR Module!"); JMM->endFunctionBody(F, FunctionStart, FunctionEnd); } diff --git a/lib/ExecutionEngine/MCJIT/TargetSelect.cpp b/lib/ExecutionEngine/MCJIT/TargetSelect.cpp deleted file mode 100644 index 50f6593..0000000 --- a/lib/ExecutionEngine/MCJIT/TargetSelect.cpp +++ /dev/null @@ -1,91 +0,0 @@ -//===-- TargetSelect.cpp - Target Chooser Code ----------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// This just asks the TargetRegistry for the appropriate JIT to use, and allows -// the user to specify a specific one on the commandline with -march=x. Clients -// should initialize targets prior to calling createJIT. -// -//===----------------------------------------------------------------------===// - -#include "MCJIT.h" -#include "llvm/Module.h" -#include "llvm/ADT/Triple.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/Host.h" -#include "llvm/Target/SubtargetFeature.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetRegistry.h" -using namespace llvm; - -/// selectTarget - Pick a target either via -march or by guessing the native -/// arch. Add any CPU features specified via -mcpu or -mattr. -TargetMachine *MCJIT::selectTarget(Module *Mod, - StringRef MArch, - StringRef MCPU, - const SmallVectorImpl<std::string>& MAttrs, - std::string *ErrorStr) { - Triple TheTriple(Mod->getTargetTriple()); - if (TheTriple.getTriple().empty()) - TheTriple.setTriple(sys::getHostTriple()); - - // Adjust the triple to match what the user requested. - const Target *TheTarget = 0; - if (!MArch.empty()) { - for (TargetRegistry::iterator it = TargetRegistry::begin(), - ie = TargetRegistry::end(); it != ie; ++it) { - if (MArch == it->getName()) { - TheTarget = &*it; - break; - } - } - - if (!TheTarget) { - *ErrorStr = "No available targets are compatible with this -march, " - "see -version for the available targets.\n"; - return 0; - } - - // Adjust the triple to match (if known), otherwise stick with the - // module/host triple. - Triple::ArchType Type = Triple::getArchTypeForLLVMName(MArch); - if (Type != Triple::UnknownArch) - TheTriple.setArch(Type); - } else { - std::string Error; - TheTarget = TargetRegistry::lookupTarget(TheTriple.getTriple(), Error); - if (TheTarget == 0) { - if (ErrorStr) - *ErrorStr = Error; - return 0; - } - } - - if (!TheTarget->hasJIT()) { - errs() << "WARNING: This target JIT is not designed for the host you are" - << " running. If bad things happen, please choose a different " - << "-march switch.\n"; - } - - // Package up features to be passed to target/subtarget - std::string FeaturesStr; - if (!MCPU.empty() || !MAttrs.empty()) { - SubtargetFeatures Features; - Features.setCPU(MCPU); - for (unsigned i = 0; i != MAttrs.size(); ++i) - Features.AddFeature(MAttrs[i]); - FeaturesStr = Features.getString(); - } - - // Allocate a target... - TargetMachine *Target = - TheTarget->createTargetMachine(TheTriple.getTriple(), FeaturesStr); - assert(Target && "Could not allocate target machine!"); - return Target; -} diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index 29fced4..eda4cbb 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -41,17 +41,38 @@ class RuntimeDyldImpl { // The MemoryManager to load objects into. RTDyldMemoryManager *MemMgr; + // FIXME: This all assumes we're dealing with external symbols for anything + // explicitly referenced. I.e., we can index by name and things + // will work out. In practice, this may not be the case, so we + // should find a way to effectively generalize. // For each function, we have a MemoryBlock of it's instruction data. StringMap<sys::MemoryBlock> Functions; // Master symbol table. 
As modules are loaded and external symbols are // resolved, their addresses are stored here. - StringMap<uint64_t> SymbolTable; - - // FIXME: Should have multiple data blocks, one for each loaded chunk of - // compiled code. - sys::MemoryBlock Data; + StringMap<uint8_t*> SymbolTable; + + // For each symbol, keep a list of relocations based on it. Anytime + // its address is reassigned (the JIT re-compiled the function, e.g.), + // the relocations get re-resolved. + struct RelocationEntry { + std::string Target; // Object this relocation is contained in. + uint64_t Offset; // Offset into the object for the relocation. + uint32_t Data; // Second word of the raw macho relocation entry. + int64_t Addend; // Addend encoded in the instruction itself, if any. + bool isResolved; // Has this relocation been resolved previously? + + RelocationEntry(StringRef t, uint64_t offset, uint32_t data, int64_t addend) + : Target(t), Offset(offset), Data(data), Addend(addend), + isResolved(false) {} + }; + typedef SmallVector<RelocationEntry, 4> RelocationList; + StringMap<RelocationList> Relocations; + + // FIXME: Also keep a map of all the relocations contained in an object. Use + // this to dynamically answer whether all of the relocations in it have + // been resolved or not. bool HasError; std::string ErrorStr; @@ -65,12 +86,11 @@ class RuntimeDyldImpl { void extractFunction(StringRef Name, uint8_t *StartAddress, uint8_t *EndAddress); - bool resolveRelocation(uint32_t BaseSection, macho::RelocationEntry RE, - SmallVectorImpl<void *> &SectionBases, - SmallVectorImpl<StringRef> &SymbolNames); - bool resolveX86_64Relocation(intptr_t Address, intptr_t Value, bool isPCRel, + bool resolveRelocation(uint8_t *Address, uint8_t *Value, bool isPCRel, + unsigned Type, unsigned Size); + bool resolveX86_64Relocation(uintptr_t Address, uintptr_t Value, bool isPCRel, unsigned Type, unsigned Size); - bool resolveARMRelocation(intptr_t Address, intptr_t Value, bool isPCRel, + bool resolveARMRelocation(uintptr_t Address, uintptr_t Value, bool isPCRel, unsigned Type, unsigned Size); bool loadSegment32(const MachOObject *Obj, @@ -85,13 +105,15 @@ public: bool loadObject(MemoryBuffer *InputBuffer); - uint64_t getSymbolAddress(StringRef Name) { + void *getSymbolAddress(StringRef Name) { // FIXME: Just look up as a function for now. Overly simple of course. // Work in progress. - return (uint64_t)Functions.lookup(Name).base(); + return SymbolTable.lookup(Name); } - sys::MemoryBlock getMemoryBlock() { return Data; } + void resolveRelocations(); + + void reassignSymbolAddress(StringRef Name, uint8_t *Addr); // Is the linker in an error state? bool hasError() { return HasError; } @@ -107,75 +129,41 @@ void RuntimeDyldImpl::extractFunction(StringRef Name, uint8_t *StartAddress, uint8_t *EndAddress) { // Allocate memory for the function via the memory manager. uintptr_t Size = EndAddress - StartAddress + 1; - uint8_t *Mem = MemMgr->startFunctionBody(Name.data(), Size); + uintptr_t AllocSize = Size; + uint8_t *Mem = MemMgr->startFunctionBody(Name.data(), AllocSize); assert(Size >= (uint64_t)(EndAddress - StartAddress + 1) && "Memory manager failed to allocate enough memory!"); // Copy the function payload into the memory block. - memcpy(Mem, StartAddress, EndAddress - StartAddress + 1); + memcpy(Mem, StartAddress, Size); MemMgr->endFunctionBody(Name.data(), Mem, Mem + Size); // Remember where we put it. 
Functions[Name] = sys::MemoryBlock(Mem, Size); - DEBUG(dbgs() << " allocated to " << Mem << "\n"); + // Default the assigned address for this symbol to wherever this + // allocated it. + SymbolTable[Name] = Mem; + DEBUG(dbgs() << " allocated to [" << Mem << ", " << Mem + Size << "]\n"); } bool RuntimeDyldImpl:: -resolveRelocation(uint32_t BaseSection, macho::RelocationEntry RE, - SmallVectorImpl<void *> &SectionBases, - SmallVectorImpl<StringRef> &SymbolNames) { - // struct relocation_info { - // int32_t r_address; - // uint32_t r_symbolnum:24, - // r_pcrel:1, - // r_length:2, - // r_extern:1, - // r_type:4; - // }; - uint32_t SymbolNum = RE.Word1 & 0xffffff; // 24-bit value - bool isPCRel = (RE.Word1 >> 24) & 1; - unsigned Log2Size = (RE.Word1 >> 25) & 3; - bool isExtern = (RE.Word1 >> 27) & 1; - unsigned Type = (RE.Word1 >> 28) & 0xf; - if (RE.Word0 & macho::RF_Scattered) - return Error("NOT YET IMPLEMENTED: scattered relocations."); - - // The address requiring a relocation. - intptr_t Address = (intptr_t)SectionBases[BaseSection] + RE.Word0; - - // Figure out the target address of the relocation. If isExtern is true, - // this relocation references the symbol table, otherwise it references - // a section in the same object, numbered from 1 through NumSections - // (SectionBases is [0, NumSections-1]). - intptr_t Value; - if (isExtern) { - StringRef Name = SymbolNames[SymbolNum]; - if (SymbolTable.lookup(Name)) { - // The symbol is in our symbol table, so we can resolve it directly. - Value = (intptr_t)SymbolTable[Name]; - } else { - return Error("NOT YET IMPLEMENTED: relocations to pre-compiled code."); - } - DEBUG(dbgs() << "Resolve relocation(" << Type << ") from '" << Name - << "' to " << format("0x%x", Address) << ".\n"); - } else { - // For non-external relocations, the SymbolNum is actual a section number - // as described above. - Value = (intptr_t)SectionBases[SymbolNum - 1]; - } - - unsigned Size = 1 << Log2Size; +resolveRelocation(uint8_t *Address, uint8_t *Value, bool isPCRel, + unsigned Type, unsigned Size) { + // This just dispatches to the proper target specific routine. switch (CPUType) { default: assert(0 && "Unsupported CPU type!"); case mach::CTM_x86_64: - return resolveX86_64Relocation(Address, Value, isPCRel, Type, Size); + return resolveX86_64Relocation((uintptr_t)Address, (uintptr_t)Value, + isPCRel, Type, Size); case mach::CTM_ARM: - return resolveARMRelocation(Address, Value, isPCRel, Type, Size); + return resolveARMRelocation((uintptr_t)Address, (uintptr_t)Value, + isPCRel, Type, Size); } llvm_unreachable(""); } -bool RuntimeDyldImpl::resolveX86_64Relocation(intptr_t Address, intptr_t Value, - bool isPCRel, unsigned Type, - unsigned Size) { +bool RuntimeDyldImpl:: +resolveX86_64Relocation(uintptr_t Address, uintptr_t Value, + bool isPCRel, unsigned Type, + unsigned Size) { // If the relocation is PC-relative, the value to be encoded is the // pointer difference. 
if (isPCRel) @@ -210,7 +198,7 @@ bool RuntimeDyldImpl::resolveX86_64Relocation(intptr_t Address, intptr_t Value, return false; } -bool RuntimeDyldImpl::resolveARMRelocation(intptr_t Address, intptr_t Value, +bool RuntimeDyldImpl::resolveARMRelocation(uintptr_t Address, uintptr_t Value, bool isPCRel, unsigned Type, unsigned Size) { // If the relocation is PC-relative, the value to be encoded is the @@ -225,6 +213,7 @@ bool RuntimeDyldImpl::resolveARMRelocation(intptr_t Address, intptr_t Value, switch(Type) { default: + llvm_unreachable("Invalid relocation type!"); case macho::RIT_Vanilla: { llvm_unreachable("Invalid relocation type!"); // Mask in the target value a byte at a time (we don't have an alignment @@ -236,10 +225,6 @@ bool RuntimeDyldImpl::resolveARMRelocation(intptr_t Address, intptr_t Value, } break; } - case macho::RIT_Pair: - case macho::RIT_Difference: - case macho::RIT_ARM_LocalDifference: - case macho::RIT_ARM_PreboundLazyPointer: case macho::RIT_ARM_Branch24Bit: { // Mask the value into the target address. We know instructions are // 32-bit aligned, so we can do it all at once. @@ -260,6 +245,10 @@ bool RuntimeDyldImpl::resolveARMRelocation(intptr_t Address, intptr_t Value, case macho::RIT_ARM_ThumbBranch32Bit: case macho::RIT_ARM_Half: case macho::RIT_ARM_HalfDifference: + case macho::RIT_Pair: + case macho::RIT_Difference: + case macho::RIT_ARM_LocalDifference: + case macho::RIT_ARM_PreboundLazyPointer: return Error("Relocation type not implemented yet!"); } return false; @@ -269,98 +258,137 @@ bool RuntimeDyldImpl:: loadSegment32(const MachOObject *Obj, const MachOObject::LoadCommandInfo *SegmentLCI, const InMemoryStruct<macho::SymtabLoadCommand> &SymtabLC) { - InMemoryStruct<macho::SegmentLoadCommand> Segment32LC; - Obj->ReadSegmentLoadCommand(*SegmentLCI, Segment32LC); - if (!Segment32LC) + InMemoryStruct<macho::SegmentLoadCommand> SegmentLC; + Obj->ReadSegmentLoadCommand(*SegmentLCI, SegmentLC); + if (!SegmentLC) return Error("unable to load segment load command"); - // Map the segment into memory. - std::string ErrorStr; - Data = sys::Memory::AllocateRWX(Segment32LC->VMSize, 0, &ErrorStr); - if (!Data.base()) - return Error("unable to allocate memory block: '" + ErrorStr + "'"); - memcpy(Data.base(), Obj->getData(Segment32LC->FileOffset, - Segment32LC->FileSize).data(), - Segment32LC->FileSize); - memset((char*)Data.base() + Segment32LC->FileSize, 0, - Segment32LC->VMSize - Segment32LC->FileSize); - - // Bind the section indices to addresses and record the relocations we - // need to resolve. - typedef std::pair<uint32_t, macho::RelocationEntry> RelocationMap; - SmallVector<RelocationMap, 64> Relocations; - - SmallVector<void *, 16> SectionBases; - for (unsigned i = 0; i != Segment32LC->NumSections; ++i) { + for (unsigned SectNum = 0; SectNum != SegmentLC->NumSections; ++SectNum) { InMemoryStruct<macho::Section> Sect; - Obj->ReadSection(*SegmentLCI, i, Sect); - if (!Sect) - return Error("unable to load section: '" + Twine(i) + "'"); - - // Remember any relocations the section has so we can resolve them later. - for (unsigned j = 0; j != Sect->NumRelocationTableEntries; ++j) { - InMemoryStruct<macho::RelocationEntry> RE; - Obj->ReadRelocationEntry(Sect->RelocationTableOffset, j, RE); - Relocations.push_back(RelocationMap(j, *RE)); - } - - // FIXME: Improve check. 
-// if (Sect->Flags != 0x80000400) -// return Error("unsupported section type!"); - - SectionBases.push_back((char*) Data.base() + Sect->Address); - } + Obj->ReadSection(*SegmentLCI, SectNum, Sect); + if (!Sect) + return Error("unable to load section: '" + Twine(SectNum) + "'"); - // Bind all the symbols to address. Keep a record of the names for use - // by relocation resolution. - SmallVector<StringRef, 64> SymbolNames; - for (unsigned i = 0; i != SymtabLC->NumSymbolTableEntries; ++i) { - InMemoryStruct<macho::SymbolTableEntry> STE; - Obj->ReadSymbolTableEntry(SymtabLC->SymbolTableOffset, i, STE); - if (!STE) - return Error("unable to read symbol: '" + Twine(i) + "'"); - // Get the symbol name. - StringRef Name = Obj->getStringAtIndex(STE->StringIndex); - SymbolNames.push_back(Name); - - // Just skip undefined symbols. They'll be loaded from whatever - // module they come from (or system dylib) when we resolve relocations - // involving them. - if (STE->SectionIndex == 0) + // FIXME: For the time being, we're only loading text segments. + if (Sect->Flags != 0x80000400) continue; - unsigned Index = STE->SectionIndex - 1; - if (Index >= Segment32LC->NumSections) - return Error("invalid section index for symbol: '" + Twine() + "'"); - - // Get the section base address. - void *SectionBase = SectionBases[Index]; + // Address and names of symbols in the section. + typedef std::pair<uint64_t, StringRef> SymbolEntry; + SmallVector<SymbolEntry, 64> Symbols; + // Index of all the names, in this section or not. Used when we're + // dealing with relocation entries. + SmallVector<StringRef, 64> SymbolNames; + for (unsigned i = 0; i != SymtabLC->NumSymbolTableEntries; ++i) { + InMemoryStruct<macho::SymbolTableEntry> STE; + Obj->ReadSymbolTableEntry(SymtabLC->SymbolTableOffset, i, STE); + if (!STE) + return Error("unable to read symbol: '" + Twine(i) + "'"); + if (STE->SectionIndex > SegmentLC->NumSections) + return Error("invalid section index for symbol: '" + Twine(i) + "'"); + // Get the symbol name. + StringRef Name = Obj->getStringAtIndex(STE->StringIndex); + SymbolNames.push_back(Name); - // Get the symbol address. - uint64_t Address = (uint64_t)SectionBase + STE->Value; + // Just skip symbols not defined in this section. + if ((unsigned)STE->SectionIndex - 1 != SectNum) + continue; - // FIXME: Check the symbol type and flags. - if (STE->Type != 0xF) - return Error("unexpected symbol type!"); - if (STE->Flags != 0x0) - return Error("unexpected symbol type!"); + // FIXME: Check the symbol type and flags. + if (STE->Type != 0xF) // external, defined in this section. + continue; + // Flags == 0x8 marks a thumb function for ARM, which is fine as it + // doesn't require any special handling here. + if (STE->Flags != 0x0 && STE->Flags != 0x8) + continue; - DEBUG(dbgs() << "Symbol: '" << Name << "' @ " << Address << "\n"); + // Remember the symbol. + Symbols.push_back(SymbolEntry(STE->Value, Name)); - SymbolTable[Name] = Address; - } + DEBUG(dbgs() << "Function sym: '" << Name << "' @ " << + (Sect->Address + STE->Value) << "\n"); + } + // Sort the symbols by address, just in case they didn't come in that way. + array_pod_sort(Symbols.begin(), Symbols.end()); - // Now resolve any relocations. - for (unsigned i = 0, e = Relocations.size(); i != e; ++i) { - if (resolveRelocation(Relocations[i].first, Relocations[i].second, - SectionBases, SymbolNames)) - return true; - } + // If there weren't any functions (odd, but just in case...) 
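Once the symbols are sorted by address, the extraction that follows a few lines below becomes a single scan: each function runs from its own offset up to the next symbol's start, and the last one runs to the end of the section. A standalone sketch under stated assumptions (hypothetical types, inclusive end offsets exactly as the patch computes them):

#include <algorithm>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

// Compute [start, end] byte ranges for each function in a section, given
// (offset, name) pairs; std::sort stands in for array_pod_sort above.
static std::vector<std::pair<uint64_t, uint64_t> >
functionRanges(std::vector<std::pair<uint64_t, std::string> > Symbols,
               uint64_t SectionSize) {
  std::sort(Symbols.begin(), Symbols.end());
  std::vector<std::pair<uint64_t, uint64_t> > Ranges;
  for (size_t i = 0; i + 1 < Symbols.size(); ++i)
    Ranges.push_back(std::make_pair(Symbols[i].first,
                                    Symbols[i + 1].first - 1));
  if (!Symbols.empty())
    Ranges.push_back(std::make_pair(Symbols.back().first, SectionSize - 1));
  return Ranges;
}

The empty-section guard right below exists for the same reason the sketch checks Symbols.empty(): with no symbols there is no last range to emit.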
+ if (!Symbols.size()) + continue; - // We've loaded the section; now mark the functions in it as executable. - // FIXME: We really should use the MemoryManager for this. - sys::Memory::setRangeExecutable(Data.base(), Data.size()); + // Extract the function data. + uint8_t *Base = (uint8_t*)Obj->getData(SegmentLC->FileOffset, + SegmentLC->FileSize).data(); + for (unsigned i = 0, e = Symbols.size() - 1; i != e; ++i) { + uint64_t StartOffset = Sect->Address + Symbols[i].first; + uint64_t EndOffset = Symbols[i + 1].first - 1; + DEBUG(dbgs() << "Extracting function: " << Symbols[i].second + << " from [" << StartOffset << ", " << EndOffset << "]\n"); + extractFunction(Symbols[i].second, Base + StartOffset, Base + EndOffset); + } + // The last symbol we do after since the end address is calculated + // differently because there is no next symbol to reference. + uint64_t StartOffset = Symbols[Symbols.size() - 1].first; + uint64_t EndOffset = Sect->Size - 1; + DEBUG(dbgs() << "Extracting function: " << Symbols[Symbols.size()-1].second + << " from [" << StartOffset << ", " << EndOffset << "]\n"); + extractFunction(Symbols[Symbols.size()-1].second, + Base + StartOffset, Base + EndOffset); + // Now extract the relocation information for each function and process it. + for (unsigned j = 0; j != Sect->NumRelocationTableEntries; ++j) { + InMemoryStruct<macho::RelocationEntry> RE; + Obj->ReadRelocationEntry(Sect->RelocationTableOffset, j, RE); + if (RE->Word0 & macho::RF_Scattered) + return Error("NOT YET IMPLEMENTED: scattered relocations."); + // Word0 of the relocation is the offset into the section where the + // relocation should be applied. We need to translate that into an + // offset into a function since that's our atom. + uint32_t Offset = RE->Word0; + // Look for the function containing the address. This is used for JIT + // code, so the number of functions in section is almost always going + // to be very small (usually just one), so until we have use cases + // where that's not true, just use a trivial linear search. + unsigned SymbolNum; + unsigned NumSymbols = Symbols.size(); + assert(NumSymbols > 0 && Symbols[0].first <= Offset && + "No symbol containing relocation!"); + for (SymbolNum = 0; SymbolNum < NumSymbols - 1; ++SymbolNum) + if (Symbols[SymbolNum + 1].first > Offset) + break; + // Adjust the offset to be relative to the symbol. + Offset -= Symbols[SymbolNum].first; + // Get the name of the symbol containing the relocation. + StringRef TargetName = SymbolNames[SymbolNum]; + + bool isExtern = (RE->Word1 >> 27) & 1; + // Figure out the source symbol of the relocation. If isExtern is true, + // this relocation references the symbol table, otherwise it references + // a section in the same object, numbered from 1 through NumSections + // (SectionBases is [0, NumSections-1]). + // FIXME: Some targets (ARM) use internal relocations even for + // externally visible symbols, if the definition is in the same + // file as the reference. We need to convert those back to by-name + // references. We can resolve the address based on the section + // offset and see if we have a symbol at that address. If we do, + // use that; otherwise, puke. + if (!isExtern) + return Error("Internal relocations not supported."); + uint32_t SourceNum = RE->Word1 & 0xffffff; // 24-bit value + StringRef SourceName = SymbolNames[SourceNum]; + + // FIXME: Get the relocation addend from the target address. + + // Now store the relocation information. Associate it with the source + // symbol. 
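Word1 here is the packed Mach-O relocation_info word whose bit layout was quoted earlier in this diff (in the removed resolveRelocation body). Unpacking it is pure bit twiddling, sketched here with an invented struct for clarity; the patch does the same shifts inline:

#include <cstdint>

// Fields of a Mach-O relocation_info word (see the struct comment above).
struct UnpackedReloc {
  uint32_t SymbolNum; // low 24 bits: symbol-table or section index
  bool isPCRel;       // bit 24
  unsigned Size;      // bits 25-26 hold log2(size in bytes)
  bool isExtern;      // bit 27
  unsigned Type;      // bits 28-31: target-specific relocation type
};

static UnpackedReloc unpackWord1(uint32_t Word1) {
  UnpackedReloc R;
  R.SymbolNum = Word1 & 0xffffff;
  R.isPCRel   = (Word1 >> 24) & 1;
  R.Size      = 1u << ((Word1 >> 25) & 3);
  R.isExtern  = (Word1 >> 27) & 1;
  R.Type      = (Word1 >> 28) & 0xf;
  return R;
}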
+ Relocations[SourceName].push_back(RelocationEntry(TargetName, + Offset, + RE->Word1, + 0 /*Addend*/)); + DEBUG(dbgs() << "Relocation at '" << TargetName << "' + " << Offset + << " from '" << SourceName << "(Word1: " + << format("0x%x", RE->Word1) << ")\n"); + } + } return false; } @@ -380,51 +408,55 @@ loadSegment64(const MachOObject *Obj, if (!Sect) return Error("unable to load section: '" + Twine(SectNum) + "'"); - // FIXME: Improve check. + // FIXME: For the time being, we're only loading text segments. if (Sect->Flags != 0x80000400) - return Error("unsupported section type!"); + continue; // Address and names of symbols in the section. typedef std::pair<uint64_t, StringRef> SymbolEntry; SmallVector<SymbolEntry, 64> Symbols; + // Index of all the names, in this section or not. Used when we're + // dealing with relocation entries. + SmallVector<StringRef, 64> SymbolNames; for (unsigned i = 0; i != SymtabLC->NumSymbolTableEntries; ++i) { InMemoryStruct<macho::Symbol64TableEntry> STE; Obj->ReadSymbol64TableEntry(SymtabLC->SymbolTableOffset, i, STE); if (!STE) return Error("unable to read symbol: '" + Twine(i) + "'"); if (STE->SectionIndex > Segment64LC->NumSections) - return Error("invalid section index for symbol: '" + Twine() + "'"); + return Error("invalid section index for symbol: '" + Twine(i) + "'"); + // Get the symbol name. + StringRef Name = Obj->getStringAtIndex(STE->StringIndex); + SymbolNames.push_back(Name); // Just skip symbols not defined in this section. - if (STE->SectionIndex - 1 != SectNum) + if ((unsigned)STE->SectionIndex - 1 != SectNum) continue; - // Get the symbol name. - StringRef Name = Obj->getStringAtIndex(STE->StringIndex); - // FIXME: Check the symbol type and flags. if (STE->Type != 0xF) // external, defined in this section. - return Error("unexpected symbol type!"); + continue; if (STE->Flags != 0x0) - return Error("unexpected symbol type!"); - - uint64_t BaseAddress = Sect->Address; - uint64_t Address = BaseAddress + STE->Value; + continue; // Remember the symbol. - Symbols.push_back(SymbolEntry(Address, Name)); + Symbols.push_back(SymbolEntry(STE->Value, Name)); - DEBUG(dbgs() << "Function sym: '" << Name << "' @ " << Address << "\n"); + DEBUG(dbgs() << "Function sym: '" << Name << "' @ " << + (Sect->Address + STE->Value) << "\n"); } - // Sort the symbols by address, just in case they didn't come in that - // way. + // Sort the symbols by address, just in case they didn't come in that way. array_pod_sort(Symbols.begin(), Symbols.end()); + // If there weren't any functions (odd, but just in case...) + if (!Symbols.size()) + continue; + // Extract the function data. uint8_t *Base = (uint8_t*)Obj->getData(Segment64LC->FileOffset, Segment64LC->FileSize).data(); for (unsigned i = 0, e = Symbols.size() - 1; i != e; ++i) { - uint64_t StartOffset = Symbols[i].first; + uint64_t StartOffset = Sect->Address + Symbols[i].first; uint64_t EndOffset = Symbols[i + 1].first - 1; DEBUG(dbgs() << "Extracting function: " << Symbols[i].second << " from [" << StartOffset << ", " << EndOffset << "]\n"); @@ -438,8 +470,56 @@ loadSegment64(const MachOObject *Obj, << " from [" << StartOffset << ", " << EndOffset << "]\n"); extractFunction(Symbols[Symbols.size()-1].second, Base + StartOffset, Base + EndOffset); - } + // Now extract the relocation information for each function and process it. 
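Both this loop and its 32-bit twin above fill the same bookkeeping structure: relocations are filed under the source symbol so that assigning that symbol an address later can replay every dependent fixup in one pass. In miniature, with hypothetical standalone types rather than the dyld's own:

#include <cstdint>
#include <map>
#include <string>
#include <vector>

// Deferred relocations, keyed by the symbol they depend on.
struct RelocEntry {
  std::string Target; // symbol whose body contains the fixup
  uint32_t Offset;    // offset of the fixup within Target
  uint32_t Word1;     // packed Mach-O relocation info
  int64_t Addend;
};
typedef std::map<std::string, std::vector<RelocEntry> > RelocTable;

// When Source later gets an address, walk Table[Source] and patch each entry.
static void recordRelocation(RelocTable &Table, const std::string &Source,
                             const RelocEntry &E) {
  Table[Source].push_back(E);
}

resolveRelocations and reassignSymbolAddress further down are exactly that replay loop.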
+ for (unsigned j = 0; j != Sect->NumRelocationTableEntries; ++j) { + InMemoryStruct<macho::RelocationEntry> RE; + Obj->ReadRelocationEntry(Sect->RelocationTableOffset, j, RE); + if (RE->Word0 & macho::RF_Scattered) + return Error("NOT YET IMPLEMENTED: scattered relocations."); + // Word0 of the relocation is the offset into the section where the + // relocation should be applied. We need to translate that into an + // offset into a function since that's our atom. + uint32_t Offset = RE->Word0; + // Look for the function containing the address. This is used for JIT + // code, so the number of functions in section is almost always going + // to be very small (usually just one), so until we have use cases + // where that's not true, just use a trivial linear search. + unsigned SymbolNum; + unsigned NumSymbols = Symbols.size(); + assert(NumSymbols > 0 && Symbols[0].first <= Offset && + "No symbol containing relocation!"); + for (SymbolNum = 0; SymbolNum < NumSymbols - 1; ++SymbolNum) + if (Symbols[SymbolNum + 1].first > Offset) + break; + // Adjust the offset to be relative to the symbol. + Offset -= Symbols[SymbolNum].first; + // Get the name of the symbol containing the relocation. + StringRef TargetName = SymbolNames[SymbolNum]; + + bool isExtern = (RE->Word1 >> 27) & 1; + // Figure out the source symbol of the relocation. If isExtern is true, + // this relocation references the symbol table, otherwise it references + // a section in the same object, numbered from 1 through NumSections + // (SectionBases is [0, NumSections-1]). + if (!isExtern) + return Error("Internal relocations not supported."); + uint32_t SourceNum = RE->Word1 & 0xffffff; // 24-bit value + StringRef SourceName = SymbolNames[SourceNum]; + + // FIXME: Get the relocation addend from the target address. + + // Now store the relocation information. Associate it with the source + // symbol. + Relocations[SourceName].push_back(RelocationEntry(TargetName, + Offset, + RE->Word1, + 0 /*Addend*/)); + DEBUG(dbgs() << "Relocation at '" << TargetName << "' + " << Offset + << " from '" << SourceName << "(Word1: " + << format("0x%x", RE->Word1) << ")\n"); + } + } return false; } @@ -530,6 +610,40 @@ bool RuntimeDyldImpl::loadObject(MemoryBuffer *InputBuffer) { return false; } +// Resolve the relocations for all symbols we currently know about. +void RuntimeDyldImpl::resolveRelocations() { + // Just iterate over the symbols in our symbol table and assign their + // addresses. + StringMap<uint8_t*>::iterator i = SymbolTable.begin(); + StringMap<uint8_t*>::iterator e = SymbolTable.end(); + for (;i != e; ++i) + reassignSymbolAddress(i->getKey(), i->getValue()); +} + +// Assign an address to a symbol name and resolve all the relocations +// associated with it. +void RuntimeDyldImpl::reassignSymbolAddress(StringRef Name, uint8_t *Addr) { + // Assign the address in our symbol table. + SymbolTable[Name] = Addr; + + RelocationList &Relocs = Relocations[Name]; + for (unsigned i = 0, e = Relocs.size(); i != e; ++i) { + RelocationEntry &RE = Relocs[i]; + uint8_t *Target = SymbolTable[RE.Target] + RE.Offset; + bool isPCRel = (RE.Data >> 24) & 1; + unsigned Type = (RE.Data >> 28) & 0xf; + unsigned Size = 1 << ((RE.Data >> 25) & 3); + + DEBUG(dbgs() << "Resolving relocation at '" << RE.Target + << "' + " << RE.Offset << " (" << format("%p", Target) << ")" + << " from '" << Name << " (" << format("%p", Addr) << ")" + << "(" << (isPCRel ? 
"pcrel" : "absolute") + << ", type: " << Type << ", Size: " << Size << ").\n"); + + resolveRelocation(Target, Addr, isPCRel, Type, Size); + RE.isResolved = true; + } +} //===----------------------------------------------------------------------===// // RuntimeDyld class implementation @@ -545,12 +659,16 @@ bool RuntimeDyld::loadObject(MemoryBuffer *InputBuffer) { return Dyld->loadObject(InputBuffer); } -uint64_t RuntimeDyld::getSymbolAddress(StringRef Name) { +void *RuntimeDyld::getSymbolAddress(StringRef Name) { return Dyld->getSymbolAddress(Name); } -sys::MemoryBlock RuntimeDyld::getMemoryBlock() { - return Dyld->getMemoryBlock(); +void RuntimeDyld::resolveRelocations() { + Dyld->resolveRelocations(); +} + +void RuntimeDyld::reassignSymbolAddress(StringRef Name, uint8_t *Addr) { + Dyld->reassignSymbolAddress(Name, Addr); } StringRef RuntimeDyld::getErrorString() { diff --git a/lib/ExecutionEngine/JIT/TargetSelect.cpp b/lib/ExecutionEngine/TargetSelect.cpp index 8d92ab0..a8822e5 100644 --- a/lib/ExecutionEngine/JIT/TargetSelect.cpp +++ b/lib/ExecutionEngine/TargetSelect.cpp @@ -13,7 +13,7 @@ // //===----------------------------------------------------------------------===// -#include "JIT.h" +#include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/Module.h" #include "llvm/ADT/Triple.h" #include "llvm/Support/CommandLine.h" @@ -26,11 +26,11 @@ using namespace llvm; /// selectTarget - Pick a target either via -march or by guessing the native /// arch. Add any CPU features specified via -mcpu or -mattr. -TargetMachine *JIT::selectTarget(Module *Mod, - StringRef MArch, - StringRef MCPU, - const SmallVectorImpl<std::string>& MAttrs, - std::string *ErrorStr) { +TargetMachine *EngineBuilder::selectTarget(Module *Mod, + StringRef MArch, + StringRef MCPU, + const SmallVectorImpl<std::string>& MAttrs, + std::string *ErrorStr) { Triple TheTriple(Mod->getTargetTriple()); if (TheTriple.getTriple().empty()) TheTriple.setTriple(sys::getHostTriple()); diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt index 6aed059..a77ecd3 100644 --- a/lib/MC/CMakeLists.txt +++ b/lib/MC/CMakeLists.txt @@ -30,6 +30,7 @@ add_llvm_library(LLVMMC MCStreamer.cpp MCSymbol.cpp MCValue.cpp + MCWin64EH.cpp MachObjectWriter.cpp WinCOFFStreamer.cpp WinCOFFObjectWriter.cpp diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp index b7d30cd..59e1b8e 100644 --- a/lib/MC/ELFObjectWriter.cpp +++ b/lib/MC/ELFObjectWriter.cpp @@ -25,6 +25,8 @@ #include "llvm/Support/ELF.h" #include "llvm/Target/TargetAsmBackend.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/ADT/Statistic.h" #include "../Target/X86/X86FixupKinds.h" #include "../Target/ARM/ARMFixupKinds.h" @@ -32,6 +34,9 @@ #include <vector> using namespace llvm; +#undef DEBUG_TYPE +#define DEBUG_TYPE "reloc-info" + bool ELFObjectWriter::isFixupKindPCRel(const MCAssembler &Asm, unsigned Kind) { const MCFixupKindInfo &FKI = Asm.getBackend().getFixupKindInfo((MCFixupKind) Kind); @@ -46,6 +51,7 @@ bool ELFObjectWriter::RelocNeedsGOT(MCSymbolRefExpr::VariantKind Variant) { case MCSymbolRefExpr::VK_GOT: case MCSymbolRefExpr::VK_PLT: case MCSymbolRefExpr::VK_GOTPCREL: + case MCSymbolRefExpr::VK_GOTOFF: case MCSymbolRefExpr::VK_TPOFF: case MCSymbolRefExpr::VK_TLSGD: case MCSymbolRefExpr::VK_GOTTPOFF: @@ -181,8 +187,13 @@ uint64_t ELFObjectWriter::SymbolValue(MCSymbolData &Data, if (!Symbol.isInSection()) return 0; - if (Data.getFragment()) - return Layout.getSymbolOffset(&Data); + + if (Data.getFragment()) { + if 
(Data.getFlags() & ELF_Other_ThumbFunc) + return Layout.getSymbolOffset(&Data)+1; + else + return Layout.getSymbolOffset(&Data); + } return 0; } @@ -319,7 +330,9 @@ void ELFObjectWriter::WriteSymbolTable(MCDataFragment *SymtabF, const MCSymbol *ELFObjectWriter::SymbolToReloc(const MCAssembler &Asm, const MCValue &Target, - const MCFragment &F) const { + const MCFragment &F, + const MCFixup &Fixup, + bool IsPCRel) const { const MCSymbol &Symbol = Target.getSymA()->getSymbol(); const MCSymbol &ASymbol = Symbol.AliasedSymbol(); const MCSymbol *Renamed = Renames.lookup(&Symbol); @@ -342,7 +355,7 @@ const MCSymbol *ELFObjectWriter::SymbolToReloc(const MCAssembler &Asm, const SectionKind secKind = Section.getKind(); if (secKind.isBSS()) - return ExplicitRelSym(Asm, Target, F, true); + return ExplicitRelSym(Asm, Target, F, Fixup, IsPCRel); if (secKind.isThreadLocal()) { if (Renamed) @@ -365,13 +378,14 @@ const MCSymbol *ELFObjectWriter::SymbolToReloc(const MCAssembler &Asm, if (Section.getFlags() & ELF::SHF_MERGE) { if (Target.getConstant() == 0) - return NULL; + return ExplicitRelSym(Asm, Target, F, Fixup, IsPCRel); if (Renamed) return Renamed; return &Symbol; } - return ExplicitRelSym(Asm, Target, F, false); + return ExplicitRelSym(Asm, Target, F, Fixup, IsPCRel); + } @@ -390,7 +404,7 @@ void ELFObjectWriter::RecordRelocation(const MCAssembler &Asm, if (!Target.isAbsolute()) { const MCSymbol &Symbol = Target.getSymA()->getSymbol(); const MCSymbol &ASymbol = Symbol.AliasedSymbol(); - RelocSymbol = SymbolToReloc(Asm, Target, *Fragment); + RelocSymbol = SymbolToReloc(Asm, Target, *Fragment, Fixup, IsPCRel); if (const MCSymbolRefExpr *RefB = Target.getSymB()) { const MCSymbol &SymbolB = RefB->getSymbol(); @@ -532,6 +546,7 @@ void ELFObjectWriter::ComputeSymbolTable(MCAssembler &Asm, RevGroupMapTy RevGroupMap, unsigned NumRegularSections) { // FIXME: Is this the correct place to do this? + // FIXME: Why is an undefined reference to _GLOBAL_OFFSET_TABLE_ needed? if (NeedsGOT) { llvm::StringRef Name = "_GLOBAL_OFFSET_TABLE_"; MCSymbol *Sym = Asm.getContext().GetOrCreateSymbol(Name); @@ -1261,32 +1276,93 @@ void ARMELFObjectWriter::WriteEFlags() { // In ARM, _MergedGlobals and other most symbols get emitted directly. // I.e. not as an offset to a section symbol. -// This code is a first-cut approximation of what ARM/gcc does. +// This code is an approximation of what ARM/gcc does. + +STATISTIC(PCRelCount, "Total number of PIC Relocations"); +STATISTIC(NonPCRelCount, "Total number of non-PIC relocations"); const MCSymbol *ARMELFObjectWriter::ExplicitRelSym(const MCAssembler &Asm, const MCValue &Target, const MCFragment &F, - bool IsBSS) const { + const MCFixup &Fixup, + bool IsPCRel) const { const MCSymbol &Symbol = Target.getSymA()->getSymbol(); bool EmitThisSym = false; - if (IsBSS) { - EmitThisSym = StringSwitch<bool>(Symbol.getName()) - .Case("_MergedGlobals", true) - .Default(false); + const MCSectionELF &Section = + static_cast<const MCSectionELF&>(Symbol.getSection()); + bool InNormalSection = true; + unsigned RelocType = 0; + RelocType = GetRelocTypeInner(Target, Fixup, IsPCRel); + + DEBUG( + const MCSymbolRefExpr::VariantKind Kind = Target.getSymA()->getKind(); + MCSymbolRefExpr::VariantKind Kind2; + Kind2 = Target.getSymB() ? 
Target.getSymB()->getKind() : + MCSymbolRefExpr::VK_None; + dbgs() << "considering symbol " + << Section.getSectionName() << "/" + << Symbol.getName() << "/" + << " Rel:" << (unsigned)RelocType + << " Kind: " << (int)Kind << "/" << (int)Kind2 + << " Tmp:" + << Symbol.isAbsolute() << "/" << Symbol.isDefined() << "/" + << Symbol.isVariable() << "/" << Symbol.isTemporary() + << " Counts:" << PCRelCount << "/" << NonPCRelCount << "\n"); + + if (IsPCRel) { ++PCRelCount; + switch (RelocType) { + default: + // Most relocation types are emitted as explicit symbols + InNormalSection = + StringSwitch<bool>(Section.getSectionName()) + .Case(".data.rel.ro.local", false) + .Case(".data.rel", false) + .Case(".bss", false) + .Default(true); + EmitThisSym = true; + break; + case ELF::R_ARM_ABS32: + // But things get strange with R_ARM_ABS32 + // In this case, most things that go in .rodata show up + // as section relative relocations + InNormalSection = + StringSwitch<bool>(Section.getSectionName()) + .Case(".data.rel.ro.local", false) + .Case(".data.rel", false) + .Case(".rodata", false) + .Case(".bss", false) + .Default(true); + EmitThisSym = false; + break; + } } else { - EmitThisSym = StringSwitch<bool>(Symbol.getName()) - .Case("_MergedGlobals", true) - .StartsWith(".L.str", true) - .Default(false); + NonPCRelCount++; + InNormalSection = + StringSwitch<bool>(Section.getSectionName()) + .Case(".data.rel.ro.local", false) + .Case(".rodata", false) + .Case(".data.rel", false) + .Case(".bss", false) + .Default(true); + + switch (RelocType) { + default: EmitThisSym = true; break; + case ELF::R_ARM_ABS32: EmitThisSym = false; break; + } } + if (EmitThisSym) return &Symbol; - if (! Symbol.isTemporary()) + if (! Symbol.isTemporary() && InNormalSection) { return &Symbol; + } return NULL; } +// Need to examine the Fixup when determining whether to +// emit the relocation as an explicit symbol or as a section relative +// offset unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel, @@ -1295,6 +1371,20 @@ unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target, MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ? MCSymbolRefExpr::VK_None : Target.getSymA()->getKind(); + unsigned Type = GetRelocTypeInner(Target, Fixup, IsPCRel); + + if (RelocNeedsGOT(Modifier)) + NeedsGOT = true; + + return Type; +} + +unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { + MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ? 
+ MCSymbolRefExpr::VK_None : Target.getSymA()->getKind(); + unsigned Type = 0; if (IsPCRel) { switch ((unsigned)Fixup.getKind()) { @@ -1303,7 +1393,7 @@ unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target, switch (Modifier) { default: llvm_unreachable("Unsupported Modifier"); case MCSymbolRefExpr::VK_None: - Type = ELF::R_ARM_BASE_PREL; + Type = ELF::R_ARM_REL32; break; case MCSymbolRefExpr::VK_ARM_TLSGD: assert(0 && "unimplemented"); @@ -1342,6 +1432,17 @@ unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target, case ARM::fixup_t2_movw_lo16_pcrel: Type = ELF::R_ARM_THM_MOVW_PREL_NC; break; + case ARM::fixup_arm_thumb_bl: + case ARM::fixup_arm_thumb_blx: + switch (Modifier) { + case MCSymbolRefExpr::VK_ARM_PLT: + Type = ELF::R_ARM_THM_CALL; + break; + default: + Type = ELF::R_ARM_NONE; + break; + } + break; } } else { switch ((unsigned)Fixup.getKind()) { @@ -1399,9 +1500,6 @@ unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target, } } - if (RelocNeedsGOT(Modifier)) - NeedsGOT = true; - return Type; } @@ -1476,13 +1574,17 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, if (IsPCRel) { switch ((unsigned)Fixup.getKind()) { default: llvm_unreachable("invalid fixup kind!"); + + case FK_Data_8: Type = ELF::R_X86_64_PC64; break; + case FK_Data_4: Type = ELF::R_X86_64_PC32; break; + case FK_Data_2: Type = ELF::R_X86_64_PC16; break; + case FK_PCRel_8: assert(Modifier == MCSymbolRefExpr::VK_None); Type = ELF::R_X86_64_PC64; break; case X86::reloc_signed_4byte: case X86::reloc_riprel_4byte_movq_load: - case FK_Data_4: // FIXME? case X86::reloc_riprel_4byte: case FK_PCRel_4: switch (Modifier) { @@ -1609,6 +1711,9 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, case MCSymbolRefExpr::VK_DTPOFF: Type = ELF::R_386_TLS_LDO_32; break; + case MCSymbolRefExpr::VK_GOTTPOFF: + Type = ELF::R_386_TLS_IE_32; + break; } break; case FK_Data_2: Type = ELF::R_386_16; break; diff --git a/lib/MC/ELFObjectWriter.h b/lib/MC/ELFObjectWriter.h index f1d514a..7593099 100644 --- a/lib/MC/ELFObjectWriter.h +++ b/lib/MC/ELFObjectWriter.h @@ -140,15 +140,18 @@ class ELFObjectWriter : public MCObjectWriter { unsigned ShstrtabIndex; - const MCSymbol *SymbolToReloc(const MCAssembler &Asm, - const MCValue &Target, - const MCFragment &F) const; + virtual const MCSymbol *SymbolToReloc(const MCAssembler &Asm, + const MCValue &Target, + const MCFragment &F, + const MCFixup &Fixup, + bool IsPCRel) const; // For arch-specific emission of explicit reloc symbol virtual const MCSymbol *ExplicitRelSym(const MCAssembler &Asm, const MCValue &Target, const MCFragment &F, - bool IsBSS) const { + const MCFixup &Fixup, + bool IsPCRel) const { return NULL; } @@ -380,11 +383,16 @@ class ELFObjectWriter : public MCObjectWriter { virtual const MCSymbol *ExplicitRelSym(const MCAssembler &Asm, const MCValue &Target, const MCFragment &F, - bool IsBSS) const; + const MCFixup &Fixup, + bool IsPCRel) const; virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel, bool IsRelocWithSymbol, int64_t Addend); + private: + unsigned GetRelocTypeInner(const MCValue &Target, + const MCFixup &Fixup, bool IsPCRel) const; + }; //===- MBlazeELFObjectWriter -------------------------------------------===// diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp index 116c007..73b259e 100644 --- a/lib/MC/MCAsmInfo.cpp +++ b/lib/MC/MCAsmInfo.cpp @@ -13,7 +13,11 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCAsmInfo.h" 
+#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCStreamer.h" #include "llvm/Support/DataTypes.h" +#include "llvm/Support/Dwarf.h" #include <cctype> #include <cstring> using namespace llvm; @@ -70,9 +74,8 @@ MCAsmInfo::MCAsmInfo() { HasLEB128 = false; SupportsDebugInformation = false; ExceptionsType = ExceptionHandling::None; - DwarfRequiresFrameSection = true; DwarfUsesInlineInfoSection = false; - DwarfUsesAbsoluteLabelForStmtList = true; + DwarfRequiresRelocationForSectionOffset = true; DwarfSectionOffsetDirective = 0; DwarfUsesLabelOffsetForRanges = true; HasMicrosoftFastStdCallMangling = false; @@ -106,3 +109,25 @@ unsigned MCAsmInfo::getSLEB128Size(int Value) { } while (IsMore); return Size; } + +const MCExpr * +MCAsmInfo::getExprForPersonalitySymbol(const MCSymbol *Sym, + unsigned Encoding, + MCStreamer &Streamer) const { + return getExprForFDESymbol(Sym, Encoding, Streamer); +} + +const MCExpr * +MCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym, + unsigned Encoding, + MCStreamer &Streamer) const { + if (!(Encoding & dwarf::DW_EH_PE_pcrel)) + return MCSymbolRefExpr::Create(Sym, Streamer.getContext()); + + MCContext &Context = Streamer.getContext(); + const MCExpr *Res = MCSymbolRefExpr::Create(Sym, Context); + MCSymbol *PCSym = Context.CreateTempSymbol(); + Streamer.EmitLabel(PCSym); + const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, Context); + return MCBinaryExpr::CreateSub(Res, PC, Context); +} diff --git a/lib/MC/MCAsmInfoDarwin.cpp b/lib/MC/MCAsmInfoDarwin.cpp index 526ad0d..5851cb0 100644 --- a/lib/MC/MCAsmInfoDarwin.cpp +++ b/lib/MC/MCAsmInfoDarwin.cpp @@ -13,6 +13,9 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCAsmInfoDarwin.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCStreamer.h" using namespace llvm; MCAsmInfoDarwin::MCAsmInfoDarwin() { @@ -53,7 +56,6 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() { HasNoDeadStrip = true; HasSymbolResolver = true; - DwarfUsesAbsoluteLabelForStmtList = false; + DwarfRequiresRelocationForSectionOffset = false; DwarfUsesLabelOffsetForRanges = false; } - diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp index c0fcce7..e8b09fc 100644 --- a/lib/MC/MCAsmStreamer.cpp +++ b/lib/MC/MCAsmStreamer.cpp @@ -45,20 +45,27 @@ class MCAsmStreamer : public MCStreamer { unsigned IsVerboseAsm : 1; unsigned ShowInst : 1; unsigned UseLoc : 1; + unsigned UseCFI : 1; + + enum EHSymbolFlags { EHGlobal = 1, + EHWeakDefinition = 1 << 1, + EHPrivateExtern = 1 << 2 }; + DenseMap<const MCSymbol*, unsigned> FlagMap; bool needsSet(const MCExpr *Value); + void EmitRegisterName(int64_t Register); + public: MCAsmStreamer(MCContext &Context, formatted_raw_ostream &os, - bool isVerboseAsm, - bool useLoc, + bool isVerboseAsm, bool useLoc, bool useCFI, MCInstPrinter *printer, MCCodeEmitter *emitter, TargetAsmBackend *asmbackend, bool showInst) : MCStreamer(Context), OS(os), MAI(Context.getAsmInfo()), InstPrinter(printer), Emitter(emitter), AsmBackend(asmbackend), CommentStream(CommentToEmit), IsVerboseAsm(isVerboseAsm), - ShowInst(showInst), UseLoc(useLoc) { + ShowInst(showInst), UseLoc(useLoc), UseCFI(useCFI) { if (InstPrinter && IsVerboseAsm) InstPrinter->setCommentStream(CommentStream); } @@ -118,7 +125,8 @@ public: } virtual void EmitLabel(MCSymbol *Symbol); - + virtual void EmitEHSymAttributes(const MCSymbol *Symbol, + MCSymbol *EHSymbol); virtual void EmitAssemblerFlag(MCAssemblerFlag Flag); virtual void EmitThumbFunc(MCSymbol *Func); @@ 
-127,6 +135,8 @@ public: virtual void EmitDwarfAdvanceLineAddr(int64_t LineDelta, const MCSymbol *LastLabel, const MCSymbol *Label); + virtual void EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel, + const MCSymbol *Label); virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute); @@ -154,13 +164,13 @@ public: virtual void EmitBytes(StringRef Data, unsigned AddrSpace); virtual void EmitValueImpl(const MCExpr *Value, unsigned Size, - bool isPCRel, unsigned AddrSpace); + unsigned AddrSpace); virtual void EmitIntValue(uint64_t Value, unsigned Size, unsigned AddrSpace = 0); - virtual void EmitULEB128Value(const MCExpr *Value, unsigned AddrSpace = 0); + virtual void EmitULEB128Value(const MCExpr *Value); - virtual void EmitSLEB128Value(const MCExpr *Value, unsigned AddrSpace = 0); + virtual void EmitSLEB128Value(const MCExpr *Value); virtual void EmitGPRel32Value(const MCExpr *Value); @@ -182,15 +192,38 @@ public: virtual bool EmitDwarfFileDirective(unsigned FileNo, StringRef Filename); virtual void EmitDwarfLocDirective(unsigned FileNo, unsigned Line, unsigned Column, unsigned Flags, - unsigned Isa, unsigned Discriminator); - - virtual bool EmitCFIStartProc(); - virtual bool EmitCFIEndProc(); - virtual bool EmitCFIDefCfaOffset(int64_t Offset); - virtual bool EmitCFIDefCfaRegister(int64_t Register); - virtual bool EmitCFIOffset(int64_t Register, int64_t Offset); - virtual bool EmitCFIPersonality(const MCSymbol *Sym, unsigned Encoding); - virtual bool EmitCFILsda(const MCSymbol *Sym, unsigned Encoding); + unsigned Isa, unsigned Discriminator, + StringRef FileName); + + virtual void EmitCFISections(bool EH, bool Debug); + virtual void EmitCFIStartProc(); + virtual void EmitCFIEndProc(); + virtual void EmitCFIDefCfa(int64_t Register, int64_t Offset); + virtual void EmitCFIDefCfaOffset(int64_t Offset); + virtual void EmitCFIDefCfaRegister(int64_t Register); + virtual void EmitCFIOffset(int64_t Register, int64_t Offset); + virtual void EmitCFIPersonality(const MCSymbol *Sym, unsigned Encoding); + virtual void EmitCFILsda(const MCSymbol *Sym, unsigned Encoding); + virtual void EmitCFIRememberState(); + virtual void EmitCFIRestoreState(); + virtual void EmitCFISameValue(int64_t Register); + virtual void EmitCFIRelOffset(int64_t Register, int64_t Offset); + virtual void EmitCFIAdjustCfaOffset(int64_t Adjustment); + + virtual void EmitWin64EHStartProc(const MCSymbol *Symbol); + virtual void EmitWin64EHEndProc(); + virtual void EmitWin64EHStartChained(); + virtual void EmitWin64EHEndChained(); + virtual void EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind, + bool Except); + virtual void EmitWin64EHHandlerData(); + virtual void EmitWin64EHPushReg(unsigned Register); + virtual void EmitWin64EHSetFrame(unsigned Register, unsigned Offset); + virtual void EmitWin64EHAllocStack(unsigned Size); + virtual void EmitWin64EHSaveReg(unsigned Register, unsigned Offset); + virtual void EmitWin64EHSaveXMM(unsigned Register, unsigned Offset); + virtual void EmitWin64EHPushFrame(bool Code); + virtual void EmitWin64EHEndProlog(); virtual void EmitFnStart(); virtual void EmitFnEnd(); @@ -269,14 +302,27 @@ void MCAsmStreamer::ChangeSection(const MCSection *Section) { Section->PrintSwitchToSection(MAI, OS); } +void MCAsmStreamer::EmitEHSymAttributes(const MCSymbol *Symbol, + MCSymbol *EHSymbol) { + if (UseCFI) + return; + + unsigned Flags = FlagMap.lookup(Symbol); + + if (Flags & EHGlobal) + EmitSymbolAttribute(EHSymbol, MCSA_Global); + if (Flags & EHWeakDefinition) + EmitSymbolAttribute(EHSymbol, 
MCSA_WeakDefinition); + if (Flags & EHPrivateExtern) + EmitSymbolAttribute(EHSymbol, MCSA_PrivateExtern); +} + void MCAsmStreamer::EmitLabel(MCSymbol *Symbol) { assert(Symbol->isUndefined() && "Cannot define a symbol twice!"); - assert(!Symbol->isVariable() && "Cannot emit a variable symbol!"); - assert(getCurrentSection() && "Cannot emit before setting section!"); + MCStreamer::EmitLabel(Symbol); OS << *Symbol << MAI.getLabelSuffix(); EmitEOL(); - Symbol->setSection(*getCurrentSection()); } void MCAsmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { @@ -294,7 +340,8 @@ void MCAsmStreamer::EmitThumbFunc(MCSymbol *Func) { // This needs to emit to a temporary string to get properly quoted // MCSymbols when they have spaces in them. OS << "\t.thumb_func"; - if (Func) + // Only Mach-O hasSubsectionsViaSymbols() + if (MAI.hasSubsectionsViaSymbols()) OS << '\t' << *Func; EmitEOL(); } @@ -319,6 +366,15 @@ void MCAsmStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta, getContext().getTargetAsmInfo().getPointerSize()); } +void MCAsmStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel, + const MCSymbol *Label) { + EmitIntValue(dwarf::DW_CFA_advance_loc4, 1); + const MCExpr *AddrDelta = BuildSymbolDiff(getContext(), Label, LastLabel); + AddrDelta = ForceExpAbs(AddrDelta); + EmitValue(AddrDelta, 4); +} + + void MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) { switch (Attribute) { @@ -347,6 +403,7 @@ void MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol, return; case MCSA_Global: // .globl/.global OS << MAI.getGlobalDirective(); + FlagMap[Symbol] |= EHGlobal; break; case MCSA_Hidden: OS << "\t.hidden\t"; break; case MCSA_IndirectSymbol: OS << "\t.indirect_symbol\t"; break; @@ -355,11 +412,17 @@ void MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol, case MCSA_Local: OS << "\t.local\t"; break; case MCSA_NoDeadStrip: OS << "\t.no_dead_strip\t"; break; case MCSA_SymbolResolver: OS << "\t.symbol_resolver\t"; break; - case MCSA_PrivateExtern: OS << "\t.private_extern\t"; break; + case MCSA_PrivateExtern: + OS << "\t.private_extern\t"; + FlagMap[Symbol] |= EHPrivateExtern; + break; case MCSA_Protected: OS << "\t.protected\t"; break; case MCSA_Reference: OS << "\t.reference\t"; break; case MCSA_Weak: OS << "\t.weak\t"; break; - case MCSA_WeakDefinition: OS << "\t.weak_definition\t"; break; + case MCSA_WeakDefinition: + OS << "\t.weak_definition\t"; + FlagMap[Symbol] |= EHWeakDefinition; + break; // .weak_reference case MCSA_WeakReference: OS << MAI.getWeakRefDirective(); break; case MCSA_WeakDefAutoPrivate: OS << "\t.weak_def_can_be_hidden\t"; break; @@ -522,9 +585,8 @@ void MCAsmStreamer::EmitIntValue(uint64_t Value, unsigned Size, } void MCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, - bool isPCRel, unsigned AddrSpace) { + unsigned AddrSpace) { assert(getCurrentSection() && "Cannot emit contents before setting section!"); - assert(!isPCRel && "Cannot emit pc relative relocations!"); const char *Directive = 0; switch (Size) { default: break; @@ -553,10 +615,10 @@ void MCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, EmitEOL(); } -void MCAsmStreamer::EmitULEB128Value(const MCExpr *Value, unsigned AddrSpace) { +void MCAsmStreamer::EmitULEB128Value(const MCExpr *Value) { int64_t IntValue; if (Value->EvaluateAsAbsolute(IntValue)) { - EmitULEB128IntValue(IntValue, AddrSpace); + EmitULEB128IntValue(IntValue); return; } assert(MAI.hasLEB128() && "Cannot print a .uleb"); @@ -564,10 +626,10 @@ void MCAsmStreamer::EmitULEB128Value(const 
MCExpr *Value, unsigned AddrSpace) { EmitEOL(); } -void MCAsmStreamer::EmitSLEB128Value(const MCExpr *Value, unsigned AddrSpace) { +void MCAsmStreamer::EmitSLEB128Value(const MCExpr *Value) { int64_t IntValue; if (Value->EvaluateAsAbsolute(IntValue)) { - EmitSLEB128IntValue(IntValue, AddrSpace); + EmitSLEB128IntValue(IntValue); return; } assert(MAI.hasLEB128() && "Cannot print a .sleb"); @@ -683,9 +745,10 @@ bool MCAsmStreamer::EmitDwarfFileDirective(unsigned FileNo, StringRef Filename){ void MCAsmStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line, unsigned Column, unsigned Flags, unsigned Isa, - unsigned Discriminator) { + unsigned Discriminator, + StringRef FileName) { this->MCStreamer::EmitDwarfLocDirective(FileNo, Line, Column, Flags, - Isa, Discriminator); + Isa, Discriminator, FileName); if (!UseLoc) return; @@ -711,78 +774,289 @@ void MCAsmStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line, OS << "isa " << Isa; if (Discriminator) OS << "discriminator " << Discriminator; + + if (IsVerboseAsm) { + OS.PadToColumn(MAI.getCommentColumn()); + OS << MAI.getCommentString() << ' ' << FileName << ':' + << Line << ':' << Column; + } EmitEOL(); } -bool MCAsmStreamer::EmitCFIStartProc() { - if (this->MCStreamer::EmitCFIStartProc()) - return true; +void MCAsmStreamer::EmitCFISections(bool EH, bool Debug) { + MCStreamer::EmitCFISections(EH, Debug); + + if (!UseCFI) + return; + + OS << "\t.cfi_sections "; + if (EH) { + OS << ".eh_frame"; + if (Debug) + OS << ", .debug_frame"; + } else if (Debug) { + OS << ".debug_frame"; + } - OS << "\t.cfi_startproc"; EmitEOL(); +} + +void MCAsmStreamer::EmitCFIStartProc() { + MCStreamer::EmitCFIStartProc(); + + if (!UseCFI) + return; - return false; + OS << "\t.cfi_startproc"; + EmitEOL(); } -bool MCAsmStreamer::EmitCFIEndProc() { - if (this->MCStreamer::EmitCFIEndProc()) - return true; +void MCAsmStreamer::EmitCFIEndProc() { + MCStreamer::EmitCFIEndProc(); + + if (!UseCFI) + return; OS << "\t.cfi_endproc"; EmitEOL(); +} + +void MCAsmStreamer::EmitRegisterName(int64_t Register) { + if (InstPrinter) { + const TargetAsmInfo &asmInfo = getContext().getTargetAsmInfo(); + unsigned LLVMRegister = asmInfo.getLLVMRegNum(Register, true); + InstPrinter->printRegName(OS, LLVMRegister); + } else { + OS << Register; + } +} + +void MCAsmStreamer::EmitCFIDefCfa(int64_t Register, int64_t Offset) { + MCStreamer::EmitCFIDefCfa(Register, Offset); - return false; + if (!UseCFI) + return; + + OS << "\t.cfi_def_cfa "; + EmitRegisterName(Register); + OS << ", " << Offset; + EmitEOL(); } -bool MCAsmStreamer::EmitCFIDefCfaOffset(int64_t Offset) { - if (this->MCStreamer::EmitCFIDefCfaOffset(Offset)) - return true; +void MCAsmStreamer::EmitCFIDefCfaOffset(int64_t Offset) { + MCStreamer::EmitCFIDefCfaOffset(Offset); + + if (!UseCFI) + return; OS << "\t.cfi_def_cfa_offset " << Offset; EmitEOL(); +} + +void MCAsmStreamer::EmitCFIDefCfaRegister(int64_t Register) { + MCStreamer::EmitCFIDefCfaRegister(Register); - return false; + if (!UseCFI) + return; + + OS << "\t.cfi_def_cfa_register "; + EmitRegisterName(Register); + EmitEOL(); } -bool MCAsmStreamer::EmitCFIDefCfaRegister(int64_t Register) { - if (this->MCStreamer::EmitCFIDefCfaRegister(Register)) - return true; +void MCAsmStreamer::EmitCFIOffset(int64_t Register, int64_t Offset) { + this->MCStreamer::EmitCFIOffset(Register, Offset); + + if (!UseCFI) + return; - OS << "\t.cfi_def_cfa_register " << Register; + OS << "\t.cfi_offset "; + EmitRegisterName(Register); + OS << ", " << Offset; EmitEOL(); +} + +void 
MCAsmStreamer::EmitCFIPersonality(const MCSymbol *Sym, + unsigned Encoding) { + MCStreamer::EmitCFIPersonality(Sym, Encoding); - return false; + if (!UseCFI) + return; + + OS << "\t.cfi_personality " << Encoding << ", " << *Sym; + EmitEOL(); } -bool MCAsmStreamer::EmitCFIOffset(int64_t Register, int64_t Offset) { - if (this->MCStreamer::EmitCFIOffset(Register, Offset)) - return true; +void MCAsmStreamer::EmitCFILsda(const MCSymbol *Sym, unsigned Encoding) { + MCStreamer::EmitCFILsda(Sym, Encoding); - OS << "\t.cfi_offset " << Register << ", " << Offset; + if (!UseCFI) + return; + + OS << "\t.cfi_lsda " << Encoding << ", " << *Sym; EmitEOL(); +} + +void MCAsmStreamer::EmitCFIRememberState() { + MCStreamer::EmitCFIRememberState(); - return false; + if (!UseCFI) + return; + + OS << "\t.cfi_remember_state"; + EmitEOL(); } -bool MCAsmStreamer::EmitCFIPersonality(const MCSymbol *Sym, - unsigned Encoding) { - if (this->MCStreamer::EmitCFIPersonality(Sym, Encoding)) - return true; +void MCAsmStreamer::EmitCFIRestoreState() { + MCStreamer::EmitCFIRestoreState(); - OS << "\t.cfi_personality " << Encoding << ", " << *Sym; + if (!UseCFI) + return; + + OS << "\t.cfi_restore_state"; EmitEOL(); +} - return false; +void MCAsmStreamer::EmitCFISameValue(int64_t Register) { + MCStreamer::EmitCFISameValue(Register); + + if (!UseCFI) + return; + + OS << "\t.cfi_same_value "; + EmitRegisterName(Register); + EmitEOL(); } -bool MCAsmStreamer::EmitCFILsda(const MCSymbol *Sym, unsigned Encoding) { - if (this->MCStreamer::EmitCFILsda(Sym, Encoding)) - return true; +void MCAsmStreamer::EmitCFIRelOffset(int64_t Register, int64_t Offset) { + MCStreamer::EmitCFIRelOffset(Register, Offset); - OS << "\t.cfi_lsda " << Encoding << ", " << *Sym; + if (!UseCFI) + return; + + OS << "\t.cfi_rel_offset "; + EmitRegisterName(Register); + OS << ", " << Offset; + EmitEOL(); +} + +void MCAsmStreamer::EmitCFIAdjustCfaOffset(int64_t Adjustment) { + MCStreamer::EmitCFIAdjustCfaOffset(Adjustment); + + if (!UseCFI) + return; + + OS << "\t.cfi_adjust_cfa_offset " << Adjustment; + EmitEOL(); +} + +void MCAsmStreamer::EmitWin64EHStartProc(const MCSymbol *Symbol) { + MCStreamer::EmitWin64EHStartProc(Symbol); + + OS << ".seh_proc " << *Symbol; EmitEOL(); +} + +void MCAsmStreamer::EmitWin64EHEndProc() { + MCStreamer::EmitWin64EHEndProc(); - return false; + OS << "\t.seh_endproc"; + EmitEOL(); +} + +void MCAsmStreamer::EmitWin64EHStartChained() { + MCStreamer::EmitWin64EHStartChained(); + + OS << "\t.seh_startchained"; + EmitEOL(); +} + +void MCAsmStreamer::EmitWin64EHEndChained() { + MCStreamer::EmitWin64EHEndChained(); + + OS << "\t.seh_endchained"; + EmitEOL(); +} + +void MCAsmStreamer::EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind, + bool Except) { + MCStreamer::EmitWin64EHHandler(Sym, Unwind, Except); + + OS << "\t.seh_handler " << *Sym; + if (Unwind) + OS << ", @unwind"; + if (Except) + OS << ", @except"; + EmitEOL(); +} + +void MCAsmStreamer::EmitWin64EHHandlerData() { + MCStreamer::EmitWin64EHHandlerData(); + + // Switch sections. Don't call SwitchSection directly, because that will + // cause the section switch to be visible in the emitted assembly. + // We only do this so the section switch that terminates the handler + // data block is visible. 
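Every one of the .cfi_* overrides above follows the same shape: first let the base MCStreamer record the directive in its frame bookkeeping, then bail out unless textual CFI was requested, and only then print. A toy pair of classes makes the pattern explicit (invented names; only the UseCFI gating is taken from the patch):

#include <iostream>

// Record-then-maybe-print: the base class keeps the machine-readable frame
// state either way; the textual directive is optional output on top.
struct BaseStreamer {
  virtual ~BaseStreamer() {}
  virtual void EmitCFIDefCfaOffset(long Offset) { (void)Offset; /* record */ }
};

struct AsmStreamer : public BaseStreamer {
  bool UseCFI;
  explicit AsmStreamer(bool useCFI) : UseCFI(useCFI) {}
  virtual void EmitCFIDefCfaOffset(long Offset) {
    BaseStreamer::EmitCFIDefCfaOffset(Offset); // always record
    if (!UseCFI)
      return; // frames will be synthesized at the end instead
    std::cout << "\t.cfi_def_cfa_offset " << Offset << "\n";
  }
};

Keeping the recording unconditional is what lets Finish() emit binary frame contents when the textual directives are suppressed.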
+ MCWin64EHUnwindInfo *CurFrame = getCurrentW64UnwindInfo(); + StringRef suffix=MCWin64EHUnwindEmitter::GetSectionSuffix(CurFrame->Function); + const MCSection *xdataSect = + getContext().getTargetAsmInfo().getWin64EHTableSection(suffix); + if (xdataSect) + SwitchSectionNoChange(xdataSect); + + OS << "\t.seh_handlerdata"; + EmitEOL(); +} + +void MCAsmStreamer::EmitWin64EHPushReg(unsigned Register) { + MCStreamer::EmitWin64EHPushReg(Register); + + OS << "\t.seh_pushreg " << Register; + EmitEOL(); +} + +void MCAsmStreamer::EmitWin64EHSetFrame(unsigned Register, unsigned Offset) { + MCStreamer::EmitWin64EHSetFrame(Register, Offset); + + OS << "\t.seh_setframe " << Register << ", " << Offset; + EmitEOL(); +} + +void MCAsmStreamer::EmitWin64EHAllocStack(unsigned Size) { + MCStreamer::EmitWin64EHAllocStack(Size); + + OS << "\t.seh_stackalloc " << Size; + EmitEOL(); +} + +void MCAsmStreamer::EmitWin64EHSaveReg(unsigned Register, unsigned Offset) { + MCStreamer::EmitWin64EHSaveReg(Register, Offset); + + OS << "\t.seh_savereg " << Register << ", " << Offset; + EmitEOL(); +} + +void MCAsmStreamer::EmitWin64EHSaveXMM(unsigned Register, unsigned Offset) { + MCStreamer::EmitWin64EHSaveXMM(Register, Offset); + + OS << "\t.seh_savexmm " << Register << ", " << Offset; + EmitEOL(); +} + +void MCAsmStreamer::EmitWin64EHPushFrame(bool Code) { + MCStreamer::EmitWin64EHPushFrame(Code); + + OS << "\t.seh_pushframe"; + if (Code) + OS << " @code"; + EmitEOL(); +} + +void MCAsmStreamer::EmitWin64EHEndProlog(void) { + MCStreamer::EmitWin64EHEndProlog(); + + OS << "\t.seh_endprologue"; + EmitEOL(); } void MCAsmStreamer::AddEncodingComment(const MCInst &Inst) { @@ -895,8 +1169,10 @@ void MCAsmStreamer::EmitPersonality(const MCSymbol *Personality) { } void MCAsmStreamer::EmitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset) { - OS << "\t.setfp\t" << InstPrinter->getRegName(FpReg) - << ", " << InstPrinter->getRegName(SpReg); + OS << "\t.setfp\t"; + InstPrinter->printRegName(OS, FpReg); + OS << ", "; + InstPrinter->printRegName(OS, SpReg); if (Offset) OS << ", #" << Offset; EmitEOL(); @@ -915,10 +1191,12 @@ void MCAsmStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList, else OS << "\t.save\t{"; - OS << InstPrinter->getRegName(RegList[0]); + InstPrinter->printRegName(OS, RegList[0]); - for (unsigned i = 1, e = RegList.size(); i != e; ++i) - OS << ", " << InstPrinter->getRegName(RegList[i]); + for (unsigned i = 1, e = RegList.size(); i != e; ++i) { + OS << ", "; + InstPrinter->printRegName(OS, RegList[i]); + } OS << "}"; EmitEOL(); @@ -927,9 +1205,6 @@ void MCAsmStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList, void MCAsmStreamer::EmitInstruction(const MCInst &Inst) { assert(getCurrentSection() && "Cannot emit contents before setting section!"); - if (!UseLoc) - MCLineEntry::Make(this, getCurrentSection()); - // Show the encoding in a comment if we have a code emitter. if (Emitter) AddEncodingComment(Inst); @@ -962,13 +1237,17 @@ void MCAsmStreamer::Finish() { // Dump out the dwarf file & directory tables and line tables. 
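A small aside on EmitRegSave above: the rewrite prints the first register, then a ", " before each later one, so the output reads like {r4, r5, r11} with no trailing separator. The same join idiom, reduced to a stream and plain strings standing in for MCInstPrinter (hypothetical names):

#include <iostream>
#include <string>
#include <vector>

// Print a brace-wrapped, comma-separated register list.
static void printRegList(std::ostream &OS,
                         const std::vector<std::string> &Regs) {
  OS << '{';
  for (size_t i = 0; i != Regs.size(); ++i) {
    if (i)
      OS << ", ";
    OS << Regs[i];
  }
  OS << '}';
}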
if (getContext().hasDwarfFiles() && !UseLoc) MCDwarfFileTable::Emit(this); + + if (!UseCFI) + EmitFrames(false); } MCStreamer *llvm::createAsmStreamer(MCContext &Context, formatted_raw_ostream &OS, bool isVerboseAsm, bool useLoc, + bool useCFI, MCInstPrinter *IP, MCCodeEmitter *CE, TargetAsmBackend *TAB, bool ShowInst) { - return new MCAsmStreamer(Context, OS, isVerboseAsm, useLoc, + return new MCAsmStreamer(Context, OS, isVerboseAsm, useLoc, useCFI, IP, CE, TAB, ShowInst); } diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp index 9992646..527a63c 100644 --- a/lib/MC/MCAssembler.cpp +++ b/lib/MC/MCAssembler.cpp @@ -28,7 +28,6 @@ #include "llvm/Target/TargetRegistry.h" #include "llvm/Target/TargetAsmBackend.h" -#include <vector> using namespace llvm; namespace { @@ -103,6 +102,33 @@ uint64_t MCAsmLayout::getFragmentOffset(const MCFragment *F) const { } uint64_t MCAsmLayout::getSymbolOffset(const MCSymbolData *SD) const { + const MCSymbol &S = SD->getSymbol(); + + // If this is a variable, then recursively evaluate now. + if (S.isVariable()) { + MCValue Target; + if (!S.getVariableValue()->EvaluateAsRelocatable(Target, *this)) + report_fatal_error("unable to evaluate offset for variable '" + + S.getName() + "'"); + + // Verify that any used symbols are defined. + if (Target.getSymA() && Target.getSymA()->getSymbol().isUndefined()) + report_fatal_error("unable to evaluate offset to undefined symbol '" + + Target.getSymA()->getSymbol().getName() + "'"); + if (Target.getSymB() && Target.getSymB()->getSymbol().isUndefined()) + report_fatal_error("unable to evaluate offset to undefined symbol '" + + Target.getSymB()->getSymbol().getName() + "'"); + + uint64_t Offset = Target.getConstant(); + if (Target.getSymA()) + Offset += getSymbolOffset(&Assembler.getSymbolData( + Target.getSymA()->getSymbol())); + if (Target.getSymB()) + Offset -= getSymbolOffset(&Assembler.getSymbolData( + Target.getSymB()->getSymbol())); + return Offset; + } + assert(SD->getFragment() && "Invalid getOffset() on undefined symbol!"); return getFragmentOffset(SD->getFragment()) + SD->getOffset(); } @@ -692,7 +718,9 @@ bool MCAssembler::RelaxInstruction(MCAsmLayout &Layout, bool MCAssembler::RelaxLEB(MCAsmLayout &Layout, MCLEBFragment &LF) { int64_t Value = 0; uint64_t OldSize = LF.getContents().size(); - LF.getValue().EvaluateAsAbsolute(Value, Layout); + bool IsAbs = LF.getValue().EvaluateAsAbsolute(Value, Layout); + (void)IsAbs; + assert(IsAbs); SmallString<8> &Data = LF.getContents(); Data.clear(); raw_svector_ostream OSE(Data); diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp index 7c68713..8faa72e 100644 --- a/lib/MC/MCContext.cpp +++ b/lib/MC/MCContext.cpp @@ -27,7 +27,9 @@ typedef StringMap<const MCSectionCOFF*> COFFUniqueMapTy; MCContext::MCContext(const MCAsmInfo &mai, const TargetAsmInfo *tai) : - MAI(mai), TAI(tai), NextUniqueID(0), + MAI(mai), TAI(tai), + Allocator(), Symbols(Allocator), UsedNames(Allocator), + NextUniqueID(0), CurrentDwarfLoc(0,0,0,DWARF2_FLAG_IS_STMT,0,0), AllowTemporaryLabels(true) { MachOUniquingMap = 0; @@ -85,12 +87,11 @@ MCSymbol *MCContext::CreateSymbol(StringRef Name) { StringMapEntry<bool> *NameEntry = &UsedNames.GetOrCreateValue(Name); if (NameEntry->getValue()) { assert(isTemporary && "Cannot rename non temporary symbols"); - SmallString<128> NewName; + SmallString<128> NewName = Name; do { - Twine T = Name + Twine(NextUniqueID++); - T.toVector(NewName); - StringRef foo = NewName; - NameEntry = &UsedNames.GetOrCreateValue(foo); + NewName.resize(Name.size()); + 
raw_svector_ostream(NewName) << NextUniqueID++; + NameEntry = &UsedNames.GetOrCreateValue(NewName); } while (NameEntry->getValue()); } NameEntry->setValue(true); @@ -110,9 +111,8 @@ MCSymbol *MCContext::GetOrCreateSymbol(const Twine &Name) { MCSymbol *MCContext::CreateTempSymbol() { SmallString<128> NameSV; - Twine Name = Twine(MAI.getPrivateGlobalPrefix()) + "tmp" + - Twine(NextUniqueID++); - Name.toVector(NameSV); + raw_svector_ostream(NameSV) + << MAI.getPrivateGlobalPrefix() << "tmp" << NextUniqueID++; return CreateSymbol(NameSV); } diff --git a/lib/MC/MCDisassembler/Disassembler.cpp b/lib/MC/MCDisassembler/Disassembler.cpp index 4707198..6e636f0 100644 --- a/lib/MC/MCDisassembler/Disassembler.cpp +++ b/lib/MC/MCDisassembler/Disassembler.cpp @@ -6,11 +6,10 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// + #include "Disassembler.h" -#include <stdio.h> #include "llvm-c/Disassembler.h" -#include <string> #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCDisassembler.h" #include "llvm/MC/MCInst.h" @@ -27,17 +26,12 @@ class Target; } // namespace llvm using namespace llvm; -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -// // LLVMCreateDisasm() creates a disassembler for the TripleName. Symbolic // disassembly is supported by passing a block of information in the DisInfo -// parameter and specifing the TagType and call back functions as described in +// parameter and specifying the TagType and callback functions as described in // the header llvm-c/Disassembler.h . The pointer to the block and the -// functions can all be passed as NULL. If successfull this returns a -// disassembler context if not it returns NULL. +// functions can all be passed as NULL. If successful, this returns a +// disassembler context. If not, it returns NULL. // LLVMDisasmContextRef LLVMCreateDisasm(const char *TripleName, void *DisInfo, int TagType, LLVMOpInfoCallback GetOpInfo, @@ -77,8 +71,9 @@ LLVMDisasmContextRef LLVMCreateDisasm(const char *TripleName, void *DisInfo, assert(Ctx && "Unable to create MCContext!"); // Set up disassembler. - const MCDisassembler *DisAsm = TheTarget->createMCDisassembler(); + MCDisassembler *DisAsm = TheTarget->createMCDisassembler(); assert(DisAsm && "Unable to create disassembler!"); + DisAsm->setupForSymbolicDisassembly(GetOpInfo, DisInfo, Ctx); // Set up the instruction printer. int AsmPrinterVariant = MAI->getAssemblerDialect(); @@ -107,7 +102,6 @@ namespace { // The memory object created by LLVMDisasmInstruction(). // class DisasmMemoryObject : public MemoryObject { -private: uint8_t *Bytes; uint64_t Size; uint64_t BasePC; @@ -125,12 +119,12 @@ public: return 0; } }; -} // namespace +} // end anonymous namespace // -// LLVMDisasmInstruction() disassmbles a single instruction using the +// LLVMDisasmInstruction() disassembles a single instruction using the // disassembler context specified in the parameter DC. The bytes of the -// instuction are specified in the parameter Bytes, and contains at least +// instruction are specified in the parameter Bytes, and contains at least // BytesSize number of bytes. The instruction is at the address specified by // the PC parameter. 
If a valid instruction can be disassembled its string is // returned indirectly in OutString which whos size is specified in the @@ -153,21 +147,15 @@ size_t LLVMDisasmInstruction(LLVMDisasmContextRef DCR, uint8_t *Bytes, if (!DisAsm->getInstruction(Inst, Size, MemoryObject, PC, /*REMOVE*/ nulls())) return 0; - std::string InsnStr; - raw_string_ostream OS(InsnStr); - raw_ostream &Out = OS; - IP->printInst(&Inst, Out); - - std::string p; - p = OS.str(); -#ifdef LLVM_ON_WIN32 - sprintf(OutString, "%s", p.c_str()); -#else - snprintf(OutString, OutStringSize, "%s", p.c_str()); -#endif - return Size; -} + SmallVector<char, 64> InsnStr; + raw_svector_ostream OS(InsnStr); + IP->printInst(&Inst, OS); + OS.flush(); + + assert(OutStringSize != 0 && "Output buffer cannot be zero size"); + size_t OutputSize = std::min(OutStringSize-1, InsnStr.size()); + std::memcpy(OutString, InsnStr.data(), OutputSize); + OutString[OutputSize] = '\0'; // Terminate string. -#ifdef __cplusplus + return Size; } -#endif // __cplusplus diff --git a/lib/MC/MCDisassembler/Disassembler.h b/lib/MC/MCDisassembler/Disassembler.h index f844951..f0ec42a 100644 --- a/lib/MC/MCDisassembler/Disassembler.h +++ b/lib/MC/MCDisassembler/Disassembler.h @@ -13,6 +13,10 @@ // syntax. // //===----------------------------------------------------------------------===// + +#ifndef LLVM_MC_DISASSEMBLER_H +#define LLVM_MC_DISASSEMBLER_H + #include "llvm-c/Disassembler.h" #include <string> #include "llvm/ADT/OwningPtr.h" @@ -69,7 +73,7 @@ private: public: LLVMDisasmContext(std::string tripleName, void *disInfo, int tagType, - LLVMOpInfoCallback getOpInfo, + LLVMOpInfoCallback getOpInfo, LLVMSymbolLookupCallback symbolLookUp, const Target *theTarget, const MCAsmInfo *mAI, llvm::TargetMachine *tM, const TargetAsmInfo *tai, @@ -88,3 +92,5 @@ public: }; } // namespace llvm + +#endif diff --git a/lib/MC/MCDisassembler/EDDisassembler.cpp b/lib/MC/MCDisassembler/EDDisassembler.cpp index e36b3a4..91c5284 100644 --- a/lib/MC/MCDisassembler/EDDisassembler.cpp +++ b/lib/MC/MCDisassembler/EDDisassembler.cpp @@ -334,6 +334,15 @@ int EDDisassembler::printInst(std::string &str, MCInst &inst) { return 0; } +static void diag_handler(const SMDiagnostic &diag, + void *context) +{ + if (context) { + EDDisassembler *disassembler = static_cast<EDDisassembler*>(context); + diag.Print("", disassembler->ErrorStream); + } +} + int EDDisassembler::parseInst(SmallVectorImpl<MCParsedAsmOperand*> &operands, SmallVectorImpl<AsmToken> &tokens, const std::string &str) { @@ -356,6 +365,7 @@ int EDDisassembler::parseInst(SmallVectorImpl<MCParsedAsmOperand*> &operands, SMLoc instLoc; SourceMgr sourceMgr; + sourceMgr.setDiagHandler(diag_handler, static_cast<void*>(this)); sourceMgr.AddNewSourceBuffer(buf, SMLoc()); // ownership of buf handed over MCContext context(*AsmInfo, NULL); OwningPtr<MCStreamer> streamer(createNullStreamer(context)); diff --git a/lib/MC/MCDisassembler/EDOperand.cpp b/lib/MC/MCDisassembler/EDOperand.cpp index 04b21cb..492bb08 100644 --- a/lib/MC/MCDisassembler/EDOperand.cpp +++ b/lib/MC/MCDisassembler/EDOperand.cpp @@ -198,15 +198,24 @@ int EDOperand::evaluate(uint64_t &result, default: return -1; case kOperandTypeImmediate: + if (!Inst.Inst->getOperand(MCOpIndex).isImm()) + return -1; + result = Inst.Inst->getOperand(MCOpIndex).getImm(); return 0; case kOperandTypeRegister: { + if (!Inst.Inst->getOperand(MCOpIndex).isReg()) + return -1; + unsigned reg = Inst.Inst->getOperand(MCOpIndex).getReg(); return callback(&result, reg, arg); } case 
kOperandTypeARMBranchTarget: { + if (!Inst.Inst->getOperand(MCOpIndex).isImm()) + return -1; + int64_t displacement = Inst.Inst->getOperand(MCOpIndex).getImm(); uint64_t pcVal; diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp index 112d7d8..13cb81a 100644 --- a/lib/MC/MCDwarf.cpp +++ b/lib/MC/MCDwarf.cpp @@ -17,6 +17,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/Twine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -439,19 +440,105 @@ static int getDataAlignmentFactor(MCStreamer &streamer) { return -size; } -static void EmitCFIInstruction(MCStreamer &Streamer, - const MCCFIInstruction &Instr) { +static unsigned getSizeForEncoding(MCStreamer &streamer, + unsigned symbolEncoding) { + MCContext &context = streamer.getContext(); + const TargetAsmInfo &asmInfo = context.getTargetAsmInfo(); + unsigned format = symbolEncoding & 0x0f; + switch (format) { + default: + assert(0 && "Unknown Encoding"); + case dwarf::DW_EH_PE_absptr: + case dwarf::DW_EH_PE_signed: + return asmInfo.getPointerSize(); + case dwarf::DW_EH_PE_udata2: + case dwarf::DW_EH_PE_sdata2: + return 2; + case dwarf::DW_EH_PE_udata4: + case dwarf::DW_EH_PE_sdata4: + return 4; + case dwarf::DW_EH_PE_udata8: + case dwarf::DW_EH_PE_sdata8: + return 8; + } +} + +static void EmitSymbol(MCStreamer &streamer, const MCSymbol &symbol, + unsigned symbolEncoding) { + MCContext &context = streamer.getContext(); + const MCAsmInfo &asmInfo = context.getAsmInfo(); + const MCExpr *v = asmInfo.getExprForFDESymbol(&symbol, + symbolEncoding, + streamer); + unsigned size = getSizeForEncoding(streamer, symbolEncoding); + streamer.EmitAbsValue(v, size); +} + +static void EmitPersonality(MCStreamer &streamer, const MCSymbol &symbol, + unsigned symbolEncoding) { + MCContext &context = streamer.getContext(); + const MCAsmInfo &asmInfo = context.getAsmInfo(); + const MCExpr *v = asmInfo.getExprForPersonalitySymbol(&symbol, + symbolEncoding, + streamer); + unsigned size = getSizeForEncoding(streamer, symbolEncoding); + streamer.EmitValue(v, size); +} + +static const MachineLocation TranslateMachineLocation( + const TargetAsmInfo &AsmInfo, + const MachineLocation &Loc) { + unsigned Reg = Loc.getReg() == MachineLocation::VirtualFP ? + MachineLocation::VirtualFP : + unsigned(AsmInfo.getDwarfRegNum(Loc.getReg(), true)); + const MachineLocation &NewLoc = Loc.isReg() ? 
+ MachineLocation(Reg) : MachineLocation(Reg, Loc.getOffset()); + return NewLoc; +} + +namespace { + class FrameEmitterImpl { + int CFAOffset; + int CIENum; + bool UsingCFI; + bool IsEH; + const MCSymbol *SectionStart; + + public: + FrameEmitterImpl(bool usingCFI, bool isEH, const MCSymbol *sectionStart) : + CFAOffset(0), CIENum(0), UsingCFI(usingCFI), IsEH(isEH), + SectionStart(sectionStart) { + } + + const MCSymbol &EmitCIE(MCStreamer &streamer, + const MCSymbol *personality, + unsigned personalityEncoding, + const MCSymbol *lsda, + unsigned lsdaEncoding); + MCSymbol *EmitFDE(MCStreamer &streamer, + const MCSymbol &cieStart, + const MCDwarfFrameInfo &frame); + void EmitCFIInstructions(MCStreamer &streamer, + const std::vector<MCCFIInstruction> &Instrs, + MCSymbol *BaseLabel); + void EmitCFIInstruction(MCStreamer &Streamer, + const MCCFIInstruction &Instr); + }; +} + +void FrameEmitterImpl::EmitCFIInstruction(MCStreamer &Streamer, + const MCCFIInstruction &Instr) { int dataAlignmentFactor = getDataAlignmentFactor(Streamer); switch (Instr.getOperation()) { - case MCCFIInstruction::Move: { + case MCCFIInstruction::Move: + case MCCFIInstruction::RelMove: { const MachineLocation &Dst = Instr.getDestination(); const MachineLocation &Src = Instr.getSource(); + const bool IsRelative = Instr.getOperation() == MCCFIInstruction::RelMove; // If advancing cfa. if (Dst.isReg() && Dst.getReg() == MachineLocation::VirtualFP) { - assert(!Src.isReg() && "Machine move not supported yet."); - if (Src.getReg() == MachineLocation::VirtualFP) { Streamer.EmitIntValue(dwarf::DW_CFA_def_cfa_offset, 1); } else { @@ -459,7 +546,12 @@ static void EmitCFIInstruction(MCStreamer &Streamer, Streamer.EmitULEB128IntValue(Src.getReg()); } - Streamer.EmitULEB128IntValue(-Src.getOffset(), 1); + if (IsRelative) + CFAOffset += Src.getOffset(); + else + CFAOffset = -Src.getOffset(); + + Streamer.EmitULEB128IntValue(CFAOffset); return; } @@ -471,7 +563,11 @@ static void EmitCFIInstruction(MCStreamer &Streamer, } unsigned Reg = Src.getReg(); - int Offset = Dst.getOffset() / dataAlignmentFactor; + + int Offset = Dst.getOffset(); + if (IsRelative) + Offset -= CFAOffset; + Offset = Offset / dataAlignmentFactor; if (Offset < 0) { Streamer.EmitIntValue(dwarf::DW_CFA_offset_extended_sf, 1); @@ -479,11 +575,11 @@ static void EmitCFIInstruction(MCStreamer &Streamer, Streamer.EmitSLEB128IntValue(Offset); } else if (Reg < 64) { Streamer.EmitIntValue(dwarf::DW_CFA_offset + Reg, 1); - Streamer.EmitULEB128IntValue(Offset, 1); + Streamer.EmitULEB128IntValue(Offset); } else { Streamer.EmitIntValue(dwarf::DW_CFA_offset_extended, 1); - Streamer.EmitULEB128IntValue(Reg, 1); - Streamer.EmitULEB128IntValue(Offset, 1); + Streamer.EmitULEB128IntValue(Reg); + Streamer.EmitULEB128IntValue(Offset); } return; } @@ -493,15 +589,21 @@ static void EmitCFIInstruction(MCStreamer &Streamer, case MCCFIInstruction::Restore: Streamer.EmitIntValue(dwarf::DW_CFA_restore_state, 1); return; + case MCCFIInstruction::SameValue: { + unsigned Reg = Instr.getDestination().getReg(); + Streamer.EmitIntValue(dwarf::DW_CFA_same_value, 1); + Streamer.EmitULEB128IntValue(Reg); + return; + } } llvm_unreachable("Unhandled case in switch"); } /// EmitFrameMoves - Emit frame instructions to describe the layout of the /// frame. 
-static void EmitCFIInstructions(MCStreamer &streamer, - const std::vector<MCCFIInstruction> &Instrs, - MCSymbol *BaseLabel) { +void FrameEmitterImpl::EmitCFIInstructions(MCStreamer &streamer, + const std::vector<MCCFIInstruction> &Instrs, + MCSymbol *BaseLabel) { for (unsigned i = 0, N = Instrs.size(); i < N; ++i) { const MCCFIInstruction &Instr = Instrs[i]; MCSymbol *Label = Instr.getLabel(); @@ -521,90 +623,48 @@ static void EmitCFIInstructions(MCStreamer &streamer, } } -static void EmitSymbol(MCStreamer &streamer, const MCSymbol &symbol, - unsigned symbolEncoding) { +const MCSymbol &FrameEmitterImpl::EmitCIE(MCStreamer &streamer, + const MCSymbol *personality, + unsigned personalityEncoding, + const MCSymbol *lsda, + unsigned lsdaEncoding) { MCContext &context = streamer.getContext(); const TargetAsmInfo &asmInfo = context.getTargetAsmInfo(); - unsigned format = symbolEncoding & 0x0f; - unsigned application = symbolEncoding & 0x70; - unsigned size; - switch (format) { - default: - assert(0 && "Unknown Encoding"); - case dwarf::DW_EH_PE_absptr: - case dwarf::DW_EH_PE_signed: - size = asmInfo.getPointerSize(); - break; - case dwarf::DW_EH_PE_udata2: - case dwarf::DW_EH_PE_sdata2: - size = 2; - break; - case dwarf::DW_EH_PE_udata4: - case dwarf::DW_EH_PE_sdata4: - size = 4; - break; - case dwarf::DW_EH_PE_udata8: - case dwarf::DW_EH_PE_sdata8: - size = 8; - break; - } - switch (application) { - default: - assert(0 && "Unknown Encoding"); - break; - case 0: - streamer.EmitSymbolValue(&symbol, size); - break; - case dwarf::DW_EH_PE_pcrel: - streamer.EmitPCRelSymbolValue(&symbol, size); - break; - } -} -static const MachineLocation TranslateMachineLocation( - const TargetAsmInfo &AsmInfo, - const MachineLocation &Loc) { - unsigned Reg = Loc.getReg() == MachineLocation::VirtualFP ? - MachineLocation::VirtualFP : - unsigned(AsmInfo.getDwarfRegNum(Loc.getReg(), true)); - const MachineLocation &NewLoc = Loc.isReg() ? - MachineLocation(Reg) : MachineLocation(Reg, Loc.getOffset()); - return NewLoc; -} + MCSymbol *sectionStart; + if (asmInfo.isFunctionEHFrameSymbolPrivate() || !IsEH) + sectionStart = context.CreateTempSymbol(); + else + sectionStart = context.GetOrCreateSymbol(Twine("EH_frame") + Twine(CIENum)); + + CIENum++; -static const MCSymbol &EmitCIE(MCStreamer &streamer, - const MCSymbol *personality, - unsigned personalityEncoding, - const MCSymbol *lsda, - unsigned lsdaEncoding) { - MCContext &context = streamer.getContext(); - const TargetAsmInfo &asmInfo = context.getTargetAsmInfo(); - const MCSection &section = *asmInfo.getEHFrameSection(); - streamer.SwitchSection(&section); - MCSymbol *sectionStart = streamer.getContext().CreateTempSymbol(); MCSymbol *sectionEnd = streamer.getContext().CreateTempSymbol(); // Length const MCExpr *Length = MakeStartMinusEndExpr(streamer, *sectionStart, *sectionEnd, 4); streamer.EmitLabel(sectionStart); - streamer.EmitValue(Length, 4); + streamer.EmitAbsValue(Length, 4); // CIE ID - streamer.EmitIntValue(0, 4); + unsigned CIE_ID = IsEH ?
0 : -1; + streamer.EmitIntValue(CIE_ID, 4); // Version streamer.EmitIntValue(dwarf::DW_CIE_VERSION, 1); // Augmentation String SmallString<8> Augmentation; - Augmentation += "z"; - if (personality) - Augmentation += "P"; - if (lsda) - Augmentation += "L"; - Augmentation += "R"; - streamer.EmitBytes(Augmentation.str(), 0); + if (IsEH) { + Augmentation += "z"; + if (personality) + Augmentation += "P"; + if (lsda) + Augmentation += "L"; + Augmentation += "R"; + streamer.EmitBytes(Augmentation.str(), 0); + } streamer.EmitIntValue(0, 1); // Code Alignment Factor @@ -617,28 +677,34 @@ static const MCSymbol &EmitCIE(MCStreamer &streamer, streamer.EmitULEB128IntValue(asmInfo.getDwarfRARegNum(true)); // Augmentation Data Length (optional) - MCSymbol *augmentationStart = streamer.getContext().CreateTempSymbol(); - MCSymbol *augmentationEnd = streamer.getContext().CreateTempSymbol(); - const MCExpr *augmentationLength = MakeStartMinusEndExpr(streamer, - *augmentationStart, - *augmentationEnd, 0); - streamer.EmitULEB128Value(augmentationLength); - - // Augmentation Data (optional) - streamer.EmitLabel(augmentationStart); - if (personality) { - // Personality Encoding - streamer.EmitIntValue(personalityEncoding, 1); - // Personality - EmitSymbol(streamer, *personality, personalityEncoding); - } - if (lsda) { - // LSDA Encoding - streamer.EmitIntValue(lsdaEncoding, 1); + + unsigned augmentationLength = 0; + if (IsEH) { + if (personality) { + // Personality Encoding + augmentationLength += 1; + // Personality + augmentationLength += getSizeForEncoding(streamer, personalityEncoding); + } + if (lsda) + augmentationLength += 1; + // Encoding of the FDE pointers + augmentationLength += 1; + + streamer.EmitULEB128IntValue(augmentationLength); + + // Augmentation Data (optional) + if (personality) { + // Personality Encoding + streamer.EmitIntValue(personalityEncoding, 1); + // Personality + EmitPersonality(streamer, *personality, personalityEncoding); + } + if (lsda) + streamer.EmitIntValue(lsdaEncoding, 1); // LSDA Encoding + // Encoding of the FDE pointers + streamer.EmitIntValue(asmInfo.getFDEEncoding(UsingCFI), 1); } - // Encoding of the FDE pointers - streamer.EmitIntValue(dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4, 1); - streamer.EmitLabel(augmentationEnd); // Initial Instructions @@ -658,56 +724,80 @@ static const MCSymbol &EmitCIE(MCStreamer &streamer, EmitCFIInstructions(streamer, Instructions, NULL); // Padding - streamer.EmitValueToAlignment(4); + streamer.EmitValueToAlignment(IsEH ? 
4 : asmInfo.getPointerSize()); streamer.EmitLabel(sectionEnd); return *sectionStart; } -static MCSymbol *EmitFDE(MCStreamer &streamer, - const MCSymbol &cieStart, - const MCDwarfFrameInfo &frame) { +MCSymbol *FrameEmitterImpl::EmitFDE(MCStreamer &streamer, + const MCSymbol &cieStart, + const MCDwarfFrameInfo &frame) { MCContext &context = streamer.getContext(); MCSymbol *fdeStart = context.CreateTempSymbol(); MCSymbol *fdeEnd = context.CreateTempSymbol(); + const TargetAsmInfo &TAsmInfo = context.getTargetAsmInfo(); + + if (!TAsmInfo.isFunctionEHFrameSymbolPrivate() && IsEH) { + MCSymbol *EHSym = context.GetOrCreateSymbol( + frame.Function->getName() + Twine(".eh")); + streamer.EmitEHSymAttributes(frame.Function, EHSym); + streamer.EmitLabel(EHSym); + } // Length const MCExpr *Length = MakeStartMinusEndExpr(streamer, *fdeStart, *fdeEnd, 0); - streamer.EmitValue(Length, 4); + streamer.EmitAbsValue(Length, 4); streamer.EmitLabel(fdeStart); + // CIE Pointer - const MCExpr *offset = MakeStartMinusEndExpr(streamer, cieStart, *fdeStart, - 0); - streamer.EmitValue(offset, 4); + const MCAsmInfo &asmInfo = context.getAsmInfo(); + if (IsEH) { + const MCExpr *offset = MakeStartMinusEndExpr(streamer, cieStart, *fdeStart, + 0); + streamer.EmitAbsValue(offset, 4); + } else if (!asmInfo.doesDwarfRequireRelocationForSectionOffset()) { + const MCExpr *offset = MakeStartMinusEndExpr(streamer, *SectionStart, + cieStart, 0); + streamer.EmitAbsValue(offset, 4); + } else { + streamer.EmitSymbolValue(&cieStart, 4); + } + unsigned fdeEncoding = TAsmInfo.getFDEEncoding(UsingCFI); + unsigned size = getSizeForEncoding(streamer, fdeEncoding); // PC Begin - streamer.EmitPCRelSymbolValue(frame.Begin, 4); + unsigned PCBeginEncoding = IsEH ? fdeEncoding : + (unsigned)dwarf::DW_EH_PE_absptr; + unsigned PCBeginSize = getSizeForEncoding(streamer, PCBeginEncoding); + EmitSymbol(streamer, *frame.Begin, PCBeginEncoding); // PC Range const MCExpr *Range = MakeStartMinusEndExpr(streamer, *frame.Begin, *frame.End, 0); - streamer.EmitValue(Range, 4); - - // Augmentation Data Length - MCSymbol *augmentationStart = streamer.getContext().CreateTempSymbol(); - MCSymbol *augmentationEnd = streamer.getContext().CreateTempSymbol(); - const MCExpr *augmentationLength = MakeStartMinusEndExpr(streamer, - *augmentationStart, - *augmentationEnd, 0); - streamer.EmitULEB128Value(augmentationLength); - - // Augmentation Data - streamer.EmitLabel(augmentationStart); - if (frame.Lsda) - EmitSymbol(streamer, *frame.Lsda, frame.LsdaEncoding); - streamer.EmitLabel(augmentationEnd); + streamer.EmitAbsValue(Range, size); + + if (IsEH) { + // Augmentation Data Length + unsigned augmentationLength = 0; + + if (frame.Lsda) + augmentationLength += getSizeForEncoding(streamer, frame.LsdaEncoding); + + streamer.EmitULEB128IntValue(augmentationLength); + + // Augmentation Data + if (frame.Lsda) + EmitSymbol(streamer, *frame.Lsda, frame.LsdaEncoding); + } + // Call Frame Instructions EmitCFIInstructions(streamer, frame.Instructions, frame.Begin); // Padding - streamer.EmitValueToAlignment(4); + streamer.EmitValueToAlignment(PCBeginSize); return fdeEnd; } @@ -753,22 +843,32 @@ namespace llvm { }; } -void MCDwarfFrameEmitter::Emit(MCStreamer &streamer) { - const MCContext &context = streamer.getContext(); +void MCDwarfFrameEmitter::Emit(MCStreamer &streamer, + bool usingCFI, + bool isEH) { + MCContext &context = streamer.getContext(); const TargetAsmInfo &asmInfo = context.getTargetAsmInfo(); + const MCSection &section = isEH ?
+ *asmInfo.getEHFrameSection() : *asmInfo.getDwarfFrameSection(); + streamer.SwitchSection(&section); + MCSymbol *SectionStart = context.CreateTempSymbol(); + streamer.EmitLabel(SectionStart); + MCSymbol *fdeEnd = NULL; DenseMap<CIEKey, const MCSymbol*> CIEStarts; + FrameEmitterImpl Emitter(usingCFI, isEH, SectionStart); + const MCSymbol *DummyDebugKey = NULL; for (unsigned i = 0, n = streamer.getNumFrameInfos(); i < n; ++i) { const MCDwarfFrameInfo &frame = streamer.getFrameInfo(i); CIEKey key(frame.Personality, frame.PersonalityEncoding, frame.LsdaEncoding); - const MCSymbol *&cieStart = CIEStarts[key]; + const MCSymbol *&cieStart = isEH ? CIEStarts[key] : DummyDebugKey; if (!cieStart) - cieStart = &EmitCIE(streamer, frame.Personality, - frame.PersonalityEncoding, frame.Lsda, - frame.LsdaEncoding); - fdeEnd = EmitFDE(streamer, *cieStart, frame); + cieStart = &Emitter.EmitCIE(streamer, frame.Personality, + frame.PersonalityEncoding, frame.Lsda, + frame.LsdaEncoding); + fdeEnd = Emitter.EmitFDE(streamer, *cieStart, frame); if (i != n - 1) streamer.EmitLabel(fdeEnd); } diff --git a/lib/MC/MCELF.cpp b/lib/MC/MCELF.cpp index ce7783e..2c3f8e8 100644 --- a/lib/MC/MCELF.cpp +++ b/lib/MC/MCELF.cpp @@ -57,13 +57,13 @@ void MCELF::SetVisibility(MCSymbolData &SD, unsigned Visibility) { assert(Visibility == ELF::STV_DEFAULT || Visibility == ELF::STV_INTERNAL || Visibility == ELF::STV_HIDDEN || Visibility == ELF::STV_PROTECTED); - uint32_t OtherFlags = SD.getFlags() & ~(0xf << ELF_STV_Shift); + uint32_t OtherFlags = SD.getFlags() & ~(0x3 << ELF_STV_Shift); SD.setFlags(OtherFlags | (Visibility << ELF_STV_Shift)); } unsigned MCELF::GetVisibility(MCSymbolData &SD) { unsigned Visibility = - (SD.getFlags() & (0xf << ELF_STV_Shift)) >> ELF_STV_Shift; + (SD.getFlags() & (0x3 << ELF_STV_Shift)) >> ELF_STV_Shift; assert(Visibility == ELF::STV_DEFAULT || Visibility == ELF::STV_INTERNAL || Visibility == ELF::STV_HIDDEN || Visibility == ELF::STV_PROTECTED); return Visibility; } diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp index 9fc9173..bbb2789 100644 --- a/lib/MC/MCELFStreamer.cpp +++ b/lib/MC/MCELFStreamer.cpp @@ -66,6 +66,11 @@ void MCELFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { void MCELFStreamer::EmitThumbFunc(MCSymbol *Func) { // FIXME: Anything needed here to flag the function as thumb? + + getAssembler().setIsThumbFunc(Func); + + MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Func); + SD.setFlags(SD.getFlags() | ELF_Other_ThumbFunc); } void MCELFStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) { @@ -345,8 +350,7 @@ void MCELFStreamer::EmitInstToData(const MCInst &Inst) { } void MCELFStreamer::Finish() { - if (getNumFrameInfos()) - MCDwarfFrameEmitter::Emit(*this); + EmitFrames(true); for (std::vector<LocalCommon>::const_iterator i = LocalCommons.begin(), e = LocalCommons.end(); diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp index 2debe18..fcf1aab 100644 --- a/lib/MC/MCExpr.cpp +++ b/lib/MC/MCExpr.cpp @@ -42,8 +42,8 @@ void MCExpr::print(raw_ostream &OS) const { // absolute names.
bool UseParens = Sym.getName()[0] == '$'; - if (SRE.getKind() == MCSymbolRefExpr::VK_PPC_HA16 || - SRE.getKind() == MCSymbolRefExpr::VK_PPC_LO16) { + if (SRE.getKind() == MCSymbolRefExpr::VK_PPC_DARWIN_HA16 || + SRE.getKind() == MCSymbolRefExpr::VK_PPC_DARWIN_LO16) { OS << MCSymbolRefExpr::getVariantKindName(SRE.getKind()); UseParens = true; } @@ -61,8 +61,8 @@ void MCExpr::print(raw_ostream &OS) const { SRE.getKind() == MCSymbolRefExpr::VK_ARM_GOTTPOFF) OS << MCSymbolRefExpr::getVariantKindName(SRE.getKind()); else if (SRE.getKind() != MCSymbolRefExpr::VK_None && - SRE.getKind() != MCSymbolRefExpr::VK_PPC_HA16 && - SRE.getKind() != MCSymbolRefExpr::VK_PPC_LO16) + SRE.getKind() != MCSymbolRefExpr::VK_PPC_DARWIN_HA16 && + SRE.getKind() != MCSymbolRefExpr::VK_PPC_DARWIN_LO16) OS << '@' << MCSymbolRefExpr::getVariantKindName(SRE.getKind()); return; @@ -197,8 +197,10 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { case VK_ARM_GOTTPOFF: return "(gottpoff)"; case VK_ARM_TLSGD: return "(tlsgd)"; case VK_PPC_TOC: return "toc"; - case VK_PPC_HA16: return "ha16"; - case VK_PPC_LO16: return "lo16"; + case VK_PPC_DARWIN_HA16: return "ha16"; + case VK_PPC_DARWIN_LO16: return "lo16"; + case VK_PPC_GAS_HA16: return "ha"; + case VK_PPC_GAS_LO16: return "l"; } } @@ -389,7 +391,7 @@ static bool EvaluateSymbolicAdd(const MCAssembler *Asm, // (LHS_A - RHS_B), // (RHS_A - LHS_B), // (RHS_A - RHS_B). - // Since we are attempting to be as aggresive as possible about folding, we + // Since we are attempting to be as aggressive as possible about folding, we // attempt to evaluate each possible alternative. AttemptToFoldSymbolOffsetDifference(Asm, Layout, Addrs, InSet, LHS_A, LHS_B, Result_Cst); @@ -559,3 +561,45 @@ bool MCExpr::EvaluateAsRelocatableImpl(MCValue &Res, assert(0 && "Invalid assembly expression kind!"); return false; } + +const MCSection *MCExpr::FindAssociatedSection() const { + switch (getKind()) { + case Target: + // We never look through target specific expressions. + return cast<MCTargetExpr>(this)->FindAssociatedSection(); + + case Constant: + return MCSymbol::AbsolutePseudoSection; + + case SymbolRef: { + const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(this); + const MCSymbol &Sym = SRE->getSymbol(); + + if (Sym.isDefined()) + return &Sym.getSection(); + + return 0; + } + + case Unary: + return cast<MCUnaryExpr>(this)->getSubExpr()->FindAssociatedSection(); + + case Binary: { + const MCBinaryExpr *BE = cast<MCBinaryExpr>(this); + const MCSection *LHS_S = BE->getLHS()->FindAssociatedSection(); + const MCSection *RHS_S = BE->getRHS()->FindAssociatedSection(); + + // If either section is absolute, return the other. + if (LHS_S == MCSymbol::AbsolutePseudoSection) + return RHS_S; + if (RHS_S == MCSymbol::AbsolutePseudoSection) + return LHS_S; + + // Otherwise, return the first non-null section. + return LHS_S ? 
LHS_S : RHS_S; + } + } + + assert(0 && "Invalid assembly expression kind!"); + return 0; +} diff --git a/lib/MC/MCInstPrinter.cpp b/lib/MC/MCInstPrinter.cpp index 212b85e..81a939f 100644 --- a/lib/MC/MCInstPrinter.cpp +++ b/lib/MC/MCInstPrinter.cpp @@ -20,7 +20,6 @@ StringRef MCInstPrinter::getOpcodeName(unsigned Opcode) const { return ""; } -StringRef MCInstPrinter::getRegName(unsigned RegNo) const { +void MCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { assert(0 && "Target should implement this"); - return ""; } diff --git a/lib/MC/MCLoggingStreamer.cpp b/lib/MC/MCLoggingStreamer.cpp index 012c7f6..46ea9b8 100644 --- a/lib/MC/MCLoggingStreamer.cpp +++ b/lib/MC/MCLoggingStreamer.cpp @@ -154,21 +154,19 @@ public: } virtual void EmitValueImpl(const MCExpr *Value, unsigned Size, - bool isPCRel, unsigned AddrSpace){ + unsigned AddrSpace){ LogCall("EmitValue"); - return Child->EmitValueImpl(Value, Size, isPCRel, AddrSpace); + return Child->EmitValueImpl(Value, Size, AddrSpace); } - virtual void EmitULEB128Value(const MCExpr *Value, - unsigned AddrSpace = 0) { + virtual void EmitULEB128Value(const MCExpr *Value) { LogCall("EmitULEB128Value"); - return Child->EmitULEB128Value(Value, AddrSpace); + return Child->EmitULEB128Value(Value); } - virtual void EmitSLEB128Value(const MCExpr *Value, - unsigned AddrSpace = 0) { + virtual void EmitSLEB128Value(const MCExpr *Value) { LogCall("EmitSLEB128Value"); - return Child->EmitSLEB128Value(Value, AddrSpace); + return Child->EmitSLEB128Value(Value); } virtual void EmitGPRel32Value(const MCExpr *Value) { @@ -215,13 +213,14 @@ public: virtual void EmitDwarfLocDirective(unsigned FileNo, unsigned Line, unsigned Column, unsigned Flags, - unsigned Isa, unsigned Discriminator) { + unsigned Isa, unsigned Discriminator, + StringRef FileName) { LogCall("EmitDwarfLocDirective", "FileNo:" + Twine(FileNo) + " Line:" + Twine(Line) + " Column:" + Twine(Column) + " Flags:" + Twine(Flags) + " Isa:" + Twine(Isa) + " Discriminator:" + Twine(Discriminator)); return Child->EmitDwarfLocDirective(FileNo, Line, Column, Flags, - Isa, Discriminator); + Isa, Discriminator, FileName); } virtual void EmitInstruction(const MCInst &Inst) { diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp index d1f9f5c..12aeb4f 100644 --- a/lib/MC/MCMachOStreamer.cpp +++ b/lib/MC/MCMachOStreamer.cpp @@ -44,6 +44,8 @@ public: virtual void InitSections(); virtual void EmitLabel(MCSymbol *Symbol); + virtual void EmitEHSymAttributes(const MCSymbol *Symbol, + MCSymbol *EHSymbol); virtual void EmitAssemblerFlag(MCAssemblerFlag Flag); virtual void EmitThumbFunc(MCSymbol *Func); virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value); @@ -101,6 +103,18 @@ void MCMachOStreamer::InitSections() { } +void MCMachOStreamer::EmitEHSymAttributes(const MCSymbol *Symbol, + MCSymbol *EHSymbol) { + MCSymbolData &SD = + getAssembler().getOrCreateSymbolData(*Symbol); + if (SD.isExternal()) + EmitSymbolAttribute(EHSymbol, MCSA_Global); + if (SD.getFlags() & SF_WeakDefinition) + EmitSymbolAttribute(EHSymbol, MCSA_WeakDefinition); + if (SD.isPrivateExtern()) + EmitSymbolAttribute(EHSymbol, MCSA_PrivateExtern); +} + void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) { assert(Symbol->isUndefined() && "Cannot define a symbol twice!"); @@ -363,6 +377,8 @@ void MCMachOStreamer::EmitInstToData(const MCInst &Inst) { } void MCMachOStreamer::Finish() { + EmitFrames(true); + // We have to set the fragment atom associations so we can relax properly for // Mach-O. 
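EmitEHSymAttributes above mirrors the defining symbol's linkage onto its companion ".eh" symbol, so the linker treats a function and its unwind entry consistently. A minimal standalone sketch of that mirroring idea; the flag constants here are hypothetical stand-ins, not the real MCSymbolData flags (the streamer forwards them as MCSA_Global, MCSA_WeakDefinition, and MCSA_PrivateExtern attributes):

#include <cstdint>
#include <cstdio>

// Hypothetical stand-ins for the linkage-affecting symbol flags.
enum : uint32_t {
  SF_External       = 1u << 0, // externally visible
  SF_WeakDefinition = 1u << 1, // weak definition
  SF_PrivateExtern  = 1u << 2  // visible to the linker only
};

// Copy only the linkage-affecting bits from a function symbol to its
// "_foo.eh" companion, as EmitEHSymAttributes does above.
static uint32_t mirrorEHAttributes(uint32_t FuncFlags) {
  const uint32_t Mask = SF_External | SF_WeakDefinition | SF_PrivateExtern;
  return FuncFlags & Mask;
}

int main() {
  uint32_t Foo = SF_External | SF_WeakDefinition; // e.g. a weak, external _foo
  std::printf("_foo.eh flags: %#x\n", (unsigned)mirrorEHAttributes(Foo));
  return 0;
}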
diff --git a/lib/MC/MCNullStreamer.cpp b/lib/MC/MCNullStreamer.cpp index 08ddf01..f38b822 100644 --- a/lib/MC/MCNullStreamer.cpp +++ b/lib/MC/MCNullStreamer.cpp @@ -67,11 +67,9 @@ namespace { virtual void EmitBytes(StringRef Data, unsigned AddrSpace) {} virtual void EmitValueImpl(const MCExpr *Value, unsigned Size, - bool isPCRel, unsigned AddrSpace) {} - virtual void EmitULEB128Value(const MCExpr *Value, - unsigned AddrSpace = 0) {} - virtual void EmitSLEB128Value(const MCExpr *Value, - unsigned AddrSpace = 0) {} + unsigned AddrSpace) {} + virtual void EmitULEB128Value(const MCExpr *Value) {} + virtual void EmitSLEB128Value(const MCExpr *Value) {} virtual void EmitGPRel32Value(const MCExpr *Value) {} virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0, unsigned ValueSize = 1, @@ -89,7 +87,8 @@ namespace { } virtual void EmitDwarfLocDirective(unsigned FileNo, unsigned Line, unsigned Column, unsigned Flags, - unsigned Isa, unsigned Discriminator) {} + unsigned Isa, unsigned Discriminator, + StringRef FileName) {} virtual void EmitInstruction(const MCInst &Inst) {} virtual void Finish() {} diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp index ef22eaa..e230c53 100644 --- a/lib/MC/MCObjectStreamer.cpp +++ b/lib/MC/MCObjectStreamer.cpp @@ -90,7 +90,7 @@ const MCExpr *MCObjectStreamer::AddValueSymbols(const MCExpr *Value) { } void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, - bool isPCRel, unsigned AddrSpace) { + unsigned AddrSpace) { assert(AddrSpace == 0 && "Address space must be 0!"); MCDataFragment *DF = getOrCreateDataFragment(); @@ -102,15 +102,12 @@ void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, } DF->addFixup(MCFixup::Create(DF->getContents().size(), Value, - MCFixup::getKindForSize(Size, isPCRel))); + MCFixup::getKindForSize(Size, false))); DF->getContents().resize(DF->getContents().size() + Size, 0); } void MCObjectStreamer::EmitLabel(MCSymbol *Symbol) { - assert(!Symbol->isVariable() && "Cannot emit a variable symbol!"); - assert(getCurrentSection() && "Cannot emit before setting section!"); - - Symbol->setSection(*getCurrentSection()); + MCStreamer::EmitLabel(Symbol); MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); @@ -124,23 +121,23 @@ void MCObjectStreamer::EmitLabel(MCSymbol *Symbol) { SD.setOffset(F->getContents().size()); } -void MCObjectStreamer::EmitULEB128Value(const MCExpr *Value, - unsigned AddrSpace) { +void MCObjectStreamer::EmitULEB128Value(const MCExpr *Value) { int64_t IntValue; if (Value->EvaluateAsAbsolute(IntValue, getAssembler())) { - EmitULEB128IntValue(IntValue, AddrSpace); + EmitULEB128IntValue(IntValue); return; } + Value = ForceExpAbs(Value); new MCLEBFragment(*Value, false, getCurrentSectionData()); } -void MCObjectStreamer::EmitSLEB128Value(const MCExpr *Value, - unsigned AddrSpace) { +void MCObjectStreamer::EmitSLEB128Value(const MCExpr *Value) { int64_t IntValue; if (Value->EvaluateAsAbsolute(IntValue, getAssembler())) { - EmitSLEB128IntValue(IntValue, AddrSpace); + EmitSLEB128IntValue(IntValue); return; } + Value = ForceExpAbs(Value); new MCLEBFragment(*Value, true, getCurrentSectionData()); } @@ -191,30 +188,11 @@ void MCObjectStreamer::EmitInstruction(const MCInst &Inst) { void MCObjectStreamer::EmitInstToFragment(const MCInst &Inst) { MCInstFragment *IF = new MCInstFragment(Inst, getCurrentSectionData()); - raw_svector_ostream VecOS(IF->getCode()); + SmallString<128> Code; + raw_svector_ostream VecOS(Code); 
getAssembler().getEmitter().EncodeInstruction(Inst, VecOS, IF->getFixups()); -} - -static const MCExpr *BuildSymbolDiff(MCContext &Context, - const MCSymbol *A, const MCSymbol *B) { - MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; - const MCExpr *ARef = - MCSymbolRefExpr::Create(A, Variant, Context); - const MCExpr *BRef = - MCSymbolRefExpr::Create(B, Variant, Context); - const MCExpr *AddrDelta = - MCBinaryExpr::Create(MCBinaryExpr::Sub, ARef, BRef, Context); - return AddrDelta; -} - -static const MCExpr *ForceExpAbs(MCObjectStreamer *Streamer, - MCContext &Context, const MCExpr* Expr) { - if (Context.getAsmInfo().hasAggressiveSymbolFolding()) - return Expr; - - MCSymbol *ABS = Context.CreateTempSymbol(); - Streamer->EmitAssignment(ABS, Expr); - return MCSymbolRefExpr::Create(ABS, Context); + VecOS.flush(); + IF->getCode().append(Code.begin(), Code.end()); } void MCObjectStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta, @@ -231,7 +209,7 @@ void MCObjectStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta, MCDwarfLineAddr::Emit(this, LineDelta, Res); return; } - AddrDelta = ForceExpAbs(this, getContext(), AddrDelta); + AddrDelta = ForceExpAbs(AddrDelta); new MCDwarfLineAddrFragment(LineDelta, *AddrDelta, getCurrentSectionData()); } @@ -243,7 +221,7 @@ void MCObjectStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel, MCDwarfFrameEmitter::EmitAdvanceLoc(*this, Res); return; } - AddrDelta = ForceExpAbs(this, getContext(), AddrDelta); + AddrDelta = ForceExpAbs(AddrDelta); new MCDwarfCallFrameFragment(*AddrDelta, getCurrentSectionData()); } diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp index 6bd8986..0c1f8f0 100644 --- a/lib/MC/MCParser/AsmLexer.cpp +++ b/lib/MC/MCParser/AsmLexer.cpp @@ -213,13 +213,13 @@ AsmToken AsmLexer::LexDigit() { // Requires at least one binary digit. if (CurPtr == NumStart) - return ReturnError(TokStart, "Invalid binary number"); + return ReturnError(TokStart, "invalid binary number"); StringRef Result(TokStart, CurPtr - TokStart); long long Value; if (Result.substr(2).getAsInteger(2, Value)) - return ReturnError(TokStart, "Invalid binary number"); + return ReturnError(TokStart, "invalid binary number"); // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL // suffixes on integer literals. @@ -236,11 +236,11 @@ AsmToken AsmLexer::LexDigit() { // Requires at least one hex digit. if (CurPtr == NumStart) - return ReturnError(CurPtr-2, "Invalid hexadecimal number"); + return ReturnError(CurPtr-2, "invalid hexadecimal number"); unsigned long long Result; if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result)) - return ReturnError(TokStart, "Invalid hexadecimal number"); + return ReturnError(TokStart, "invalid hexadecimal number"); // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL // suffixes on integer literals. @@ -251,13 +251,13 @@ AsmToken AsmLexer::LexDigit() { } // Must be an octal number, it starts with 0. - while (*CurPtr >= '0' && *CurPtr <= '7') + while (*CurPtr >= '0' && *CurPtr <= '9') ++CurPtr; StringRef Result(TokStart, CurPtr - TokStart); long long Value; if (Result.getAsInteger(8, Value)) - return ReturnError(TokStart, "Invalid octal number"); + return ReturnError(TokStart, "invalid octal number"); // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL // suffixes on integer literals. 
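One subtlety in the LexDigit hunk above: the octal scanner now deliberately consumes '8' and '9' as well, so a malformed literal such as 019 becomes a single token that StringRef::getAsInteger(8, ...) then rejects, producing one "invalid octal number" diagnostic instead of lexing 01 and leaving a stray 9 for the parser. A small self-contained illustration; only getAsInteger from the hunk is assumed:

#include "llvm/ADT/StringRef.h"
#include <cstdio>

// getAsInteger returns true on failure, matching the checks in LexDigit.
static void tryOctal(llvm::StringRef Tok) {
  long long Value;
  if (Tok.getAsInteger(8, Value))
    std::printf("%s: invalid octal number\n", Tok.str().c_str());
  else
    std::printf("%s: value %lld\n", Tok.str().c_str(), Value);
}

int main() {
  tryOctal("017"); // valid: prints value 15
  tryOctal("019"); // '9' is consumed into the token and rejected here
  return 0;
}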
@@ -388,6 +388,7 @@ AsmToken AsmLexer::LexToken() { case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1)); case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1)); case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1)); + case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1)); case '=': if (*CurPtr == '=') return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 09c92b8..bbd6635 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -27,6 +27,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCDwarf.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" @@ -36,15 +37,21 @@ #include <vector> using namespace llvm; +static cl::opt<bool> +FatalAssemblerWarnings("fatal-assembler-warnings", + cl::desc("Consider warnings as errors")); + namespace { /// \brief Helper class for tracking macro definitions. struct Macro { StringRef Name; StringRef Body; + std::vector<StringRef> Parameters; public: - Macro(StringRef N, StringRef B) : Name(N), Body(B) {} + Macro(StringRef N, StringRef B, const std::vector<StringRef> &P) : + Name(N), Body(B), Parameters(P) {} }; /// \brief Helper class for storing information about an active macro @@ -64,7 +71,7 @@ struct MacroInstantiation { public: MacroInstantiation(const Macro *M, SMLoc IL, SMLoc EL, - const std::vector<std::vector<AsmToken> > &A); + MemoryBuffer *I); }; /// \brief The concrete assembly parser instance. class AsmParser : public MCAsmParser { @@ -77,6 +84,7 @@ private: AsmLexer Lexer; MCContext &Ctx; MCStreamer &Out; + const MCAsmInfo &MAI; SourceMgr &SrcMgr; MCAsmParserExtension *GenericParser; MCAsmParserExtension *PlatformParser; @@ -128,7 +136,7 @@ public: virtual MCContext &getContext() { return Ctx; } virtual MCStreamer &getStreamer() { return Out; } - virtual void Warning(SMLoc L, const Twine &Meg); + virtual bool Warning(SMLoc L, const Twine &Msg); virtual bool Error(SMLoc L, const Twine &Msg); const AsmToken &Lex(); @@ -146,11 +154,16 @@ private: bool ParseStatement(); bool HandleMacroEntry(StringRef Name, SMLoc NameLoc, const Macro *M); + bool expandMacro(SmallString<256> &Buf, StringRef Body, + const std::vector<StringRef> &Parameters, + const std::vector<std::vector<AsmToken> > &A, + const SMLoc &L); void HandleMacroExit(); void PrintMacroInstantiations(); - void PrintMessage(SMLoc Loc, const Twine &Msg, const char *Type) const { - SrcMgr.PrintMessage(Loc, Msg, Type); + void PrintMessage(SMLoc Loc, const Twine &Msg, const char *Type, + bool ShowLine = true) const { + SrcMgr.PrintMessage(Loc, Msg, Type, ShowLine); } /// EnterIncludeFile - Enter the specified file. This returns true on failure. @@ -243,6 +256,8 @@ public: AddDirectiveHandler<&GenericAsmParser::ParseDirectiveStabs>(".stabs"); // CFI directives.
+ AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFISections>( + ".cfi_sections"); AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIStartProc>( ".cfi_startproc"); AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIEndProc>( @@ -251,10 +266,14 @@ public: ".cfi_def_cfa"); AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIDefCfaOffset>( ".cfi_def_cfa_offset"); + AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIAdjustCfaOffset>( + ".cfi_adjust_cfa_offset"); AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIDefCfaRegister>( ".cfi_def_cfa_register"); AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIOffset>( ".cfi_offset"); + AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIRelOffset>( + ".cfi_rel_offset"); AddDirectiveHandler< &GenericAsmParser::ParseDirectiveCFIPersonalityOrLsda>(".cfi_personality"); AddDirectiveHandler< @@ -263,6 +282,8 @@ public: &GenericAsmParser::ParseDirectiveCFIRememberState>(".cfi_remember_state"); AddDirectiveHandler< &GenericAsmParser::ParseDirectiveCFIRestoreState>(".cfi_restore_state"); + AddDirectiveHandler< + &GenericAsmParser::ParseDirectiveCFISameValue>(".cfi_same_value"); // Macro directives. AddDirectiveHandler<&GenericAsmParser::ParseDirectiveMacrosOnOff>( @@ -283,15 +304,19 @@ public: bool ParseDirectiveLine(StringRef, SMLoc DirectiveLoc); bool ParseDirectiveLoc(StringRef, SMLoc DirectiveLoc); bool ParseDirectiveStabs(StringRef, SMLoc DirectiveLoc); + bool ParseDirectiveCFISections(StringRef, SMLoc DirectiveLoc); bool ParseDirectiveCFIStartProc(StringRef, SMLoc DirectiveLoc); bool ParseDirectiveCFIEndProc(StringRef, SMLoc DirectiveLoc); bool ParseDirectiveCFIDefCfa(StringRef, SMLoc DirectiveLoc); bool ParseDirectiveCFIDefCfaOffset(StringRef, SMLoc DirectiveLoc); + bool ParseDirectiveCFIAdjustCfaOffset(StringRef, SMLoc DirectiveLoc); bool ParseDirectiveCFIDefCfaRegister(StringRef, SMLoc DirectiveLoc); bool ParseDirectiveCFIOffset(StringRef, SMLoc DirectiveLoc); + bool ParseDirectiveCFIRelOffset(StringRef, SMLoc DirectiveLoc); bool ParseDirectiveCFIPersonalityOrLsda(StringRef, SMLoc DirectiveLoc); bool ParseDirectiveCFIRememberState(StringRef, SMLoc DirectiveLoc); bool ParseDirectiveCFIRestoreState(StringRef, SMLoc DirectiveLoc); + bool ParseDirectiveCFISameValue(StringRef, SMLoc DirectiveLoc); bool ParseDirectiveMacrosOnOff(StringRef, SMLoc DirectiveLoc); bool ParseDirectiveMacro(StringRef, SMLoc DirectiveLoc); @@ -314,7 +339,7 @@ enum { DEFAULT_ADDRSPACE = 0 }; AsmParser::AsmParser(const Target &T, SourceMgr &_SM, MCContext &_Ctx, MCStreamer &_Out, const MCAsmInfo &_MAI) - : Lexer(_MAI), Ctx(_Ctx), Out(_Out), SrcMgr(_SM), + : Lexer(_MAI), Ctx(_Ctx), Out(_Out), MAI(_MAI), SrcMgr(_SM), GenericParser(new GenericAsmParser), PlatformParser(0), CurBuffer(0), MacrosEnabled(true) { Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)); @@ -358,9 +383,12 @@ void AsmParser::PrintMacroInstantiations() { "note"); } -void AsmParser::Warning(SMLoc L, const Twine &Msg) { +bool AsmParser::Warning(SMLoc L, const Twine &Msg) { + if (FatalAssemblerWarnings) + return Error(L, Msg); PrintMessage(L, Msg, "warning"); PrintMacroInstantiations(); + return false; } bool AsmParser::Error(SMLoc L, const Twine &Msg) { @@ -371,7 +399,8 @@ bool AsmParser::Error(SMLoc L, const Twine &Msg) { } bool AsmParser::EnterIncludeFile(const std::string &Filename) { - int NewBuf = SrcMgr.AddIncludeFile(Filename, Lexer.getLoc()); + std::string IncludedFile; + int NewBuf = SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile); if (NewBuf == -1) 
return true; @@ -439,6 +468,29 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) { TokError("unassigned file number: " + Twine(i) + " for .file directives"); } + // Check to see that all assembler local symbols were actually defined. + // Targets that don't do subsections via symbols may not want this, though, + // so conservatively exclude them. Only do this if we're finalizing, though, + // as otherwise we won't necessarily have seen everything yet. + if (!NoFinalize && MAI.hasSubsectionsViaSymbols()) { + const MCContext::SymbolTable &Symbols = getContext().getSymbols(); + for (MCContext::SymbolTable::const_iterator i = Symbols.begin(), + e = Symbols.end(); + i != e; ++i) { + MCSymbol *Sym = i->getValue(); + // Variable symbols may not be marked as defined, so check those + // explicitly. If we know it's a variable, we have a definition for + // the purposes of this check. + if (Sym->isTemporary() && !Sym->isVariable() && !Sym->isDefined()) + // FIXME: We would really like to refer back to where the symbol was + // first referenced for a source location. We need to add something + // to track that. Currently, we just point to the end of the file. + PrintMessage(getLexer().getLoc(), "assembler local symbol '" + + Sym->getName() + "' not defined", "error", false); + } + } + + // Finalize the output stream if there are no errors and if the client wants // us to. if (!HadError && !NoFinalize) @@ -517,6 +569,9 @@ bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { switch (Lexer.getKind()) { default: return TokError("unknown token in expression"); + // If we have an error assume that we've already handled it. + case AsmToken::Error: + return true; case AsmToken::Exclaim: Lex(); // Eat the operator. if (ParsePrimaryExpr(Res, EndLoc)) @@ -530,7 +585,7 @@ bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { StringRef Identifier; if (ParseIdentifier(Identifier)) - return false; + return true; // This is a symbol reference. std::pair<StringRef, StringRef> Split = Identifier.split('@'); @@ -1114,9 +1169,9 @@ bool AsmParser::ParseStatement() { if (!getTargetParser().ParseDirective(ID)) return false; - Warning(IDLoc, "ignoring directive for now"); + bool retval = Warning(IDLoc, "ignoring directive for now"); EatToEndOfStatement(); - return false; + return retval; } CheckForValidSection(); @@ -1159,27 +1214,33 @@ bool AsmParser::ParseStatement() { return false; } -MacroInstantiation::MacroInstantiation(const Macro *M, SMLoc IL, SMLoc EL, - const std::vector<std::vector<AsmToken> > &A) - : TheMacro(M), InstantiationLoc(IL), ExitLoc(EL) -{ - // Macro instantiation is lexical, unfortunately. We construct a new buffer - // to hold the macro body with substitutions. - SmallString<256> Buf; +bool AsmParser::expandMacro(SmallString<256> &Buf, StringRef Body, + const std::vector<StringRef> &Parameters, + const std::vector<std::vector<AsmToken> > &A, + const SMLoc &L) { raw_svector_ostream OS(Buf); + unsigned NParameters = Parameters.size(); + if (NParameters != 0 && NParameters != A.size()) + return Error(L, "Wrong number of arguments"); - StringRef Body = M->Body; while (!Body.empty()) { // Scan for the next substitution. std::size_t End = Body.size(), Pos = 0; for (; Pos != End; ++Pos) { // Check for a substitution or escape. - if (Body[Pos] != '$' || Pos + 1 == End) - continue; - - char Next = Body[Pos + 1]; - if (Next == '$' || Next == 'n' || isdigit(Next)) - break; + if (!NParameters) { + // This macro has no parameters, look for $0, $1, etc.
+ if (Body[Pos] != '$' || Pos + 1 == End) + continue; + + char Next = Body[Pos + 1]; + if (Next == '$' || Next == 'n' || isdigit(Next)) + break; + } else { + // This macro has parameters, look for \foo, \bar, etc. + if (Body[Pos] == '\\' && Pos + 1 != End) + break; + } } // Add the prefix. @@ -1189,41 +1250,69 @@ MacroInstantiation::MacroInstantiation(const Macro *M, SMLoc IL, SMLoc EL, if (Pos == End) break; - switch (Body[Pos+1]) { - // $$ => $ - case '$': - OS << '$'; - break; + if (!NParameters) { + switch (Body[Pos+1]) { + // $$ => $ + case '$': + OS << '$'; + break; - // $n => number of arguments - case 'n': - OS << A.size(); - break; + // $n => number of arguments + case 'n': + OS << A.size(); + break; - // $[0-9] => argument - default: { - // Missing arguments are ignored. - unsigned Index = Body[Pos+1] - '0'; - if (Index >= A.size()) + // $[0-9] => argument + default: { + // Missing arguments are ignored. + unsigned Index = Body[Pos+1] - '0'; + if (Index >= A.size()) + break; + + // Otherwise substitute with the token values, with spaces eliminated. + for (std::vector<AsmToken>::const_iterator it = A[Index].begin(), + ie = A[Index].end(); it != ie; ++it) + OS << it->getString(); break; + } + } + Pos += 2; + } else { + unsigned I = Pos + 1; + while (isalnum(Body[I]) && I + 1 != End) + ++I; + + const char *Begin = Body.data() + Pos +1; + StringRef Argument(Begin, I - (Pos +1)); + unsigned Index = 0; + for (; Index < NParameters; ++Index) + if (Parameters[Index] == Argument) + break; + + // FIXME: We should error at the macro definition. + if (Index == NParameters) + return Error(L, "Parameter not found"); - // Otherwise substitute with the token values, with spaces eliminated. for (std::vector<AsmToken>::const_iterator it = A[Index].begin(), ie = A[Index].end(); it != ie; ++it) OS << it->getString(); - break; - } - } + Pos += 1 + Argument.size(); + } // Update the scan point. - Body = Body.substr(Pos + 2); + Body = Body.substr(Pos); } // We include the .endmacro in the buffer as our cue to exit the macro // instantiation. OS << ".endmacro\n"; + return false; } - Instantiation = MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>"); +MacroInstantiation::MacroInstantiation(const Macro *M, SMLoc IL, SMLoc EL, + MemoryBuffer *I) + : TheMacro(M), Instantiation(I), InstantiationLoc(IL), ExitLoc(EL) +{ } bool AsmParser::HandleMacroEntry(StringRef Name, SMLoc NameLoc, @@ -1260,11 +1349,22 @@ bool AsmParser::HandleMacroEntry(StringRef Name, SMLoc NameLoc, Lex(); } + // Macro instantiation is lexical, unfortunately. We construct a new buffer + // to hold the macro body with substitutions. + SmallString<256> Buf; + StringRef Body = M->Body; + + if (expandMacro(Buf, Body, M->Parameters, MacroArguments, getTok().getLoc())) + return true; + + MemoryBuffer *Instantiation = + MemoryBuffer::getMemBufferCopy(Buf.str(), "<instantiation>"); + // Create the macro instantiation object and add to the current macro // instantiation stack. MacroInstantiation *MI = new MacroInstantiation(M, NameLoc, getTok().getLoc(), - MacroArguments); + Instantiation); ActiveMacros.push_back(MI); // Jump to the macro instantiation and prime the lexer. @@ -1336,7 +1436,7 @@ bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef) { // FIXME: Diagnose assignment to protected identifier (e.g., register name). if (Sym->isUndefined() && !Sym->isUsed() && !Sym->isVariable()) ; // Allow redefinitions of undefined symbols only used in directives.
- else if (!Sym->isUndefined() && (!Sym->isAbsolute() || !allow_redef)) + else if (!Sym->isUndefined() && (!Sym->isVariable() || !allow_redef)) return Error(EqualLoc, "redefinition of '" + Name + "'"); else if (!Sym->isVariable()) return Error(EqualLoc, "invalid assignment to '" + Name + "'"); @@ -2241,7 +2341,7 @@ bool GenericAsmParser::ParseDirectiveLoc(StringRef, SMLoc DirectiveLoc) { } getStreamer().EmitDwarfLocDirective(FileNumber, LineNumber, ColumnPos, Flags, - Isa, Discriminator); + Isa, Discriminator, StringRef()); return false; } @@ -2253,17 +2353,52 @@ bool GenericAsmParser::ParseDirectiveStabs(StringRef Directive, return TokError("unsupported directive '" + Directive + "'"); } +/// ParseDirectiveCFISections +/// ::= .cfi_sections section [, section] +bool GenericAsmParser::ParseDirectiveCFISections(StringRef, + SMLoc DirectiveLoc) { + StringRef Name; + bool EH = false; + bool Debug = false; + + if (getParser().ParseIdentifier(Name)) + return TokError("Expected an identifier"); + + if (Name == ".eh_frame") + EH = true; + else if (Name == ".debug_frame") + Debug = true; + + if (getLexer().is(AsmToken::Comma)) { + Lex(); + + if (getParser().ParseIdentifier(Name)) + return TokError("Expected an identifier"); + + if (Name == ".eh_frame") + EH = true; + else if (Name == ".debug_frame") + Debug = true; + } + + getStreamer().EmitCFISections(EH, Debug); + + return false; +} + /// ParseDirectiveCFIStartProc /// ::= .cfi_startproc bool GenericAsmParser::ParseDirectiveCFIStartProc(StringRef, SMLoc DirectiveLoc) { - return getStreamer().EmitCFIStartProc(); + getStreamer().EmitCFIStartProc(); + return false; } /// ParseDirectiveCFIEndProc /// ::= .cfi_endproc bool GenericAsmParser::ParseDirectiveCFIEndProc(StringRef, SMLoc DirectiveLoc) { - return getStreamer().EmitCFIEndProc(); + getStreamer().EmitCFIEndProc(); + return false; } /// ParseRegisterOrRegisterNumber - parse register name or number. 
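A convention worth noting in the directive parsers above: the streamer's EmitCFI* hooks are now void, so each handler consumes its operands, forwards to the streamer, and returns false itself, reserving true for parse errors. A minimal sketch of that shape, with a stand-in Streamer type rather than the real MCStreamer API:

#include <cstdio>
#include <cstdlib>

struct Streamer { // stand-in; the method name mirrors the hunks above
  void EmitCFIDefCfaOffset(long long Offset) {
    std::printf(".cfi_def_cfa_offset %lld\n", Offset);
  }
};

// Returns true only on a parse error, never based on the emit call.
static bool parseCFIDefCfaOffset(Streamer &Out, const char *Arg) {
  char *End = nullptr;
  long long Offset = std::strtoll(Arg, &End, 0);
  if (End == Arg || *End != '\0')
    return true; // malformed operand: report a parse error
  Out.EmitCFIDefCfaOffset(Offset);
  return false; // success
}

int main() {
  Streamer S;
  return parseCFIDefCfaOffset(S, "16") ? 1 : 0;
}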
@@ -2271,7 +2406,7 @@ bool GenericAsmParser::ParseRegisterOrRegisterNumber(int64_t &Register, SMLoc DirectiveLoc) { unsigned RegNo; - if (getLexer().is(AsmToken::Percent)) { + if (getLexer().isNot(AsmToken::Integer)) { if (getParser().getTargetParser().ParseRegister(RegNo, DirectiveLoc, DirectiveLoc)) return true; @@ -2298,7 +2433,8 @@ bool GenericAsmParser::ParseDirectiveCFIDefCfa(StringRef, if (getParser().ParseAbsoluteExpression(Offset)) return true; - return getStreamer().EmitCFIDefCfa(Register, Offset); + getStreamer().EmitCFIDefCfa(Register, Offset); + return false; } /// ParseDirectiveCFIDefCfaOffset @@ -2309,7 +2445,20 @@ bool GenericAsmParser::ParseDirectiveCFIDefCfaOffset(StringRef, if (getParser().ParseAbsoluteExpression(Offset)) return true; - return getStreamer().EmitCFIDefCfaOffset(Offset); + getStreamer().EmitCFIDefCfaOffset(Offset); + return false; +} + +/// ParseDirectiveCFIAdjustCfaOffset +/// ::= .cfi_adjust_cfa_offset adjustment +bool GenericAsmParser::ParseDirectiveCFIAdjustCfaOffset(StringRef, + SMLoc DirectiveLoc) { + int64_t Adjustment = 0; + if (getParser().ParseAbsoluteExpression(Adjustment)) + return true; + + getStreamer().EmitCFIAdjustCfaOffset(Adjustment); + return false; } /// ParseDirectiveCFIDefCfaRegister @@ -2320,11 +2469,12 @@ bool GenericAsmParser::ParseDirectiveCFIDefCfaRegister(StringRef, if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc)) return true; - return getStreamer().EmitCFIDefCfaRegister(Register); + getStreamer().EmitCFIDefCfaRegister(Register); + return false; } /// ParseDirectiveCFIOffset -/// ::= .cfi_off register, offset +/// ::= .cfi_offset register, offset bool GenericAsmParser::ParseDirectiveCFIOffset(StringRef, SMLoc DirectiveLoc) { int64_t Register = 0; int64_t Offset = 0; @@ -2339,7 +2489,29 @@ bool GenericAsmParser::ParseDirectiveCFIOffset(StringRef, SMLoc DirectiveLoc) { if (getParser().ParseAbsoluteExpression(Offset)) return true; - return getStreamer().EmitCFIOffset(Register, Offset); + getStreamer().EmitCFIOffset(Register, Offset); + return false; +} + +/// ParseDirectiveCFIRelOffset +/// ::= .cfi_rel_offset register, offset +bool GenericAsmParser::ParseDirectiveCFIRelOffset(StringRef, + SMLoc DirectiveLoc) { + int64_t Register = 0; + + if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc)) + return true; + + if (getLexer().isNot(AsmToken::Comma)) + return TokError("unexpected token in directive"); + Lex(); + + int64_t Offset = 0; + if (getParser().ParseAbsoluteExpression(Offset)) + return true; + + getStreamer().EmitCFIRelOffset(Register, Offset); + return false; } static bool isValidEncoding(int64_t Encoding) { @@ -2389,25 +2561,42 @@ bool GenericAsmParser::ParseDirectiveCFIPersonalityOrLsda(StringRef IDVal, MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); if (IDVal == ".cfi_personality") - return getStreamer().EmitCFIPersonality(Sym, Encoding); + getStreamer().EmitCFIPersonality(Sym, Encoding); else { assert(IDVal == ".cfi_lsda"); - return getStreamer().EmitCFILsda(Sym, Encoding); + getStreamer().EmitCFILsda(Sym, Encoding); } + return false; } /// ParseDirectiveCFIRememberState /// ::= .cfi_remember_state bool GenericAsmParser::ParseDirectiveCFIRememberState(StringRef IDVal, SMLoc DirectiveLoc) { - return getStreamer().EmitCFIRememberState(); + getStreamer().EmitCFIRememberState(); + return false; } /// ParseDirectiveCFIRestoreState /// ::= .cfi_restore_state bool GenericAsmParser::ParseDirectiveCFIRestoreState(StringRef IDVal, SMLoc DirectiveLoc) { - return getStreamer().EmitCFIRestoreState(); +
getStreamer().EmitCFIRestoreState(); + return false; +} + +/// ParseDirectiveCFISameValue +/// ::= .cfi_same_value register +bool GenericAsmParser::ParseDirectiveCFISameValue(StringRef IDVal, + SMLoc DirectiveLoc) { + int64_t Register = 0; + + if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc)) + return true; + + getStreamer().EmitCFISameValue(Register); + + return false; } /// ParseDirectiveMacrosOnOff @@ -2425,13 +2614,27 @@ bool GenericAsmParser::ParseDirectiveMacrosOnOff(StringRef Directive, } /// ParseDirectiveMacro -/// ::= .macro name +/// ::= .macro name [parameters] bool GenericAsmParser::ParseDirectiveMacro(StringRef Directive, SMLoc DirectiveLoc) { StringRef Name; if (getParser().ParseIdentifier(Name)) return TokError("expected identifier in directive"); + std::vector<StringRef> Parameters; + if (getLexer().isNot(AsmToken::EndOfStatement)) { + for(;;) { + StringRef Parameter; + if (getParser().ParseIdentifier(Parameter)) + return TokError("expected identifier in directive"); + Parameters.push_back(Parameter); + + if (getLexer().isNot(AsmToken::Comma)) + break; + Lex(); + } + } + if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.macro' directive"); @@ -2469,7 +2672,7 @@ bool GenericAsmParser::ParseDirectiveMacro(StringRef Directive, const char *BodyStart = StartToken.getLoc().getPointer(); const char *BodyEnd = EndToken.getLoc().getPointer(); StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart); - getParser().MacroMap[Name] = new Macro(Name, Body); + getParser().MacroMap[Name] = new Macro(Name, Body, Parameters); return false; } diff --git a/lib/MC/MCParser/COFFAsmParser.cpp b/lib/MC/MCParser/COFFAsmParser.cpp index 5ecab03..64f6355 100644 --- a/lib/MC/MCParser/COFFAsmParser.cpp +++ b/lib/MC/MCParser/COFFAsmParser.cpp @@ -14,6 +14,9 @@ #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/Target/TargetAsmInfo.h" +#include "llvm/Target/TargetAsmParser.h" #include "llvm/Support/COFF.h" using namespace llvm; @@ -41,6 +44,34 @@ class COFFAsmParser : public MCAsmParserExtension { AddDirectiveHandler<&COFFAsmParser::ParseDirectiveScl>(".scl"); AddDirectiveHandler<&COFFAsmParser::ParseDirectiveType>(".type"); AddDirectiveHandler<&COFFAsmParser::ParseDirectiveEndef>(".endef"); + + // Win64 EH directives. 
+ AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveStartProc>( + ".seh_proc"); + AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveEndProc>( + ".seh_endproc"); + AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveStartChained>( + ".seh_startchained"); + AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveEndChained>( + ".seh_endchained"); + AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveHandler>( + ".seh_handler"); + AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveHandlerData>( + ".seh_handlerdata"); + AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectivePushReg>( + ".seh_pushreg"); + AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveSetFrame>( + ".seh_setframe"); + AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveAllocStack>( + ".seh_stackalloc"); + AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveSaveReg>( + ".seh_savereg"); + AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveSaveXMM>( + ".seh_savexmm"); + AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectivePushFrame>( + ".seh_pushframe"); + AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveEndProlog>( + ".seh_endprologue"); } bool ParseSectionDirectiveText(StringRef, SMLoc) { @@ -70,6 +101,23 @@ class COFFAsmParser : public MCAsmParserExtension { bool ParseDirectiveType(StringRef, SMLoc); bool ParseDirectiveEndef(StringRef, SMLoc); + // Win64 EH directives. + bool ParseSEHDirectiveStartProc(StringRef, SMLoc); + bool ParseSEHDirectiveEndProc(StringRef, SMLoc); + bool ParseSEHDirectiveStartChained(StringRef, SMLoc); + bool ParseSEHDirectiveEndChained(StringRef, SMLoc); + bool ParseSEHDirectiveHandler(StringRef, SMLoc); + bool ParseSEHDirectiveHandlerData(StringRef, SMLoc); + bool ParseSEHDirectivePushReg(StringRef, SMLoc); + bool ParseSEHDirectiveSetFrame(StringRef, SMLoc); + bool ParseSEHDirectiveAllocStack(StringRef, SMLoc); + bool ParseSEHDirectiveSaveReg(StringRef, SMLoc); + bool ParseSEHDirectiveSaveXMM(StringRef, SMLoc); + bool ParseSEHDirectivePushFrame(StringRef, SMLoc); + bool ParseSEHDirectiveEndProlog(StringRef, SMLoc); + + bool ParseAtUnwindOrAtExcept(bool &unwind, bool &except); + bool ParseSEHRegisterNumber(unsigned &RegNo); public: COFFAsmParser() {} }; @@ -135,6 +183,256 @@ bool COFFAsmParser::ParseDirectiveEndef(StringRef, SMLoc) { return false; } +bool COFFAsmParser::ParseSEHDirectiveStartProc(StringRef, SMLoc) { + StringRef SymbolID; + if (getParser().ParseIdentifier(SymbolID)) + return true; + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + MCSymbol *Symbol = getContext().GetOrCreateSymbol(SymbolID); + + Lex(); + getStreamer().EmitWin64EHStartProc(Symbol); + return false; +} + +bool COFFAsmParser::ParseSEHDirectiveEndProc(StringRef, SMLoc) { + Lex(); + getStreamer().EmitWin64EHEndProc(); + return false; +} + +bool COFFAsmParser::ParseSEHDirectiveStartChained(StringRef, SMLoc) { + Lex(); + getStreamer().EmitWin64EHStartChained(); + return false; +} + +bool COFFAsmParser::ParseSEHDirectiveEndChained(StringRef, SMLoc) { + Lex(); + getStreamer().EmitWin64EHEndChained(); + return false; +} + +bool COFFAsmParser::ParseSEHDirectiveHandler(StringRef, SMLoc) { + StringRef SymbolID; + if (getParser().ParseIdentifier(SymbolID)) + return true; + + if (getLexer().isNot(AsmToken::Comma)) + return TokError("you must specify one or both of @unwind or @except"); + Lex(); + bool unwind = false, except = false; + if (ParseAtUnwindOrAtExcept(unwind, except)) + return true; + if (getLexer().is(AsmToken::Comma)) { + Lex(); + if 
(ParseAtUnwindOrAtExcept(unwind, except)) + return true; + } + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + MCSymbol *handler = getContext().GetOrCreateSymbol(SymbolID); + + Lex(); + getStreamer().EmitWin64EHHandler(handler, unwind, except); + return false; +} + +bool COFFAsmParser::ParseSEHDirectiveHandlerData(StringRef, SMLoc) { + Lex(); + getStreamer().EmitWin64EHHandlerData(); + return false; +} + +bool COFFAsmParser::ParseSEHDirectivePushReg(StringRef, SMLoc L) { + unsigned Reg; + if (ParseSEHRegisterNumber(Reg)) + return true; + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + Lex(); + getStreamer().EmitWin64EHPushReg(Reg); + return false; +} + +bool COFFAsmParser::ParseSEHDirectiveSetFrame(StringRef, SMLoc L) { + unsigned Reg; + int64_t Off; + if (ParseSEHRegisterNumber(Reg)) + return true; + if (getLexer().isNot(AsmToken::Comma)) + return TokError("you must specify a stack pointer offset"); + + Lex(); + SMLoc startLoc = getLexer().getLoc(); + if (getParser().ParseAbsoluteExpression(Off)) + return true; + + if (Off & 0x0F) + return Error(startLoc, "offset is not a multiple of 16"); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + Lex(); + getStreamer().EmitWin64EHSetFrame(Reg, Off); + return false; +} + +bool COFFAsmParser::ParseSEHDirectiveAllocStack(StringRef, SMLoc) { + int64_t Size; + SMLoc startLoc = getLexer().getLoc(); + if (getParser().ParseAbsoluteExpression(Size)) + return true; + + if (Size & 7) + return Error(startLoc, "size is not a multiple of 8"); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + Lex(); + getStreamer().EmitWin64EHAllocStack(Size); + return false; +} + +bool COFFAsmParser::ParseSEHDirectiveSaveReg(StringRef, SMLoc L) { + unsigned Reg; + int64_t Off; + if (ParseSEHRegisterNumber(Reg)) + return true; + if (getLexer().isNot(AsmToken::Comma)) + return TokError("you must specify an offset on the stack"); + + Lex(); + SMLoc startLoc = getLexer().getLoc(); + if (getParser().ParseAbsoluteExpression(Off)) + return true; + + if (Off & 7) + return Error(startLoc, "offset is not a multiple of 8"); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + Lex(); + // FIXME: Err on %xmm* registers + getStreamer().EmitWin64EHSaveReg(Reg, Off); + return false; +} + +// FIXME: This method is inherently x86-specific. It should really be in the +// x86 backend.
+bool COFFAsmParser::ParseSEHDirectiveSaveXMM(StringRef, SMLoc L) { + unsigned Reg; + int64_t Off; + if (ParseSEHRegisterNumber(Reg)) + return true; + if (getLexer().isNot(AsmToken::Comma)) + return TokError("you must specify an offset on the stack"); + + Lex(); + SMLoc startLoc = getLexer().getLoc(); + if (getParser().ParseAbsoluteExpression(Off)) + return true; + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + if (Off & 0x0F) + return Error(startLoc, "offset is not a multiple of 16"); + + Lex(); + // FIXME: Err on non-%xmm* registers + getStreamer().EmitWin64EHSaveXMM(Reg, Off); + return false; +} + +bool COFFAsmParser::ParseSEHDirectivePushFrame(StringRef, SMLoc) { + bool Code = false; + StringRef CodeID; + if (getLexer().is(AsmToken::At)) { + SMLoc startLoc = getLexer().getLoc(); + Lex(); + if (!getParser().ParseIdentifier(CodeID)) { + if (CodeID != "code") + return Error(startLoc, "expected @code"); + Code = true; + } + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + Lex(); + getStreamer().EmitWin64EHPushFrame(Code); + return false; +} + +bool COFFAsmParser::ParseSEHDirectiveEndProlog(StringRef, SMLoc) { + Lex(); + getStreamer().EmitWin64EHEndProlog(); + return false; +} + +bool COFFAsmParser::ParseAtUnwindOrAtExcept(bool &unwind, bool &except) { + StringRef identifier; + if (getLexer().isNot(AsmToken::At)) + return TokError("a handler attribute must begin with '@'"); + SMLoc startLoc = getLexer().getLoc(); + Lex(); + if (getParser().ParseIdentifier(identifier)) + return Error(startLoc, "expected @unwind or @except"); + if (identifier == "unwind") + unwind = true; + else if (identifier == "except") + except = true; + else + return Error(startLoc, "expected @unwind or @except"); + return false; +} + +bool COFFAsmParser::ParseSEHRegisterNumber(unsigned &RegNo) { + SMLoc startLoc = getLexer().getLoc(); + if (getLexer().is(AsmToken::Percent)) { + const TargetAsmInfo &asmInfo = getContext().getTargetAsmInfo(); + SMLoc endLoc; + unsigned LLVMRegNo; + if (getParser().getTargetParser().ParseRegister(LLVMRegNo,startLoc,endLoc)) + return true; + + // Check that this is a non-volatile register. + const unsigned *NVRegs = asmInfo.getCalleeSavedRegs(); + unsigned i; + for (i = 0; NVRegs[i] != 0; ++i) + if (NVRegs[i] == LLVMRegNo) + break; + if (NVRegs[i] == 0) + return Error(startLoc, "expected non-volatile register"); + + int SEHRegNo = asmInfo.getSEHRegNum(LLVMRegNo); + if (SEHRegNo < 0) + return Error(startLoc,"register can't be represented in SEH unwind info"); + RegNo = SEHRegNo; + } + else { + int64_t n; + if (getParser().ParseAbsoluteExpression(n)) + return true; + if (n > 15) + return Error(startLoc, "register number is too high"); + RegNo = n; + } + + return false; +} + namespace llvm { MCAsmParserExtension *createCOFFAsmParser() { diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp index 3c092cd..6f45068 100644 --- a/lib/MC/MCParser/DarwinAsmParser.cpp +++ b/lib/MC/MCParser/DarwinAsmParser.cpp @@ -369,11 +369,9 @@ bool DarwinAsmParser::ParseDirectiveDumpOrLoad(StringRef Directive, // FIXME: If/when .dump and .load are implemented they will be done in the // assembly parser and not have any need for an MCStreamer API.
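+ // (Returning the result of Warning() is intended to keep these directives + // non-fatal, matching the old explicit 'return false'.)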
if (IsDump) - Warning(IDLoc, "ignoring directive .dump for now"); + return Warning(IDLoc, "ignoring directive .dump for now"); else - Warning(IDLoc, "ignoring directive .load for now"); - - return false; + return Warning(IDLoc, "ignoring directive .load for now"); } /// ParseDirectiveLsym diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp index 1bd287b..ae3ed0f 100644 --- a/lib/MC/MCStreamer.cpp +++ b/lib/MC/MCStreamer.cpp @@ -12,19 +12,48 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetAsmInfo.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/Twine.h" #include <cstdlib> using namespace llvm; -MCStreamer::MCStreamer(MCContext &Ctx) : Context(Ctx) { +MCStreamer::MCStreamer(MCContext &Ctx) : Context(Ctx), EmitEHFrame(true), + EmitDebugFrame(false), + CurrentW64UnwindInfo(0) { const MCSection *section = NULL; SectionStack.push_back(std::make_pair(section, section)); } MCStreamer::~MCStreamer() { + for (unsigned i = 0; i < getNumW64UnwindInfos(); ++i) + delete W64UnwindInfos[i]; +} + +const MCExpr *MCStreamer::BuildSymbolDiff(MCContext &Context, + const MCSymbol *A, + const MCSymbol *B) { + MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; + const MCExpr *ARef = + MCSymbolRefExpr::Create(A, Variant, Context); + const MCExpr *BRef = + MCSymbolRefExpr::Create(B, Variant, Context); + const MCExpr *AddrDelta = + MCBinaryExpr::Create(MCBinaryExpr::Sub, ARef, BRef, Context); + return AddrDelta; +} + +const MCExpr *MCStreamer::ForceExpAbs(const MCExpr* Expr) { + if (Context.getAsmInfo().hasAggressiveSymbolFolding() || + isa<MCSymbolRefExpr>(Expr)) + return Expr; + + MCSymbol *ABS = Context.CreateTempSymbol(); + EmitAssignment(ABS, Expr); + return MCSymbolRefExpr::Create(ABS, Context); } raw_ostream &MCStreamer::GetCommentOS() { @@ -52,9 +81,11 @@ void MCStreamer::EmitIntValue(uint64_t Value, unsigned Size, assert((isUIntN(8 * Size, Value) || isIntN(8 * Size, Value)) && "Invalid size"); char buf[8]; - // FIXME: Endianness assumption. - for (unsigned i = 0; i != Size; ++i) - buf[i] = uint8_t(Value >> (i * 8)); + const bool isLittleEndian = Context.getTargetAsmInfo().isLittleEndian(); + for (unsigned i = 0; i != Size; ++i) { + unsigned index = isLittleEndian ? 
i : (Size - i - 1); + buf[i] = uint8_t(Value >> (index * 8)); + } EmitBytes(StringRef(buf, Size), AddrSpace); } @@ -78,42 +109,22 @@ void MCStreamer::EmitSLEB128IntValue(int64_t Value, unsigned AddrSpace) { void MCStreamer::EmitAbsValue(const MCExpr *Value, unsigned Size, unsigned AddrSpace) { - if (getContext().getAsmInfo().hasAggressiveSymbolFolding()) { - EmitValue(Value, Size, AddrSpace); - return; - } - MCSymbol *ABS = getContext().CreateTempSymbol(); - EmitAssignment(ABS, Value); - EmitSymbolValue(ABS, Size, AddrSpace); + const MCExpr *ABS = ForceExpAbs(Value); + EmitValue(ABS, Size, AddrSpace); } void MCStreamer::EmitValue(const MCExpr *Value, unsigned Size, unsigned AddrSpace) { - EmitValueImpl(Value, Size, false, AddrSpace); -} - -void MCStreamer::EmitPCRelValue(const MCExpr *Value, unsigned Size, - unsigned AddrSpace) { - EmitValueImpl(Value, Size, true, AddrSpace); + EmitValueImpl(Value, Size, AddrSpace); } void MCStreamer::EmitSymbolValue(const MCSymbol *Sym, unsigned Size, - bool isPCRel, unsigned AddrSpace) { - EmitValueImpl(MCSymbolRefExpr::Create(Sym, getContext()), Size, isPCRel, + unsigned AddrSpace) { + EmitValueImpl(MCSymbolRefExpr::Create(Sym, getContext()), Size, AddrSpace); } -void MCStreamer::EmitSymbolValue(const MCSymbol *Sym, unsigned Size, - unsigned AddrSpace) { - EmitSymbolValue(Sym, Size, false, AddrSpace); -} - -void MCStreamer::EmitPCRelSymbolValue(const MCSymbol *Sym, unsigned Size, - unsigned AddrSpace) { - EmitSymbolValue(Sym, Size, true, AddrSpace); -} - void MCStreamer::EmitGPRel32Value(const MCExpr *Value) { report_fatal_error("unsupported directive in streamer"); } @@ -135,7 +146,8 @@ bool MCStreamer::EmitDwarfFileDirective(unsigned FileNo, void MCStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line, unsigned Column, unsigned Flags, unsigned Isa, - unsigned Discriminator) { + unsigned Discriminator, + StringRef FileName) { getContext().setCurrentDwarfLoc(FileNo, Line, Column, Flags, Isa, Discriminator); } @@ -152,28 +164,45 @@ void MCStreamer::EnsureValidFrame() { report_fatal_error("No open frame"); } -bool MCStreamer::EmitCFIStartProc() { +void MCStreamer::EmitEHSymAttributes(const MCSymbol *Symbol, + MCSymbol *EHSymbol) { +} + +void MCStreamer::EmitLabel(MCSymbol *Symbol) { + assert(!Symbol->isVariable() && "Cannot emit a variable symbol!"); + assert(getCurrentSection() && "Cannot emit before setting section!"); + Symbol->setSection(*getCurrentSection()); + + StringRef Prefix = getContext().getAsmInfo().getPrivateGlobalPrefix(); + if (!Symbol->getName().startswith(Prefix)) + LastNonPrivate = Symbol; +} + +void MCStreamer::EmitCFISections(bool EH, bool Debug) { + assert(EH || Debug); + EmitEHFrame = EH; + EmitDebugFrame = Debug; +} + +void MCStreamer::EmitCFIStartProc() { MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); - if (CurFrame && !CurFrame->End) { + if (CurFrame && !CurFrame->End) report_fatal_error("Starting a frame before finishing the previous one!"); - return true; - } MCDwarfFrameInfo Frame; Frame.Begin = getContext().CreateTempSymbol(); + Frame.Function = LastNonPrivate; EmitLabel(Frame.Begin); FrameInfos.push_back(Frame); - return false; } -bool MCStreamer::EmitCFIEndProc() { +void MCStreamer::EmitCFIEndProc() { EnsureValidFrame(); MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); CurFrame->End = getContext().CreateTempSymbol(); EmitLabel(CurFrame->End); - return false; } -bool MCStreamer::EmitCFIDefCfa(int64_t Register, int64_t Offset) { +void MCStreamer::EmitCFIDefCfa(int64_t Register, int64_t Offset) { 
EnsureValidFrame(); MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); MCSymbol *Label = getContext().CreateTempSymbol(); @@ -182,10 +211,9 @@ bool MCStreamer::EmitCFIDefCfa(int64_t Register, int64_t Offset) { MachineLocation Source(Register, -Offset); MCCFIInstruction Instruction(Label, Dest, Source); CurFrame->Instructions.push_back(Instruction); - return false; } -bool MCStreamer::EmitCFIDefCfaOffset(int64_t Offset) { +void MCStreamer::EmitCFIDefCfaOffset(int64_t Offset) { EnsureValidFrame(); MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); MCSymbol *Label = getContext().CreateTempSymbol(); @@ -194,10 +222,20 @@ bool MCStreamer::EmitCFIDefCfaOffset(int64_t Offset) { MachineLocation Source(MachineLocation::VirtualFP, -Offset); MCCFIInstruction Instruction(Label, Dest, Source); CurFrame->Instructions.push_back(Instruction); - return false; } -bool MCStreamer::EmitCFIDefCfaRegister(int64_t Register) { +void MCStreamer::EmitCFIAdjustCfaOffset(int64_t Adjustment) { + EnsureValidFrame(); + MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + MCSymbol *Label = getContext().CreateTempSymbol(); + EmitLabel(Label); + MachineLocation Dest(MachineLocation::VirtualFP); + MachineLocation Source(MachineLocation::VirtualFP, Adjustment); + MCCFIInstruction Instruction(MCCFIInstruction::RelMove, Label, Dest, Source); + CurFrame->Instructions.push_back(Instruction); +} + +void MCStreamer::EmitCFIDefCfaRegister(int64_t Register) { EnsureValidFrame(); MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); MCSymbol *Label = getContext().CreateTempSymbol(); @@ -206,10 +244,9 @@ bool MCStreamer::EmitCFIDefCfaRegister(int64_t Register) { MachineLocation Source(MachineLocation::VirtualFP); MCCFIInstruction Instruction(Label, Dest, Source); CurFrame->Instructions.push_back(Instruction); - return false; } -bool MCStreamer::EmitCFIOffset(int64_t Register, int64_t Offset) { +void MCStreamer::EmitCFIOffset(int64_t Register, int64_t Offset) { EnsureValidFrame(); MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); MCSymbol *Label = getContext().CreateTempSymbol(); @@ -218,37 +255,44 @@ bool MCStreamer::EmitCFIOffset(int64_t Register, int64_t Offset) { MachineLocation Source(Register, Offset); MCCFIInstruction Instruction(Label, Dest, Source); CurFrame->Instructions.push_back(Instruction); - return false; } -bool MCStreamer::EmitCFIPersonality(const MCSymbol *Sym, +void MCStreamer::EmitCFIRelOffset(int64_t Register, int64_t Offset) { + EnsureValidFrame(); + MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + MCSymbol *Label = getContext().CreateTempSymbol(); + EmitLabel(Label); + MachineLocation Dest(Register, Offset); + MachineLocation Source(Register, Offset); + MCCFIInstruction Instruction(MCCFIInstruction::RelMove, Label, Dest, Source); + CurFrame->Instructions.push_back(Instruction); +} + +void MCStreamer::EmitCFIPersonality(const MCSymbol *Sym, unsigned Encoding) { EnsureValidFrame(); MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); CurFrame->Personality = Sym; CurFrame->PersonalityEncoding = Encoding; - return false; } -bool MCStreamer::EmitCFILsda(const MCSymbol *Sym, unsigned Encoding) { +void MCStreamer::EmitCFILsda(const MCSymbol *Sym, unsigned Encoding) { EnsureValidFrame(); MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); CurFrame->Lsda = Sym; CurFrame->LsdaEncoding = Encoding; - return false; } -bool MCStreamer::EmitCFIRememberState() { +void MCStreamer::EmitCFIRememberState() { EnsureValidFrame(); MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); MCSymbol *Label = 
getContext().CreateTempSymbol(); EmitLabel(Label); MCCFIInstruction Instruction(MCCFIInstruction::Remember, Label); CurFrame->Instructions.push_back(Instruction); - return false; } -bool MCStreamer::EmitCFIRestoreState() { +void MCStreamer::EmitCFIRestoreState() { // FIXME: Error if there is no matching cfi_remember_state. EnsureValidFrame(); MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); @@ -256,7 +300,165 @@ bool MCStreamer::EmitCFIRestoreState() { EmitLabel(Label); MCCFIInstruction Instruction(MCCFIInstruction::Restore, Label); CurFrame->Instructions.push_back(Instruction); - return false; +} + +void MCStreamer::EmitCFISameValue(int64_t Register) { + EnsureValidFrame(); + MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); + MCSymbol *Label = getContext().CreateTempSymbol(); + EmitLabel(Label); + MCCFIInstruction Instruction(MCCFIInstruction::SameValue, Label, Register); + CurFrame->Instructions.push_back(Instruction); +} + +void MCStreamer::setCurrentW64UnwindInfo(MCWin64EHUnwindInfo *Frame) { + W64UnwindInfos.push_back(Frame); + CurrentW64UnwindInfo = W64UnwindInfos.back(); +} + +void MCStreamer::EnsureValidW64UnwindInfo() { + MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; + if (!CurFrame || CurFrame->End) + report_fatal_error("No open Win64 EH frame function!"); +} + +void MCStreamer::EmitWin64EHStartProc(const MCSymbol *Symbol) { + MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; + if (CurFrame && !CurFrame->End) + report_fatal_error("Starting a function before ending the previous one!"); + MCWin64EHUnwindInfo *Frame = new MCWin64EHUnwindInfo; + Frame->Begin = getContext().CreateTempSymbol(); + Frame->Function = Symbol; + EmitLabel(Frame->Begin); + setCurrentW64UnwindInfo(Frame); +} + +void MCStreamer::EmitWin64EHEndProc() { + EnsureValidW64UnwindInfo(); + MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; + if (CurFrame->ChainedParent) + report_fatal_error("Not all chained regions terminated!"); + CurFrame->End = getContext().CreateTempSymbol(); + EmitLabel(CurFrame->End); +} + +void MCStreamer::EmitWin64EHStartChained() { + EnsureValidW64UnwindInfo(); + MCWin64EHUnwindInfo *Frame = new MCWin64EHUnwindInfo; + MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; + Frame->Begin = getContext().CreateTempSymbol(); + Frame->Function = CurFrame->Function; + Frame->ChainedParent = CurFrame; + EmitLabel(Frame->Begin); + setCurrentW64UnwindInfo(Frame); +} + +void MCStreamer::EmitWin64EHEndChained() { + EnsureValidW64UnwindInfo(); + MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; + if (!CurFrame->ChainedParent) + report_fatal_error("End of a chained region outside a chained region!"); + CurFrame->End = getContext().CreateTempSymbol(); + EmitLabel(CurFrame->End); + CurrentW64UnwindInfo = CurFrame->ChainedParent; +} + +void MCStreamer::EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind, + bool Except) { + EnsureValidW64UnwindInfo(); + MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; + if (CurFrame->ChainedParent) + report_fatal_error("Chained unwind areas can't have handlers!"); + CurFrame->ExceptionHandler = Sym; + if (!Except && !Unwind) + report_fatal_error("Don't know what kind of handler this is!"); + if (Unwind) + CurFrame->HandlesUnwind = true; + if (Except) + CurFrame->HandlesExceptions = true; +} + +void MCStreamer::EmitWin64EHHandlerData() { + EnsureValidW64UnwindInfo(); + MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; + if (CurFrame->ChainedParent) + report_fatal_error("Chained unwind areas can't have handlers!"); +} + +void 
MCStreamer::EmitWin64EHPushReg(unsigned Register) { + EnsureValidW64UnwindInfo(); + MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; + MCSymbol *Label = getContext().CreateTempSymbol(); + MCWin64EHInstruction Inst(Win64EH::UOP_PushNonVol, Label, Register); + EmitLabel(Label); + CurFrame->Instructions.push_back(Inst); +} + +void MCStreamer::EmitWin64EHSetFrame(unsigned Register, unsigned Offset) { + EnsureValidW64UnwindInfo(); + MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; + if (CurFrame->LastFrameInst >= 0) + report_fatal_error("Frame register and offset already specified!"); + if (Offset & 0x0F) + report_fatal_error("Misaligned frame pointer offset!"); + MCWin64EHInstruction Inst(Win64EH::UOP_SetFPReg, NULL, Register, Offset); + CurFrame->LastFrameInst = CurFrame->Instructions.size(); + CurFrame->Instructions.push_back(Inst); +} + +void MCStreamer::EmitWin64EHAllocStack(unsigned Size) { + EnsureValidW64UnwindInfo(); + if (Size & 7) + report_fatal_error("Misaligned stack allocation!"); + MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; + MCSymbol *Label = getContext().CreateTempSymbol(); + MCWin64EHInstruction Inst(Label, Size); + EmitLabel(Label); + CurFrame->Instructions.push_back(Inst); +} + +void MCStreamer::EmitWin64EHSaveReg(unsigned Register, unsigned Offset) { + EnsureValidW64UnwindInfo(); + if (Offset & 7) + report_fatal_error("Misaligned saved register offset!"); + MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; + MCSymbol *Label = getContext().CreateTempSymbol(); + MCWin64EHInstruction Inst( + Offset > 512*1024-8 ? Win64EH::UOP_SaveNonVolBig : Win64EH::UOP_SaveNonVol, + Label, Register, Offset); + EmitLabel(Label); + CurFrame->Instructions.push_back(Inst); +} + +void MCStreamer::EmitWin64EHSaveXMM(unsigned Register, unsigned Offset) { + EnsureValidW64UnwindInfo(); + if (Offset & 0x0F) + report_fatal_error("Misaligned saved vector register offset!"); + MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; + MCSymbol *Label = getContext().CreateTempSymbol(); + MCWin64EHInstruction Inst( + Offset > 512*1024-16 ? 
Win64EH::UOP_SaveXMM128Big : Win64EH::UOP_SaveXMM128, + Label, Register, Offset); + EmitLabel(Label); + CurFrame->Instructions.push_back(Inst); +} + +void MCStreamer::EmitWin64EHPushFrame(bool Code) { + EnsureValidW64UnwindInfo(); + MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; + if (CurFrame->Instructions.size() > 0) + report_fatal_error("If present, PushMachFrame must be the first UOP"); + MCSymbol *Label = getContext().CreateTempSymbol(); + MCWin64EHInstruction Inst(Win64EH::UOP_PushMachFrame, Label, Code); + EmitLabel(Label); + CurFrame->Instructions.push_back(Inst); +} + +void MCStreamer::EmitWin64EHEndProlog() { + EnsureValidW64UnwindInfo(); + MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo; + CurFrame->PrologEnd = getContext().CreateTempSymbol(); + EmitLabel(CurFrame->PrologEnd); } void MCStreamer::EmitFnStart() { @@ -313,3 +515,21 @@ void MCStreamer::EmitRawText(const Twine &T) { T.toVector(Str); EmitRawText(Str.str()); } + +void MCStreamer::EmitFrames(bool usingCFI) { + if (!getNumFrameInfos()) + return; + + if (EmitEHFrame) + MCDwarfFrameEmitter::Emit(*this, usingCFI, true); + + if (EmitDebugFrame) + MCDwarfFrameEmitter::Emit(*this, usingCFI, false); +} + +void MCStreamer::EmitW64Tables() { + if (!getNumW64UnwindInfos()) + return; + + MCWin64EHUnwindEmitter::Emit(*this); +} diff --git a/lib/MC/MCSymbol.cpp b/lib/MC/MCSymbol.cpp index 1c71f26..c2fad167 100644 --- a/lib/MC/MCSymbol.cpp +++ b/lib/MC/MCSymbol.cpp @@ -58,9 +58,13 @@ void MCSymbol::setVariableValue(const MCExpr *Value) { "Invalid redefinition!"); this->Value = Value; - // Mark the variable as absolute as appropriate. - if (isa<MCConstantExpr>(Value)) - setAbsolute(); + // Variables should always be marked as in the same "section" as the value. + const MCSection *Section = Value->FindAssociatedSection(); + if (Section) { + setSection(*Section); + } else { + setUndefined(); + } } void MCSymbol::print(raw_ostream &OS) const { diff --git a/lib/MC/MCWin64EH.cpp b/lib/MC/MCWin64EH.cpp new file mode 100644 index 0000000..9453f5c --- /dev/null +++ b/lib/MC/MCWin64EH.cpp @@ -0,0 +1,258 @@ +//===- lib/MC/MCWin64EH.cpp - MCWin64EH implementation --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCWin64EH.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCSectionCOFF.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/Target/TargetAsmInfo.h" + +namespace llvm { + +// NOTE: All relocations generated here are 4-byte image-relative. 
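+// For orientation, the UNWIND_INFO image serialized below is approximately: +//   uint8_t  Version : 3, Flags : 5; +//   uint8_t  SizeOfProlog; +//   uint8_t  CountOfCodes; +//   uint8_t  FrameRegister : 4, FrameOffset : 4;  // offset scaled by 16 +//   uint16_t UnwindCodes[CountOfCodes];           // 2-byte slots +// followed by a chained RUNTIME_FUNCTION or a 4-byte handler address, padded +// to a multiple of 8 bytes. (Field names here are descriptive, not taken +// from the Windows SDK headers.)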
+ +static uint8_t CountOfUnwindCodes(std::vector<MCWin64EHInstruction> &instArray){ + uint8_t count = 0; + for (std::vector<MCWin64EHInstruction>::const_iterator I = instArray.begin(), + E = instArray.end(); I != E; ++I) { + switch (I->getOperation()) { + case Win64EH::UOP_PushNonVol: + case Win64EH::UOP_AllocSmall: + case Win64EH::UOP_SetFPReg: + case Win64EH::UOP_PushMachFrame: + count += 1; + break; + case Win64EH::UOP_SaveNonVol: + case Win64EH::UOP_SaveXMM128: + count += 2; + break; + case Win64EH::UOP_SaveNonVolBig: + case Win64EH::UOP_SaveXMM128Big: + count += 3; + break; + case Win64EH::UOP_AllocLarge: + if (I->getSize() > 512*1024-8) + count += 3; + else + count += 2; + break; + } + } + return count; +} + +static void EmitAbsDifference(MCStreamer &streamer, MCSymbol *lhs, + MCSymbol *rhs) { + MCContext &context = streamer.getContext(); + const MCExpr *diff = MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create( + lhs, context), + MCSymbolRefExpr::Create( + rhs, context), + context); + streamer.EmitAbsValue(diff, 1); + +} + +static void EmitUnwindCode(MCStreamer &streamer, MCSymbol *begin, + MCWin64EHInstruction &inst) { + uint8_t b1, b2; + uint16_t w; + b2 = (inst.getOperation() & 0x0F); + switch (inst.getOperation()) { + case Win64EH::UOP_PushNonVol: + EmitAbsDifference(streamer, inst.getLabel(), begin); + b2 |= (inst.getRegister() & 0x0F) << 4; + streamer.EmitIntValue(b2, 1); + break; + case Win64EH::UOP_AllocLarge: + EmitAbsDifference(streamer, inst.getLabel(), begin); + if (inst.getSize() > 512*1024-8) { + b2 |= 0x10; + streamer.EmitIntValue(b2, 1); + w = inst.getSize() & 0xFFF8; + streamer.EmitIntValue(w, 2); + w = inst.getSize() >> 16; + } else { + streamer.EmitIntValue(b2, 1); + w = inst.getSize() >> 3; + } + streamer.EmitIntValue(w, 2); + break; + case Win64EH::UOP_AllocSmall: + b2 |= (((inst.getSize()-8) >> 3) & 0x0F) << 4; + EmitAbsDifference(streamer, inst.getLabel(), begin); + streamer.EmitIntValue(b2, 1); + break; + case Win64EH::UOP_SetFPReg: + b1 = inst.getOffset() & 0xF0; + streamer.EmitIntValue(b1, 1); + streamer.EmitIntValue(b2, 1); + break; + case Win64EH::UOP_SaveNonVol: + case Win64EH::UOP_SaveXMM128: + b2 |= (inst.getRegister() & 0x0F) << 4; + EmitAbsDifference(streamer, inst.getLabel(), begin); + streamer.EmitIntValue(b2, 1); + w = inst.getOffset() >> 3; + if (inst.getOperation() == Win64EH::UOP_SaveXMM128) + w >>= 1; + streamer.EmitIntValue(w, 2); + break; + case Win64EH::UOP_SaveNonVolBig: + case Win64EH::UOP_SaveXMM128Big: + b2 |= (inst.getRegister() & 0x0F) << 4; + EmitAbsDifference(streamer, inst.getLabel(), begin); + streamer.EmitIntValue(b2, 1); + if (inst.getOperation() == Win64EH::UOP_SaveXMM128Big) + w = inst.getOffset() & 0xFFF0; + else + w = inst.getOffset() & 0xFFF8; + streamer.EmitIntValue(w, 2); + w = inst.getOffset() >> 16; + streamer.EmitIntValue(w, 2); + break; + case Win64EH::UOP_PushMachFrame: + if (inst.isPushCodeFrame()) + b2 |= 0x10; + EmitAbsDifference(streamer, inst.getLabel(), begin); + streamer.EmitIntValue(b2, 1); + break; + } +} + +static void EmitRuntimeFunction(MCStreamer &streamer, + const MCWin64EHUnwindInfo *info) { + MCContext &context = streamer.getContext(); + + streamer.EmitValueToAlignment(4); + streamer.EmitValue(MCSymbolRefExpr::Create(info->Begin, context), 4); + streamer.EmitValue(MCSymbolRefExpr::Create(info->End, context), 4); + streamer.EmitValue(MCSymbolRefExpr::Create(info->Symbol, context), 4); +} + +static void EmitUnwindInfo(MCStreamer &streamer, MCWin64EHUnwindInfo *info) { + // If this UNWIND_INFO already has a 
symbol, it's already been emitted. + if (info->Symbol) return; + + MCContext &context = streamer.getContext(); + streamer.EmitValueToAlignment(4); + // The low 3 bits hold the version number (currently 1); the flags are + // shifted into the upper 5 bits below. + uint8_t flags = 0x01; + info->Symbol = context.CreateTempSymbol(); + streamer.EmitLabel(info->Symbol); + + if (info->ChainedParent) + flags |= Win64EH::UNW_ChainInfo << 3; + else { + if (info->HandlesUnwind) + flags |= Win64EH::UNW_TerminateHandler << 3; + if (info->HandlesExceptions) + flags |= Win64EH::UNW_ExceptionHandler << 3; + } + streamer.EmitIntValue(flags, 1); + + if (info->PrologEnd) + EmitAbsDifference(streamer, info->PrologEnd, info->Begin); + else + streamer.EmitIntValue(0, 1); + + uint8_t numCodes = CountOfUnwindCodes(info->Instructions); + streamer.EmitIntValue(numCodes, 1); + + uint8_t frame = 0; + if (info->LastFrameInst >= 0) { + MCWin64EHInstruction &frameInst = info->Instructions[info->LastFrameInst]; + assert(frameInst.getOperation() == Win64EH::UOP_SetFPReg); + frame = (frameInst.getRegister() & 0x0F) | + (frameInst.getOffset() & 0xF0); + } + streamer.EmitIntValue(frame, 1); + + // Emit unwind instructions (in reverse order). + uint8_t numInst = info->Instructions.size(); + for (uint8_t c = 0; c < numInst; ++c) { + MCWin64EHInstruction inst = info->Instructions.back(); + info->Instructions.pop_back(); + EmitUnwindCode(streamer, info->Begin, inst); + } + + if (flags & (Win64EH::UNW_ChainInfo << 3)) + EmitRuntimeFunction(streamer, info->ChainedParent); + else if (flags & + ((Win64EH::UNW_TerminateHandler|Win64EH::UNW_ExceptionHandler) << 3)) + streamer.EmitValue(MCSymbolRefExpr::Create(info->ExceptionHandler, context), + 4); + else if (numCodes < 2) { + // The minimum size of an UNWIND_INFO struct is 8 bytes. If this is not a + // chained unwind info, there is no handler, and fewer than 2 slots are + // used in the unwind code array, we have to pad to 8 bytes. + if (numCodes == 1) + streamer.EmitIntValue(0, 2); + else + streamer.EmitIntValue(0, 4); + } +} + +StringRef MCWin64EHUnwindEmitter::GetSectionSuffix(const MCSymbol *func) { + if (!func || !func->isInSection()) return ""; + const MCSection *section = &func->getSection(); + const MCSectionCOFF *COFFSection; + if ((COFFSection = dyn_cast<MCSectionCOFF>(section))) { + StringRef name = COFFSection->getSectionName(); + size_t dollar = name.find('$'); + size_t dot = name.find('.', 1); + if (dollar == StringRef::npos && dot == StringRef::npos) + return ""; + if (dot == StringRef::npos) + return name.substr(dollar); + if (dollar == StringRef::npos || dot < dollar) + return name.substr(dot); + return name.substr(dollar); + } + return ""; +} + +void MCWin64EHUnwindEmitter::EmitUnwindInfo(MCStreamer &streamer, + MCWin64EHUnwindInfo *info) { + // Switch sections (the static function above is meant to be called from + // here and from Emit()). + MCContext &context = streamer.getContext(); + const TargetAsmInfo &asmInfo = context.getTargetAsmInfo(); + const MCSection *xdataSect = + asmInfo.getWin64EHTableSection(GetSectionSuffix(info->Function)); + streamer.SwitchSection(xdataSect); + + llvm::EmitUnwindInfo(streamer, info); +} + +void MCWin64EHUnwindEmitter::Emit(MCStreamer &streamer) { + MCContext &context = streamer.getContext(); + // Emit the unwind info structs first.
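+ // (The UNWIND_INFO structs land in .xdata; the RUNTIME_FUNCTION entries + // from the second loop below land in .pdata and refer back to these + // Symbol labels.)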
+ const TargetAsmInfo &asmInfo = context.getTargetAsmInfo(); + for (unsigned i = 0; i < streamer.getNumW64UnwindInfos(); ++i) { + MCWin64EHUnwindInfo &info = streamer.getW64UnwindInfo(i); + const MCSection *xdataSect = + asmInfo.getWin64EHTableSection(GetSectionSuffix(info.Function)); + streamer.SwitchSection(xdataSect); + llvm::EmitUnwindInfo(streamer, &info); + } + // Now emit RUNTIME_FUNCTION entries. + for (unsigned i = 0; i < streamer.getNumW64UnwindInfos(); ++i) { + MCWin64EHUnwindInfo &info = streamer.getW64UnwindInfo(i); + const MCSection *pdataSect = + asmInfo.getWin64EHFuncTableSection(GetSectionSuffix(info.Function)); + streamer.SwitchSection(pdataSect); + EmitRuntimeFunction(streamer, &info); + } +} + +} // End of namespace llvm + diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp index 105506a..f049b1c 100644 --- a/lib/MC/MachObjectWriter.cpp +++ b/lib/MC/MachObjectWriter.cpp @@ -121,6 +121,33 @@ private: } uint64_t getSymbolAddress(const MCSymbolData* SD, const MCAsmLayout &Layout) const { + const MCSymbol &S = SD->getSymbol(); + + // If this is a variable, then recursively evaluate now. + if (S.isVariable()) { + MCValue Target; + if (!S.getVariableValue()->EvaluateAsRelocatable(Target, Layout)) + report_fatal_error("unable to evaluate offset for variable '" + + S.getName() + "'"); + + // Verify that any used symbols are defined. + if (Target.getSymA() && Target.getSymA()->getSymbol().isUndefined()) + report_fatal_error("unable to evaluate offset to undefined symbol '" + + Target.getSymA()->getSymbol().getName() + "'"); + if (Target.getSymB() && Target.getSymB()->getSymbol().isUndefined()) + report_fatal_error("unable to evaluate offset to undefined symbol '" + + Target.getSymB()->getSymbol().getName() + "'"); + + uint64_t Address = Target.getConstant(); + if (Target.getSymA()) + Address += getSymbolAddress(&Layout.getAssembler().getSymbolData( + Target.getSymA()->getSymbol()), Layout); + if (Target.getSymB()) + Address += getSymbolAddress(&Layout.getAssembler().getSymbolData( + Target.getSymB()->getSymbol()), Layout); + return Address; + } + return getSectionAddress(SD->getFragment()->getParent()) + Layout.getSymbolOffset(SD); } @@ -440,7 +467,7 @@ public: // Compensate for the relocation offset, Darwin x86_64 relocations only // have the addend and appear to have attempted to define it to be the // actual expression addend without the PCrel bias. However, instructions - // with data following the relocation are not accomodated for (see comment + // with data following the relocation are not accommodated for (see comment // below regarding SIGNED{1,2,4}), so it isn't exactly that either. Value += 1LL << Log2Size; } @@ -541,7 +568,7 @@ public: } // x86_64 almost always uses external relocations, except when there is no - // symbol to use as a base address (a local symbol with no preceeding + // symbol to use as a base address (a local symbol with no preceding // non-local symbol). if (Base) { Index = Base->getIndex(); @@ -550,7 +577,7 @@ public: // Add the local offset, if needed. if (Base != &SD) Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base); - } else if (Symbol->isInSection()) { + } else if (Symbol->isInSection() && !Symbol->isVariable()) { // The index is the section ordinal (1-based). Index = SD.getFragment()->getParent()->getOrdinal() + 1; IsExtern = 0; @@ -1028,17 +1055,17 @@ public: // FIXME! 
report_fatal_error("FIXME: relocations to absolute targets " "not yet implemented"); - } else if (SD->getSymbol().isVariable()) { - int64_t Res; - if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute( - Res, Layout, SectionAddress)) { - FixedValue = Res; - return; + } else { + // Resolve constant variables. + if (SD->getSymbol().isVariable()) { + int64_t Res; + if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute( + Res, Layout, SectionAddress)) { + FixedValue = Res; + return; + } } - report_fatal_error("unsupported relocation of variable '" + - SD->getSymbol().getName() + "'"); - } else { // Check whether we need an external or internal relocation. if (doesSymbolRequireExternRelocation(SD)) { IsExtern = 1; @@ -1050,8 +1077,10 @@ public: FixedValue -= Layout.getSymbolOffset(SD); } else { // The index is the section ordinal (1-based). - Index = SD->getFragment()->getParent()->getOrdinal() + 1; - FixedValue += getSectionAddress(SD->getFragment()->getParent()); + const MCSectionData &SymSD = Asm.getSectionData( + SD->getSymbol().getSection()); + Index = SymSD.getOrdinal() + 1; + FixedValue += getSectionAddress(&SymSD); } if (IsPCRel) FixedValue -= getSectionAddress(Fragment->getParent()); @@ -1127,17 +1156,17 @@ public: // FIXME: Currently, these are never generated (see code below). I cannot // find a case where they are actually emitted. Type = macho::RIT_Vanilla; - } else if (SD->getSymbol().isVariable()) { - int64_t Res; - if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute( - Res, Layout, SectionAddress)) { - FixedValue = Res; - return; + } else { + // Resolve constant variables. + if (SD->getSymbol().isVariable()) { + int64_t Res; + if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute( + Res, Layout, SectionAddress)) { + FixedValue = Res; + return; + } } - report_fatal_error("unsupported relocation of variable '" + - SD->getSymbol().getName() + "'"); - } else { // Check whether we need an external or internal relocation. if (doesSymbolRequireExternRelocation(SD)) { IsExtern = 1; @@ -1149,8 +1178,10 @@ public: FixedValue -= Layout.getSymbolOffset(SD); } else { // The index is the section ordinal (1-based). 
- Index = SD->getFragment()->getParent()->getOrdinal() + 1; - FixedValue += getSectionAddress(SD->getFragment()->getParent()); + const MCSectionData &SymSD = Asm.getSectionData( + SD->getSymbol().getSection()); + Index = SymSD.getOrdinal() + 1; + FixedValue += getSectionAddress(&SymSD); } if (IsPCRel) FixedValue -= getSectionAddress(Fragment->getParent()); diff --git a/lib/MC/WinCOFFObjectWriter.cpp b/lib/MC/WinCOFFObjectWriter.cpp index 6ca5d37..101237a 100644 --- a/lib/MC/WinCOFFObjectWriter.cpp +++ b/lib/MC/WinCOFFObjectWriter.cpp @@ -647,22 +647,27 @@ void WinCOFFObjectWriter::RecordRelocation(const MCAssembler &Asm, COFFSection *coff_section = SectionMap[&SectionData->getSection()]; COFFSymbol *coff_symbol = SymbolMap[&A_SD.getSymbol()]; + const MCSymbolRefExpr *SymA = Target.getSymA(); + const MCSymbolRefExpr *SymB = Target.getSymB(); + const bool CrossSection = SymB && + &SymA->getSymbol().getSection() != &SymB->getSymbol().getSection(); if (Target.getSymB()) { const MCSymbol *B = &Target.getSymB()->getSymbol(); MCSymbolData &B_SD = Asm.getSymbolData(*B); - FixedValue = Layout.getSymbolOffset(&A_SD) - Layout.getSymbolOffset(&B_SD); + // Offset of symbol B in the section + int64_t a = Layout.getSymbolOffset(&B_SD); + + // Offset of the relocation in the section + int64_t b = Layout.getFragmentOffset(Fragment) + Fixup.getOffset(); + FixedValue = b - a; // In the case where we have SymbA and SymB, we just need to store the delta // between the two symbols. Update FixedValue to account for the delta, and // skip recording the relocation. - return; + if (!CrossSection) + return; } else { FixedValue = Target.getConstant(); } @@ -673,7 +678,7 @@ void WinCOFFObjectWriter::RecordRelocation(const MCAssembler &Asm, Reloc.Data.VirtualAddress = Layout.getFragmentOffset(Fragment); // Turn relocations for temporary symbols into section relocations.
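+ // (Cross-section differences get the same treatment; in addition, their + // fixup kind is forced to FK_PCRel_4 further below.)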
- if (coff_symbol->MCData->getSymbol().isTemporary()) { + if (coff_symbol->MCData->getSymbol().isTemporary() || CrossSection) { Reloc.Symb = coff_symbol->Section->Symbol; FixedValue += Layout.getFragmentOffset(coff_symbol->MCData->Fragment) + coff_symbol->MCData->getOffset(); @@ -684,7 +689,12 @@ void WinCOFFObjectWriter::RecordRelocation(const MCAssembler &Asm, Reloc.Data.VirtualAddress += Fixup.getOffset(); - switch ((unsigned)Fixup.getKind()) { + unsigned FixupKind = Fixup.getKind(); + + if (CrossSection) + FixupKind = FK_PCRel_4; + + switch (FixupKind) { case FK_PCRel_4: case X86::reloc_riprel_4byte: case X86::reloc_riprel_4byte_movq_load: diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp index 46968e6..6c36c12 100644 --- a/lib/MC/WinCOFFStreamer.cpp +++ b/lib/MC/WinCOFFStreamer.cpp @@ -23,6 +23,7 @@ #include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCSectionCOFF.h" +#include "llvm/MC/MCWin64EH.h" #include "llvm/Target/TargetRegistry.h" #include "llvm/Target/TargetAsmBackend.h" #include "llvm/ADT/StringMap.h" @@ -74,6 +75,7 @@ public: unsigned MaxBytesToEmit); virtual void EmitFileDirective(StringRef Filename); virtual void EmitInstruction(const MCInst &Instruction); + virtual void EmitWin64EHHandlerData(); virtual void Finish(); private: @@ -377,7 +379,16 @@ void WinCOFFStreamer::EmitInstruction(const MCInst &Instruction) { Fragment->getFixups()); } +void WinCOFFStreamer::EmitWin64EHHandlerData() { + MCStreamer::EmitWin64EHHandlerData(); + + // We have to emit the unwind info now, because this directive + // actually switches to the .xdata section! + MCWin64EHUnwindEmitter::EmitUnwindInfo(*this, getCurrentW64UnwindInfo()); +} + void WinCOFFStreamer::Finish() { + EmitW64Tables(); MCObjectStreamer::Finish(); } diff --git a/lib/Object/CMakeLists.txt b/lib/Object/CMakeLists.txt index f28d2ec..703d385 100644 --- a/lib/Object/CMakeLists.txt +++ b/lib/Object/CMakeLists.txt @@ -1,7 +1,8 @@ add_llvm_library(LLVMObject + COFFObjectFile.cpp + ELFObjectFile.cpp MachOObject.cpp + MachOObjectFile.cpp Object.cpp ObjectFile.cpp - COFFObjectFile.cpp - ELFObjectFile.cpp ) diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp index 30709f1..86bf44b 100644 --- a/lib/Object/COFFObjectFile.cpp +++ b/lib/Object/COFFObjectFile.cpp @@ -186,11 +186,8 @@ char COFFObjectFile::getSymbolNMTypeChar(DataRefImpl Symb) const { return ret; uint32_t Characteristics = 0; - uint32_t PointerToRawData = 0; - const coff_section *Section = getSection(symb->SectionNumber); - if (Section) { + if (const coff_section *Section = getSection(symb->SectionNumber)) { Characteristics = Section->Characteristics; - PointerToRawData = Section->PointerToRawData; } switch (symb->SectionNumber) { diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp new file mode 100644 index 0000000..877cbfb --- /dev/null +++ b/lib/Object/MachOObjectFile.cpp @@ -0,0 +1,327 @@ +//===- MachOObjectFile.cpp - Mach-O object file binding ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the MachOObjectFile class, which binds the MachOObject +// class to the generic ObjectFile wrapper. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Triple.h" +#include "llvm/Object/MachOFormat.h" +#include "llvm/Object/MachOObject.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/MachO.h" + +#include <cctype> +#include <cstring> +#include <limits> + +using namespace llvm; +using namespace object; + +namespace llvm { + +typedef MachOObject::LoadCommandInfo LoadCommandInfo; + +class MachOObjectFile : public ObjectFile { +public: + MachOObjectFile(MemoryBuffer *Object, MachOObject *MOO) + : ObjectFile(Object), + MachOObj(MOO), + RegisteredStringTable(std::numeric_limits<uint32_t>::max()) {} + + virtual symbol_iterator begin_symbols() const; + virtual symbol_iterator end_symbols() const; + virtual section_iterator begin_sections() const; + virtual section_iterator end_sections() const; + + virtual uint8_t getBytesInAddress() const; + virtual StringRef getFileFormatName() const; + virtual unsigned getArch() const; + +protected: + virtual SymbolRef getSymbolNext(DataRefImpl Symb) const; + virtual StringRef getSymbolName(DataRefImpl Symb) const; + virtual uint64_t getSymbolAddress(DataRefImpl Symb) const; + virtual uint64_t getSymbolSize(DataRefImpl Symb) const; + virtual char getSymbolNMTypeChar(DataRefImpl Symb) const; + virtual bool isSymbolInternal(DataRefImpl Symb) const; + + virtual SectionRef getSectionNext(DataRefImpl Sec) const; + virtual StringRef getSectionName(DataRefImpl Sec) const; + virtual uint64_t getSectionAddress(DataRefImpl Sec) const; + virtual uint64_t getSectionSize(DataRefImpl Sec) const; + virtual StringRef getSectionContents(DataRefImpl Sec) const; + virtual bool isSectionText(DataRefImpl Sec) const; + +private: + MachOObject *MachOObj; + mutable uint32_t RegisteredStringTable; + + void moveToNextSection(DataRefImpl &DRI) const; + void getSymbolTableEntry(DataRefImpl DRI, + InMemoryStruct<macho::SymbolTableEntry> &Res) const; + void moveToNextSymbol(DataRefImpl &DRI) const; + void getSection(DataRefImpl DRI, InMemoryStruct<macho::Section> &Res) const; +}; + +ObjectFile *ObjectFile::createMachOObjectFile(MemoryBuffer *Buffer) { + std::string Err; + MachOObject *MachOObj = MachOObject::LoadFromBuffer(Buffer, &Err); + if (!MachOObj) + return NULL; + return new MachOObjectFile(Buffer, MachOObj); +} + +/*===-- Symbols -----------------------------------------------------------===*/ + +void MachOObjectFile::moveToNextSymbol(DataRefImpl &DRI) const { + uint32_t LoadCommandCount = MachOObj->getHeader().NumLoadCommands; + while (DRI.d.a < LoadCommandCount) { + LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a); + if (LCI.Command.Type == macho::LCT_Symtab) { + InMemoryStruct<macho::SymtabLoadCommand> SymtabLoadCmd; + MachOObj->ReadSymtabLoadCommand(LCI, SymtabLoadCmd); + if (DRI.d.b < SymtabLoadCmd->NumSymbolTableEntries) + return; + } + + DRI.d.a++; + DRI.d.b = 0; + } +} + +void MachOObjectFile::getSymbolTableEntry(DataRefImpl DRI, + InMemoryStruct<macho::SymbolTableEntry> &Res) const { + InMemoryStruct<macho::SymtabLoadCommand> SymtabLoadCmd; + LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a); + MachOObj->ReadSymtabLoadCommand(LCI, SymtabLoadCmd); + + if (RegisteredStringTable != DRI.d.a) { + MachOObj->RegisterStringTable(*SymtabLoadCmd); + RegisteredStringTable = DRI.d.a; + } + + MachOObj->ReadSymbolTableEntry(SymtabLoadCmd->SymbolTableOffset, DRI.d.b, + Res); +} + + +SymbolRef MachOObjectFile::getSymbolNext(DataRefImpl DRI) const { + 
DRI.d.b++; + moveToNextSymbol(DRI); + return SymbolRef(DRI, this); +} + +StringRef MachOObjectFile::getSymbolName(DataRefImpl DRI) const { + InMemoryStruct<macho::SymbolTableEntry> Entry; + getSymbolTableEntry(DRI, Entry); + return MachOObj->getStringAtIndex(Entry->StringIndex); +} + +uint64_t MachOObjectFile::getSymbolAddress(DataRefImpl DRI) const { + InMemoryStruct<macho::SymbolTableEntry> Entry; + getSymbolTableEntry(DRI, Entry); + return Entry->Value; +} + +uint64_t MachOObjectFile::getSymbolSize(DataRefImpl DRI) const { + return UnknownAddressOrSize; +} + +char MachOObjectFile::getSymbolNMTypeChar(DataRefImpl DRI) const { + InMemoryStruct<macho::SymbolTableEntry> Entry; + getSymbolTableEntry(DRI, Entry); + + char Char; + switch (Entry->Type & macho::STF_TypeMask) { + case macho::STT_Undefined: + Char = 'u'; + break; + case macho::STT_Absolute: + case macho::STT_Section: + Char = 's'; + break; + default: + Char = '?'; + break; + } + + if (Entry->Flags & (macho::STF_External | macho::STF_PrivateExtern)) + Char = toupper(Char); + return Char; +} + +bool MachOObjectFile::isSymbolInternal(DataRefImpl DRI) const { + InMemoryStruct<macho::SymbolTableEntry> Entry; + getSymbolTableEntry(DRI, Entry); + return Entry->Flags & macho::STF_StabsEntryMask; +} + +ObjectFile::symbol_iterator MachOObjectFile::begin_symbols() const { + // DRI.d.a = segment number; DRI.d.b = symbol index. + DataRefImpl DRI; + DRI.d.a = DRI.d.b = 0; + moveToNextSymbol(DRI); + return symbol_iterator(SymbolRef(DRI, this)); +} + +ObjectFile::symbol_iterator MachOObjectFile::end_symbols() const { + DataRefImpl DRI; + DRI.d.a = MachOObj->getHeader().NumLoadCommands; + DRI.d.b = 0; + return symbol_iterator(SymbolRef(DRI, this)); +} + + +/*===-- Sections ----------------------------------------------------------===*/ + +void MachOObjectFile::moveToNextSection(DataRefImpl &DRI) const { + uint32_t LoadCommandCount = MachOObj->getHeader().NumLoadCommands; + while (DRI.d.a < LoadCommandCount) { + LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a); + if (LCI.Command.Type == macho::LCT_Segment) { + InMemoryStruct<macho::SegmentLoadCommand> SegmentLoadCmd; + MachOObj->ReadSegmentLoadCommand(LCI, SegmentLoadCmd); + if (DRI.d.b < SegmentLoadCmd->NumSections) + return; + } else if (LCI.Command.Type == macho::LCT_Segment64) { + InMemoryStruct<macho::Segment64LoadCommand> Segment64LoadCmd; + MachOObj->ReadSegment64LoadCommand(LCI, Segment64LoadCmd); + if (DRI.d.b < Segment64LoadCmd->NumSections) + return; + } + + DRI.d.a++; + DRI.d.b = 0; + } +} + +SectionRef MachOObjectFile::getSectionNext(DataRefImpl DRI) const { + DRI.d.b++; + moveToNextSection(DRI); + return SectionRef(DRI, this); +} + +void +MachOObjectFile::getSection(DataRefImpl DRI, + InMemoryStruct<macho::Section> &Res) const { + InMemoryStruct<macho::SegmentLoadCommand> SLC; + LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a); + MachOObj->ReadSegmentLoadCommand(LCI, SLC); + MachOObj->ReadSection(LCI, DRI.d.b, Res); +} + +StringRef MachOObjectFile::getSectionName(DataRefImpl DRI) const { + InMemoryStruct<macho::SegmentLoadCommand> SLC; + LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a); + MachOObj->ReadSegmentLoadCommand(LCI, SLC); + InMemoryStruct<macho::Section> Sect; + MachOObj->ReadSection(LCI, DRI.d.b, Sect); + + static char Result[34]; + strcpy(Result, SLC->Name); + strcat(Result, ","); + strcat(Result, Sect->Name); + return StringRef(Result); +} + +uint64_t MachOObjectFile::getSectionAddress(DataRefImpl DRI) const { + 
InMemoryStruct<macho::Section> Sect; + getSection(DRI, Sect); + return Sect->Address; +} + +uint64_t MachOObjectFile::getSectionSize(DataRefImpl DRI) const { + InMemoryStruct<macho::Section> Sect; + getSection(DRI, Sect); + return Sect->Size; +} + +StringRef MachOObjectFile::getSectionContents(DataRefImpl DRI) const { + InMemoryStruct<macho::Section> Sect; + getSection(DRI, Sect); + return MachOObj->getData(Sect->Offset, Sect->Size); +} + +bool MachOObjectFile::isSectionText(DataRefImpl DRI) const { + InMemoryStruct<macho::SegmentLoadCommand> SLC; + LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a); + MachOObj->ReadSegmentLoadCommand(LCI, SLC); + return !strcmp(SLC->Name, "__TEXT"); +} + +ObjectFile::section_iterator MachOObjectFile::begin_sections() const { + DataRefImpl DRI; + DRI.d.a = DRI.d.b = 0; + moveToNextSection(DRI); + return section_iterator(SectionRef(DRI, this)); +} + +ObjectFile::section_iterator MachOObjectFile::end_sections() const { + DataRefImpl DRI; + DRI.d.a = MachOObj->getHeader().NumLoadCommands; + DRI.d.b = 0; + return section_iterator(SectionRef(DRI, this)); +} + +/*===-- Miscellaneous -----------------------------------------------------===*/ + +uint8_t MachOObjectFile::getBytesInAddress() const { + return MachOObj->is64Bit() ? 8 : 4; +} + +StringRef MachOObjectFile::getFileFormatName() const { + if (!MachOObj->is64Bit()) { + switch (MachOObj->getHeader().CPUType) { + case llvm::MachO::CPUTypeI386: + return "Mach-O 32-bit i386"; + case llvm::MachO::CPUTypeARM: + return "Mach-O arm"; + case llvm::MachO::CPUTypePowerPC: + return "Mach-O 32-bit ppc"; + default: + assert((MachOObj->getHeader().CPUType & llvm::MachO::CPUArchABI64) == 0 && + "64-bit object file when we're not 64-bit?"); + return "Mach-O 32-bit unknown"; + } + } + + switch (MachOObj->getHeader().CPUType) { + case llvm::MachO::CPUTypeX86_64: + return "Mach-O 64-bit x86-64"; + case llvm::MachO::CPUTypePowerPC64: + return "Mach-O 64-bit ppc64"; + default: + assert((MachOObj->getHeader().CPUType & llvm::MachO::CPUArchABI64) != 0 && + "32-bit object file when we're 64-bit?"); + return "Mach-O 64-bit unknown"; + } +} + +unsigned MachOObjectFile::getArch() const { + switch (MachOObj->getHeader().CPUType) { + case llvm::MachO::CPUTypeI386: + return Triple::x86; + case llvm::MachO::CPUTypeX86_64: + return Triple::x86_64; + case llvm::MachO::CPUTypeARM: + return Triple::arm; + case llvm::MachO::CPUTypePowerPC: + return Triple::ppc; + case llvm::MachO::CPUTypePowerPC64: + return Triple::ppc64; + default: + return Triple::UnknownArch; + } +} + +} // end namespace llvm + diff --git a/lib/Object/ObjectFile.cpp b/lib/Object/ObjectFile.cpp index 161ae3a..47b6311 100644 --- a/lib/Object/ObjectFile.cpp +++ b/lib/Object/ObjectFile.cpp @@ -55,7 +55,7 @@ ObjectFile *ObjectFile::createObjectFile(MemoryBuffer *Object) { case sys::Mach_O_DynamicLinker_FileType: case sys::Mach_O_Bundle_FileType: case sys::Mach_O_DynamicallyLinkedSharedLibStub_FileType: - return 0; + return createMachOObjectFile(Object); case sys::COFF_FileType: return createCOFFObjectFile(Object); default: diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp index 3a63258..c3169ac 100644 --- a/lib/Support/APFloat.cpp +++ b/lib/Support/APFloat.cpp @@ -3564,7 +3564,7 @@ void APFloat::toString(SmallVectorImpl<char> &Str, } bool APFloat::getExactInverse(APFloat *inv) const { - // We can only guarantee the existance of an exact inverse for IEEE floats. + // We can only guarantee the existence of an exact inverse for IEEE floats.
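+ // (Everything else, e.g. x87 double-extended or PPC double-double, is + // rejected by the check below.)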
if (semantics != &IEEEhalf && semantics != &IEEEsingle && semantics != &IEEEdouble && semantics != &IEEEquad) return false; diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp index 5789721..76265d4 100644 --- a/lib/Support/APInt.cpp +++ b/lib/Support/APInt.cpp @@ -1375,7 +1375,7 @@ APInt APInt::sqrt() const { uint64_t(::round(::sqrt(double(isSingleWord()?VAL:pVal[0]))))); #else return APInt(BitWidth, - uint64_t(::sqrt(double(isSingleWord()?VAL:pVal[0]))) + 0.5); + uint64_t(::sqrt(double(isSingleWord()?VAL:pVal[0])) + 0.5)); #endif } @@ -1518,7 +1518,7 @@ APInt::ms APInt::magic() const { /// Requires that the divisor not be 0. Taken from "Hacker's Delight", Henry /// S. Warren, Jr., chapter 10. /// LeadingZeros can be used to simplify the calculation if the upper bits -/// of the devided value are known zero. +/// of the divided value are known zero. APInt::mu APInt::magicu(unsigned LeadingZeros) const { const APInt& d = *this; unsigned p; @@ -2164,12 +2164,33 @@ void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) { } void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix, - bool Signed) const { + bool Signed, bool formatAsCLiteral) const { assert((Radix == 10 || Radix == 8 || Radix == 16 || Radix == 2) && "Radix should be 2, 8, 10, or 16!"); + const char *Prefix = ""; + if (formatAsCLiteral) { + switch (Radix) { + case 2: + // Binary literals are a non-standard extension added in gcc 4.3: + // http://gcc.gnu.org/onlinedocs/gcc-4.3.0/gcc/Binary-constants.html + Prefix = "0b"; + break; + case 8: + Prefix = "0"; + break; + case 16: + Prefix = "0x"; + break; + } + } + // First, check for a zero value and just short circuit the logic below. if (*this == 0) { + while (*Prefix) { + Str.push_back(*Prefix); + ++Prefix; + }; Str.push_back('0'); return; } @@ -2193,6 +2214,11 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix, } } + while (*Prefix) { + Str.push_back(*Prefix); + ++Prefix; + }; + while (N) { *--BufPtr = Digits[N % Radix]; N /= Radix; @@ -2212,6 +2238,11 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix, Str.push_back('-'); } + while (*Prefix) { + Str.push_back(*Prefix); + ++Prefix; + }; + // We insert the digits backward, then reverse them to get the right order. unsigned StartDig = Str.size(); @@ -2251,7 +2282,7 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix, /// to the methods above. 
std::string APInt::toString(unsigned Radix = 10, bool Signed = true) const { SmallString<40> S; - toString(S, Radix, Signed); + toString(S, Radix, Signed, /* formatAsCLiteral = */false); return S.str(); } @@ -2266,7 +2297,7 @@ void APInt::dump() const { void APInt::print(raw_ostream &OS, bool isSigned) const { SmallString<40> S; - this->toString(S, 10, isSigned); + this->toString(S, 10, isSigned, /* formatAsCLiteral = */false); OS << S.str(); } diff --git a/lib/Support/Allocator.cpp b/lib/Support/Allocator.cpp index 5e27df6..215b0f2 100644 --- a/lib/Support/Allocator.cpp +++ b/lib/Support/Allocator.cpp @@ -136,6 +136,14 @@ unsigned BumpPtrAllocator::GetNumSlabs() const { return NumSlabs; } +size_t BumpPtrAllocator::getTotalMemory() const { + size_t TotalMemory = 0; + for (MemSlab *Slab = CurSlab; Slab != 0; Slab = Slab->NextPtr) { + TotalMemory += Slab->Size; + } + return TotalMemory; +} + void BumpPtrAllocator::PrintStats() const { unsigned NumSlabs = 0; size_t TotalMemory = 0; diff --git a/lib/Support/BranchProbability.cpp b/lib/Support/BranchProbability.cpp new file mode 100644 index 0000000..97342da --- /dev/null +++ b/lib/Support/BranchProbability.cpp @@ -0,0 +1,44 @@ +//===-------------- lib/Support/BranchProbability.cpp -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the BranchProbability class. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +BranchProbability::BranchProbability(uint32_t n, uint32_t d) { + assert(d > 0 && "Denominator cannot be 0!"); + assert(n <= d && "Probability cannot be bigger than 1!"); + N = n; + D = d; +} + +raw_ostream &BranchProbability::print(raw_ostream &OS) const { + OS << N << " / " << D << " = " << ((double)N / D); + return OS; +} + +void BranchProbability::dump() const { + print(dbgs()); + dbgs() << "\n"; +} + +namespace llvm { + +raw_ostream &operator<<(raw_ostream &OS, const BranchProbability &Prob) { + Prob.print(OS); + return OS; +} + +} diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt index a0e997d..867d930 100644 --- a/lib/Support/CMakeLists.txt +++ b/lib/Support/CMakeLists.txt @@ -9,6 +9,7 @@ add_llvm_library(LLVMSupport APInt.cpp APSInt.cpp Allocator.cpp + BranchProbability.cpp circular_raw_ostream.cpp CommandLine.cpp ConstantRange.cpp diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp index a1f2fce..2914337 100644 --- a/lib/Support/CommandLine.cpp +++ b/lib/Support/CommandLine.cpp @@ -186,12 +186,14 @@ static Option *LookupOption(StringRef &Arg, StringRef &Value, /// have already been stripped. static Option *LookupNearestOption(StringRef Arg, const StringMap<Option*> &OptionsMap, - const char *&NearestString) { + std::string &NearestString) { // Reject all dashes. if (Arg.empty()) return 0; // Split on any equal sign. - StringRef LHS = Arg.split('=').first; + std::pair<StringRef, StringRef> SplitArg = Arg.split('='); + StringRef &LHS = SplitArg.first; // LHS == Arg when no '=' is present. + StringRef &RHS = SplitArg.second; // Find the closest match.
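+ // Nearness here is StringRef::edit_distance (Levenshtein); when an option + // permits a value, the "=value" suffix is stripped before comparing and + // re-attached to the suggested spelling afterwards.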
Option *Best = 0; @@ -204,14 +206,19 @@ static Option *LookupNearestOption(StringRef Arg, if (O->ArgStr[0]) OptionNames.push_back(O->ArgStr); + bool PermitValue = O->getValueExpectedFlag() != cl::ValueDisallowed; + StringRef Flag = PermitValue ? LHS : Arg; for (size_t i = 0, e = OptionNames.size(); i != e; ++i) { StringRef Name = OptionNames[i]; unsigned Distance = StringRef(Name).edit_distance( - Arg, /*AllowReplacements=*/true, /*MaxEditDistance=*/BestDistance); + Flag, /*AllowReplacements=*/true, /*MaxEditDistance=*/BestDistance); if (!Best || Distance < BestDistance) { Best = O; - NearestString = OptionNames[i]; BestDistance = Distance; + if (RHS.empty() || !PermitValue) + NearestString = OptionNames[i]; + else + NearestString = std::string(OptionNames[i]) + "=" + RHS.str(); } } } @@ -611,7 +618,7 @@ void cl::ParseCommandLineOptions(int argc, char **argv, for (int i = 1; i < argc; ++i) { Option *Handler = 0; Option *NearestHandler = 0; - const char *NearestHandlerString = 0; + std::string NearestHandlerString; StringRef Value; StringRef ArgName = ""; @@ -904,8 +911,8 @@ size_t alias::getOptionWidth() const { // Print out the option for the alias. void alias::printOptionInfo(size_t GlobalWidth) const { size_t L = std::strlen(ArgStr); - errs() << " -" << ArgStr; - errs().indent(GlobalWidth-L-6) << " - " << HelpStr << "\n"; + outs() << " -" << ArgStr; + outs().indent(GlobalWidth-L-6) << " - " << HelpStr << "\n"; } //===----------------------------------------------------------------------===// diff --git a/lib/Support/Dwarf.cpp b/lib/Support/Dwarf.cpp index 9799ef5..0813321 100644 --- a/lib/Support/Dwarf.cpp +++ b/lib/Support/Dwarf.cpp @@ -203,6 +203,11 @@ const char *llvm::dwarf::AttributeString(unsigned Attribute) { case DW_AT_APPLE_major_runtime_vers: return "DW_AT_APPLE_major_runtime_vers"; case DW_AT_APPLE_runtime_class: return "DW_AT_APPLE_runtime_class"; case DW_AT_APPLE_omit_frame_ptr: return "DW_AT_APPLE_omit_frame_ptr"; + case DW_AT_APPLE_property_name: return "DW_AT_APPLE_property_name"; + case DW_AT_APPLE_property_getter: return "DW_AT_APPLE_property_getter"; + case DW_AT_APPLE_property_setter: return "DW_AT_APPLE_property_setter"; + case DW_AT_APPLE_property_attribute: return "DW_AT_APPLE_property_attribute"; + case DW_AT_APPLE_objc_complete_type: return "DW_AT_APPLE_objc_complete_type"; } return 0; } @@ -391,6 +396,7 @@ const char *llvm::dwarf::OperationEncodingString(unsigned Encoding) { case DW_OP_call_ref: return "DW_OP_call_ref"; case DW_OP_form_tls_address: return "DW_OP_form_tls_address"; case DW_OP_call_frame_cfa: return "DW_OP_call_frame_cfa"; + case DW_OP_bit_piece: return "DW_OP_bit_piece"; case DW_OP_lo_user: return "DW_OP_lo_user"; case DW_OP_hi_user: return "DW_OP_hi_user"; } diff --git a/lib/Support/ErrorHandling.cpp b/lib/Support/ErrorHandling.cpp index 3579546..e6cc57d 100644 --- a/lib/Support/ErrorHandling.cpp +++ b/lib/Support/ErrorHandling.cpp @@ -32,7 +32,6 @@ #endif using namespace llvm; -using namespace std; static fatal_error_handler_t ErrorHandler = 0; static void *ErrorHandlerUserData = 0; diff --git a/lib/Support/FileUtilities.cpp b/lib/Support/FileUtilities.cpp index 5dbabee..4c8c0c6 100644 --- a/lib/Support/FileUtilities.cpp +++ b/lib/Support/FileUtilities.cpp @@ -198,7 +198,7 @@ int llvm::DiffFilesWithTolerance(const sys::PathWithStatus &FileA, return 1; } - // Now its safe to mmap the files into memory becasue both files + // Now it's safe to mmap the files into memory because both files // have a non-zero size.
error_code ec; OwningPtr<MemoryBuffer> F1; diff --git a/lib/Support/FoldingSet.cpp b/lib/Support/FoldingSet.cpp index a4f80a9..1568342 100644 --- a/lib/Support/FoldingSet.cpp +++ b/lib/Support/FoldingSet.cpp @@ -92,7 +92,7 @@ void FoldingSetNodeID::AddInteger(long long I) { } void FoldingSetNodeID::AddInteger(unsigned long long I) { AddInteger(unsigned(I)); - if ((uint64_t)(int)I != I) + if ((uint64_t)(unsigned)I != I) Bits.push_back(unsigned(I >> 32)); } @@ -147,6 +147,11 @@ void FoldingSetNodeID::AddString(StringRef String) { Bits.push_back(V); } +// AddNodeID - Adds the Bit data of another ID to *this. +void FoldingSetNodeID::AddNodeID(const FoldingSetNodeID &ID) { + Bits.append(ID.Bits.begin(), ID.Bits.end()); +} + /// ComputeHash - Compute a strong hash value for this FoldingSetNodeID, used to /// lookup the node in the FoldingSetImpl. unsigned FoldingSetNodeID::ComputeHash() const { diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index 911c64a..4299aa4 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -215,7 +215,8 @@ std::string sys::getHostCPUName() { case 37: // Intel Core i7, laptop version. return "corei7"; case 42: // SandyBridge - return "sandybridge"; + case 45: + return "corei7-avx"; case 28: // Intel Atom processor. All processors are manufactured using // the 45 nm process diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp index ea72720..d264be9 100644 --- a/lib/Support/MemoryBuffer.cpp +++ b/lib/Support/MemoryBuffer.cpp @@ -67,7 +67,7 @@ static void CopyStringRef(char *Memory, StringRef Data) { /// GetNamedBuffer - Allocates a new MemoryBuffer with Name copied after it. template <typename T> -static T* GetNamedBuffer(StringRef Buffer, StringRef Name, +static T *GetNamedBuffer(StringRef Buffer, StringRef Name, bool RequiresNullTerminator) { char *Mem = static_cast<char*>(operator new(sizeof(T) + Name.size() + 1)); CopyStringRef(Mem + sizeof(T), Name); @@ -86,11 +86,15 @@ public: // The name is stored after the class itself. return reinterpret_cast<const char*>(this + 1); } + + virtual BufferKind getBufferKind() const { + return MemoryBuffer_Malloc; + } }; } /// getMemBuffer - Open the specified memory range as a MemoryBuffer. Note -/// that EndPtr[0] must be a null byte and be accessible! +/// that InputData must be null terminated if RequiresNullTerminator is true! MemoryBuffer *MemoryBuffer::getMemBuffer(StringRef InputData, StringRef BufferName, bool RequiresNullTerminator) { @@ -191,6 +195,10 @@ public: sys::Path::UnMapFilePages(reinterpret_cast<const char*>(RealStart), RealSize); } + + virtual BufferKind getBufferKind() const { + return MemoryBuffer_MMap; + } }; } @@ -213,9 +221,9 @@ error_code MemoryBuffer::getFile(const char *Filename, OpenFlags |= O_BINARY; // Open input file in binary mode on win32.
#endif int FD = ::open(Filename, OpenFlags); - if (FD == -1) { + if (FD == -1) return error_code(errno, posix_category()); - } + error_code ret = getOpenFile(FD, Filename, result, FileSize, FileSize, 0, RequiresNullTerminator); close(FD); diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp index cfe23d8..8fbaf2d 100644 --- a/lib/Support/Path.cpp +++ b/lib/Support/Path.cpp @@ -92,15 +92,21 @@ sys::IdentifyFileType(const char *magic, unsigned length) { } break; + // The two magic numbers for mach-o are: + // 0xfeedface - 32-bit mach-o + // 0xfeedfacf - 64-bit mach-o case 0xFE: - case 0xCE: { + case 0xCE: + case 0xCF: { uint16_t type = 0; if (magic[0] == char(0xFE) && magic[1] == char(0xED) && - magic[2] == char(0xFA) && magic[3] == char(0xCE)) { + magic[2] == char(0xFA) && + (magic[3] == char(0xCE) || magic[3] == char(0xCF))) { /* Native endian */ if (length >= 16) type = magic[14] << 8 | magic[15]; - } else if (magic[0] == char(0xCE) && magic[1] == char(0xFA) && - magic[2] == char(0xED) && magic[3] == char(0xFE)) { + } else if ((magic[0] == char(0xCE) || magic[0] == char(0xCF)) && + magic[1] == char(0xFA) && magic[2] == char(0xED) && + magic[3] == char(0xFE)) { /* Reverse endian */ if (length >= 14) type = magic[13] << 8 | magic[12]; } diff --git a/lib/Support/PrettyStackTrace.cpp b/lib/Support/PrettyStackTrace.cpp index a9f4709..082b701 100644 --- a/lib/Support/PrettyStackTrace.cpp +++ b/lib/Support/PrettyStackTrace.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // // This file defines some helpful functions for dealing with the possibility of -// Unix signals occuring while your program is running. +// Unix signals occurring while your program is running. // //===----------------------------------------------------------------------===// diff --git a/lib/Support/Regex.cpp b/lib/Support/Regex.cpp index 309ffb0..d293da0 100644 --- a/lib/Support/Regex.cpp +++ b/lib/Support/Regex.cpp @@ -82,7 +82,7 @@ bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches){ Matches->push_back(StringRef()); continue; } - assert(pm[i].rm_eo > pm[i].rm_so); + assert(pm[i].rm_eo >= pm[i].rm_so); Matches->push_back(StringRef(String.data()+pm[i].rm_so, pm[i].rm_eo-pm[i].rm_so)); } diff --git a/lib/Support/Signals.cpp b/lib/Support/Signals.cpp index a3af37d..a117893 100644 --- a/lib/Support/Signals.cpp +++ b/lib/Support/Signals.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // // This file defines some helpful functions for dealing with the possibility of -// Unix signals occuring while your program is running. +// Unix signals occurring while your program is running. // //===----------------------------------------------------------------------===// diff --git a/lib/Support/SourceMgr.cpp b/lib/Support/SourceMgr.cpp index ef09916..de042a9f 100644 --- a/lib/Support/SourceMgr.cpp +++ b/lib/Support/SourceMgr.cpp @@ -49,14 +49,16 @@ SourceMgr::~SourceMgr() { /// directory or in one of the IncludeDirs. If no file is found, this returns /// ~0, otherwise it returns the buffer ID of the stacked file. unsigned SourceMgr::AddIncludeFile(const std::string &Filename, - SMLoc IncludeLoc) { + SMLoc IncludeLoc, + std::string &IncludedFile) { OwningPtr<MemoryBuffer> NewBuf; - MemoryBuffer::getFile(Filename.c_str(), NewBuf); + IncludedFile = Filename; + MemoryBuffer::getFile(IncludedFile.c_str(), NewBuf); // If the file didn't exist directly, see if it's in an include path. 
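For reference, the Mach-O recognition above now accepts four byte patterns: 0xfeedface (32-bit) and 0xfeedfacf (64-bit), each in native or byte-swapped order. An illustrative standalone classifier (names are not from the patch):

  #include <stdint.h>

  enum MachOKind { NotMachO, MachO32, MachO64 };

  // Classify the first four file bytes read as a big-endian word; the
  // byte-swapped constants cover files written on an opposite-endian host.
  static MachOKind classifyMachO(uint32_t Magic) {
    switch (Magic) {
    case 0xFEEDFACEu: case 0xCEFAEDFEu: return MachO32;
    case 0xFEEDFACFu: case 0xCFFAEDFEu: return MachO64;
    default: return NotMachO;
    }
  }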
for (unsigned i = 0, e = IncludeDirectories.size(); i != e && !NewBuf; ++i) { - std::string IncFile = IncludeDirectories[i] + "/" + Filename; - MemoryBuffer::getFile(IncFile.c_str(), NewBuf); + IncludedFile = IncludeDirectories[i] + "/" + Filename; + MemoryBuffer::getFile(IncludedFile.c_str(), NewBuf); } if (NewBuf == 0) return ~0U; diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp index 5398051..8c3fc09 100644 --- a/lib/Support/StringRef.cpp +++ b/lib/Support/StringRef.cpp @@ -131,7 +131,7 @@ unsigned StringRef::edit_distance(llvm::StringRef Other, /// find - Search for the first string \arg Str in the string. /// -/// \return - The index of the first occurence of \arg Str, or npos if not +/// \return - The index of the first occurrence of \arg Str, or npos if not /// found. size_t StringRef::find(StringRef Str, size_t From) const { size_t N = Str.size(); @@ -145,7 +145,7 @@ size_t StringRef::find(StringRef Str, size_t From) const { /// rfind - Search for the last string \arg Str in the string. /// -/// \return - The index of the last occurence of \arg Str, or npos if not +/// \return - The index of the last occurrence of \arg Str, or npos if not /// found. size_t StringRef::rfind(StringRef Str) const { size_t N = Str.size(); diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index 53ca48f..dbdb303 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -41,7 +41,8 @@ const char *Triple::getArchTypeName(ArchType Kind) { case x86_64: return "x86_64"; case xcore: return "xcore"; case mblaze: return "mblaze"; - case ptx: return "ptx"; + case ptx32: return "ptx32"; + case ptx64: return "ptx64"; } return "<invalid>"; @@ -74,7 +75,8 @@ const char *Triple::getArchTypePrefix(ArchType Kind) { case xcore: return "xcore"; - case ptx: return "ptx"; + case ptx32: return "ptx"; + case ptx64: return "ptx"; } } @@ -99,8 +101,10 @@ const char *Triple::getOSTypeName(OSType Kind) { case Darwin: return "darwin"; case DragonFly: return "dragonfly"; case FreeBSD: return "freebsd"; + case IOS: return "ios"; case Linux: return "linux"; case Lv2: return "lv2"; + case MacOSX: return "macosx"; case MinGW32: return "mingw32"; case NetBSD: return "netbsd"; case OpenBSD: return "openbsd"; @@ -163,8 +167,10 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { return x86_64; if (Name == "xcore") return xcore; - if (Name == "ptx") - return ptx; + if (Name == "ptx32") + return ptx32; + if (Name == "ptx64") + return ptx64; return UnknownArch; } @@ -203,15 +209,17 @@ Triple::ArchType Triple::getArchTypeForDarwinArchName(StringRef Str) { Str == "armv6" || Str == "armv7") return Triple::arm; - if (Str == "ptx") - return Triple::ptx; + if (Str == "ptx32") + return Triple::ptx32; + if (Str == "ptx64") + return Triple::ptx64; return Triple::UnknownArch; } // Returns architecture name that is understood by the target assembler. 
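SourceMgr::AddIncludeFile now reports, through its new out-parameter, which path actually satisfied the include, so callers can record the real file rather than the name as written. A hypothetical caller (the wrapper itself is illustrative):

  #include "llvm/Support/SourceMgr.h"
  #include <string>

  static unsigned loadInclude(llvm::SourceMgr &SM, const std::string &Name,
                              llvm::SMLoc Loc) {
    std::string Resolved; // filled in by AddIncludeFile
    unsigned BufID = SM.AddIncludeFile(Name, Loc, Resolved);
    if (BufID == ~0U)
      return ~0U; // not found directly or in any include directory
    // Resolved is either Name itself or IncludeDir + "/" + Name.
    return BufID;
  }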
const char *Triple::getArchNameForAssembler() { - if (getOS() != Triple::Darwin && getVendor() != Triple::Apple) + if (!isOSDarwin() && getVendor() != Triple::Apple) return NULL; StringRef Str = getArchName(); @@ -236,8 +244,10 @@ const char *Triple::getArchNameForAssembler() { return "armv6"; if (Str == "armv7" || Str == "thumbv7") return "armv7"; - if (Str == "ptx") - return "ptx"; + if (Str == "ptx32") + return "ptx32"; + if (Str == "ptx64") + return "ptx64"; return NULL; } @@ -286,8 +296,10 @@ Triple::ArchType Triple::ParseArch(StringRef ArchName) { return tce; else if (ArchName == "xcore") return xcore; - else if (ArchName == "ptx") - return ptx; + else if (ArchName == "ptx32") + return ptx32; + else if (ArchName == "ptx64") + return ptx64; else return UnknownArch; } @@ -314,10 +326,14 @@ Triple::OSType Triple::ParseOS(StringRef OSName) { return DragonFly; else if (OSName.startswith("freebsd")) return FreeBSD; + else if (OSName.startswith("ios")) + return IOS; else if (OSName.startswith("linux")) return Linux; else if (OSName.startswith("lv2")) return Lv2; + else if (OSName.startswith("macosx")) + return MacOSX; else if (OSName.startswith("mingw32")) return MinGW32; else if (OSName.startswith("netbsd")) @@ -526,67 +542,44 @@ StringRef Triple::getOSAndEnvironmentName() const { static unsigned EatNumber(StringRef &Str) { assert(!Str.empty() && Str[0] >= '0' && Str[0] <= '9' && "Not a number"); - unsigned Result = Str[0]-'0'; + unsigned Result = 0; - // Eat the digit. - Str = Str.substr(1); - - // Handle "darwin11". - if (Result == 1 && !Str.empty() && Str[0] >= '0' && Str[0] <= '9') { + do { + // Consume the leading digit. Result = Result*10 + (Str[0] - '0'); + // Eat the digit. Str = Str.substr(1); - } + } while (!Str.empty() && Str[0] >= '0' && Str[0] <= '9'); return Result; } -/// getDarwinNumber - Parse the 'darwin number' out of the specific target -/// triple. For example, if we have darwin8.5 return 8,5,0. If any entry is -/// not defined, return 0's. This requires that the triple have an OSType of -/// darwin before it is called. -void Triple::getDarwinNumber(unsigned &Maj, unsigned &Min, - unsigned &Revision) const { - assert(getOS() == Darwin && "Not a darwin target triple!"); +void Triple::getOSVersion(unsigned &Major, unsigned &Minor, + unsigned &Micro) const { StringRef OSName = getOSName(); - assert(OSName.startswith("darwin") && "Unknown darwin target triple!"); - - // Strip off "darwin". - OSName = OSName.substr(6); - - Maj = Min = Revision = 0; - - if (OSName.empty() || OSName[0] < '0' || OSName[0] > '9') - return; - // The major version is the first digit. - Maj = EatNumber(OSName); - if (OSName.empty()) return; + // Assume that the OS portion of the triple starts with the canonical name. + StringRef OSTypeName = getOSTypeName(getOS()); + if (OSName.startswith(OSTypeName)) + OSName = OSName.substr(OSTypeName.size()); - // Handle minor version: 10.4.9 -> darwin8.9. - if (OSName[0] != '.') - return; + // Any unset version defaults to 0. + Major = Minor = Micro = 0; - // Eat the '.'. - OSName = OSName.substr(1); - - if (OSName.empty() || OSName[0] < '0' || OSName[0] > '9') - return; - - Min = EatNumber(OSName); - if (OSName.empty()) return; - - // Handle revision darwin8.9.1 - if (OSName[0] != '.') - return; - - // Eat the '.'. - OSName = OSName.substr(1); + // Parse up to three components. 
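The rewritten EatNumber above consumes an arbitrary run of digits rather than special-casing "darwin11", and getOSVersion then applies it up to three times with '.' separators. A self-contained sketch of that parsing scheme, assuming only what the hunk shows:

  #include "llvm/ADT/StringRef.h"

  // Parse a dotted version such as "10.6.8"; missing components stay 0.
  static void parseVersion(llvm::StringRef Str, unsigned &Major,
                           unsigned &Minor, unsigned &Micro) {
    Major = Minor = Micro = 0;
    unsigned *Components[3] = { &Major, &Minor, &Micro };
    for (unsigned i = 0; i != 3; ++i) {
      if (Str.empty() || Str[0] < '0' || Str[0] > '9')
        break;
      unsigned N = 0;
      do { // the EatNumber loop from the hunk above
        N = N * 10 + (Str[0] - '0');
        Str = Str.substr(1);
      } while (!Str.empty() && Str[0] >= '0' && Str[0] <= '9');
      *Components[i] = N;
      if (Str.startswith(".")) // consume the separator, if present
        Str = Str.substr(1);
    }
  }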
+ unsigned *Components[3] = { &Major, &Minor, &Micro }; + for (unsigned i = 0; i != 3; ++i) { + if (OSName.empty() || OSName[0] < '0' || OSName[0] > '9') + break; - if (OSName.empty() || OSName[0] < '0' || OSName[0] > '9') - return; + // Consume the leading number. + *Components[i] = EatNumber(OSName); - Revision = EatNumber(OSName); + // Consume the separator, if present. + if (OSName.startswith(".")) + OSName = OSName.substr(1); + } } void Triple::setTriple(const Twine &Str) { diff --git a/lib/Support/Unix/Host.inc b/lib/Support/Unix/Host.inc index c971955..a286a15 100644 --- a/lib/Support/Unix/Host.inc +++ b/lib/Support/Unix/Host.inc @@ -45,35 +45,6 @@ std::string sys::getHostTriple() { // Normalize the arch, since the host triple may not actually match the host. std::string Arch = ArchSplit.first; - // It would be nice to do this in terms of llvm::Triple, but that is in - // Support which is layered above us. -#if defined(__x86_64__) - Arch = "x86_64"; -#elif defined(__i386__) - Arch = "i386"; -#elif defined(__ppc64__) - Arch = "powerpc64"; -#elif defined(__ppc__) - Arch = "powerpc"; -#elif defined(__arm__) - - // FIXME: We need to pick the right ARM triple (which involves querying the - // chip). However, for now this is most important for LLVM arch selection, so - // we only need to make sure to distinguish ARM and Thumb. -# if defined(__thumb__) - Arch = "thumb"; -# else - Arch = "arm"; -# endif - -#else - - // FIXME: When enough auto-detection is in place, this should just - // #error. Then at least the arch selection is done, and we only need the OS - // etc selection to kill off the use of LLVM_HOSTTRIPLE. - -#endif - std::string Triple(Arch); Triple += '-'; Triple += ArchSplit.second; @@ -88,10 +59,7 @@ std::string sys::getHostTriple() { std::string::size_type DarwinDashIdx = Triple.find("-darwin"); if (DarwinDashIdx != std::string::npos) { Triple.resize(DarwinDashIdx + strlen("-darwin")); - - // Only add the major part of the os version. - std::string Version = getOSVersion(); - Triple += Version.substr(0, Version.find('.')); + Triple += getOSVersion(); } return Triple; diff --git a/lib/Support/Unix/Program.inc b/lib/Support/Unix/Program.inc index 6efc8cd..346baf1 100644 --- a/lib/Support/Unix/Program.inc +++ b/lib/Support/Unix/Program.inc @@ -236,7 +236,7 @@ Program::Execute(const Path &path, const char **args, const char **envp, // Create a child process. int child = fork(); switch (child) { - // An error occured: Return to the caller. + // An error occurred: Return to the caller. case -1: MakeErrMsg(ErrMsg, "Couldn't fork"); return false; @@ -338,7 +338,7 @@ Program::Wait(const sys::Path &path, else MakeErrMsg(ErrMsg, "Child timed out", 0); - return -1; // Timeout detected + return -2; // Timeout detected } else if (errno != EINTR) { MakeErrMsg(ErrMsg, "Error waiting for child process"); return -1; @@ -382,7 +382,9 @@ Program::Wait(const sys::Path &path, *ErrMsg += " (core dumped)"; #endif } - return -1; + // Return a special value to indicate that the process received an unhandled + // signal during execution as opposed to failing to execute. 
+ return -2; } return result; #else diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc index 0a61759..e286869 100644 --- a/lib/Support/Unix/Signals.inc +++ b/lib/Support/Unix/Signals.inc @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // // This file defines some helpful functions for dealing with the possibility of -// Unix signals occuring while your program is running. +// Unix signals occurring while your program is running. // //===----------------------------------------------------------------------===// @@ -274,6 +274,9 @@ void llvm::sys::PrintStackTraceOnErrorSignal() { #ifdef __APPLE__ +#include <signal.h> +#include <pthread.h> + int raise(int sig) { return pthread_kill(pthread_self(), sig); } @@ -291,9 +294,6 @@ void __assert_rtn(const char *func, abort(); } -#include <signal.h> -#include <pthread.h> - void abort() { raise(SIGABRT); usleep(1000); diff --git a/lib/Support/Windows/DynamicLibrary.inc b/lib/Support/Windows/DynamicLibrary.inc index 2c14366..4227844 100644 --- a/lib/Support/Windows/DynamicLibrary.inc +++ b/lib/Support/Windows/DynamicLibrary.inc @@ -41,41 +41,12 @@ using namespace sys; static std::vector<HMODULE> OpenedHandles; -#ifdef _WIN64 - typedef DWORD64 ModuleBaseType; -#else - typedef ULONG ModuleBaseType; -#endif - extern "C" { -// Use old callback if: -// - Not using Visual Studio -// - Visual Studio 2005 or earlier but only if we are not using the Windows SDK -// or Windows SDK version is older than 6.0 -// Use new callback if: -// - Newer Visual Studio (comes with newer SDK). -// - Visual Studio 2005 with Windows SDK 6.0+ -#if defined(_MSC_VER) - #if _MSC_VER < 1500 && (!defined(VER_PRODUCTBUILD) || VER_PRODUCTBUILD < 6000) - #define OLD_ELM_CALLBACK_DECL 1 - #endif -#elif defined(__MINGW64__) - // Use new callback. -#elif defined(__MINGW32__) - #define OLD_ELM_CALLBACK_DECL 1 -#endif -#ifdef OLD_ELM_CALLBACK_DECL - static BOOL CALLBACK ELM_Callback(PSTR ModuleName, - ModuleBaseType ModuleBase, + static BOOL CALLBACK ELM_Callback(WIN32_ELMCB_PCSTR ModuleName, + ULONG_PTR ModuleBase, ULONG ModuleSize, PVOID UserContext) -#else - static BOOL CALLBACK ELM_Callback(PCSTR ModuleName, - ModuleBaseType ModuleBase, - ULONG ModuleSize, - PVOID UserContext) -#endif { // Ignore VC++ runtimes prior to 7.1. Somehow some of them get loaded // into the process. diff --git a/lib/Support/Windows/Program.inc b/lib/Support/Windows/Program.inc index 350363c..e486e6e 100644 --- a/lib/Support/Windows/Program.inc +++ b/lib/Support/Windows/Program.inc @@ -349,7 +349,8 @@ Program::Wait(const Path &path, if (WaitForSingleObject(hProcess, millisecondsToWait) == WAIT_TIMEOUT) { if (!TerminateProcess(hProcess, 1)) { MakeErrMsg(ErrMsg, "Failed to terminate timed-out program."); - return -1; + // -2 indicates a crash or timeout as opposed to failure to execute. + return -2; } WaitForSingleObject(hProcess, INFINITE); } @@ -362,7 +363,8 @@ Program::Wait(const Path &path, if (!rc) { SetLastError(err); MakeErrMsg(ErrMsg, "Failed getting status for program."); - return -1; + // -2 indicates a crash or timeout as opposed to failure to execute. 
+ return -2; } return status; diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index e690e18..6af5f85 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -67,6 +67,14 @@ def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true", "Prefer 32-bit Thumb instrs">; +/// Some instructions update CPSR partially, which can add a false dependency for +/// out-of-order implementations, e.g. Cortex-A9, unless each individual bit is +/// mapped to a separate physical register. Avoid partial CPSR updates for these +/// processors. +def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr", + "AvoidCPSRPartialUpdate", "true", + "Avoid CPSR partial update for OOO execution">; + // Multiprocessing extension. def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true", "Supports Multiprocessing extension">; @@ -110,8 +118,9 @@ def ProcA8 : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8", FeatureT2XtPk]>; def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9", "Cortex-A9 ARM processors", - [FeatureHasSlowFPVMLx, FeatureVMLxForwarding, - FeatureT2XtPk, FeatureFP16]>; + [FeatureVMLxForwarding, + FeatureT2XtPk, FeatureFP16, + FeatureAvoidPartialCPSR]>; class ProcNoItin<string Name, list<SubtargetFeature> Features> : Processor<Name, GenericItineraries, Features>; @@ -178,6 +187,8 @@ def : Processor<"cortex-a8", CortexA8Itineraries, [ArchV7A, ProcA8]>; def : Processor<"cortex-a9", CortexA9Itineraries, [ArchV7A, ProcA9]>; +def : Processor<"cortex-a9-mp", CortexA9Itineraries, + [ArchV7A, ProcA9, FeatureMP]>; // V7M Processors. def : ProcNoItin<"cortex-m3", [ArchV7M]>; diff --git a/lib/Target/ARM/ARMAsmBackend.cpp b/lib/Target/ARM/ARMAsmBackend.cpp index e972a08..db132da 100644 --- a/lib/Target/ARM/ARMAsmBackend.cpp +++ b/lib/Target/ARM/ARMAsmBackend.cpp @@ -76,7 +76,7 @@ public: { "fixup_arm_thumb_blx", 7, 21, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_arm_thumb_cb", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_arm_thumb_cp", 1, 8, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_arm_thumb_bcc", 1, 8, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_thumb_bcc", 0, 8, MCFixupKindInfo::FKF_IsPCRel }, // movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16 - 19.
{ "fixup_arm_movt_hi16", 0, 20, 0 }, { "fixup_arm_movw_lo16", 0, 20, 0 }, @@ -164,23 +164,25 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { case FK_Data_4: return Value; case ARM::fixup_arm_movt_hi16: - case ARM::fixup_arm_movt_hi16_pcrel: Value >>= 16; // Fallthrough case ARM::fixup_arm_movw_lo16: + case ARM::fixup_arm_movt_hi16_pcrel: case ARM::fixup_arm_movw_lo16_pcrel: { unsigned Hi4 = (Value & 0xF000) >> 12; unsigned Lo12 = Value & 0x0FFF; + assert ((((int64_t)Value) >= -0x8000) && (((int64_t)Value) <= 0x7fff) && + "Out of range pc-relative fixup value!"); // inst{19-16} = Hi4; // inst{11-0} = Lo12; Value = (Hi4 << 16) | (Lo12); return Value; } case ARM::fixup_t2_movt_hi16: - case ARM::fixup_t2_movt_hi16_pcrel: Value >>= 16; // Fallthrough case ARM::fixup_t2_movw_lo16: + case ARM::fixup_t2_movt_hi16_pcrel: case ARM::fixup_t2_movw_lo16_pcrel: { unsigned Hi4 = (Value & 0xF000) >> 12; unsigned i = (Value & 0x800) >> 11; @@ -190,8 +192,9 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { // inst{26} = i; // inst{14-12} = Mid3; // inst{7-0} = Lo8; + assert ((((int64_t)Value) >= -0x8000) && (((int64_t)Value) <= 0x7fff) && + "Out of range pc-relative fixup value!"); Value = (Hi4 << 16) | (i << 26) | (Mid3 << 12) | (Lo8); - uint64_t swapped = (Value & 0xFFFF0000) >> 16; swapped |= (Value & 0x0000FFFF) << 16; return swapped; @@ -305,7 +308,7 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { // // Note that the halfwords are stored high first, low second; so we need // to transpose the fixup value here to map properly. - unsigned isNeg = (int64_t(Value) < 0) ? 1 : 0; + unsigned isNeg = (int64_t(Value - 4) < 0) ? 1 : 0; uint32_t Binary = 0; Value = 0x3fffff & ((Value - 4) >> 1); Binary = (Value & 0x7ff) << 16; // Low imm11 value. @@ -323,7 +326,7 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { // // Note that the halfwords are stored high first, low second; so we need // to transpose the fixup value here to map properly. - unsigned isNeg = (int64_t(Value) < 0) ? 1 : 0; + unsigned isNeg = (int64_t(Value-4) < 0) ? 1 : 0; uint32_t Binary = 0; Value = 0xfffff & ((Value - 2) >> 2); Binary = (Value & 0x3ff) << 17; // Low imm10L value. @@ -404,7 +407,6 @@ void ELFARMAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data, if (!Value) return; // Doesn't change encoding. unsigned Offset = Fixup.getOffset(); - assert(Offset % NumBytes == 0 && "Offset mod NumBytes is nonzero!"); // For each byte of the fragment that the fixup touches, mask in the bits from // the fixup value. 
The Value has been "split up" into the appropriate @@ -501,18 +503,22 @@ void DarwinARMAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data, TargetAsmBackend *llvm::createARMAsmBackend(const Target &T, const std::string &TT) { Triple TheTriple(TT); - switch (TheTriple.getOS()) { - case Triple::Darwin: { - if (TheTriple.getArchName() == "armv6" || + + if (TheTriple.isOSDarwin()) { + if (TheTriple.getArchName() == "armv4t" || + TheTriple.getArchName() == "thumbv4t") + return new DarwinARMAsmBackend(T, object::mach::CSARM_V4T); + else if (TheTriple.getArchName() == "armv5e" || + TheTriple.getArchName() == "thumbv5e") + return new DarwinARMAsmBackend(T, object::mach::CSARM_V5TEJ); + else if (TheTriple.getArchName() == "armv6" || TheTriple.getArchName() == "thumbv6") return new DarwinARMAsmBackend(T, object::mach::CSARM_V6); return new DarwinARMAsmBackend(T, object::mach::CSARM_V7); } - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: + + if (TheTriple.isOSWindows()) assert(0 && "Windows not supported on ARM"); - default: - return new ELFARMAsmBackend(T, Triple(TT).getOS()); - } + + return new ELFARMAsmBackend(T, Triple(TT).getOS()); } diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index 8eb1993..eb73902 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -172,10 +172,70 @@ getDebugValueLocation(const MachineInstr *MI) const { return Location; } +/// EmitDwarfRegOp - Emit dwarf register operation. +void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const { + const TargetRegisterInfo *RI = TM.getRegisterInfo(); + if (RI->getDwarfRegNum(MLoc.getReg(), false) != -1) + AsmPrinter::EmitDwarfRegOp(MLoc); + else { + unsigned Reg = MLoc.getReg(); + if (Reg >= ARM::S0 && Reg <= ARM::S31) { + assert(ARM::S0 + 31 == ARM::S31 && "Unexpected ARM S register numbering"); + // S registers are described as bit-pieces of a register + // S[2x] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 0) + // S[2x+1] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 32) + + unsigned SReg = Reg - ARM::S0; + bool odd = SReg & 0x1; + unsigned Rx = 256 + (SReg >> 1); + + OutStreamer.AddComment("DW_OP_regx for S register"); + EmitInt8(dwarf::DW_OP_regx); + + OutStreamer.AddComment(Twine(SReg)); + EmitULEB128(Rx); + + if (odd) { + OutStreamer.AddComment("DW_OP_bit_piece 32 32"); + EmitInt8(dwarf::DW_OP_bit_piece); + EmitULEB128(32); + EmitULEB128(32); + } else { + OutStreamer.AddComment("DW_OP_bit_piece 32 0"); + EmitInt8(dwarf::DW_OP_bit_piece); + EmitULEB128(32); + EmitULEB128(0); + } + } else if (Reg >= ARM::Q0 && Reg <= ARM::Q15) { + assert(ARM::Q0 + 15 == ARM::Q15 && "Unexpected ARM Q register numbering"); + // Q registers Q0-Q15 are described by composing two D registers together. 
+ // Qx = DW_OP_regx(256+2x) DW_OP_piece(8) DW_OP_regx(256+2x+1) DW_OP_piece(8) + + unsigned QReg = Reg - ARM::Q0; + unsigned D1 = 256 + 2 * QReg; + unsigned D2 = D1 + 1; + + OutStreamer.AddComment("DW_OP_regx for Q register: D1"); + EmitInt8(dwarf::DW_OP_regx); + EmitULEB128(D1); + OutStreamer.AddComment("DW_OP_piece 8"); + EmitInt8(dwarf::DW_OP_piece); + EmitULEB128(8); + + OutStreamer.AddComment("DW_OP_regx for Q register: D2"); + EmitInt8(dwarf::DW_OP_regx); + EmitULEB128(D2); + OutStreamer.AddComment("DW_OP_piece 8"); + EmitInt8(dwarf::DW_OP_piece); + EmitULEB128(8); + } + } +} + void ARMAsmPrinter::EmitFunctionEntryLabel() { if (AFI->isThumbFunction()) { OutStreamer.EmitAssemblerFlag(MCAF_Code16); - OutStreamer.EmitThumbFunc(Subtarget->isTargetDarwin()? CurrentFnSym : 0); + OutStreamer.EmitThumbFunc(CurrentFnSym); } OutStreamer.EmitLabel(CurrentFnSym); @@ -305,10 +365,63 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, case 'q': // Print a NEON quad precision register. printOperand(MI, OpNum, O); return false; - case 'Q': - case 'R': - case 'H': - // These modifiers are not yet supported. + case 'y': // Print a VFP single precision register as indexed double. + // This uses the ordering of the alias table to get the first 'd' register + // that overlaps the 's' register. Also, s0 is an odd register, hence the + // odd modulus check below. + if (MI->getOperand(OpNum).isReg()) { + unsigned Reg = MI->getOperand(OpNum).getReg(); + const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); + O << ARMInstPrinter::getRegisterName(TRI->getAliasSet(Reg)[0]) << + (((Reg % 2) == 1) ? "[0]" : "[1]"); + return false; + } + return true; + case 'B': // Bitwise inverse of integer or symbol without a preceding #. + if (!MI->getOperand(OpNum).isImm()) + return true; + O << ~(MI->getOperand(OpNum).getImm()); + return false; + case 'L': // The low 16 bits of an immediate constant. + if (!MI->getOperand(OpNum).isImm()) + return true; + O << (MI->getOperand(OpNum).getImm() & 0xffff); + return false; + case 'M': { // A register range suitable for LDM/STM. + if (!MI->getOperand(OpNum).isReg()) + return true; + const MachineOperand &MO = MI->getOperand(OpNum); + unsigned RegBegin = MO.getReg(); + // This takes advantage of the 2 operand-ness of ldm/stm and that we've + // already got the operands in registers that are operands to the + // inline asm statement. + + O << "{" << ARMInstPrinter::getRegisterName(RegBegin); + + // FIXME: The register allocator not only may not have given us the + // registers in sequence, but may not be in ascending registers. This + // will require changes in the register allocator that'll need to be + // propagated down here if the operands change. + unsigned RegOps = OpNum + 1; + while (MI->getOperand(RegOps).isReg()) { + O << ", " + << ARMInstPrinter::getRegisterName(MI->getOperand(RegOps).getReg()); + RegOps++; + } + + O << "}"; + + return false; + } + // These modifiers are not yet supported. + case 'p': // The high single-precision register of a VFP double-precision + // register. + case 'e': // The low doubleword register of a NEON quad register. + case 'f': // The high doubleword register of a NEON quad register. + case 'h': // A range of VFP/NEON registers suitable for VLD1/VST1. + case 'Q': // The least significant register of a pair. + case 'R': // The most significant register of a pair. + case 'H': // The highest-numbered register of a pair. 
return true; } } @@ -321,9 +434,21 @@ bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { - if (ExtraCode && ExtraCode[0]) - return true; // Unknown modifier. - + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + switch (ExtraCode[0]) { + case 'A': // A memory operand for a VLD1/VST1 instruction. + default: return true; // Unknown modifier. + case 'm': // The base register of a memory operand. + if (!MI->getOperand(OpNum).isReg()) + return true; + O << ARMInstPrinter::getRegisterName(MI->getOperand(OpNum).getReg()); + return false; + } + } + const MachineOperand &MO = MI->getOperand(OpNum); assert(MO.isReg() && "unexpected inline asm memory operand"); O << "[" << ARMInstPrinter::getRegisterName(MO.getReg()) << "]"; @@ -489,6 +614,12 @@ void ARMAsmPrinter::emitAttributes() { // /// ADD additional Else-cases here! + } else if (CPUString == "xscale") { + AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5TEJ); + AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use, + ARMBuildAttrs::Allowed); + AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use, + ARMBuildAttrs::Allowed); } else if (CPUString == "generic") { // FIXME: Why these defaults? AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4T); @@ -1077,6 +1208,26 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { } return; } + case ARM::tBXr9_CALL: + case ARM::tBX_CALL: { + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tMOVr); + TmpInst.addOperand(MCOperand::CreateReg(ARM::LR)); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tBX); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + return; + } case ARM::BMOVPCRXr9_CALL: case ARM::BMOVPCRX_CALL: { { @@ -1698,7 +1849,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { } { MCInst TmpInst; - TmpInst.setOpcode(ARM::tBX_RET_vararg); + TmpInst.setOpcode(ARM::tBX); TmpInst.addOperand(MCOperand::CreateReg(ScratchReg)); // Predicate. TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); @@ -1708,7 +1859,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } // Tail jump branches are really just branch instructions with additional - // code-gen attributes. Convert them to the cannonical form here. + // code-gen attributes. Convert them to the canonical form here. case ARM::TAILJMPd: case ARM::TAILJMPdND: { MCInst TmpInst, TmpInst2; @@ -1727,7 +1878,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { case ARM::tTAILJMPdND: { MCInst TmpInst, TmpInst2; LowerARMMachineInstrToMCInst(MI, TmpInst2, *this); - TmpInst.setOpcode(ARM::tB); + // The Darwin toolchain doesn't support tail call relocations of 16-bit + // branches. + TmpInst.setOpcode(Opc == ARM::tTAILJMPd ? 
ARM::t2B : ARM::tB); TmpInst.addOperand(TmpInst2.getOperand(0)); OutStreamer.AddComment("TAILCALL"); OutStreamer.EmitInstruction(TmpInst); diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h index 9db139b..5f9169e 100644 --- a/lib/Target/ARM/ARMAsmPrinter.h +++ b/lib/Target/ARM/ARMAsmPrinter.h @@ -89,6 +89,9 @@ public: MachineLocation getDebugValueLocation(const MachineInstr *MI) const; + /// EmitDwarfRegOp - Emit dwarf register operation. + virtual void EmitDwarfRegOp(const MachineLocation &MLoc) const; + virtual unsigned getISAEncoding() { // ARM/Darwin adds ISA to the DWARF info for each function. if (!Subtarget->isTargetDarwin()) diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 30148c2..44a3976 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1021,7 +1021,7 @@ reMaterialize(MachineBasicBlock &MBB, MachineInstrBuilder MIB = BuildMI(MBB, I, Orig->getDebugLoc(), get(Opcode), DestReg) .addConstantPoolIndex(CPI).addImm(PCLabelId); - (*MIB).setMemRefs(Orig->memoperands_begin(), Orig->memoperands_end()); + MIB->setMemRefs(Orig->memoperands_begin(), Orig->memoperands_end()); break; } } @@ -1201,7 +1201,7 @@ bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, } /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to -/// determine (in conjuction with areLoadsFromSameBasePtr) if two loads should +/// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should /// be scheduled togther. On some targets if two loads are loading from /// addresses in the same cache line, it's better if they are scheduled /// together. This function takes two integers that represent the load offsets @@ -1270,19 +1270,19 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI, } bool ARMBaseInstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, - unsigned NumCyles, + unsigned NumCycles, unsigned ExtraPredCycles, float Probability, float Confidence) const { - if (!NumCyles) + if (!NumCycles) return false; // Attempt to estimate the relative costs of predication versus branching. - float UnpredCost = Probability * NumCyles; + float UnpredCost = Probability * NumCycles; UnpredCost += 1.0; // The branch itself UnpredCost += (1.0 - Confidence) * Subtarget.getMispredictionPenalty(); - return (float)(NumCyles + ExtraPredCycles) < UnpredCost; + return (float)(NumCycles + ExtraPredCycles) < UnpredCost; } bool ARMBaseInstrInfo:: @@ -1618,17 +1618,39 @@ OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask, // Set the "zero" bit in CPSR. switch (MI->getOpcode()) { default: break; + case ARM::RSBrr: case ARM::RSBri: + case ARM::RSCrr: case ARM::RSCri: + case ARM::ADDrr: case ARM::ADDri: + case ARM::ADCrr: case ARM::ADCri: + case ARM::SUBrr: case ARM::SUBri: + case ARM::SBCrr: case ARM::SBCri: case ARM::t2RSBri: + case ARM::t2ADDrr: case ARM::t2ADDri: + case ARM::t2ADCrr: case ARM::t2ADCri: + case ARM::t2SUBrr: case ARM::t2SUBri: - case ARM::t2SBCri: { + case ARM::t2SBCrr: + case ARM::t2SBCri: + case ARM::ANDrr: + case ARM::ANDri: + case ARM::t2ANDrr: + case ARM::t2ANDri: + case ARM::ORRrr: + case ARM::ORRri: + case ARM::t2ORRrr: + case ARM::t2ORRri: + case ARM::EORrr: + case ARM::EORri: + case ARM::t2EORrr: + case ARM::t2EORri: { // Scan forward for the use of CPSR, if it's a conditional code requires // checking of V bit, then this is not safe to do. If we can't find the // CPSR use (i.e. 
used in another block), then it's not safe to perform @@ -1667,16 +1689,13 @@ OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask, if (!isSafe) return false; - // fallthrough - } - case ARM::ANDri: - case ARM::t2ANDri: // Toggle the optional operand to CPSR. MI->getOperand(5).setReg(ARM::CPSR); MI->getOperand(5).setIsDef(true); CmpInstr->eraseFromParent(); return true; } + } return false; } @@ -2203,6 +2222,101 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, } } + if (DefAlign < 8 && Subtarget.isCortexA9()) + switch (DefTID.getOpcode()) { + default: break; + case ARM::VLD1q8: + case ARM::VLD1q16: + case ARM::VLD1q32: + case ARM::VLD1q64: + case ARM::VLD1q8_UPD: + case ARM::VLD1q16_UPD: + case ARM::VLD1q32_UPD: + case ARM::VLD1q64_UPD: + case ARM::VLD2d8: + case ARM::VLD2d16: + case ARM::VLD2d32: + case ARM::VLD2q8: + case ARM::VLD2q16: + case ARM::VLD2q32: + case ARM::VLD2d8_UPD: + case ARM::VLD2d16_UPD: + case ARM::VLD2d32_UPD: + case ARM::VLD2q8_UPD: + case ARM::VLD2q16_UPD: + case ARM::VLD2q32_UPD: + case ARM::VLD3d8: + case ARM::VLD3d16: + case ARM::VLD3d32: + case ARM::VLD1d64T: + case ARM::VLD3d8_UPD: + case ARM::VLD3d16_UPD: + case ARM::VLD3d32_UPD: + case ARM::VLD1d64T_UPD: + case ARM::VLD3q8_UPD: + case ARM::VLD3q16_UPD: + case ARM::VLD3q32_UPD: + case ARM::VLD4d8: + case ARM::VLD4d16: + case ARM::VLD4d32: + case ARM::VLD1d64Q: + case ARM::VLD4d8_UPD: + case ARM::VLD4d16_UPD: + case ARM::VLD4d32_UPD: + case ARM::VLD1d64Q_UPD: + case ARM::VLD4q8_UPD: + case ARM::VLD4q16_UPD: + case ARM::VLD4q32_UPD: + case ARM::VLD1DUPq8: + case ARM::VLD1DUPq16: + case ARM::VLD1DUPq32: + case ARM::VLD1DUPq8_UPD: + case ARM::VLD1DUPq16_UPD: + case ARM::VLD1DUPq32_UPD: + case ARM::VLD2DUPd8: + case ARM::VLD2DUPd16: + case ARM::VLD2DUPd32: + case ARM::VLD2DUPd8_UPD: + case ARM::VLD2DUPd16_UPD: + case ARM::VLD2DUPd32_UPD: + case ARM::VLD4DUPd8: + case ARM::VLD4DUPd16: + case ARM::VLD4DUPd32: + case ARM::VLD4DUPd8_UPD: + case ARM::VLD4DUPd16_UPD: + case ARM::VLD4DUPd32_UPD: + case ARM::VLD1LNd8: + case ARM::VLD1LNd16: + case ARM::VLD1LNd32: + case ARM::VLD1LNd8_UPD: + case ARM::VLD1LNd16_UPD: + case ARM::VLD1LNd32_UPD: + case ARM::VLD2LNd8: + case ARM::VLD2LNd16: + case ARM::VLD2LNd32: + case ARM::VLD2LNq16: + case ARM::VLD2LNq32: + case ARM::VLD2LNd8_UPD: + case ARM::VLD2LNd16_UPD: + case ARM::VLD2LNd32_UPD: + case ARM::VLD2LNq16_UPD: + case ARM::VLD2LNq32_UPD: + case ARM::VLD4LNd8: + case ARM::VLD4LNd16: + case ARM::VLD4LNd32: + case ARM::VLD4LNq16: + case ARM::VLD4LNq32: + case ARM::VLD4LNd8_UPD: + case ARM::VLD4LNd16_UPD: + case ARM::VLD4LNd32_UPD: + case ARM::VLD4LNq16_UPD: + case ARM::VLD4LNq32_UPD: + // If the address is not 64-bit aligned, the latencies of these + // instructions increases by one. 
+ ++Latency; + break; + } + return Latency; } @@ -2269,6 +2383,113 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, } } + if (DefAlign < 8 && Subtarget.isCortexA9()) + switch (DefTID.getOpcode()) { + default: break; + case ARM::VLD1q8Pseudo: + case ARM::VLD1q16Pseudo: + case ARM::VLD1q32Pseudo: + case ARM::VLD1q64Pseudo: + case ARM::VLD1q8Pseudo_UPD: + case ARM::VLD1q16Pseudo_UPD: + case ARM::VLD1q32Pseudo_UPD: + case ARM::VLD1q64Pseudo_UPD: + case ARM::VLD2d8Pseudo: + case ARM::VLD2d16Pseudo: + case ARM::VLD2d32Pseudo: + case ARM::VLD2q8Pseudo: + case ARM::VLD2q16Pseudo: + case ARM::VLD2q32Pseudo: + case ARM::VLD2d8Pseudo_UPD: + case ARM::VLD2d16Pseudo_UPD: + case ARM::VLD2d32Pseudo_UPD: + case ARM::VLD2q8Pseudo_UPD: + case ARM::VLD2q16Pseudo_UPD: + case ARM::VLD2q32Pseudo_UPD: + case ARM::VLD3d8Pseudo: + case ARM::VLD3d16Pseudo: + case ARM::VLD3d32Pseudo: + case ARM::VLD1d64TPseudo: + case ARM::VLD3d8Pseudo_UPD: + case ARM::VLD3d16Pseudo_UPD: + case ARM::VLD3d32Pseudo_UPD: + case ARM::VLD1d64TPseudo_UPD: + case ARM::VLD3q8Pseudo_UPD: + case ARM::VLD3q16Pseudo_UPD: + case ARM::VLD3q32Pseudo_UPD: + case ARM::VLD3q8oddPseudo: + case ARM::VLD3q16oddPseudo: + case ARM::VLD3q32oddPseudo: + case ARM::VLD3q8oddPseudo_UPD: + case ARM::VLD3q16oddPseudo_UPD: + case ARM::VLD3q32oddPseudo_UPD: + case ARM::VLD4d8Pseudo: + case ARM::VLD4d16Pseudo: + case ARM::VLD4d32Pseudo: + case ARM::VLD1d64QPseudo: + case ARM::VLD4d8Pseudo_UPD: + case ARM::VLD4d16Pseudo_UPD: + case ARM::VLD4d32Pseudo_UPD: + case ARM::VLD1d64QPseudo_UPD: + case ARM::VLD4q8Pseudo_UPD: + case ARM::VLD4q16Pseudo_UPD: + case ARM::VLD4q32Pseudo_UPD: + case ARM::VLD4q8oddPseudo: + case ARM::VLD4q16oddPseudo: + case ARM::VLD4q32oddPseudo: + case ARM::VLD4q8oddPseudo_UPD: + case ARM::VLD4q16oddPseudo_UPD: + case ARM::VLD4q32oddPseudo_UPD: + case ARM::VLD1DUPq8Pseudo: + case ARM::VLD1DUPq16Pseudo: + case ARM::VLD1DUPq32Pseudo: + case ARM::VLD1DUPq8Pseudo_UPD: + case ARM::VLD1DUPq16Pseudo_UPD: + case ARM::VLD1DUPq32Pseudo_UPD: + case ARM::VLD2DUPd8Pseudo: + case ARM::VLD2DUPd16Pseudo: + case ARM::VLD2DUPd32Pseudo: + case ARM::VLD2DUPd8Pseudo_UPD: + case ARM::VLD2DUPd16Pseudo_UPD: + case ARM::VLD2DUPd32Pseudo_UPD: + case ARM::VLD4DUPd8Pseudo: + case ARM::VLD4DUPd16Pseudo: + case ARM::VLD4DUPd32Pseudo: + case ARM::VLD4DUPd8Pseudo_UPD: + case ARM::VLD4DUPd16Pseudo_UPD: + case ARM::VLD4DUPd32Pseudo_UPD: + case ARM::VLD1LNq8Pseudo: + case ARM::VLD1LNq16Pseudo: + case ARM::VLD1LNq32Pseudo: + case ARM::VLD1LNq8Pseudo_UPD: + case ARM::VLD1LNq16Pseudo_UPD: + case ARM::VLD1LNq32Pseudo_UPD: + case ARM::VLD2LNd8Pseudo: + case ARM::VLD2LNd16Pseudo: + case ARM::VLD2LNd32Pseudo: + case ARM::VLD2LNq16Pseudo: + case ARM::VLD2LNq32Pseudo: + case ARM::VLD2LNd8Pseudo_UPD: + case ARM::VLD2LNd16Pseudo_UPD: + case ARM::VLD2LNd32Pseudo_UPD: + case ARM::VLD2LNq16Pseudo_UPD: + case ARM::VLD2LNq32Pseudo_UPD: + case ARM::VLD4LNd8Pseudo: + case ARM::VLD4LNd16Pseudo: + case ARM::VLD4LNd32Pseudo: + case ARM::VLD4LNq16Pseudo: + case ARM::VLD4LNq32Pseudo: + case ARM::VLD4LNd8Pseudo_UPD: + case ARM::VLD4LNd16Pseudo_UPD: + case ARM::VLD4LNd32Pseudo_UPD: + case ARM::VLD4LNq16Pseudo_UPD: + case ARM::VLD4LNq32Pseudo_UPD: + // If the address is not 64-bit aligned, the latencies of these + // instructions increases by one. 
+ ++Latency; + break; + } + return Latency; } diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 30095fe..96f0e76 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -291,8 +291,8 @@ public: int64_t &Offset1, int64_t &Offset2)const; /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to - /// determine (in conjuction with areLoadsFromSameBasePtr) if two loads should - /// be scheduled togther. On some targets if two loads are loading from + /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads + /// should be scheduled together. On some targets if two loads are loading from /// addresses in the same cache line, it's better if they are scheduled /// together. This function takes two integers that represent the load offsets /// from the common base address. It returns true if it decides it's desirable @@ -307,7 +307,7 @@ public: const MachineFunction &MF) const; virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, - unsigned NumCyles, unsigned ExtraPredCycles, + unsigned NumCycles, unsigned ExtraPredCycles, float Prob, float Confidence) const; virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB, @@ -317,10 +317,10 @@ public: float Probability, float Confidence) const; virtual bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, - unsigned NumCyles, + unsigned NumCycles, float Probability, float Confidence) const { - return NumCyles == 1; + return NumCycles == 1; } /// AnalyzeCompare - For a comparison instruction, return the source register diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 1918fd9..2adcd2c 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -88,7 +88,7 @@ BitVector ARMBaseRegisterInfo:: getReservedRegs(const MachineFunction &MF) const { const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - // FIXME: avoid re-calculating this everytime. + // FIXME: avoid re-calculating this every time. BitVector Reserved(getNumRegs()); Reserved.set(ARM::SP); Reserved.set(ARM::PC); @@ -342,6 +342,25 @@ ARMBaseRegisterInfo::canCombineSubRegIndices(const TargetRegisterClass *RC, return false; } +const TargetRegisterClass* +ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) + const { + const TargetRegisterClass *Super = RC; + TargetRegisterClass::sc_iterator I = RC->superclasses_begin(); + do { + switch (Super->getID()) { + case ARM::GPRRegClassID: + case ARM::SPRRegClassID: + case ARM::DPRRegClassID: + case ARM::QPRRegClassID: + case ARM::QQPRRegClassID: + case ARM::QQQQPRRegClassID: + return Super; + } + Super = *I++; + } while (Super); + return RC; +} const TargetRegisterClass * ARMBaseRegisterInfo::getPointerRegClass(unsigned Kind) const { @@ -368,12 +387,12 @@ ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, } } -/// getAllocationOrder - Returns the register allocation order for a specified -/// register class in the form of a pair of TargetRegisterClass iterators. -std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator> -ARMBaseRegisterInfo::getAllocationOrder(const TargetRegisterClass *RC, - unsigned HintType, unsigned HintReg, - const MachineFunction &MF) const { +/// getRawAllocationOrder - Returns the register allocation order for a +/// specified register class with a target-dependent hint.
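The getRawAllocationOrder rewrite below replaces the old iterator-pair interface with ArrayRef, whose array constructor deduces the element count, so the sizeof arithmetic on the static order tables disappears. A minimal sketch of the idiom (table contents made up):

  #include "llvm/ADT/ArrayRef.h"

  static const unsigned GPREven[] = { 0, 2, 4, 6, 8, 10 }; // made-up numbers

  // ArrayRef's C-array constructor infers the length from the array type.
  static llvm::ArrayRef<unsigned> evenOrder() {
    return llvm::ArrayRef<unsigned>(GPREven);
  }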
+ArrayRef<unsigned> +ARMBaseRegisterInfo::getRawAllocationOrder(const TargetRegisterClass *RC, + unsigned HintType, unsigned HintReg, + const MachineFunction &MF) const { const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); // Alternative register allocation orders when favoring even / odd registers // of register pairs. @@ -450,70 +469,54 @@ ARMBaseRegisterInfo::getAllocationOrder(const TargetRegisterClass *RC, // We only support even/odd hints for GPR and rGPR. if (RC != ARM::GPRRegisterClass && RC != ARM::rGPRRegisterClass) - return std::make_pair(RC->allocation_order_begin(MF), - RC->allocation_order_end(MF)); + return RC->getRawAllocationOrder(MF); if (HintType == ARMRI::RegPairEven) { if (isPhysicalRegister(HintReg) && getRegisterPairEven(HintReg, MF) == 0) // It's no longer possible to fulfill this hint. Return the default // allocation order. - return std::make_pair(RC->allocation_order_begin(MF), - RC->allocation_order_end(MF)); + return RC->getRawAllocationOrder(MF); if (!TFI->hasFP(MF)) { if (!STI.isR9Reserved()) - return std::make_pair(GPREven1, - GPREven1 + (sizeof(GPREven1)/sizeof(unsigned))); + return ArrayRef<unsigned>(GPREven1); else - return std::make_pair(GPREven4, - GPREven4 + (sizeof(GPREven4)/sizeof(unsigned))); + return ArrayRef<unsigned>(GPREven4); } else if (FramePtr == ARM::R7) { if (!STI.isR9Reserved()) - return std::make_pair(GPREven2, - GPREven2 + (sizeof(GPREven2)/sizeof(unsigned))); + return ArrayRef<unsigned>(GPREven2); else - return std::make_pair(GPREven5, - GPREven5 + (sizeof(GPREven5)/sizeof(unsigned))); + return ArrayRef<unsigned>(GPREven5); } else { // FramePtr == ARM::R11 if (!STI.isR9Reserved()) - return std::make_pair(GPREven3, - GPREven3 + (sizeof(GPREven3)/sizeof(unsigned))); + return ArrayRef<unsigned>(GPREven3); else - return std::make_pair(GPREven6, - GPREven6 + (sizeof(GPREven6)/sizeof(unsigned))); + return ArrayRef<unsigned>(GPREven6); } } else if (HintType == ARMRI::RegPairOdd) { if (isPhysicalRegister(HintReg) && getRegisterPairOdd(HintReg, MF) == 0) // It's no longer possible to fulfill this hint. Return the default // allocation order. 
- return std::make_pair(RC->allocation_order_begin(MF), - RC->allocation_order_end(MF)); + return RC->getRawAllocationOrder(MF); if (!TFI->hasFP(MF)) { if (!STI.isR9Reserved()) - return std::make_pair(GPROdd1, - GPROdd1 + (sizeof(GPROdd1)/sizeof(unsigned))); + return ArrayRef<unsigned>(GPROdd1); else - return std::make_pair(GPROdd4, - GPROdd4 + (sizeof(GPROdd4)/sizeof(unsigned))); + return ArrayRef<unsigned>(GPROdd4); } else if (FramePtr == ARM::R7) { if (!STI.isR9Reserved()) - return std::make_pair(GPROdd2, - GPROdd2 + (sizeof(GPROdd2)/sizeof(unsigned))); + return ArrayRef<unsigned>(GPROdd2); else - return std::make_pair(GPROdd5, - GPROdd5 + (sizeof(GPROdd5)/sizeof(unsigned))); + return ArrayRef<unsigned>(GPROdd5); } else { // FramePtr == ARM::R11 if (!STI.isR9Reserved()) - return std::make_pair(GPROdd3, - GPROdd3 + (sizeof(GPROdd3)/sizeof(unsigned))); + return ArrayRef<unsigned>(GPROdd3); else - return std::make_pair(GPROdd6, - GPROdd6 + (sizeof(GPROdd6)/sizeof(unsigned))); + return ArrayRef<unsigned>(GPROdd6); } } - return std::make_pair(RC->allocation_order_begin(MF), - RC->allocation_order_end(MF)); + return RC->getRawAllocationOrder(MF); } /// ResolveRegAllocHint - Resolves the specified register allocation hint @@ -554,6 +557,29 @@ ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg, } } +bool +ARMBaseRegisterInfo::avoidWriteAfterWrite(const TargetRegisterClass *RC) const { + // CortexA9 has a Write-after-write hazard for NEON registers. + if (!STI.isCortexA9()) + return false; + + switch (RC->getID()) { + case ARM::DPRRegClassID: + case ARM::DPR_8RegClassID: + case ARM::DPR_VFP2RegClassID: + case ARM::QPRRegClassID: + case ARM::QPR_8RegClassID: + case ARM::QPR_VFP2RegClassID: + case ARM::SPRRegClassID: + case ARM::SPR_8RegClassID: + // Avoid reusing S, D, and Q registers. + // Don't increase register pressure for QQ and QQQQ. 
+ return true; + default: + return false; + } +} + bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -642,6 +668,10 @@ int ARMBaseRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const { return ARMGenRegisterInfo::getDwarfRegNumFull(RegNum, 0); } +int ARMBaseRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const { + return ARMGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0); +} + unsigned ARMBaseRegisterInfo::getRegisterPairEven(unsigned Reg, const MachineFunction &MF) const { switch (Reg) { @@ -1069,8 +1099,11 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB, if (Ins != MBB->end()) DL = Ins->getDebugLoc(); - MachineInstrBuilder MIB = - BuildMI(*MBB, Ins, DL, TII.get(ADDriOpc), BaseReg) + const TargetInstrDesc &TID = TII.get(ADDriOpc); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + MRI.constrainRegClass(BaseReg, TID.OpInfo[0].getRegClass(this)); + + MachineInstrBuilder MIB = BuildMI(*MBB, Ins, DL, TID, BaseReg) .addFrameIndex(FrameIdx).addImm(Offset); if (!AFI->isThumb1OnlyFunction()) diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h index 0507396..70b6f01 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -128,13 +128,15 @@ public: const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const; + const TargetRegisterClass* + getLargestLegalSuperClass(const TargetRegisterClass *RC) const; + unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const; - std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator> - getAllocationOrder(const TargetRegisterClass *RC, - unsigned HintType, unsigned HintReg, - const MachineFunction &MF) const; + ArrayRef<unsigned> getRawAllocationOrder(const TargetRegisterClass *RC, + unsigned HintType, unsigned HintReg, + const MachineFunction &MF) const; unsigned ResolveRegAllocHint(unsigned Type, unsigned Reg, const MachineFunction &MF) const; @@ -142,6 +144,8 @@ public: void UpdateRegAllocHint(unsigned Reg, unsigned NewReg, MachineFunction &MF) const; + virtual bool avoidWriteAfterWrite(const TargetRegisterClass *RC) const; + bool hasBasePointer(const MachineFunction &MF) const; bool canRealignStack(const MachineFunction &MF) const; @@ -167,6 +171,7 @@ public: unsigned getEHHandlerRegister() const; int getDwarfRegNum(unsigned RegNum, bool isEH) const; + int getLLVMRegNum(unsigned RegNum, bool isEH) const; bool isLowRegister(unsigned Reg) const; diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td index 1e6b95e..d2981c0 100644 --- a/lib/Target/ARM/ARMCallingConv.td +++ b/lib/Target/ARM/ARMCallingConv.td @@ -23,7 +23,7 @@ class CCIfAlign<string Align, CCAction A>: def CC_ARM_APCS : CallingConv<[ // Handles byval parameters. 
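With getLLVMRegNum added above as the inverse of getDwarfRegNum, a hypothetical sanity check of the mapping could look like this (sketch only; the helper is not part of the patch):

  #include "ARMBaseRegisterInfo.h"
  #include <cassert>

  static void checkDwarfMapping(const llvm::ARMBaseRegisterInfo &TRI,
                                unsigned Reg) {
    int DwarfReg = TRI.getDwarfRegNum(Reg, /*isEH=*/false);
    if (DwarfReg != -1) // registers with a DWARF number should round-trip
      assert(TRI.getLLVMRegNum(DwarfReg, /*isEH=*/false) == (int)Reg &&
             "LLVM <-> DWARF register mapping should round-trip");
  }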
- CCIfByVal<CCPassByVal<8, 8>>, + CCIfByVal<CCPassByVal<4, 4>>, CCIfType<[i8, i16], CCPromoteToType<i32>>, diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp index d381ce7..97bac88 100644 --- a/lib/Target/ARM/ARMCodeEmitter.cpp +++ b/lib/Target/ARM/ARMCodeEmitter.cpp @@ -231,6 +231,9 @@ namespace { const { return 0; } unsigned getAddrMode6AddressOpValue(const MachineInstr &MI, unsigned Op) const { return 0; } + unsigned getAddrMode6OneLane32AddressOpValue(const MachineInstr &MI, + unsigned Op) + const { return 0; } unsigned getAddrMode6DupAddressOpValue(const MachineInstr &MI, unsigned Op) const { return 0; } unsigned getAddrMode6OffsetOpValue(const MachineInstr &MI, unsigned Op) @@ -239,6 +242,8 @@ namespace { unsigned Op) const { return 0; } unsigned getMsbOpValue(const MachineInstr &MI, unsigned Op) const { return 0; } + unsigned getSsatBitPosValue(const MachineInstr &MI, + unsigned Op) const { return 0; } uint32_t getLdStmModeOpValue(const MachineInstr &MI, unsigned OpIdx) const {return 0; } uint32_t getLdStSORegOpValue(const MachineInstr &MI, unsigned OpIdx) @@ -1459,6 +1464,7 @@ void ARMCodeEmitter::emitMiscArithInstruction(const MachineInstr &MI) { // Set the conditional execution predicate Binary |= II->getPredicate(&MI) << ARMII::CondShift; + // PKH instructions are finished at this point if (TID.Opcode == ARM::PKHBT || TID.Opcode == ARM::PKHTB) { emitWordLE(Binary); return; diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index a14c952..b6b3c75 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -455,6 +455,10 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) { // Add an implicit def for the super-register. MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead)); TransferImpOps(MI, MIB, MIB); + + // Transfer memoperands. + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MI.eraseFromParent(); } @@ -496,10 +500,13 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) { MIB.addOperand(MI.getOperand(OpIdx++)); MIB.addOperand(MI.getOperand(OpIdx++)); - if (SrcIsKill) - // Add an implicit kill for the super-reg. - (*MIB).addRegisterKilled(SrcReg, TRI, true); + if (SrcIsKill) // Add an implicit kill for the super-reg. + MIB->addRegisterKilled(SrcReg, TRI, true); TransferImpOps(MI, MIB, MIB); + + // Transfer memoperands. + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MI.eraseFromParent(); } @@ -622,9 +629,8 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI, MIB.addOperand(MI.getOperand(OpIdx++)); MIB.addOperand(MI.getOperand(OpIdx++)); - if (SrcIsKill) - // Add an implicit kill for the super-reg. - (*MIB).addRegisterKilled(SrcReg, TRI, true); + if (SrcIsKill) // Add an implicit kill for the super-reg. 
+ MIB->addRegisterKilled(SrcReg, TRI, true); TransferImpOps(MI, MIB, MIB); MI.eraseFromParent(); } @@ -655,8 +661,8 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal); LO16 = LO16.addImm(SOImmValV1); HI16 = HI16.addImm(SOImmValV2); - (*LO16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - (*HI16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); LO16.addImm(Pred).addReg(PredReg).addReg(0); HI16.addImm(Pred).addReg(PredReg).addReg(0); TransferImpOps(MI, LO16, HI16); @@ -692,8 +698,8 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16); } - (*LO16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - (*HI16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); LO16.addImm(Pred).addReg(PredReg); HI16.addImm(Pred).addReg(PredReg); @@ -856,7 +862,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, TII->get(ARM::BL)) .addExternalSymbol("__aeabi_read_tp", 0); - (*MIB).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); TransferImpOps(MI, MIB, MIB); MI.eraseFromParent(); return true; @@ -871,7 +877,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewLdOpc), DstReg) .addOperand(MI.getOperand(1))); - (*MIB1).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MIB1->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tPICADD)) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) @@ -927,7 +933,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, if (isARM) { AddDefaultPred(MIB3); if (Opcode == ARM::MOV_ga_pcrel_ldr) - (*MIB2).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MIB2->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); } TransferImpOps(MI, MIB1, MIB3); MI.eraseFromParent(); @@ -1019,9 +1025,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, unsigned D1 = TRI->getSubReg(SrcReg, ARM::dsub_1); MIB.addReg(D0).addReg(D1); - if (SrcIsKill) - // Add an implicit kill for the Q register. - (*MIB).addRegisterKilled(SrcReg, TRI, true); + if (SrcIsKill) // Add an implicit kill for the Q register. + MIB->addRegisterKilled(SrcReg, TRI, true); TransferImpOps(MI, MIB, MIB); MI.eraseFromParent(); diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 1e94ab6..5cf73c4 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "ARM.h" +#include "ARMAddressingModes.h" #include "ARMBaseInstrInfo.h" #include "ARMCallingConv.h" #include "ARMRegisterInfo.h" @@ -26,6 +27,7 @@ #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/Module.h" +#include "llvm/Operator.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" @@ -69,12 +71,10 @@ namespace { } Base; int Offset; - unsigned Scale; - unsigned PlusReg; // Innocuous defaults for our address. 
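A pattern repeated throughout this file's expansion fixes: when a pseudo-instruction is replaced, its memory operands must be copied onto the replacement, otherwise alias analysis and the post-RA scheduler lose track of what the instruction touches. As a hedged sketch (helper name illustrative):

  #include "llvm/CodeGen/MachineInstr.h"
  #include "llvm/CodeGen/MachineInstrBuilder.h"

  // Transfer the pseudo's memoperands to its replacement, then delete it.
  static void finishExpansion(llvm::MachineInstr &MI,
                              llvm::MachineInstrBuilder &MIB) {
    MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
    MI.eraseFromParent();
  }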
Address() - : BaseType(RegBase), Offset(0), Scale(0), PlusReg(0) { + : BaseType(RegBase), Offset(0) { Base.Reg = 0; } } Address; @@ -136,6 +136,9 @@ class ARMFastISel : public FastISel { virtual unsigned FastEmitInst_i(unsigned MachineInstOpcode, const TargetRegisterClass *RC, uint64_t Imm); + virtual unsigned FastEmitInst_ii(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + uint64_t Imm1, uint64_t Imm2); virtual unsigned FastEmitInst_extractsubreg(MVT RetVT, unsigned Op0, bool Op0IsKill, @@ -164,6 +167,7 @@ class ARMFastISel : public FastISel { bool SelectCall(const Instruction *I); bool SelectSelect(const Instruction *I); bool SelectRet(const Instruction *I); + bool SelectIntCast(const Instruction *I); // Utility routines. private: @@ -203,7 +207,8 @@ class ARMFastISel : public FastISel { bool DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR); const MachineInstrBuilder &AddOptionalDefs(const MachineInstrBuilder &MIB); void AddLoadStoreOperands(EVT VT, Address &Addr, - const MachineInstrBuilder &MIB); + const MachineInstrBuilder &MIB, + unsigned Flags); }; } // end anonymous namespace @@ -230,16 +235,16 @@ bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) { bool ARMFastISel::isARMNEONPred(const MachineInstr *MI) { const TargetInstrDesc &TID = MI->getDesc(); - + // If we're a thumb2 or not NEON function we were handled via isPredicable. if ((TID.TSFlags & ARMII::DomainMask) != ARMII::DomainNEON || AFI->isThumb2Function()) return false; - + for (unsigned i = 0, e = TID.getNumOperands(); i != e; ++i) if (TID.OpInfo[i].isPredicate()) return true; - + return false; } @@ -257,7 +262,7 @@ ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) { // we're not predicable but add it anyways. if (TII.isPredicable(MI) || isARMNEONPred(MI)) AddDefaultPred(MIB); - + // Do we optionally set a predicate? Preds is size > 0 iff the predicate // defines CPSR. All other OptionalDefines in ARM are the CCR register. bool CPSR = false; @@ -433,6 +438,26 @@ unsigned ARMFastISel::FastEmitInst_i(unsigned MachineInstOpcode, return ResultReg; } +unsigned ARMFastISel::FastEmitInst_ii(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + uint64_t Imm1, uint64_t Imm2) { + unsigned ResultReg = createResultReg(RC); + const TargetInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addImm(Imm1).addImm(Imm2)); + else { + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addImm(Imm1).addImm(Imm2)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::COPY), + ResultReg) + .addReg(II.ImplicitDefs[0])); + } + return ResultReg; +} + unsigned ARMFastISel::FastEmitInst_extractsubreg(MVT RetVT, unsigned Op0, bool Op0IsKill, uint32_t Idx) { @@ -552,9 +577,6 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) { Reloc::Model RelocM = TM.getRelocationModel(); - // TODO: No external globals for now. - if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) return 0; - // TODO: Need more magic for ARM PIC. 
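The FastEmitInst_ii overload added here follows the skeleton shared by the FastEmitInst_* helpers: if the instruction's result is an explicit def, emit it directly into a fresh virtual register; otherwise emit it bare and COPY the value out of its first implicit def. Distilled (same names as the code above):

unsigned ResultReg = createResultReg(RC);
const TargetInstrDesc &II = TII.get(MachineInstOpcode);
if (II.getNumDefs() >= 1) {
  // Explicit def: the result register is operand 0.
  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
                    .addImm(Imm1).addImm(Imm2));
} else {
  // Result lives in an implicit def (e.g. a fixed physical register);
  // copy it into the virtual register fast-isel hands back.
  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
                    .addImm(Imm1).addImm(Imm2));
  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                          TII.get(TargetOpcode::COPY), ResultReg)
                    .addReg(II.ImplicitDefs[0]));
}
return ResultReg;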
if (!isThumb && (RelocM == Reloc::PIC_)) return 0; @@ -589,6 +611,23 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) { .addImm(0); } AddOptionalDefs(MIB); + + if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) { + unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT)); + if (isThumb) + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::t2LDRi12), + NewDestReg) + .addReg(DestReg) + .addImm(0); + else + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::LDRi12), + NewDestReg) + .addReg(DestReg) + .addImm(0); + DestReg = NewDestReg; + AddOptionalDefs(MIB); + } + return DestReg; } @@ -727,7 +766,7 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) { FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()] == FuncInfo.MBB) && isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) { - // An add (in the same block) with a constant operand. Fold the + // An add (in the same block) with a constant operand. Fold the // constant. ConstantInt *CI = cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); @@ -735,7 +774,7 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) { // Iterate on the other operand. Op = cast<AddOperator>(Op)->getOperand(0); continue; - } + } // Unsupported goto unsupported_gep; } @@ -821,37 +860,21 @@ void ARMFastISel::ARMSimplifyAddress(Address &Addr, EVT VT) { // Since the offset is too large for the load/store instruction // get the reg+offset into a register. if (needsLowering) { - ARMCC::CondCodes Pred = ARMCC::AL; - unsigned PredReg = 0; - - TargetRegisterClass *RC = isThumb ? ARM::tGPRRegisterClass : - ARM::GPRRegisterClass; - unsigned BaseReg = createResultReg(RC); - - if (!isThumb) - emitARMRegPlusImmediate(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - BaseReg, Addr.Base.Reg, Addr.Offset, - Pred, PredReg, - static_cast<const ARMBaseInstrInfo&>(TII)); - else { - assert(AFI->isThumb2Function()); - emitT2RegPlusImmediate(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - BaseReg, Addr.Base.Reg, Addr.Offset, Pred, PredReg, - static_cast<const ARMBaseInstrInfo&>(TII)); - } + Addr.Base.Reg = FastEmit_ri_(MVT::i32, ISD::ADD, Addr.Base.Reg, + /*Op0IsKill*/false, Addr.Offset, MVT::i32); Addr.Offset = 0; - Addr.Base.Reg = BaseReg; } } void ARMFastISel::AddLoadStoreOperands(EVT VT, Address &Addr, - const MachineInstrBuilder &MIB) { + const MachineInstrBuilder &MIB, + unsigned Flags) { // addrmode5 output depends on the selection dag addressing dividing the // offset by 4 that it then later multiplies. Do this here as well. if (VT.getSimpleVT().SimpleTy == MVT::f32 || VT.getSimpleVT().SimpleTy == MVT::f64) Addr.Offset /= 4; - + // Frame base works a bit differently. Handle it separately. if (Addr.BaseType == Address::FrameIndexBase) { int FI = Addr.Base.FI; @@ -859,7 +882,7 @@ void ARMFastISel::AddLoadStoreOperands(EVT VT, Address &Addr, MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(FI, Offset), - MachineMemOperand::MOLoad, + Flags, MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); // Now add the rest of the operands. @@ -873,7 +896,7 @@ void ARMFastISel::AddLoadStoreOperands(EVT VT, Address &Addr, } else { // Now add the rest of the operands. MIB.addReg(Addr.Base.Reg); - + // ARM halfword load/stores need an additional operand. 
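The indirect-symbol support added to ARMMaterializeGV above replaces the old early bail-out: once DestReg holds the address of the global's non-lazy-pointer slot, one extra load yields the global's real address. The two branches, condensed into a sketch (the emitted instruction amounts to roughly ldr rNew, [rDest, #0]):

unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
              TII.get(isThumb ? ARM::t2LDRi12 : ARM::LDRi12), NewDestReg)
          .addReg(DestReg)    // address of the indirection slot
          .addImm(0);
AddOptionalDefs(MIB);
DestReg = NewDestReg;         // callers receive the loaded address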
if (!isThumb && VT.getSimpleVT().SimpleTy == MVT::i16) MIB.addReg(0); @@ -918,7 +941,7 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr) { ResultReg = createResultReg(RC); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg); - AddLoadStoreOperands(VT, Addr, MIB); + AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOLoad); return true; } @@ -977,7 +1000,7 @@ bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr) { MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(StrOpc)) .addReg(SrcReg, getKillRegState(true)); - AddLoadStoreOperands(VT, Addr, MIB); + AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOStore); return true; } @@ -1061,18 +1084,16 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { // behavior. // TODO: Factor this out. if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { - if (CI->hasOneUse() && (CI->getParent() == I->getParent())) { - MVT VT; - const Type *Ty = CI->getOperand(0)->getType(); - if (!isTypeLegal(Ty, VT)) - return false; - + MVT SourceVT; + const Type *Ty = CI->getOperand(0)->getType(); + if (CI->hasOneUse() && (CI->getParent() == I->getParent()) + && isTypeLegal(Ty, SourceVT)) { bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy()); if (isFloat && !Subtarget->hasVFP2()) return false; unsigned CmpOpc; - switch (VT.SimpleTy) { + switch (SourceVT.SimpleTy) { default: return false; // TODO: Verify compares. case MVT::f32: @@ -1087,7 +1108,14 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { } // Get the compare predicate. - ARMCC::CondCodes ARMPred = getComparePred(CI->getPredicate()); + // Try to take advantage of fallthrough opportunities. + CmpInst::Predicate Predicate = CI->getPredicate(); + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + Predicate = CmpInst::getInversePredicate(Predicate); + } + + ARMCC::CondCodes ARMPred = getComparePred(Predicate); // We may not handle every CC for now. if (ARMPred == ARMCC::AL) return false; @@ -1115,19 +1143,55 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { FuncInfo.MBB->addSuccessor(TBB); return true; } + } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { + MVT SourceVT; + if (TI->hasOneUse() && TI->getParent() == I->getParent() && + (isLoadTypeLegal(TI->getOperand(0)->getType(), SourceVT))) { + unsigned TstOpc = isThumb ? ARM::t2TSTri : ARM::TSTri; + unsigned OpReg = getRegForValue(TI->getOperand(0)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TstOpc)) + .addReg(OpReg).addImm(1)); + + unsigned CCMode = ARMCC::NE; + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + CCMode = ARMCC::EQ; + } + + unsigned BrOpc = isThumb ? ARM::t2Bcc : ARM::Bcc; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc)) + .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR); + + FastEmitBranch(FBB, DL); + FuncInfo.MBB->addSuccessor(TBB); + return true; + } } unsigned CmpReg = getRegForValue(BI->getCondition()); if (CmpReg == 0) return false; - // Re-set the flags just in case. - unsigned CmpOpc = isThumb ? ARM::t2CMPri : ARM::CMPri; - AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc)) - .addReg(CmpReg).addImm(0)); + // We've been divorced from our compare! Our block was split, and + // now our compare lives in a predecessor block. 
We mustn't + // re-compare here, as the children of the compare aren't guaranteed + // live across the block boundary (we *could* check for this). + // Regardless, the compare has been done in the predecessor block, + // and it left a value for us in a virtual register. Ergo, we test + // the one-bit value left in the virtual register. + unsigned TstOpc = isThumb ? ARM::t2TSTri : ARM::TSTri; + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TstOpc)) + .addReg(CmpReg).addImm(1)); + + unsigned CCMode = ARMCC::NE; + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + CCMode = ARMCC::EQ; + } unsigned BrOpc = isThumb ? ARM::t2Bcc : ARM::Bcc; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc)) - .addMBB(TBB).addImm(ARMCC::NE).addReg(ARM::CPSR); + .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR); FastEmitBranch(FBB, DL); FuncInfo.MBB->addSuccessor(TBB); return true; @@ -1249,6 +1313,10 @@ bool ARMFastISel::SelectSIToFP(const Instruction *I) { if (!isTypeLegal(Ty, DstVT)) return false; + // FIXME: Handle sign-extension where necessary. + if (!I->getOperand(0)->getType()->isIntegerTy(32)) + return false; + unsigned Op = getRegForValue(I->getOperand(0)); if (Op == 0) return false; @@ -1474,7 +1542,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, CallingConv::ID CC, unsigned &NumBytes) { SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CC, false, TM, ArgLocs, *Context); + CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context); CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC, false)); // Get a count of how many bytes are to be pushed on the stack. @@ -1587,7 +1655,7 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, // Now the return value. if (RetVT != MVT::isVoid) { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CC, false, TM, RVLocs, *Context); + CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context); CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true)); // Copy all of the result registers out of their specified physreg. @@ -1643,7 +1711,7 @@ bool ARMFastISel::SelectRet(const Instruction *I) { // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ValLocs; - CCState CCInfo(CC, F.isVarArg(), TM, ValLocs, I->getContext()); + CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, I->getContext()); CCInfo.AnalyzeReturn(Outs, CCAssignFnForCall(CC, true /* is Ret */)); const Value *RV = Ret->getOperand(0); @@ -1717,9 +1785,6 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { else if (!isTypeLegal(RetTy, RetVT)) return false; - // For now we're using BLX etc on the assumption that we have v5t ops. - if (!Subtarget->hasV5TOps()) return false; - // TODO: For now if we have long calls specified we don't handle the call. if (EnableARMLongCalls) return false; @@ -1757,7 +1822,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes)) return false; - // Issue the call, BLXr9 for darwin, BLX otherwise. This uses V5 ops. + // Issue the call, BLr9 for darwin, BL otherwise. // TODO: Turn this into the table of arm call ops. MachineInstrBuilder MIB; unsigned CallOpc = ARMSelectCallOp(NULL); @@ -1793,9 +1858,9 @@ bool ARMFastISel::SelectCall(const Instruction *I) { // Can't handle inline asm or worry about intrinsics yet.
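The layout-successor check introduced above appears three times in SelectBranch; the distilled pattern (same names as the code) is:

// If the 'true' destination already follows this block in layout,
// branch on the inverted condition to the 'false' block and let the
// 'true' path fall through instead.
unsigned CCMode = ARMCC::NE;
if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
  std::swap(TBB, FBB);
  CCMode = ARMCC::EQ;                 // inverted condition
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc))
    .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR);
FastEmitBranch(FBB, DL);              // often becomes a pure fallthrough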
if (isa<InlineAsm>(Callee) || isa<IntrinsicInst>(CI)) return false; - // Only handle global variable Callees that are direct calls. + // Only handle global variable Callees. const GlobalValue *GV = dyn_cast<GlobalValue>(Callee); - if (!GV || Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel())) + if (!GV) return false; // Check the calling convention. @@ -1818,13 +1883,9 @@ bool ARMFastISel::SelectCall(const Instruction *I) { else if (!isTypeLegal(RetTy, RetVT)) return false; - // For now we're using BLX etc on the assumption that we have v5t ops. - // TODO: Maybe? - if (!Subtarget->hasV5TOps()) return false; - // TODO: For now if we have long calls specified we don't handle the call. if (EnableARMLongCalls) return false; - + // Set up the argument vectors. SmallVector<Value*, 8> Args; SmallVector<unsigned, 8> ArgRegs; @@ -1873,7 +1934,7 @@ bool ARMFastISel::SelectCall(const Instruction *I) { if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes)) return false; - // Issue the call, BLXr9 for darwin, BLX otherwise. This uses V5 ops. + // Issue the call, BLr9 for darwin, BL otherwise. // TODO: Turn this into the table of arm call ops. MachineInstrBuilder MIB; unsigned CallOpc = ARMSelectCallOp(GV); @@ -1888,7 +1949,7 @@ bool ARMFastISel::SelectCall(const Instruction *I) { MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc)) .addGlobalAddress(GV, 0, 0)); - + // Add implicit physical register uses to the call. for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) MIB.addReg(RegArgs[i]); @@ -1904,6 +1965,79 @@ bool ARMFastISel::SelectCall(const Instruction *I) { } +bool ARMFastISel::SelectIntCast(const Instruction *I) { + // On ARM, in general, integer casts don't involve legal types; this code + // handles promotable integers. The high bits for a type smaller than + // the register size are assumed to be undefined. + const Type *DestTy = I->getType(); + Value *Op = I->getOperand(0); + const Type *SrcTy = Op->getType(); + + EVT SrcVT, DestVT; + SrcVT = TLI.getValueType(SrcTy, true); + DestVT = TLI.getValueType(DestTy, true); + + if (isa<TruncInst>(I)) { + if (SrcVT != MVT::i32 && SrcVT != MVT::i16 && SrcVT != MVT::i8) + return false; + if (DestVT != MVT::i16 && DestVT != MVT::i8 && DestVT != MVT::i1) + return false; + + unsigned SrcReg = getRegForValue(Op); + if (!SrcReg) return false; + + // Because the high bits are undefined, a truncate doesn't generate + // any code. + UpdateValueMap(I, SrcReg); + return true; + } + if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8) + return false; + + unsigned Opc; + bool isZext = isa<ZExtInst>(I); + bool isBoolZext = false; + if (!SrcVT.isSimple()) + return false; + switch (SrcVT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i16: + if (isZext) + Opc = isThumb ? ARM::t2UXTHr : ARM::UXTHr; + else + Opc = isThumb ? ARM::t2SXTHr : ARM::SXTHr; + break; + case MVT::i8: + if (isZext) + Opc = isThumb ? ARM::t2UXTBr : ARM::UXTBr; + else + Opc = isThumb ? ARM::t2SXTBr : ARM::SXTBr; + break; + case MVT::i1: + if (isZext) { + Opc = isThumb ? ARM::t2ANDri : ARM::ANDri; + isBoolZext = true; + break; + } + return false; + } + + // FIXME: We could save an instruction in many cases by special-casing + // load instructions. 
+ unsigned SrcReg = getRegForValue(Op); + if (!SrcReg) return false; + + unsigned DestReg = createResultReg(TLI.getRegClassFor(MVT::i32)); + MachineInstrBuilder MIB; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg) + .addReg(SrcReg); + if (isBoolZext) + MIB.addImm(1); + AddOptionalDefs(MIB); + UpdateValueMap(I, DestReg); + return true; +} + // TODO: SoftFP support. bool ARMFastISel::TargetSelectInstruction(const Instruction *I) { @@ -1941,6 +2075,10 @@ bool ARMFastISel::TargetSelectInstruction(const Instruction *I) { return SelectSelect(I); case Instruction::Ret: return SelectRet(I); + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + return SelectIntCast(I); default: break; } return false; diff --git a/lib/Target/ARM/ARMFixupKinds.h b/lib/Target/ARM/ARMFixupKinds.h index 3d175e3..350c92d 100644 --- a/lib/Target/ARM/ARMFixupKinds.h +++ b/lib/Target/ARM/ARMFixupKinds.h @@ -56,7 +56,7 @@ enum Fixups { // fixup_arm_thumb_br - 12-bit fixup for Thumb B instructions. fixup_arm_thumb_br, - // fixup_arm_thumb_blx - Fixup for Thumb BL instructions. + // fixup_arm_thumb_bl - Fixup for Thumb BL instructions. fixup_arm_thumb_bl, // fixup_arm_thumb_blx - Fixup for Thumb BLX instructions. diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index c99259d..4ef2666 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -427,6 +427,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, // Delete the pseudo instruction TCRETURN. MBB.erase(MBBI); + MBBI = NewMI; } if (VARegSaveSize) @@ -445,8 +446,7 @@ ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, int ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, - int FI, - unsigned &FrameReg, + int FI, unsigned &FrameReg, int SPAdj) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const ARMBaseRegisterInfo *RegInfo = @@ -490,19 +490,23 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, return FPOffset; } else if (MFI->hasVarSizedObjects()) { assert(RegInfo->hasBasePointer(MF) && "missing base pointer!"); - // Try to use the frame pointer if we can, else use the base pointer - // since it's available. This is handy for the emergency spill slot, in - // particular. if (AFI->isThumb2Function()) { + // Try to use the frame pointer if we can, else use the base pointer + // since it's available. This is handy for the emergency spill slot, in + // particular. if (FPOffset >= -255 && FPOffset < 0) { FrameReg = RegInfo->getFrameRegister(MF); return FPOffset; } - } else - FrameReg = RegInfo->getBaseRegister(); + } } else if (AFI->isThumb2Function()) { + // Use add <rd>, sp, #<imm8> + // ldr <rd>, [sp, #<imm8>] + // if at all possible to save space. + if (Offset >= 0 && (Offset & 3) == 0 && Offset <= 1020) + return Offset; // In Thumb2 mode, the negative offset is very limited. Try to avoid - // out of range references. + // out of range references. ldr <rt>,[<rn>, #-<imm8>] if (FPOffset >= -255 && FPOffset < 0) { FrameReg = RegInfo->getFrameRegister(MF); return FPOffset; @@ -838,9 +842,14 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, if (AFI->getVarArgsRegSaveSize() > 0) MF.getRegInfo().setPhysRegUsed(ARM::LR); - // Spill R4 if Thumb1 epilogue has to restore SP from FP since + // Spill R4 if Thumb1 epilogue has to restore SP from FP. 
We don't know + // for sure what the stack size will be, but for this, an estimate is good + // enough. If anything changes it, it'll be a spill, which implies + // we've used all the registers and so R4 is already used, so not marking + // it here will be OK. // FIXME: It will be better just to find spare register here. - if (MFI->hasVarSizedObjects()) + unsigned StackSize = estimateStackSize(MF); + if (MFI->hasVarSizedObjects() || StackSize > 508) MF.getRegInfo().setPhysRegUsed(ARM::R4); } diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h index a7b7f15..61bb8af 100644 --- a/lib/Target/ARM/ARMFrameLowering.h +++ b/lib/Target/ARM/ARMFrameLowering.h @@ -51,7 +51,8 @@ public: bool canSimplifyCallFramePseudos(const MachineFunction &MF) const; int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const; - int ResolveFrameIndexReference(const MachineFunction &MF, int FI, + int ResolveFrameIndexReference(const MachineFunction &MF, + int FI, unsigned &FrameReg, int SPAdj) const; int getFrameIndexOffset(const MachineFunction &MF, int FI) const; diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp index e97ce50..517bba8 100644 --- a/lib/Target/ARM/ARMHazardRecognizer.cpp +++ b/lib/Target/ARM/ARMHazardRecognizer.cpp @@ -49,6 +49,8 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { const TargetInstrDesc &LastTID = LastMI->getDesc(); // Skip over one non-VFP / NEON instruction. if (!LastTID.isBarrier() && + // On A9, AGU and NEON/FPU are muxed. + !(STI.isCortexA9() && (LastTID.mayLoad() || LastTID.mayStore())) && (LastTID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) { MachineBasicBlock::iterator I = LastMI; if (I != LastMI->getParent()->begin()) { diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 1201d91..6f57a04 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -45,7 +45,7 @@ DisableShifterOp("disable-shifter-op", cl::Hidden, static cl::opt<bool> CheckVMLxHazard("check-vmlx-hazard", cl::Hidden, cl::desc("Check fp vmla / vmls hazard at isel time"), - cl::init(false)); + cl::init(true)); //===--------------------------------------------------------------------===// /// ARMDAGToDAGISel - ARM specific code to select ARM machine @@ -179,16 +179,6 @@ public: return ARM_AM::getT2SOImmVal(~Imm) != -1; } - inline bool Pred_so_imm(SDNode *inN) const { - ConstantSDNode *N = cast<ConstantSDNode>(inN); - return is_so_imm(N->getZExtValue()); - } - - inline bool Pred_t2_so_imm(SDNode *inN) const { - ConstantSDNode *N = cast<ConstantSDNode>(inN); - return is_t2_so_imm(N->getZExtValue()); - } - // Include the pieces autogenerated from the target description.
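The 508-byte threshold above matches the largest immediate a Thumb1 SP adjustment can encode: add sp, #imm takes a 7-bit word count, so 127 * 4 = 508 bytes is the most a single instruction can pop. A sketch of the reasoning (names mirror the code; the encoding limit is the assumption being leaned on):

unsigned MaxThumb1SPAdjust = 127 * 4;          // imm7 words == 508 bytes
unsigned StackSize = estimateStackSize(MF);    // an estimate is good enough
if (MFI->hasVarSizedObjects() || StackSize > MaxThumb1SPAdjust)
  MF.getRegInfo().setPhysRegUsed(ARM::R4);     // keep a scratch register live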
#include "ARMGenDAGISel.inc" @@ -1364,30 +1354,34 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) { /// SDNode *ARMDAGToDAGISel::PairSRegs(EVT VT, SDValue V0, SDValue V1) { DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue RegClass = + CurDAG->getTargetConstant(ARM::DPR_VFP2RegClassID, MVT::i32); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, MVT::i32); - const SDValue Ops[] = { V0, SubReg0, V1, SubReg1 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 5); } /// PairDRegs - Form a quad register from a pair of D registers. /// SDNode *ARMDAGToDAGISel::PairDRegs(EVT VT, SDValue V0, SDValue V1) { DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue RegClass = CurDAG->getTargetConstant(ARM::QPRRegClassID, MVT::i32); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32); - const SDValue Ops[] = { V0, SubReg0, V1, SubReg1 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 5); } /// PairQRegs - Form 4 consecutive D registers from a pair of Q registers. /// SDNode *ARMDAGToDAGISel::PairQRegs(EVT VT, SDValue V0, SDValue V1) { DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, MVT::i32); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, MVT::i32); - const SDValue Ops[] = { V0, SubReg0, V1, SubReg1 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 5); } /// QuadSRegs - Form 4 consecutive S registers. @@ -1395,12 +1389,15 @@ SDNode *ARMDAGToDAGISel::PairQRegs(EVT VT, SDValue V0, SDValue V1) { SDNode *ARMDAGToDAGISel::QuadSRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3) { DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue RegClass = + CurDAG->getTargetConstant(ARM::QPR_VFP2RegClassID, MVT::i32); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, MVT::i32); SDValue SubReg2 = CurDAG->getTargetConstant(ARM::ssub_2, MVT::i32); SDValue SubReg3 = CurDAG->getTargetConstant(ARM::ssub_3, MVT::i32); - const SDValue Ops[] = { V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, SubReg3 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 8); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, + V2, SubReg2, V3, SubReg3 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 9); } /// QuadDRegs - Form 4 consecutive D registers. 
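All six Pair*/Quad* helpers in this file now build REG_SEQUENCE the same way: operand 0 is a target constant naming the destination register class, followed by (value, subreg-index) pairs. The shared shape, distilled (illustrative values forming a Q register from two D registers):

SDValue RC = CurDAG->getTargetConstant(ARM::QPRRegClassID, MVT::i32);
SDValue S0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32);
SDValue S1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32);
const SDValue Ops[] = { RC, V0, S0, V1, S1 };   // class first, then pairs
SDNode *Pair =
    CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 5);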
@@ -1408,12 +1405,14 @@ SDNode *ARMDAGToDAGISel::QuadSRegs(EVT VT, SDValue V0, SDValue V1, SDNode *ARMDAGToDAGISel::QuadDRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3) { DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, MVT::i32); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32); SDValue SubReg2 = CurDAG->getTargetConstant(ARM::dsub_2, MVT::i32); SDValue SubReg3 = CurDAG->getTargetConstant(ARM::dsub_3, MVT::i32); - const SDValue Ops[] = { V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, SubReg3 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 8); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, + V2, SubReg2, V3, SubReg3 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 9); } /// QuadQRegs - Form 4 consecutive Q registers. @@ -1421,12 +1420,14 @@ SDNode *ARMDAGToDAGISel::QuadDRegs(EVT VT, SDValue V0, SDValue V1, SDNode *ARMDAGToDAGISel::QuadQRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3) { DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue RegClass = CurDAG->getTargetConstant(ARM::QQQQPRRegClassID, MVT::i32); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, MVT::i32); SDValue SubReg2 = CurDAG->getTargetConstant(ARM::qsub_2, MVT::i32); SDValue SubReg3 = CurDAG->getTargetConstant(ARM::qsub_3, MVT::i32); - const SDValue Ops[] = { V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, SubReg3 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 8); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, + V2, SubReg2, V3, SubReg3 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 9); } /// GetVLDSTAlign - Get the alignment (in bytes) for the alignment operand @@ -1553,6 +1554,11 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, Ops.data(), Ops.size()); } + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1); + if (NumVecs == 1) return VLd; @@ -1582,6 +1588,9 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) return NULL; + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + SDValue Chain = N->getOperand(0); EVT VT = N->getOperand(Vec0Idx).getValueType(); bool is64BitVector = VT.is64BitVector(); @@ -1654,7 +1663,13 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, Ops.push_back(Pred); Ops.push_back(Reg0); Ops.push_back(Chain); - return CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); + SDNode *VSt = + CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); + + // Transfer memoperands. 
+ cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1); + + return VSt; } // Otherwise, quad registers are stored with two separate instructions, @@ -1675,6 +1690,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, SDNode *VStA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, MemAddr.getValueType(), MVT::Other, OpsA, 7); + cast<MachineSDNode>(VStA)->setMemRefs(MemOp, MemOp + 1); Chain = SDValue(VStA, 1); // Store the odd D registers. @@ -1691,8 +1707,10 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, Ops.push_back(Pred); Ops.push_back(Reg0); Ops.push_back(Chain); - return CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, - Ops.data(), Ops.size()); + SDNode *VStB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, + Ops.data(), Ops.size()); + cast<MachineSDNode>(VStB)->setMemRefs(MemOp, MemOp + 1); + return VStB; } SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, @@ -1708,6 +1726,9 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) return NULL; + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + SDValue Chain = N->getOperand(0); unsigned Lane = cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue(); @@ -1794,6 +1815,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, QOpcodes[OpcodeIndex]); SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); + cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1); if (!IsLoad) return VLdLn; @@ -1820,6 +1842,9 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align)) return NULL; + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + SDValue Chain = N->getOperand(0); EVT VT = N->getValueType(0); @@ -1864,12 +1889,13 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs; std::vector<EVT> ResTys; - ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts)); + ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,ResTyElts)); if (isUpdating) ResTys.push_back(MVT::i32); ResTys.push_back(MVT::Other); SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); + cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1); SuperReg = SDValue(VLdDup, 0); // Extract the subregisters. @@ -2676,6 +2702,111 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { default: break; + case Intrinsic::arm_ldrexd: { + SDValue MemAddr = N->getOperand(2); + DebugLoc dl = N->getDebugLoc(); + SDValue Chain = N->getOperand(0); + + unsigned NewOpc = ARM::LDREXD; + if (Subtarget->isThumb() && Subtarget->hasThumb2()) + NewOpc = ARM::t2LDREXD; + + // arm_ldrexd returns a i64 value in {i32, i32} + std::vector<EVT> ResTys; + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::Other); + + // place arguments in the right order + SmallVector<SDValue, 7> Ops; + Ops.push_back(MemAddr); + Ops.push_back(getAL(CurDAG)); + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); + Ops.push_back(Chain); + SDNode *Ld = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops.data(), + Ops.size()); + // Transfer memoperands. 
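The same three-line idiom recurs across SelectVLD, SelectVST, SelectVLDSTLane, SelectVLDDup, and the ldrexd/strexd cases below. In isolation (N is the memory-intrinsic node being replaced; MN stands for whichever machine node was just created):

// Selected machine nodes do not inherit the intrinsic's memory
// operand, so it is copied over by hand to preserve alias information.
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
cast<MachineSDNode>(MN)->setMemRefs(MemOp, MemOp + 1);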
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1); + + // Until there's support for specifying explicit register constraints + // like the use of even/odd register pairs, hardcode ldrexd to always + // use the pair [R0, R1] to hold the load result. + Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ARM::R0, + SDValue(Ld, 0), SDValue(0,0)); + Chain = CurDAG->getCopyToReg(Chain, dl, ARM::R1, + SDValue(Ld, 1), Chain.getValue(1)); + + // Remap uses. + SDValue Glue = Chain.getValue(1); + if (!SDValue(N, 0).use_empty()) { + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + ARM::R0, MVT::i32, Glue); + Glue = Result.getValue(2); + ReplaceUses(SDValue(N, 0), Result); + } + if (!SDValue(N, 1).use_empty()) { + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + ARM::R1, MVT::i32, Glue); + Glue = Result.getValue(2); + ReplaceUses(SDValue(N, 1), Result); + } + + ReplaceUses(SDValue(N, 2), SDValue(Ld, 2)); + return NULL; + } + + case Intrinsic::arm_strexd: { + DebugLoc dl = N->getDebugLoc(); + SDValue Chain = N->getOperand(0); + SDValue Val0 = N->getOperand(2); + SDValue Val1 = N->getOperand(3); + SDValue MemAddr = N->getOperand(4); + + // Until there's support for specifying explicit register constraints + // like the use of even/odd register pairs, hardcode strexd to always + // use the pair [R2, R3] to hold the i64 (i32, i32) value to be stored. + Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ARM::R2, Val0, + SDValue(0, 0)); + Chain = CurDAG->getCopyToReg(Chain, dl, ARM::R3, Val1, Chain.getValue(1)); + + SDValue Glue = Chain.getValue(1); + Val0 = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + ARM::R2, MVT::i32, Glue); + Glue = Val0.getValue(1); + Val1 = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + ARM::R3, MVT::i32, Glue); + + // Store exclusive double returns an i32 value which is the return status + // of the issued store. + std::vector<EVT> ResTys; + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::Other); + + // place arguments in the right order + SmallVector<SDValue, 7> Ops; + Ops.push_back(Val0); + Ops.push_back(Val1); + Ops.push_back(MemAddr); + Ops.push_back(getAL(CurDAG)); + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); + Ops.push_back(Chain); + + unsigned NewOpc = ARM::STREXD; + if (Subtarget->isThumb() && Subtarget->hasThumb2()) + NewOpc = ARM::t2STREXD; + + SDNode *St = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops.data(), + Ops.size()); + // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1); + + return St; + } + case Intrinsic::arm_neon_vld1: { unsigned DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16, ARM::VLD1d32, ARM::VLD1d64 }; diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 7c456ed..7c44c10 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -72,10 +72,25 @@ ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), cl::init(true)); -static cl::opt<std::string> -TrapFuncName("arm-trap-func", cl::Hidden, - cl::desc("Emit a call to trap function rather than a trap instruction"), - cl::init("")); +namespace llvm { + class ARMCCState : public CCState { + public: + ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF, + const TargetMachine &TM, SmallVector<CCValAssign, 16> &locs, + LLVMContext &C, ParmContext PC) + : CCState(CC, isVarArg, MF, TM, locs, C) { + assert(((PC == Call) || (PC == Prologue)) && + "ARMCCState users must specify whether their context is call" + "or prologue generation."); + CallOrPrologue = PC; + } + }; +} + +// The APCS parameter registers. +static const unsigned GPRArgRegs[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3 +}; void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT, EVT PromotedBitwiseVT) { @@ -396,11 +411,12 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setLibcallCallingConv(RTLIB::UDIV_I8, CallingConv::ARM_AAPCS); setLibcallCallingConv(RTLIB::UDIV_I16, CallingConv::ARM_AAPCS); setLibcallCallingConv(RTLIB::UDIV_I32, CallingConv::ARM_AAPCS); - } - if (HasDivModLibcall) { - setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); - setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); + // Memory operations + // RTABI chapter 4.3.4 + setLibcallName(RTLIB::MEMCPY, "__aeabi_memcpy"); + setLibcallName(RTLIB::MEMMOVE, "__aeabi_memmove"); + setLibcallName(RTLIB::MEMSET, "__aeabi_memset"); } if (Subtarget->isThumb1Only()) @@ -516,18 +532,15 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) } // i64 operation support. + setOperationAction(ISD::MUL, MVT::i64, Expand); + setOperationAction(ISD::MULHU, MVT::i32, Expand); if (Subtarget->isThumb1Only()) { - setOperationAction(ISD::MUL, MVT::i64, Expand); - setOperationAction(ISD::MULHU, MVT::i32, Expand); - setOperationAction(ISD::MULHS, MVT::i32, Expand); setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); - } else { - setOperationAction(ISD::MUL, MVT::i64, Expand); - setOperationAction(ISD::MULHU, MVT::i32, Expand); - if (!Subtarget->hasV6Ops()) - setOperationAction(ISD::MULHS, MVT::i32, Expand); } + if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()) + setOperationAction(ISD::MULHS, MVT::i32, Expand); + setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); @@ -562,10 +575,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); setOperationAction(ISD::BlockAddress, MVT::i32, Custom); - if (TrapFuncName.size()) - setOperationAction(ISD::TRAP, MVT::Other, Custom); - else - setOperationAction(ISD::TRAP, MVT::Other, Legal); + setOperationAction(ISD::TRAP, MVT::Other, Legal); // Use the default implementation. 
setOperationAction(ISD::VASTART, MVT::Other, Custom); @@ -614,6 +624,18 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i8, Expand); setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i16, Expand); setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand); // Since the libcalls include locking, fold in the fences setShouldFoldAtomicFences(true); } @@ -649,6 +671,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); setOperationAction(ISD::EH_SJLJ_DISPATCHSETUP, MVT::Other, Custom); + setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); } setOperationAction(ISD::SETCC, MVT::i32, Expand); @@ -723,6 +746,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setMinStackArgumentAlignment(4); benefitFromCodePlacementOpt = true; + + setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2); } // FIXME: It might make sense to define the representative register class as the @@ -733,7 +758,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // pressure of the register class's representative and all of it's super // classes' representatives transitively. We have not implemented this because // of the difficulty prior to coalescing of modeling operand register classes -// due to the common occurence of cross class copies and subregister insertions +// due to the common occurrence of cross class copies and subregister insertions // and extractions. std::pair<const TargetRegisterClass*, uint8_t> ARMTargetLowering::findRepresentativeClass(EVT VT) const{ @@ -924,11 +949,6 @@ ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { return ARM::createFastISel(funcInfo); } -/// getFunctionAlignment - Return the Log2 alignment of this function. -unsigned ARMTargetLowering::getFunctionAlignment(const Function *F) const { - return getTargetMachine().getSubtarget<ARMSubtarget>().isThumb() ? 1 : 2; -} - /// getMaximalGlobalOffset - Returns the maximal possible offset which can /// be used for loads / stores from the global. unsigned ARMTargetLowering::getMaximalGlobalOffset() const { @@ -1066,8 +1086,8 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, // Assign locations to each value returned by this call. 
SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - RVLocs, *DAG.getContext()); + ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext(), Call); CCInfo.AnalyzeCallResult(Ins, CCAssignFnForNode(CallConv, /* Return*/ true, isVarArg)); @@ -1128,22 +1148,6 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, return Chain; } -/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified -/// by "Src" to address "Dst" of size "Size". Alignment information is -/// specified by the specific parameter attribute. The copy will be passed as -/// a byval function parameter. -/// Sometimes what we are copying is the end of a larger object, the part that -/// does not fit in registers. -static SDValue -CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, - ISD::ArgFlagsTy Flags, SelectionDAG &DAG, - DebugLoc dl) { - SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); - return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), - /*isVolatile=*/false, /*AlwaysInline=*/false, - MachinePointerInfo(0), MachinePointerInfo(0)); -} - /// LowerMemOpCallTo - Store the argument to the stack. SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, @@ -1154,9 +1158,6 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); - if (Flags.isByVal()) - return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); - return DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo::getStack(LocMemOffset), false, false, 0); @@ -1220,8 +1221,8 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, - *DAG.getContext()); + ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext(), Call); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv, /* Return*/ false, isVarArg)); @@ -1298,7 +1299,44 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, } } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); - } else if (!IsSibCall || isByVal) { + } else if (isByVal) { + assert(VA.isMemLoc()); + unsigned offset = 0; + + // True if this byval aggregate will be split between registers + // and memory. 
+ if (CCInfo.isFirstByValRegValid()) { + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + unsigned int i, j; + for (i = 0, j = CCInfo.getFirstByValReg(); j < ARM::R4; i++, j++) { + SDValue Const = DAG.getConstant(4*i, MVT::i32); + SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); + SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, + MachinePointerInfo(), + false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(j, Load)); + } + offset = ARM::R4 - CCInfo.getFirstByValReg(); + CCInfo.clearFirstByValReg(); + } + + unsigned LocMemOffset = VA.getLocMemOffset(); + SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset); + SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, + StkPtrOff); + SDValue SrcOffset = DAG.getIntPtrConstant(4*offset); + SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset); + SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, + MVT::i32); + MemOpChains.push_back(DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, + Flags.getByValAlign(), + /*isVolatile=*/false, + /*AlwaysInline=*/false, + MachinePointerInfo(0), + MachinePointerInfo(0))); + + } else if (!IsSibCall) { assert(VA.isMemLoc()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, @@ -1331,7 +1369,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // than necessary, because it means that each store effectively depends // on every argument instead of just those arguments it would clobber. - // Do not flag preceeding copytoreg stuff together with the following stuff. + // Do not flag preceding copytoreg stuff together with the following stuff. InFlag = SDValue(); for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, @@ -1492,14 +1530,32 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, } /// HandleByVal - Every parameter *after* a byval parameter is passed -/// on the stack. Confiscate all the parameter registers to insure +/// on the stack. Remember the next parameter register to allocate, +/// and then confiscate the rest of the parameter registers to ensure /// this. void -llvm::ARMTargetLowering::HandleByVal(CCState *State) const { - static const unsigned RegList1[] = { - ARM::R0, ARM::R1, ARM::R2, ARM::R3 - }; - do {} while (State->AllocateReg(RegList1, 4)); +llvm::ARMTargetLowering::HandleByVal(CCState *State, unsigned &size) const { + unsigned reg = State->AllocateReg(GPRArgRegs, 4); + assert((State->getCallOrPrologue() == Prologue || + State->getCallOrPrologue() == Call) && + "unhandled ParmContext"); + if ((!State->isFirstByValRegValid()) && + (ARM::R0 <= reg) && (reg <= ARM::R3)) { + State->setFirstByValReg(reg); + // At a call site, a byval parameter that is split between + // registers and memory needs its size truncated here. In a + // function prologue, such byval parameters are reassembled in + // memory, and are not truncated. + if (State->getCallOrPrologue() == Call) { + unsigned excess = 4 * (ARM::R4 - reg); + assert(size >= excess && "expected larger existing stack allocation"); + size -= excess; + } + } + // Confiscate any remaining parameter registers to preclude their + // assignment to subsequent parameters. + while (State->AllocateReg(GPRArgRegs, 4)) + ; } /// MatchingStackOffset - Return true if the given stack call argument is @@ -1596,13 +1652,13 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // results are returned in the same way as what the caller expects.
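The size truncation in HandleByVal is easiest to see with numbers. A worked sketch (hypothetical 20-byte byval whose first allocated register is R1; it leans on the same contiguous ARM::R0..ARM::R4 enum assumption as the code):

unsigned reg    = ARM::R1;               // first register the byval received
unsigned excess = 4 * (ARM::R4 - reg);   // R1..R3 carry 3 * 4 == 12 bytes
unsigned size   = 20;                    // full byval size at the call site
size -= excess;                          // 8 bytes remain for the stack memcpy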
if (!CCMatch) { SmallVector<CCValAssign, 16> RVLocs1; - CCState CCInfo1(CalleeCC, false, getTargetMachine(), - RVLocs1, *DAG.getContext()); + ARMCCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs1, *DAG.getContext(), Call); CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg)); SmallVector<CCValAssign, 16> RVLocs2; - CCState CCInfo2(CallerCC, false, getTargetMachine(), - RVLocs2, *DAG.getContext()); + ARMCCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs2, *DAG.getContext(), Call); CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg)); if (RVLocs1.size() != RVLocs2.size()) @@ -1628,8 +1684,8 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Check if stack adjustment is needed. For now, do not do this if any // argument is passed on the stack. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), - ArgLocs, *DAG.getContext()); + ARMCCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext(), Call); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC, false, isVarArg)); if (CCInfo.getNextStackOffset()) { @@ -1688,8 +1744,8 @@ ARMTargetLowering::LowerReturn(SDValue Chain, SmallVector<CCValAssign, 16> RVLocs; // CCState - Info about the registers and stack slots. - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs, - *DAG.getContext()); + ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext(), Call); // Analyze outgoing return values. CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true, @@ -2041,7 +2097,8 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - if (Subtarget->useMovt()) { + // FIXME: Enable this for static codegen when tool issues are fixed. + if (Subtarget->useMovt() && RelocM != Reloc::Static) { ++NumMovwMovt; // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into two nodes. @@ -2116,7 +2173,7 @@ ARMTargetLowering::LowerEH_SJLJ_DISPATCHSETUP(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); return DAG.getNode(ARMISD::EH_SJLJ_DISPATCHSETUP, dl, MVT::Other, - Op.getOperand(0)); + Op.getOperand(0), Op.getOperand(1)); } SDValue @@ -2224,12 +2281,13 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, // ARMv7 with MP extension has PLDW. return Op.getOperand(0); - if (Subtarget->isThumb()) + unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); + if (Subtarget->isThumb()) { // Invert the bits. isRead = ~isRead & 1; - unsigned isData = Subtarget->isThumb() ? 0 : 1; + isData = ~isData & 1; + } - // Currently there is no intrinsic that matches pli. 
return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0), Op.getOperand(1), DAG.getConstant(isRead, MVT::i32), DAG.getConstant(isData, MVT::i32)); @@ -2284,6 +2342,88 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); } +void +ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF, + unsigned &VARegSize, unsigned &VARegSaveSize) + const { + unsigned NumGPRs; + if (CCInfo.isFirstByValRegValid()) + NumGPRs = ARM::R4 - CCInfo.getFirstByValReg(); + else { + unsigned int firstUnalloced; + firstUnalloced = CCInfo.getFirstUnallocated(GPRArgRegs, + sizeof(GPRArgRegs) / + sizeof(GPRArgRegs[0])); + NumGPRs = (firstUnalloced <= 3) ? (4 - firstUnalloced) : 0; + } + + unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment(); + VARegSize = NumGPRs * 4; + VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1); +} + +// The remaining GPRs hold either the beginning of variable-argument +// data, or the beginning of an aggregate passed by value (usually +// byval). Either way, we allocate stack slots adjacent to the data +// provided by our caller, and store the unallocated registers there. +// If this is a variadic function, the va_list pointer will begin with +// these values; otherwise, this reassembles a (byval) structure that +// was split between registers and memory. +void +ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, + DebugLoc dl, SDValue &Chain, + unsigned ArgOffset) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned firstRegToSaveIndex; + if (CCInfo.isFirstByValRegValid()) + firstRegToSaveIndex = CCInfo.getFirstByValReg() - ARM::R0; + else { + firstRegToSaveIndex = CCInfo.getFirstUnallocated + (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0])); + } + + unsigned VARegSize, VARegSaveSize; + computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize); + if (VARegSaveSize) { + // If this function is vararg, store any remaining integer argument regs + // to their spots on the stack so that they may be loaded by dereferencing + // the result of va_next. + AFI->setVarArgsRegSaveSize(VARegSaveSize); + AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(VARegSaveSize, + ArgOffset + VARegSaveSize + - VARegSize, + false)); + SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), + getPointerTy()); + + SmallVector<SDValue, 4> MemOps; + for (; firstRegToSaveIndex < 4; ++firstRegToSaveIndex) { + TargetRegisterClass *RC; + if (AFI->isThumb1OnlyFunction()) + RC = ARM::tGPRRegisterClass; + else + RC = ARM::GPRRegisterClass; + + unsigned VReg = MF.addLiveIn(GPRArgRegs[firstRegToSaveIndex], RC); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo::getFixedStack(AFI->getVarArgsFrameIndex()), + false, false, 0); + MemOps.push_back(Store); + FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, + DAG.getConstant(4, getPointerTy())); + } + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &MemOps[0], MemOps.size()); + } else + // This will point to the next argument passed via stack.
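computeRegArea in numbers, for a prologue where R0 and R1 are already taken by fixed arguments (8-byte stack alignment assumed, the common AAPCS value):

unsigned firstUnalloced = 2;             // R0, R1 assigned; R2, R3 free
unsigned NumGPRs = 4 - firstUnalloced;   // 2 registers to spill
unsigned Align = 8;                      // from getStackAlignment()
unsigned VARegSize = NumGPRs * 4;                                 // 8 bytes
unsigned VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1);  // still 8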
+ AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, true)); +} + SDValue ARMTargetLowering::LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -2292,7 +2432,6 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { - MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -2300,8 +2439,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, - *DAG.getContext()); + ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext(), Prologue); CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv, /* Return*/ false, isVarArg)); @@ -2399,14 +2538,19 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, if (index != lastInsIndex) { ISD::ArgFlagsTy Flags = Ins[index].Flags; - // FIXME: For now, all byval parameter objects are marked mutable. This can be - // changed with more analysis. - // In case of tail call optimization mark all arguments mutable. Since they - // could be overwritten by lowering of arguments in case of a tail call. + // FIXME: For now, all byval parameter objects are marked mutable. + // This can be changed with more analysis. + // In case of tail call optimization mark all arguments mutable. + // Since they could be overwritten by lowering of arguments in case of + // a tail call. if (Flags.isByVal()) { - unsigned Bytes = Flags.getByValSize(); + unsigned VARegSize, VARegSaveSize; + computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize); + VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 0); + unsigned Bytes = Flags.getByValSize() - VARegSize; if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. - int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), false); + int FI = MFI->CreateFixedObject(Bytes, + VA.getLocMemOffset(), false); InVals.push_back(DAG.getFrameIndex(FI, getPointerTy())); } else { int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8, @@ -2424,55 +2568,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, } // varargs - if (isVarArg) { - static const unsigned GPRArgRegs[] = { - ARM::R0, ARM::R1, ARM::R2, ARM::R3 - }; - - unsigned NumGPRs = CCInfo.getFirstUnallocated - (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0])); - - unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment(); - unsigned VARegSize = (4 - NumGPRs) * 4; - unsigned VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1); - unsigned ArgOffset = CCInfo.getNextStackOffset(); - if (VARegSaveSize) { - // If this function is vararg, store any remaining integer argument regs - // to their spots on the stack so that they may be loaded by deferencing - // the result of va_next. 
- AFI->setVarArgsRegSaveSize(VARegSaveSize); - AFI->setVarArgsFrameIndex( - MFI->CreateFixedObject(VARegSaveSize, - ArgOffset + VARegSaveSize - VARegSize, - false)); - SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), - getPointerTy()); - - SmallVector<SDValue, 4> MemOps; - for (; NumGPRs < 4; ++NumGPRs) { - TargetRegisterClass *RC; - if (AFI->isThumb1OnlyFunction()) - RC = ARM::tGPRRegisterClass; - else - RC = ARM::GPRRegisterClass; - - unsigned VReg = MF.addLiveIn(GPRArgRegs[NumGPRs], RC); - SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); - SDValue Store = - DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo::getFixedStack(AFI->getVarArgsFrameIndex()), - false, false, 0); - MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, - DAG.getConstant(4, getPointerTy())); - } - if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOps[0], MemOps.size()); - } else - // This will point to the next argument passed via stack. - AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, true)); - } + if (isVarArg) + VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset()); return Chain; } @@ -2618,10 +2715,11 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } if (True.getNode() && False.getNode()) { - EVT VT = Cond.getValueType(); + EVT VT = Op.getValueType(); SDValue ARMcc = Cond.getOperand(2); SDValue CCR = Cond.getOperand(3); SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG); + assert(True.getValueType() == VT); return DAG.getNode(ARMISD::CMOV, dl, VT, True, False, ARMcc, CCR, Cmp); } } @@ -2960,7 +3058,10 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT, DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), DAG.getConstant(32, MVT::i32)); - } + } else if (VT == MVT::f32) + Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64, + DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), + DAG.getConstant(32, MVT::i32)); Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); @@ -4104,7 +4205,16 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, switch (OpNum) { default: llvm_unreachable("Unknown shuffle opcode!"); case OP_VREV: - return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); + // VREV divides the vector in half and swaps within the half. + if (VT.getVectorElementType() == MVT::i32 || + VT.getVectorElementType() == MVT::f32) + return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); + // vrev <4 x i16> -> VREV32 + if (VT.getVectorElementType() == MVT::i16) + return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS); + // vrev <4 x i8> -> VREV16 + assert(VT.getVectorElementType() == MVT::i8); + return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS); case OP_VDUP0: case OP_VDUP1: case OP_VDUP2: @@ -4575,10 +4685,10 @@ LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) { // Because short has a smaller range than ushort, we can actually get away // with only a single newton step. This requires that we use a weird bias // of 89, however (again, this has been exhaustively tested). 
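The refinement the vrecpe/vrecps comments describe is a Newton-Raphson step for 1/y: VRECPS computes (2 - y*recip), which is then multiplied into the running estimate, roughly squaring the error each time. A scalar sketch (plain floats standing in for the v4f32 lanes; the starting estimate is an assumed stand-in for what vrecpe would produce):

#include <cassert>
#include <cmath>

// One Newton-Raphson refinement of a reciprocal estimate; a coarse
// estimate converges very quickly because the error squares per step.
static float refineRecip(float y, float recip) {
  return recip * (2.0f - y * recip); // vrecps(y, recip) * recip
}

int main() {
  float y = 7.0f;
  float r = 0.14f;            // coarse estimate, as vrecpe might give
  r = refineRecip(y, r);      // first refinement step
  r = refineRecip(y, r);      // second refinement step
  assert(std::fabs(r - 1.0f / y) < 1e-6f);
  return 0;
}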
- // float4 result = as_float4(as_int4(xf*recip) + 89); + // float4 result = as_float4(as_int4(xf*recip) + 0x89); N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); - N1 = DAG.getConstant(89, MVT::i32); + N1 = DAG.getConstant(0x89, MVT::i32); N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1); N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); @@ -4665,26 +4775,26 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0); N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1); N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); - N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); + SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); // Use reciprocal estimate and two refinement steps. // float4 recip = vrecpeq_f32(yf); // recip *= vrecpsq_f32(yf, recip); // recip *= vrecpsq_f32(yf, recip); N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, - DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1); + DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), BN1); N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32), - N1, N2); + BN1, N2); N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32), - N1, N2); + BN1, N2); N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); // Simply multiplying by the reciprocal estimate can leave us a few ulps // too low, so we add 2 ulps (exhaustive testing shows that this is enough, // and that it will never cause us to return an answer too large). 
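The as_int4/as_float4 trick in these comments works because, for positive normal IEEE-754 floats, adding N to the bit pattern moves the value up by exactly N representable values. A scalar illustration (a sketch only, not the vector code; valid for positive, normal inputs):

#include <cassert>
#include <cstdint>
#include <cstring>

// Nudge a positive, normal float up by `ulps` representable values by
// treating its bit pattern as an integer -- the scalar analogue of the
// bitcast, integer-add, bitcast sequence above.
static float nudgeUlps(float x, int32_t ulps) {
  int32_t bits;
  std::memcpy(&bits, &x, sizeof bits);
  bits += ulps;
  std::memcpy(&x, &bits, sizeof bits);
  return x;
}

int main() {
  float q = 0.999999f;          // a quotient that came out a hair low
  float up = nudgeUlps(q, 2);   // the "+ 2" fixup applied after xf*recip
  assert(up > q && nudgeUlps(up, -2) == q);
  return 0;
}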
- // float4 result = as_float4(as_int4(xf*recip) + 89); + // float4 result = as_float4(as_int4(xf*recip) + 2); N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); N1 = DAG.getConstant(2, MVT::i32); @@ -4698,19 +4808,6 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { return N0; } -static SDValue LowerTrap(SDValue Op, SelectionDAG &DAG) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - TargetLowering::ArgListTy Args; - std::pair<SDValue, SDValue> CallResult = - TLI.LowerCallTo(Op.getOperand(0), Type::getVoidTy(*DAG.getContext()), - false, false, false, false, 0, CallingConv::C, - /*isTailCall=*/false, - /*isReturnValueUsed=*/true, - DAG.getExternalSymbol(TrapFuncName.c_str(), TLI.getPointerTy()), - Args, DAG, Op.getDebugLoc()); - return CallResult.second; -} - SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Don't know how to custom lower this!"); @@ -4757,7 +4854,6 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::MUL: return LowerMUL(Op, DAG); case ISD::SDIV: return LowerSDIV(Op, DAG); case ISD::UDIV: return LowerUDIV(Op, DAG); - case ISD::TRAP: return LowerTrap(Op, DAG); } return SDValue(); } @@ -4796,12 +4892,21 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI, unsigned ptr = MI->getOperand(1).getReg(); unsigned oldval = MI->getOperand(2).getReg(); unsigned newval = MI->getOperand(3).getReg(); - unsigned scratch = BB->getParent()->getRegInfo() - .createVirtualRegister(ARM::GPRRegisterClass); const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); DebugLoc dl = MI->getDebugLoc(); bool isThumb2 = Subtarget->isThumb2(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + unsigned scratch = + MRI.createVirtualRegister(isThumb2 ? ARM::rGPRRegisterClass + : ARM::GPRRegisterClass); + + if (isThumb2) { + MRI.constrainRegClass(dest, ARM::rGPRRegisterClass); + MRI.constrainRegClass(oldval, ARM::rGPRRegisterClass); + MRI.constrainRegClass(newval, ARM::rGPRRegisterClass); + } + unsigned ldrOpc, strOpc; switch (Size) { default: llvm_unreachable("unsupported size for AtomicCmpSwap!"); @@ -4893,8 +4998,14 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, unsigned ptr = MI->getOperand(1).getReg(); unsigned incr = MI->getOperand(2).getReg(); DebugLoc dl = MI->getDebugLoc(); - bool isThumb2 = Subtarget->isThumb2(); + + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + if (isThumb2) { + MRI.constrainRegClass(dest, ARM::rGPRRegisterClass); + MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass); + } + unsigned ldrOpc, strOpc; switch (Size) { default: llvm_unreachable("unsupported size for AtomicCmpSwap!"); @@ -4923,10 +5034,10 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); - MachineRegisterInfo &RegInfo = MF->getRegInfo(); - unsigned scratch = RegInfo.createVirtualRegister(ARM::GPRRegisterClass); - unsigned scratch2 = (!BinOpcode) ? incr : - RegInfo.createVirtualRegister(ARM::GPRRegisterClass); + TargetRegisterClass *TRC = + isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass; + unsigned scratch = MRI.createVirtualRegister(TRC); + unsigned scratch2 = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC); // thisMBB: // ... 
@@ -4971,6 +5082,116 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, return BB; } +MachineBasicBlock * +ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI, + MachineBasicBlock *BB, + unsigned Size, + bool signExtend, + ARMCC::CondCodes Cond) const { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction *MF = BB->getParent(); + MachineFunction::iterator It = BB; + ++It; + + unsigned dest = MI->getOperand(0).getReg(); + unsigned ptr = MI->getOperand(1).getReg(); + unsigned incr = MI->getOperand(2).getReg(); + unsigned oldval = dest; + DebugLoc dl = MI->getDebugLoc(); + bool isThumb2 = Subtarget->isThumb2(); + + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + if (isThumb2) { + MRI.constrainRegClass(dest, ARM::rGPRRegisterClass); + MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass); + } + + unsigned ldrOpc, strOpc, extendOpc; + switch (Size) { + default: llvm_unreachable("unsupported size for AtomicCmpSwap!"); + case 1: + ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB; + strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB; + extendOpc = isThumb2 ? ARM::t2SXTBr : ARM::SXTBr; + break; + case 2: + ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH; + strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH; + extendOpc = isThumb2 ? ARM::t2SXTHr : ARM::SXTHr; + break; + case 4: + ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX; + strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX; + extendOpc = 0; + break; + } + + MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MF->insert(It, loopMBB); + MF->insert(It, exitMBB); + + // Transfer the remainder of BB and its successor edges to exitMBB. + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); + + TargetRegisterClass *TRC = + isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass; + unsigned scratch = MRI.createVirtualRegister(TRC); + unsigned scratch2 = MRI.createVirtualRegister(TRC); + + // thisMBB: + // ... + // fallthrough --> loopMBB + BB->addSuccessor(loopMBB); + + // loopMBB: + // ldrex dest, ptr + // (sign extend dest, if required) + // cmp dest, incr + // cmov.cond scratch2, dest, incr + // strex scratch, scratch2, ptr + // cmp scratch, #0 + // bne- loopMBB + // fallthrough --> exitMBB + BB = loopMBB; + AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr)); + + // Sign extend the value, if necessary. + if (signExtend && extendOpc) { + oldval = MRI.createVirtualRegister(ARM::GPRRegisterClass); + AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval).addReg(dest)); + } + + // Build compare and cmov instructions. + AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) + .addReg(oldval).addReg(incr)); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVCCr : ARM::MOVCCr), scratch2) + .addReg(oldval).addReg(incr).addImm(Cond).addReg(ARM::CPSR); + + AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2) + .addReg(ptr)); + AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) + .addReg(scratch).addImm(0)); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) + .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); + + BB->addSuccessor(loopMBB); + BB->addSuccessor(exitMBB); + + // exitMBB: + // ... 
+ BB = exitMBB; + + MI->eraseFromParent(); // The instruction is gone now. + + return BB; +} + static MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), @@ -4980,6 +5201,72 @@ MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { llvm_unreachable("Expecting a BB with two successors!"); } +// FIXME: This opcode table should obviously be expressed in the target +// description. We probably just need a "machine opcode" value in the pseudo +// instruction. But the ideal solution may be to simply remove the "S" version +// of the opcode altogether. +struct AddSubFlagsOpcodePair { + unsigned PseudoOpc; + unsigned MachineOpc; +}; + +static AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = { + {ARM::ADCSri, ARM::ADCri}, + {ARM::ADCSrr, ARM::ADCrr}, + {ARM::ADCSrs, ARM::ADCrs}, + {ARM::SBCSri, ARM::SBCri}, + {ARM::SBCSrr, ARM::SBCrr}, + {ARM::SBCSrs, ARM::SBCrs}, + {ARM::RSBSri, ARM::RSBri}, + {ARM::RSBSrr, ARM::RSBrr}, + {ARM::RSBSrs, ARM::RSBrs}, + {ARM::RSCSri, ARM::RSCri}, + {ARM::RSCSrs, ARM::RSCrs}, + {ARM::t2ADCSri, ARM::t2ADCri}, + {ARM::t2ADCSrr, ARM::t2ADCrr}, + {ARM::t2ADCSrs, ARM::t2ADCrs}, + {ARM::t2SBCSri, ARM::t2SBCri}, + {ARM::t2SBCSrr, ARM::t2SBCrr}, + {ARM::t2SBCSrs, ARM::t2SBCrs}, + {ARM::t2RSBSri, ARM::t2RSBri}, + {ARM::t2RSBSrs, ARM::t2RSBrs}, +}; + +// Convert an Add or Subtract with Carry and Flags to a generic opcode with +// CPSR<def> operand. e.g. ADCS (...) -> ADC (... CPSR<def>). +// +// FIXME: Somewhere we should assert that CPSR<def> is in the correct +// position to be recognized by the target description as the 'S' bit. +bool ARMTargetLowering::RemapAddSubWithFlags(MachineInstr *MI, + MachineBasicBlock *BB) const { + unsigned OldOpc = MI->getOpcode(); + unsigned NewOpc = 0; + + // This is only called for instructions that need remapping, so iterating over + // the tiny opcode table is not costly. + static const int NPairs = + sizeof(AddSubFlagsOpcodeMap) / sizeof(AddSubFlagsOpcodePair); + for (AddSubFlagsOpcodePair *Pair = &AddSubFlagsOpcodeMap[0], + *End = &AddSubFlagsOpcodeMap[NPairs]; Pair != End; ++Pair) { + if (OldOpc == Pair->PseudoOpc) { + NewOpc = Pair->MachineOpc; + break; + } + } + if (!NewOpc) + return false; + + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc dl = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); + for (unsigned i = 0; i < MI->getNumOperands(); ++i) + MIB.addOperand(MI->getOperand(i)); + AddDefaultPred(MIB); + MIB.addReg(ARM::CPSR, RegState::Define); // S bit + MI->eraseFromParent(); + return true; +} + MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { @@ -4987,10 +5274,13 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, DebugLoc dl = MI->getDebugLoc(); bool isThumb2 = Subtarget->isThumb2(); switch (MI->getOpcode()) { - default: { + if (RemapAddSubWithFlags(MI, BB)) + return BB; + + MI->dump(); llvm_unreachable("Unexpected instr type to insert"); - + } case ARM::ATOMIC_LOAD_ADD_I8: return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr); case ARM::ATOMIC_LOAD_ADD_I16: @@ -5033,6 +5323,34 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case ARM::ATOMIC_LOAD_SUB_I32: return EmitAtomicBinary(MI, BB, 4, isThumb2 ?
ARM::t2SUBrr : ARM::SUBrr); + case ARM::ATOMIC_LOAD_MIN_I8: + return EmitAtomicBinaryMinMax(MI, BB, 1, true, ARMCC::LT); + case ARM::ATOMIC_LOAD_MIN_I16: + return EmitAtomicBinaryMinMax(MI, BB, 2, true, ARMCC::LT); + case ARM::ATOMIC_LOAD_MIN_I32: + return EmitAtomicBinaryMinMax(MI, BB, 4, true, ARMCC::LT); + + case ARM::ATOMIC_LOAD_MAX_I8: + return EmitAtomicBinaryMinMax(MI, BB, 1, true, ARMCC::GT); + case ARM::ATOMIC_LOAD_MAX_I16: + return EmitAtomicBinaryMinMax(MI, BB, 2, true, ARMCC::GT); + case ARM::ATOMIC_LOAD_MAX_I32: + return EmitAtomicBinaryMinMax(MI, BB, 4, true, ARMCC::GT); + + case ARM::ATOMIC_LOAD_UMIN_I8: + return EmitAtomicBinaryMinMax(MI, BB, 1, false, ARMCC::LO); + case ARM::ATOMIC_LOAD_UMIN_I16: + return EmitAtomicBinaryMinMax(MI, BB, 2, false, ARMCC::LO); + case ARM::ATOMIC_LOAD_UMIN_I32: + return EmitAtomicBinaryMinMax(MI, BB, 4, false, ARMCC::LO); + + case ARM::ATOMIC_LOAD_UMAX_I8: + return EmitAtomicBinaryMinMax(MI, BB, 1, false, ARMCC::HI); + case ARM::ATOMIC_LOAD_UMAX_I16: + return EmitAtomicBinaryMinMax(MI, BB, 2, false, ARMCC::HI); + case ARM::ATOMIC_LOAD_UMAX_I32: + return EmitAtomicBinaryMinMax(MI, BB, 4, false, ARMCC::HI); + case ARM::ATOMIC_SWAP_I8: return EmitAtomicBinary(MI, BB, 1, 0); case ARM::ATOMIC_SWAP_I16: return EmitAtomicBinary(MI, BB, 2, 0); case ARM::ATOMIC_SWAP_I32: return EmitAtomicBinary(MI, BB, 4, 0); @@ -5041,68 +5359,6 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2); case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4); - case ARM::ADCSSri: - case ARM::ADCSSrr: - case ARM::ADCSSrs: - case ARM::SBCSSri: - case ARM::SBCSSrr: - case ARM::SBCSSrs: - case ARM::RSBSri: - case ARM::RSBSrr: - case ARM::RSBSrs: - case ARM::RSCSri: - case ARM::RSCSrs: { - unsigned OldOpc = MI->getOpcode(); - unsigned Opc = 0; - switch (OldOpc) { - case ARM::ADCSSrr: - Opc = ARM::ADCrr; - break; - case ARM::ADCSSri: - Opc = ARM::ADCri; - break; - case ARM::ADCSSrs: - Opc = ARM::ADCrs; - break; - case ARM::SBCSSrr: - Opc = ARM::SBCrr; - break; - case ARM::SBCSSri: - Opc = ARM::SBCri; - break; - case ARM::SBCSSrs: - Opc = ARM::SBCrs; - break; - case ARM::RSBSri: - Opc = ARM::RSBri; - break; - case ARM::RSBSrr: - Opc = ARM::RSBrr; - break; - case ARM::RSBSrs: - Opc = ARM::RSBrs; - break; - case ARM::RSCSri: - Opc = ARM::RSCri; - break; - case ARM::RSCSrs: - Opc = ARM::RSCrs; - break; - default: - llvm_unreachable("Unknown opcode?"); - } - - MachineInstrBuilder MIB = - BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(Opc)); - for (unsigned i = 0; i < MI->getNumOperands(); ++i) - MIB.addOperand(MI->getOperand(i)); - AddDefaultPred(MIB); - MIB.addReg(ARM::CPSR, RegState::Define); // S bit - MI->eraseFromParent(); - return BB; - } - - case ARM::tMOVCCr_pseudo: { // To "insert" a SELECT_CC instruction, we actually have to insert the // diamond control-flow pattern. The incoming instruction knows the @@ -5267,12 +5523,109 @@ SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, return SDValue(); } +// AddCombineToVPADDL - For pair-wise add on NEON, use the vpaddl instruction +// (only after legalization). +static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + + // Only perform optimization if after legalize, and if NEON is available. We + // also expect both operands to be BUILD_VECTORs.
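A reference model of the target instruction may help here: vpaddl sums adjacent lanes into lanes of twice the width, which is exactly the even/odd-extract shape this combine looks for. A scalar sketch for the .s8 case (assumed semantics, not the DAG code):

#include <cassert>
#include <cstdint>
#include <vector>

// vpaddl.s8 reference: out[i] = in[2*i] + in[2*i+1], widened to 16 bits
// so the pairwise sums cannot overflow the narrow element type.
static std::vector<int16_t> vpaddl_s8(const std::vector<int8_t> &in) {
  std::vector<int16_t> out(in.size() / 2);
  for (size_t i = 0; i < out.size(); ++i)
    out[i] = int16_t(in[2 * i]) + int16_t(in[2 * i + 1]);
  return out;
}

int main() {
  const std::vector<int8_t> v{100, 100, -3, 4, 7, 8, 1, -1};
  const std::vector<int16_t> r = vpaddl_s8(v);
  assert(r[0] == 200 && r[1] == 1 && r[2] == 15 && r[3] == 0); // 200 > INT8_MAX
  return 0;
}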
+ if (DCI.isBeforeLegalize() || !Subtarget->hasNEON() + || N0.getOpcode() != ISD::BUILD_VECTOR + || N1.getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + + // Check output type since VPADDL operand elements can only be 8, 16, or 32. + EVT VT = N->getValueType(0); + if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64) + return SDValue(); + + // Check that the vector operands are of the right form. + // N0 and N1 are BUILD_VECTOR nodes with N EXTRACT_VECTOR + // operands, where N is the size of the formed vector. + // Each EXTRACT_VECTOR should have the same input vector and odd or even + // index such that we have a pair-wise add pattern. + + // Grab the vector that all EXTRACT_VECTOR nodes should be referencing. + if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + SDValue Vec = N0->getOperand(0)->getOperand(0); + SDNode *V = Vec.getNode(); + unsigned nextIndex = 0; + + // For each operand of the ADD which is a BUILD_VECTOR, + // check to see if each of their operands are an EXTRACT_VECTOR with + // the same vector and appropriate index. + for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) { + if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT + && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + + SDValue ExtVec0 = N0->getOperand(i); + SDValue ExtVec1 = N1->getOperand(i); + + // First operand is the vector, verify it's the same. + if (V != ExtVec0->getOperand(0).getNode() || + V != ExtVec1->getOperand(0).getNode()) + return SDValue(); + + // Second is the constant, verify it's correct. + ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1)); + ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1)); + + // For the constant, we want to see all the even or all the odd. + if (!C0 || !C1 || C0->getZExtValue() != nextIndex + || C1->getZExtValue() != nextIndex+1) + return SDValue(); + + // Increment index. + nextIndex+=2; + } else + return SDValue(); + } + + // Create VPADDL node. + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + DebugLoc DL = N->getDebugLoc(); + + // Build operand list. + SmallVector<SDValue, 8> Ops; + Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, + TLI.getPointerTy())); + + // Input is the vector. + Ops.push_back(Vec); + + // Get widened type and narrowed type. + MVT widenType; + unsigned numElem = VT.getVectorNumElements(); + switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { + case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break; + case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break; + case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break; + default: + assert(0 && "Invalid vector element type for padd optimization."); + } + + SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(), + widenType, &Ops[0], Ops.size()); + return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, tmp); +} + /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with /// operands N0 and N1. This is a helper for PerformADDCombine that is /// called with the default operands, and if that fails, with commuted /// operands. static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget){ + + // Attempt to create vpaddl for this add.
+ SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget); + if (Result.getNode()) + return Result; + // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) if (N0.getOpcode() == ISD::SELECT && N0.getNode()->hasOneUse()) { SDValue Result = combineSelectAndUse(N, N0, N1, DCI); @@ -5284,17 +5637,18 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. /// static SDValue PerformADDCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // First try with the default operand order. - SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI); + SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget); if (Result.getNode()) return Result; // If that didn't work, try again with the operands commuted. - return PerformADDCombineWithOperands(N, N1, N0, DCI); + return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget); } /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. @@ -5333,7 +5687,7 @@ static SDValue PerformVMULCombine(SDNode *N, unsigned Opcode = N0.getOpcode(); if (Opcode != ISD::ADD && Opcode != ISD::SUB && Opcode != ISD::FADD && Opcode != ISD::FSUB) { - Opcode = N0.getOpcode(); + Opcode = N1.getOpcode(); if (Opcode != ISD::ADD && Opcode != ISD::SUB && Opcode != ISD::FADD && Opcode != ISD::FSUB) return SDValue(); @@ -5414,7 +5768,7 @@ static SDValue PerformANDCombine(SDNode *N, if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); - + APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; @@ -5450,7 +5804,7 @@ static SDValue PerformORCombine(SDNode *N, if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); - + APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; @@ -5496,7 +5850,7 @@ static SDValue PerformORCombine(SDNode *N, EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT, N0->getOperand(1), N0->getOperand(0), - N1->getOperand(1)); + N1->getOperand(0)); return DAG.getNode(ISD::BITCAST, dl, VT, Result); } } @@ -5619,8 +5973,8 @@ static SDValue PerformORCombine(SDNode *N, return SDValue(); } -/// PerformBFICombine - (bfi A, (and B, C1), C2) -> (bfi A, B, C2) iff -/// C1 & C2 == C1. +/// PerformBFICombine - (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff +/// the bits being cleared by the AND are not demanded by the BFI. 
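The mask manipulation in the rewritten combine below is compact; a standalone restatement may make it easier to follow (GCC/Clang builtins are used here as stand-ins for LLVM's CountTrailingZeros_32/CountLeadingZeros_32, an assumption of this sketch, not the in-tree code):

#include <cassert>
#include <cstdint>

// BFI's third operand is the *inverted* mask of the field being written.
// Recover the field's width, then allow dropping the AND that feeds the
// BFI iff it clears none of the low Width bits that actually get inserted.
// InvMask must not be all-ones (the field would be empty).
static bool andIsRedundant(uint32_t InvMask, uint32_t AndMask) {
  unsigned LSB = __builtin_ctz(~InvMask);                // field position
  unsigned Width = (32 - __builtin_clz(~InvMask)) - LSB; // field size
  uint32_t FieldMask = (Width == 32) ? ~0u : ((1u << Width) - 1);
  return (FieldMask & ~AndMask) == 0;
}

int main() {
  // 8-bit field inserted at bit 8: InvMask = ~0x0000FF00.
  assert(andIsRedundant(0xFFFF00FFu, 0x000000FFu));  // AND keeps the field
  assert(!andIsRedundant(0xFFFF00FFu, 0x0000007Fu)); // AND clears a field bit
  return 0;
}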
static SDValue PerformBFICombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SDValue N1 = N->getOperand(1); @@ -5628,9 +5982,12 @@ static SDValue PerformBFICombine(SDNode *N, ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); if (!N11C) return SDValue(); - unsigned Mask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); + unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); + unsigned LSB = CountTrailingZeros_32(~InvMask); + unsigned Width = (32 - CountLeadingZeros_32(~InvMask)) - LSB; + unsigned Mask = (1 << Width)-1; unsigned Mask2 = N11C->getZExtValue(); - if ((Mask & Mask2) == Mask2) + if ((Mask & (~Mask2)) == 0) return DCI.DAG.getNode(ARMISD::BFI, N->getDebugLoc(), N->getValueType(0), N->getOperand(0), N1.getOperand(0), N->getOperand(2)); @@ -5706,8 +6063,28 @@ static SDValue PerformSTORECombine(SDNode *N, // Otherwise, the i64 value will be legalized to a pair of i32 values. StoreSDNode *St = cast<StoreSDNode>(N); SDValue StVal = St->getValue(); - if (!ISD::isNormalStore(St) || St->isVolatile() || - StVal.getValueType() != MVT::i64 || + if (!ISD::isNormalStore(St) || St->isVolatile()) + return SDValue(); + + if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && + StVal.getNode()->hasOneUse() && !St->isVolatile()) { + SelectionDAG &DAG = DCI.DAG; + DebugLoc DL = St->getDebugLoc(); + SDValue BasePtr = St->getBasePtr(); + SDValue NewST1 = DAG.getStore(St->getChain(), DL, + StVal.getNode()->getOperand(0), BasePtr, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + + SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, + DAG.getConstant(4, MVT::i32)); + return DAG.getStore(NewST1.getValue(0), DL, StVal.getNode()->getOperand(1), + OffsetPtr, St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), + std::min(4U, St->getAlignment() / 2)); + } + + if (StVal.getValueType() != MVT::i64 || StVal.getNode()->getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); @@ -6479,7 +6856,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { default: break; - case ISD::ADD: return PerformADDCombine(N, DCI); + case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); case ISD::SUB: return PerformSUBCombine(N, DCI); case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); case ISD::OR: return PerformORCombine(N, DCI, Subtarget); @@ -6753,6 +7130,14 @@ bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { return Imm >= 0 && Imm <= 255; } +/// isLegalAddImmediate - Return true if the specified immediate is legal +/// add immediate, that is the target has add instructions which can add +/// a register with the immediate without having to materialize the +/// immediate into a register. +bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { + return ARM_AM::getSOImmVal(Imm) != -1; +} + static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, @@ -6995,6 +7380,9 @@ ARMTargetLowering::getConstraintType(const std::string &Constraint) const { case 'l': return C_RegisterClass; case 'w': return C_RegisterClass; } + } else { + if (Constraint == "Uv") + return C_Memory; } return TargetLowering::getConstraintType(Constraint); } @@ -7106,12 +7494,16 @@ getRegClassForInlineAsmConstraint(const std::string &Constraint, /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. 
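One detail of the VMOVDRR store split in PerformSTORECombine above: the VMOVDRR operands are the low and high words of the i64, written at the base address and at offset 4, with the second store conservatively given alignment min(4, Align/2). A memory-level sketch (the memcmp check assumes a little-endian host, matching ARM's default byte order; names are illustrative only):

#include <cassert>
#include <cstdint>
#include <cstring>

// What the split store writes: low word at base, high word at base + 4.
static void storeSplit(uint64_t v, unsigned char *base) {
  uint32_t lo = (uint32_t)v, hi = (uint32_t)(v >> 32);
  std::memcpy(base, &lo, 4);      // first i32 store at the original address
  std::memcpy(base + 4, &hi, 4);  // second i32 store at offset 4
}

int main() {
  unsigned char split[8], whole[8];
  uint64_t v = 0x1122334455667788ULL;
  storeSplit(v, split);
  std::memcpy(whole, &v, 8);                 // the original i64 store
  assert(std::memcmp(split, whole, 8) == 0); // little-endian host assumed
  return 0;
}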
void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, - char Constraint, + std::string &Constraint, std::vector<SDValue>&Ops, SelectionDAG &DAG) const { SDValue Result(0, 0); - switch (Constraint) { + // Currently only support length 1 constraints. + if (Constraint.length() != 1) return; + + char ConstraintLetter = Constraint[0]; + switch (ConstraintLetter) { default: break; case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': @@ -7126,7 +7518,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, if (CVal != CVal64) return; - switch (Constraint) { + switch (ConstraintLetter) { case 'I': if (Subtarget->isThumb1Only()) { // This must be a constant between 0 and 255, for ADD @@ -7389,6 +7781,28 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.writeMem = true; return true; } + case Intrinsic::arm_strexd: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::i64; + Info.ptrVal = I.getArgOperand(2); + Info.offset = 0; + Info.align = 8; + Info.vol = true; + Info.readMem = false; + Info.writeMem = true; + return true; + } + case Intrinsic::arm_ldrexd: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::i64; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = 8; + Info.vol = true; + Info.readMem = true; + Info.writeMem = false; + return true; + } default: break; } diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index e37855d..21a9a3a 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -88,7 +88,7 @@ namespace llvm { MEMBARRIER_MCR, // Memory barrier (MCR) PRELOAD, // Preload - + VCEQ, // Vector compare equal. VCEQZ, // Vector compare equal to zero. VCGE, // Vector compare greater than or equal. @@ -173,7 +173,7 @@ namespace llvm { // Bit-field insert BFI, - + // Vector OR with immediate VORRIMM, // Vector AND with NOT of immediate @@ -264,6 +264,12 @@ namespace llvm { /// the immediate into a register. virtual bool isLegalICmpImmediate(int64_t Imm) const; + /// isLegalAddImmediate - Return true if the specified immediate is legal + /// add immediate, that is the target has add instructions which can + /// add a register and the immediate without having to materialize + /// the immediate into a register. + virtual bool isLegalAddImmediate(int64_t Imm) const; + /// getPreIndexedAddressParts - returns true by value, base pointer and /// offset pointer and addressing mode by reference if the node's address /// can be legally represented as pre-indexed load / store address. @@ -309,7 +315,7 @@ namespace llvm { /// true it means one of the asm constraint of the inline asm instruction /// being processed is 'm'. virtual void LowerAsmOperandForConstraint(SDValue Op, - char ConstraintLetter, + std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const; @@ -321,9 +327,6 @@ namespace llvm { /// specified value type. virtual TargetRegisterClass *getRegClassFor(EVT VT) const; - /// getFunctionAlignment - Return the Log2 alignment of this function. - virtual unsigned getFunctionAlignment(const Function *F) const; - /// getMaximalGlobalOffset - Returns the maximal possible offset which can /// be used for loads / stores from the global. 
virtual unsigned getMaximalGlobalOffset() const; @@ -408,7 +411,7 @@ namespace llvm { SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const; SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; @@ -426,6 +429,13 @@ namespace llvm { DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; + void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, + DebugLoc dl, SDValue &Chain, unsigned ArgOffset) + const; + + void computeRegArea(CCState &CCInfo, MachineFunction &MF, + unsigned &VARegSize, unsigned &VARegSaveSize) const; + virtual SDValue LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, @@ -437,7 +447,7 @@ namespace llvm { SmallVectorImpl<SDValue> &InVals) const; /// HandleByVal - Target-specific cleanup for ByVal support. - virtual void HandleByVal(CCState *) const; + virtual void HandleByVal(CCState *, unsigned &) const; /// IsEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. Targets which want to do tail call @@ -477,16 +487,22 @@ namespace llvm { MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode) const; + MachineBasicBlock * EmitAtomicBinaryMinMax(MachineInstr *MI, + MachineBasicBlock *BB, + unsigned Size, + bool signExtend, + ARMCC::CondCodes Cond) const; + bool RemapAddSubWithFlags(MachineInstr *MI, MachineBasicBlock *BB) const; }; - + enum NEONModImmType { VMOVModImm, VMVNModImm, OtherModImm }; - - + + namespace ARM { FastISel *createFastISel(FunctionLoweringInfo &funcInfo); } diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index f5fb98e..897d8a5 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -860,6 +860,9 @@ class APKHI<bits<8> opcod, bit tb, dag oops, dag iops, InstrItinClass itin, class ARMPat<dag pattern, dag result> : Pat<pattern, result> { list<Predicate> Predicates = [IsARM]; } +class ARMV5TPat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsARM, HasV5T]; +} class ARMV5TEPat<dag pattern, dag result> : Pat<pattern, result> { list<Predicate> Predicates = [IsARM, HasV5TE]; } @@ -1020,6 +1023,10 @@ class T1LoadStore<bits<4> opA, bits<3> opB> : Encoding16 { } class T1LdStSP<bits<3> opB> : T1LoadStore<0b1001, opB>; // SP relative +class T1BranchCond<bits<4> opcode> : Encoding16 { + let Inst{15-12} = opcode; +} + // Helper classes to encode Thumb1 loads and stores. For immediates, the // following bits are used for "opA" (see A6.2.4): // @@ -1208,6 +1215,11 @@ class T1Pat<dag pattern, dag result> : Pat<pattern, result> { list<Predicate> Predicates = [IsThumb, IsThumb1Only]; } +// T2v6Pat - Same as Pat<>, but requires V6T2 Thumb2 mode. +class T2v6Pat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsThumb2, HasV6T2]; +} + // T2Pat - Same as Pat<>, but requires that the compiler be in Thumb2 mode. class T2Pat<dag pattern, dag result> : Pat<pattern, result> { list<Predicate> Predicates = [IsThumb2]; @@ -1742,9 +1754,10 @@ class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, // NEON 3 vector register format. 
-class N3VCommon<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, - dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string dt, string asm, string cstr, list<dag> pattern> +class N3VCommon<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, + list<dag> pattern> : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> { let Inst{24} = op24; let Inst{23} = op23; @@ -1773,9 +1786,10 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, let Inst{5} = Vm{4}; } -class N3VLane32<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, - dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string dt, string asm, string cstr, list<dag> pattern> +class N3VLane32<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, + list<dag> pattern> : N3VCommon<op24, op23, op21_20, op11_8, op6, op4, oops, iops, f, itin, opc, dt, asm, cstr, pattern> { @@ -1793,9 +1807,10 @@ class N3VLane32<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bi let Inst{5} = lane; } -class N3VLane16<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, - dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string dt, string asm, string cstr, list<dag> pattern> +class N3VLane16<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, + list<dag> pattern> : N3VCommon<op24, op23, op21_20, op11_8, op6, op4, oops, iops, f, itin, opc, dt, asm, cstr, pattern> { diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index b787d35..2537fc3 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -58,10 +58,13 @@ def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>; def SDT_ARMEH_SJLJ_Longjmp: SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisInt<1>]>; -def SDT_ARMEH_SJLJ_DispatchSetup: SDTypeProfile<0, 0, []>; +def SDT_ARMEH_SJLJ_DispatchSetup: SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_ARMMEMBARRIER : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def SDT_ARMPREFETCH : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisSameAs<1, 2>, + SDTCisInt<1>]>; + def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, @@ -130,7 +133,7 @@ def ARMMemBarrier : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIER, [SDNPHasChain]>; def ARMMemBarrierMCR : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER, [SDNPHasChain]>; -def ARMPreload : SDNode<"ARMISD::PRELOAD", SDTPrefetch, +def ARMPreload : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH, [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>; def ARMrbit : SDNode<"ARMISD::RBIT", SDTIntUnaryOp>; @@ -203,13 +206,13 @@ def so_imm_not_XFORM : SDNodeXForm<imm, [{ }]>; /// imm1_15 predicate - True if the 32-bit immediate is in the range [1,15]. -def imm1_15 : PatLeaf<(i32 imm), [{ - return (int32_t)N->getZExtValue() >= 1 && (int32_t)N->getZExtValue() < 16; +def imm1_15 : ImmLeaf<i32, [{ + return (int32_t)Imm >= 1 && (int32_t)Imm < 16; }]>; /// imm16_31 predicate - True if the 32-bit immediate is in the range [16,31]. 
-def imm16_31 : PatLeaf<(i32 imm), [{ - return (int32_t)N->getZExtValue() >= 16 && (int32_t)N->getZExtValue() < 32; +def imm16_31 : ImmLeaf<i32, [{ + return (int32_t)Imm >= 16 && (int32_t)Imm < 32; }]>; def so_imm_neg : @@ -239,8 +242,8 @@ def lo16AllZero : PatLeaf<(i32 imm), [{ /// imm0_65535 predicate - True if the 32-bit immediate is in the range /// [0.65535]. -def imm0_65535 : PatLeaf<(i32 imm), [{ - return (uint32_t)N->getZExtValue() < 65536; +def imm0_65535 : ImmLeaf<i32, [{ + return Imm >= 0 && Imm < 65536; }]>; class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>; @@ -375,8 +378,8 @@ def neon_vcvt_imm32 : Operand<i32> { } // rot_imm: An integer that encodes a rotate amount. Must be 8, 16, or 24. -def rot_imm : Operand<i32>, PatLeaf<(i32 imm), [{ - int32_t v = (int32_t)N->getZExtValue(); +def rot_imm : Operand<i32>, ImmLeaf<i32, [{ + int32_t v = (int32_t)Imm; return v == 8 || v == 16 || v == 24; }]> { let EncoderMethod = "getRotImmOpValue"; } @@ -412,7 +415,9 @@ def shift_so_reg : Operand<i32>, // reg reg imm // so_imm - Match a 32-bit shifter_operand immediate operand, which is an // 8-bit immediate rotated by an arbitrary number of bits. -def so_imm : Operand<i32>, PatLeaf<(imm), [{ return Pred_so_imm(N); }]> { +def so_imm : Operand<i32>, ImmLeaf<i32, [{ + return ARM_AM::getSOImmVal(Imm) != -1; + }]> { let EncoderMethod = "getSOImmOpValue"; let PrintMethod = "printSOImmOperand"; } @@ -433,13 +438,13 @@ def arm_i32imm : PatLeaf<(imm), [{ }]>; /// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31]. -def imm0_31 : Operand<i32>, PatLeaf<(imm), [{ - return (int32_t)N->getZExtValue() < 32; +def imm0_31 : Operand<i32>, ImmLeaf<i32, [{ + return Imm >= 0 && Imm < 32; }]>; /// imm0_31_m1 - Matches and prints like imm0_31, but encodes as 'value - 1'. -def imm0_31_m1 : Operand<i32>, PatLeaf<(imm), [{ - return (int32_t)N->getZExtValue() < 32; +def imm0_31_m1 : Operand<i32>, ImmLeaf<i32, [{ + return Imm >= 0 && Imm < 32; }]> { let EncoderMethod = "getImmMinusOneOpValue"; } @@ -462,17 +467,23 @@ def bf_inv_mask_imm : Operand<i32>, } /// lsb_pos_imm - position of the lsb bit, used by BFI4p and t2BFI4p -def lsb_pos_imm : Operand<i32>, PatLeaf<(imm), [{ - return isInt<5>(N->getSExtValue()); +def lsb_pos_imm : Operand<i32>, ImmLeaf<i32, [{ + return isInt<5>(Imm); }]>; /// width_imm - number of bits to be copied, used by BFI4p and t2BFI4p -def width_imm : Operand<i32>, PatLeaf<(imm), [{ - return N->getSExtValue() > 0 && N->getSExtValue() <= 32; +def width_imm : Operand<i32>, ImmLeaf<i32, [{ + return Imm > 0 && Imm <= 32; }] > { let EncoderMethod = "getMsbOpValue"; } +def ssat_imm : Operand<i32>, ImmLeaf<i32, [{ + return Imm > 0 && Imm <= 32; +}]> { + let EncoderMethod = "getSsatBitPosValue"; +} + // Define ARM specific addressing modes. def MemMode2AsmOperand : AsmOperandClass { @@ -586,6 +597,15 @@ def am6offset : Operand<i32>, let EncoderMethod = "getAddrMode6OffsetOpValue"; } +// Special version of addrmode6 to handle alignment encoding for VST1/VLD1 +// (single element from one lane) for size 32. +def addrmode6oneL32 : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{ + let PrintMethod = "printAddrMode6Operand"; + let MIOperandInfo = (ops GPR:$addr, i32imm); + let EncoderMethod = "getAddrMode6OneLane32AddressOpValue"; +} + // Special version of addrmode6 to handle alignment encoding for VLD-dup // instructions, specifically VLD4-dup. 
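The so_imm change above replaces a PatLeaf with an ImmLeaf that defers to ARM_AM::getSOImmVal. For readers unfamiliar with the encoding, a standalone checker for the same property (a sketch; isSOImm is a hypothetical stand-in for the in-tree helper): an immediate is representable iff it is an 8-bit value rotated right by an even amount.

#include <cassert>
#include <cstdint>

// Rotating the candidate *left* by each even amount undoes every
// possible "8-bit value rotated right by 0, 2, ..., 30" encoding.
static bool isSOImm(uint32_t V) {
  for (unsigned Rot = 0; Rot < 32; Rot += 2) {
    uint32_t Unrot = Rot ? ((V << Rot) | (V >> (32 - Rot))) : V;
    if (Unrot <= 0xFF)
      return true;
  }
  return false;
}

int main() {
  assert(isSOImm(0x000000FFu));  // 0xFF ror 0
  assert(isSOImm(0xFF000000u));  // 0xFF ror 8
  assert(isSOImm(0x000003FCu));  // 0xFF ror 30
  assert(!isSOImm(0x00000101u)); // set bits 8 apart cannot fit in one byte
  return 0;
}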
def addrmode6dup : Operand<i32>, @@ -934,22 +954,25 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, let Inst{19-16} = Rn; } } +} + // Carry setting variants // NOTE: CPSR def omitted because it will be handled by the custom inserter. let usesCustomInserter = 1 in { multiclass AI1_adde_sube_s_irs<PatFrag opnode, bit Commutable = 0> { - def Sri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), - Size4Bytes, IIC_iALUi, + def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), + Size4Bytes, IIC_iALUi, [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]>; - def Srr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), - Size4Bytes, IIC_iALUr, - [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]>; - def Srs : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), - Size4Bytes, IIC_iALUsr, + def rr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + Size4Bytes, IIC_iALUr, + [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]> { + let isCommutable = Commutable; + } + def rs : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), + Size4Bytes, IIC_iALUsr, [(set GPR:$Rd, (opnode GPR:$Rn, so_reg:$shift))]>; } } -} let canFoldAsLoad = 1, isReMaterializable = 1 in { multiclass AI_ldr1<bit isByte, string opc, InstrItinClass iii, @@ -1299,6 +1322,15 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { let Inst{3-0} = dst; } + // For disassembly only. + def BX_pred : AXI<(outs), (ins GPR:$dst, pred:$p), BrMiscFrm, IIC_Br, + "bx$p\t$dst", [/* pattern left blank */]>, + Requires<[IsARM, HasV4T]> { + bits<4> dst; + let Inst{27-4} = 0b000100101111111111110001; + let Inst{3-0} = dst; + } + // ARMV4 only // FIXME: We would really like to define this as a vanilla ARMPat like: // ARMPat<(brind GPR:$dst), (MOVr PC, GPR:$dst)> @@ -1316,10 +1348,7 @@ let isCall = 1, // FIXME: Do we really need a non-predicated version? If so, it should // at least be a pseudo instruction expanding to the predicated version // at MC lowering time. - Defs = [R0, R1, R2, R3, R12, LR, - D0, D1, D2, D3, D4, D5, D6, D7, - D16, D17, D18, D19, D20, D21, D22, D23, - D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR], + Defs = [R0, R1, R2, R3, R12, LR, QQQQ0, QQQQ2, QQQQ3, CPSR, FPSCR], Uses = [SP] in { def BL : ABXI<0b1011, (outs), (ins bl_target:$func, variable_ops), IIC_Br, "bl\t$func", @@ -1373,10 +1402,7 @@ let isCall = 1, // On Darwin R9 is call-clobbered. // R7 is marked as a use to prevent frame-pointer assignments from being // moved above / below calls. - Defs = [R0, R1, R2, R3, R9, R12, LR, - D0, D1, D2, D3, D4, D5, D6, D7, - D16, D17, D18, D19, D20, D21, D22, D23, - D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR], + Defs = [R0, R1, R2, R3, R9, R12, LR, QQQQ0, QQQQ2, QQQQ3, CPSR, FPSCR], Uses = [R7, SP] in { def BLr9 : ARMPseudoInst<(outs), (ins bltarget:$func, variable_ops), Size4Bytes, IIC_Br, @@ -1415,10 +1441,7 @@ let isCall = 1, // FIXME: The Thumb versions of these should live in ARMInstrThumb.td let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { // Darwin versions. 
- let Defs = [R0, R1, R2, R3, R9, R12, - D0, D1, D2, D3, D4, D5, D6, D7, - D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, - D27, D28, D29, D30, D31, PC], + let Defs = [R0, R1, R2, R3, R9, R12, QQQQ0, QQQQ2, QQQQ3, PC], Uses = [SP] in { def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst, variable_ops), IIC_Br, []>, Requires<[IsDarwin]>; @@ -1444,10 +1467,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { } // Non-Darwin versions (the difference is R9). - let Defs = [R0, R1, R2, R3, R12, - D0, D1, D2, D3, D4, D5, D6, D7, - D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, - D27, D28, D29, D30, D31, PC], + let Defs = [R0, R1, R2, R3, R12, QQQQ0, QQQQ2, QQQQ3, PC], Uses = [SP] in { def TCRETURNdiND : PseudoInst<(outs), (ins i32imm:$dst, variable_ops), IIC_Br, []>, Requires<[IsNotDarwin]>; @@ -1629,7 +1649,7 @@ def LDRSB : AI3ld<0b1101, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm, IIC_iLoad_bh_r, "ldrsb", "\t$Rt, $addr", [(set GPR:$Rt, (sextloadi8 addrmode3:$addr))]>; -let mayLoad = 1, neverHasSideEffects = 1 in { +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { // Load doubleword def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rd, GPR:$dst2), (ins addrmode3:$addr), LdMiscFrm, @@ -1704,6 +1724,7 @@ let mayLoad = 1, neverHasSideEffects = 1 in { defm LDRH : AI3_ldridx<0b1011, 1, "ldrh", IIC_iLoad_bh_ru>; defm LDRSH : AI3_ldridx<0b1111, 1, "ldrsh", IIC_iLoad_bh_ru>; defm LDRSB : AI3_ldridx<0b1101, 1, "ldrsb", IIC_iLoad_bh_ru>; +let hasExtraDefRegAllocReq = 1 in { def LDRD_PRE : AI3ldstidx<0b1101, 0, 1, 1, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb), (ins addrmode3:$addr), IndexModePre, LdMiscFrm, IIC_iLoad_d_ru, @@ -1729,6 +1750,7 @@ def LDRD_POST: AI3ldstidx<0b1101, 0, 1, 0, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb), let Inst{11-8} = offset{7-4}; // imm7_4/zero let Inst{3-0} = offset{3-0}; // imm3_0/Rm } +} // hasExtraDefRegAllocReq = 1 } // mayLoad = 1, neverHasSideEffects = 1 // LDRT, LDRBT, LDRSBT, LDRHT, LDRSHT are for disassembly only. 
@@ -1797,45 +1819,52 @@ def STRD : AI3str<0b1111, (outs), (ins GPR:$Rt, GPR:$src2, addrmode3:$addr), def STR_PRE : AI2stridx<0, 1, (outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rn, am2offset:$offset), IndexModePre, StFrm, IIC_iStore_ru, - "str", "\t$Rt, [$Rn, $offset]!", "$Rn = $Rn_wb", + "str", "\t$Rt, [$Rn, $offset]!", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", [(set GPR:$Rn_wb, (pre_store GPR:$Rt, GPR:$Rn, am2offset:$offset))]>; def STR_POST : AI2stridx<0, 0, (outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rn, am2offset:$offset), IndexModePost, StFrm, IIC_iStore_ru, - "str", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", + "str", "\t$Rt, [$Rn], $offset", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", [(set GPR:$Rn_wb, (post_store GPR:$Rt, GPR:$Rn, am2offset:$offset))]>; def STRB_PRE : AI2stridx<1, 1, (outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rn, am2offset:$offset), IndexModePre, StFrm, IIC_iStore_bh_ru, - "strb", "\t$Rt, [$Rn, $offset]!", "$Rn = $Rn_wb", + "strb", "\t$Rt, [$Rn, $offset]!", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", [(set GPR:$Rn_wb, (pre_truncsti8 GPR:$Rt, GPR:$Rn, am2offset:$offset))]>; def STRB_POST: AI2stridx<1, 0, (outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rn, am2offset:$offset), IndexModePost, StFrm, IIC_iStore_bh_ru, - "strb", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", + "strb", "\t$Rt, [$Rn], $offset", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", [(set GPR:$Rn_wb, (post_truncsti8 GPR:$Rt, GPR:$Rn, am2offset:$offset))]>; def STRH_PRE : AI3stridx<0b1011, 0, 1, (outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rn, am3offset:$offset), IndexModePre, StMiscFrm, IIC_iStore_ru, - "strh", "\t$Rt, [$Rn, $offset]!", "$Rn = $Rn_wb", + "strh", "\t$Rt, [$Rn, $offset]!", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", [(set GPR:$Rn_wb, (pre_truncsti16 GPR:$Rt, GPR:$Rn, am3offset:$offset))]>; def STRH_POST: AI3stridx<0b1011, 0, 0, (outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rn, am3offset:$offset), IndexModePost, StMiscFrm, IIC_iStore_bh_ru, - "strh", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", + "strh", "\t$Rt, [$Rn], $offset", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", [(set GPR:$Rn_wb, (post_truncsti16 GPR:$Rt, GPR:$Rn, am3offset:$offset))]>; // For disassembly only +let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in { def STRD_PRE : AI3stdpr<(outs GPR:$base_wb), (ins GPR:$src1, GPR:$src2, GPR:$base, am3offset:$offset), StMiscFrm, IIC_iStore_d_ru, @@ -1848,6 +1877,7 @@ def STRD_POST: AI3stdpo<(outs GPR:$base_wb), StMiscFrm, IIC_iStore_d_ru, "strd", "\t$src1, $src2, [$base], $offset", "$base = $base_wb", []>; +} // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 // STRT, STRBT, and STRHT are for disassembly only. @@ -2317,8 +2347,10 @@ def : ARMPat<(addc GPR:$src, so_imm_neg:$imm), // The with-carry-in form matches bitwise not instead of the negation. // Effectively, the inverse interpretation of the carry flag already accounts // for part of the negation. 
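Concretely, the identity behind these adde/SBC patterns: since -imm = ~imm + 1 in two's complement, a + ~imm + carry equals a - imm - (1 - carry), which is exactly ARM's SBC semantics (borrow is the inverse of carry). A quick check (sketch, unsigned wraparound intended):

#include <cassert>
#include <cstdint>

// Left side: what the adde-with-inverted-immediate DAG node computes.
static uint32_t addWithCarryOfNot(uint32_t a, uint32_t imm, uint32_t carry) {
  return a + ~imm + carry;
}
// Right side: what the ARM SBC instruction computes.
static uint32_t sbc(uint32_t a, uint32_t imm, uint32_t carry) {
  return a - imm - (1u - carry);
}

int main() {
  for (uint32_t c = 0; c <= 1; ++c) {
    assert(addWithCarryOfNot(100, 7, c) == sbc(100, 7, c));
    assert(addWithCarryOfNot(0, 0xFFFFFFFFu, c) == sbc(0, 0xFFFFFFFFu, c));
  }
  return 0;
}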
-def : ARMPat<(adde GPR:$src, so_imm_not:$imm), +def : ARMPat<(adde_dead_carry GPR:$src, so_imm_not:$imm), (SBCri GPR:$src, so_imm_not:$imm)>; +def : ARMPat<(adde_live_carry GPR:$src, so_imm_not:$imm), + (SBCSri GPR:$src, so_imm_not:$imm)>; // Note: These are implemented in C++ code, because they have to generate // ADD/SUBrs instructions, which use a complex pattern that a xform function @@ -2432,7 +2464,7 @@ def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), // Signed/Unsigned saturate -- for disassembly only -def SSAT : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$a, shift_imm:$sh), +def SSAT : AI<(outs GPR:$Rd), (ins ssat_imm:$sat_imm, GPR:$a, shift_imm:$sh), SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $a$sh", [/* For disassembly only; pattern left blank */]> { bits<4> Rd; @@ -2448,7 +2480,7 @@ def SSAT : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$a, shift_imm:$sh), let Inst{3-0} = Rn; } -def SSAT16 : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$Rn), SatFrm, +def SSAT16 : AI<(outs GPR:$Rd), (ins ssat_imm:$sat_imm, GPR:$Rn), SatFrm, NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", [/* For disassembly only; pattern left blank */]> { bits<4> Rd; @@ -2641,9 +2673,9 @@ def MUL : AsMul1I32<0b0000000, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), let Constraints = "@earlyclobber $Rd" in def MLAv5: ARMPseudoInst<(outs GPR:$Rd), - (ins GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s), - Size4Bytes, IIC_iMAC32, - [(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))]>, + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s), + Size4Bytes, IIC_iMAC32, + [(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))]>, Requires<[IsARM, NoV6]> { bits<4> Ra; let Inst{15-12} = Ra; @@ -2997,6 +3029,10 @@ def : ARMV6Pat<(sext_inreg (or (srl (and GPR:$Rm, 0xFF00), (i32 8)), (shl GPR:$Rm, (i32 8))), i16), (REVSH GPR:$Rm)>; +def : ARMV6Pat<(or (sra (shl GPR:$Rm, (i32 24)), (i32 16)), + (and (srl GPR:$Rm, (i32 8)), 0xFF)), + (REVSH GPR:$Rm)>; + // Need the AddedComplexity or else MOVs + REV would be chosen. 
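Both REVSH patterns above are spellings of the same operation: byte-reverse the low halfword and sign-extend it. A reference check that the new (sra (shl ...)) form really computes that (a sketch; it assumes two's-complement representation and arithmetic right shift of signed values, which holds on mainstream compilers):

#include <cassert>
#include <cstdint>

// REVSH reference: swap the two low bytes, then sign-extend from 16 bits.
static int32_t revsh_ref(uint32_t Rm) {
  return (int16_t)(((Rm & 0xFF) << 8) | ((Rm >> 8) & 0xFF));
}
// The DAG shape the added pattern matches: (sra (shl Rm, 24), 16) builds
// the sign-extended high byte; (and (srl Rm, 8), 0xFF) supplies the low byte.
static int32_t revsh_pattern(uint32_t Rm) {
  return ((int32_t)(Rm << 24) >> 16) | (int32_t)((Rm >> 8) & 0xFF);
}

int main() {
  const uint32_t tests[] = {0x00000000u, 0x000080FFu, 0x1234ABCDu, 0xFFFFFFFFu};
  for (uint32_t t : tests)
    assert(revsh_ref(t) == revsh_pattern(t));
  return 0;
}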
let AddedComplexity = 5 in def : ARMV6Pat<(sra (bswap GPR:$Rm), (i32 16)), (REVSH GPR:$Rm)>; @@ -3006,8 +3042,8 @@ def lsl_shift_imm : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(Sh, MVT::i32); }]>; -def lsl_amt : PatLeaf<(i32 imm), [{ - return (N->getZExtValue() < 32); +def lsl_amt : ImmLeaf<i32, [{ + return Imm > 0 && Imm < 32; }], lsl_shift_imm>; def PKHBT : APKHI<0b01101000, 0, (outs GPR:$Rd), @@ -3029,8 +3065,8 @@ def asr_shift_imm : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(Sh, MVT::i32); }]>; -def asr_amt : PatLeaf<(i32 imm), [{ - return (N->getZExtValue() <= 32); +def asr_amt : ImmLeaf<i32, [{ + return Imm > 0 && Imm <= 32; }], asr_shift_imm>; // Note: Shifts of 1-15 bits will be transformed to srl instead of sra and @@ -3241,6 +3277,18 @@ let usesCustomInserter = 1 in { def ATOMIC_LOAD_NAND_I8 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, [(set GPR:$dst, (atomic_load_nand_8 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_MIN_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_min_8 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_MAX_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_max_8 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMIN_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_min_8 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMAX_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_max_8 GPR:$ptr, GPR:$val))]>; def ATOMIC_LOAD_ADD_I16 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, [(set GPR:$dst, (atomic_load_add_16 GPR:$ptr, GPR:$incr))]>; @@ -3259,6 +3307,18 @@ let usesCustomInserter = 1 in { def ATOMIC_LOAD_NAND_I16 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, [(set GPR:$dst, (atomic_load_nand_16 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_MIN_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_min_16 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_MAX_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_max_16 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMIN_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_min_16 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMAX_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_max_16 GPR:$ptr, GPR:$val))]>; def ATOMIC_LOAD_ADD_I32 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, [(set GPR:$dst, (atomic_load_add_32 GPR:$ptr, GPR:$incr))]>; @@ -3277,6 +3337,18 @@ let usesCustomInserter = 1 in { def ATOMIC_LOAD_NAND_I32 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, [(set GPR:$dst, (atomic_load_nand_32 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_MIN_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_min_32 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_MAX_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_max_32 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMIN_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_min_32 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMAX_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), 
NoItinerary, + [(set GPR:$dst, (atomic_load_max_32 GPR:$ptr, GPR:$val))]>; def ATOMIC_SWAP_I8 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary, @@ -3307,8 +3379,9 @@ def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins addrmode7:$addr), NoItinerary, "ldrexh", "\t$Rt, $addr", []>; def LDREX : AIldrex<0b00, (outs GPR:$Rt), (ins addrmode7:$addr), NoItinerary, "ldrex", "\t$Rt, $addr", []>; -def LDREXD : AIldrex<0b01, (outs GPR:$Rt, GPR:$Rt2), (ins addrmode7:$addr), - NoItinerary, "ldrexd", "\t$Rt, $Rt2, $addr", []>; +let hasExtraDefRegAllocReq = 1 in + def LDREXD : AIldrex<0b01, (outs GPR:$Rt, GPR:$Rt2), (ins addrmode7:$addr), + NoItinerary, "ldrexd", "\t$Rt, $Rt2, $addr", []>; } let mayStore = 1, Constraints = "@earlyclobber $Rd" in { @@ -3318,10 +3391,12 @@ def STREXH : AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addrmode7:$addr), NoItinerary, "strexh", "\t$Rd, $Rt, $addr", []>; def STREX : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addrmode7:$addr), NoItinerary, "strex", "\t$Rd, $Rt, $addr", []>; +} + +let hasExtraSrcRegAllocReq = 1, Constraints = "@earlyclobber $Rd" in def STREXD : AIstrex<0b01, (outs GPR:$Rd), (ins GPR:$Rt, GPR:$Rt2, addrmode7:$addr), NoItinerary, "strexd", "\t$Rd, $Rt, $Rt2, $addr", []>; -} // Clear-Exclusive is for disassembly only. def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex", @@ -3345,7 +3420,8 @@ def SWPB : AIswp<1, (outs GPR:$Rt), (ins GPR:$Rt2, GPR:$Rn), "swpb", def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2), NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", - [/* For disassembly only; pattern left blank */]> { + [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, + imm:$CRm, imm:$opc2)]> { bits<4> opc1; bits<4> CRn; bits<4> CRd; @@ -3365,7 +3441,8 @@ def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1, def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2), NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", - [/* For disassembly only; pattern left blank */]> { + [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, + imm:$CRm, imm:$opc2)]> { let Inst{31-28} = 0b1111; bits<4> opc1; bits<4> CRn; @@ -3489,10 +3566,10 @@ defm STC2 : LdStCop<0b1111, 0, (ins), "stc2", "">; // Move between coprocessor and ARM core register -- for disassembly only // -class MovRCopro<string opc, bit direction, dag oops, dag iops> +class MovRCopro<string opc, bit direction, dag oops, dag iops, + list<dag> pattern> : ABI<0b1110, oops, iops, NoItinerary, opc, - "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2", - [/* For disassembly only; pattern left blank */]> { + "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2", pattern> { let Inst{20} = direction; let Inst{4} = 1; @@ -3512,17 +3589,23 @@ class MovRCopro<string opc, bit direction, dag oops, dag iops> } def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */, - (outs), (ins p_imm:$cop, i32imm:$opc1, - GPR:$Rt, c_imm:$CRn, c_imm:$CRm, - i32imm:$opc2)>; + (outs), + (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, i32imm:$opc2), + [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, + imm:$CRm, imm:$opc2)]>; def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */, - (outs GPR:$Rt), (ins p_imm:$cop, i32imm:$opc1, - c_imm:$CRn, c_imm:$CRm, i32imm:$opc2)>; + (outs GPR:$Rt), + (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn, c_imm:$CRm, + i32imm:$opc2), []>; + +def : ARMPat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, 
imm:$opc2), + (MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; -class MovRCopro2<string opc, bit direction, dag oops, dag iops> +class MovRCopro2<string opc, bit direction, dag oops, dag iops, + list<dag> pattern> : ABXI<0b1110, oops, iops, NoItinerary, - !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), - [/* For disassembly only; pattern left blank */]> { + !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), pattern> { let Inst{31-28} = 0b1111; let Inst{20} = direction; let Inst{4} = 1; @@ -3543,19 +3626,25 @@ class MovRCopro2<string opc, bit direction, dag oops, dag iops> } def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */, - (outs), (ins p_imm:$cop, i32imm:$opc1, - GPR:$Rt, c_imm:$CRn, c_imm:$CRm, - i32imm:$opc2)>; + (outs), + (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, i32imm:$opc2), + [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, + imm:$CRm, imm:$opc2)]>; def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */, - (outs GPR:$Rt), (ins p_imm:$cop, i32imm:$opc1, - c_imm:$CRn, c_imm:$CRm, - i32imm:$opc2)>; + (outs GPR:$Rt), + (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn, c_imm:$CRm, + i32imm:$opc2), []>; + +def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, + imm:$CRm, imm:$opc2), + (MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; -class MovRRCopro<string opc, bit direction> +class MovRRCopro<string opc, bit direction, + list<dag> pattern = [/* For disassembly only */]> : ABI<0b1100, (outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm), - NoItinerary, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm", - [/* For disassembly only; pattern left blank */]> { + NoItinerary, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm", pattern> { let Inst{23-21} = 0b010; let Inst{20} = direction; @@ -3572,14 +3661,16 @@ class MovRRCopro<string opc, bit direction> let Inst{3-0} = CRm; } -def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */>; +def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */, + [(int_arm_mcrr imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2, + imm:$CRm)]>; def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>; -class MovRRCopro2<string opc, bit direction> +class MovRRCopro2<string opc, bit direction, + list<dag> pattern = [/* For disassembly only */]> : ABXI<0b1100, (outs), (ins p_imm:$cop, i32imm:$opc1, - GPR:$Rt, GPR:$Rt2, c_imm:$CRm), - NoItinerary, !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), - [/* For disassembly only; pattern left blank */]> { + GPR:$Rt, GPR:$Rt2, c_imm:$CRm), NoItinerary, + !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern> { let Inst{31-28} = 0b1111; let Inst{23-21} = 0b010; let Inst{20} = direction; @@ -3597,7 +3688,9 @@ class MovRRCopro2<string opc, bit direction> let Inst{3-0} = CRm; } -def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */>; +def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */, + [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2, + imm:$CRm)]>; def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */>; //===----------------------------------------------------------------------===// @@ -3677,7 +3770,7 @@ let isCall = 1, // here, and we're using the stack frame for the containing function to // save/restore registers, we can't keep anything live in regs across // the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon -// when we get here from a longjmp(). 
We force everthing out of registers +// when we get here from a longjmp(). We force everything out of registers // except for our own input by listing the relevant registers in Defs. By // doing so, we also cause the prologue/epilogue code to actively preserve // all of the callee-saved resgisters, which is exactly what we want. @@ -3686,10 +3779,8 @@ let isCall = 1, // These are pseudo-instructions and are lowered to individual MC-insts, so // no encoding information is necessary. let Defs = - [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, D0, - D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15, - D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, - D31 ], hasSideEffects = 1, isBarrier = 1 in { + [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR, + QQQQ0, QQQQ1, QQQQ2, QQQQ3 ], hasSideEffects = 1, isBarrier = 1 in { def Int_eh_sjlj_setjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$val), NoItinerary, [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, @@ -3697,7 +3788,7 @@ let Defs = } let Defs = - [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ], + [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR ], hasSideEffects = 1, isBarrier = 1 in { def Int_eh_sjlj_setjmp_nofp : PseudoInst<(outs), (ins GPR:$src, GPR:$val), NoItinerary, @@ -3720,8 +3811,8 @@ def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch), // that need the instruction size). let isBarrier = 1, hasSideEffects = 1 in def Int_eh_sjlj_dispatchsetup : - PseudoInst<(outs), (ins), NoItinerary, - [(ARMeh_sjlj_dispatchsetup)]>, + PseudoInst<(outs), (ins GPR:$src), NoItinerary, + [(ARMeh_sjlj_dispatchsetup GPR:$src)]>, Requires<[IsDarwin]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index e34d69a..977139f 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -175,7 +175,7 @@ class VLDQQWBPseudo<InstrItinClass itin> (ins addrmode6:$addr, am6offset:$offset), itin, "$addr.addr = $wb">; class VLDQQQQPseudo<InstrItinClass itin> - : PseudoNLdSt<(outs QQQQPR:$dst), (ins addrmode6:$addr, QQQQPR:$src), itin,"">; + : PseudoNLdSt<(outs QQQQPR:$dst), (ins addrmode6:$addr, QQQQPR:$src),itin,"">; class VLDQQQQWBPseudo<InstrItinClass itin> : PseudoNLdSt<(outs QQQQPR:$dst, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), itin, @@ -531,6 +531,17 @@ class VLD1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty, imm:$lane))]> { let Rm = 0b1111; } +class VLD1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty, + PatFrag LoadOp> + : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd), + (ins addrmode6oneL32:$Rn, DPR:$src, nohash_imm:$lane), + IIC_VLD1ln, "vld1", Dt, "\\{$Vd[$lane]\\}, $Rn", + "$src = $Vd", + [(set DPR:$Vd, (vector_insert (Ty DPR:$src), + (i32 (LoadOp addrmode6oneL32:$Rn)), + imm:$lane))]> { + let Rm = 0b1111; +} class VLD1QLNPseudo<ValueType Ty, PatFrag LoadOp> : VLDQLNPseudo<IIC_VLD1ln> { let Pattern = [(set QPR:$dst, (vector_insert (Ty QPR:$src), (i32 (LoadOp addrmode6:$addr)), @@ -544,7 +555,7 @@ def VLD1LNd16 : VLD1LN<0b0100, {?,?,0,?}, "16", v4i16, extloadi16> { let Inst{7-6} = lane{1-0}; let Inst{4} = Rn{4}; } -def VLD1LNd32 : VLD1LN<0b1000, {?,0,?,?}, "32", v2i32, load> { +def VLD1LNd32 : VLD1LN32<0b1000, {?,0,?,?}, "32", v2i32, load> { let Inst{7} = lane{0}; let Inst{5} = Rn{4}; let Inst{4} = Rn{4}; @@ -1371,6 +1382,14 @@ class VST1LN<bits<4> op11_8, 
bits<4> op7_4, string Dt, ValueType Ty, [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), addrmode6:$Rn)]> { let Rm = 0b1111; } +class VST1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty, + PatFrag StoreOp, SDNode ExtractOp> + : NLdStLn<1, 0b00, op11_8, op7_4, (outs), + (ins addrmode6oneL32:$Rn, DPR:$Vd, nohash_imm:$lane), + IIC_VST1ln, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn", "", + [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), addrmode6oneL32:$Rn)]>{ + let Rm = 0b1111; +} class VST1QLNPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp> : VSTQLNPseudo<IIC_VST1ln> { let Pattern = [(StoreOp (ExtractOp (Ty QPR:$src), imm:$lane), @@ -1386,7 +1405,8 @@ def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16, let Inst{7-6} = lane{1-0}; let Inst{4} = Rn{5}; } -def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt> { + +def VST1LNd32 : VST1LN32<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt> { let Inst{7} = lane{0}; let Inst{5-4} = Rn{5-4}; } @@ -3773,7 +3793,8 @@ def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VCNTiD, "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd", - [(set DPR:$Vd, (v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>; + [(set DPR:$Vd, + (v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>; def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd), (and DPR:$Vm, (vnotd DPR:$Vd)))), @@ -3783,7 +3804,8 @@ def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VCNTiQ, "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd", - [(set QPR:$Vd, (v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>; + [(set QPR:$Vd, + (v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>; def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd), (and QPR:$Vm, (vnotq QPR:$Vd)))), @@ -4683,8 +4705,9 @@ def VEXTd32 : VEXTd<"vext", "32", v2i32> { let Inst{9-8} = 0b00; } def VEXTdf : VEXTd<"vext", "32", v2f32> { - let Inst{11} = index{0}; - let Inst{10-8} = 0b000; + let Inst{11-10} = index{1-0}; + let Inst{9-8} = 0b00; + } def VEXTq8 : VEXTq<"vext", "8", v16i8> { diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index 15ab455..8430aa3 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -27,22 +27,22 @@ def imm_comp_XFORM : SDNodeXForm<imm, [{ }]>; /// imm0_7 predicate - True if the 32-bit immediate is in the range [0,7]. 
-def imm0_7 : PatLeaf<(i32 imm), [{ - return (uint32_t)N->getZExtValue() < 8; +def imm0_7 : ImmLeaf<i32, [{ + return Imm >= 0 && Imm < 8; }]>; def imm0_7_neg : PatLeaf<(i32 imm), [{ return (uint32_t)-N->getZExtValue() < 8; }], imm_neg_XFORM>; -def imm0_255 : PatLeaf<(i32 imm), [{ - return (uint32_t)N->getZExtValue() < 256; +def imm0_255 : ImmLeaf<i32, [{ + return Imm >= 0 && Imm < 256; }]>; def imm0_255_comp : PatLeaf<(i32 imm), [{ return ~((uint32_t)N->getZExtValue()) < 256; }]>; -def imm8_255 : PatLeaf<(i32 imm), [{ - return (uint32_t)N->getZExtValue() >= 8 && (uint32_t)N->getZExtValue() < 256; +def imm8_255 : ImmLeaf<i32, [{ + return Imm >= 8 && Imm < 256; }]>; def imm8_255_neg : PatLeaf<(i32 imm), [{ unsigned Val = -N->getZExtValue(); @@ -383,6 +383,14 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1 in { // Indirect branches let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { + def tBX : TI<(outs), (ins GPR:$Rm, pred:$p), IIC_Br, "bx${p}\t$Rm", []>, + T1Special<{1,1,0,?}> { + // A6.2.3 & A8.6.25 + bits<4> Rm; + let Inst{6-3} = Rm; + let Inst{2-0} = 0b000; + } + def tBRIND : TI<(outs), (ins GPR:$Rm), IIC_Br, "mov\tpc, $Rm", @@ -414,10 +422,7 @@ def tPOP_RET : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops), // potentially appearing dead. let isCall = 1, // On non-Darwin platforms R9 is callee-saved. - Defs = [R0, R1, R2, R3, R12, LR, - D0, D1, D2, D3, D4, D5, D6, D7, - D16, D17, D18, D19, D20, D21, D22, D23, - D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR], + Defs = [R0, R1, R2, R3, R12, LR, QQQQ0, QQQQ2, QQQQ3, CPSR, FPSCR], Uses = [SP] in { // Also used for Thumb2 def tBL : TIx2<0b11110, 0b11, 1, @@ -451,14 +456,15 @@ let isCall = 1, "blx\t$func", [(ARMtcall GPR:$func)]>, Requires<[IsThumb, HasV5T, IsNotDarwin]>, - T1Special<{1,1,1,?}>; // A6.2.3 & A8.6.24; + T1Special<{1,1,1,?}> { // A6.2.3 & A8.6.24; + bits<4> func; + let Inst{6-3} = func; + let Inst{2-0} = 0b000; + } // ARMv4T - // FIXME: Should be a pseudo. - let isCodeGenOnly = 1 in - def tBX : TIx2<{?,?,?,?,?}, {?,?}, ?, - (outs), (ins tGPR:$func, variable_ops), IIC_Br, - "mov\tlr, pc\n\tbx\t$func", + def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func, variable_ops), + Size4Bytes, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, Requires<[IsThumb, IsThumb1Only, IsNotDarwin]>; } @@ -467,10 +473,7 @@ let isCall = 1, // On Darwin R9 is call-clobbered. // R7 is marked as a use to prevent frame-pointer assignments from being // moved above / below calls. - Defs = [R0, R1, R2, R3, R9, R12, LR, - D0, D1, D2, D3, D4, D5, D6, D7, - D16, D17, D18, D19, D20, D21, D22, D23, - D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR], + Defs = [R0, R1, R2, R3, R9, R12, LR, QQQQ0, QQQQ2, QQQQ3, CPSR, FPSCR], Uses = [R7, SP] in { // Also used for Thumb2 def tBLr9 : TIx2<0b11110, 0b11, 1, @@ -512,11 +515,8 @@ let isCall = 1, } // ARMv4T - let isCodeGenOnly = 1 in - // FIXME: Should be a pseudo. 
- def tBXr9 : TIx2<{?,?,?,?,?}, {?,?}, ?, - (outs), (ins tGPR:$func, variable_ops), IIC_Br, - "mov\tlr, pc\n\tbx\t$func", + def tBXr9_CALL : tPseudoInst<(outs), (ins tGPR:$func, variable_ops), + Size4Bytes, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, Requires<[IsThumb, IsThumb1Only, IsDarwin]>; } @@ -551,7 +551,7 @@ let isBranch = 1, isTerminator = 1 in def tBcc : T1I<(outs), (ins t_bcctarget:$target, pred:$p), IIC_Br, "b${p}\t$target", [/*(ARMbrcond bb:$target, imm:$cc)*/]>, - T1Encoding<{1,1,0,1,?,?}> { + T1BranchCond<{1,1,0,1}> { bits<4> p; bits<8> target; let Inst{11-8} = p; @@ -597,7 +597,7 @@ def tSVC : T1pI<(outs), (ins i32imm:$imm), IIC_Br, // The assembler uses 0xDEFE for a trap instruction. let isBarrier = 1, isTerminator = 1 in -def tTRAP : TI<(outs), (ins), IIC_Br, +def tTRAP : TI<(outs), (ins), IIC_Br, "trap", [(trap)]>, Encoding16 { let Inst = 0xdefe; } @@ -712,6 +712,19 @@ def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i, let Inst{7-0} = addr; } +// FIXME: Remove this entry when the above ldr.n workaround is fixed. +// For disassembly use only. +def tLDRpciDIS : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i, + "ldr", "\t$Rt, $addr", + [/* disassembly only */]>, + T1Encoding<{0,1,0,0,1,?}> { + // A6.2 & A8.6.59 + bits<3> Rt; + bits<8> addr; + let Inst{10-8} = Rt; + let Inst{7-0} = addr; +} + // A8.6.194 & A8.6.192 defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rrs4, t_addrmode_is4, AddrModeT1_4, @@ -726,9 +739,9 @@ defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rrs1, // A8.6.207 & A8.6.205 defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rrs2, - t_addrmode_is2, AddrModeT1_2, - IIC_iStore_bh_r, IIC_iStore_bh_i, "strh", - BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>; + t_addrmode_is2, AddrModeT1_2, + IIC_iStore_bh_r, IIC_iStore_bh_i, "strh", + BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>; def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i, @@ -791,7 +804,7 @@ defm tLDM : thumb_ldst_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu, let mayStore = 1, hasExtraSrcRegAllocReq = 1 in defm tSTM : thumb_ldst_mult<"stm", IIC_iStore_m, IIC_iStore_mu, {1,1,0,0,0,?}, 0>; - + } // neverHasSideEffects let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in @@ -898,7 +911,8 @@ def tADC : // A8.6.2 // Add immediate def tADDi3 : // A8.6.4 T1 - T1sIGenEncodeImm<0b01110, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm3), IIC_iALUi, + T1sIGenEncodeImm<0b01110, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm3), + IIC_iALUi, "add", "\t$Rd, $Rm, $imm3", [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7:$imm3))]> { bits<3> imm3; @@ -1175,10 +1189,18 @@ def tREVSH : // A8.6.136 "revsh", "\t$Rd, $Rm", [(set tGPR:$Rd, (sext_inreg - (or (srl (and tGPR:$Rm, 0xFF00), (i32 8)), + (or (srl tGPR:$Rm, (i32 8)), (shl tGPR:$Rm, (i32 8))), i16))]>, Requires<[IsThumb, IsThumb1Only, HasV6]>; +def : T1Pat<(sext_inreg (or (srl (and tGPR:$Rm, 0xFF00), (i32 8)), + (shl tGPR:$Rm, (i32 8))), i16), + (tREVSH tGPR:$Rm)>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; + +def : T1Pat<(sra (bswap tGPR:$Rm), (i32 16)), (tREVSH tGPR:$Rm)>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; + // Rotate right register def tROR : // A8.6.139 T1sItDPEncode<0b0111, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), @@ -1322,9 +1344,10 @@ def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd), // Move between coprocessor and ARM core register -- for disassembly only // -class tMovRCopro<string opc, bit direction, dag oops, dag iops> +class tMovRCopro<string opc, bit 
direction, dag oops, dag iops, + list<dag> pattern> : T1Cop<oops, iops, !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), - [/* For disassembly only; pattern left blank */]> { + pattern> { let Inst{27-24} = 0b1110; let Inst{20} = direction; let Inst{4} = 1; @@ -1345,16 +1368,24 @@ class tMovRCopro<string opc, bit direction, dag oops, dag iops> } def tMCR : tMovRCopro<"mcr", 0 /* from ARM core register to coprocessor */, - (outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, c_imm:$CRn, - c_imm:$CRm, i32imm:$opc2)>; + (outs), + (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, i32imm:$opc2), + [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, + imm:$CRm, imm:$opc2)]>; def tMRC : tMovRCopro<"mrc", 1 /* from coprocessor to ARM core register */, - (outs GPR:$Rt), (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn, - c_imm:$CRm, i32imm:$opc2)>; + (outs GPR:$Rt), + (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2), + []>; + +def : Pat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2), + (tMRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>, + Requires<[IsThumb, HasV6T2]>; -class tMovRRCopro<string opc, bit direction> +class tMovRRCopro<string opc, bit direction, + list<dag> pattern = [/* For disassembly only */]> : T1Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm), - !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), - [/* For disassembly only; pattern left blank */]> { + !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern> { let Inst{27-24} = 0b1100; let Inst{23-21} = 0b010; let Inst{20} = direction; @@ -1372,7 +1403,9 @@ class tMovRRCopro<string opc, bit direction> let Inst{3-0} = CRm; } -def tMCRR : tMovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */>; +def tMCRR : tMovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */, + [(int_arm_mcrr imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2, + imm:$CRm)]>; def tMRRC : tMovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>; //===----------------------------------------------------------------------===// @@ -1381,7 +1414,8 @@ def tMRRC : tMovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>; def tCDP : T1Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2), "cdp\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", - [/* For disassembly only; pattern left blank */]> { + [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, + imm:$CRm, imm:$opc2)]> { let Inst{27-24} = 0b1110; bits<4> opc1; @@ -1415,19 +1449,19 @@ def tTPsoft : TIx2<0b11110, 0b11, 1, (outs), (ins), IIC_Br, //===----------------------------------------------------------------------===// // SJLJ Exception handling intrinsics -// +// // eh_sjlj_setjmp() is an instruction sequence to store the return address and // save #0 in R0 for the non-longjmp case. Since by its nature we may be coming // from some other function to get here, and we're using the stack frame for the // containing function to save/restore registers, we can't keep anything live in // regs across the eh_sjlj_setjmp(), else it will almost certainly have been -// tromped upon when we get here from a longjmp(). We force everthing out of +// tromped upon when we get here from a longjmp(). We force everything out of // registers except for our own input by listing the relevant registers in // Defs. By doing so, we also cause the prologue/epilogue code to actively // preserve all of the callee-saved resgisters, which is exactly what we want. 
// $val is a scratch register for our use. -let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R12 ], +let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R12, CPSR ], hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1 in def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val), AddrModeNone, SizeSpecial, NoItinerary, "","", diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 6794e75..53b9cec 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -44,7 +44,9 @@ def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{ // t2_so_imm - Match a 32-bit immediate operand, which is an // 8-bit immediate rotated by an arbitrary number of bits, or an 8-bit // immediate splatted into multiple bytes of the word. -def t2_so_imm : Operand<i32>, PatLeaf<(imm), [{ return Pred_t2_so_imm(N); }]> { +def t2_so_imm : Operand<i32>, ImmLeaf<i32, [{ + return ARM_AM::getT2SOImmVal(Imm) != -1; + }]> { let EncoderMethod = "getT2SOImmOpValue"; } @@ -62,14 +64,14 @@ def t2_so_imm_neg : Operand<i32>, }], t2_so_imm_neg_XFORM>; /// imm1_31 predicate - True if the 32-bit immediate is in the range [1,31]. -def imm1_31 : PatLeaf<(i32 imm), [{ - return (int32_t)N->getZExtValue() >= 1 && (int32_t)N->getZExtValue() < 32; +def imm1_31 : ImmLeaf<i32, [{ + return (int32_t)Imm >= 1 && (int32_t)Imm < 32; }]>; /// imm0_4095 predicate - True if the 32-bit immediate is in the range [0.4095]. def imm0_4095 : Operand<i32>, - PatLeaf<(i32 imm), [{ - return (uint32_t)N->getZExtValue() < 4096; + ImmLeaf<i32, [{ + return Imm >= 0 && Imm < 4096; }]>; def imm0_4095_neg : PatLeaf<(i32 imm), [{ @@ -84,6 +86,11 @@ def imm0_255_not : PatLeaf<(i32 imm), [{ return (uint32_t)(~N->getZExtValue()) < 255; }], imm_comp_XFORM>; +def lo5AllOne : PatLeaf<(i32 imm), [{ + // Returns true if all low 5-bits are 1. + return (((uint32_t)N->getZExtValue()) & 0x1FUL) == 0x1FUL; +}]>; + // Define Thumb2 specific addressing modes. // t2addrmode_imm12 := reg + imm12 @@ -151,7 +158,7 @@ def t2addrmode_so_reg : Operand<i32>, // def t2addrmode_reg : Operand<i32> { let PrintMethod = "printAddrMode7Operand"; - let MIOperandInfo = (ops tGPR); + let MIOperandInfo = (ops GPR); let ParserMatchClass = MemMode7AsmOperand; } @@ -681,49 +688,27 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, let Inst{24-21} = opcod; } } +} // Carry setting variants -let isCodeGenOnly = 1, Defs = [CPSR] in { -multiclass T2I_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode, - bit Commutable = 0> { +// NOTE: CPSR def omitted because it will be handled by the custom inserter. +let usesCustomInserter = 1 in { +multiclass T2I_adde_sube_s_irs<PatFrag opnode, bit Commutable = 0> { // shifted imm - def ri : T2sTwoRegImm< - (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi, - opc, "\t$Rd, $Rn, $imm", - [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]>, - Requires<[IsThumb2]> { - let Inst{31-27} = 0b11110; - let Inst{25} = 0; - let Inst{24-21} = opcod; - let Inst{20} = 1; // The S bit. 
- let Inst{15} = 0; - } + def ri : t2PseudoInst<(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), + Size4Bytes, IIC_iALUi, + [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]>; // register - def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr, - opc, ".w\t$Rd, $Rn, $Rm", - [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>, - Requires<[IsThumb2]> { + def rr : t2PseudoInst<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), + Size4Bytes, IIC_iALUr, + [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]> { let isCommutable = Commutable; - let Inst{31-27} = 0b11101; - let Inst{26-25} = 0b01; - let Inst{24-21} = opcod; - let Inst{20} = 1; // The S bit. - let Inst{14-12} = 0b000; // imm3 - let Inst{7-6} = 0b00; // imm2 - let Inst{5-4} = 0b00; // type } // shifted register - def rs : T2sTwoRegShiftedReg< - (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), - IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm", - [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]>, - Requires<[IsThumb2]> { - let Inst{31-27} = 0b11101; - let Inst{26-25} = 0b01; - let Inst{24-21} = opcod; - let Inst{20} = 1; // The S bit. - } -} + def rs : t2PseudoInst< + (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), + Size4Bytes, IIC_iALUsi, + [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]>; } } @@ -845,6 +830,7 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc, let Inst{15-12} = Rt; bits<17> addr; + let addr{12} = 1; // add = TRUE let Inst{19-16} = addr{16-13}; // Rn let Inst{23} = addr{12}; // U let Inst{11-0} = addr{11-0}; // imm @@ -925,6 +911,7 @@ multiclass T2I_st<bits<2> opcod, string opc, let Inst{15-12} = Rt; bits<17> addr; + let addr{12} = 1; // add = TRUE let Inst{19-16} = addr{16-13}; // Rn let Inst{23} = addr{12}; // U let Inst{11-0} = addr{11-0}; // imm @@ -1097,7 +1084,7 @@ multiclass T2I_exta_rrot_DO<bits<3> opcod, string opc> { let Inst{7} = 1; let Inst{5-4} = 0b00; // rotate } - def rr_rot : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, i32imm:$rot), + def rr_rot :T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, i32imm:$rot), IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm, ror $rot", []> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0100; @@ -1379,7 +1366,7 @@ def t2LDRSH_POST : T2Iidxldst<1, 0b01, 1, 0, (outs GPR:$dst, GPR:$Rn), // for disassembly only. 
// Ref: A8.6.57 LDR (immediate, Thumb) Encoding T4 class T2IldT<bit signed, bits<2> type, string opc, InstrItinClass ii> - : T2Ii8<(outs GPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc, + : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc, "\t$Rt, $addr", []> { let Inst{31-27} = 0b11111; let Inst{26-25} = 0b00; @@ -1421,42 +1408,48 @@ def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs), def t2STR_PRE : T2Iidxldst<0, 0b10, 0, 1, (outs GPR:$base_wb), (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), AddrModeT2_i8, IndexModePre, IIC_iStore_iu, - "str", "\t$Rt, [$Rn, $addr]!", "$Rn = $base_wb", + "str", "\t$Rt, [$Rn, $addr]!", + "$Rn = $base_wb,@earlyclobber $base_wb", [(set GPR:$base_wb, (pre_store GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; def t2STR_POST : T2Iidxldst<0, 0b10, 0, 0, (outs GPR:$base_wb), (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), AddrModeT2_i8, IndexModePost, IIC_iStore_iu, - "str", "\t$Rt, [$Rn], $addr", "$Rn = $base_wb", + "str", "\t$Rt, [$Rn], $addr", + "$Rn = $base_wb,@earlyclobber $base_wb", [(set GPR:$base_wb, (post_store GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; def t2STRH_PRE : T2Iidxldst<0, 0b01, 0, 1, (outs GPR:$base_wb), (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), AddrModeT2_i8, IndexModePre, IIC_iStore_iu, - "strh", "\t$Rt, [$Rn, $addr]!", "$Rn = $base_wb", + "strh", "\t$Rt, [$Rn, $addr]!", + "$Rn = $base_wb,@earlyclobber $base_wb", [(set GPR:$base_wb, (pre_truncsti16 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; def t2STRH_POST : T2Iidxldst<0, 0b01, 0, 0, (outs GPR:$base_wb), (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu, - "strh", "\t$Rt, [$Rn], $addr", "$Rn = $base_wb", + "strh", "\t$Rt, [$Rn], $addr", + "$Rn = $base_wb,@earlyclobber $base_wb", [(set GPR:$base_wb, (post_truncsti16 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; def t2STRB_PRE : T2Iidxldst<0, 0b00, 0, 1, (outs GPR:$base_wb), (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), AddrModeT2_i8, IndexModePre, IIC_iStore_bh_iu, - "strb", "\t$Rt, [$Rn, $addr]!", "$Rn = $base_wb", + "strb", "\t$Rt, [$Rn, $addr]!", + "$Rn = $base_wb,@earlyclobber $base_wb", [(set GPR:$base_wb, (pre_truncsti8 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; def t2STRB_POST : T2Iidxldst<0, 0b00, 0, 0, (outs GPR:$base_wb), (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu, - "strb", "\t$Rt, [$Rn], $addr", "$Rn = $base_wb", + "strb", "\t$Rt, [$Rn], $addr", + "$Rn = $base_wb,@earlyclobber $base_wb", [(set GPR:$base_wb, (post_truncsti8 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; @@ -1464,7 +1457,7 @@ def t2STRB_POST : T2Iidxldst<0, 0b00, 0, 0, (outs GPR:$base_wb), // only. // Ref: A8.6.193 STR (immediate, Thumb) Encoding T4 class T2IstT<bits<2> type, string opc, InstrItinClass ii> - : T2Ii8<(outs GPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc, + : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc, "\t$Rt, $addr", []> { let Inst{31-27} = 0b11111; let Inst{26-25} = 0b00; @@ -1489,20 +1482,20 @@ def t2STRHT : T2IstT<0b01, "strht", IIC_iStore_bh_i>; // ldrd / strd pre / post variants // For disassembly only. 
-def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs GPR:$Rt, GPR:$Rt2), +def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2), (ins GPR:$base, t2am_imm8s4_offset:$imm), IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, [$base, $imm]!", []>; -def t2LDRD_POST : T2Ii8s4<0, 1, 1, (outs GPR:$Rt, GPR:$Rt2), +def t2LDRD_POST : T2Ii8s4<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2), (ins GPR:$base, t2am_imm8s4_offset:$imm), IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, [$base], $imm", []>; def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs), - (ins GPR:$Rt, GPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm), + (ins rGPR:$Rt, rGPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm), IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, [$base, $imm]!", []>; def t2STRD_POST : T2Ii8s4<0, 1, 0, (outs), - (ins GPR:$Rt, GPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm), + (ins rGPR:$Rt, rGPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm), IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, [$base], $imm", []>; // T2Ipl (Preload Data/Instruction) signals the memory system of possible future @@ -1522,6 +1515,7 @@ multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> { let Inst{15-12} = 0b1111; bits<17> addr; + let addr{12} = 1; // add = TRUE let Inst{19-16} = addr{16-13}; // Rn let Inst{23} = addr{12}; // U let Inst{11-0} = addr{11-0}; // imm12 @@ -1794,10 +1788,10 @@ defm t2ADC : T2I_adde_sube_irs<0b1010, "adc", BinOpFrag<(adde_dead_carry node:$LHS, node:$RHS)>, 1>; defm t2SBC : T2I_adde_sube_irs<0b1011, "sbc", BinOpFrag<(sube_dead_carry node:$LHS, node:$RHS)>>; -defm t2ADCS : T2I_adde_sube_s_irs<0b1010, "adc", - BinOpFrag<(adde_live_carry node:$LHS, node:$RHS)>, 1>; -defm t2SBCS : T2I_adde_sube_s_irs<0b1011, "sbc", - BinOpFrag<(sube_live_carry node:$LHS, node:$RHS)>>; +defm t2ADCS : T2I_adde_sube_s_irs<BinOpFrag<(adde_live_carry node:$LHS, + node:$RHS)>, 1>; +defm t2SBCS : T2I_adde_sube_s_irs<BinOpFrag<(sube_live_carry node:$LHS, + node:$RHS)>>; // RSB defm t2RSB : T2I_rbin_irs <0b1110, "rsb", @@ -1828,9 +1822,14 @@ def : T2Pat<(addc rGPR:$src, t2_so_imm_neg:$imm), // Effectively, the inverse interpretation of the carry flag already accounts // for part of the negation. 
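// As a quick worked check of that identity (using the ARM convention that
// the carry flag is the inverse of borrow): x + ~imm + carry
//   = x + (2^32 - 1 - imm) + carry
//   = x - imm - (1 - carry)   (mod 2^32)
// which is exactly what sbc computes, so an adde of an inverted immediate
// can be selected as an sbc of the original immediate, as below.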
let AddedComplexity = 1 in -def : T2Pat<(adde rGPR:$src, imm0_255_not:$imm), +def : T2Pat<(adde_dead_carry rGPR:$src, imm0_255_not:$imm), + (t2SBCri rGPR:$src, imm0_255_not:$imm)>; +def : T2Pat<(adde_dead_carry rGPR:$src, t2_so_imm_not:$imm), + (t2SBCri rGPR:$src, t2_so_imm_not:$imm)>; +let AddedComplexity = 1 in +def : T2Pat<(adde_live_carry rGPR:$src, imm0_255_not:$imm), (t2SBCSri rGPR:$src, imm0_255_not:$imm)>; -def : T2Pat<(adde rGPR:$src, t2_so_imm_not:$imm), +def : T2Pat<(adde_live_carry rGPR:$src, t2_so_imm_not:$imm), (t2SBCSri rGPR:$src, t2_so_imm_not:$imm)>; // Select Bytes -- for disassembly only @@ -1976,9 +1975,9 @@ class T2SatI<dag oops, dag iops, InstrItinClass itin, } def t2SSAT: T2SatI< - (outs rGPR:$Rd), (ins i32imm:$sat_imm, rGPR:$Rn, shift_imm:$sh), - NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", - [/* For disassembly only; pattern left blank */]> { + (outs rGPR:$Rd), (ins ssat_imm:$sat_imm, rGPR:$Rn, shift_imm:$sh), + NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", + [/* For disassembly only; pattern left blank */]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1100; let Inst{20} = 0; @@ -1986,9 +1985,9 @@ def t2SSAT: T2SatI< } def t2SSAT16: T2SatI< - (outs rGPR:$Rd), (ins i32imm:$sat_imm, rGPR:$Rn), NoItinerary, - "ssat16", "\t$Rd, $sat_imm, $Rn", - [/* For disassembly only; pattern left blank */]> { + (outs rGPR:$Rd), (ins ssat_imm:$sat_imm, rGPR:$Rn), NoItinerary, + "ssat16", "\t$Rd, $sat_imm, $Rn", + [/* For disassembly only; pattern left blank */]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1100; let Inst{20} = 0; @@ -2008,10 +2007,10 @@ def t2USAT: T2SatI< let Inst{15} = 0; } -def t2USAT16: T2SatI< - (outs rGPR:$dst), (ins i32imm:$sat_imm, rGPR:$Rn), NoItinerary, - "usat16", "\t$dst, $sat_imm, $Rn", - [/* For disassembly only; pattern left blank */]> { +def t2USAT16: T2SatI<(outs rGPR:$dst), (ins i32imm:$sat_imm, rGPR:$Rn), + NoItinerary, + "usat16", "\t$dst, $sat_imm, $Rn", + [/* For disassembly only; pattern left blank */]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1110; let Inst{20} = 0; @@ -2033,6 +2032,10 @@ defm t2LSR : T2I_sh_ir<0b01, "lsr", BinOpFrag<(srl node:$LHS, node:$RHS)>>; defm t2ASR : T2I_sh_ir<0b10, "asr", BinOpFrag<(sra node:$LHS, node:$RHS)>>; defm t2ROR : T2I_sh_ir<0b11, "ror", BinOpFrag<(rotr node:$LHS, node:$RHS)>>; +// (rotr x, (and y, 0x...1f)) ==> (ROR x, y) +def : Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)), + (t2RORrr rGPR:$lhs, rGPR:$rhs)>; + let Uses = [CPSR] in { def t2RRX : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi, "rrx", "\t$Rd, $Rm", @@ -2121,10 +2124,12 @@ def t2BFC : T2BitFI<(outs rGPR:$Rd), (ins rGPR:$src, bf_inv_mask_imm:$imm), IIC_iUNAsi, "bfc", "\t$Rd, $imm", [(set rGPR:$Rd, (and rGPR:$src, bf_inv_mask_imm:$imm))]> { let Inst{31-27} = 0b11110; + let Inst{26} = 0; // should be 0. let Inst{25} = 1; let Inst{24-20} = 0b10110; let Inst{19-16} = 0b1111; // Rn let Inst{15} = 0; + let Inst{5} = 0; // should be 0. bits<10> imm; let msb{4-0} = imm{9-5}; @@ -2157,9 +2162,11 @@ let Constraints = "$src = $Rd" in { [(set rGPR:$Rd, (ARMbfi rGPR:$src, rGPR:$Rn, bf_inv_mask_imm:$imm))]> { let Inst{31-27} = 0b11110; + let Inst{26} = 0; // should be 0. let Inst{25} = 1; let Inst{24-20} = 0b10110; let Inst{15} = 0; + let Inst{5} = 0; // should be 0. bits<10> imm; let msb{4-0} = imm{9-5}; @@ -2174,9 +2181,11 @@ let Constraints = "$src = $Rd" in { IIC_iBITi, "bfi", "\t$Rd, $Rn, $lsbit, $width", []> { let Inst{31-27} = 0b11110; + let Inst{26} = 0; // should be 0. 
let Inst{25} = 1; let Inst{24-20} = 0b10110; let Inst{15} = 0; + let Inst{5} = 0; // should be 0. bits<5> lsbit; bits<5> width; @@ -2595,6 +2604,10 @@ def : T2Pat<(sext_inreg (or (srl (and rGPR:$Rm, 0xFF00), (i32 8)), (shl rGPR:$Rm, (i32 8))), i16), (t2REVSH rGPR:$Rm)>; +def : T2Pat<(or (sra (shl rGPR:$Rm, (i32 24)), (i32 16)), + (and (srl rGPR:$Rm, (i32 8)), 0xFF)), + (t2REVSH rGPR:$Rm)>; + def : T2Pat<(sra (bswap rGPR:$Rm), (i32 16)), (t2REVSH rGPR:$Rm)>; def t2PKHBT : T2ThreeReg< @@ -2854,16 +2867,15 @@ class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, SizeFlagVal sz, } let mayLoad = 1 in { -def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$Rt), (ins t2addrmode_reg:$addr), AddrModeNone, - Size4Bytes, NoItinerary, "ldrexb", "\t$Rt, $addr", - "", []>; -def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$Rt), (ins t2addrmode_reg:$addr), AddrModeNone, - Size4Bytes, NoItinerary, "ldrexh", "\t$Rt, $addr", - "", []>; -def t2LDREX : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_reg:$addr), AddrModeNone, - Size4Bytes, NoItinerary, - "ldrex", "\t$Rt, $addr", "", - []> { +def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$Rt), (ins t2addrmode_reg:$addr), + AddrModeNone, Size4Bytes, NoItinerary, + "ldrexb", "\t$Rt, $addr", "", []>; +def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$Rt), (ins t2addrmode_reg:$addr), + AddrModeNone, Size4Bytes, NoItinerary, + "ldrexh", "\t$Rt, $addr", "", []>; +def t2LDREX : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_reg:$addr), + AddrModeNone, Size4Bytes, NoItinerary, + "ldrex", "\t$Rt, $addr", "", []> { let Inst{31-27} = 0b11101; let Inst{26-20} = 0b0000101; let Inst{11-8} = 0b1111; @@ -2874,7 +2886,9 @@ def t2LDREX : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_reg:$addr), AddrModeNone let Inst{19-16} = addr; let Inst{15-12} = Rt; } -def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2), (ins t2addrmode_reg:$addr), +let hasExtraDefRegAllocReq = 1 in +def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2), + (ins t2addrmode_reg:$addr), AddrModeNone, Size4Bytes, NoItinerary, "ldrexd", "\t$Rt, $Rt2, $addr", "", [], {?, ?, ?, ?}> { @@ -2884,12 +2898,14 @@ def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2), (ins t2addrmode_reg:$ } let mayStore = 1, Constraints = "@earlyclobber $Rd" in { -def t2STREXB : T2I_strex<0b00, (outs rGPR:$Rd), (ins rGPR:$Rt, t2addrmode_reg:$addr), - AddrModeNone, Size4Bytes, NoItinerary, - "strexb", "\t$Rd, $Rt, $addr", "", []>; -def t2STREXH : T2I_strex<0b01, (outs rGPR:$Rd), (ins rGPR:$Rt, t2addrmode_reg:$addr), - AddrModeNone, Size4Bytes, NoItinerary, - "strexh", "\t$Rd, $Rt, $addr", "", []>; +def t2STREXB : T2I_strex<0b00, (outs rGPR:$Rd), + (ins rGPR:$Rt, t2addrmode_reg:$addr), + AddrModeNone, Size4Bytes, NoItinerary, + "strexb", "\t$Rd, $Rt, $addr", "", []>; +def t2STREXH : T2I_strex<0b01, (outs rGPR:$Rd), + (ins rGPR:$Rt, t2addrmode_reg:$addr), + AddrModeNone, Size4Bytes, NoItinerary, + "strexh", "\t$Rd, $Rt, $addr", "", []>; def t2STREX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, t2addrmode_reg:$addr), AddrModeNone, Size4Bytes, NoItinerary, "strex", "\t$Rd, $Rt, $addr", "", @@ -2905,6 +2921,9 @@ def t2STREX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, t2addrmode_reg:$addr), let Inst{19-16} = addr; let Inst{15-12} = Rt; } +} + +let hasExtraSrcRegAllocReq = 1, Constraints = "@earlyclobber $Rd" in def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd), (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_reg:$addr), AddrModeNone, Size4Bytes, NoItinerary, @@ -2913,7 +2932,6 @@ def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd), bits<4> Rt2; let Inst{11-8} = Rt2; } -} // Clear-Exclusive 
is for disassembly only. def t2CLREX : T2XI<(outs), (ins), NoItinerary, "clrex", @@ -2952,16 +2970,15 @@ let isCall = 1, // here, and we're using the stack frame for the containing function to // save/restore registers, we can't keep anything live in regs across // the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon -// when we get here from a longjmp(). We force everthing out of registers +// when we get here from a longjmp(). We force everything out of registers // except for our own input by listing the relevant registers in Defs. By // doing so, we also cause the prologue/epilogue code to actively preserve // all of the callee-saved resgisters, which is exactly what we want. // $val is a scratch register for our use. let Defs = - [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, D0, - D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15, - D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, - D31 ], hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1 in { + [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR, + QQQQ0, QQQQ1, QQQQ2, QQQQ3 ], + hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1 in { def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val), AddrModeNone, SizeSpecial, NoItinerary, "", "", [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>, @@ -2969,7 +2986,7 @@ let Defs = } let Defs = - [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ], + [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR ], hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1 in { def t2Int_eh_sjlj_setjmp_nofp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val), AddrModeNone, SizeSpecial, NoItinerary, "", "", @@ -3225,19 +3242,20 @@ class T2RFE<bits<12> op31_20, dag oops, dag iops, InstrItinClass itin, bits<4> Rn; let Inst{19-16} = Rn; + let Inst{15-0} = 0xc000; } def t2RFEDBW : T2RFE<0b111010000011, - (outs), (ins rGPR:$Rn), NoItinerary, "rfedb", "\t$Rn!", + (outs), (ins GPR:$Rn), NoItinerary, "rfedb", "\t$Rn!", [/* For disassembly only; pattern left blank */]>; def t2RFEDB : T2RFE<0b111010000001, - (outs), (ins rGPR:$Rn), NoItinerary, "rfeab", "\t$Rn", + (outs), (ins GPR:$Rn), NoItinerary, "rfedb", "\t$Rn", [/* For disassembly only; pattern left blank */]>; def t2RFEIAW : T2RFE<0b111010011011, - (outs), (ins rGPR:$Rn), NoItinerary, "rfeia", "\t$Rn!", + (outs), (ins GPR:$Rn), NoItinerary, "rfeia", "\t$Rn!", [/* For disassembly only; pattern left blank */]>; def t2RFEIA : T2RFE<0b111010011001, - (outs), (ins rGPR:$Rn), NoItinerary, "rfeia", "\t$Rn", + (outs), (ins GPR:$Rn), NoItinerary, "rfeia", "\t$Rn", [/* For disassembly only; pattern left blank */]>; //===----------------------------------------------------------------------===// @@ -3339,9 +3357,10 @@ def t2MSR : T2SpecialReg<0b111100111000 /* op31-20 */, 0b10 /* op15-14 */, // Move between coprocessor and ARM core register -- for disassembly only // -class t2MovRCopro<string opc, bit direction, dag oops, dag iops> +class t2MovRCopro<string opc, bit direction, dag oops, dag iops, + list<dag> pattern> : T2Cop<oops, iops, !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), - [/* For disassembly only; pattern left blank */]> { + pattern> { let Inst{27-24} = 0b1110; let Inst{20} = direction; let Inst{4} = 1; @@ -3363,15 +3382,21 @@ class t2MovRCopro<string opc, bit direction, dag oops, dag iops> def t2MCR2 : t2MovRCopro<"mcr2", 0 /* from ARM core register to coprocessor */, (outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, 
c_imm:$CRn, - c_imm:$CRm, i32imm:$opc2)>; + c_imm:$CRm, i32imm:$opc2), + [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, + imm:$CRm, imm:$opc2)]>; def t2MRC2 : t2MovRCopro<"mrc2", 1 /* from coprocessor to ARM core register */, (outs GPR:$Rt), (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn, - c_imm:$CRm, i32imm:$opc2)>; + c_imm:$CRm, i32imm:$opc2), []>; + +def : T2v6Pat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, + imm:$CRm, imm:$opc2), + (t2MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; -class t2MovRRCopro<string opc, bit direction> +class t2MovRRCopro<string opc, bit direction, + list<dag> pattern = [/* For disassembly only */]> : T2Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm), - !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), - [/* For disassembly only; pattern left blank */]> { + !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern> { let Inst{27-24} = 0b1100; let Inst{23-21} = 0b010; let Inst{20} = direction; @@ -3390,7 +3415,9 @@ class t2MovRRCopro<string opc, bit direction> } def t2MCRR2 : t2MovRRCopro<"mcrr2", - 0 /* from ARM core register to coprocessor */>; + 0 /* from ARM core register to coprocessor */, + [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt, + GPR:$Rt2, imm:$CRm)]>; def t2MRRC2 : t2MovRRCopro<"mrrc2", 1 /* from coprocessor to ARM core register */>; @@ -3401,7 +3428,8 @@ def t2MRRC2 : t2MovRRCopro<"mrrc2", def t2CDP2 : T2Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2), "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", - [/* For disassembly only; pattern left blank */]> { + [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, + imm:$CRm, imm:$opc2)]> { let Inst{27-24} = 0b1110; bits<4> opc1; diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index a731731..b4c3239 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -94,7 +94,8 @@ multiclass vfp_ldst_mult<string asm, bit L_bit, let Inst{20} = L_bit; } def DIA_UPD : - AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops), + AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, + variable_ops), IndexModeUpd, itin_upd, !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []> { let Inst{24-23} = 0b01; // Increment After @@ -102,7 +103,8 @@ multiclass vfp_ldst_mult<string asm, bit L_bit, let Inst{20} = L_bit; } def DDB_UPD : - AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops), + AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, + variable_ops), IndexModeUpd, itin_upd, !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> { let Inst{24-23} = 0b10; // Decrement Before @@ -124,7 +126,8 @@ multiclass vfp_ldst_mult<string asm, bit L_bit, let D = VFPNeonDomain; } def SIA_UPD : - AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, variable_ops), + AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, + variable_ops), IndexModeUpd, itin_upd, !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []> { let Inst{24-23} = 0b01; // Increment After @@ -136,7 +139,8 @@ multiclass vfp_ldst_mult<string asm, bit L_bit, let D = VFPNeonDomain; } def SDB_UPD : - AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, variable_ops), + AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, + variable_ops), IndexModeUpd, itin_upd, !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> { let Inst{24-23} = 0b10; // Decrement Before @@ -447,6 +451,10 @@ def VMOVRS : AVConv2I<0b11100001, 
0b1010, let Inst{6-5} = 0b00; let Inst{3-0} = 0b0000; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; } def VMOVSR : AVConv4I<0b11100000, 0b1010, @@ -464,6 +472,10 @@ def VMOVSR : AVConv4I<0b11100000, 0b1010, let Inst{6-5} = 0b00; let Inst{3-0} = 0b0000; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; } let neverHasSideEffects = 1 in { @@ -483,6 +495,10 @@ def VMOVRRD : AVConv3I<0b11000101, 0b1011, let Inst{19-16} = Rt2; let Inst{7-6} = 0b00; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; } def VMOVRRS : AVConv3I<0b11000101, 0b1010, @@ -490,6 +506,10 @@ def VMOVRRS : AVConv3I<0b11000101, 0b1010, IIC_fpMOVDI, "vmov", "\t$wb, $dst2, $src1, $src2", [/* For disassembly only; pattern left blank */]> { let Inst{7-6} = 0b00; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; } } // neverHasSideEffects @@ -512,6 +532,10 @@ def VMOVDRR : AVConv5I<0b11000100, 0b1011, let Inst{19-16} = Rt2; let Inst{7-6} = 0b00; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; } let neverHasSideEffects = 1 in @@ -520,6 +544,10 @@ def VMOVSRR : AVConv5I<0b11000100, 0b1010, IIC_fpMOVID, "vmov", "\t$dst1, $dst2, $src1, $src2", [/* For disassembly only; pattern left blank */]> { let Inst{7-6} = 0b00; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; } // FMRDH: SPR -> GPR diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index ac5cbfe..f4645f1 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -761,7 +761,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, MIB.addOperand(MI->getOperand(OpNum)); // Transfer memoperands. - (*MIB).setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); MBB.erase(MBBI); return true; @@ -947,8 +947,8 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, return true; } -/// isMemoryOp - Returns true if instruction is a memory operations (that this -/// pass is capable of operating on). +/// isMemoryOp - Returns true if instruction is a memory operation that this +/// pass is capable of operating on. static bool isMemoryOp(const MachineInstr *MI) { // When no memory operands are present, conservatively assume unaligned, // volatile, unfoldable. @@ -1287,14 +1287,14 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize, CurrPred, CurrPredReg, Scratch, MemOps, Merges); - // Try folding preceeding/trailing base inc/dec into the generated + // Try folding preceding/trailing base inc/dec into the generated // LDM/STM ops. for (unsigned i = 0, e = Merges.size(); i < e; ++i) if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI)) ++NumMerges; NumMerges += Merges.size(); - // Try folding preceeding/trailing base inc/dec into those load/store + // Try folding preceding/trailing base inc/dec into those load/store // that were not merged to form LDM/STM ops. 
for (unsigned i = 0; i != NumMemOps; ++i) if (!MemOps[i].Merged) @@ -1304,7 +1304,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { // RS may be pointing to an instruction that's deleted. RS->skipTo(prior(MBBI)); } else if (NumMemOps == 1) { - // Try folding preceeding/trailing base inc/dec into the single + // Try folding preceding/trailing base inc/dec into the single // load/store. if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) { ++NumMerges; @@ -1334,7 +1334,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { } /// MergeReturnIntoLDM - If this is a exit BB, try merging the return ops -/// ("bx lr" and "mov pc, lr") into the preceeding stack restore so it +/// ("bx lr" and "mov pc, lr") into the preceding stack restore so it /// directly restore the value of LR into pc. /// ldmfd sp!, {..., lr} /// bx lr @@ -1672,10 +1672,14 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, Ops.pop_back(); Ops.pop_back(); + const TargetInstrDesc &TID = TII->get(NewOpc); + const TargetRegisterClass *TRC = TID.OpInfo[0].getRegClass(TRI); + MRI->constrainRegClass(EvenReg, TRC); + MRI->constrainRegClass(OddReg, TRC); + // Form the pair instruction. if (isLd) { - MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, - dl, TII->get(NewOpc)) + MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, TID) .addReg(EvenReg, RegState::Define) .addReg(OddReg, RegState::Define) .addReg(BaseReg); @@ -1687,8 +1691,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, MIB.addImm(Offset).addImm(Pred).addReg(PredReg); ++NumLDRDFormed; } else { - MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, - dl, TII->get(NewOpc)) + MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, TID) .addReg(EvenReg) .addReg(OddReg) .addReg(BaseReg); diff --git a/lib/Target/ARM/ARMMCAsmInfo.cpp b/lib/Target/ARM/ARMMCAsmInfo.cpp index a3f89e9..53b4c95 100644 --- a/lib/Target/ARM/ARMMCAsmInfo.cpp +++ b/lib/Target/ARM/ARMMCAsmInfo.cpp @@ -70,8 +70,6 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo() { WeakRefDirective = "\t.weak\t"; HasLCOMMDirective = true; - DwarfRequiresFrameSection = false; - SupportsDebugInformation = true; // Exceptions handling diff --git a/lib/Target/ARM/ARMMCCodeEmitter.cpp b/lib/Target/ARM/ARMMCCodeEmitter.cpp index 10607b1..c5f727d 100644 --- a/lib/Target/ARM/ARMMCCodeEmitter.cpp +++ b/lib/Target/ARM/ARMMCCodeEmitter.cpp @@ -269,10 +269,15 @@ public: unsigned getMsbOpValue(const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getSsatBitPosValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getRegisterListOpValue(const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const; unsigned getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; unsigned getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const; unsigned getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op, @@ -1122,6 +1127,13 @@ getMsbOpValue(const MCInst &MI, unsigned Op, } unsigned ARMMCCodeEmitter:: +getSsatBitPosValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + // For ssat instructions, the bit position should be encoded decremented by 1 + return MI.getOperand(Op).getImm()-1; +} + +unsigned ARMMCCodeEmitter:: getRegisterListOpValue(const MCInst 
&MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const { // VLDM/VSTM: @@ -1178,6 +1190,30 @@ getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op, return RegNo | (Align << 4); } +/// getAddrMode6OneLane32AddressOpValue - Encode an addrmode6 register number +/// along with the alignment operand for use in VST1 and VLD1 with size 32. +unsigned ARMMCCodeEmitter:: +getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &Reg = MI.getOperand(Op); + const MCOperand &Imm = MI.getOperand(Op + 1); + + unsigned RegNo = getARMRegisterNumbering(Reg.getReg()); + unsigned Align = 0; + + switch (Imm.getImm()) { + default: break; + case 2: + case 4: + case 8: + case 16: Align = 0x00; break; + case 32: Align = 0x03; break; + } + + return RegNo | (Align << 4); +} + + /// getAddrMode6DupAddressOpValue - Encode an addrmode6 register number and /// alignment operand for use in VLD-dup instructions. This is the same as /// getAddrMode6AddressOpValue except for the alignment encoding, which is diff --git a/lib/Target/ARM/ARMMCExpr.h b/lib/Target/ARM/ARMMCExpr.h index d42f766..0a2e883 100644 --- a/lib/Target/ARM/ARMMCExpr.h +++ b/lib/Target/ARM/ARMMCExpr.h @@ -60,6 +60,9 @@ public: bool EvaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout) const; void AddValueSymbols(MCAssembler *) const; + const MCSection *FindAssociatedSection() const { + return getSubExpr()->FindAssociatedSection(); + } static bool classof(const MCExpr *E) { return E->getKind() == MCExpr::Target; diff --git a/lib/Target/ARM/ARMPerfectShuffle.h b/lib/Target/ARM/ARMPerfectShuffle.h index edecc4b..18e1620 100644 --- a/lib/Target/ARM/ARMPerfectShuffle.h +++ b/lib/Target/ARM/ARMPerfectShuffle.h @@ -21,6566 +21,6566 @@ // This table is 6561*4 = 26244 bytes in size. 
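// (A note on the size: each of the four lanes of a 4-element shuffle mask may
// select any of the eight input lanes (two 4-element sources) or be undef
// ('u'), giving 9 choices per lane and 9^4 = 6561 possible masks; each entry
// is a 4-byte unsigned, so 6561*4 = 26244 bytes, with one sentinel entry
// appended, hence the [6561+1] bound.)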
static const unsigned PerfectShuffleTable[6561+1] = { - 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS - 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS - 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0> - 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> - 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS - 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3> - 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3> - 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> - 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS - 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0> - 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS - 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS - 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0> - 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5> - 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7> - 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1> - 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1> - 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS - 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0> - 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1> - 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS - 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0> - 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6> - 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6> - 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7> - 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> - 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS - 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> - 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0> - 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0> - 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3> - 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6> - 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6> - 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7> - 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0> - 2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> - 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1> - 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS - 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS - 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4> - 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6> - 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS - 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS - 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5> - 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS - 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7> - 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3> - 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7> - 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5> - 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6> - 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5> - 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7> - 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7> - 2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7> - 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7> - 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS - 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3> - 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7> - 3659762998U, 
[elided diff hunk: a long run of removed, machine-generated perfect-shuffle
cost-table entries covering shuffle masks <0,0,6,4> through <1,6,2,u>. Each
removed line has the form

- 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0>

i.e. a 32-bit table entry plus a generated comment giving the shuffle mask,
its cost, the chosen operation (copy, vdup0-3, vrev, vext1-3, vuzpl/r,
vzipl/r, or vtrnl/r), and that operation's operands (LHS, RHS, or another
shuffle mask).]
vzipr <3,0,1,2>, RHS - 2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS - 3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1> - 2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3> - 3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3> - 2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS - 4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5> - 2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3> - 3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7> - 3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u> - 3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1> - 3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0> - 3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4> - 3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4> - 3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5> - 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS - 3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS - 4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS - 2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS - 3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2> - 3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5> - 3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5> - 3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5> - 3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6> - 3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5> - 2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0> - 2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS - 2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS - 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1> - 3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6> - 3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7> - 3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0> - 3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS - 3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7> - 2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6> - 2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7> - 2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7> - 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1> - 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1> - 2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3> - 3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7> - 2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5> - 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1> - 2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7> - 4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS - 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1> - 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1> - 2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS - 3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2> - 3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1> - 2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5> - 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS - 2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7> - 3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7> - 1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1> - 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1> - 2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS - 3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1> - 3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, 
<0,3,2,1> - 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1> - 3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0> - 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7> - 2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7> - 2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS - 2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS - 2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1> - 3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0> - 2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1> - 2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS - 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7> - 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7> - 2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1> - 2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7> - 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS - 3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7> - 3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2> - 2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1> - 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS - 3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7> - 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2> - 3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2> - 2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS - 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS - 2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7> - 2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2> - 2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3> - 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS - 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3> - 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3> - 2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2> - 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS - 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1> - 3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1> - 3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4> - 3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4> - 3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4> - 2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS - 3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0> - 3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6> - 2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS - 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS - 2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7> - 3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2> - 2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5> - 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS - 4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6> - 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7> - 3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS - 2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS - 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1> - 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0> - 3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7> - 3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6> - 3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS - 3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6> - 3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6> - 2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0> - 2725508637U, // 
<1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0> - 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1> - 3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1> - 3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1> - 3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0> - 3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS - 2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7> - 3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7> - 2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7> - 2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1> - 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS - 2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS - 2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2> - 2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u> - 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS - 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u> - 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3> - 2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2> - 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS - 1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u> - 1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS - 2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2> - 2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u> - 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u> - 1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1> - 2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2> - 2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1> - 1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS - 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS - 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS - 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS - 1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS - 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS - 2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7> - 3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS - 2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS - 202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS - 2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> - 2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS - 2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> - 2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> - 2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> - 2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS - 2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> - 2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS - 2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> - 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS - 1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> - 115726126U, // <1,u,3,2>: Cost 1 vrev LHS - 2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS - 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS - 1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3> - 1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> - 2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS - 403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS - 2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1> - 2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0> - 2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4> - 2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4> - 2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS - 1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS - 2819449180U, // 
<1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6> - 2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6> - 1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS - 2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS - 2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7> - 3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS - 2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS - 2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS - 2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS - 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS - 1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS - 1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS - 2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u> - 2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0> - 2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3> - 2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7> - 2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4> - 3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7> - 2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u> - 2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> - 2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> - 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u> - 2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1> - 2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1> - 2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7> - 2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS - 2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u> - 2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7> - 2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7> - 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u> - 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS - 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS - 115767091U, // <1,u,u,2>: Cost 1 vrev LHS - 1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS - 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS - 1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS - 1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS - 1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS - 403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS - 2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0> - 2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1> - 1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2> - 2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0> - 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS - 3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5> - 2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0> - 4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7> - 1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2> - 2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1> - 2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0> - 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS - 2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS - 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS - 3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7> - 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1> - 3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2> - 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS - 1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS - 2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2> - 2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0> - 2551482518U, // <2,0,2,3>: Cost 3 
vext1 <0,2,0,2>, <3,0,1,2> - 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS - 2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3> - 2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2> - 2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2> - 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS - 2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> - 2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1> - 2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2> - 4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3> - 3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS - 3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5> - 4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6> - 3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7> - 2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u> - 2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS - 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5> - 2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6> - 3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2> - 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS - 2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS - 2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS - 3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5> - 2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6> - 3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS - 3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5> - 2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7> - 3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5> - 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5> - 3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5> - 3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0> - 2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS - 2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5> - 4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0> - 2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS - 2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7> - 3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6> - 2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6> - 3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0> - 3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6> - 3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7> - 2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6> - 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> - 3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3> - 2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1> - 3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0> - 3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6> - 3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2> - 3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0> - 3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7> - 2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> - 1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS - 2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1> - 1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS - 2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2> - 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS - 2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS - 2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS - 2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS - 1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS - 2569453670U, // 
<2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS - 2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS - 3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0> - 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2> - 2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS - 2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2> - 3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0> - 2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1> - 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2> - 2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS - 2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1> - 2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0> - 2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3> - 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS - 3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7> - 3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1> - 3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2> - 2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3> - 2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS - 3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1> - 2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2> - 2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0> - 2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS - 3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3> - 3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7> - 2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0> - 2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0> - 2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS - 2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1> - 2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2> - 2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3> - 2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS - 2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5> - 2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> - 4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7> - 2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u> - 2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5> - 3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6> - 3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5> - 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4> - 3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS - 2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS - 3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS - 3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6> - 1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4> - 2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS - 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7> - 3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1> - 2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7> - 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS - 3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5> - 3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5> - 3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS - 2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7> - 2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS - 3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2> - 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7> - 3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS - 2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS - 4029382994U, // 
<2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5> - 3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6> - 3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1> - 2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS - 2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2> - 3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1> - 3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0> - 2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7> - 3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6> - 3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7> - 3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1> - 3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u> - 2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2> - 2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS - 2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1> - 2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2> - 1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u> - 2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS - 2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5> - 2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> - 2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1> - 1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u> - 1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2> - 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS - 2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0> - 2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2> - 2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS - 3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7> - 3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6> - 3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2> - 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2> - 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2> - 2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1> - 2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0> - 2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS - 3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS - 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7> - 3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3> - 3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1> - 2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS - 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS - 2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2> - 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS - 2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3> - 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS - 2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7> - 2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7> - 2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2> - 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS - 2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1> - 2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0> - 2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2> - 1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS - 2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5> - 3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5> - 2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6> - 2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3> - 1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS - 2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS - 3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, 
<1,0,3,2> - 2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5> - 3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5> - 2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS - 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS - 2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS - 3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4> - 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS - 3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2> - 2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3> - 2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7> - 4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS - 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5> - 2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5> - 2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0> - 2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS - 2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS - 3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6> - 3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3> - 2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3> - 2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7> - 3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS - 3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7> - 3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6> - 2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2> - 2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7> - 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1> - 3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5> - 2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2> - 4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS - 2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6> - 3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7> - 3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7> - 2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7> - 2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1> - 1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS - 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS - 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS - 1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS - 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS - 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS - 2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6> - 2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS - 269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS - 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> - 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS - 1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2> - 2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3> - 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> - 2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7> - 2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7> - 2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0> - 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS - 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> - 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> - 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> - 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3> - 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS - 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> - 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7> - 2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0> - 1548985709U, // <2,3,1,u>: 
Cost 2 vext2 LHS, <1,u,1,3> - 2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2> - 2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1> - 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> - 1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1> - 2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5> - 2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7> - 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> - 2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1> - 1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1> - 1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2> - 2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3> - 1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3> - 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3> - 1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6> - 2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7> - 2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3> - 2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7> - 1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2> - 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS - 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4> - 2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0> - 2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2> - 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS - 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS - 1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6> - 2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4> - 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS - 2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS - 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3> - 2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4> - 2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6> - 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> - 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> - 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> - 1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7> - 1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7> - 2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2> - 2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6> - 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3> - 2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5> - 2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6> - 2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7> - 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> - 1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1> - 1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1> - 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2> - 2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3> - 2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3> - 2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1> - 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6> - 2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2> - 2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1> - 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> - 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2> - 1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2> - 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS - 1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0> - 1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1> - 1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6> - 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS - 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7> - 1592121600U, // <2,3,u,7>: Cost 2 
vext2 LHS, <u,7,0,1> - 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS - 2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4> - 2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS - 2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4> - 2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4> - 2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2> - 2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1> - 1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2> - 3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1> - 1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2> - 2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2> - 3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1> - 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4> - 2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS - 3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4> - 3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7> - 3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3> - 3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1> - 2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS - 2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4> - 3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3> - 2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4> - 2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5> - 2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4> - 2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS - 3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS - 3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0> - 2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS - 2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4> - 4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1> - 3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4> - 3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3> - 3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4> - 2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5> - 2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6> - 3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7> - 2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u> - 2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2> - 3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1> - 3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2> - 3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3> - 2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4> - 2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS - 2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS - 4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7> - 2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS - 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5> - 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2> - 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5> - 2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5> - 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS - 3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5> - 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS - 2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS - 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS - 1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS - 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2> - 2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4> - 
2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2> - 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS - 2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS - 2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6> - 2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1> - 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS - 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> - 3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2> - 3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4> - 3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4> - 3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6> - 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0> - 2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1> - 3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7> - 2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> - 1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS - 2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS - 2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2> - 2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2> - 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS - 2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5> - 1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS - 2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS - 1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS - 3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0> - 2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS - 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2> - 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5> - 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5> - 2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5> - 3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7> - 3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS - 2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS - 3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2> - 3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1> - 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0> - 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7> - 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5> - 3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7> - 3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2> - 2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3> - 2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5> - 3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2> - 4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1> - 2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2> - 3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1> - 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5> - 3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1> - 3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7> - 3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS - 3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS - 2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS - 2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3> - 2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5> - 3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3> - 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6> - 2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3> - 2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6> - 
2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7> - 2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u> - 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS - 3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4> - 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5> - 2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4> - 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS - 2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS - 3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6> - 3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS - 2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS - 2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS - 3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7> - 3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5> - 3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5> - 2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS - 2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5> - 4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6> - 2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7> - 2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7> - 2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS - 3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6> - 2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3> - 3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6> - 2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5> - 3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7> - 4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6> - 3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS - 3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS - 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS - 2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2> - 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7> - 3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2> - 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS - 2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7> - 2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2> - 4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS - 2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS - 2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS - 2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS - 2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u> - 2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u> - 2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6> - 2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS - 2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6> - 2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7> - 2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS - 2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0> - 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS - 2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2> - 3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0> - 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6> - 3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6> - 2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1> - 2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2> - 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS - 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2> - 2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1> - 2620097430U, // <2,6,1,2>: Cost 3 vext2 
<0,4,2,6>, <1,2,3,0> - 2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS - 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6> - 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7> - 3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7> - 2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2> - 2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS - 2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1> - 2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3> - 2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2> - 2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1> - 2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6> - 2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7> - 2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6> - 2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS - 2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1> - 2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2> - 3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1> - 2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7> - 2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3> - 2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6> - 3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5> - 3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6> - 1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS - 1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS - 2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2> - 2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3> - 2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0> - 3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4> - 2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4> - 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS - 2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6> - 4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS - 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS - 3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2> - 2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3> - 2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7> - 3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5> - 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5> - 2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5> - 2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0> - 2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS - 2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS - 2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS - 3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1> - 2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6> - 3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3> - 2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS - 3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5> - 2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6> - 2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS - 2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS - 2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1> - 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2> - 2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2> - 2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0> - 2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5> - 3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7> - 2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, 
<6,7,6,2>
-  2726253452U,	// <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
-  2712834966U,	// <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
-  2620102355U,	// <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2>
-  1546360622U,	// <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
-  2620102536U,	// <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3>
[about 1,240 further removed lines of this hunk elided: generated shuffle-cost table entries of the identical form "-  <encoded value>U,	// <a,b,c,d>: Cost N <op> ...", one entry per shuffle tuple, running from <2,6,u,3> through <4,4,2,3>]
-  2644117269U,	// <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4>
-  3705251698U,	// <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7>
-  2710047808U,	// <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4>
-  3783863369U,	// <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4>
-  2634827874U,	// <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4>
-  2644117654U,	//
<4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2> - 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4> - 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3> - 2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> - 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4> - 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5> - 3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4> - 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4> - 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> - 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS - 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4> - 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2> - 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4> - 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS - 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS - 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS - 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> - 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS - 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS - 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4> - 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5> - 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5> - 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS - 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS - 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS - 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5> - 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS - 2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS - 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2> - 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5> - 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6> - 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS - 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6> - 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS - 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4> - 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS - 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2> - 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4> - 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7> - 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4> - 2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> - 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4> - 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4> - 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7> - 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> - 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS - 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS - 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS - 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u> - 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS - 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS - 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS - 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u> - 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS - 2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0> - 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS - 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5> - 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0> - 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5> - 3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7> - 
2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5> - 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0> - 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS - 2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2> - 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5> - 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0> - 2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS - 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4> - 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5> - 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6> - 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3> - 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS - 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS - 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3> - 2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5> - 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5> - 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5> - 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7> - 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7> - 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS - 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5> - 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2> - 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1> - 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4> - 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3> - 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0> - 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5> - 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4> - 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5> - 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2> - 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS - 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4> - 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5> - 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4> - 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS - 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS - 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5> - 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6> - 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS - 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS - 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3> - 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4> - 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2> - 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5> - 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5> - 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0> - 2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS - 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS - 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS - 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6> - 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6> - 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6> - 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS - 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5> - 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6> - 27705344U, // <4,5,6,7>: Cost 0 copy RHS - 27705344U, // <4,5,6,u>: Cost 0 copy RHS - 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS - 2565161882U, // <4,5,7,1>: Cost 3 vext1 
<2,4,5,7>, <1,2,3,4> - 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7> - 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5> - 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS - 2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7> - 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4> - 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4> - 2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS - 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS - 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS - 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0> - 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u> - 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS - 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS - 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7> - 27705344U, // <4,5,u,7>: Cost 0 copy RHS - 27705344U, // <4,5,u,u>: Cost 0 copy RHS - 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0> - 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS - 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6> - 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0> - 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5> - 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7> - 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0> - 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS - 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS - 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2> - 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1> - 2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0> - 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3> - 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5> - 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7> - 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7> - 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS - 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3> - 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4> - 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3> - 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2> - 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1> - 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6> - 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7> - 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7> - 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3> - 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1> - 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2> - 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6> - 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2> - 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3> - 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6> - 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6> - 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6> - 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4> - 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2> - 2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS - 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3> - 2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4> - 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4> - 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS - 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS - 1705430326U, 
// <4,6,4,6>: Cost 2 vuzpl RHS, RHS - 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4> - 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS - 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS - 2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3> - 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3> - 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3> - 2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6> - 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5> - 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6> - 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS - 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS - 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS - 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2> - 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3> - 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2> - 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6> - 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6> - 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6> - 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS - 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS - 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2> - 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2> - 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7> - 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4> - 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6> - 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> - 3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3> - 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> - 2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2> - 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS - 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS - 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS - 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1> - 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u> - 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS - 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS - 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS - 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS - 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0> - 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS - 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4> - 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4> - 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5> - 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0> - 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7> - 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4> - 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS - 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1> - 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1> - 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4> - 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5> - 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS - 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7> - 3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7> - 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3> - 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1> - 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS - 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3> - 3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2> - 2727744688U, // 
<4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4> - 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7> - 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7> - 3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7> - 3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3> - 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7> - 3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2> - 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4> - 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4> - 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3> - 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6> - 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7> - 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7> - 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4> - 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7> - 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1> - 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3> - 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7> - 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5> - 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4> - 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS - 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4> - 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7> - 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS - 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2> - 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7> - 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5> - 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5> - 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6> - 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5> - 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5> - 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7> - 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2> - 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS - 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2> - 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2> - 2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2> - 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS - 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6> - 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3> - 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7> - 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS - 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS - 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4> - 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7> - 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4> - 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7> - 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7> - 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7> - 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7> - 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7> - 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS - 2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS - 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2> - 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2> - 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS - 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u> - 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3> - 3047757420U, // <4,7,u,7>: 
Cost 3 vtrnl RHS, <7,7,7,7> - 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS - 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0> - 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS - 1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u> - 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2> - 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5> - 2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0> - 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u> - 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0> - 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS - 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2> - 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1> - 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS - 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3> - 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3> - 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7> - 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7> - 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3> - 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS - 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS - 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u> - 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2> - 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u> - 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u> - 2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4> - 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7> - 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3> - 1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u> - 2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2> - 2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2> - 2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u> - 2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3> - 2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6> - 2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7> - 2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> - 2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u> - 2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2> - 1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS - 2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2> - 2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4> - 2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4> - 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS - 1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS - 1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS - 2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6> - 161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS - 1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS - 1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS - 2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5> - 3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS - 1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS - 1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS - 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS - 3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS - 1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS - 1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS - 2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2> - 1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS - 1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6> - 
1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS - 1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6> - 1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS - 27705344U, // <4,u,6,7>: Cost 0 copy RHS - 27705344U, // <4,u,6,u>: Cost 0 copy RHS - 2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS - 2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4> - 2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7> - 2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u> - 2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS - 2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> - 2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7> - 2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> - 2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS - 1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS - 1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS - 1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS - 1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u> - 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS - 1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS - 1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS - 27705344U, // <4,u,u,7>: Cost 0 copy RHS - 27705344U, // <4,u,u,u>: Cost 0 copy RHS - 2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0> - 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1> - 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2> - 3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5> - 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5> - 3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0> - 3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0> - 3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0> - 2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2> - 2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS - 2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1> - 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS - 3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7> - 2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS - 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1> - 3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7> - 3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2> - 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS - 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2> - 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> - 2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4> - 2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5> - 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5> - 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5> - 2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4> - 2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> - 2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5> - 3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5> - 3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4> - 2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5> - 3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5> - 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0> - 3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4> - 3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0> - 3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7> - 2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5> - 2559459430U, // <5,0,4,0>: Cost 3 vext1 
<1,5,0,4>, LHS - 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> - 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6> - 3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5> - 2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS - 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS - 3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5> - 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> - 1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> - 3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1> - 2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS - 3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS - 3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0> - 3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5> - 2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0> - 3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0> - 3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS - 2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS - 4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0> - 2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS - 3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7> - 3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6> - 3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5> - 3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7> - 3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6> - 2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5> - 2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS - 2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS - 2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0> - 2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7> - 3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2> - 2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS - 2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0> - 2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7> - 4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7> - 2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS - 2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2> - 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5> - 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS - 2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5> - 2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6> - 2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS - 2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u> - 2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> - 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS - 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0> - 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS - 2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2> - 2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2> - 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5> - 2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0> - 3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7> - 3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0> - 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS - 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1> - 2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1> - 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0> - 2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3> - 2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5> - 
2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5> - 3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5> - 3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5> - 2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3> - 3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2> - 3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3> - 2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2> - 2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0> - 2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5> - 2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3> - 2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7> - 3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0> - 2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0> - 2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS - 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3> - 3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5> - 2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5> - 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5> - 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7> - 3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7> - 2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5> - 2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3> - 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1> - 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5> - 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5> - 2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5> - 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4> - 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS - 2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6> - 3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4> - 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1> - 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1> - 2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1> - 3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1> - 2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7> - 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5> - 2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5> - 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0> - 2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7> - 2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1> - 2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS - 3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7> - 2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3> - 3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7> - 2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS - 2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7> - 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6> - 2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1> - 2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1> - 2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS - 2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1> - 2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1> - 2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS - 2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS - 2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3> - 4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6> - 2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7> - 2040971371U, // <5,1,7,u>: Cost 2 vtrnr 
RHS, LHS - 1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1> - 1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS - 2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5> - 2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS - 2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5> - 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS - 2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7> - 2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1> - 2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS - 3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0> - 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS - 2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2> - 3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2> - 3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1> - 3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1> - 3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4> - 3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0> - 2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS - 2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2> - 3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2> - 3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5> - 2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5> - 3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS - 3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0> - 3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3> - 3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1> - 2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5> - 3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS - 3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3> - 2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2> - 2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3> - 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5> - 3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7> - 3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6> - 3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5> - 2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3> - 2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1> - 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5> - 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5> - 3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5> - 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5> - 2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5> - 3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5> - 3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7> - 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5> - 2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2> - 3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3> - 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5> - 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5> - 2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6> - 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS - 2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2> - 3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4> - 2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS - 2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS - 3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3> - 3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7> - 2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS - 
2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS - 2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5> - 3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7> - 3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1> - 2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS - 3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS - 3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3> - 2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3> - 2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7> - 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5> - 3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7> - 3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7> - 3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1> - 2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7> - 2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS - 2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2> - 2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7> - 2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS - 2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS - 4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5> - 3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6> - 3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7> - 2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS - 2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1> - 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS - 2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u> - 2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3> - 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5> - 2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS - 2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5> - 3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u> - 1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5> - 3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0> - 2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2> - 3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0> - 3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2> - 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1> - 3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2> - 3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0> - 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0> - 2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2> - 3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3> - 3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1> - 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3> - 2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5> - 3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3> - 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7> - 3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7> - 2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5> - 2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3> - 3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1> - 3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5> - 3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2> - 2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4> - 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5> - 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4> - 3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3> - 3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, 
<3,2,7,3> - 2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4> - 3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1> - 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3> - 3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2> - 2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3> - 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5> - 2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5> - 3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7> - 3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5> - 2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5> - 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5> - 3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0> - 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3> - 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5> - 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5> - 2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6> - 3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5> - 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4> - 2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6> - 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS - 2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5> - 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5> - 2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5> - 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS - 2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5> - 3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0> - 2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5> - 2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS - 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS - 3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6> - 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6> - 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6> - 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS - 3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0> - 4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6> - 2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4> - 2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS - 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS - 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7> - 2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2> - 2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2> - 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS - 2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3> - 2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3> - 3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7> - 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS - 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS - 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u> - 2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2> - 2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2> - 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS - 2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6> - 2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3> - 2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u> - 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS - 3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS - 2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS - 3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2> - 3782249348U, 
// <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5> - 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5> - 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1> - 2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0> - 3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0> - 2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS - 2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1> - 3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4> - 3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4> - 3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7> - 3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4> - 2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0> - 3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5> - 3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1> - 2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1> - 3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4> - 3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4> - 3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4> - 2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5> - 3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4> - 2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3> - 3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3> - 3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5> - 2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5> - 3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS - 3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4> - 2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4> - 3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3> - 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4> - 3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0> - 2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5> - 3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7> - 2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4> - 2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS - 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4> - 3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3> - 3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4> - 2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4> - 1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5> - 2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4> - 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4> - 1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5> - 2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS - 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5> - 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3> - 3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2> - 2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS - 2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5> - 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS - 3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS - 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS - 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS - 2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7> - 3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2> - 2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6> - 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS - 2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5> - 3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7> - 2712784270U, // <5,4,6,7>: Cost 3 vext3 
[Roughly 1,300 removed lines of the auto-generated NEON perfect-shuffle table are elided here. Each removed line is a single generated entry of the form

 -  2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS

that is, a 32-bit encoded lowering recipe followed by a comment giving the 4-lane shuffle mask, its cost, and the chosen NEON instruction (vext, vrev, vdup, vtrn, vzip, vuzp). The run spans the <5,4,*> through <7,2,*> mask blocks; the last complete entry is 3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0>.]
<7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0> - 2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1> - 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3> - 3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0> - 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3> - 2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2> - 2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1> - 2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1> - 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3> - 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2> - 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3> - 2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5> - 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7> - 2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6> - 3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5> - 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3> - 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1> - 2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5> - 2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6> - 2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7> - 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5> - 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7> - 2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6> - 2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0> - 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1> - 2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6> - 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3> - 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4> - 2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5> - 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6> - 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7> - 2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0> - 3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0> - 2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7> - 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7> - 3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3> - 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7> - 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7> - 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7> - 3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7> - 2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7> - 3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0> - 2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7> - 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS - 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3> - 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7> - 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7> - 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS - 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7> - 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7> - 3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7> - 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7> - 2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1> - 3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0> - 3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5> - 2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS - 2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS - 3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7> - 4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6> - 2554992236U, // <7,2,7,7>: Cost 3 
vext1 <0,7,2,7>, <7,7,7,7> - 2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS - 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1> - 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5> - 1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2> - 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7> - 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5> - 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7> - 2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7> - 2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0> - 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7> - 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0> - 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2> - 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0> - 2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2> - 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1> - 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2> - 2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0> - 3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0> - 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2> - 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1> - 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1> - 2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3> - 2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5> - 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6> - 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3> - 3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1> - 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5> - 2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5> - 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1> - 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0> - 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2> - 2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0> - 2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5> - 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4> - 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3> - 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3> - 2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0> - 2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1> - 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3> - 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3> - 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3> - 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4> - 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7> - 2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3> - 2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7> - 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3> - 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1> - 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2> - 2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4> - 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5> - 2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5> - 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6> - 2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6> - 3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4> - 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6> - 2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS - 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3> - 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3> - 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7> - 2237491947U, // <7,3,5,4>: Cost 3 vrev 
<3,7,4,5> - 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7> - 2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0> - 2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0> - 1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3> - 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7> - 2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3> - 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3> - 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7> - 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7> - 3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7> - 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6> - 2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7> - 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3> - 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1> - 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5> - 2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6> - 2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7> - 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5> - 2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7> - 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7> - 2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7> - 2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1> - 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u> - 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2> - 1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3> - 1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3> - 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u> - 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6> - 2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3> - 2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0> - 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2> - 2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0> - 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS - 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2> - 3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1> - 2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5> - 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> - 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> - 3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1> - 1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1> - 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2> - 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1> - 2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3> - 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7> - 2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3> - 2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0> - 2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3> - 3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3> - 2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0> - 3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1> - 3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3> - 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2> - 2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1> - 3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7> - 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> - 2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0> - 3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5> - 2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0> - 2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, 
<3,0,1,2> - 3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5> - 3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1> - 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3> - 2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6> - 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4> - 2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5> - 3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7> - 2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4> - 2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2> - 3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1> - 3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4> - 3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3> - 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4> - 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5> - 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> - 3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7> - 1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5> - 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS - 2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7> - 2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3> - 2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5> - 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS - 2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7> - 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS - 2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7> - 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS - 2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1> - 3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5> - 2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3> - 2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4> - 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6> - 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7> - 2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7> - 2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> - 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2> - 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2> - 3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2> - 3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5> - 3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6> - 2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1> - 2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> - 2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1> - 2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7> - 2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7> - 1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS - 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS - 2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3> - 2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u> - 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6> - 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1> - 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS - 2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5> - 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS - 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS - 2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS - 2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS - 3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0> - 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1> - 2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1> - 2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1> - 
2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2> - 2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS - 2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1> - 3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1> - 3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0> - 2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7> - 2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5> - 2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3> - 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1> - 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> - 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> - 3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1> - 3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0> - 3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7> - 2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4> - 3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7> - 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> - 3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3> - 2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3> - 2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4> - 3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS - 2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5> - 3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5> - 3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3> - 3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS - 2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5> - 3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5> - 2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0> - 2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0> - 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS - 3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7> - 3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5> - 3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4> - 2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS - 2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS - 2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5> - 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> - 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6> - 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1> - 2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3> - 3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3> - 3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3> - 2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4> - 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5> - 2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> - 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> - 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7> - 2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1> - 2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7> - 3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2> - 2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4> - 2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5> - 2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> - 2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7> - 1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0> - 1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0> - 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS - 2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3> - 2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2> - 2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, 
<3,7,5,7> - 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS - 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> - 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> - 2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1> - 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3> - 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS - 2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS - 2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0> - 2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u> - 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS - 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7> - 2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7> - 1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0> - 1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3> - 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0> - 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS - 2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4> - 2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0> - 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2> - 2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0> - 2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0> - 2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2> - 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS - 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1> - 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1> - 2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0> - 2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3> - 2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS - 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7> - 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> - 2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3> - 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1> - 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1> - 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3> - 2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2> - 2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1> - 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5> - 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7> - 2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7> - 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> - 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> - 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2> - 2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3> - 2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3> - 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3> - 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5> - 2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7> - 3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6> - 2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0> - 2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0> - 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1> - 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3> - 2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5> - 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4> - 2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6> - 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS - 2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0> - 2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5> - 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS - 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS - 2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3> - 
2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7> - 3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4> - 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6> - 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5> - 2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0> - 2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7> - 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5> - 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1> - 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3> - 2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3> - 3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4> - 2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4> - 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7> - 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6> - 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> - 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7> - 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1> - 2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0> - 2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2> - 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0> - 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5> - 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4> - 2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6> - 2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7> - 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1> - 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1> - 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS - 2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2> - 2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0> - 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5> - 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS - 1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6> - 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3> - 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1> - 2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0> - 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2> - 2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0> - 3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0> - 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1> - 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0> - 2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0> - 2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2> - 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2> - 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3> - 2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1> - 2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3> - 2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5> - 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS - 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3> - 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1> - 2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1> - 2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3> - 3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5> - 3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0> - 2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2> - 2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0> - 3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5> - 3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3> - 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3> - 2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3> - 2701890780U, // <7,7,2,u>: Cost 3 vext3 
<2,u,3,7>, <7,2,u,3> - 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1> - 3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3> - 3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6> - 2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7> - 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5> - 2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3> - 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7> - 2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7> - 2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7> - 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5> - 3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7> - 3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3> - 3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4> - 2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4> - 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6> - 2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4> - 2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6> - 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6> - 2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS - 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7> - 3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3> - 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7> - 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7> - 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7> - 2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7> - 2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5> - 1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7> - 2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS - 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0> - 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7> - 2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7> - 2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS - 2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7> - 1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7> - 2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0> - 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7> - 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS - 2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2> - 2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2> - 2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2> - 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS - 2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7> - 2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7> - 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS - 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS - 1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS - 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2> - 2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3> - 2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0> - 1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS - 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6> - 1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7> - 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS - 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS - 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0> - 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2> - 1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2> - 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2> - 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1> - 1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1> - 1662359296U, // 
<7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2> - 2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS - 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2> - 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS - 1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1> - 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS - 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3> - 1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS - 2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3> - 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1> - 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3> - 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS - 1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2> - 2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0> - 1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2> - 1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0> - 1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6> - 2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4> - 2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3> - 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3> - 1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0> - 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1> - 1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3> - 2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6> - 1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3> - 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5> - 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7> - 2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7> - 2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0> - 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1> - 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1> - 1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5> - 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6> - 2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5> - 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4> - 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6> - 1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6> - 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6> - 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6> - 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS - 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u> - 2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3> - 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7> - 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u> - 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u> - 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS - 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7> - 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS - 2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1> - 2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7> - 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u> - 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7> - 1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6> - 2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7> - 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u> - 1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0> - 1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7> - 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1> - 2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3> - 2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6> - 2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7> - 1638471976U, // <7,u,7,4>: Cost 2 
vext3 RHS, <u,7,4,5> - 1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7> - 2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2> - 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS - 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS - 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1> - 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2> - 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS - 1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0> - 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5> - 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6> - 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS - 363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS - 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS - 135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS - 1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> - 1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> - 2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> - 1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS - 2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3> - 2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0> - 2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> - 135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS - 1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS - 1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS - 537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS - 2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3> - 1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS - 2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7> - 1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1> - 2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2> - 537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS - 1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> - 2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> - 1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS - 2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> - 1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> - 2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> - 2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7> - 2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> - 1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> - 2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> - 2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> - 72589981U, // <u,0,3,2>: Cost 1 vrev LHS - 2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3> - 2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6> - 2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0> - 2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0> - 2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3> - 73032403U, // <u,0,3,u>: Cost 1 vrev LHS - 2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u> - 1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> - 1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> - 3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2> - 2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS - 1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS - 2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6> - 2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> - 1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> - 2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS - 1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS - 2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7> - 2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0> - 2220237489U, 
// <u,0,5,4>: Cost 3 vrev <0,u,4,5> - 2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5> - 1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0> - 2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS - 1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0> - 2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS - 2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> - 1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS - 2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0> - 2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS - 2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u> - 2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6> - 2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1> - 1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS - 2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS - 2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0> - 2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7> - 2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7> - 2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS - 2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u> - 2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7> - 2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7> - 2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS - 135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS - 1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> - 537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS - 2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> - 1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> - 1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS - 1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u> - 2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS - 537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS - 1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1> - 1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS - 2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1> - 1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> - 2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS - 2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> - 2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0> - 2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0> - 1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> - 1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS - 202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS - 2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2> - 1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> - 1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS - 2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> - 2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1> - 2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> - 202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS - 1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS - 2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> - 2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1> - 835584U, // <u,1,2,3>: Cost 0 copy LHS - 1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS - 2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> - 2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7> - 1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2> - 835584U, // <u,1,2,u>: Cost 0 copy LHS - 1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS - 1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> - 2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, 
<1,3,2,0> - 2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS - 1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS - 1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> - 2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> - 2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1> - 1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> - 1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1> - 2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u> - 2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u> - 1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4> - 2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS - 1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS - 2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1> - 2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4> - 1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4> - 1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS - 2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2> - 2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2> - 1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> - 1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS - 2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1> - 2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1> - 2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS - 1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> - 2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS - 2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> - 2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> - 2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> - 2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS - 2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> - 2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1> - 1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1> - 1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1> - 2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS - 2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7> - 2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1> - 2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS - 2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS - 2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7> - 2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u> - 2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7> - 2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS - 1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS - 202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS - 2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> - 835584U, // <u,1,u,3>: Cost 0 copy LHS - 1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS - 1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> - 2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> - 1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u> - 835584U, // <u,1,u,u>: Cost 0 copy LHS - 1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2> - 1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS - 1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2> - 2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> - 2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5> - 2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7> - 2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> - 2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u> - 1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS - 1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2> - 
2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1> - 2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0> - 2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> - 2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS - 2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7> - 2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> - 2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2> - 1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1> - 1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS - 2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2> - 269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS - 1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> - 1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS - 2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> - 2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7> - 2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2> - 269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS - 408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS - 1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> - 1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> - 1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS - 408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS - 1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> - 1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> - 1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> - 408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS - 1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> - 2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4> - 2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5> - 2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> - 2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS - 1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS - 1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2> - 2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> - 1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS - 2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS - 2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0> - 2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> - 2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> - 1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5> - 2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5> - 2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> - 2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS - 1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5> - 1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS - 2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2> - 2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2> - 1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> - 1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS - 2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> - 2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> - 2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1> - 1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> - 1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2> - 2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2> - 2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7> - 1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS - 2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS - 2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6> - 2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> - 2645726828U, // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7> - 1906901099U, // <u,2,7,u>: 
Cost 2 vzipr RHS, LHS - 408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS - 1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS - 269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS - 1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> - 408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS - 1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS - 1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS - 1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> - 408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS - 1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> - 471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS - 1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2> - 2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0> - 1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> - 2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0> - 2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7> - 2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0> - 471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS - 1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> - 1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> - 1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> - 1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3> - 2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5> - 1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> - 2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7> - 2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u> - 1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3> - 1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS - 1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2> - 1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> - 1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1> - 1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS - 2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6> - 1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> - 2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u> - 1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1> - 1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2> - 1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3> - 1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3> - 336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS - 1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6> - 2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> - 2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3> - 2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7> - 336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS - 1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS - 1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4> - 2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0> - 2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2> - 1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS - 471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS - 1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6> - 2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4> - 471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS - 2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS - 1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3> - 2567653106U, // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5> - 2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6> - 1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> - 1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> - 1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> - 1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, RHS - 1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS - 2623172897U, // 
<u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2> - 2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6> - 1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3> - 2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u> - 2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6> - 1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6> - 1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> - 1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1> - 1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6> - 1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS - 1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7> - 1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7> - 2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2> - 1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS - 2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u> - 2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u> - 1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> - 1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS - 1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2> - 471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS - 1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0> - 336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS - 1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6> - 471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS - 1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7> - 1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS - 471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS - 2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0> - 1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS - 2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4> - 2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> - 2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5> - 1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> - 1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> - 3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1> - 1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS - 2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS - 2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1> - 1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4> - 2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3> - 2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS - 1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS - 2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> - 2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> - 1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4> - 2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS - 2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u> - 2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2> - 2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5> - 2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS - 2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> - 1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS - 2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> - 1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS - 2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2> - 2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4> - 2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3> - 2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3> - 2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6> - 2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5> - 2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6> - 
3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7> - 2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u> - 1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS - 2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4> - 2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4> - 2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> - 161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS - 1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS - 1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> - 2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> - 161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS - 1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS - 2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> - 1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5> - 2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2> - 1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS - 1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS - 537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS - 2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS - 537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS - 1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS - 2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2> - 2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2> - 2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2> - 1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS - 2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> - 1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS - 2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> - 1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS - 2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS - 2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4> - 2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7> - 2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7> - 2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS - 2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> - 96808489U, // <u,4,7,6>: Cost 1 vrev RHS - 2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7> - 96955963U, // <u,4,7,u>: Cost 1 vrev RHS - 1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS - 1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS - 1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u> - 2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2> - 161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS - 1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS - 537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS - 2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS - 537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS - 2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0> - 1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS - 2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5> - 2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5> - 1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> - 2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> - 2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5> - 1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0> - 1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS - 1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS - 2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5> - 2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0> - 2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7> - 1505971510U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS - 2627167419U, // <u,5,1,5>: Cost 3 vext2 
<1,5,u,5>, <1,5,u,5> - 2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3> - 1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> - 1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> - 2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS - 2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7> - 2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5> - 1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5> - 2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS - 2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> - 2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7> - 3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS - 1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5> - 2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS - 2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3> - 2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u> - 2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3> - 2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6> - 3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5> - 2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6> - 2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS - 2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS - 2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS - 2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> - 2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5> - 2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4> - 1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5> - 1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS - 2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5> - 1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> - 1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS - 1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS - 2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7> - 2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2> - 2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2> - 1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS - 229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS - 2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> - 1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> - 229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS - 1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS - 2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6> - 2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6> - 1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6> - 1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS - 2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> - 2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7> - 27705344U, // <u,5,6,7>: Cost 0 copy RHS - 27705344U, // <u,5,6,u>: Cost 0 copy RHS - 1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS - 1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7> - 2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2> - 2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2> - 1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS - 1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> - 2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> - 2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS - 1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS - 1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS - 1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u> - 2561853032U, // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2> - 1500056392U, // 
<u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u> - 1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS - 229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS - 2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6> - 27705344U, // <u,5,u,7>: Cost 0 copy RHS - 27705344U, // <u,5,u,u>: Cost 0 copy RHS - 2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0> - 1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS - 1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6> - 2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0> - 1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6> - 2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u> - 2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0> - 2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS - 1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS - 1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1> - 2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1> - 2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0> - 2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3> - 2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6> - 2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7> - 2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> - 2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS - 1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1> - 1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS - 2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2> - 2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2> - 2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1> - 1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS - 2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6> - 2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7> - 1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> - 1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> - 2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2> - 2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3> - 2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3> - 2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3> - 1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6> - 2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6> - 3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6> - 1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS - 1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS - 2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS - 2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3> - 2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4> - 2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u> - 1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> - 1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS - 1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6> - 2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS - 1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS - 2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS - 2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3> - 2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7> - 2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> - 1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6> - 2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5> - 2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0> - 2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS - 1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5> - 1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS - 2579825398U, 
// <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2> - 2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3> - 2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2> - 1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS - 2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3> - 296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS - 1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> - 296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS - 432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS - 1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> - 1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> - 1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> - 432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS - 1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3> - 1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3> - 1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS - 432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS - 432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS - 1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS - 1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> - 1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> - 432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS - 1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS - 296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS - 1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS - 432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS - 1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> - 497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS - 1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> - 2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0> - 1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> - 1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0> - 2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7> - 2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2> - 497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS - 1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> - 1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> - 1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> - 1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7> - 2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS - 1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7> - 2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7> - 2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u> - 1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7> - 2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7> - 1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2> - 1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> - 1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> - 2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7> - 2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7> - 1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7> - 2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u> - 1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7> - 1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> - 2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u> - 2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u> - 1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> - 1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> - 1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3> - 1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3> - 2640456465U, // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7> - 1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> - 
1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> - 2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7> - 2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> - 2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4> - 1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> - 497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS - 1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> - 2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0> - 497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS - 2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> - 1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> - 2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> - 2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> - 1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> - 1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> - 1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> - 1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> - 1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> - 1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS - 2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2> - 1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> - 2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> - 1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS - 1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6> - 1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> - 1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> - 1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> - 1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> - 2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7> - 2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2> - 2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7> - 1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> - 1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7> - 1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7> - 363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS - 363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS - 1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2> - 497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS - 1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0> - 1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1> - 1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6> - 497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS - 1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7> - 363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS - 497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS - 135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS - 471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS - 1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2> - 1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2> - 1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> - 1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0> - 1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2> - 1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0> - 471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS - 1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> - 202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS - 537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS - 1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3> - 1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS - 1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7> - 1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1> - 1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3> - 537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS - 1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, 
LHS - 1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2> - 269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS - 835584U, // <u,u,2,3>: Cost 0 copy LHS - 1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS - 2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> - 1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> - 1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3> - 835584U, // <u,u,2,u>: Cost 0 copy LHS - 408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS - 1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> - 120371557U, // <u,u,3,2>: Cost 1 vrev LHS - 336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS - 408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS - 1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7> - 1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> - 1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS - 408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS - 1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS - 1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4> - 1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6> - 1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4> - 161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS - 471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS - 1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> - 1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6> - 471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS - 1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS - 1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> - 1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5> - 1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7> - 1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS - 229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS - 537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS - 1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS - 537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS - 1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS - 2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2> - 1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> - 1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7> - 1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS - 1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6> - 296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS - 27705344U, // <u,u,6,7>: Cost 0 copy RHS - 27705344U, // <u,u,6,u>: Cost 0 copy RHS - 432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS - 1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7> - 1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7> - 1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS - 432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS - 1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> - 120699277U, // <u,u,7,6>: Cost 1 vrev RHS - 363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS - 432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS - 408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS - 471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS - 537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS - 835584U, // <u,u,u,3>: Cost 0 copy LHS - 408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS - 471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS - 537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS - 27705344U, // <u,u,u,7>: Cost 0 copy RHS - 835584U, // <u,u,u,u>: Cost 0 copy LHS + 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS + 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS + 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0> + 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> + 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS + 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, 
<5,1,7,3> + 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3> + 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> + 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS + 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0> + 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS + 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS + 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0> + 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5> + 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7> + 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1> + 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1> + 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS + 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0> + 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1> + 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS + 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0> + 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6> + 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6> + 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7> + 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> + 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS + 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> + 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0> + 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0> + 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3> + 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6> + 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6> + 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7> + 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0> + 2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> + 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1> + 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS + 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS + 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4> + 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6> + 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS + 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS + 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5> + 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS + 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7> + 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3> + 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7> + 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5> + 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6> + 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5> + 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7> + 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7> + 2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7> + 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7> + 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS + 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3> + 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7> + 3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS + 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0> + 2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6> + 2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0> + 2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0> + 2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> + 3635880854U, // <0,0,7,1>: Cost 
4 vext1 <2,0,0,7>, <1,2,3,0> + 3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7> + 3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0> + 2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6> + 3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0> + 3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7> + 2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7> + 2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> + 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS + 1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS + 1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS + 2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u> + 1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS + 1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS + 2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS + 2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u> + 135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS + 2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1> + 1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS + 2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1> + 2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0> + 2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5> + 4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7> + 2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1> + 2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0> + 1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS + 1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1> + 2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1> + 2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0> + 2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3> + 1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS + 2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7> + 2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3> + 2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1> + 1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS + 1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS + 3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1> + 2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2> + 835584U, // <0,1,2,3>: Cost 0 copy LHS + 1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS + 3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7> + 2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7> + 1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2> + 835584U, // <0,1,2,u>: Cost 0 copy LHS + 2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0> + 2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3> + 2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0> + 2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0> + 2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS + 2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7> + 2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0> + 2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1> + 2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3> + 2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS + 4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1> + 2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1> + 2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4> + 2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS + 1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS + 2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS + 2592052220U, // <0,1,4,7>: 
Cost 3 vext1 <7,0,1,4>, <7,0,1,4> + 1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS + 3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1> + 2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1> + 3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0> + 2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7> + 2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6> + 2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1> + 2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1> + 2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1> + 2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7> + 2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS + 3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7> + 2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1> + 3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7> + 2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS + 3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7> + 2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1> + 1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1> + 1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1> + 2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0> + 2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1> + 2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0> + 2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1> + 2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6> + 3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0> + 2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0> + 2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7> + 2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2> + 1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS + 1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS + 2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS + 835584U, // <0,1,u,3>: Cost 0 copy LHS + 1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS + 1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS + 2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS + 1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u> + 835584U, // <0,1,u,u>: Cost 0 copy LHS + 2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0> + 1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS + 1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS + 2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0> + 2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6> + 2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7> + 2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7> + 2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0> + 1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS + 2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2> + 2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1> + 2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2> + 2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> + 2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS + 2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7> + 2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7> + 3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7> + 2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2> + 1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2> + 2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2> + 2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2> + 3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3> + 
1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS + 2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3> + 2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7> + 2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2> + 1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS + 2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2> + 2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> + 3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3> + 2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> + 2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6> + 2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> + 3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3> + 2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0> + 2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> + 2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS + 4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3> + 2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4> + 2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4> + 2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS + 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS + 1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS + 2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> + 1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS + 2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7> + 2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3> + 3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7> + 3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6> + 2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6> + 2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5> + 2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0> + 2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS + 2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS + 2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1> + 3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2> + 2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3> + 2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7> + 2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5> + 3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6> + 2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6> + 2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2> + 2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7> + 2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> + 2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2> + 3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2> + 2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0> + 2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6> + 2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2> + 3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2> + 2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7> + 2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> + 1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u> + 1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS + 1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS + 3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3> + 1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS + 1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS + 1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS + 2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS + 1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS + 2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, 
<0,0,0,0> + 2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2> + 2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0> + 4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3> + 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS + 4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6> + 3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7> + 3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0> + 2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS + 2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2> + 2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1> + 2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3> + 2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3> + 2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6> + 4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6> + 3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1> + 3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3> + 2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2> + 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS + 1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2> + 2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2> + 3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3> + 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS + 3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6> + 2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3> + 2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2> + 1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS + 3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2> + 2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3> + 3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3> + 2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3> + 3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6> + 3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6> + 3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7> + 3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7> + 2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3> + 3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2> + 2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4> + 2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4> + 4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3> + 3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6> + 2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6> + 3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS + 3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4> + 2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6> + 3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS + 3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2> + 4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2> + 3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7> + 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5> + 3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7> + 2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7> + 2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0> + 2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5> + 2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7> + 3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3> + 3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7> + 3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7> + 3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7> + 3306670270U, 
// <0,3,6,5>: Cost 4 vrev <3,0,5,6> + 3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6> + 2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0> + 2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0> + 3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1> + 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3> + 3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7> + 3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7> + 3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5> + 3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7> + 3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7> + 3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0> + 2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3> + 1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS + 1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u> + 2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2> + 3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3> + 1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS + 3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6> + 2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3> + 2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0> + 1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS + 2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4> + 2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS + 2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4> + 3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0> + 2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6> + 2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1> + 3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS + 3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0> + 3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS + 2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS + 3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1> + 3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0> + 2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1> + 2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS + 1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS + 3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS + 2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> + 1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS + 2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS + 2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2> + 3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2> + 2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4> + 2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS + 2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS + 1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS + 2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> + 1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS + 3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2> + 3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2> + 3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4> + 3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3> + 3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6> + 3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6> + 4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS + 3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4> + 3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2> + 2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4> + 3962194914U, 
// <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0> + 3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3> + 3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4> + 2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4> + 2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS + 2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS + 3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4> + 2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS + 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS + 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0> + 3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5> + 2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5> + 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS + 2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6> + 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5> + 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS + 2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6> + 3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2> + 2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6> + 3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0> + 2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6> + 3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6> + 2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0> + 2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4> + 2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2> + 3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS + 3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1> + 3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4> + 3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4> + 3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS + 2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> + 2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0> + 3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2> + 2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> + 2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS + 2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS + 2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS + 2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u> + 2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS + 1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS + 1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS + 2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u> + 1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS + 3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0> + 2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS + 3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2> + 3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5> + 3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS + 3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1> + 2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1> + 3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS + 2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS + 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS + 2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3> + 2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2> + 2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2> + 1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1> + 2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5> + 
2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0> + 2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3> + 1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS + 2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS + 2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2> + 3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7> + 2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2> + 2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS + 3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5> + 4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6> + 3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS + 3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS + 3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2> + 3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3> + 3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1> + 3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3> + 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5> + 3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0> + 3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7> + 2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0> + 2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0> + 2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1> + 3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4> + 3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4> + 3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5> + 3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6> + 2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS + 3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5> + 2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6> + 2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS + 3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0> + 3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0> + 3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5> + 3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0> + 3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5> + 2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5> + 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0> + 2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7> + 2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7> + 2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS + 3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0> + 3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3> + 3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4> + 2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS + 2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0> + 3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7> + 1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0> + 1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0> + 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS + 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0> + 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7> + 3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2> + 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS + 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7> + 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0> + 2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0> + 2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS + 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS + 2625918766U, // <0,5,u,1>: 
Cost 3 vext2 <1,4,0,5>, LHS + 2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0> + 2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u> + 1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u> + 2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS + 2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u> + 1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0> + 1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0> + 2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS + 2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS + 2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2> + 3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4> + 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6> + 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0> + 2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0> + 2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS + 2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS + 2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS + 3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1> + 2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3> + 3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3> + 2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS + 2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1> + 2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6> + 2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS + 2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS + 1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS + 2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2> + 2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2> + 2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1> + 1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2> + 2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3> + 3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6> + 2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS + 1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS + 3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2> + 3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7> + 3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0> + 3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3> + 2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6> + 3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7> + 3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0> + 2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0> + 2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0> + 2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS + 4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2> + 2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4> + 3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2> + 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS + 2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS + 2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0> + 4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS + 2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS + 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> + 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0> + 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7> + 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0> + 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5> + 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0> + 
3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7> + 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS + 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> + 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0> + 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3> + 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6> + 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0> + 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4> + 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6> + 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6> + 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7> + 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7> + 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1> + 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0> + 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7> + 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0> + 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5> + 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6> + 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2> + 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0> + 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1> + 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS + 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS + 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u> + 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0> + 1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u> + 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS + 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0> + 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS + 1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS + 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0> + 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS + 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0> + 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0> + 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5> + 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6> + 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7> + 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7> + 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS + 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1> + 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1> + 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0> + 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5> + 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1> + 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3> + 3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7> + 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7> + 2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1> + 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS + 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2> + 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2> + 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0> + 2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS + 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7> + 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2> + 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7> + 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2> + 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2> + 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, 
<3,1,2,3> + 3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3> + 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3> + 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6> + 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7> + 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> + 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0> + 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7> + 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS + 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4> + 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4> + 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7> + 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6> + 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS + 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7> + 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5> + 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS + 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0> + 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7> + 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5> + 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7> + 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5> + 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7> + 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7> + 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0> + 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7> + 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0> + 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6> + 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7> + 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7> + 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS + 2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7> + 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6> + 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0> + 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7> + 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1> + 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0> + 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7> + 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0> + 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS + 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7> + 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7> + 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7> + 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7> + 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u> + 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u> + 2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u> + 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0> + 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u> + 2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS + 2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7> + 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0> + 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u> + 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS + 1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS + 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS + 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2> + 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS + 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6> + 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS + 2592535607U, // <0,u,0,7>: Cost 3 vext1 
<7,0,u,0>, <7,0,u,0> + 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS + 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1> + 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS + 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS + 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> + 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS + 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS + 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7> + 2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS + 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS + 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS + 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2> + 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS + 835584U, // <0,u,2,3>: Cost 0 copy LHS + 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS + 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6> + 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS + 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2> + 835584U, // <0,u,2,u>: Cost 0 copy LHS + 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2> + 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> + 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u> + 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> + 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6> + 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> + 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> + 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u> + 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> + 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS + 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS + 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS + 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4> + 2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS + 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS + 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS + 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6> + 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS + 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS + 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0> + 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7> + 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7> + 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS + 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u> + 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS + 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS + 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS + 2262496983U, // <0,u,6,1>: Cost 3 vrev <u,0,1,6> + 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u> + 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7> + 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS + 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u> + 2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u> + 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u> + 1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u> + 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS + 2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0> + 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7> + 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u> + 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS + 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6> + 
2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7> + 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7> + 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS + 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS + 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS + 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS + 835584U, // <0,u,u,3>: Cost 0 copy LHS + 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS + 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS + 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS + 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u> + 835584U, // <0,u,u,u>: Cost 0 copy LHS + 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0> + 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1> + 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2> + 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0> + 2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1> + 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0> + 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7> + 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0> + 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1> + 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS + 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1> + 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS + 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3> + 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS + 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1> + 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7> + 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2> + 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS + 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1> + 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1> + 2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0> + 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1> + 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6> + 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7> + 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0> + 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2> + 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1> + 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0> + 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1> + 67944550U, // <1,0,3,2>: Cost 1 vrev LHS + 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3> + 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS + 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7> + 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7> + 2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3> + 68386972U, // <1,0,3,u>: Cost 1 vrev LHS + 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1> + 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5> + 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6> + 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1> + 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1> + 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS + 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1> + 3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4> + 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS + 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0> + 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS + 3028287590U, // <1,0,5,2>: Cost 3 vtrnl 
<1,3,5,7>, LHS + 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5> + 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5> + 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0> + 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0> + 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS + 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS + 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1> + 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7> + 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7> + 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6> + 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1> + 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0> + 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0> + 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0> + 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0> + 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0> + 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1> + 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7> + 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0> + 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6> + 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0> + 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0> + 3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7> + 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0> + 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0> + 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1> + 67985515U, // <1,0,u,2>: Cost 1 vrev LHS + 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1> + 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6> + 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS + 2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0> + 2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u> + 68427937U, // <1,0,u,u>: Cost 1 vrev LHS + 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1> + 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS + 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1> + 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2> + 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5> + 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1> + 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7> + 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0> + 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1> + 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS + 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0> + 2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3> + 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS + 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7> + 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7> + 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> + 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS + 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2> + 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1> + 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2> + 2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1> + 2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS + 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7> + 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7> + 3804554170U, // <1,1,2,7>: Cost 
4 vext3 <7,7,0,1>, <1,2,7,0> + 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1> + 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2> + 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1> + 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2> + 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS + 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6> + 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7> + 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7> + 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3> + 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS + 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS + 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4> + 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0> + 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5> + 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS + 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS + 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS + 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4> + 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS + 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1> + 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3> + 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2> + 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7> + 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5> + 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5> + 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0> + 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS + 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7> + 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2> + 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7> + 2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3> + 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7> + 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6> + 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5> + 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6> + 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0> + 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0> + 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1> + 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> + 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3> + 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS + 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6> + 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7> + 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0> + 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7> + 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> + 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS + 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3> + 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS + 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS + 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS + 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7> + 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS + 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS + 2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0> + 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS + 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2> + 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1> 
+ 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5> + 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7> + 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2> + 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1> + 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS + 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2> + 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1> + 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0> + 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS + 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS + 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7> + 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7> + 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0> + 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS + 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2> + 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2> + 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2> + 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3> + 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5> + 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7> + 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7> + 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1> + 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3> + 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS + 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3> + 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS + 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> + 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> + 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS + 2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2> + 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6> + 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4> + 2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2> + 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4> + 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS + 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6> + 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0> + 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS + 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS + 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7> + 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2> + 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS + 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS + 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5> + 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0> + 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7> + 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS + 2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1> + 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2> + 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3> + 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7> + 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5> + 3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7> + 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6> + 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2> + 2689746919U, // <1,2,6,u>: Cost 3 vext3 
<0,u,1,1>, <2,6,u,7> + 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2> + 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2> + 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3> + 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1> + 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6> + 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0> + 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1> + 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1> + 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2> + 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS + 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2> + 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS + 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS + 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> + 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS + 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0> + 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS + 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2> + 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3> + 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5> + 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6> + 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7> + 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1> + 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS + 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2> + 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1> + 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3> + 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS + 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS + 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7> + 3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7> + 2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1> + 1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS + 2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> + 2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3> + 2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> + 2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> + 2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> + 3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5> + 2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> + 3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3> + 2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> + 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS + 1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3> + 2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2> + 2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3> + 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS + 2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> + 2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3> + 3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7> + 1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS + 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS + 2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3> + 3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3> + 2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4> + 2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS + 1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, 
RHS + 2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6> + 3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4> + 1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS + 2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS + 2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7> + 2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5> + 2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5> + 2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS + 2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5> + 2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4> + 1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS + 1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS + 2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0> + 3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1> + 2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3> + 3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7> + 2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3> + 3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5> + 2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3> + 2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> + 2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> + 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1> + 2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3> + 3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7> + 2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3> + 2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6> + 2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5> + 3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1> + 2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7> + 2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1> + 1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS + 1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u> + 2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2> + 1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS + 1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS + 1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS + 2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6> + 1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS + 1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS + 2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4> + 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS + 3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4> + 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> + 2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5> + 1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1> + 2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2> + 3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1> + 1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1> + 3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2> + 2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4> + 3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0> + 3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6> + 3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1> + 2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS + 3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS + 3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0> + 2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS + 3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2> + 3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3> + 3693086312U, // 
<1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2> + 2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1> + 3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4> + 2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS + 3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7> + 3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2> + 2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS + 2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS + 2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4> + 2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3> + 3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3> + 2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6> + 3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5> + 3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6> + 3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3> + 3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u> + 2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1> + 3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4> + 3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4> + 3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4> + 2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4> + 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS + 2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6> + 3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1> + 2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS + 2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS + 2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5> + 2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1> + 3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2> + 2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS + 2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS + 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS + 3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS + 1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS + 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS + 2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1> + 3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2> + 2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2> + 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS + 3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7> + 3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7> + 2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> + 2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS + 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4> + 3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1> + 3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4> + 3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4> + 3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS + 3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0> + 2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1> + 3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2> + 2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4> + 2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS + 2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS + 2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u> + 2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1> + 2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6> + 1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1> + 1616006697U, // <1,4,u,6>: Cost 2 vext3 
<0,u,1,1>, RHS + 2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> + 1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS + 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0> + 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS + 2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5> + 3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4> + 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> + 2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5> + 2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1> + 3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1> + 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS + 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2> + 2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1> + 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0> + 2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7> + 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5> + 2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1> + 2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7> + 3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS + 2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3> + 3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1> + 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5> + 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2> + 2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1> + 3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5> + 3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7> + 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7> + 4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS + 2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1> + 2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2> + 2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7> + 3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2> + 2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3> + 2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6> + 3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5> + 4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6> + 2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS + 2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS + 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1> + 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> + 3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3> + 3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4> + 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4> + 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS + 2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS + 2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6> + 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS + 2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS + 2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5> + 3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5> + 4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3> + 2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS + 2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5> + 2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0> + 2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7> + 2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS + 2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1> + 2620019115U, // <1,5,6,1>: Cost 3 vext2 
<0,4,1,5>, <6,1,7,5> + 2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3> + 3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4> + 3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6> + 3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6> + 2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6> + 2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0> + 2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0> + 2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS + 2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1> + 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1> + 2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7> + 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS + 2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7> + 3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0> + 2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1> + 2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS + 2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2> + 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS + 2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3> + 2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1> + 1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5> + 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS + 2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7> + 2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS + 2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS + 3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0> + 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS + 2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6> + 3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1> + 2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5> + 2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6> + 2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6> + 4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS + 2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS + 3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2> + 2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1> + 3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0> + 3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3> + 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6> + 3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7> + 3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1> + 2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS + 2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS + 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1> + 3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0> + 3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2> + 3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1> + 3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS + 3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7> + 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7> + 2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS + 2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS + 2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS + 3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1> + 2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3> + 3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3> + 2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS + 4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5> + 
2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3> + 3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7> + 3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u> + 3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1> + 3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0> + 3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4> + 3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4> + 3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5> + 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS + 3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS + 4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS + 2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS + 3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2> + 3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5> + 3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5> + 3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5> + 3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6> + 3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5> + 2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0> + 2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS + 2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS + 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1> + 3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6> + 3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7> + 3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0> + 3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS + 3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7> + 2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6> + 2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7> + 2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7> + 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1> + 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1> + 2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3> + 3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7> + 2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5> + 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1> + 2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7> + 4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS + 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1> + 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1> + 2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS + 3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2> + 3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1> + 2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5> + 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS + 2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7> + 3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7> + 1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1> + 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1> + 2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS + 3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1> + 3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1> + 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1> + 3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0> + 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7> + 2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7> + 2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS + 2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS + 2626667316U, 
// <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1> + 3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0> + 2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1> + 2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS + 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7> + 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7> + 2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1> + 2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7> + 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS + 3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7> + 3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2> + 2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1> + 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS + 3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7> + 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2> + 3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2> + 2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS + 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS + 2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7> + 2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2> + 2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3> + 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS + 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3> + 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3> + 2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2> + 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS + 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1> + 3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1> + 3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4> + 3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4> + 3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4> + 2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS + 3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0> + 3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6> + 2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS + 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS + 2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7> + 3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2> + 2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5> + 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS + 4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6> + 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7> + 3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS + 2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS + 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1> + 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0> + 3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7> + 3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6> + 3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS + 3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6> + 3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6> + 2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0> + 2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0> + 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1> + 3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1> + 3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1> + 3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0> + 3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS + 2662503828U, // <1,7,7,5>: Cost 3 vext2 
<7,5,1,7>, <7,5,1,7> + 3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7> + 2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7> + 2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1> + 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS + 2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS + 2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2> + 2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u> + 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS + 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u> + 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3> + 2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2> + 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS + 1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u> + 1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS + 2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2> + 2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u> + 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u> + 1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1> + 2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2> + 2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1> + 1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS + 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS + 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS + 1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS + 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS + 2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7> + 3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS + 2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS + 202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS + 2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> + 2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS + 2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> + 2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> + 2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> + 2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS + 2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> + 2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS + 2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> + 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS + 1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 115726126U, // <1,u,3,2>: Cost 1 vrev LHS + 2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS + 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS + 1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3> + 1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS + 403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS + 2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1> + 2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0> + 2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4> + 2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4> + 2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS + 1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS + 2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6> + 2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6> + 1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS + 2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS + 2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7> + 3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS + 2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS + 
2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS + 2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS + 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS + 1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS + 1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS + 2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u> + 2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0> + 2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3> + 2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7> + 2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4> + 3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7> + 2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u> + 2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> + 2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> + 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u> + 2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1> + 2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1> + 2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7> + 2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS + 2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u> + 2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7> + 2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7> + 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u> + 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS + 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS + 115767091U, // <1,u,u,2>: Cost 1 vrev LHS + 1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS + 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS + 1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS + 1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS + 1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS + 403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS + 2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0> + 2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1> + 1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2> + 2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0> + 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS + 3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5> + 2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0> + 4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7> + 1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2> + 2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1> + 2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0> + 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS + 2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS + 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS + 3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7> + 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1> + 3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2> + 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS + 1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS + 2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2> + 2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0> + 2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2> + 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS + 2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3> + 2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2> + 2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2> + 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS + 2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> + 
2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1> + 2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2> + 4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3> + 3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS + 3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5> + 4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6> + 3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7> + 2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u> + 2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS + 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5> + 2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6> + 3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2> + 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS + 2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS + 2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS + 3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5> + 2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6> + 3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS + 3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5> + 2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7> + 3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5> + 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5> + 3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5> + 3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0> + 2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS + 2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5> + 4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0> + 2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS + 2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7> + 3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6> + 2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6> + 3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0> + 3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6> + 3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7> + 2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6> + 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> + 3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3> + 2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1> + 3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0> + 3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6> + 3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2> + 3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0> + 3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7> + 2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> + 1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS + 2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1> + 1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS + 2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2> + 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS + 2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS + 2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS + 2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS + 1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS + 2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS + 2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS + 3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0> + 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2> + 2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS + 2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2> + 3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, 
<6,2,1,0>
+ 2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1>
+ 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
+ 2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS
+ 2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1>
+ 2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0>
+ 2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3>
+ 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
+ 3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7>
+ 3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1>
+ 3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2>
+ 2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3>
+ 2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS
+ 3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1>
+ 2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2>
+ 2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0>
+ 2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS
+ 3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3>
+ 3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7>
+ 2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0>
+ 2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0>
+ 2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS
+ 2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1>
+ 2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2>
+ 2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3>
+ 2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS
+ 2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5>
+ 2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+ 4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7>
+ 2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u>
+ 2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5>
+ 3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6>
+ 3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5>
+ 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4>
+ 3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS
+ 2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS
+ 3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS
+ 3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6>
+ 1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4>
+ 2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS
+ 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
+ 3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1>
+ 2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7>
+ 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
+ 3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5>
+ 3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5>
+ 3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS
+ 2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7>
+ 2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS
+ 3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2>
+ 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
+ 3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS
+ 2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
+ 4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5>
+ 3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6>
+ 3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1>
+ 2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS
+ 2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2>
+ 3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1>
+ 3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0>
+ 2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7>
+ 3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6>
+ 3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7>
+ 3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
+ 3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
+ 2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2>
+ 2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS
+ 2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1>
+ 2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2>
+ 1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u>
+ 2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS
+ 2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5>
+ 2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+ 2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1>
+ 1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u>
+ 1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2>
+ 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
+ 2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0>
+ 2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2>
+ 2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS
+ 3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7>
+ 3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6>
+ 3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2>
+ 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
+ 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
+ 2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1>
+ 2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0>
+ 2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS
+ 3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS
+ 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
+ 3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3>
+ 3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1>
+ 2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS
+ 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
+ 2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2>
+ 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS
+ 2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3>
+ 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
+ 2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7>
+ 2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7>
+ 2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+ 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS
+ 2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1>
+ 2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0>
+ 2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2>
+ 1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS
+ 2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5>
+ 3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5>
+ 2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6>
+ 2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3>
+ 1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS
+ 2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS
+ 3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2>
+ 2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5>
+ 3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5>
+ 2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS
+ 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
+ 2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS
+ 3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4>
+ 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
+ 3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2>
+ 2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3>
+ 2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7>
+ 4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS
+ 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5>
+ 2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5>
+ 2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0>
+ 2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6>
+ 3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3>
+ 2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3>
+ 2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7>
+ 3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS
+ 3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7>
+ 3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6>
+ 2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2>
+ 2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7>
+ 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
+ 3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5>
+ 2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2>
+ 4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS
+ 2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6>
+ 3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7>
+ 3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7>
+ 2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7>
+ 2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1>
+ 1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS
+ 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
+ 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS
+ 1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS
+ 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
+ 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
+ 2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6>
+ 2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS
+ 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS
+ 1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3>
+ 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+ 2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+ 2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0>
+ 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS
+ 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
+ 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
+ 2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0>
+ 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+ 2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2>
+ 2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+ 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+ 1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5>
+ 2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7>
+ 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1>
+ 1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+ 1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3>
+ 1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3>
+ 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
+ 1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7>
+ 2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3>
+ 2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+ 1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2>
+ 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
+ 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
+ 2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+ 2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2>
+ 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
+ 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS
+ 1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4>
+ 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS
+ 2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS
+ 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+ 2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4>
+ 2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6>
+ 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+ 1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+ 1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7>
+ 2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+ 2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6>
+ 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+ 2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5>
+ 2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+ 2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7>
+ 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1>
+ 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+ 2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3>
+ 2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3>
+ 2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+ 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+ 2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+ 2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1>
+ 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+ 1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+ 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS
+ 1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+ 1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1>
+ 1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+ 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS
+ 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+ 1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1>
+ 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS
+ 2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4>
+ 2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS
+ 2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4>
+ 2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4>
+ 2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2>
+ 2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1>
+ 1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+ 3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1>
+ 1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+ 2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2>
+ 3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1>
+ 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
+ 2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS
+ 3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4>
+ 3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7>
+ 3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3>
+ 3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1>
+ 2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS
+ 2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4>
+ 3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3>
+ 2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4>
+ 2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5>
+ 2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4>
+ 2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS
+ 3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS
+ 3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0>
+ 2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS
+ 2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4>
+ 4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1>
+ 3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4>
+ 3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3>
+ 3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4>
+ 2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+ 3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7>
+ 2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+ 2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2>
+ 3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1>
+ 3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2>
+ 3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3>
+ 2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4>
+ 2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS
+ 2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS
+ 4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7>
+ 2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS
+ 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
+ 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
+ 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
+ 2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5>
+ 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
+ 3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5>
+ 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+ 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS
+ 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
+ 2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
+ 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
+ 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
+ 2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+ 2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6>
+ 2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1>
+ 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
+ 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+ 3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2>
+ 3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4>
+ 3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4>
+ 3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6>
+ 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
+ 2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1>
+ 3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7>
+ 2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+ 1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS
+ 2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS
+ 2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2>
+ 2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2>
+ 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
+ 2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+ 1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0>
+ 2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
+ 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
+ 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
+ 2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5>
+ 3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7>
+ 3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+ 2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS
+ 3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2>
+ 3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1>
+ 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
+ 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
+ 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
+ 3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7>
+ 3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2>
+ 2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3>
+ 2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5>
+ 3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2>
+ 4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1>
+ 2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2>
+ 3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1>
+ 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
+ 3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1>
+ 3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7>
+ 3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS
+ 2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3>
+ 2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5>
+ 3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3>
+ 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
+ 2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3>
+ 2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7>
+ 2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u>
+ 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
+ 3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4>
+ 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
+ 2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4>
+ 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
+ 2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS
+ 3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6>
+ 3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+ 2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS
+ 2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS
+ 3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7>
+ 3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5>
+ 3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5>
+ 2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS
+ 2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5>
+ 4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6>
+ 2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7>
+ 2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7>
+ 2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS
+ 3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6>
+ 2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3>
+ 3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6>
+ 2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5>
+ 3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7>
+ 4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6>
+ 3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS
+ 3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS
+ 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
+ 2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2>
+ 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
+ 3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2>
+ 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
+ 2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7>
+ 2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2>
+ 4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS
+ 2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS
+ 2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS
+ 2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u>
+ 2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u>
+ 2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6>
+ 2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS
+ 2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7>
+ 2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0>
+ 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2>
+ 3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0>
+ 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
+ 3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6>
+ 2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1>
+ 2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2>
+ 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
+ 2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1>
+ 2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
+ 2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
+ 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
+ 3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7>
+ 2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2>
+ 2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1>
+ 2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3>
+ 2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2>
+ 2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1>
+ 2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6>
+ 2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7>
+ 2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6>
+ 2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS
+ 2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1>
+ 2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2>
+ 3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1>
+ 2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7>
+ 2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3>
+ 2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6>
+ 3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5>
+ 3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+ 1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS
+ 1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS
+ 2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2>
+ 2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3>
+ 2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0>
+ 3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4>
+ 2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4>
+ 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
+ 2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6>
+ 4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS
+ 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
+ 3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2>
+ 2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3>
+ 2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7>
+ 3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5>
+ 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5>
+ 2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5>
+ 2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0>
+ 2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS
+ 2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS
+ 2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS
+ 3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1>
+ 2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6>
+ 3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3>
+ 2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS
+ 3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5>
+ 2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6>
+ 2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS
+ 2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS
+ 2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1>
+ 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
+ 2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2>
+ 2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0>
+ 2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5>
+ 3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7>
+ 2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
+ 2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
+ 2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
+ 2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2>
+ 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3>
+ 2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6>
+ 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
+ 2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7>
+ 1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS
+ 1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS
+ 2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2>
+ 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
+ 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
+ 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0>
+ 2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2>
+ 2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7>
+ 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
+ 2599760953U, // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2>
+ 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
+ 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+ 3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1>
+ 3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0>
+ 3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7>
+ 3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5>
+ 3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7>
+ 2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7>
+ 3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0>
+ 2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+ 3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2>
+ 3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7>
+ 2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2>
+ 3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1>
+ 3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6>
+ 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
+ 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
+ 3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1>
+ 2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7>
+ 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
+ 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
+ 2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
+ 2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3>
+ 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
+ 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
+ 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
+ 2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2>
+ 1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
+ 2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6>
+ 3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4>
+ 3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7>
+ 2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4>
+ 3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4>
+ 2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS
+ 2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u>
+ 3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0>
+ 2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS
+ 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
+ 3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7>
+ 3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7>
+ 3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5>
+ 2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS
+ 3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5>
+ 3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7>
+ 2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6>
+ 2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS
+ 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
+ 3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7>
+ 2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3>
+ 3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6>
+ 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
+ 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
+ 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
+ 3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0>
+ 2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS
+ 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
+ 3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2>
+ 3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7>
+ 3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7>
+ 2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS
+ 3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5>
+ 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
+ 2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7>
+ 2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1>
+ 1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
+ 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
+ 2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2>
+ 2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u>
+ 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
+ 2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS
+ 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
+ 2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2>
+ 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
+ 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS
+ 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2>
+ 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+ 1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2>
+ 3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+ 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS
+ 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+ 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+ 2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3>
+ 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
+ 1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS
+ 2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+ 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS
+ 1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS
+ 2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+ 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS
+ 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1>
+ 1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3>
+ 1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS
+ 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5>
+ 1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3>
+ 1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS
+ 1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS
+ 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS
+ 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4>
+ 2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5>
+ 1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4>
+ 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS
+ 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS
+ 1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+ 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS
+ 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5>
+ 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+ 2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5>
+ 2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7>
+ 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+ 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS
+ 2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2>
+ 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+ 2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7>
+ 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS
+ 2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+ 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS
+ 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+ 2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u>
+ 2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7>
+ 2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+ 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+ 2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+ 2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7>
+ 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+ 1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS
+ 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS
+ 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS
+ 1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS
+ 1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS
+ 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS
+ 1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS
+ 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS
+ 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+ 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+ 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+ 3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1>
+ 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1>
+ 3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7>
+ 3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1>
+ 3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0>
+ 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2>
+ 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS
+ 2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0>
+ 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS
+ 2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3>
+ 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS
+ 2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7>
+ 2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7>
+ 2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1>
+ 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS
+ 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+ 2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0>
+ 2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+ 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7>
+ 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7>
+ 2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7>
+ 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+ 2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2>
+ 2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+ 2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1>
+ 2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3>
+ 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6>
+ 3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7>
+ 2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7>
+ 3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1>
+ 2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3>
+ 2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4>
+ 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+ 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+ 3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1>
+ 2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6>
+ 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS
+ 2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6>
+ 3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4>
+ 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+ 2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7>
+ 2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3>
+ 3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS
+ 3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6>
+ 2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6>
+ 2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5>
+ 2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7>
+ 2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7>
+ 3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS
+ 2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7>
+ 2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7>
+ 2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7>
+ 3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2>
+ 3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1>
+ 3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0>
+ 2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6>
+ 2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0>
+ 2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7>
+ 2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2>
+ 2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7>
+ 2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0>
+ 3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7>
+ 2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6>
+ 3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7>
+ 3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3>
+ 2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7>
+ 2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0>
+ 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2>
+ 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+ 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS
+ 2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+ 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+ 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS
+ 2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7>
+ 2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u>
+ 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS
+ 2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS
+ 2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS
+ 2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1>
+ 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+ 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS
+ 2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1>
+ 3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6>
+ 3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0>
+ 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+ 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1>
+ 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1>
+ 2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1>
+ 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+ 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5>
+ 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+ 3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5>
+ 3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3>
+ 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3>
+ 2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1>
+ 2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+ 2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2>
+ 1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0>
+ 2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS
+ 2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+ 3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7>
+ 2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0>
+ 1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0>
+ 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS
+ 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+ 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+ 2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1>
+ 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS
+ 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+ 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+ 2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3>
+ 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+ 2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS
+ 2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5>
+ 2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5>
+ 2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5>
+ 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS
+ 2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS
+ 2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS
+ 3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4>
+ 2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5>
+ 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1>
+ 2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7>
+ 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5>
+ 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+ 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5>
+ 3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7>
+ 3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0>
+ 2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+ 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+ 3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1>
+ 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+ 2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7>
+ 2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+ 3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5>
+ 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+ 3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7>
+ 2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0>
+ 2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7>
+ 3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS
+ 2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1>
+ 4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2>
+ 3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+ 3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS
+ 2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7>
+ 3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7>
+ 3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7>
+ 3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS
+ 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS
+ 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3>
+ 2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+ 1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0>
+ 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS
+ 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+ 2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+ 2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+ 1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3>
+ 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0>
+ 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS
+ 2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0>
+ 2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+ 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5>
+ 2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7>
+ 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+ 2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0>
+ 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS
+ 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2>
+ 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1>
+ 2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0>
+ 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+ 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS
+ 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7>
+ 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+ 3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1>
+ 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1>
+ 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1>
+ 2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3>
+ 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2>
+ 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+ 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5>
+ 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+ 2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6>
+ 3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3>
+ 1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3>
+ 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1>
+ 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0>
+ 2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2>
+ 2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3>
+ 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5>
+ 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1>
+ 2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2>
+ 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0>
+ 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1>
+ 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS
+ 2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4>
+ 2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4>
+ 2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+ 2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS
+ 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0>
+ 2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4>
+ 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS
+ 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5>
+ 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+ 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+ 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5>
+ 2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5>
+ 2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+ 2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7>
+ 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5>
+ 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1>
+ 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3>
+ 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6>
+ 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+ 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5>
+ 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+ 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+ 2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1>
+ 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+ 2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2>
+ 2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3>
+ 4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2>
+ 2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS
+ 2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6>
+ 2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7>
+ 2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7>
+ 2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7>
+ 2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS
+ 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1>
+ 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS
+ 1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2>
+ 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+ 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5>
+ 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0>
+ 2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1>
+ 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1>
+ 2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0>
+ 1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2>
+ 2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0>
+ 2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2>
+ 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1>
+ 2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2>
+ 3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2>
+ 4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7>
+ 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2>
+ 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3>
+ 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3>
+ 2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3>
+ 2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1>
+ 2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS
+ 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3>
+ 3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3>
+ 2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3>
+ 1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3>
+ 2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS
+ 2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3>
+ 1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3>
+ 2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0>
+ 2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS
+ 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4>
+ 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3>
+ 2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3>
+ 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3>
+ 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS
+ 2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3>
+ 2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3>
+ 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS
+ 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS
+ 2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5>
+ 2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7>
+ 2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3>
+ 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS
+ 2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS
+ 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4>
+ 2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4>
+ 2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6>
+ 2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4>
+ 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6>
+ 2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS
+ 4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7>
+ 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6>
+ 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS
+ 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5>
+ 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5>
+ 2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5>
+ 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS
+ 3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5>
+ 2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0>
+ 2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+ 2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS
+ 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7>
+ 3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7>
+ 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7>
+ 2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7>
+ 2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7>
+ 3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7>
+ 2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6>
+ 2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3>
+ 2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3>
+ 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS
+ 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7>
+ 2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7>
+ 2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3>
+ 2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS
+ 2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7>
+ 3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3>
+ 3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7>
+ 2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS
+ 1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS
+ 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2>
+ 1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3>
+ 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS
+ 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS
+ 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6>
+ 2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3>
+ 2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+ 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS
+ 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0>
+ 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS
+ 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2>
+ 3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4>
+ 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5>
+ 1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1>
+ 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2>
+ 3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0>
+ 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS
+ 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2>
+ 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1>
+ 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4>
+ 2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3>
+ 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS
+ 2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0>
+ 2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+ 2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4>
+ 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4>
+ 3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS
+ 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3>
+ 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2>
+ 2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1>
+ 2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3>
+ 2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3>
+ 2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0>
+ 2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4>
+ 2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0>
+ 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2>
+ 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4>
+ 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4>
+ 2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3>
+ 2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1>
+ 2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS
+ 3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS
+ 3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1>
+ 2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2>
+ 2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS
+ 2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4>
+ 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4>
+ 2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+ 1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+ 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS
+ 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6>
+ 3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4>
+ 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS
+ 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS
+ 2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+ 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5>
+ 2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2>
+ 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS
+ 2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7>
+ 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS
+ 2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5>
+ 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS
+ 2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1>
+ 2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6>
+ 2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3>
+ 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6>
+ 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+ 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+ 2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7>
+ 2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4>
+ 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2>
+ 2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2>
+ 3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5>
+ 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4>
+ 3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7>
+ 2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6>
+ 2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0>
+ 2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0>
+ 2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7>
+ 2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4>
+ 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS
+ 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS
+ 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u>
+ 2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1>
+ 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS
+ 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS
+ 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS
+ 2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u>
+ 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS
+ 3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0>
+ 2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS
+ 2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5>
+ 3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4>
+ 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1>
+ 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+ 2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1>
+ 2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0>
+ 2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS
+ 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS
+ 3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1>
+ 2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5>
+ 2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5>
+ 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5>
+ 2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3>
+ 3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7>
+ 1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3>
+ 1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3>
+ 3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3>
+ 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5>
+ 3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2>
+ 2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5>
+ 2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5>
+ 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3>
+ 3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7>
+ 2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3>
+ 2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5>
+ 3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2>
+ 2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5>
+ 3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4>
+ 2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3>
+ 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6>
+ 2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5>
+ 4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6>
+ 3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS
+ 3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS
+ 2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS
+ 3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5>
+ 2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4>
+ 3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0>
+ 2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS
+ 2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS
+ 2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+ 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6>
+ 1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6>
+ 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS
+ 2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3>
+ 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5>
+ 2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5>
+ 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS
+ 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+ 2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6>
+ 1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7>
+ 1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7>
+ 2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1>
+ 2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7>
+ 2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6>
+ 2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4>
+ 2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5>
+ 2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7>
+ 2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7>
+ 1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0>
+ 1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0>
+ 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS
+ 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7>
+ 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2>
+ 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2>
+ 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS
+ 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+ 2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0>
+ 2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7>
+ 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS
+ 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS
+ 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u>
+ 2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2>
+ 2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2>
+ 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS
+ 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7>
+ 2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+ 1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3>
+ 1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS
+ 2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS
+ 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2>
+ 2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4>
+ 3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4>
+ 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2>
+ 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7>
+ 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0>
+ 2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+ 2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2>
+ 3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3>
+ 3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1>
+ 2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6>
+ 3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1>
+ 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6>
+ 3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3>
+ 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3>
+ 2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3>
+ 2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6>
+ 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS
+ 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3>
+ 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6>
+ 3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0>
+ 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6>
+ 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6>
+ 2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3>
+ 1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3>
+ 1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3>
+ 3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2>
+ 3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3>
+ 3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3>
+ 2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3>
+ 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6>
+ 3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6>
+ 3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2>
+ 2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+ 2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS
+ 3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6>
+ 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3>
+ 2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5>
+ 3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6>
+ 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6>
+ 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6>
+ 2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0>
+ 2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS
+ 2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6>
+ 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS
+ 3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2>
+ 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7>
+ 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+ 2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5>
+ 3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6>
+ 4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6>
+ 2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5>
+ 2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS
+ 2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1>
+ 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3>
+ 2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3>
+ 3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6>
+ 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4>
+ 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7>
+ 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7>
+ 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7>
+ 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1>
+ 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7>
+ 2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7>
+ 2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3>
+ 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5>
+ 2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1>
+ 2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2>
+ 2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS
+ 1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1>
+ 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1>
+ 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2>
+ 2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u>
+ 2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6>
+ 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5>
+ 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6>
+ 1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3>
+ 1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1>
+ 2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0>
+ 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2>
+ 3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0>
+ 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5>
+ 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0>
+ 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0>
+ 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1>
+ 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2>
+ 2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1>
+ 2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0>
+ 2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7>
+ 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS
+ 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7>
+ 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7>
+ 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3>
+ 1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7>
+ 2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS
+ 3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3>
+ 2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2>
+ 2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1>
+ 2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS
+ 2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7>
+ 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7>
+ 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3>
+ 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7>
+ 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2>
+ 2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3>
+ 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3>
+ 2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3>
+ 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6>
+ 2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7>
+ 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7>
+ 2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7>
+ 2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2>
+ 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS
+ 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7>
+ 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7>
+ 3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7>
+ 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS
+ 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS
+ 2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4>
+ 2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6>
+ 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS
+ 2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2>
+ 2626817744U, // <3,7,5,1>:
Cost 3 vext2 <1,5,3,7>, <5,1,7,3> + 3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3> + 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0> + 2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5> + 2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5> + 2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7> + 2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS + 2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS + 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1> + 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0> + 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3> + 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0> + 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5> + 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4> + 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6> + 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0> + 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7> + 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS + 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7> + 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7> + 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7> + 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS + 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7> + 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7> + 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7> + 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2> + 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS + 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0> + 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS + 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6> + 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS + 1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7> + 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS + 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0> + 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2> + 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2> + 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2> + 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1> + 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1> + 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2> + 2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS + 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2> + 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u> + 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u> + 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS + 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3> + 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS + 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u> + 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u> + 1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3> + 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS + 1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3> + 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u> + 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0> + 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7> + 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u> + 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, 
<u,2,7,3> + 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0> + 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1> + 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3> + 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2> + 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS + 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5> + 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7> + 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7> + 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS + 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS + 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS + 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5> + 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6> + 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5> + 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4> + 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6> + 1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6> + 1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6> + 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6> + 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS + 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7> + 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5> + 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7> + 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS + 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5> + 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS + 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7> + 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS + 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1> + 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6> + 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7> + 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7> + 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6> + 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7> + 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0> + 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7> + 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS + 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7> + 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2> + 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS + 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS + 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7> + 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2> + 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS + 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1> + 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2> + 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS + 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS + 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5> + 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6> + 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS + 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0> + 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS + 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0> + 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1> + 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2> + 3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4> + 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4> + 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0> + 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0> + 3668342067U, 
// <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0> + 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1> + 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS + 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4> + 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1> + 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS + 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4> + 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1> + 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1> + 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4> + 3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4> + 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4> + 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0> + 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6> + 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7> + 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> + 3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2> + 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4> + 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS + 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> + 2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4> + 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4> + 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS + 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5> + 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7> + 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0> + 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> + 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS + 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5> + 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6> + 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2> + 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS + 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS + 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2> + 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4> + 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS + 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS + 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS + 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2> + 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5> + 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS + 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7> + 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7> + 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5> + 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS + 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS + 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1> + 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS + 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6> + 2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS + 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6> + 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6> + 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0> + 1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS + 3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2> + 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS + 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS + 3735082246U, // <4,0,7,3>: 
Cost 4 vext2 <7,3,4,0>, <7,3,4,0> + 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5> + 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5> + 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0> + 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7> + 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS + 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS + 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS + 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u> + 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS + 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS + 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> + 2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u> + 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1> + 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS + 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6> + 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2> + 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5> + 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4> + 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1> + 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4> + 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS + 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2> + 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4> + 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4> + 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3> + 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5> + 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1> + 3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7> + 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1> + 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3> + 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS + 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4> + 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2> + 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4> + 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS + 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3> + 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7> + 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2> + 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4> + 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS + 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3> + 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4> + 2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4> + 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS + 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7> + 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2> + 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3> + 3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1> + 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0> + 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5> + 3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS + 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5> + 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS + 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4> + 3800007772U, // <4,1,4,7>: Cost 4 
vext3 <7,0,1,4>, <1,4,7,0> + 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS + 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS + 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2> + 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5> + 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2> + 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS + 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7> + 2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3> + 2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2> + 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS + 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS + 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1> + 4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2> + 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS + 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS + 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6> + 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7> + 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1> + 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS + 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> + 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1> + 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1> + 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4> + 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6> + 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4> + 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7> + 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7> + 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> + 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS + 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2> + 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2> + 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4> + 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS + 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS + 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2> + 1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS + 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2> + 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS + 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6> + 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2> + 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6> + 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7> + 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4> + 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2> + 2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS + 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2> + 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1> + 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0> + 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS + 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> + 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7> + 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3> + 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3> + 2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> + 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4> + 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3> + 2691909224U, // <4,2,2,2>: Cost 3 vext3 
<1,2,3,4>, <2,2,2,2> + 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3> + 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0> + 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7> + 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6> + 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2> + 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3> + 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1> + 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1> + 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2> + 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4> + 2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5> + 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4> + 3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4> + 2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4> + 2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1> + 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS + 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4> + 2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4> + 2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4> + 2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS + 2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS + 2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4> + 3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0> + 2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS + 2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS + 2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0> + 2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2> + 2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS + 2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS + 3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7> + 2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7> + 3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS + 2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS + 1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS + 2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2> + 2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2> + 2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS + 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS + 2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6> + 2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3> + 2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2> + 1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS + 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> + 3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2> + 3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2> + 2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4> + 4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4> + 3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7> + 3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4> + 3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7> + 2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> + 1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS + 2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2> + 2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2> + 2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS + 1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS + 2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS + 2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u> + 2600784890U, // <4,2,u,7>: Cost 3 vext1 
<u,4,2,u>, <7,0,1,2> + 1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS + 3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0> + 2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2> + 2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4> + 3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3> + 3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1> + 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0> + 3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0> + 3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0> + 2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2> + 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1> + 3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1> + 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4> + 2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4> + 3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0> + 3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3> + 3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1> + 3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3> + 2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4> + 3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS + 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3> + 3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2> + 2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4> + 3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS + 2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4> + 3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3> + 3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3> + 2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3> + 3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1> + 3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1> + 3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3> + 2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> + 2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4> + 3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7> + 3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7> + 3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7> + 2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> + 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1> + 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2> + 2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4> + 3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0> + 2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5> + 2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6> + 2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4> + 3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2> + 2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1> + 2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS + 2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5> + 2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5> + 2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3> + 2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS + 4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5> + 3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5> + 2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4> + 2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS + 2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS + 2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6> + 2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, 
<2,4,3,6> + 2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3> + 2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS + 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6> + 4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6> + 2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4> + 1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6> + 3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1> + 3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5> + 3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7> + 3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7> + 3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5> + 3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7> + 3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7> + 3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4> + 3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1> + 2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS + 2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2> + 2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u> + 2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4> + 2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS + 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u> + 2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u> + 2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4> + 1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u> + 2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4> + 1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS + 2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS + 3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1> + 2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0> + 2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1> + 2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2> + 3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0> + 1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS + 2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2> + 2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1> + 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3> + 2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3> + 2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3> + 2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4> + 3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3> + 3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3> + 2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3> + 3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4> + 3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4> + 2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2> + 2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4> + 2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4> + 3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7> + 2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4> + 3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4> + 2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4> + 2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2> + 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4> + 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3> + 2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> + 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4> + 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5> + 3765652625U, // <4,4,3,6>: Cost 4 vext3 
<1,2,3,4>, <4,3,6,4> + 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4> + 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> + 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS + 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4> + 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2> + 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4> + 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS + 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS + 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS + 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> + 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS + 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS + 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4> + 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5> + 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5> + 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS + 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS + 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS + 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5> + 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS + 2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS + 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2> + 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5> + 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6> + 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS + 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6> + 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS + 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4> + 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS + 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2> + 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4> + 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7> + 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4> + 2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> + 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4> + 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4> + 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7> + 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> + 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS + 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS + 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS + 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u> + 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS + 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS + 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS + 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u> + 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS + 2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0> + 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS + 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5> + 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0> + 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5> + 3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7> + 2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5> + 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0> + 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS + 2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2> + 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5> + 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0> + 2821341286U, // <4,5,1,3>: Cost 3 
vuzpr <0,4,1,5>, LHS + 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4> + 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5> + 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6> + 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3> + 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS + 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS + 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3> + 2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5> + 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5> + 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5> + 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7> + 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7> + 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS + 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5> + 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2> + 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1> + 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4> + 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3> + 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0> + 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5> + 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4> + 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5> + 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2> + 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS + 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4> + 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5> + 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4> + 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS + 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS + 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5> + 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6> + 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS + 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS + 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3> + 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4> + 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2> + 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5> + 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5> + 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0> + 2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS + 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS + 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS + 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6> + 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6> + 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6> + 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS + 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5> + 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6> + 27705344U, // <4,5,6,7>: Cost 0 copy RHS + 27705344U, // <4,5,6,u>: Cost 0 copy RHS + 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS + 2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4> + 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7> + 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5> + 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS + 2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7> + 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4> + 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4> + 2565166894U, // 
<4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS + 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS + 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS + 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0> + 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u> + 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS + 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS + 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7> + 27705344U, // <4,5,u,7>: Cost 0 copy RHS + 27705344U, // <4,5,u,u>: Cost 0 copy RHS + 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0> + 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS + 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6> + 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0> + 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5> + 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7> + 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0> + 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS + 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS + 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2> + 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1> + 2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0> + 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3> + 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5> + 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7> + 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7> + 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS + 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3> + 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4> + 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3> + 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2> + 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1> + 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6> + 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7> + 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7> + 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3> + 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1> + 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2> + 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6> + 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2> + 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3> + 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6> + 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6> + 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6> + 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4> + 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2> + 2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS + 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3> + 2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4> + 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4> + 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS + 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS + 1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS + 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4> + 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS + 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS + 2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3> + 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3> + 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3> + 2645462982U, 
// <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6> + 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5> + 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6> + 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS + 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS + 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS + 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2> + 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3> + 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2> + 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6> + 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6> + 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6> + 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS + 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS + 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2> + 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2> + 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7> + 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4> + 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6> + 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> + 3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3> + 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> + 2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2> + 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS + 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS + 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS + 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1> + 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u> + 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS + 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS + 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS + 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS + 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0> + 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS + 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4> + 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4> + 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5> + 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0> + 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7> + 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4> + 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS + 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1> + 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1> + 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4> + 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5> + 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS + 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7> + 3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7> + 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3> + 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1> + 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS + 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3> + 3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2> + 2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4> + 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7> + 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7> + 3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7> + 3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3> + 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7> + 3706603670U, // <4,7,3,0>: Cost 4 vext2 
<2,5,4,7>, <3,0,1,2> + 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4> + 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4> + 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3> + 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6> + 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7> + 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7> + 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4> + 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7> + 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1> + 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3> + 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7> + 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5> + 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4> + 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS + 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4> + 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7> + 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS + 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2> + 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7> + 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5> + 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5> + 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6> + 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5> + 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5> + 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7> + 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2> + 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS + 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2> + 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2> + 2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2> + 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS + 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6> + 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3> + 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7> + 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS + 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS + 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4> + 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7> + 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4> + 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7> + 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7> + 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7> + 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7> + 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7> + 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS + 2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS + 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2> + 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2> + 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS + 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u> + 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3> + 3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7> + 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS + 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0> + 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS + 1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u> + 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2> + 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5> + 
2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0> + 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u> + 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0> + 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS + 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2> + 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1> + 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3> + 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3> + 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7> + 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7> + 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3> + 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS + 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u> + 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2> + 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u> + 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u> + 2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4> + 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7> + 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3> + 1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u> + 2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2> + 2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2> + 2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u> + 2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3> + 2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6> + 2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7> + 2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u> + 2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2> + 1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS + 2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2> + 2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4> + 2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4> + 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS + 1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS + 1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS + 2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6> + 161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS + 1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS + 1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS + 2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5> + 3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS + 1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS + 1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS + 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS + 3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS + 1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS + 1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS + 2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2> + 1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS + 1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6> + 1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS + 1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6> + 1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS + 27705344U, // <4,u,6,7>: Cost 0 copy RHS + 27705344U, // <4,u,6,u>: Cost 0 copy RHS + 2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS + 2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4> + 2565384005U, // 
<4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7> + 2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u> + 2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS + 2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> + 2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7> + 2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> + 2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS + 1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS + 1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS + 1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS + 1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u> + 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS + 1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS + 1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS + 27705344U, // <4,u,u,7>: Cost 0 copy RHS + 27705344U, // <4,u,u,u>: Cost 0 copy RHS + 2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0> + 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1> + 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2> + 3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5> + 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5> + 3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0> + 3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0> + 3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0> + 2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2> + 2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS + 2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1> + 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7> + 2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS + 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1> + 3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7> + 3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2> + 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2> + 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> + 2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4> + 2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5> + 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5> + 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5> + 2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4> + 2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> + 2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5> + 3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5> + 3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4> + 2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5> + 3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5> + 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0> + 3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4> + 3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0> + 3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7> + 2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5> + 2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS + 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> + 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6> + 3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5> + 2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS + 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS + 3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5> + 2646150600U, // 
<5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> + 1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> + 3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1> + 2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS + 3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS + 3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0> + 3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5> + 2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0> + 3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0> + 3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS + 2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS + 4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0> + 2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS + 3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7> + 3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6> + 3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5> + 3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7> + 3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6> + 2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5> + 2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS + 2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS + 2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0> + 2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7> + 3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2> + 2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS + 2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0> + 2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7> + 4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7> + 2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS + 2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2> + 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5> + 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5> + 2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6> + 2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS + 2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u> + 2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> + 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0> + 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS + 2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2> + 2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2> + 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5> + 2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0> + 3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7> + 3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0> + 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS + 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1> + 2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1> + 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0> + 2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3> + 2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5> + 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5> + 3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5> + 3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5> + 2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3> + 3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2> + 3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3> + 2646820456U, // <5,1,2,2>: Cost 3 vext2 
<4,u,5,1>, <2,2,2,2> + 2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0> + 2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5> + 2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3> + 2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7> + 3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0> + 2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0> + 2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS + 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3> + 3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5> + 2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5> + 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5> + 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7> + 3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7> + 2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5> + 2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3> + 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1> + 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5> + 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5> + 2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5> + 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4> + 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS + 2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6> + 3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4> + 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1> + 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1> + 2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1> + 3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1> + 2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7> + 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5> + 2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5> + 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0> + 2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7> + 2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1> + 2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS + 3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7> + 2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3> + 3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7> + 2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS + 2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7> + 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6> + 2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1> + 2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1> + 2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS + 2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1> + 2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1> + 2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS + 2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS + 2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3> + 4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6> + 2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7> + 2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS + 1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1> + 1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS + 2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5> + 2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS + 2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5> + 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS + 2646825168U, // <5,1,u,6>: Cost 3 vext2 
<4,u,5,1>, <u,6,3,7> + 2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1> + 2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS + 3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0> + 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2> + 3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2> + 3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1> + 3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1> + 3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4> + 3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0> + 2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS + 2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2> + 3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2> + 3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5> + 2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5> + 3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS + 3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0> + 3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3> + 3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1> + 2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5> + 3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS + 3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3> + 2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2> + 2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3> + 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5> + 3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7> + 3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6> + 3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5> + 2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3> + 2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1> + 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5> + 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5> + 3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5> + 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5> + 2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5> + 3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5> + 3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7> + 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5> + 2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2> + 3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3> + 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5> + 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5> + 2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6> + 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS + 2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2> + 3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4> + 2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS + 2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS + 3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3> + 3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7> + 2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS + 2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS + 2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5> + 3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7> + 3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1> + 2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS + 3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS + 3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, 
<2,6,1,3> + 2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3> + 2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7> + 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5> + 3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7> + 3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7> + 3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1> + 2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7> + 2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS + 2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2> + 2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7> + 2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS + 2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS + 4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5> + 3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6> + 3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7> + 2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS + 2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1> + 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u> + 2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3> + 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5> + 2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS + 2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5> + 3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u> + 1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5> + 3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0> + 2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2> + 3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0> + 3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2> + 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1> + 3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2> + 3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0> + 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0> + 2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2> + 3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3> + 3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1> + 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3> + 2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5> + 3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3> + 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7> + 3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7> + 2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5> + 2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3> + 3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1> + 3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5> + 3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2> + 2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4> + 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5> + 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4> + 3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3> + 3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3> + 2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4> + 3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1> + 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3> + 3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2> + 2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3> + 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5> + 2704673200U, // <5,3,3,5>: 
Cost 3 vext3 <3,3,5,5>, <3,3,5,5> + 3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7> + 3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5> + 2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5> + 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5> + 3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0> + 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3> + 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5> + 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5> + 2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6> + 3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5> + 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4> + 2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6> + 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS + 2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5> + 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5> + 2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5> + 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS + 2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5> + 3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0> + 2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5> + 2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS + 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS + 3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6> + 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6> + 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6> + 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS + 3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0> + 4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6> + 2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4> + 2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS + 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS + 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7> + 2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2> + 2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2> + 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS + 2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3> + 2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3> + 3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7> + 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS + 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS + 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u> + 2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2> + 2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2> + 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS + 2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6> + 2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3> + 2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u> + 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS + 3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS + 2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS + 3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2> + 3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5> + 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5> + 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1> + 2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0> + 3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0> + 2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS + 2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1> + 3698025307U, // <5,4,1,1>: Cost 
4 vext2 <1,1,5,4>, <1,1,5,4> + 3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4> + 3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7> + 3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4> + 2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0> + 3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5> + 3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1> + 2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1> + 3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4> + 3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4> + 3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4> + 2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5> + 3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4> + 2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3> + 3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3> + 3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5> + 2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5> + 3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS + 3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4> + 2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4> + 3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3> + 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4> + 3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0> + 2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5> + 3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7> + 2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4> + 2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS + 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4> + 3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3> + 3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4> + 2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4> + 1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5> + 2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4> + 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4> + 1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5> + 2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS + 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5> + 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3> + 3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2> + 2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS + 2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5> + 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS + 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS + 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS + 2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7> + 3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2> + 2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6> + 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS + 2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5> + 3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7> + 2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5> + 2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS + 2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS + 2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4> + 2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7> + 2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7> + 2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS + 3114713426U, // <5,4,7,5>: Cost 3 vtrnr 
RHS, <0,4,1,5> + 94817590U, // <5,4,7,6>: Cost 1 vrev RHS + 2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7> + 94965064U, // <5,4,7,u>: Cost 1 vrev RHS + 2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS + 2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u> + 2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u> + 2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4> + 2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS + 1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5> + 94825783U, // <5,4,u,6>: Cost 1 vrev RHS + 2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5> + 94973257U, // <5,4,u,u>: Cost 1 vrev RHS + 2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0> + 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS + 2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2> + 3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2> + 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1> + 2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0> + 3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7> + 4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS + 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS + 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2> + 2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5> + 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0> + 2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3> + 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5> + 2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7> + 3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7> + 2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3> + 2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5> + 3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS + 3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3> + 2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2> + 2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4> + 3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3> + 2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3> + 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7> + 4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7> + 2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4> + 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2> + 3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5> + 3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3> + 2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5> + 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6> + 2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5> + 3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7> + 3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1> + 2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2> + 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1> + 2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5> + 3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3> + 3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4> + 1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> + 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS + 2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5> + 2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6> + 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5> + 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 2646855376U, // <5,5,5,1>: Cost 3 
vext2 <4,u,5,5>, <5,1,7,3> + 2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2> + 2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2> + 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS + 2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0> + 2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7> + 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS + 2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS + 3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6> + 2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3> + 3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6> + 2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5> + 2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5> + 2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6> + 2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1> + 2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1> + 2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS + 3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7> + 2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5> + 2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7> + 2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS + 3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5> + 4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6> + 2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS + 2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS + 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS + 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS + 2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5> + 2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u> + 1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS + 2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7> + 2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS + 229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS + 2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0> + 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS + 2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2> + 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4> + 2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5> + 3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6> + 3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7> + 4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS + 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS + 2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2> + 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1> + 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0> + 2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3> + 2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6> + 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7> + 3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7> + 2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS + 2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6> + 3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2> + 3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3> + 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2> + 2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1> + 2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6> + 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6> + 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, 
<2,6,3,7> + 2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3> + 2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6> + 2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2> + 3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3> + 3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6> + 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3> + 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6> + 2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6> + 3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7> + 4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS + 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6> + 2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS + 3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5> + 2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5> + 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5> + 2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6> + 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS + 2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6> + 2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5> + 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS + 2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS + 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3> + 3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6> + 3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4> + 2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6> + 2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5> + 2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1> + 2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS + 2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS + 2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS + 3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4> + 2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3> + 2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6> + 2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS + 2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6> + 2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6> + 2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS + 2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS + 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS + 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> + 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS + 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6> + 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6> + 1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7> + 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS + 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS + 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS + 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS + 1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS + 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3> + 1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2> + 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS + 2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0> + 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS + 2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2> + 2571889247U, // <5,7,0,3>: Cost 3 vext1 
<3,5,7,0>, <3,5,7,0> + 2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5> + 2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7> + 3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7> + 2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0> + 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS + 2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2> + 2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1> + 2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0> + 1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7> + 2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS + 2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7> + 2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7> + 3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7> + 1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7> + 2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7> + 3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3> + 2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2> + 2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1> + 2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7> + 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7> + 2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7> + 3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7> + 2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1> + 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2> + 2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5> + 3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1> + 2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3> + 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6> + 2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0> + 3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7> + 2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7> + 2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2> + 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS + 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7> + 3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0> + 2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4> + 2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS + 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS + 2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> + 2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7> + 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS + 2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS + 2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3> + 2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3> + 2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7> + 2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS + 2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5> + 2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7> + 1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS + 1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS + 2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0> + 3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5> + 2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2> + 2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6> + 2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4> + 2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u> + 2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> + 2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7> + 
2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u> + 1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS + 2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1> + 2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2> + 2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3> + 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS + 1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7> + 2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3> + 2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7> + 1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS + 1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS + 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS + 2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0> + 1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS + 1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS + 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS + 2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7> + 1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS + 1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS + 2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0> + 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS + 2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2> + 2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2> + 2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1> + 2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1> + 2266134675U, // <5,u,0,6>: Cost 3 vrev <u,5,6,0> + 2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0> + 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS + 2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2> + 2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1> + 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u> + 2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u> + 2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0> + 2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u> + 2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS + 1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0> + 2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> + 2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2> + 2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3> + 2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u> + 2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u> + 2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7> + 2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3> + 2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1> + 2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1> + 2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u> + 2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u> + 2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3> + 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u> + 2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0> + 2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5> + 2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u> + 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u> + 1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u> + 1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5> + 2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6> + 2698631212U, // <5,u,4,3>: Cost 3 
vext3 <2,3,4,5>, <u,4,3,5> + 1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> + 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS + 2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> + 2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u> + 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS + 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5> + 2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3> + 2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7> + 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS + 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS + 229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS + 2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS + 2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS + 2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6> + 2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7> + 2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS + 2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS + 2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> + 2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS + 2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7> + 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS + 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7> + 1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS + 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS + 1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 118708378U, // <5,u,7,6>: Cost 1 vrev RHS + 2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS + 430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS + 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS + 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS + 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS + 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS + 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS + 118716571U, // <5,u,u,6>: Cost 1 vrev RHS + 1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS + 430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS + 2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0> + 2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1> + 2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2> + 3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5> + 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6> + 3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0> + 3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6> + 3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7> + 2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2> + 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS + 3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0> + 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6> + 2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS + 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1> + 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1> + 2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1> + 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2> + 2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6> + 3759620282U, // <6,0,2,2>: Cost 
4 vext3 <0,2,2,6>, <0,2,2,6> + 2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5> + 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6> + 3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7> + 2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6> + 3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2> + 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6> + 3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2> + 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4> + 2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5> + 3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3> + 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6> + 3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6> + 3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6> + 3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7> + 2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5> + 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6> + 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5> + 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6> + 3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6> + 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6> + 2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS + 2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0> + 2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0> + 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6> + 3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS + 2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6> + 3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6> + 3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0> + 2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6> + 3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6> + 2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0> + 2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS + 2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS + 2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0> + 2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS + 3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS + 3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5> + 2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0> + 3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7> + 2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0> + 2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1> + 2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS + 2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS + 2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0> + 2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2> + 2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7> + 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS + 4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5> + 3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0> + 4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7> + 2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS + 2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2> + 2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1> + 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS + 2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5> + 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6> + 2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS + 2590020356U, // 
<6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u> + 2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS + 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS + 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS + 2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS + 3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6> + 2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2> + 2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS + 3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2> + 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1> + 2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0> + 2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2> + 3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1> + 2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1> + 3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6> + 2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3> + 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6> + 3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5> + 3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6> + 3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1> + 2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3> + 2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS + 3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3> + 3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2> + 2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0> + 2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS + 3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3> + 2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3> + 3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0> + 2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0> + 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS + 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3> + 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6> + 3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1> + 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6> + 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7> + 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3> + 3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2> + 2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3> + 2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1> + 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6> + 3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4> + 2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6> + 3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS + 2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6> + 3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0> + 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1> + 2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6> + 2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1> + 3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7> + 3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6> + 2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7> + 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6> + 3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6> + 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0> + 3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS + 2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7> + 2554241126U, // <6,1,6,0>: Cost 3 vext1 
<0,6,1,6>, LHS + 3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7> + 3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6> + 3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS + 2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS + 3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7> + 2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6> + 3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1> + 3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS + 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS + 2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7> + 2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2> + 3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS + 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS + 2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5> + 3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0> + 4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7> + 2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS + 2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS + 2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3> + 2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6> + 2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0> + 2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6> + 2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7> + 2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u> + 2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u> + 2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0> + 2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0> + 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS + 2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2> + 3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0> + 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6> + 3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3> + 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4> + 3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0> + 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS + 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1> + 2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1> + 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0> + 2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS + 3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6> + 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7> + 2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3> + 3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1> + 2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1> + 3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1> + 3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3> + 2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2> + 2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3> + 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6> + 3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7> + 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6> + 3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7> + 2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3> + 2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1> + 3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0> + 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6> + 2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4> + 2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, 
<2,3,4,5> + 2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6> + 2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6> + 2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4> + 2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1> + 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> + 3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u> + 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6> + 2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6> + 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6> + 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS + 2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0> + 3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2> + 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2> + 3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3> + 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3> + 3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7> + 2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6> + 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5> + 2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5> + 2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0> + 2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS + 2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6> + 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1> + 2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3> + 2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6> + 2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7> + 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5> + 2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7> + 2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6> + 2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1> + 2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7> + 2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS + 2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2> + 2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7> + 1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS + 2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS + 3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7> + 2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> + 2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7> + 1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS + 1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2> + 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS + 2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6> + 1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS + 2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5> + 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS + 2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0> + 2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS + 1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS + 3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0> + 2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2> + 2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4> + 3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2> + 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2> + 2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2> + 3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0> + 2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0> + 2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2> + 3759769823U, // 
<6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3>
+ 3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1>
+ 3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3>
+ 3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1>
+ 2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+ 3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3>
+ 3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0>
+ 3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3>
+ 2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+ 3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4>
+ 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+ 3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2>
+ 3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0>
+ 3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6>
+ 3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7>
+ 2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7>
+ 3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6>
+ 2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+ 3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1>
+ 3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3>
+ 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3>
+ 2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3>
+ 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6>
+ 3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5>
+ 3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7>
+ 3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7>
+ 2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5>
+ 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS
+ 2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3>
+ 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3>
+ 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6>
+ 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS
+ 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6>
+ 2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6>
+ 2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4>
+ 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6>
+ 3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS
+ 3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7>
+ 3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5>
+ 3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5>
+ 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6>
+ 3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7>
+ 3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6>
+ 3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0>
+ 2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6>
+ 2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS
+ 2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3>
+ 2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6>
+ 3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1>
+ 2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6>
+ 3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7>
+ 2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6>
+ 3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7>
+ 2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS
+ 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS
+ 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7>
+ 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7>
+ 2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2>
+ 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS
+ 2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3>
+ 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3>
+ 2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7>
+ 1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS
+ 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS
+ 2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2>
+ 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u>
+ 2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2>
+ 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS
+ 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6>
+ 2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0>
+ 2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7>
+ 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS
+ 3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0>
+ 2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS
+ 2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6>
+ 3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1>
+ 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6>
+ 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1>
+ 2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2>
+ 3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0>
+ 2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS
+ 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1>
+ 3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1>
+ 3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0>
+ 2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS
+ 3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS
+ 3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0>
+ 2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3>
+ 3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1>
+ 2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1>
+ 2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4>
+ 3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3>
+ 3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2>
+ 3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1>
+ 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4>
+ 2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS
+ 2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0>
+ 3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0>
+ 2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4>
+ 3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2>
+ 3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3>
+ 3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6>
+ 3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3>
+ 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6>
+ 3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6>
+ 2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5>
+ 3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7>
+ 2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6>
+ 2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS
+ 3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4>
+ 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4>
+ 3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4>
+ 2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4>
+ 2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS
+ 1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+ 2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4>
+ 1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+ 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS
+ 3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3>
+ 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5>
+ 2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6>
+ 2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS
+ 2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5>
+ 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+ 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2>
+ 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2>
+ 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2>
+ 1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS
+ 2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3>
+ 2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3>
+ 2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2>
+ 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS
+ 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4>
+ 3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5>
+ 2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7>
+ 2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS
+ 2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5>
+ 2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6>
+ 4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7>
+ 2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS
+ 1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS
+ 2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u>
+ 2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6>
+ 1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6>
+ 2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS
+ 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+ 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0>
+ 2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS
+ 3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6>
+ 3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2>
+ 3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1>
+ 4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5>
+ 3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7>
+ 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0>
+ 1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0>
+ 2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1>
+ 3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5>
+ 3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0>
+ 3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS
+ 3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6>
+ 3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7>
+ 3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4>
+ 2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3>
+ 2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3>
+ 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS
+ 3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5>
+ 3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2>
+ 3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5>
+ 2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS
+ 3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6>
+ 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7>
+ 3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS
+ 2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5>
+ 3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2>
+ 3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5>
+ 3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3>
+ 3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3>
+ 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6>
+ 3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6>
+ 3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0>
+ 2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3>
+ 2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3>
+ 2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS
+ 3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5>
+ 3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5>
+ 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5>
+ 2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS
+ 2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS
+ 3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0>
+ 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6>
+ 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6>
+ 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS
+ 3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2>
+ 3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2>
+ 3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2>
+ 2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS
+ 2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5>
+ 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6>
+ 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7>
+ 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7>
+ 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1>
+ 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4>
+ 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4>
+ 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4>
+ 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5>
+ 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5>
+ 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6>
+ 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0>
+ 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1>
+ 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS
+ 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7>
+ 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7>
+ 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2>
+ 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS
+ 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5>
+ 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6>
+ 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS
+ 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS
+ 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS
+ 2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u>
+ 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u>
+ 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7>
+ 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS
+ 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS
+ 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6>
+ 1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u>
+ 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u>
+ 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS
+ 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS
+ 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4>
+ 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1>
+ 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2>
+ 3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3>
+ 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0>
+ 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS
+ 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS
+ 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2>
+ 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1>
+ 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0>
+ 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+ 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3>
+ 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7>
+ 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3>
+ 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3>
+ 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS
+ 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6>
+ 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3>
+ 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6>
+ 2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1>
+ 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3>
+ 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7>
+ 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7>
+ 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3>
+ 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3>
+ 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2>
+ 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4>
+ 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6>
+ 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3>
+ 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5>
+ 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4>
+ 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6>
+ 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS
+ 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5>
+ 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS
+ 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2>
+ 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6>
+ 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6>
+ 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+ 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS
+ 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0>
+ 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6>
+ 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6>
+ 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS
+ 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3>
+ 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5>
+ 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0>
+ 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6>
+ 2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5>
+ 2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0>
+ 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS
+ 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS
+ 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS
+ 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2>
+ 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3>
+ 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2>
+ 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS
+ 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3>
+ 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS
+ 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7>
+ 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS
+ 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS
+ 3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7>
+ 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7>
+ 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6>
+ 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS
+ 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4>
+ 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6>
+ 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS
+ 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS
+ 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS
+ 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS
+ 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u>
+ 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+ 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS
+ 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS
+ 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS
+ 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS
+ 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS
+ 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS
+ 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0>
+ 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0>
+ 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+ 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2>
+ 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS
+ 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+ 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+ 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5>
+ 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+ 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+ 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2>
+ 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3>
+ 2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2>
+ 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+ 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+ 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6>
+ 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7>
+ 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+ 2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1>
+ 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+ 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+ 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1>
+ 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7>
+ 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7>
+ 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+ 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+ 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+ 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5>
+ 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS
+ 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7>
+ 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS
+ 2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+ 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+ 2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+ 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+ 1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+ 1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+ 2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2>
+ 2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3>
+ 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+ 2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6>
+ 2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7>
+ 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+ 1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+ 1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+ 1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+ 2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7>
+ 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3>
+ 2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7>
+ 1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+ 2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7>
+ 1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7>
+ 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7>
+ 1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2>
+ 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+ 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS
+ 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+ 1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+ 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+ 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS
+ 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+ 1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1>
+ 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS
+ 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS
+ 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2>
+ 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0>
+ 2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+ 1193130221U, // <6,u,0,7>: Cost 2 vrev <u,6,7,0>
+ 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS
+ 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+ 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS
+ 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+ 2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+ 2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3>
+ 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS
+ 2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+ 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+ 1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS
+ 2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u>
+ 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+ 2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3>
+ 1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+ 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+ 2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3>
+ 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7>
+ 2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5>
+ 2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+ 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+ 1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6>
+ 2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6>
+ 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS
+ 1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6>
+ 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS
+ 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS
+ 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5>
+ 2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7>
+ 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+ 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS
+ 2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+ 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7>
+ 1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS
+ 2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5>
+ 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS
+ 1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+ 296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS
+ 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS
+ 2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7>
+ 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7>
+ 1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS
+ 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS
+ 2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5>
+ 1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7>
+ 1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS
+ 1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS
+ 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS
+ 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS
+ 1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+ 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS
+ 497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS
+ 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS
+ 1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS
+ 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS
+ 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+ 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1>
+ 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2>
+ 3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0>
+ 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1>
+ 2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6>
+ 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0>
+ 3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7>
+ 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1>
+ 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS
+ 2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5>
+ 564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS
+ 2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7>
+ 1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS
+ 2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7>
+ 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1>
+ 2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0>
+ 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS
+ 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+ 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5>
+ 2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0>
+ 2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1>
+ 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+ 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+ 2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7>
+ 2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7>
+ 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2>
+ 2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2>
+ 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0>
+ 2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0>
+ 2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3>
+ 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6>
+ 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0>
+ 2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0>
+ 3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7>
+ 2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0>
+ 2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4>
+ 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5>
+ 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6>
+ 3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4>
+ 2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6>
+ 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS
+ 2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6>
+ 3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5>
+ 1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5>
+ 2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS
+ 2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3>
+ 2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+ 2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0>
+ 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6>
+ 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5>
+ 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0>
+ 2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7>
+ 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0>
+ 2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7>
+ 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+ 2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7>
+ 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0>
+ 3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7>
+ 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7>
+ 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6>
+ 2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1>
+ 2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7>
+ 2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2>
+ 2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS
+ 3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS
+ 3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7>
+ 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6>
+ 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7>
+ 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0>
+ 2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7>
+ 2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2>
+ 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2>
+ 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1>
+ 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS
+ 2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u>
+ 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6>
+ 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS
+ 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u>
+ 2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1>
+ 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS
+ 2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS
+ 2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS
+ 2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS
+ 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2>
+ 2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS
+ 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+ 2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0>
+ 3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1>
+ 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2>
+ 2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1>
+ 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+ 3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6>
+ 1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3>
+ 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5>
+ 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7>
+ 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1>
+ 3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5>
+ 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3>
+ 2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1>
+ 2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3>
+ 3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2>
+ 1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0>
+ 2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5>
+ 2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3>
+ 2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2>
+ 2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0>
+ 1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0>
+ 2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0>
+ 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+ 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0>
+ 2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7>
+ 2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5>
+ 1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7>
+ 2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7>
+ 2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3>
+ 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7>
+ 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5>
+ 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5>
+ 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5>
+ 2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5>
+ 3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5>
+ 2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS
+ 2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1>
+ 3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0>
+ 2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS
+ 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS
+ 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7>
+ 3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6>
+ 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7>
+ 2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS
+ 3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7>
+ 2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1>
+ 2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1>
+ 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7>
+ 3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7>
+ 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7>
+ 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+ 2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7>
+ 3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7>
+ 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7>
+ 3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7>
+ 2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1>
+ 2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7>
+ 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2>
+ 2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1>
+ 3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3>
+ 3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS
+ 2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS
+ 3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7>
+ 3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0>
+ 2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7>
+ 3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS
+ 2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS
+ 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3>
+ 2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0>
+ 1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7>
+ 2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS
+ 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7>
+ 2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7>
+ 2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS
+ 1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7>
+ 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2>
+ 2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2>
+ 2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0>
+ 2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0>
+ 2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6>
+ 2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7>
+ 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1>
+ 3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2>
+ 2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7>
+ 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3>
+ 3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0>
+ 3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0>
+ 2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1>
+ 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3>
+ 3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0>
+ 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3>
+ 2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+ 2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1>
+ 2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1>
+ 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3>
+ 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+ 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3>
+ 2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5>
+ 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7>
+ 2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6>
+ 3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5>
+ 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3>
+ 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1>
+ 2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5>
+ 2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6>
+ 2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7>
+ 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5>
+ 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7>
+ 2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6>
+ 2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0>
+ 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1>
+ 2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6>
+ 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3>
+ 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4>
+ 2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5>
+ 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6>
+ 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7>
+ 2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0>
+ 3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0>
+ 2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7>
+ 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7>
+ 3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3>
+ 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7>
+ 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7>
+ 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7>
+ 3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7>
+ 2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7>
+ 3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0>
+ 2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7>
+ 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS
+ 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3>
+ 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7>
+ 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7>
+ 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS
+ 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7>
+ 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7>
+ 3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7>
+ 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7>
+ 2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1>
+ 3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0>
+ 3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5>
+ 2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS
+ 2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS
+ 3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7>
+ 4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6>
+ 2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7>
+ 2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS
+ 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1>
+ 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5>
+ 1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2>
+ 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7>
+ 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5>
+ 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7>
+ 2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7>
+ 2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0>
+ 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7>
+ 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0>
+ 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2>
+ 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0>
+ 2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2>
+ 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1>
+ 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2>
+ 2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0>
+ 3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0>
+ 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2>
+ 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1>
+ 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1>
+ 2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3>
+ 2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5>
+ 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6>
+ 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3>
+ 3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1>
+ 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5>
+ 2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5>
+ 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1>
+ 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0>
+ 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2>
+ 2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0>
+ 2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5>
+ 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4>
+ 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3>
+ 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3>
+ 2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0>
+ 2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1>
+ 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3>
+ 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3>
+ 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4>
+ 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7>
+ 2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3>
+ 2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7>
+ 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1>
+ 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2>
+ 2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4>
+ 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5>
+ 2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5>
+ 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6>
+ 2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6>
+ 3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4>
+ 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6>
+ 2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS
+ 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3>
+ 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3>
+ 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7>
+ 2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5>
+ 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7>
+ 2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0>
+ 2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0>
+ 1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3>
+ 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7>
+ 2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3>
+ 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3>
+ 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7>
+ 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7>
+ 3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7>
+ 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6>
+ 2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7>
+ 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3>
+ 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1>
+ 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5>
+ 2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6>
+ 2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7>
+ 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5>
+ 2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7>
+ 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7>
+ 2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7>
+ 2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1>
+ 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u>
+ 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2>
+ 1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3>
+ 1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u>
+ 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6>
+ 2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3>
+ 2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0>
+ 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2>
+ 2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0>
+ 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS
+ 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2>
+ 3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1>
+ 2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5>
+ 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+ 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+ 3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+ 1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1>
+ 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2>
+ 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1>
+ 2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3>
+ 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7>
+ 2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3>
+ 2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0>
+ 2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3>
+ 3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3>
+ 2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0>
+ 3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1>
+ 3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3>
+ 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2>
+ 2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1>
+ 3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7>
+ 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+ 2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0>
+ 3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5>
+ 2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0>
+ 2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2>
+ 3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5>
+ 3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1>
+ 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3>
+ 2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6>
+ 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4>
+ 2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5>
+ 3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7>
+ 2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4>
+ 2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2>
+ 3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1>
+ 3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4>
+ 3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3>
+ 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+ 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5>
+ 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+ 3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7>
+ 1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5>
+ 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS
+ 2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7>
+ 2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3>
+ 2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5>
+ 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS
+ 2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7>
+ 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS
+ 2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7>
+ 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS
+ 2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1>
+ 3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5>
+ 2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3>
+ 2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4>
+ 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+ 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7>
+ 2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7>
+ 2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+ 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2>
+ 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2>
+ 3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2>
+ 3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5>
+ 3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6>
+ 2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1>
+ 2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+ 2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1>
+ 2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7>
+ 2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7>
+ 1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS
+ 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS
+ 2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3>
+ 2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u>
+ 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6>
+ 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1>
+ 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS
+ 2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5>
+ 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS
+ 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS
+ 2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS
+ 2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS
+ 3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0>
+ 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1>
+ 2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1>
+ 2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1>
+ 2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2>
+ 2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS
+ 2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1>
+ 3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1>
+ 3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0>
+ 2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7>
+ 2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5>
+ 2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3>
+ 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1>
+ 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+ 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+ 3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1>
+ 3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0>
+ 3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7>
+ 2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4>
+ 3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7>
+ 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+ 3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3>
+ 2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3>
+ 2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4>
+ 3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS
+ 2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5>
+ 3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5>
+ 3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3>
+ 3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS
+ 2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5>
+ 3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5>
+ 2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0>
+ 2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0>
+ 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS
+ 3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7>
+ 3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5>
+ 3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4>
+ 2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS
+ 2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS
+ 2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5>
+ 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+ 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6>
+ 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1>
+ 2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3>
+ 3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3>
+ 3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3>
+ 2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4>
+ 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5>
+ 2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+ 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+ 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7>
+ 2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1>
+ 2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7>
+ 3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2>
+ 2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4>
+ 2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5>
+ 2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+ 2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+ 1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0>
+ 1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0>
+ 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS
+ 2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3>
+ 2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2>
+ 2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7>
+ 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS
+ 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+ 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+ 2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1>
+ 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3>
+ 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS
+ 2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS
+ 2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0>
+ 2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u>
+ 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS
+ 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7>
+ 2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7>
+ 1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0>
+ 1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3>
+ 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0>
+ 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS
+ 2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4>
+ 2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0>
+ 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2>
+ 2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0>
+ 2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0>
+ 2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2>
+ 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS
+ 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1>
+ 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1>
+ 2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0>
+ 2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3>
+ 2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS
+ 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7>
+ 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+ 2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3>
+ 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1>
+ 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1>
+ 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3>
+ 2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2>
+ 2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1>
+ 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5>
+ 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7>
+ 2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7>
+ 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+ 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+ 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2>
+ 2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3>
+ 2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3>
+ 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3>
+ 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5>
+ 2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7>
+ 3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6>
+ 2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0>
+ 2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0>
+ 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1>
+ 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3>
+ 2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5>
+ 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4>
+ 2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6>
+ 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS
+ 2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0>
+ 2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5>
+ 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS
+ 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS
+ 2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3>
+ 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7>
+ 3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4>
+ 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6>
+ 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5>
+ 2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0>
+ 2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7>
+ 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5>
+ 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1>
+ 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3>
+ 2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3>
+ 3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4>
+ 2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4>
+ 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7>
+ 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6>
+ 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+ 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7>
+ 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1>
+ 2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0>
+ 2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2>
+ 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0>
+ 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5>
+ 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4>
+ 2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6>
+ 2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7>
+ 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1>
+ 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1>
+ 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS
+ 2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2>
+ 2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0>
+ 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5>
+ 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS
+ 1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6>
+ 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3>
+ 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1>
+ 2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0>
+ 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2>
+ 2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0>
+ 3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0>
+ 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1>
+ 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0>
+ 2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0>
+ 2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2>
+ 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2>
+ 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3>
+ 2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1>
+ 2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3>
+ 2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5>
+ 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS
+ 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3>
+ 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1>
+ 2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1>
+ 2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3>
+ 3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5>
+ 3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0>
+ 2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2>
+ 2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0>
+ 3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5>
+ 3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3>
+ 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3>
+ 2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3>
+ 2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3>
+ 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1>
+ 3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3>
+ 3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6>
+ 2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7>
+ 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5>
+ 2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3>
+ 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7>
+ 2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7>
+ 2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7>
+ 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5>
+ 3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7>
+ 3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3>
+ 3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4>
+ 2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4>
+ 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6>
+ 2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4>
+ 2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6>
+ 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6>
+ 2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS
+ 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7>
+ 3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3>
+ 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7>
+ 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7>
+ 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7>
+ 2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7>
+ 2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5>
+ 1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7>
+ 2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS
+ 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0>
+ 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7>
+ 2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7>
+ 2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS
+ 2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7>
+ 1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7>
+ 2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0>
+ 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7>
+ 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS
+ 2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2>
+ 2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2>
+ 2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2>
+ 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS
+ 2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7>
+ 2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7>
+ 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS
+ 1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS
+ 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2>
+ 2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3>
+ 2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0>
+ 1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS
+ 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6>
+ 1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7>
+ 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS
+ 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+ 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2>
+ 1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2>
+ 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2>
+ 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1>
+ 1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1>
+ 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+ 2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS
+ 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2>
+ 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS
+ 1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+ 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS
+ 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3>
+
1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS + 2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3> + 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1> + 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3> + 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS + 1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2> + 2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0> + 1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0> + 1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6> + 2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4> + 2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3> + 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3> + 1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0> + 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1> + 1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3> + 2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6> + 1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3> + 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5> + 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7> + 2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7> + 2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0> + 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1> + 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1> + 1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5> + 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6> + 2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5> + 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4> + 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6> + 1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6> + 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6> + 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6> + 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS + 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u> + 2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3> + 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7> + 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u> + 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u> + 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS + 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7> + 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS + 2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1> + 2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7> + 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u> + 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7> + 1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6> + 2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7> + 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u> + 1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0> + 1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7> + 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1> + 2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3> + 2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6> + 2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7> + 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5> + 1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7> + 2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2> + 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS + 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS + 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1> + 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2> + 564582757U, // <7,u,u,2>: Cost 1 
vext3 RHS, LHS + 1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0> + 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5> + 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6> + 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS + 363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS + 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS + 135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS + 1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> + 1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> + 2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> + 1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS + 2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3> + 2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0> + 2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> + 135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS + 1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS + 1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS + 537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS + 2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3> + 1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS + 2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7> + 1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1> + 2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2> + 537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS + 1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> + 1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS + 2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> + 1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> + 2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7> + 2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> + 1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> + 2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> + 2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> + 72589981U, // <u,0,3,2>: Cost 1 vrev LHS + 2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3> + 2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6> + 2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0> + 2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0> + 2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3> + 73032403U, // <u,0,3,u>: Cost 1 vrev LHS + 2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u> + 1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> + 1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> + 3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2> + 2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS + 1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS + 2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6> + 2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> + 1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> + 2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS + 1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS + 2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7> + 2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0> + 2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5> + 2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5> + 1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0> + 2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS + 1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0> + 2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS + 2712068598U, // <u,0,6,1>: Cost 3 
vext3 RHS, <0,6,1,7> + 1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS + 2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0> + 2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS + 2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u> + 2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6> + 2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1> + 1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS + 2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS + 2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0> + 2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7> + 2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7> + 2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS + 2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u> + 2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7> + 2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7> + 2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS + 135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS + 1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> + 537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS + 2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> + 1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> + 1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS + 1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u> + 2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS + 537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS + 1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1> + 1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS + 2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1> + 1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> + 2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS + 2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> + 2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0> + 2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0> + 1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> + 1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS + 202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS + 2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2> + 1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> + 1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS + 2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> + 2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1> + 2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> + 202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS + 1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS + 2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> + 2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1> + 835584U, // <u,1,2,3>: Cost 0 copy LHS + 1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS + 2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> + 2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7> + 1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2> + 835584U, // <u,1,2,u>: Cost 0 copy LHS + 1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS + 1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> + 2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> + 2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS + 1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS + 1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> + 2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> + 2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1> + 1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> + 1567992749U, // 
<u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1> + 2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u> + 2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u> + 1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4> + 2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS + 1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS + 2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1> + 2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4> + 1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4> + 1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS + 2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2> + 2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2> + 1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> + 1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS + 2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1> + 2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1> + 2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS + 1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> + 2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS + 2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> + 2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> + 2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> + 2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS + 2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> + 2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1> + 1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1> + 1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1> + 2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS + 2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7> + 2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1> + 2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS + 2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS + 2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7> + 2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u> + 2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7> + 2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS + 1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS + 202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS + 2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> + 835584U, // <u,1,u,3>: Cost 0 copy LHS + 1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS + 1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> + 2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> + 1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u> + 835584U, // <u,1,u,u>: Cost 0 copy LHS + 1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2> + 1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS + 1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2> + 2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> + 2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5> + 2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7> + 2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> + 2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u> + 1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS + 1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2> + 2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1> + 2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0> + 2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> + 2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS + 2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7> + 2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> + 2602718850U, // 
<u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2> + 1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1> + 1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS + 2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2> + 269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS + 1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> + 1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS + 2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> + 2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7> + 2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2> + 269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS + 408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS + 1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS + 408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS + 1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> + 1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> + 408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS + 1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> + 2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4> + 2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5> + 2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> + 2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS + 1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS + 1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2> + 2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> + 1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS + 2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS + 2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0> + 2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> + 2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> + 1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5> + 2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5> + 2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> + 2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS + 1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5> + 1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS + 2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2> + 2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2> + 1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> + 1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS + 2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> + 2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> + 2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1> + 1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> + 1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2> + 2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2> + 2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7> + 1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS + 2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS + 2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6> + 2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> + 2645726828U, // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7> + 1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS + 408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS + 1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS + 269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS + 1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> + 408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS + 1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS + 1726339226U, // <u,2,u,6>: Cost 2 
vuzpl LHS, RHS + 1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> + 408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS + 1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS + 1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0> + 1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0> + 2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7> + 2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0> + 471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS + 1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5> + 1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7> + 2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u> + 1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3> + 1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS + 1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2> + 1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> + 1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1> + 1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS + 2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6> + 1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u> + 1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1> + 1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3> + 1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3> + 336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS + 1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> + 2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3> + 2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7> + 336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS + 1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS + 1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4> + 2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0> + 2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2> + 1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS + 471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS + 1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6> + 2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4> + 471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS + 2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS + 1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3> + 2567653106U, // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5> + 2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6> + 1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> + 1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, RHS + 1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS + 2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2> + 2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6> + 1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3> + 2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u> + 2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6> + 1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6> + 1592562488U, // <u,3,6,6>: Cost 2 
vext2 LHS, <6,6,6,6> + 1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1> + 1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6> + 1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS + 1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7> + 1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7> + 2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2> + 1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS + 2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u> + 2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u> + 1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS + 1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2> + 471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS + 1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0> + 336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS + 1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6> + 471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS + 1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7> + 1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS + 471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS + 2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0> + 1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS + 2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4> + 2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> + 2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5> + 1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> + 1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> + 3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1> + 1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS + 2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS + 2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1> + 1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4> + 2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3> + 2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS + 1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS + 2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> + 2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> + 1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4> + 2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS + 2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u> + 2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2> + 2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5> + 2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS + 2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> + 1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS + 2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> + 1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS + 2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2> + 2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4> + 2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3> + 2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3> + 2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6> + 2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5> + 2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6> + 3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7> + 2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u> + 1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS + 2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4> + 2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4> + 2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> + 161926454U, 
// <u,4,4,4>: Cost 1 vdup0 RHS + 1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS + 1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> + 2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> + 161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS + 1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS + 2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> + 1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5> + 2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2> + 1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS + 1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS + 537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS + 2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS + 537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS + 1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS + 2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2> + 2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2> + 2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2> + 1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS + 2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> + 1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS + 2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> + 1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS + 2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS + 2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4> + 2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7> + 2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7> + 2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS + 2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> + 96808489U, // <u,4,7,6>: Cost 1 vrev RHS + 2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7> + 96955963U, // <u,4,7,u>: Cost 1 vrev RHS + 1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS + 1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS + 1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u> + 2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2> + 161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS + 1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS + 537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS + 2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS + 537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS + 2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0> + 1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS + 2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5> + 2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5> + 1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> + 2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> + 2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5> + 1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0> + 1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS + 1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS + 2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5> + 2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0> + 2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7> + 1505971510U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS + 2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5> + 2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3> + 1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> + 1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> + 2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS + 2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7> + 2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5> 
+ 1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5> + 2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS + 2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> + 2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7> + 3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS + 1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5> + 2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS + 2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3> + 2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u> + 2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3> + 2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6> + 3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5> + 2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6> + 2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS + 2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS + 2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS + 2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> + 2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5> + 2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4> + 1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5> + 1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS + 2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5> + 1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> + 1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS + 1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS + 2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7> + 2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2> + 2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2> + 1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS + 229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS + 2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> + 1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> + 229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS + 1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS + 2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6> + 2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6> + 1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6> + 1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS + 2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> + 2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7> + 27705344U, // <u,5,6,7>: Cost 0 copy RHS + 27705344U, // <u,5,6,u>: Cost 0 copy RHS + 1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS + 1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7> + 2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2> + 2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2> + 1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS + 1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> + 2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> + 2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS + 1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS + 1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS + 1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u> + 2561853032U, // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2> + 1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u> + 1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS + 229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS + 2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6> + 27705344U, // <u,5,u,7>: Cost 0 copy RHS + 27705344U, // <u,5,u,u>: Cost 0 copy RHS + 2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0> + 1545470054U, // <u,6,0,1>: Cost 2 
vext2 <0,2,u,6>, LHS + 1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6> + 2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0> + 1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6> + 2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u> + 2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0> + 2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS + 1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS + 1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1> + 2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1> + 2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0> + 2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3> + 2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6> + 2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7> + 2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> + 2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS + 1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1> + 1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS + 2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2> + 2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2> + 2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1> + 1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS + 2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6> + 2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7> + 1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> + 1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> + 2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2> + 2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3> + 2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3> + 2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3> + 1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6> + 2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6> + 3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6> + 1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS + 1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS + 2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS + 2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3> + 2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4> + 2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u> + 1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> + 1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS + 1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6> + 2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS + 1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS + 2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS + 2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3> + 2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7> + 2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> + 1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6> + 2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5> + 2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0> + 2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS + 1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5> + 1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS + 2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2> + 2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3> + 2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2> + 1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS + 2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3> + 296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS + 1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, 
<6,6,7,7> + 296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS + 432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS + 1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> + 1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> + 432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS + 1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3> + 1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS + 432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS + 432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS + 1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS + 1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> + 432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS + 1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS + 296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS + 1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS + 432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS + 1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS + 1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0> + 1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0> + 2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7> + 2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2> + 497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS + 1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> + 1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7> + 2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS + 1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7> + 2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7> + 2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u> + 1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7> + 2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7> + 1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2> + 1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7> + 2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7> + 1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7> + 2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u> + 1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7> + 1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u> + 2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u> + 1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3> + 1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3> + 2640456465U, // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7> + 1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7> + 2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> + 2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4> + 1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS + 1571360116U, // <u,7,4,6>: Cost 2 
vext2 RHS, <4,6,4,6> + 2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0> + 497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS + 2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> + 1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> + 2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> + 1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> + 1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> + 1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS + 2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2> + 1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> + 1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS + 1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6> + 1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> + 1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> + 1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> + 2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7> + 2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2> + 2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7> + 1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> + 1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7> + 1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7> + 363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS + 363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS + 1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2> + 497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS + 1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0> + 1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1> + 1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6> + 497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS + 1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7> + 363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS + 497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS + 135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS + 471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS + 1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2> + 1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0> + 1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2> + 1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0> + 471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS + 1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS + 537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS + 1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS + 1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1> + 1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3> + 537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS + 1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS + 1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2> + 269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS + 835584U, // <u,u,2,3>: Cost 0 copy LHS + 1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS + 2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> + 1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3> + 835584U, 
// <u,u,2,u>: Cost 0 copy LHS + 408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS + 1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 120371557U, // <u,u,3,2>: Cost 1 vrev LHS + 336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS + 408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS + 1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7> + 1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS + 408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS + 1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS + 1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4> + 1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6> + 1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4> + 161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS + 471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS + 1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6> + 471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS + 1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS + 1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5> + 1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7> + 1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS + 229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS + 537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS + 1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS + 537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS + 1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS + 2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2> + 1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7> + 1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS + 1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6> + 296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS + 27705344U, // <u,u,6,7>: Cost 0 copy RHS + 27705344U, // <u,u,6,u>: Cost 0 copy RHS + 432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS + 1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7> + 1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7> + 1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS + 432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS + 1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 120699277U, // <u,u,7,6>: Cost 1 vrev RHS + 363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS + 432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS + 408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS + 471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS + 537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS + 835584U, // <u,u,u,3>: Cost 0 copy LHS + 408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS + 471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS + 537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS + 27705344U, // <u,u,u,7>: Cost 0 copy RHS + 835584U, // <u,u,u,u>: Cost 0 copy LHS 0 }; diff --git a/lib/Target/ARM/ARMRegisterInfo.cpp b/lib/Target/ARM/ARMRegisterInfo.cpp index ad51bc1..1cba1ba 100644 --- a/lib/Target/ARM/ARMRegisterInfo.cpp +++ b/lib/Target/ARM/ARMRegisterInfo.cpp @@ -12,26 +12,8 @@ //===----------------------------------------------------------------------===// #include "ARM.h" -#include "ARMAddressingModes.h" #include "ARMBaseInstrInfo.h" -#include "ARMInstrInfo.h" -#include "ARMMachineFunctionInfo.h" #include "ARMRegisterInfo.h" -#include "ARMSubtarget.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/CodeGen/MachineConstantPool.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include 
"llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLocation.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/SmallVector.h" using namespace llvm; ARMRegisterInfo::ARMRegisterInfo(const ARMBaseInstrInfo &tii, diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td index 22d15b5..f4fbae3 100644 --- a/lib/Target/ARM/ARMRegisterInfo.td +++ b/lib/Target/ARM/ARMRegisterInfo.td @@ -30,18 +30,6 @@ def ssub_0 : SubRegIndex; def ssub_1 : SubRegIndex; def ssub_2 : SubRegIndex; // In a Q reg. def ssub_3 : SubRegIndex; -def ssub_4 : SubRegIndex; // In a QQ reg. -def ssub_5 : SubRegIndex; -def ssub_6 : SubRegIndex; -def ssub_7 : SubRegIndex; -def ssub_8 : SubRegIndex; // In a QQQQ reg. -def ssub_9 : SubRegIndex; -def ssub_10 : SubRegIndex; -def ssub_11 : SubRegIndex; -def ssub_12 : SubRegIndex; -def ssub_13 : SubRegIndex; -def ssub_14 : SubRegIndex; -def ssub_15 : SubRegIndex; def dsub_0 : SubRegIndex; def dsub_1 : SubRegIndex; @@ -70,6 +58,8 @@ def R4 : ARMReg< 4, "r4">, DwarfRegNum<[4]>; def R5 : ARMReg< 5, "r5">, DwarfRegNum<[5]>; def R6 : ARMReg< 6, "r6">, DwarfRegNum<[6]>; def R7 : ARMReg< 7, "r7">, DwarfRegNum<[7]>; +// These require 32-bit instructions. +let CostPerUse = 1 in { def R8 : ARMReg< 8, "r8">, DwarfRegNum<[8]>; def R9 : ARMReg< 9, "r9">, DwarfRegNum<[9]>; def R10 : ARMReg<10, "r10">, DwarfRegNum<[10]>; @@ -78,6 +68,7 @@ def R12 : ARMReg<12, "r12">, DwarfRegNum<[12]>; def SP : ARMReg<13, "sp">, DwarfRegNum<[13]>; def LR : ARMReg<14, "lr">, DwarfRegNum<[14]>; def PC : ARMReg<15, "pc">, DwarfRegNum<[15]>; +} // Float registers def S0 : ARMFReg< 0, "s0">; def S1 : ARMFReg< 1, "s1">; @@ -99,33 +90,41 @@ def S30 : ARMFReg<30, "s30">; def S31 : ARMFReg<31, "s31">; // Aliases of the F* registers used to hold 64-bit fp values (doubles) let SubRegIndices = [ssub_0, ssub_1] in { -def D0 : ARMReg< 0, "d0", [S0, S1]>; -def D1 : ARMReg< 1, "d1", [S2, S3]>; -def D2 : ARMReg< 2, "d2", [S4, S5]>; -def D3 : ARMReg< 3, "d3", [S6, S7]>; -def D4 : ARMReg< 4, "d4", [S8, S9]>; -def D5 : ARMReg< 5, "d5", [S10, S11]>; -def D6 : ARMReg< 6, "d6", [S12, S13]>; -def D7 : ARMReg< 7, "d7", [S14, S15]>; -def D8 : ARMReg< 8, "d8", [S16, S17]>; -def D9 : ARMReg< 9, "d9", [S18, S19]>; -def D10 : ARMReg<10, "d10", [S20, S21]>; -def D11 : ARMReg<11, "d11", [S22, S23]>; -def D12 : ARMReg<12, "d12", [S24, S25]>; -def D13 : ARMReg<13, "d13", [S26, S27]>; -def D14 : ARMReg<14, "d14", [S28, S29]>; -def D15 : ARMReg<15, "d15", [S30, S31]>; +def D0 : ARMReg< 0, "d0", [S0, S1]>, DwarfRegNum<[256]>; +def D1 : ARMReg< 1, "d1", [S2, S3]>, DwarfRegNum<[257]>; +def D2 : ARMReg< 2, "d2", [S4, S5]>, DwarfRegNum<[258]>; +def D3 : ARMReg< 3, "d3", [S6, S7]>, DwarfRegNum<[259]>; +def D4 : ARMReg< 4, "d4", [S8, S9]>, DwarfRegNum<[260]>; +def D5 : ARMReg< 5, "d5", [S10, S11]>, DwarfRegNum<[261]>; +def D6 : ARMReg< 6, "d6", [S12, S13]>, DwarfRegNum<[262]>; +def D7 : ARMReg< 7, "d7", [S14, S15]>, DwarfRegNum<[263]>; +def D8 : ARMReg< 8, "d8", [S16, S17]>, DwarfRegNum<[264]>; +def D9 : ARMReg< 9, "d9", [S18, S19]>, DwarfRegNum<[265]>; +def D10 : ARMReg<10, "d10", [S20, S21]>, DwarfRegNum<[266]>; +def D11 : ARMReg<11, "d11", [S22, S23]>, DwarfRegNum<[267]>; +def D12 : ARMReg<12, "d12", [S24, S25]>, DwarfRegNum<[268]>; +def D13 : ARMReg<13, "d13", [S26, S27]>, DwarfRegNum<[269]>; +def 
D14 : ARMReg<14, "d14", [S28, S29]>, DwarfRegNum<[270]>; +def D15 : ARMReg<15, "d15", [S30, S31]>, DwarfRegNum<[271]>; } // VFP3 defines 16 additional double registers -def D16 : ARMFReg<16, "d16">; def D17 : ARMFReg<17, "d17">; -def D18 : ARMFReg<18, "d18">; def D19 : ARMFReg<19, "d19">; -def D20 : ARMFReg<20, "d20">; def D21 : ARMFReg<21, "d21">; -def D22 : ARMFReg<22, "d22">; def D23 : ARMFReg<23, "d23">; -def D24 : ARMFReg<24, "d24">; def D25 : ARMFReg<25, "d25">; -def D26 : ARMFReg<26, "d26">; def D27 : ARMFReg<27, "d27">; -def D28 : ARMFReg<28, "d28">; def D29 : ARMFReg<29, "d29">; -def D30 : ARMFReg<30, "d30">; def D31 : ARMFReg<31, "d31">; +def D16 : ARMFReg<16, "d16">, DwarfRegNum<[272]>; +def D17 : ARMFReg<17, "d17">, DwarfRegNum<[273]>; +def D18 : ARMFReg<18, "d18">, DwarfRegNum<[274]>; +def D19 : ARMFReg<19, "d19">, DwarfRegNum<[275]>; +def D20 : ARMFReg<20, "d20">, DwarfRegNum<[276]>; +def D21 : ARMFReg<21, "d21">, DwarfRegNum<[277]>; +def D22 : ARMFReg<22, "d22">, DwarfRegNum<[278]>; +def D23 : ARMFReg<23, "d23">, DwarfRegNum<[279]>; +def D24 : ARMFReg<24, "d24">, DwarfRegNum<[280]>; +def D25 : ARMFReg<25, "d25">, DwarfRegNum<[281]>; +def D26 : ARMFReg<26, "d26">, DwarfRegNum<[282]>; +def D27 : ARMFReg<27, "d27">, DwarfRegNum<[283]>; +def D28 : ARMFReg<28, "d28">, DwarfRegNum<[284]>; +def D29 : ARMFReg<29, "d29">, DwarfRegNum<[285]>; +def D30 : ARMFReg<30, "d30">, DwarfRegNum<[286]>; +def D31 : ARMFReg<31, "d31">, DwarfRegNum<[287]>; // Advanced SIMD (NEON) defines 16 quad-word aliases let SubRegIndices = [dsub_0, dsub_1], @@ -158,43 +157,28 @@ def Q15 : ARMReg<15, "q15", [D30, D31]>; // starting D register number doesn't have to be multiple of 4, e.g., // D1, D2, D3, D4 would be a legal quad, but that would make the subregister // stuff very messy. -let SubRegIndices = [qsub_0, qsub_1] in { -let CompositeIndices = [(dsub_2 qsub_1, dsub_0), (dsub_3 qsub_1, dsub_1), - (ssub_4 qsub_1, ssub_0), (ssub_5 qsub_1, ssub_1), - (ssub_6 qsub_1, ssub_2), (ssub_7 qsub_1, ssub_3)] in { +let SubRegIndices = [qsub_0, qsub_1], + CompositeIndices = [(dsub_2 qsub_1, dsub_0), (dsub_3 qsub_1, dsub_1)] in { def QQ0 : ARMReg<0, "qq0", [Q0, Q1]>; def QQ1 : ARMReg<1, "qq1", [Q2, Q3]>; def QQ2 : ARMReg<2, "qq2", [Q4, Q5]>; def QQ3 : ARMReg<3, "qq3", [Q6, Q7]>; -} -let CompositeIndices = [(dsub_2 qsub_1, dsub_0), (dsub_3 qsub_1, dsub_1)] in { def QQ4 : ARMReg<4, "qq4", [Q8, Q9]>; def QQ5 : ARMReg<5, "qq5", [Q10, Q11]>; def QQ6 : ARMReg<6, "qq6", [Q12, Q13]>; def QQ7 : ARMReg<7, "qq7", [Q14, Q15]>; } -} // Pseudo 512-bit registers to represent four consecutive Q registers. 
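// --------------------------------------------------------------------------
// Aside (worked example, assuming TableGen's composite sub-register index
// semantics): an entry such as (dsub_2 qsub_1, dsub_0) defines dsub_2 on a
// QQ register as "apply qsub_1, then dsub_0 of the result". With the defs
// above:
//     QQ0:dsub_2 -> (QQ0:qsub_1):dsub_0 -> Q1:dsub_0 -> D2
// The same mechanism one level up, in the QQQQ block just below, gives:
//     QQQQ0:dsub_4 -> (QQQQ0:qqsub_1):dsub_0 -> QQ1:dsub_0 -> D4
// Note that the ssub_* composites are gone: with ssub_4-ssub_15 removed, the
// S sub-registers of QQ/QQQQ registers are no longer addressable by index.
// --------------------------------------------------------------------------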
-let SubRegIndices = [qqsub_0, qqsub_1] in { -let CompositeIndices = [(qsub_2 qqsub_1, qsub_0), (qsub_3 qqsub_1, qsub_1), - (dsub_4 qqsub_1, dsub_0), (dsub_5 qqsub_1, dsub_1), - (dsub_6 qqsub_1, dsub_2), (dsub_7 qqsub_1, dsub_3), - (ssub_8 qqsub_1, ssub_0), (ssub_9 qqsub_1, ssub_1), - (ssub_10 qqsub_1, ssub_2), (ssub_11 qqsub_1, ssub_3), - (ssub_12 qqsub_1, ssub_4), (ssub_13 qqsub_1, ssub_5), - (ssub_14 qqsub_1, ssub_6), (ssub_15 qqsub_1, ssub_7)] in -{ +let SubRegIndices = [qqsub_0, qqsub_1], + CompositeIndices = [(qsub_2 qqsub_1, qsub_0), (qsub_3 qqsub_1, qsub_1), + (dsub_4 qqsub_1, dsub_0), (dsub_5 qqsub_1, dsub_1), + (dsub_6 qqsub_1, dsub_2), (dsub_7 qqsub_1, dsub_3)] in { def QQQQ0 : ARMReg<0, "qqqq0", [QQ0, QQ1]>; def QQQQ1 : ARMReg<1, "qqqq1", [QQ2, QQ3]>; -} -let CompositeIndices = [(qsub_2 qqsub_1, qsub_0), (qsub_3 qqsub_1, qsub_1), - (dsub_4 qqsub_1, dsub_0), (dsub_5 qqsub_1, dsub_1), - (dsub_6 qqsub_1, dsub_2), (dsub_7 qqsub_1, dsub_3)] in { def QQQQ2 : ARMReg<2, "qqqq2", [QQ4, QQ5]>; def QQQQ3 : ARMReg<3, "qqqq3", [QQ6, QQ7]>; } -} // Current Program Status Register. def CPSR : ARMReg<0, "cpsr">; @@ -216,9 +200,8 @@ def FPEXC : ARMReg<8, "fpexc">; // r11 == Frame Pointer (arm-style backtraces) // r10 == Stack Limit // -def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, - R7, R8, R9, R10, R11, R12, - SP, LR, PC]> { +def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12), + SP, LR, PC)> { let MethodProtos = [{ iterator allocation_order_begin(const MachineFunction &MF) const; iterator allocation_order_end(const MachineFunction &MF) const; @@ -262,8 +245,7 @@ def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, // register range for operands, but have undefined behaviours when PC // or SP (R13 or R15) are used. The ARM ISA refers to these operands // via the BadReg() pseudo-code description. -def rGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, - R7, R8, R9, R10, R11, R12, LR]> { +def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> { let MethodProtos = [{ iterator allocation_order_begin(const MachineFunction &MF) const; iterator allocation_order_end(const MachineFunction &MF) const; @@ -307,13 +289,13 @@ def rGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, // Thumb registers are R0-R7 normally. Some instructions can still use // the general GPR register class above (MOV, e.g.) -def tGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> {} +def tGPR : RegisterClass<"ARM", [i32], 32, (trunc GPR, 8)>; // For tail calls, we can't use callee-saved registers, as they are restored // to the saved value before the tail call, which would clobber a call address. // Note, getMinimalPhysRegClass(R0) returns tGPR because of the names of // this class and the preceding one(!) This is what we want. -def tcGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R9, R12]> { +def tcGPR : RegisterClass<"ARM", [i32], 32, (add R0, R1, R2, R3, R9, R12)> { let MethodProtos = [{ iterator allocation_order_begin(const MachineFunction &MF) const; iterator allocation_order_end(const MachineFunction &MF) const; @@ -361,25 +343,18 @@ def tcGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R9, R12]> { // Scalar single precision floating point register class.. 
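// --------------------------------------------------------------------------
// Aside (worked expansions, assuming the documented behavior of TableGen's
// set operators): the dag notation adopted for the classes above is resolved
// before register classes are emitted, and reproduces the old explicit lists:
//     (sequence "R%u", 0, 12)                   -> R0, R1, ..., R12
//     (add (sequence "R%u", 0, 12), SP, LR, PC) -> GPR, in the old order
//     (sub GPR, SP, PC)                         -> R0-R12, LR          (rGPR)
//     (trunc GPR, 8)                            -> R0-R7               (tGPR)
//     (add R0, R1, R2, R3, R9, R12)             -> the list as written (tcGPR)
// --------------------------------------------------------------------------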
-def SPR : RegisterClass<"ARM", [f32], 32, [S0, S1, S2, S3, S4, S5, S6, S7, S8, - S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, - S23, S24, S25, S26, S27, S28, S29, S30, S31]>; +def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)>; // Subset of SPR which can be used as a source of NEON scalars for 16-bit // operations -def SPR_8 : RegisterClass<"ARM", [f32], 32, - [S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15]>; +def SPR_8 : RegisterClass<"ARM", [f32], 32, (trunc SPR, 16)>; // Scalar double precision floating point / generic 64-bit vector register // class. // ARM requires only word alignment for double. It's more performant if it // is double-word alignment though. def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, - [D0, D1, D2, D3, D4, D5, D6, D7, - D8, D9, D10, D11, D12, D13, D14, D15, - D16, D17, D18, D19, D20, D21, D22, D23, - D24, D25, D26, D27, D28, D29, D30, D31]> { + (sequence "D%u", 0, 31)> { let MethodProtos = [{ iterator allocation_order_begin(const MachineFunction &MF) const; iterator allocation_order_end(const MachineFunction &MF) const; @@ -427,22 +402,20 @@ def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, // Subset of DPR that are accessible with VFP2 (and so that also have // 32-bit SPR subregs). def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, - [D0, D1, D2, D3, D4, D5, D6, D7, - D8, D9, D10, D11, D12, D13, D14, D15]> { + (trunc DPR, 16)> { let SubRegClasses = [(SPR ssub_0, ssub_1)]; } // Subset of DPR which can be used as a source of NEON scalars for 16-bit // operations def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, - [D0, D1, D2, D3, D4, D5, D6, D7]> { + (trunc DPR, 8)> { let SubRegClasses = [(SPR_8 ssub_0, ssub_1)]; } // Generic 128-bit vector register class. def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, - Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15]> { + (sequence "Q%u", 0, 15)> { let SubRegClasses = [(DPR dsub_0, dsub_1)]; let MethodProtos = [{ iterator allocation_order_begin(const MachineFunction &MF) const; @@ -471,25 +444,21 @@ def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, // Subset of QPR that have 32-bit SPR subregs. def QPR_VFP2 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - 128, - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]> { + 128, (trunc QPR, 8)> { let SubRegClasses = [(SPR ssub_0, ssub_1, ssub_2, ssub_3), (DPR_VFP2 dsub_0, dsub_1)]; } // Subset of QPR that have DPR_8 and SPR_8 subregs. def QPR_8 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - 128, - [Q0, Q1, Q2, Q3]> { + 128, (trunc QPR, 4)> { let SubRegClasses = [(SPR_8 ssub_0, ssub_1, ssub_2, ssub_3), (DPR_8 dsub_0, dsub_1)]; } // Pseudo 256-bit vector register class to model pairs of Q registers // (4 consecutive D registers). -def QQPR : RegisterClass<"ARM", [v4i64], - 256, - [QQ0, QQ1, QQ2, QQ3, QQ4, QQ5, QQ6, QQ7]> { +def QQPR : RegisterClass<"ARM", [v4i64], 256, (sequence "QQ%u", 0, 7)> { let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3), (QPR qsub_0, qsub_1)]; let MethodProtos = [{ @@ -516,9 +485,7 @@ def QQPR : RegisterClass<"ARM", [v4i64], } // Subset of QQPR that have 32-bit SPR subregs. 
-def QQPR_VFP2 : RegisterClass<"ARM", [v4i64], - 256, - [QQ0, QQ1, QQ2, QQ3]> { +def QQPR_VFP2 : RegisterClass<"ARM", [v4i64], 256, (trunc QQPR, 4)> { let SubRegClasses = [(SPR ssub_0, ssub_1, ssub_2, ssub_3), (DPR_VFP2 dsub_0, dsub_1, dsub_2, dsub_3), (QPR_VFP2 qsub_0, qsub_1)]; @@ -527,9 +494,7 @@ def QQPR_VFP2 : RegisterClass<"ARM", [v4i64], // Pseudo 512-bit vector register class to model 4 consecutive Q registers // (8 consecutive D registers). -def QQQQPR : RegisterClass<"ARM", [v8i64], - 256, - [QQQQ0, QQQQ1, QQQQ2, QQQQ3]> { +def QQQQPR : RegisterClass<"ARM", [v8i64], 256, (sequence "QQQQ%u", 0, 3)> { let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3, dsub_4, dsub_5, dsub_6, dsub_7), (QPR qsub_0, qsub_1, qsub_2, qsub_3)]; @@ -556,4 +521,6 @@ def QQQQPR : RegisterClass<"ARM", [v8i64], } // Condition code registers. -def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]>; +def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> { + let isAllocatable = 0; +} diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index 82c6735..49fedf6 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -656,19 +656,19 @@ def CortexA9Itineraries : ProcessorItineraries< [1, 1, 1]>, // // Single-precision to Integer Move + // + // On A9 move-from-VFP is free to issue with no stall if other VFP + // operations are in flight. I assume it still can't dual-issue though. InstrItinData<IIC_fpMOVSI, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, - InstrStage<1, [A9_MUX0], 0>, - InstrStage<1, [A9_DRegsVFP], 0, Required>, - InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_NPipe]>], + InstrStage<1, [A9_MUX0], 0>], [2, 1]>, // // Double-precision to Integer Move + // + // On A9 move-from-VFP is free to issue with no stall if other VFP + // operations are in flight. I assume it still can't dual-issue though. InstrItinData<IIC_fpMOVDI, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, - InstrStage<1, [A9_MUX0], 0>, - InstrStage<1, [A9_DRegsVFP], 0, Required>, - InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_NPipe]>], + InstrStage<1, [A9_MUX0], 0>], [2, 1, 1]>, // // Single-precision FP Load @@ -691,20 +691,22 @@ def CortexA9Itineraries : ProcessorItineraries< [2, 1]>, // // FP Load Multiple + // FIXME: assumes 2 doubles which requires 2 LS cycles. InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1]>, // // FP Load Multiple + update + // FIXME: assumes 2 doubles which requires 2 LS cycles. InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1]>, // // Single-precision FP Store InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -725,205 +727,206 @@ def CortexA9Itineraries : ProcessorItineraries< [1, 1]>, // // FP Store Multiple + // FIXME: assumes 2 doubles which requires 2 LS cycles. 
InstrItinData<IIC_fpStore_m,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1]>, // // FP Store Multiple + update + // FIXME: assumes 2 doubles which requires 2 LS cycles. InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1]>, // NEON // VLD1 - // FIXME: Conservatively assume insufficent alignment. InstrItinData<IIC_VLD1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [2, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1]>, // VLD1x2 InstrItinData<IIC_VLD1x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [2, 2, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 1]>, // VLD1x3 InstrItinData<IIC_VLD1x3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [2, 2, 3, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 1]>, // VLD1x4 InstrItinData<IIC_VLD1x4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [2, 2, 3, 3, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 2, 1]>, // VLD1u InstrItinData<IIC_VLD1u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [2, 2, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 2, 1]>, // VLD1x2u InstrItinData<IIC_VLD1x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [2, 2, 2, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 2, 1]>, // VLD1x3u InstrItinData<IIC_VLD1x3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [2, 2, 3, 2, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 2, 1]>, // 
VLD1x4u InstrItinData<IIC_VLD1x4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [2, 2, 3, 3, 2, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 2, 2, 1]>, // // VLD1ln InstrItinData<IIC_VLD1ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [4, 1, 1, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 1, 1, 1]>, // // VLD1lnu InstrItinData<IIC_VLD1lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [4, 2, 1, 1, 1, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 2, 1, 1, 1, 1]>, // // VLD1dup InstrItinData<IIC_VLD1dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [3, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1]>, // // VLD1dupu InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [3, 2, 1, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 1, 1]>, // // VLD2 InstrItinData<IIC_VLD2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [3, 3, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 1]>, // // VLD2x2 InstrItinData<IIC_VLD2x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [3, 4, 3, 4, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 3, 2, 3, 1]>, // // VLD2ln InstrItinData<IIC_VLD2ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [4, 4, 1, 1, 1, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 3, 1, 1, 1, 1]>, // // VLD2u InstrItinData<IIC_VLD2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, 
[A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [3, 3, 2, 1, 1, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 2, 1, 1, 1]>, // // VLD2x2u InstrItinData<IIC_VLD2x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [3, 4, 3, 4, 2, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 3, 2, 3, 2, 1]>, // // VLD2lnu InstrItinData<IIC_VLD2lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [4, 4, 2, 1, 1, 1, 1, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 3, 2, 1, 1, 1, 1, 1]>, // // VLD2dup InstrItinData<IIC_VLD2dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [3, 3, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 1]>, // // VLD2dupu InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [3, 3, 2, 1, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 2, 1, 1]>, // // VLD3 InstrItinData<IIC_VLD3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<10,[A9_DRegsVFP], 0, Reserved>, - InstrStage<4, [A9_NPipe], 0>, - InstrStage<4, [A9_LSUnit]>], - [4, 4, 5, 1]>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 1]>, // // VLD3ln InstrItinData<IIC_VLD3ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -938,10 +941,10 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VLD3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<10,[A9_DRegsVFP], 0, Reserved>, - InstrStage<4, [A9_NPipe], 0>, - InstrStage<4, [A9_LSUnit]>], - [4, 4, 5, 2, 1]>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 2, 1]>, // // VLD3lnu InstrItinData<IIC_VLD3lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -974,108 +977,108 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VLD4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<10,[A9_DRegsVFP], 0, Reserved>, - InstrStage<4, [A9_NPipe], 0>, - InstrStage<4, [A9_LSUnit]>], - [4, 4, 5, 5, 1]>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 4, 1]>, // // VLD4ln InstrItinData<IIC_VLD4ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<11,[A9_DRegsVFP], 0, Reserved>, - 
InstrStage<5, [A9_NPipe], 0>, - InstrStage<5, [A9_LSUnit]>], - [5, 5, 6, 6, 1, 1, 1, 1, 2, 2]>, + InstrStage<10,[A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe], 0>, + InstrStage<4, [A9_LSUnit]>], + [4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>, // // VLD4u InstrItinData<IIC_VLD4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<10,[A9_DRegsVFP], 0, Reserved>, - InstrStage<4, [A9_NPipe], 0>, - InstrStage<4, [A9_LSUnit]>], - [4, 4, 5, 5, 2, 1]>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 4, 2, 1]>, // // VLD4lnu InstrItinData<IIC_VLD4lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<11,[A9_DRegsVFP], 0, Reserved>, - InstrStage<5, [A9_NPipe], 0>, - InstrStage<5, [A9_LSUnit]>], - [5, 5, 6, 6, 2, 1, 1, 1, 1, 1, 2, 2]>, + InstrStage<10,[A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe], 0>, + InstrStage<4, [A9_LSUnit]>], + [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>, // // VLD4dup InstrItinData<IIC_VLD4dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [3, 3, 4, 4, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 2, 3, 3, 1]>, // // VLD4dupu InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [3, 3, 4, 4, 2, 1, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 2, 3, 3, 2, 1, 1]>, // // VST1 InstrItinData<IIC_VST1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, // // VST1x2 InstrItinData<IIC_VST1x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>, // // VST1x3 InstrItinData<IIC_VST1x3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 2]>, // // VST1x4 InstrItinData<IIC_VST1x4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 2, 2]>, // // VST1u InstrItinData<IIC_VST1u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, 
InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1, 1]>, // // VST1x2u InstrItinData<IIC_VST1x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1]>, // // VST1x3u @@ -1083,44 +1086,44 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1, 2]>, // // VST1x4u InstrItinData<IIC_VST1x4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1, 2, 2]>, // // VST1ln InstrItinData<IIC_VST1ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, // // VST1lnu InstrItinData<IIC_VST1lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1, 1]>, // // VST2 InstrItinData<IIC_VST2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>, // // VST2x2 @@ -1136,9 +1139,9 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VST2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1]>, // // VST2x2u @@ -1154,36 +1157,36 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VST2ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>, // // VST2lnu InstrItinData<IIC_VST2lnu, 
[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1]>, // // VST3 InstrItinData<IIC_VST3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 2]>, // // VST3u InstrItinData<IIC_VST3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1, 2]>, // // VST3ln @@ -1208,36 +1211,36 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VST4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 2, 2]>, // // VST4u InstrItinData<IIC_VST4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1, 2, 2]>, // // VST4ln InstrItinData<IIC_VST4ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 2, 2]>, // // VST4lnu InstrItinData<IIC_VST4lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1, 2, 2]>, // diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp index 2b9202b..ef0aaf2 100644 --- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -13,6 +13,8 @@ #define DEBUG_TYPE "arm-selectiondag-info" #include "ARMTargetMachine.h" +#include "llvm/DerivedTypes.h" +#include "llvm/CodeGen/SelectionDAG.h" using namespace llvm; ARMSelectionDAGInfo::ARMSelectionDAGInfo(const TargetMachine &TM) @@ -35,7 +37,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, // This requires 4-byte alignment. 
if ((Align & 3) != 0) return SDValue(); - // This requires the copy size to be a constant, preferrably + // This requires the copy size to be a constant, preferably // within a subtarget-specific limit. ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); if (!ConstantSize) @@ -132,3 +134,65 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, } return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); } + +// Adjust parameters for memset, EABI uses format (ptr, size, value), +// GNU library uses (ptr, value, size) +// See RTABI section 4.3.4 +SDValue +ARMSelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, + SDValue Chain, SDValue Dst, + SDValue Src, SDValue Size, + unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo) const +{ + // Use default for non AAPCS subtargets + if (!Subtarget->isAAPCS_ABI()) + return SDValue(); + + const ARMTargetLowering &TLI = + *static_cast<const ARMTargetLowering*>(DAG.getTarget().getTargetLowering()); + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + + // First argument: data pointer + const Type *IntPtrTy = TLI.getTargetData()->getIntPtrType(*DAG.getContext()); + Entry.Node = Dst; + Entry.Ty = IntPtrTy; + Args.push_back(Entry); + + // Second argument: buffer size + Entry.Node = Size; + Entry.Ty = IntPtrTy; + Entry.isSExt = false; + Args.push_back(Entry); + + // Extend or truncate the argument to be an i32 value for the call. + if (Src.getValueType().bitsGT(MVT::i32)) + Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src); + else + Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src); + + // Third argument: value to fill + Entry.Node = Src; + Entry.Ty = Type::getInt32Ty(*DAG.getContext()); + Entry.isSExt = true; + Args.push_back(Entry); + + // Emit __eabi_memset call + std::pair<SDValue,SDValue> CallResult = + TLI.LowerCallTo(Chain, + Type::getVoidTy(*DAG.getContext()), // return type + false, // return sign ext + false, // return zero ext + false, // is var arg + false, // is in regs + 0, // number of fixed arguments + TLI.getLibcallCallingConv(RTLIB::MEMSET), // call conv + false, // is tail call + false, // is return val used + DAG.getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET), + TLI.getPointerTy()), // callee + Args, DAG, dl); // arg list, DAG and debug + + return CallResult.second; +} diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.h b/lib/Target/ARM/ARMSelectionDAGInfo.h index 7533690..ec1bf5c 100644 --- a/lib/Target/ARM/ARMSelectionDAGInfo.h +++ b/lib/Target/ARM/ARMSelectionDAGInfo.h @@ -35,6 +35,15 @@ public: bool isVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const; + + // Adjust parameters for memset, see RTABI section 4.3.4 + virtual + SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, + SDValue Chain, + SDValue Op1, SDValue Op2, + SDValue Op3, unsigned Align, + bool isVolatile, + MachinePointerInfo DstPtrInfo) const; }; } diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 2a9d480..c6f266b 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -52,6 +52,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS, , HasT2ExtractPack(false) , HasDataBarrier(false) , Pref32BitThumb(false) + , AvoidCPSRPartialUpdate(false) , HasMPExtension(false) , FPOnlySP(false) , AllowsUnalignedMem(false) diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index e024182..0271c87 100644 --- 
a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -110,6 +110,11 @@ protected: /// over 16-bit ones. bool Pref32BitThumb; + /// AvoidCPSRPartialUpdate - If true, codegen would avoid using instructions + /// that partially update CPSR and add false dependency on the previous + /// CPSR setting instruction. + bool AvoidCPSRPartialUpdate; + /// HasMPExtension - True if the subtarget supports Multiprocessing /// extension (ARMv7 only). bool HasMPExtension; @@ -190,12 +195,15 @@ protected: bool isFPBrccSlow() const { return SlowFPBrcc; } bool isFPOnlySP() const { return FPOnlySP; } bool prefers32BitThumb() const { return Pref32BitThumb; } + bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; } bool hasMPExtension() const { return HasMPExtension; } bool hasFP16() const { return HasFP16; } bool hasD16() const { return HasD16; } - bool isTargetDarwin() const { return TargetTriple.getOS() == Triple::Darwin; } + const Triple &getTargetTriple() const { return TargetTriple; } + + bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } bool isTargetELF() const { return !isTargetDarwin(); } bool isAPCS_ABI() const { return TargetABI == ARM_ABI_APCS; } diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 6ec6747..29aa4f7 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -22,16 +22,13 @@ #include "llvm/Target/TargetRegistry.h" using namespace llvm; -static cl::opt<bool>ExpandMLx("expand-fp-mlx", cl::init(false), cl::Hidden); - static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) { Triple TheTriple(TT); - switch (TheTriple.getOS()) { - case Triple::Darwin: + + if (TheTriple.isOSDarwin()) return new ARMMCAsmInfoDarwin(); - default: - return new ARMELFMCAsmInfo(); - } + + return new ARMELFMCAsmInfo(); } // This is duplicated code. Refactor this. @@ -41,17 +38,17 @@ static MCStreamer *createMCStreamer(const Target &T, const std::string &TT, MCCodeEmitter *Emitter, bool RelaxAll, bool NoExecStack) { - switch (Triple(TT).getOS()) { - case Triple::Darwin: + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin()) return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: + + if (TheTriple.isOSWindows()) { llvm_unreachable("ARM does not support Windows COFF format"); return NULL; - default: - return createELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack); } + + return createELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack); } extern "C" void LLVMInitializeARMTarget() { @@ -148,8 +145,7 @@ bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM, // FIXME: temporarily disabling load / store optimization pass for Thumb1. 
if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only()) PM.add(createARMLoadStoreOptimizationPass(true)); - if (ExpandMLx && - OptLevel != CodeGenOpt::None && Subtarget.hasVFP2()) + if (OptLevel != CodeGenOpt::None && Subtarget.isCortexA9()) PM.add(createMLxExpansionPass()); return true; diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 05b2b46..4bc12c9 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -15,6 +15,7 @@ #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCExpr.h" @@ -114,6 +115,7 @@ class ARMAsmParser : public TargetAsmParser { public: ARMAsmParser(const Target &T, MCAsmParser &_Parser, TargetMachine &_TM) : TargetAsmParser(T), Parser(_Parser), TM(_TM) { + MCAsmParserExtension::Initialize(_Parser); // Initialize the set of available features. setAvailableFeatures(ComputeAvailableFeatures( &TM.getSubtarget<ARMSubtarget>())); @@ -1239,6 +1241,8 @@ tryParseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { FlagsVal = 0; // No flag } } else if (SpecReg == "cpsr" || SpecReg == "spsr") { + if (Flags == "all") // cpsr_all is an alias for cpsr_fc + Flags = "fc"; for (int i = 0, e = Flags.size(); i != e; ++i) { unsigned Flag = StringSwitch<unsigned>(Flags.substr(i, 1)) .Case("c", 1) @@ -1826,10 +1830,11 @@ GetMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet, Mnemonic == "rrx" || Mnemonic == "ror" || Mnemonic == "sub" || Mnemonic == "smull" || Mnemonic == "add" || Mnemonic == "adc" || Mnemonic == "mul" || Mnemonic == "bic" || Mnemonic == "asr" || - Mnemonic == "umlal" || Mnemonic == "orr" || Mnemonic == "mov" || + Mnemonic == "umlal" || Mnemonic == "orr" || Mnemonic == "mvn" || Mnemonic == "rsb" || Mnemonic == "rsc" || Mnemonic == "orn" || Mnemonic == "sbc" || Mnemonic == "mla" || Mnemonic == "umull" || - Mnemonic == "eor" || Mnemonic == "smlal" || Mnemonic == "mvn") { + Mnemonic == "eor" || Mnemonic == "smlal" || + (Mnemonic == "mov" && !isThumb)) { CanAcceptCarrySet = true; } else { CanAcceptCarrySet = false; @@ -1848,7 +1853,8 @@ GetMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet, if (isThumb) if (Mnemonic == "bkpt" || Mnemonic == "mcr" || Mnemonic == "mcrr" || - Mnemonic == "mrc" || Mnemonic == "mrrc" || Mnemonic == "cdp") + Mnemonic == "mrc" || Mnemonic == "mrrc" || Mnemonic == "cdp" || + Mnemonic == "mov") CanAcceptPredicationCode = false; } @@ -2098,15 +2104,29 @@ bool ARMAsmParser::ParseDirectiveThumb(SMLoc L) { /// ParseDirectiveThumbFunc /// ::= .thumbfunc symbol_name bool ARMAsmParser::ParseDirectiveThumbFunc(SMLoc L) { - const AsmToken &Tok = Parser.getTok(); - if (Tok.isNot(AsmToken::Identifier) && Tok.isNot(AsmToken::String)) - return Error(L, "unexpected token in .thumb_func directive"); - StringRef Name = Tok.getString(); - Parser.Lex(); // Consume the identifier token. 
+ const MCAsmInfo &MAI = getParser().getStreamer().getContext().getAsmInfo(); + bool isMachO = MAI.hasSubsectionsViaSymbols(); + StringRef Name; + + // Darwin asm has the function name after the .thumb_func directive + // ELF doesn't + if (isMachO) { + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier) && Tok.isNot(AsmToken::String)) + return Error(L, "unexpected token in .thumb_func directive"); + Name = Tok.getString(); + Parser.Lex(); // Consume the identifier token. + } + if (getLexer().isNot(AsmToken::EndOfStatement)) return Error(L, "unexpected token in directive"); Parser.Lex(); + // FIXME: assuming function name will be the line following .thumb_func + if (!isMachO) { + Name = Parser.getTok().getString(); + } + // Mark symbol as a thumb symbol. MCSymbol *Func = getParser().getContext().GetOrCreateSymbol(Name); getParser().getStreamer().EmitThumbFunc(Func); diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 74f8b53..bdce2c4 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -422,6 +422,10 @@ bool ARMDisassembler::getInstruction(MCInst &MI, if (!Builder) return false; + Builder->setupBuilderForSymbolicDisassembly(getLLVMOpInfoCallback(), + getDisInfoBlock(), getMCContext(), + Address); + if (!Builder->Build(MI, insn)) return false; @@ -433,7 +437,7 @@ bool ThumbDisassembler::getInstruction(MCInst &MI, const MemoryObject &Region, uint64_t Address, raw_ostream &os) const { - // The Thumb instruction stream is a sequence of halhwords. + // The Thumb instruction stream is a sequence of halfwords. // This represents the first halfword as well as the machine instruction // passed to decodeThumbInstruction(). For 16-bit Thumb instruction, the top @@ -504,6 +508,10 @@ bool ThumbDisassembler::getInstruction(MCInst &MI, Builder->SetSession(const_cast<Session *>(&SO)); + Builder->setupBuilderForSymbolicDisassembly(getLLVMOpInfoCallback(), + getDisInfoBlock(), getMCContext(), + Address); + if (!Builder->Build(MI, insn)) return false; diff --git a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp index ca67e5e..271ca8c 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp @@ -17,6 +17,7 @@ #include "ARMDisassemblerCore.h" #include "ARMAddressingModes.h" +#include "ARMMCExpr.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -532,17 +533,18 @@ static bool BadRegsMulFrm(unsigned Opcode, uint32_t insn) { switch (Opcode) { default: // Did we miss an opcode? 
- assert(0 && "Unexpected opcode!"); + DEBUG(errs() << "BadRegsMulFrm: unexpected opcode!"); return false; case ARM::MLA: case ARM::MLS: case ARM::SMLABB: case ARM::SMLABT: case ARM::SMLATB: case ARM::SMLATT: case ARM::SMLAWB: case ARM::SMLAWT: - case ARM::SMMLA: case ARM::SMMLS: case ARM::USADA8: + case ARM::SMMLA: case ARM::SMMLAR: case ARM::SMMLS: case ARM::SMMLSR: + case ARM::USADA8: if (R19_16 == 15 || R15_12 == 15 || R11_8 == 15 || R3_0 == 15) return true; return false; - case ARM::MUL: case ARM::SMMUL: case ARM::SMULBB: case ARM::SMULBT: - case ARM::SMULTB: case ARM::SMULTT: case ARM::SMULWB: case ARM::SMULWT: - case ARM::SMUAD: case ARM::SMUADX: + case ARM::MUL: case ARM::SMMUL: case ARM::SMMULR: + case ARM::SMULBB: case ARM::SMULBT: case ARM::SMULTB: case ARM::SMULTT: + case ARM::SMULWB: case ARM::SMULWT: case ARM::SMUAD: case ARM::SMUADX: // A8.6.167 SMLAD & A8.6.172 SMLSD case ARM::SMLAD: case ARM::SMLADX: case ARM::SMLSD: case ARM::SMLSDX: case ARM::USAD8: @@ -562,14 +564,14 @@ static bool BadRegsMulFrm(unsigned Opcode, uint32_t insn) { } // Multiply Instructions. -// MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB, SMLAWT, SMMLA, SMMLS, -// SMLAD, SMLADX, SMLSD, SMLSDX, USADA8 (for convenience): +// MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB, SMLAWT, SMMLA, SMMLAR, +// SMMLS, SMMLAR, SMLAD, SMLADX, SMLSD, SMLSDX, and USADA8 (for convenience): // Rd{19-16} Rn{3-0} Rm{11-8} Ra{15-12} // But note that register checking for {SMLAD, SMLADX, SMLSD, SMLSDX} is // only for {d, n, m}. // -// MUL, SMMUL, SMULBB, SMULBT, SMULTB, SMULTT, SMULWB, SMULWT, SMUAD, SMUADX, -// USAD8 (for convenience): +// MUL, SMMUL, SMMULR, SMULBB, SMULBT, SMULTB, SMULTT, SMULWB, SMULWT, SMUAD, +// SMUADX, and USAD8 (for convenience): // Rd{19-16} Rn{3-0} Rm{11-8} // // SMLAL, SMULL, UMAAL, UMLAL, UMULL, SMLALBB, SMLALBT, SMLALTB, SMLALTT, @@ -893,8 +895,9 @@ static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn, } // Misc. Branch Instructions. -// BLX, BLXi, BX -// BX, BX_RET +// BX_RET, MOVPCLR +// BLX, BLX_pred, BX, BX_pred +// BLXi static bool DisassembleBrMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { @@ -911,7 +914,7 @@ static bool DisassembleBrMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // BLX and BX take one GPR reg. if (Opcode == ARM::BLX || Opcode == ARM::BLX_pred || - Opcode == ARM::BX) { + Opcode == ARM::BX || Opcode == ARM::BX_pred) { assert(NumOps >= 1 && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && "Reg operand expected"); MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, @@ -955,7 +958,7 @@ static bool BadRegsDPFrm(unsigned Opcode, uint32_t insn) { switch (Opcode) { default: // Did we miss an opcode? - if (decodeRd(insn) == 15 | decodeRn(insn) == 15 || decodeRm(insn) == 15) { + if (decodeRd(insn) == 15 || decodeRn(insn) == 15 || decodeRm(insn) == 15) { DEBUG(errs() << "DPFrm with bad reg specifier(s)\n"); return true; } @@ -1065,7 +1068,8 @@ static bool DisassembleDPFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // We have an imm16 = imm4:imm12 (imm4=Inst{19:16}, imm12 = Inst{11:0}). assert(getIBit(insn) == 1 && "I_Bit != '1' reg/imm form"); unsigned Imm16 = slice(insn, 19, 16) << 12 | slice(insn, 11, 0); - MI.addOperand(MCOperand::CreateImm(Imm16)); + if (!B->tryAddingSymbolicOperand(Imm16, 4, MI)) + MI.addOperand(MCOperand::CreateImm(Imm16)); ++OpIdx; } else { // We have a reg/imm form. 
@@ -1172,6 +1176,71 @@ static bool DisassembleDPSoRegFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return true; } +static bool BadRegsLdStFrm(unsigned Opcode, uint32_t insn, bool Store, bool WBack, + bool Imm) { + const StringRef Name = ARMInsts[Opcode].Name; + unsigned Rt = decodeRd(insn); + unsigned Rn = decodeRn(insn); + unsigned Rm = decodeRm(insn); + unsigned P = getPBit(insn); + unsigned W = getWBit(insn); + + if (Store) { + // Only STR (immediate, register) allows PC as the source. + if (Name.startswith("STRB") && Rt == 15) { + DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n"); + return true; + } + if (WBack && (Rn == 15 || Rn == Rt)) { + DEBUG(errs() << "if wback && (n == 15 || n == t) then UNPREDICTABLE\n"); + return true; + } + if (!Imm && Rm == 15) { + DEBUG(errs() << "if m == 15 then UNPREDICTABLE\n"); + return true; + } + } else { + // Only LDR (immediate, register) allows PC as the destination. + if (Name.startswith("LDRB") && Rt == 15) { + DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n"); + return true; + } + if (Imm) { + // Immediate + if (Rn == 15) { + // The literal form must be in offset mode; it's an encoding error + // otherwise. + if (!(P == 1 && W == 0)) { + DEBUG(errs() << "Ld literal form with !(P == 1 && W == 0)\n"); + return true; + } + // LDRB (literal) does not allow PC as the destination. + if (Opcode != ARM::LDRi12 && Rt == 15) { + DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n"); + return true; + } + } else { + // Write back while Rn == Rt does not make sense. + if (WBack && (Rn == Rt)) { + DEBUG(errs() << "if wback && n == t then UNPREDICTABLE\n"); + return true; + } + } + } else { + // Register + if (Rm == 15) { + DEBUG(errs() << "if m == 15 then UNPREDICTABLE\n"); + return true; + } + if (WBack && (Rn == 15 || Rn == Rt)) { + DEBUG(errs() << "if wback && (n == 15 || n == t) then UNPREDICTABLE\n"); + return true; + } + } + } + return false; +} + static bool DisassembleLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, bool isStore, BO B) { @@ -1234,6 +1303,9 @@ static bool DisassembleLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn, if (OpIdx + 1 >= NumOps) return false; + if (BadRegsLdStFrm(Opcode, insn, isStore, isPrePost, getIBit(insn)==0)) + return false; + ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub; unsigned IndexMode = (TID.TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift; @@ -2586,6 +2658,39 @@ static bool DisassembleNLdSt(MCInst &MI, unsigned Opcode, uint32_t insn, // <size> == 32 && Inst{6} == 1 --> DblSpaced = true if (Name.endswith("32") || Name.endswith("32_UPD")) DblSpaced = slice(insn, 6, 6) == 1; + } else if (Name.find("DUP") != std::string::npos) { + // Single element (or structure) to all lanes. + // Inst{9-8} encodes the number of element(s) in the structure, with: + // 0b00 (VLD1DUP) (for this, a bit makes sense only for data size 16 and 32. + // 0b01 (VLD2DUP) + // 0b10 (VLD3DUP) (for this, a bit must be encoded as 0) + // 0b11 (VLD4DUP) + // + // Inst{7-6} encodes the data size, with: + // 0b00 => 8, 0b01 => 16, 0b10 => 32 + // + // Inst{4} (the a bit) encodes the align action (0: standard alignment) + unsigned elem = slice(insn, 9, 8) + 1; + unsigned a = slice(insn, 4, 4); + if (elem != 3) { + // 0b11 is not a valid encoding for Inst{7-6}. + if (slice(insn, 7, 6) == 3) + return false; + unsigned data_size = 8 << slice(insn, 7, 6); + // For VLD1DUP, a bit makes sense only for data size of 16 and 32. 
+ if (a && data_size == 8) + return false; + + // Now we can calculate the alignment! + if (a) + alignment = elem * data_size; + } else { + if (a) { + // A8.6.315 VLD3 (single 3-element structure to all lanes) + // The a bit must be encoded as 0. + return false; + } + } } else { // Multiple n-element structures with type encoded as Inst{11-8}. // See, for example, A8.6.316 VLD4 (multiple 4-element structures). @@ -3211,17 +3316,6 @@ static bool DisassembleNDupFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return true; } -// A8.6.41 DMB -// A8.6.42 DSB -// A8.6.49 ISB -static inline bool MemBarrierInstr(uint32_t insn) { - unsigned op7_4 = slice(insn, 7, 4); - if (slice(insn, 31, 8) == 0xf57ff0 && (op7_4 >= 4 && op7_4 <= 6)) - return true; - - return false; -} - static inline bool PreLoadOpcode(unsigned Opcode) { switch(Opcode) { case ARM::PLDi12: case ARM::PLDrs: @@ -3285,14 +3379,20 @@ static bool DisassemblePreLoadFrm(MCInst &MI, unsigned Opcode, uint32_t insn, static bool DisassembleMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { - if (MemBarrierInstr(insn)) { - // DMBsy, DSBsy, and ISBsy instructions have zero operand and are taken care - // of within the generic ARMBasicMCBuilder::BuildIt() method. - // + if (Opcode == ARM::DMB || Opcode == ARM::DSB) { // Inst{3-0} encodes the memory barrier option for the variants. - MI.addOperand(MCOperand::CreateImm(slice(insn, 3, 0))); - NumOpsAdded = 1; - return true; + unsigned opt = slice(insn, 3, 0); + switch (opt) { + case ARM_MB::SY: case ARM_MB::ST: + case ARM_MB::ISH: case ARM_MB::ISHST: + case ARM_MB::NSH: case ARM_MB::NSHST: + case ARM_MB::OSH: case ARM_MB::OSHST: + MI.addOperand(MCOperand::CreateImm(opt)); + NumOpsAdded = 1; + return true; + default: + return false; + } } switch (Opcode) { @@ -3559,11 +3659,17 @@ bool ARMBasicMCBuilder::TryPredicateAndSBitModifier(MCInst& MI, unsigned Opcode, // like ARM. // // A8.6.16 B - if (Name == "t2Bcc") - MI.addOperand(MCOperand::CreateImm(CondCode(slice(insn, 25, 22)))); - else if (Name == "tBcc") - MI.addOperand(MCOperand::CreateImm(CondCode(slice(insn, 11, 8)))); - else + // Check for undefined encodings. + unsigned cond; + if (Name == "t2Bcc") { + if ((cond = slice(insn, 25, 22)) >= 14) + return false; + MI.addOperand(MCOperand::CreateImm(CondCode(cond))); + } else if (Name == "tBcc") { + if ((cond = slice(insn, 11, 8)) == 14) + return false; + MI.addOperand(MCOperand::CreateImm(CondCode(cond))); + } else MI.addOperand(MCOperand::CreateImm(ARMCC::AL)); } else { // ARM instructions get their condition field from Inst{31-28}. @@ -3632,3 +3738,84 @@ ARMBasicMCBuilder *llvm::CreateMCBuilder(unsigned Opcode, ARMFormat Format) { return new ARMBasicMCBuilder(Opcode, Format, ARMInsts[Opcode].getNumOperands()); } + +/// tryAddingSymbolicOperand - tryAddingSymbolicOperand tries to add a symbolic +/// operand in place of the immediate Value in the MCInst. The immediate +/// Value has had any PC adjustment made by the caller. If the getOpInfo() +/// function was set as part of the setupBuilderForSymbolicDisassembly() call +/// then that function is called to get any symbolic information at the +/// builder's Address for this instruction. If that returns non-zero then the +/// symbolic information it returns is used to create an MCExpr and that is +/// added as an operand to the MCInst. This function returns true if it adds +/// an operand to the MCInst and false otherwise. 
+bool ARMBasicMCBuilder::tryAddingSymbolicOperand(uint64_t Value, + uint64_t InstSize, + MCInst &MI) { + if (!GetOpInfo) + return false; + + struct LLVMOpInfo1 SymbolicOp; + SymbolicOp.Value = Value; + if (!GetOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) + return false; + + const MCExpr *Add = NULL; + if (SymbolicOp.AddSymbol.Present) { + if (SymbolicOp.AddSymbol.Name) { + StringRef Name(SymbolicOp.AddSymbol.Name); + MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name); + Add = MCSymbolRefExpr::Create(Sym, *Ctx); + } else { + Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, *Ctx); + } + } + + const MCExpr *Sub = NULL; + if (SymbolicOp.SubtractSymbol.Present) { + if (SymbolicOp.SubtractSymbol.Name) { + StringRef Name(SymbolicOp.SubtractSymbol.Name); + MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name); + Sub = MCSymbolRefExpr::Create(Sym, *Ctx); + } else { + Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, *Ctx); + } + } + + const MCExpr *Off = NULL; + if (SymbolicOp.Value != 0) + Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx); + + const MCExpr *Expr; + if (Sub) { + const MCExpr *LHS; + if (Add) + LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx); + else + LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx); + if (Off != 0) + Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx); + else + Expr = LHS; + } else if (Add) { + if (Off != 0) + Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx); + else + Expr = Add; + } else { + if (Off != 0) + Expr = Off; + else + Expr = MCConstantExpr::Create(0, *Ctx); + } + + if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_HI16) + MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateUpper16(Expr, *Ctx))); + else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_LO16) + MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateLower16(Expr, *Ctx))); + else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_None) + MI.addOperand(MCOperand::CreateExpr(Expr)); + else + assert(0 && "bad SymbolicOp.VariantKind"); + + return true; +} diff --git a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h index 56dd85a..a7ba141 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h +++ b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h @@ -22,12 +22,17 @@ #define ARMDISASSEMBLERCORE_H #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCContext.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm-c/Disassembler.h" #include "ARMBaseInstrInfo.h" #include "ARMRegisterInfo.h" #include "ARMDisassembler.h" namespace llvm { +class MCContext; class ARMUtils { public: @@ -134,6 +139,31 @@ static inline void setSlice(unsigned &Bits, unsigned From, unsigned To, Bits |= (Val & Mask) << To; } +// Return an integer result equal to the number of bits of x that are ones. +static inline uint32_t +BitCount (uint64_t x) +{ + // c accumulates the total bits set in x + uint32_t c; + for (c = 0; x; ++c) + { + x &= x - 1; // clear the least significant bit set + } + return c; +} + +static inline bool +BitIsSet (const uint64_t value, const uint64_t bit) +{ + return (value & (1ull << bit)) != 0; +} + +static inline bool +BitIsClear (const uint64_t value, const uint64_t bit) +{ + return (value & (1ull << bit)) == 0; +} + /// Various utilities for checking the target specific flags. /// A unary data processing instruction doesn't have an Rn operand. 
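The BitCount() added to ARMDisassemblerCore.h above is Kernighan's popcount: each x &= x - 1 clears the lowest set bit, so the loop body runs once per set bit. The Thumb LDM/STM hunk later in this patch uses it to reject an empty register list as UNPREDICTABLE. A self-contained sketch; the register-list value is a hypothetical example:

#include <cassert>
#include <cstdint>

static uint32_t BitCount(uint64_t x) {
  uint32_t c;
  for (c = 0; x; ++c)
    x &= x - 1;  // clear the least significant set bit each pass
  return c;
}

int main() {
  uint64_t RegListBits = 0x85;         // hypothetical reglist {r0, r2, r7}
  assert(BitCount(RegListBits) == 3);
  assert(BitCount(0) < 1);             // empty list: the decoder returns false
  return 0;
}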
@@ -202,7 +232,7 @@ private: public: ARMBasicMCBuilder(ARMBasicMCBuilder &B) : Opcode(B.Opcode), Format(B.Format), NumOps(B.NumOps), Disasm(B.Disasm), - SP(B.SP) { + SP(B.SP), GetOpInfo(0), DisInfo(0), Ctx(0) { Err = 0; } @@ -261,6 +291,44 @@ private: assert(SP); return slice(SP->ITState, 7, 4); } + +private: + // + // Hooks for symbolic disassembly via the public 'C' interface. + // + // The function to get the symbolic information for operands. + LLVMOpInfoCallback GetOpInfo; + // The pointer to the block of symbolic information for above call back. + void *DisInfo; + // The assembly context for creating symbols and MCExprs in place of + // immediate operands when there is symbolic information. + MCContext *Ctx; + // The address of the instruction being disassembled. + uint64_t Address; + +public: + void setupBuilderForSymbolicDisassembly(LLVMOpInfoCallback getOpInfo, + void *disInfo, MCContext *ctx, + uint64_t address) { + GetOpInfo = getOpInfo; + DisInfo = disInfo; + Ctx = ctx; + Address = address; + } + + uint64_t getBuilderAddress() const { return Address; } + + /// tryAddingSymbolicOperand - tryAddingSymbolicOperand tries to add a symbolic + /// operand in place of the immediate Value in the MCInst. The immediate + /// Value has had any PC adjustment made by the caller. If the getOpInfo() + /// function was set as part of the setupBuilderForSymbolicDisassembly() call + /// then that function is called to get any symbolic information at the + /// builder's Address for this instruction. If that returns non-zero then the + /// symbolic information it returns is used to create an MCExpr and that is + /// added as an operand to the MCInst. This function returns true if it adds + /// an operand to the MCInst and false otherwise. + bool tryAddingSymbolicOperand(uint64_t Value, uint64_t InstSize, MCInst &MI); + }; } // namespace llvm diff --git a/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h b/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h index 066a8e3..9639c8a 100644 --- a/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h +++ b/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h @@ -108,6 +108,8 @@ static inline bool IsGPR(unsigned RegClass) { // Utilities for 32-bit Thumb instructions. +static inline bool BadReg(uint32_t n) { return n == 13 || n == 15; } + // Extract imm4: Inst{19-16}. static inline unsigned getImm4(uint32_t insn) { return slice(insn, 19, 16); @@ -398,9 +400,17 @@ static bool DisassembleThumb1General(MCInst &MI, unsigned Opcode, uint32_t insn, assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); - MI.addOperand(MCOperand::CreateImm(UseRt ? getT1Imm8(insn) - : (Imm3 ? 
getT1Imm3(insn) - : getT1Imm5(insn)))); + unsigned Imm = 0; + if (UseRt) + Imm = getT1Imm8(insn); + else if (Imm3) + Imm = getT1Imm3(insn); + else { + Imm = getT1Imm5(insn); + ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 12, 11)); + getImmShiftSE(ShOp, Imm); + } + MI.addOperand(MCOperand::CreateImm(Imm)); } ++OpIdx; @@ -466,9 +476,11 @@ static bool DisassembleThumb1DP(MCInst &MI, unsigned Opcode, uint32_t insn, // tADDhirr: Rd Rd(TIED_TO) Rm // tCMPhir: Rd Rm // tMOVr, tMOVgpr2gpr, tMOVgpr2tgpr, tMOVtgpr2gpr: Rd|tRd Rm|tRn +// tBX: Rm // tBX_RET: 0 operand // tBX_RET_vararg: Rm // tBLXr_r9: Rm +// tBRIND: Rm static bool DisassembleThumb1Special(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { @@ -476,11 +488,26 @@ static bool DisassembleThumb1Special(MCInst &MI, unsigned Opcode, uint32_t insn, if (NumOps == 0) return true; - // BX/BLX has 1 reg operand: Rm. - if (NumOps == 1) { + // BX/BLX/tBRIND (indirect branch, i.e, mov pc, Rm) has 1 reg operand: Rm. + if (Opcode==ARM::tBLXr_r9 || Opcode==ARM::tBX || Opcode==ARM::tBRIND) { + if (Opcode == ARM::tBLXr_r9) { + // Handling the two predicate operands before the reg operand. + if (!B->DoPredicateOperands(MI, Opcode, insn, NumOps)) + return false; + NumOpsAdded += 2; + } + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, getT1Rm(insn)))); - NumOpsAdded = 1; + NumOpsAdded += 1; + + if (Opcode == ARM::tBX) { + // Handling the two predicate operands after the reg operand. + if (!B->DoPredicateOperands(MI, Opcode, insn, NumOps)) + return false; + NumOpsAdded += 2; + } + return true; } @@ -890,6 +917,10 @@ static bool DisassembleThumb1LdStMul(bool Ld, MCInst &MI, unsigned Opcode, } unsigned RegListBits = slice(insn, 7, 0); + if (BitCount(RegListBits) < 1) { + DEBUG(errs() << "if BitCount(registers) < 1 then UNPREDICTABLE\n"); + return false; + } // Fill the variadic part of reglist. for (unsigned i = 0; i < 8; ++i) @@ -936,10 +967,15 @@ static bool DisassembleThumb1CondBr(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned Imm8 = getT1Imm8(insn); MI.addOperand(MCOperand::CreateImm( - Opcode == ARM::tBcc ? SignExtend32<9>(Imm8 << 1) + 4 + Opcode == ARM::tBcc ? SignExtend32<9>(Imm8 << 1) : (int)Imm8)); // Predicate operands by ARMBasicMCBuilder::TryPredicateAndSBitModifier(). + // But note that for tBcc, if cond = '1110' then UNDEFINED. + if (Opcode == ARM::tBcc && slice(insn, 11, 8) == 14) { + DEBUG(errs() << "if cond = '1110' then UNDEFINED\n"); + return false; + } NumOpsAdded = 1; return true; @@ -1120,8 +1156,12 @@ static bool DisassembleThumb2SRS(MCInst &MI, unsigned Opcode, uint32_t insn, // t2RFE[IA|DB]W/t2RFE[IA|DB]: Rn static bool DisassembleThumb2RFE(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, - decodeRn(insn)))); + unsigned Rn = decodeRn(insn); + if (Rn == 15) { + DEBUG(errs() << "if n == 15 then UNPREDICTABLE\n"); + return false; + } + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B,ARM::GPRRegClassID,Rn))); NumOpsAdded = 1; return true; } @@ -1202,29 +1242,66 @@ static bool DisassembleThumb2LdStEx(MCInst &MI, unsigned Opcode, uint32_t insn, bool isSW = (Opcode == ARM::t2LDREX || Opcode == ARM::t2STREX); bool isDW = (Opcode == ARM::t2LDREXD || Opcode == ARM::t2STREXD); + unsigned Rt = decodeRd(insn); + unsigned Rt2 = decodeRs(insn); // But note that this is Rd for t2STREX. 
+ unsigned Rd = decodeRm(insn); + unsigned Rn = decodeRn(insn); + + // Some sanity checking first. + if (isStore) { + // if d == n || d == t then UNPREDICTABLE + // if d == n || d == t || d == t2 then UNPREDICTABLE + if (isDW) { + if (Rd == Rn || Rd == Rt || Rd == Rt2) { + DEBUG(errs() << "if d == n || d == t || d == t2 then UNPREDICTABLE\n"); + return false; + } + } else { + if (isSW) { + if (Rt2 == Rn || Rt2 == Rt) { + DEBUG(errs() << "if d == n || d == t then UNPREDICTABLE\n"); + return false; + } + } else { + if (Rd == Rn || Rd == Rt) { + DEBUG(errs() << "if d == n || d == t then UNPREDICTABLE\n"); + return false; + } + } + } + } else { + // Load + // A8.6.71 LDREXD + // if t == t2 then UNPREDICTABLE + if (isDW && Rt == Rt2) { + DEBUG(errs() << "if t == t2 then UNPREDICTABLE\n"); + return false; + } + } + // Add the destination operand for store. if (isStore) { MI.addOperand(MCOperand::CreateReg( getRegisterEnum(B, OpInfo[OpIdx].RegClass, - isSW ? decodeRs(insn) : decodeRm(insn)))); + isSW ? Rt2 : Rd))); ++OpIdx; } // Source operand for store and destination operand for load. MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass, - decodeRd(insn)))); + Rt))); ++OpIdx; // Thumb2 doubleword complication: with an extra source/destination operand. if (isDW) { MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B,OpInfo[OpIdx].RegClass, - decodeRs(insn)))); + Rt2))); ++OpIdx; } // Finally add the pointer operand. MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass, - decodeRn(insn)))); + Rn))); ++OpIdx; return true; @@ -1249,6 +1326,35 @@ static bool DisassembleThumb2LdStDual(MCInst &MI, unsigned Opcode, && OpInfo[3].RegClass < 0 && "Expect >= 4 operands and first 3 as reg operands"); + // Thumb allows for specifying Rt and Rt2, unlike ARM (which has Rt2==Rt+1). + unsigned Rt = decodeRd(insn); + unsigned Rt2 = decodeRs(insn); + unsigned Rn = decodeRn(insn); + + // Some sanity checking first. + + // A8.6.67 LDRD (literal) has its W bit as (0). + if (Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2LDRD_PRE || Opcode == ARM::t2LDRD_POST) { + if (Rn == 15 && slice(insn, 21, 21) != 0) + return false; + } else { + // For Dual Store, PC cannot be used as the base register. + if (Rn == 15) { + DEBUG(errs() << "if n == 15 then UNPREDICTABLE\n"); + return false; + } + } + if (Rt == Rt2) { + DEBUG(errs() << "if t == t2 then UNPREDICTABLE\n"); + return false; + } + if (Opcode != ARM::t2LDRDi8 && Opcode != ARM::t2STRDi8) { + if (Rn == Rt || Rn == Rt2) { + DEBUG(errs() << "if wback && (n == t || n == t2) then UNPREDICTABLE\n"); + return false; + } + } + // Add the <Rt> <Rt2> operands. unsigned RegClassPair = OpInfo[0].RegClass; unsigned RegClassBase = OpInfo[2].RegClass; @@ -1385,9 +1491,12 @@ static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn, if (OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { - if (Thumb2ShiftOpcode(Opcode)) - MI.addOperand(MCOperand::CreateImm(getShiftAmtBits(insn))); - else { + if (Thumb2ShiftOpcode(Opcode)) { + unsigned Imm = getShiftAmtBits(insn); + ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 5, 4)); + getImmShiftSE(ShOp, Imm); + MI.addOperand(MCOperand::CreateImm(Imm)); + } else { // Build the constant shift specifier operand.
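// (In the Thumb2 so_reg encoding the shift type lives in Inst{5-4} and the 5-bit amount in Inst{14-12}:Inst{7-6}; getShiftTypeBits() and getShiftAmtBits() below extract the two halves before they are packed into one immediate.)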
unsigned bits2 = getShiftTypeBits(insn); unsigned imm5 = getShiftAmtBits(insn); @@ -1412,7 +1521,8 @@ static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn, static bool DisassembleThumb2DPModImm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { - const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + const TargetInstrDesc &TID = ARMInsts[Opcode]; + const TargetOperandInfo *OpInfo = TID.OpInfo; unsigned &OpIdx = NumOpsAdded; OpIdx = 0; @@ -1439,8 +1549,15 @@ static bool DisassembleThumb2DPModImm(MCInst &MI, unsigned Opcode, DEBUG(errs()<<"Thumb2 encoding error: d==15 for DPModImm 2-reg instr.\n"); return false; } - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RnRegClassID, - decodeRn(insn)))); + int Idx; + if ((Idx = TID.getOperandConstraint(OpIdx, TOI::TIED_TO)) != -1) { + // The reg operand is tied to the first reg operand. + MI.addOperand(MI.getOperand(Idx)); + } else { + // Add second reg operand. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RnRegClassID, + decodeRn(insn)))); + } ++OpIdx; } @@ -1509,7 +1626,7 @@ static bool DisassembleThumb2Sat(MCInst &MI, unsigned Opcode, uint32_t insn, // o t2ADDri12, t2SUBri12: Rs Rn imm12 // o t2LEApcrel (ADR): Rs imm12 // o t2BFC (BFC): Rs Ro(TIED_TO) bf_inv_mask_imm -// o t2BFI (BFI) (Currently not defined in LLVM as of Jan-07-2010) +// o t2BFI (BFI): Rs Ro(TIED_TO) Rn bf_inv_mask_imm // o t2MOVi16: Rs imm16 // o t2MOVTi16: Rs imm16 // o t2SBFX (SBFX): Rs Rn lsb width @@ -1570,9 +1687,10 @@ static bool DisassembleThumb2DPBinImm(MCInst &MI, unsigned Opcode, if (Opcode == ARM::t2ADDri12 || Opcode == ARM::t2SUBri12 || Opcode == ARM::t2LEApcrel) MI.addOperand(MCOperand::CreateImm(getIImm3Imm8(insn))); - else if (Opcode == ARM::t2MOVi16 || Opcode == ARM::t2MOVTi16) - MI.addOperand(MCOperand::CreateImm(getImm16(insn))); - else if (Opcode == ARM::t2BFC || Opcode == ARM::t2BFI) { + else if (Opcode == ARM::t2MOVi16 || Opcode == ARM::t2MOVTi16) { + if (!B->tryAddingSymbolicOperand(getImm16(insn), 4, MI)) + MI.addOperand(MCOperand::CreateImm(getImm16(insn))); + } else if (Opcode == ARM::t2BFC || Opcode == ARM::t2BFI) { uint32_t mask = 0; if (getBitfieldInvMask(insn, mask)) MI.addOperand(MCOperand::CreateImm(mask)); @@ -1616,8 +1734,7 @@ static inline bool t2MiscCtrlInstr(uint32_t insn) { // A8.6.26 // t2BXJ -> Rn // -// Miscellaneous control: t2DMBsy (and its t2DMB variants), -// t2DSBsy (and its t2DSB varianst), t2ISBsy, t2CLREX +// Miscellaneous control: // -> no operand (except pred-imm pred-ccr for CLREX, memory barrier variants) // // Hint: t2NOP, t2YIELD, t2WFE, t2WFI, t2SEV @@ -1634,6 +1751,22 @@ static bool DisassembleThumb2BrMiscCtrl(MCInst &MI, unsigned Opcode, if (NumOps == 0) return true; + if (Opcode == ARM::t2DMB || Opcode == ARM::t2DSB) { + // Inst{3-0} encodes the memory barrier option for the variants. 
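+    // The accepted values match the ARMv7 barrier option encodings: SY=0b1111, + // ST=0b1110, ISH=0b1011, ISHST=0b1010, NSH=0b0111, NSHST=0b0110, + // OSH=0b0011, OSHST=0b0010; any other option value is rejected.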
+ unsigned opt = slice(insn, 3, 0); + switch (opt) { + case ARM_MB::SY: case ARM_MB::ST: + case ARM_MB::ISH: case ARM_MB::ISHST: + case ARM_MB::NSH: case ARM_MB::NSHST: + case ARM_MB::OSH: case ARM_MB::OSHST: + MI.addOperand(MCOperand::CreateImm(opt)); + NumOpsAdded = 1; + return true; + default: + return false; + } + } + if (t2MiscCtrlInstr(insn)) return true; @@ -1741,7 +1874,9 @@ static bool DisassembleThumb2BrMiscCtrl(MCInst &MI, unsigned Opcode, Offset = decodeImm32_BLX(insn); break; } - MI.addOperand(MCOperand::CreateImm(Offset)); + + if (!B->tryAddingSymbolicOperand(Offset + B->getBuilderAddress() + 4, 4, MI)) + MI.addOperand(MCOperand::CreateImm(Offset)); // This is an increment as some predicate operands may have been added first. NumOpsAdded += 1; @@ -1819,6 +1954,87 @@ static bool DisassembleThumb2PreLoad(MCInst &MI, unsigned Opcode, uint32_t insn, return true; } +static bool BadRegsThumb2LdSt(unsigned Opcode, uint32_t insn, bool Load, + unsigned R0, unsigned R1, unsigned R2, bool UseRm, bool WB) { + + // Inst{22-21} encodes the data item transferred for load/store. + // For single word, it is encoded as 0b10. + bool Word = (slice(insn, 22, 21) == 2); + bool Half = (slice(insn, 22, 21) == 1); + bool Byte = (slice(insn, 22, 21) == 0); + + if (UseRm && BadReg(R2)) { + DEBUG(errs() << "if BadReg(m) then UNPREDICTABLE\n"); + return true; + } + + if (Load) { + if (!Word && R0 == 13) { + DEBUG(errs() << "if t == 13 then UNPREDICTABLE\n"); + return true; + } + if (Byte) { + if (WB && R0 == 15 && slice(insn, 10, 8) == 3) { + // A8.6.78 LDRSB (immediate) Encoding T2 (errata markup 8.0) + DEBUG(errs() << "if t == 15 && PUW == '011' then UNPREDICTABLE\n"); + return true; + } + } + // A6.3.8 Load halfword, memory hints + if (Half) { + if (WB) { + if (R0 == R1) { + // A8.6.82 LDRSH (immediate) Encoding T2 + DEBUG(errs() << "if WB && n == t then UNPREDICTABLE\n"); + return true; + } + if (R0 == 15 && slice(insn, 10, 8) == 3) { + // A8.6.82 LDRSH (immediate) Encoding T2 (errata markup 8.0) + DEBUG(errs() << "if t == 15 && PUW == '011' then UNPREDICTABLE\n"); + return true; + } + } else { + if (Opcode == ARM::t2LDRHi8 || Opcode == ARM::t2LDRSHi8) { + if (R0 == 15 && slice(insn, 10, 8) == 4) { + // A8.6.82 LDRSH (immediate) Encoding T2 + DEBUG(errs() << "if Rt == '1111' and PUW == '100' then SEE" + << " \"Unallocated memory hints\"\n"); + return true; + } + } else { + if (R0 == 15) { + // A8.6.82 LDRSH (immediate) Encoding T1 + DEBUG(errs() << "if Rt == '1111' then SEE" + << " \"Unallocated memory hints\"\n"); + return true; + } + } + } + } + } else { + if (WB && R0 == R1) { + DEBUG(errs() << "if wback && n == t then UNPREDICTABLE\n"); + return true; + } + if ((WB && R0 == 15) || (!WB && R1 == 15)) { + DEBUG(errs() << "if Rn == '1111' then UNDEFINED\n"); + return true; + } + if (Word) { + if ((WB && R1 == 15) || (!WB && R0 == 15)) { + DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n"); + return true; + } + } else { + if ((WB && BadReg(R1)) || (!WB && BadReg(R0))) { + DEBUG(errs() << "if BadReg(t) then UNPREDICTABLE\n"); + return true; + } + } + } + return false; +} + // A6.3.10 Store single data item // A6.3.9 Load byte, memory hints // A6.3.8 Load halfword, memory hints @@ -1864,8 +2080,8 @@ static bool DisassembleThumb2LdSt(bool Load, MCInst &MI, unsigned Opcode, OpIdx = 0; assert(NumOps >= 3 && - OpInfo[0].RegClass == ARM::GPRRegClassID && - OpInfo[1].RegClass == ARM::GPRRegClassID && + OpInfo[0].RegClass > 0 && + OpInfo[1].RegClass > 0 && "Expect >= 3 operands and first two as reg
operands"); bool ThreeReg = (OpInfo[2].RegClass > 0); @@ -1873,7 +2089,7 @@ static bool DisassembleThumb2LdSt(bool Load, MCInst &MI, unsigned Opcode, bool Imm12 = !ThreeReg && slice(insn, 23, 23) == 1; // ARMInstrThumb2.td // Build the register operands, followed by the immediate. - unsigned R0, R1, R2 = 0; + unsigned R0 = 0, R1 = 0, R2 = 0; unsigned Rd = decodeRd(insn); int Imm = 0; @@ -1904,10 +2120,10 @@ static bool DisassembleThumb2LdSt(bool Load, MCInst &MI, unsigned Opcode, Imm = decodeImm8(insn); } - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass, R0))); ++OpIdx; - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass, R1))); ++OpIdx; @@ -1918,6 +2134,10 @@ static bool DisassembleThumb2LdSt(bool Load, MCInst &MI, unsigned Opcode, ++OpIdx; } + if (BadRegsThumb2LdSt(Opcode, insn, Load, R0, R1, R2, ThreeReg & !TIED_TO, + TIED_TO)) + return false; + assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index fc2aa75..8ae87f8 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -29,8 +29,8 @@ StringRef ARMInstPrinter::getOpcodeName(unsigned Opcode) const { return getInstructionName(Opcode); } -StringRef ARMInstPrinter::getRegName(unsigned RegNo) const { - return getRegisterName(RegNo); +void ARMInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + OS << getRegisterName(RegNo); } void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) { diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h index b3ac03a..bde0eb9 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h @@ -28,7 +28,7 @@ public: virtual void printInst(const MCInst *MI, raw_ostream &O); virtual StringRef getOpcodeName(unsigned Opcode) const; - virtual StringRef getRegName(unsigned RegNo) const; + virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; static const char *getInstructionName(unsigned Opcode); diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp index 9a27e2f..f6d0242 100644 --- a/lib/Target/ARM/MLxExpansionPass.cpp +++ b/lib/Target/ARM/MLxExpansionPass.cpp @@ -15,11 +15,13 @@ #define DEBUG_TYPE "mlx-expansion" #include "ARM.h" #include "ARMBaseInstrInfo.h" +#include "ARMSubtarget.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -49,15 +51,17 @@ namespace { const TargetRegisterInfo *TRI; MachineRegisterInfo *MRI; + bool isA9; unsigned MIIdx; MachineInstr* LastMIs[4]; + SmallPtrSet<MachineInstr*, 4> IgnoreStall; void clearStack(); void pushStack(MachineInstr *MI); MachineInstr *getAccDefMI(MachineInstr *MI) const; unsigned getDefReg(MachineInstr *MI) const; bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const; - bool FindMLxHazard(MachineInstr *MI) const; + bool FindMLxHazard(MachineInstr *MI); 
void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, unsigned MulOpc, unsigned AddSubOpc, bool NegAcc, bool HasLane); @@ -146,7 +150,7 @@ bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const { } -bool MLxExpansion::FindMLxHazard(MachineInstr *MI) const { +bool MLxExpansion::FindMLxHazard(MachineInstr *MI) { if (NumExpand >= ExpandLimit) return false; @@ -154,7 +158,7 @@ bool MLxExpansion::FindMLxHazard(MachineInstr *MI) const { return true; MachineInstr *DefMI = getAccDefMI(MI); - if (TII->isFpMLxInstruction(DefMI->getOpcode())) + if (TII->isFpMLxInstruction(DefMI->getOpcode())) { // r0 = vmla // r3 = vmla r0, r1, r2 // takes 16 - 17 cycles @@ -163,24 +167,33 @@ bool MLxExpansion::FindMLxHazard(MachineInstr *MI) const { // r4 = vmul r1, r2 // r3 = vadd r0, r4 // takes about 14 - 15 cycles even with vmul stalling for 4 cycles. + IgnoreStall.insert(DefMI); return true; + } + + if (IgnoreStall.count(MI)) + return false; // If a VMLA.F is followed by a VADD.F or VMUL.F with no RAW hazard, the // VADD.F or VMUL.F will stall 4 cycles before issue. The 4 cycle stall // preserves the in-order retirement of the instructions. // Look at the next few instructions: if *most* of them can cause hazards, // then the scheduler can't *fix* this; we'd better break up the VMLA. + unsigned Limit1 = isA9 ? 1 : 4; + unsigned Limit2 = isA9 ? 1 : 4; for (unsigned i = 1; i <= 4; ++i) { int Idx = ((int)MIIdx - i + 4) % 4; MachineInstr *NextMI = LastMIs[Idx]; if (!NextMI) continue; - if (TII->canCauseFpMLxStall(NextMI->getOpcode())) - return true; + if (TII->canCauseFpMLxStall(NextMI->getOpcode())) { + if (i <= Limit1) + return true; + } // Look for VMLx RAW hazard. - if (hasRAWHazard(getDefReg(MI), NextMI)) + if (i <= Limit2 && hasRAWHazard(getDefReg(MI), NextMI)) return true; } @@ -248,6 +261,7 @@ bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) { bool Changed = false; clearStack(); + IgnoreStall.clear(); unsigned Skip = 0; MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend(); @@ -299,6 +313,8 @@ bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) { TII = static_cast<const ARMBaseInstrInfo*>(Fn.getTarget().getInstrInfo()); TRI = Fn.getTarget().getRegisterInfo(); MRI = &Fn.getRegInfo(); + const ARMSubtarget *STI = &Fn.getTarget().getSubtarget<ARMSubtarget>(); + isA9 = STI->isCortexA9(); bool Modified = false; for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index dee3d27..e56d481 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -136,8 +136,8 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const { BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr) .addFrameIndex(FramePtrSpillFI).addImm(0) .setMIFlags(MachineInstr::FrameSetup); - if (NumBytes > 7) - // If offset is > 7 then sp cannot be adjusted in a single instruction, + if (NumBytes > 508) - // If offset is > 508 then sp cannot be adjusted in a single instruction, // try restoring from fp instead.
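// (A Thumb1 sp adjustment, tADDspi/tSUBspi, encodes a 7-bit immediate scaled by 4, so a single instruction can move sp by at most 127 * 4 = 508 bytes.)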
AFI->setShouldRestoreSPFromFP(true); } diff --git a/lib/Target/ARM/Thumb1FrameLowering.h b/lib/Target/ARM/Thumb1FrameLowering.h index c592e12..bcfc516 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.h +++ b/lib/Target/ARM/Thumb1FrameLowering.h @@ -12,7 +12,7 @@ //===----------------------------------------------------------------------===// #ifndef __THUMB_FRAMEINFO_H_ -#define __THUMM_FRAMEINFO_H_ +#define __THUMB_FRAMEINFO_H_ #include "ARM.h" #include "ARMFrameLowering.h" diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp index 0cdca11..6bf5650 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.cpp +++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp @@ -31,8 +31,6 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -48,6 +46,14 @@ Thumb1RegisterInfo::Thumb1RegisterInfo(const ARMBaseInstrInfo &tii, : ARMBaseRegisterInfo(tii, sti) { } +const TargetRegisterClass* +Thumb1RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) + const { + if (ARM::tGPRRegClass.hasSubClassEq(RC)) + return ARM::tGPRRegisterClass; + return ARMBaseRegisterInfo::getLargestLegalSuperClass(RC); +} + const TargetRegisterClass * Thumb1RegisterInfo::getPointerRegClass(unsigned Kind) const { return ARM::tGPRRegisterClass; diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h index b4fdd67..9060e59 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.h +++ b/lib/Target/ARM/Thumb1RegisterInfo.h @@ -28,6 +28,9 @@ struct Thumb1RegisterInfo : public ARMBaseRegisterInfo { public: Thumb1RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI); + const TargetRegisterClass* + getLargestLegalSuperClass(const TargetRegisterClass *RC) const; + const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const; /// emitLoadConstPool - Emits a load from constpool to materialize the diff --git a/lib/Target/ARM/Thumb2RegisterInfo.cpp b/lib/Target/ARM/Thumb2RegisterInfo.cpp index ce8edbe..355c3bf 100644 --- a/lib/Target/ARM/Thumb2RegisterInfo.cpp +++ b/lib/Target/ARM/Thumb2RegisterInfo.cpp @@ -13,26 +13,15 @@ //===----------------------------------------------------------------------===// #include "ARM.h" -#include "ARMAddressingModes.h" -#include "ARMBaseInstrInfo.h" -#include "ARMMachineFunctionInfo.h" #include "ARMSubtarget.h" #include "Thumb2InstrInfo.h" #include "Thumb2RegisterInfo.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" -#include "llvm/LLVMContext.h" #include "llvm/CodeGen/MachineConstantPool.h" -#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLocation.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/Support/ErrorHandling.h" using namespace llvm; Thumb2RegisterInfo::Thumb2RegisterInfo(const ARMBaseInstrInfo &tii, diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index be9c150..ce2e966 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -12,6 +12,7 @@ #include "ARMAddressingModes.h" #include "ARMBaseRegisterInfo.h" 
#include "ARMBaseInstrInfo.h" +#include "ARMSubtarget.h" #include "Thumb2InstrInfo.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -49,82 +50,86 @@ namespace { // 1 - No cc field. // 2 - Always set CPSR. unsigned PredCC2 : 2; + unsigned PartFlag : 1; // 16-bit instruction does partial flag update unsigned Special : 1; // Needs to be dealt with specially }; static const ReduceEntry ReduceTable[] = { - // Wide, Narrow1, Narrow2, imm1,imm2, lo1, lo2, P/C, S - { ARM::t2ADCrr, 0, ARM::tADC, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2ADDri, ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 0,0, 0 }, - { ARM::t2ADDrr, ARM::tADDrr, ARM::tADDhirr, 0, 0, 1, 0, 0,1, 0 }, + // Wide, Narrow1, Narrow2, imm1,imm2, lo1, lo2, P/C, PF, S + { ARM::t2ADCrr, 0, ARM::tADC, 0, 0, 0, 1, 0,0, 0,0 }, + { ARM::t2ADDri, ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 0,0, 0,0 }, + { ARM::t2ADDrr, ARM::tADDrr, ARM::tADDhirr, 0, 0, 1, 0, 0,1, 0,0 }, // Note: immediate scale is 4. - { ARM::t2ADDrSPi,ARM::tADDrSPi,0, 8, 0, 1, 0, 1,0, 1 }, - { ARM::t2ADDSri,ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 2,2, 1 }, - { ARM::t2ADDSrr,ARM::tADDrr, 0, 0, 0, 1, 0, 2,0, 1 }, - { ARM::t2ANDrr, 0, ARM::tAND, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2ASRri, ARM::tASRri, 0, 5, 0, 1, 0, 0,0, 0 }, - { ARM::t2ASRrr, 0, ARM::tASRrr, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2BICrr, 0, ARM::tBIC, 0, 0, 0, 1, 0,0, 0 }, + { ARM::t2ADDrSPi,ARM::tADDrSPi,0, 8, 0, 1, 0, 1,0, 0,1 }, + { ARM::t2ADDSri,ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 2,2, 0,1 }, + { ARM::t2ADDSrr,ARM::tADDrr, 0, 0, 0, 1, 0, 2,0, 0,1 }, + { ARM::t2ANDrr, 0, ARM::tAND, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2ASRri, ARM::tASRri, 0, 5, 0, 1, 0, 0,0, 1,0 }, + { ARM::t2ASRrr, 0, ARM::tASRrr, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2BICrr, 0, ARM::tBIC, 0, 0, 0, 1, 0,0, 1,0 }, //FIXME: Disable CMN, as CCodes are backwards from compare expectations - //{ ARM::t2CMNrr, ARM::tCMN, 0, 0, 0, 1, 0, 2,0, 0 }, - { ARM::t2CMPri, ARM::tCMPi8, 0, 8, 0, 1, 0, 2,0, 0 }, - { ARM::t2CMPrr, ARM::tCMPhir, 0, 0, 0, 0, 0, 2,0, 1 }, - { ARM::t2EORrr, 0, ARM::tEOR, 0, 0, 0, 1, 0,0, 0 }, + //{ ARM::t2CMNrr, ARM::tCMN, 0, 0, 0, 1, 0, 2,0, 0,0 }, + { ARM::t2CMPri, ARM::tCMPi8, 0, 8, 0, 1, 0, 2,0, 0,0 }, + { ARM::t2CMPrr, ARM::tCMPhir, 0, 0, 0, 0, 0, 2,0, 0,1 }, + { ARM::t2EORrr, 0, ARM::tEOR, 0, 0, 0, 1, 0,0, 1,0 }, // FIXME: adr.n immediate offset must be multiple of 4. - //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0, 0, 0, 1, 0, 1,0, 0 }, - { ARM::t2LSLri, ARM::tLSLri, 0, 5, 0, 1, 0, 0,0, 0 }, - { ARM::t2LSLrr, 0, ARM::tLSLrr, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2LSRri, ARM::tLSRri, 0, 5, 0, 1, 0, 0,0, 0 }, - { ARM::t2LSRrr, 0, ARM::tLSRrr, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0 }, - { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 1 }, + //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2LSLri, ARM::tLSLri, 0, 5, 0, 1, 0, 0,0, 1,0 }, + { ARM::t2LSLrr, 0, ARM::tLSLrr, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2LSRri, ARM::tLSRri, 0, 5, 0, 1, 0, 0,0, 1,0 }, + { ARM::t2LSRrr, 0, ARM::tLSRrr, 0, 0, 0, 1, 0,0, 1,0 }, + // FIXME: tMOVi8 and tMVN also partially update CPSR but they are less + // likely to cause issue in the loop. As a size / performance workaround, + // they are not marked as such. + { ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0,0 }, + { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0,1 }, // FIXME: Do we need the 16-bit 'S' variant? 
- { ARM::t2MOVr,ARM::tMOVgpr2gpr,0, 0, 0, 0, 0, 1,0, 0 }, - { ARM::t2MOVCCr,0, ARM::tMOVCCr, 0, 0, 0, 0, 0,1, 0 }, - { ARM::t2MOVCCi,0, ARM::tMOVCCi, 0, 8, 0, 1, 0,1, 0 }, - { ARM::t2MUL, 0, ARM::tMUL, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2MVNr, ARM::tMVN, 0, 0, 0, 1, 0, 0,0, 0 }, - { ARM::t2ORRrr, 0, ARM::tORR, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2REV, ARM::tREV, 0, 0, 0, 1, 0, 1,0, 0 }, - { ARM::t2REV16, ARM::tREV16, 0, 0, 0, 1, 0, 1,0, 0 }, - { ARM::t2REVSH, ARM::tREVSH, 0, 0, 0, 1, 0, 1,0, 0 }, - { ARM::t2RORrr, 0, ARM::tROR, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2RSBri, ARM::tRSB, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2RSBSri,ARM::tRSB, 0, 0, 0, 1, 0, 2,0, 1 }, - { ARM::t2SBCrr, 0, ARM::tSBC, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2SUBri, ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 0,0, 0 }, - { ARM::t2SUBrr, ARM::tSUBrr, 0, 0, 0, 1, 0, 0,0, 0 }, - { ARM::t2SUBSri,ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 2,2, 0 }, - { ARM::t2SUBSrr,ARM::tSUBrr, 0, 0, 0, 1, 0, 2,0, 0 }, - { ARM::t2SXTBr, ARM::tSXTB, 0, 0, 0, 1, 0, 1,0, 0 }, - { ARM::t2SXTHr, ARM::tSXTH, 0, 0, 0, 1, 0, 1,0, 0 }, - { ARM::t2TSTrr, ARM::tTST, 0, 0, 0, 1, 0, 2,0, 0 }, - { ARM::t2UXTBr, ARM::tUXTB, 0, 0, 0, 1, 0, 1,0, 0 }, - { ARM::t2UXTHr, ARM::tUXTH, 0, 0, 0, 1, 0, 1,0, 0 }, + { ARM::t2MOVr,ARM::tMOVgpr2gpr,0, 0, 0, 0, 0, 1,0, 0,0 }, + { ARM::t2MOVCCr,0, ARM::tMOVCCr, 0, 0, 0, 0, 0,1, 0,0 }, + { ARM::t2MOVCCi,0, ARM::tMOVCCi, 0, 8, 0, 1, 0,1, 0,0 }, + { ARM::t2MUL, 0, ARM::tMUL, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2MVNr, ARM::tMVN, 0, 0, 0, 1, 0, 0,0, 0,0 }, + { ARM::t2ORRrr, 0, ARM::tORR, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2REV, ARM::tREV, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2REV16, ARM::tREV16, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2REVSH, ARM::tREVSH, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2RORrr, 0, ARM::tROR, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2RSBri, ARM::tRSB, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2RSBSri,ARM::tRSB, 0, 0, 0, 1, 0, 2,0, 0,1 }, + { ARM::t2SBCrr, 0, ARM::tSBC, 0, 0, 0, 1, 0,0, 0,0 }, + { ARM::t2SUBri, ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 0,0, 0,0 }, + { ARM::t2SUBrr, ARM::tSUBrr, 0, 0, 0, 1, 0, 0,0, 0,0 }, + { ARM::t2SUBSri,ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 2,2, 0,0 }, + { ARM::t2SUBSrr,ARM::tSUBrr, 0, 0, 0, 1, 0, 2,0, 0,0 }, + { ARM::t2SXTBr, ARM::tSXTB, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2SXTHr, ARM::tSXTH, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2TSTrr, ARM::tTST, 0, 0, 0, 1, 0, 2,0, 0,0 }, + { ARM::t2UXTBr, ARM::tUXTB, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2UXTHr, ARM::tUXTH, 0, 0, 0, 1, 0, 1,0, 0,0 }, // FIXME: Clean this up after splitting each Thumb load / store opcode // into multiple ones. 
- { ARM::t2LDRi12,ARM::tLDRi, ARM::tLDRspi, 5, 8, 1, 0, 0,0, 1 }, - { ARM::t2LDRs, ARM::tLDRr, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2LDRBi12,ARM::tLDRBi, 0, 5, 0, 1, 0, 0,0, 1 }, - { ARM::t2LDRBs, ARM::tLDRBr, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2LDRHi12,ARM::tLDRHi, 0, 5, 0, 1, 0, 0,0, 1 }, - { ARM::t2LDRHs, ARM::tLDRHr, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2LDRSBs,ARM::tLDRSB, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2LDRSHs,ARM::tLDRSH, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2STRi12,ARM::tSTRi, ARM::tSTRspi, 5, 8, 1, 0, 0,0, 1 }, - { ARM::t2STRs, ARM::tSTRr, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2STRBi12,ARM::tSTRBi, 0, 5, 0, 1, 0, 0,0, 1 }, - { ARM::t2STRBs, ARM::tSTRBr, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2STRHi12,ARM::tSTRHi, 0, 5, 0, 1, 0, 0,0, 1 }, - { ARM::t2STRHs, ARM::tSTRHr, 0, 0, 0, 1, 0, 0,0, 1 }, - - { ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 1 }, - { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 1 }, - { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 1 }, + { ARM::t2LDRi12,ARM::tLDRi, ARM::tLDRspi, 5, 8, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRs, ARM::tLDRr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRBi12,ARM::tLDRBi, 0, 5, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRBs, ARM::tLDRBr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRHi12,ARM::tLDRHi, 0, 5, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRHs, ARM::tLDRHr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRSBs,ARM::tLDRSB, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRSHs,ARM::tLDRSH, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2STRi12,ARM::tSTRi, ARM::tSTRspi, 5, 8, 1, 0, 0,0, 0,1 }, + { ARM::t2STRs, ARM::tSTRr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2STRBi12,ARM::tSTRBi, 0, 5, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2STRBs, ARM::tSTRBr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2STRHi12,ARM::tSTRHi, 0, 5, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2STRHs, ARM::tSTRHr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + + { ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 0,1 }, + { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 0,1 }, + { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 0,1 }, // ARM::t2STM (with no basereg writeback) has no Thumb1 equivalent - { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 1 }, - { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 1 }, + { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1 }, + { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 0,1 }, }; class Thumb2SizeReduce : public MachineFunctionPass { @@ -133,6 +138,7 @@ namespace { Thumb2SizeReduce(); const Thumb2InstrInfo *TII; + const ARMSubtarget *STI; virtual bool runOnMachineFunction(MachineFunction &MF); @@ -144,6 +150,8 @@ namespace { /// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable. DenseMap<unsigned, unsigned> ReduceOpcodeMap; + bool canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use); + bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry, bool is2Addr, ARMCC::CondCodes Pred, bool LiveCPSR, bool &HasCC, bool &CCDead); @@ -152,19 +160,20 @@ namespace { const ReduceEntry &Entry); bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, - const ReduceEntry &Entry, bool LiveCPSR); + const ReduceEntry &Entry, bool LiveCPSR, + MachineInstr *CPSRDef); /// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address /// instruction. bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, const ReduceEntry &Entry, - bool LiveCPSR); + bool LiveCPSR, MachineInstr *CPSRDef); /// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit /// non-two-address instruction. 
bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, const ReduceEntry &Entry, - bool LiveCPSR); + bool LiveCPSR, MachineInstr *CPSRDef); /// ReduceMBB - Reduce width of instructions in the specified basic block. bool ReduceMBB(MachineBasicBlock &MBB); @@ -187,6 +196,52 @@ static bool HasImplicitCPSRDef(const TargetInstrDesc &TID) { return false; } +/// canAddPseudoFlagDep - For A9 (and other out-of-order) implementations, +/// the 's' 16-bit instructions partially update CPSR. Abort the +/// transformation to avoid adding a false dependency on the last CPSR-setting +/// instruction, which hurts the out-of-order execution engine's ability +/// to do register renaming magic. +/// This function checks if there is a read-after-write dependency between the +/// last instruction that defines the CPSR and the current instruction. If there +/// is, then there is no harm done since the instruction cannot be retired +/// before the CPSR-setting instruction anyway. +/// Note: we are not doing full dependency analysis here for the sake of compile +/// time. We're not looking for cases like: +/// r0 = muls ... +/// r1 = add.w r0, ... +/// ... +/// = mul.w r1 +/// In this case it would have been OK to narrow the mul.w to muls since there +/// is an indirect RAW dependency between the muls and the mul.w +bool +Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use) { + if (!Def || !STI->avoidCPSRPartialUpdate()) + return false; + + SmallSet<unsigned, 2> Defs; + for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = Def->getOperand(i); + if (!MO.isReg() || MO.isUndef() || MO.isUse()) + continue; + unsigned Reg = MO.getReg(); + if (Reg == 0 || Reg == ARM::CPSR) + continue; + Defs.insert(Reg); + } + + for (unsigned i = 0, e = Use->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = Use->getOperand(i); + if (!MO.isReg() || MO.isUndef() || MO.isDef()) + continue; + unsigned Reg = MO.getReg(); + if (Defs.count(Reg)) + return false; + } + + // No read-after-write dependency. The narrowing will add a false dependency. + return true; +} + bool Thumb2SizeReduce::VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry, bool is2Addr, ARMCC::CondCodes Pred, @@ -410,7 +465,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, MIB.addOperand(MI->getOperand(OpNum)); // Transfer memoperands. - (*MIB).setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); // Transfer MI flags.
MIB.setMIFlags(MI->getFlags()); @@ -425,7 +480,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, bool Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, const ReduceEntry &Entry, - bool LiveCPSR) { + bool LiveCPSR, MachineInstr *CPSRDef) { if (Entry.LowRegs1 && !VerifyLowRegs(MI)) return false; @@ -443,12 +498,12 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, switch (Opc) { default: break; case ARM::t2ADDSri: { - if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR)) + if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef)) return true; // fallthrough } case ARM::t2ADDSrr: - return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef); } } break; @@ -456,13 +511,13 @@ case ARM::t2RSBri: case ARM::t2RSBSri: if (MI->getOperand(2).getImm() == 0) - return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef); break; case ARM::t2MOVi16: // Can convert only 'pure' immediate operands, not immediates obtained as // globals' addresses. if (MI->getOperand(1).isImm()) - return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef); break; case ARM::t2CMPrr: { // Try to reduce to the lo-reg only version first. Why there are two @@ -471,17 +526,17 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, // are prioritized, but the table assumes a unique entry for each // source insn opcode. So for now, we hack a local entry record to use. static const ReduceEntry NarrowEntry = - { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 1 }; - if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR)) + { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 0,1 }; + if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, CPSRDef)) return true; - return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef); } case ARM::t2ADDrSPi: { static const ReduceEntry NarrowEntry = - { ARM::t2ADDrSPi,ARM::tADDspi, 0, 7, 0, 1, 0, 1, 0, 1 }; + { ARM::t2ADDrSPi,ARM::tADDspi, 0, 7, 0, 1, 0, 1, 0, 0,1 }; if (MI->getOperand(0).getReg() == ARM::SP) - return ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR); - return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); + return ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, CPSRDef); + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef); } } return false; @@ -490,7 +545,7 @@ bool Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, const ReduceEntry &Entry, - bool LiveCPSR) { + bool LiveCPSR, MachineInstr *CPSRDef) { if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr)) return false; @@ -545,6 +600,12 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, if (!VerifyPredAndCC(MI, Entry, true, Pred, LiveCPSR, HasCC, CCDead)) return false; + // Avoid adding a false dependency on partial flag updates by some 16-bit + // instructions which have the 's' bit set. + if (Entry.PartFlag && NewTID.hasOptionalDef() && HasCC && + canAddPseudoFlagDep(CPSRDef, MI)) + return false; + // Add the 16-bit instruction.
DebugLoc dl = MI->getDebugLoc(); MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewTID); @@ -579,7 +640,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, bool Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, const ReduceEntry &Entry, - bool LiveCPSR) { + bool LiveCPSR, MachineInstr *CPSRDef) { if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit)) return false; @@ -632,6 +693,12 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, if (!VerifyPredAndCC(MI, Entry, false, Pred, LiveCPSR, HasCC, CCDead)) return false; + // Avoid adding a false dependency on partial flag updates by some 16-bit + // instructions which have the 's' bit set. + if (Entry.PartFlag && NewTID.hasOptionalDef() && HasCC && + canAddPseudoFlagDep(CPSRDef, MI)) + return false; + // Add the 16-bit instruction. DebugLoc dl = MI->getDebugLoc(); MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewTID); @@ -679,7 +746,7 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, return true; } -static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR) { +static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR, bool &DefCPSR) { bool HasDef = false; for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI.getOperand(i); @@ -687,6 +754,8 @@ static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR) { continue; if (MO.getReg() != ARM::CPSR) continue; + + DefCPSR = true; if (!MO.isDead()) HasDef = true; } @@ -716,6 +785,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { // Yes, CPSR could be livein. bool LiveCPSR = MBB.isLiveIn(ARM::CPSR); + MachineInstr *CPSRDef = 0; MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end(); MachineBasicBlock::iterator NextMII; @@ -731,7 +801,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { const ReduceEntry &Entry = ReduceTable[OPI->second]; // Ignore "special" cases for now. if (Entry.Special) { - if (ReduceSpecial(MBB, MI, Entry, LiveCPSR)) { + if (ReduceSpecial(MBB, MI, Entry, LiveCPSR, CPSRDef)) { Modified = true; MachineBasicBlock::iterator I = prior(NextMII); MI = &*I; @@ -740,7 +810,8 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { } // Try to transform to a 16-bit two-address instruction. - if (Entry.NarrowOpc2 && ReduceTo2Addr(MBB, MI, Entry, LiveCPSR)) { + if (Entry.NarrowOpc2 && + ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef)) { Modified = true; MachineBasicBlock::iterator I = prior(NextMII); MI = &*I; @@ -748,7 +819,8 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { } // Try to transform to a 16-bit non-two-address instruction. - if (Entry.NarrowOpc1 && ReduceToNarrow(MBB, MI, Entry, LiveCPSR)) { + if (Entry.NarrowOpc1 && + ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef)) { Modified = true; MachineBasicBlock::iterator I = prior(NextMII); MI = &*I; @@ -756,7 +828,14 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { } ProcessNext: - LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR); + bool DefCPSR = false; + LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR, DefCPSR); + if (MI->getDesc().isCall()) + // Calls don't really set CPSR. + CPSRDef = 0; + else if (DefCPSR) + // This is the last CPSR defining instruction.
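+      // Subsequent narrowing candidates will be tested against it via + // canAddPseudoFlagDep() before a flag-setting 16-bit form is chosen.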
+ CPSRDef = MI; } return Modified; @@ -765,6 +844,7 @@ bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) { const TargetMachine &TM = MF.getTarget(); TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo()); + STI = &TM.getSubtarget<ARMSubtarget>(); bool Modified = false; for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) diff --git a/lib/Target/Alpha/Alpha.td b/lib/Target/Alpha/Alpha.td index 4508eda..ae79c2e 100644 --- a/lib/Target/Alpha/Alpha.td +++ b/lib/Target/Alpha/Alpha.td @@ -21,7 +21,7 @@ include "llvm/Target/Target.td" //===----------------------------------------------------------------------===// def FeatureCIX : SubtargetFeature<"cix", "HasCT", "true", - "Enable CIX extentions">; + "Enable CIX extensions">; //===----------------------------------------------------------------------===// // Register File Description diff --git a/lib/Target/Alpha/AlphaISelLowering.cpp b/lib/Target/Alpha/AlphaISelLowering.cpp index c4f43ab..0875cfd 100644 --- a/lib/Target/Alpha/AlphaISelLowering.cpp +++ b/lib/Target/Alpha/AlphaISelLowering.cpp @@ -155,6 +155,8 @@ AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM) setJumpBufSize(272); setJumpBufAlignment(16); + setMinFunctionAlignment(4); + computeRegisterProperties(); } @@ -180,11 +182,6 @@ const char *AlphaTargetLowering::getTargetNodeName(unsigned Opcode) const { } } -/// getFunctionAlignment - Return the Log2 alignment of this function. -unsigned AlphaTargetLowering::getFunctionAlignment(const Function *F) const { - return 4; -} - static SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) { EVT PtrVT = Op.getValueType(); JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); @@ -233,8 +230,8 @@ AlphaTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_Alpha); @@ -296,7 +293,7 @@ AlphaTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Build a sequence of copy-to-reg nodes chained together with token chain and // flag operands which copy the outgoing args into registers. The InFlag is - // necessary since all emited instructions must be stuck together. + // necessary since all emitted instructions must be stuck together. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, @@ -347,8 +344,8 @@ AlphaTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs, - *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_Alpha); diff --git a/lib/Target/Alpha/AlphaISelLowering.h b/lib/Target/Alpha/AlphaISelLowering.h index cb98f92..d38c314 100644 --- a/lib/Target/Alpha/AlphaISelLowering.h +++ b/lib/Target/Alpha/AlphaISelLowering.h @@ -104,9 +104,6 @@ namespace llvm { virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; - /// getFunctionAlignment - Return the Log2 alignment of this function.
- virtual unsigned getFunctionAlignment(const Function *F) const; - /// isFPImmLegal - Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. diff --git a/lib/Target/Alpha/AlphaInstrInfo.td b/lib/Target/Alpha/AlphaInstrInfo.td index 099d715..b201712 100644 --- a/lib/Target/Alpha/AlphaInstrInfo.td +++ b/lib/Target/Alpha/AlphaInstrInfo.td @@ -1030,7 +1030,7 @@ def : Pat<(brcond (setune F8RC:$RA, immFPZ), bb:$DISP), //WMB Mfc 18.4400 Write memory barrier //MF_FPCR F-P 17.025 Move from FPCR //MT_FPCR F-P 17.024 Move to FPCR -//There are in the Multimedia extentions, so let's not use them yet +//These are in the Multimedia extensions, so let's not use them yet //def MAXSB8 : OForm<0x1C, 0x3E, "MAXSB8 $RA,$RB,$RC">; //Vector signed byte maximum //def MAXSW4 : OForm< 0x1C, 0x3F, "MAXSW4 $RA,$RB,$RC">; //Vector signed word maximum //def MAXUB8 : OForm<0x1C, 0x3C, "MAXUB8 $RA,$RB,$RC">; //Vector unsigned byte maximum diff --git a/lib/Target/Alpha/AlphaRegisterInfo.cpp b/lib/Target/Alpha/AlphaRegisterInfo.cpp index 7667fd8..d6c3809 100644 --- a/lib/Target/Alpha/AlphaRegisterInfo.cpp +++ b/lib/Target/Alpha/AlphaRegisterInfo.cpp @@ -69,6 +69,7 @@ const unsigned* AlphaRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) BitVector AlphaRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(Alpha::R15); + Reserved.set(Alpha::R29); Reserved.set(Alpha::R30); Reserved.set(Alpha::R31); return Reserved; } @@ -198,6 +199,11 @@ int AlphaRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const { return -1; } +int AlphaRegisterInfo::getLLVMRegNum(unsigned DwarfRegNum, bool isEH) const { + llvm_unreachable("What is the dwarf register number"); + return -1; +} + #include "AlphaGenRegisterInfo.inc" std::string AlphaRegisterInfo::getPrettyName(unsigned reg) diff --git a/lib/Target/Alpha/AlphaRegisterInfo.h b/lib/Target/Alpha/AlphaRegisterInfo.h index b0d4dd0..ffe6cf1 100644 --- a/lib/Target/Alpha/AlphaRegisterInfo.h +++ b/lib/Target/Alpha/AlphaRegisterInfo.h @@ -48,6 +48,7 @@ struct AlphaRegisterInfo : public AlphaGenRegisterInfo { unsigned getEHHandlerRegister() const; int getDwarfRegNum(unsigned RegNum, bool isEH) const; + int getLLVMRegNum(unsigned RegNum, bool isEH) const; static std::string getPrettyName(unsigned reg); }; diff --git a/lib/Target/Alpha/AlphaRegisterInfo.td b/lib/Target/Alpha/AlphaRegisterInfo.td index 35e6804..32120d7 100644 --- a/lib/Target/Alpha/AlphaRegisterInfo.td +++ b/lib/Target/Alpha/AlphaRegisterInfo.td @@ -110,10 +110,10 @@ def F31 : FPR<31, "$f31">, DwarfRegNum<[64]>; // $28 is undefined after any and all calls /// Register classes -def GPRC : RegisterClass<"Alpha", [i64], 64, +def GPRC : RegisterClass<"Alpha", [i64], 64, (add // Volatile - [R0, R1, R2, R3, R4, R5, R6, R7, R8, R16, R17, R18, R19, R20, R21, R22, - R23, R24, R25, R28, + R0, R1, R2, R3, R4, R5, R6, R7, R8, R16, R17, R18, R19, R20, R21, R22, + R23, R24, R25, R28, //Special meaning, but volatile R27, //procedure address R26, //return address @@ -121,51 +121,13 @@ def GPRC : RegisterClass<"Alpha", [i64], 64, // Non-volatile R9, R10, R11, R12, R13, R14, // Don't allocate 15, 30, 31 - R15, R30, R31 ]> //zero -{ - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GPRCClass::iterator - GPRCClass::allocation_order_end(const MachineFunction &MF) const { - return
end()-3; - } - }]; -} + R15, R30, R31)>; //zero -def F4RC : RegisterClass<"Alpha", [f32], 64, [F0, F1, +def F4RC : RegisterClass<"Alpha", [f32], 64, (add F0, F1, F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30, // Saved: F2, F3, F4, F5, F6, F7, F8, F9, - F31 ]> //zero -{ - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - F4RCClass::iterator - F4RCClass::allocation_order_end(const MachineFunction &MF) const { - return end()-1; - } - }]; -} + F31)>; //zero -def F8RC : RegisterClass<"Alpha", [f64], 64, [F0, F1, - F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, - F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30, - // Saved: - F2, F3, F4, F5, F6, F7, F8, F9, - F31 ]> //zero -{ - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - F8RCClass::iterator - F8RCClass::allocation_order_end(const MachineFunction &MF) const { - return end()-1; - } - }]; -} +def F8RC : RegisterClass<"Alpha", [f64], 64, (add F4RC)>; diff --git a/lib/Target/Alpha/README.txt b/lib/Target/Alpha/README.txt index 9ae1517..cc170e3 100644 --- a/lib/Target/Alpha/README.txt +++ b/lib/Target/Alpha/README.txt @@ -33,9 +33,9 @@ add crazy vector instructions (MVI): (MIN|MAX)(U|S)(B8|W4) min and max, signed and unsigned, byte and word PKWB, UNPKBW pack/unpack word to byte PKLB UNPKBL pack/unpack long to byte -PERR pixel error (sum accross bytes of bytewise abs(i8v8 a - i8v8 b)) +PERR pixel error (sum across bytes of bytewise abs(i8v8 a - i8v8 b)) -cmpbytes bytewise cmpeq of i8v8 a and i8v8 b (not part of MVI extentions) +cmpbytes bytewise cmpeq of i8v8 a and i8v8 b (not part of MVI extensions) this has some good examples for other operations that can be synthesised well from these rather meager vector ops (such as saturating add). diff --git a/lib/Target/Blackfin/BlackfinFrameLowering.cpp b/lib/Target/Blackfin/BlackfinFrameLowering.cpp index 08bb952..0b0984d 100644 --- a/lib/Target/Blackfin/BlackfinFrameLowering.cpp +++ b/lib/Target/Blackfin/BlackfinFrameLowering.cpp @@ -31,6 +31,12 @@ bool BlackfinFrameLowering::hasFP(const MachineFunction &MF) const { MFI->adjustsStack() || MFI->hasVarSizedObjects(); } +// Always reserve a call frame. We don't have enough registers to adjust SP. +bool BlackfinFrameLowering:: +hasReservedCallFrame(const MachineFunction &MF) const { + return true; +} + // Emit a prologue that sets up a stack frame. // On function entry, R0-R2 and P0 may hold arguments.
// R3, P1, and P2 may be used as scratch registers diff --git a/lib/Target/Blackfin/BlackfinFrameLowering.h b/lib/Target/Blackfin/BlackfinFrameLowering.h index 3d2ee25..726fa2c 100644 --- a/lib/Target/Blackfin/BlackfinFrameLowering.h +++ b/lib/Target/Blackfin/BlackfinFrameLowering.h @@ -36,6 +36,7 @@ public: void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; bool hasFP(const MachineFunction &MF) const; + bool hasReservedCallFrame(const MachineFunction &MF) const; void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS) const; diff --git a/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp b/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp index 9df2aee..42659ae 100644 --- a/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp +++ b/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp @@ -117,11 +117,11 @@ bool BlackfinDAGToDAGISel::SelectADDRspii(SDValue Addr, } static inline bool isCC(const TargetRegisterClass *RC) { - return RC == &BF::AnyCCRegClass || BF::AnyCCRegClass.hasSubClass(RC); + return BF::AnyCCRegClass.hasSubClassEq(RC); } static inline bool isDCC(const TargetRegisterClass *RC) { - return RC == &BF::DRegClass || BF::DRegClass.hasSubClass(RC) || isCC(RC); + return BF::DRegClass.hasSubClassEq(RC) || isCC(RC); } static void UpdateNodeOperand(SelectionDAG &DAG, diff --git a/lib/Target/Blackfin/BlackfinISelLowering.cpp b/lib/Target/Blackfin/BlackfinISelLowering.cpp index 7c80eec..588d9bd 100644 --- a/lib/Target/Blackfin/BlackfinISelLowering.cpp +++ b/lib/Target/Blackfin/BlackfinISelLowering.cpp @@ -121,6 +121,8 @@ BlackfinTargetLowering::BlackfinTargetLowering(TargetMachine &TM) setOperationAction(ISD::VAEND, MVT::Other, Expand); setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + + setMinFunctionAlignment(2); } const char *BlackfinTargetLowering::getTargetNodeName(unsigned Opcode) const { @@ -169,8 +171,8 @@ BlackfinTargetLowering::LowerFormalArguments(SDValue Chain, MachineFrameInfo *MFI = MF.getFrameInfo(); SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AllocateStack(12, 4); // ABI requires 12 bytes stack space CCInfo.AnalyzeFormalArguments(Ins, CC_Blackfin); @@ -227,8 +229,8 @@ BlackfinTargetLowering::LowerReturn(SDValue Chain, SmallVector<CCValAssign, 16> RVLocs; // CCState - Info about the registers and stack slot. - CCState CCInfo(CallConv, isVarArg, DAG.getTarget(), - RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + DAG.getTarget(), RVLocs, *DAG.getContext()); // Analyze return values. CCInfo.AnalyzeReturn(Outs, RetCC_Blackfin); @@ -288,8 +290,8 @@ BlackfinTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getTarget(), ArgLocs, - *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + DAG.getTarget(), ArgLocs, *DAG.getContext()); CCInfo.AllocateStack(12, 4); // ABI requires 12 bytes stack space CCInfo.AnalyzeCallOperands(Outs, CC_Blackfin); @@ -345,7 +347,7 @@ BlackfinTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Build a sequence of copy-to-reg nodes chained together with token // chain and flag operands which copy the outgoing args into registers.
- // The InFlag in necessary since all emited instructions must be + // The InFlag is necessary since all emitted instructions must be // stuck together. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { @@ -376,8 +378,8 @@ BlackfinTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; - CCState RVInfo(CallConv, isVarArg, DAG.getTarget(), RVLocs, - *DAG.getContext()); + CCState RVInfo(CallConv, isVarArg, DAG.getMachineFunction(), + DAG.getTarget(), RVLocs, *DAG.getContext()); RVInfo.AnalyzeCallResult(Ins, RetCC_Blackfin); @@ -497,11 +499,6 @@ BlackfinTargetLowering::ReplaceNodeResults(SDNode *N, } } -/// getFunctionAlignment - Return the Log2 alignment of this function. -unsigned BlackfinTargetLowering::getFunctionAlignment(const Function *F) const { - return 2; -} - //===----------------------------------------------------------------------===// // Blackfin Inline Assembly Support //===----------------------------------------------------------------------===// diff --git a/lib/Target/Blackfin/BlackfinISelLowering.h b/lib/Target/Blackfin/BlackfinISelLowering.h index 102c830..9a54557 100644 --- a/lib/Target/Blackfin/BlackfinISelLowering.h +++ b/lib/Target/Blackfin/BlackfinISelLowering.h @@ -53,7 +53,6 @@ namespace llvm { EVT VT) const; virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; const char *getTargetNodeName(unsigned Opcode) const; - unsigned getFunctionAlignment(const Function *F) const; private: SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/Blackfin/BlackfinInstrInfo.cpp b/lib/Target/Blackfin/BlackfinInstrInfo.cpp index e50d57a..598cf2a 100644 --- a/lib/Target/Blackfin/BlackfinInstrInfo.cpp +++ b/lib/Target/Blackfin/BlackfinInstrInfo.cpp @@ -160,7 +160,7 @@ static bool inClass(const TargetRegisterClass &Test, if (TargetRegisterInfo::isPhysicalRegister(Reg)) return Test.contains(Reg); else - return &Test==RC || Test.hasSubClass(RC); + return Test.hasSubClassEq(RC); } void diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.cpp b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp index b4a9b84..6ca460e 100644 --- a/lib/Target/Blackfin/BlackfinRegisterInfo.cpp +++ b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp @@ -351,5 +351,11 @@ int BlackfinRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const { return -1; } +int BlackfinRegisterInfo::getLLVMRegNum(unsigned DwarfRegNum, + bool isEH) const { + llvm_unreachable("What is the dwarf register number"); + return -1; +} + #include "BlackfinGenRegisterInfo.inc" diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.h b/lib/Target/Blackfin/BlackfinRegisterInfo.h index 642b8ad..375d277 100644 --- a/lib/Target/Blackfin/BlackfinRegisterInfo.h +++ b/lib/Target/Blackfin/BlackfinRegisterInfo.h @@ -41,8 +41,6 @@ namespace llvm { return &BF::PRegClass; } - // bool hasReservedCallFrame(MachineFunction &MF) const; - bool requiresRegisterScavenging(const MachineFunction &MF) const; void eliminateCallFramePseudoInstr(MachineFunction &MF, @@ -60,6 +58,7 @@ namespace llvm { unsigned getEHHandlerRegister() const; int getDwarfRegNum(unsigned RegNum, bool isEH) const; + int getLLVMRegNum(unsigned RegNum, bool isEH) const; // Utility functions void adjustRegister(MachineBasicBlock &MBB, diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.td b/lib/Target/Blackfin/BlackfinRegisterInfo.td index f5dd439..9e2f79f 100644 --- a/lib/Target/Blackfin/BlackfinRegisterInfo.td +++
b/lib/Target/Blackfin/BlackfinRegisterInfo.td @@ -195,158 +195,66 @@ def LB0 : Ri<6, 2, "lb0">, DwarfRegNum<[48]>; def LB1 : Ri<6, 5, "lb1">, DwarfRegNum<[49]>; // Register classes. -def D16 : RegisterClass<"BF", [i16], 16, - [R0H, R0L, R1H, R1L, R2H, R2L, R3H, R3L, - R4H, R4L, R5H, R5L, R6H, R6L, R7H, R7L]>; +def D16L : RegisterClass<"BF", [i16], 16, (sequence "R%uL", 0, 7)>; -def D16L : RegisterClass<"BF", [i16], 16, - [R0L, R1L, R2L, R3L, R4L, R5L, R6L, R7L]>; +def D16H : RegisterClass<"BF", [i16], 16, (sequence "R%uH", 0, 7)>; -def D16H : RegisterClass<"BF", [i16], 16, - [R0H, R1H, R2H, R3H, R4H, R5H, R6H, R7H]>; - -def P16 : RegisterClass<"BF", [i16], 16, - [P0H, P0L, P1H, P1L, P2H, P2L, P3H, P3L, - P4H, P4L, P5H, P5L, SPH, SPL, FPH, FPL]>; +def D16 : RegisterClass<"BF", [i16], 16, (add D16L, D16H)>; def P16L : RegisterClass<"BF", [i16], 16, - [P0L, P1L, P2L, P3L, P4L, P5L, SPL, FPL]>; + (add (sequence "P%uL", 0, 5), SPL, FPL)>; def P16H : RegisterClass<"BF", [i16], 16, - [P0H, P1H, P2H, P3H, P4H, P5H, SPH, FPH]>; + (add (sequence "P%uH", 0, 5), SPH, FPH)>; + +def P16 : RegisterClass<"BF", [i16], 16, (add P16L, P16H)>; -def DP16 : RegisterClass<"BF", [i16], 16, - [R0H, R0L, R1H, R1L, R2H, R2L, R3H, R3L, - R4H, R4L, R5H, R5L, R6H, R6L, R7H, R7L, - P0H, P0L, P1H, P1L, P2H, P2L, P3H, P3L, - P4H, P4L, P5H, P5L, SPH, SPL, FPH, FPL]>; +def DP16 : RegisterClass<"BF", [i16], 16, (add D16, P16)>; -def DP16L : RegisterClass<"BF", [i16], 16, - [R0L, R1L, R2L, R3L, R4L, R5L, R6L, R7L, - P0L, P1L, P2L, P3L, P4L, P5L, SPL, FPL]>; +def DP16L : RegisterClass<"BF", [i16], 16, (add D16L, P16L)>; -def DP16H : RegisterClass<"BF", [i16], 16, - [R0H, R1H, R2H, R3H, R4H, R5H, R6H, R7H, - P0H, P1H, P2H, P3H, P4H, P5H, SPH, FPH]>; +def DP16H : RegisterClass<"BF", [i16], 16, (add D16H, P16H)>; def GR16 : RegisterClass<"BF", [i16], 16, - [R0H, R0L, R1H, R1L, R2H, R2L, R3H, R3L, - R4H, R4L, R5H, R5L, R6H, R6L, R7H, R7L, - P0H, P0L, P1H, P1L, P2H, P2L, P3H, P3L, - P4H, P4L, P5H, P5L, SPH, SPL, FPH, FPL, + (add DP16, I0H, I0L, I1H, I1L, I2H, I2L, I3H, I3L, M0H, M0L, M1H, M1L, M2H, M2L, M3H, M3L, B0H, B0L, B1H, B1L, B2H, B2L, B3H, B3L, - L0H, L0L, L1H, L1L, L2H, L2L, L3H, L3L]>; + L0H, L0L, L1H, L1L, L2H, L2L, L3H, L3L)>; -def D : RegisterClass<"BF", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> { +def D : RegisterClass<"BF", [i32], 32, (sequence "R%u", 0, 7)> { let SubRegClasses = [(D16L lo16), (D16H hi16)]; } -def P : RegisterClass<"BF", [i32], 32, [P0, P1, P2, P3, P4, P5, FP, SP]> { +def P : RegisterClass<"BF", [i32], 32, (add (sequence "P%u", 0, 5), FP, SP)> { let SubRegClasses = [(P16L lo16), (P16H hi16)]; - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - PClass::iterator - PClass::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - return allocation_order_begin(MF) - + (TFI->hasFP(MF) ? 
7 : 6); - } - }]; } -def I : RegisterClass<"BF", [i32], 32, [I0, I1, I2, I3]>; -def M : RegisterClass<"BF", [i32], 32, [M0, M1, M2, M3]>; -def B : RegisterClass<"BF", [i32], 32, [B0, B1, B2, B3]>; -def L : RegisterClass<"BF", [i32], 32, [L0, L1, L2, L3]>; - -def DP : RegisterClass<"BF", [i32], 32, - [R0, R1, R2, R3, R4, R5, R6, R7, - P0, P1, P2, P3, P4, P5, FP, SP]> { +def DP : RegisterClass<"BF", [i32], 32, (add D, P)> { let SubRegClasses = [(DP16L lo16), (DP16H hi16)]; - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - DPClass::iterator - DPClass::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - return allocation_order_begin(MF) - + (TFI->hasFP(MF) ? 15 : 14); - } - }]; } -def GR : RegisterClass<"BF", [i32], 32, - [R0, R1, R2, R3, R4, R5, R6, R7, - P0, P1, P2, P3, P4, P5, - I0, I1, I2, I3, M0, M1, M2, M3, - B0, B1, B2, B3, L0, L1, L2, L3, - FP, SP]> { - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GRClass::iterator - GRClass::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - return allocation_order_begin(MF) - + (TFI->hasFP(MF) ? 31 : 30); - } - }]; -} +def I : RegisterClass<"BF", [i32], 32, (add I0, I1, I2, I3)>; +def M : RegisterClass<"BF", [i32], 32, (add M0, M1, M2, M3)>; +def B : RegisterClass<"BF", [i32], 32, (add B0, B1, B2, B3)>; +def L : RegisterClass<"BF", [i32], 32, (add L0, L1, L2, L3)>; + +def GR : RegisterClass<"BF", [i32], 32, (add DP, I, M, B, L)>; def ALL : RegisterClass<"BF", [i32], 32, - [R0, R1, R2, R3, R4, R5, R6, R7, - P0, P1, P2, P3, P4, P5, - I0, I1, I2, I3, M0, M1, M2, M3, - B0, B1, B2, B3, L0, L1, L2, L3, - FP, SP, + (add GR, A0X, A0W, A1X, A1W, ASTAT, RETS, LC0, LT0, LB0, LC1, LT1, LB1, CYCLES, CYCLES2, - USP, SEQSTAT, SYSCFG, RETI, RETX, RETN, RETE, EMUDAT]> { - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - ALLClass::iterator - ALLClass::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - return allocation_order_begin(MF) - + (TFI->hasFP(MF) ? 31 : 30); - } - }]; -} + USP, SEQSTAT, SYSCFG, RETI, RETX, RETN, RETE, EMUDAT)>; -def PI : RegisterClass<"BF", [i32], 32, - [P0, P1, P2, P3, P4, P5, I0, I1, I2, I3, FP, SP]> { - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - PIClass::iterator - PIClass::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - return allocation_order_begin(MF) - + (TFI->hasFP(MF) ? 11 : 10); - } - }]; -} +def PI : RegisterClass<"BF", [i32], 32, (add P, I)>; // We are going to pretend that CC and !CC are 32-bit registers, even though // they only can hold 1 bit. 
let CopyCost = -1, Size = 8 in { -def JustCC : RegisterClass<"BF", [i32], 8, [CC]>; -def NotCC : RegisterClass<"BF", [i32], 8, [NCC]>; -def AnyCC : RegisterClass<"BF", [i32], 8, [CC, NCC]> { +def JustCC : RegisterClass<"BF", [i32], 8, (add CC)>; +def NotCC : RegisterClass<"BF", [i32], 8, (add NCC)>; +def AnyCC : RegisterClass<"BF", [i32], 8, (add CC, NCC)> { let MethodProtos = [{ iterator allocation_order_end(const MachineFunction &MF) const; }]; @@ -358,8 +266,8 @@ def AnyCC : RegisterClass<"BF", [i32], 8, [CC, NCC]> { }]; } def StatBit : RegisterClass<"BF", [i1], 8, - [AZ, AN, CC, AQ, AC0, AC1, AV0, AV0S, AV1, AV1S, V, VS]>; + (add AZ, AN, CC, AQ, AC0, AC1, AV0, AV0S, AV1, AV1S, V, VS)>; } // Should be i40, but that isn't defined. It is not a legal type yet anyway. -def Accu : RegisterClass<"BF", [i64], 64, [A0, A1]>; +def Accu : RegisterClass<"BF", [i64], 64, (add A0, A1)>; diff --git a/lib/Target/CBackend/CBackend.cpp b/lib/Target/CBackend/CBackend.cpp index 358d1b3..7c24037 100644 --- a/lib/Target/CBackend/CBackend.cpp +++ b/lib/Target/CBackend/CBackend.cpp @@ -278,7 +278,7 @@ namespace { return AI; } - // isInlineAsm - Check if the instruction is a call to an inline asm chunk + // isInlineAsm - Check if the instruction is a call to an inline asm chunk. static bool isInlineAsm(const Instruction& I) { if (const CallInst *CI = dyn_cast<CallInst>(&I)) return isa<InlineAsm>(CI->getCalledValue()); @@ -373,7 +373,7 @@ static std::string CBEMangle(const std::string &S) { /// bool CBackendNameAllUsedStructsAndMergeFunctions::runOnModule(Module &M) { // Get a set of types that are used by the program... - std::set<const Type *> UT = getAnalysis<FindUsedTypes>().getTypes(); + SetVector<const Type *> UT = getAnalysis<FindUsedTypes>().getTypes(); // Loop over the module symbol table, removing types from UT that are // already named, and removing names for types that are not used. @@ -390,11 +390,10 @@ bool CBackendNameAllUsedStructsAndMergeFunctions::runOnModule(Module &M) { TST.remove(I); } else { // If this is not used, remove it from the symbol table. - std::set<const Type *>::iterator UTI = UT.find(I->second); - if (UTI == UT.end()) + if (!UT.count(I->second)) TST.remove(I); else - UT.erase(UTI); // Only keep one name for this type. + UT.remove(I->second); // Only keep one name for this type. } } @@ -403,7 +402,7 @@ bool CBackendNameAllUsedStructsAndMergeFunctions::runOnModule(Module &M) { // bool Changed = false; unsigned RenameCounter = 0; - for (std::set<const Type *>::const_iterator I = UT.begin(), E = UT.end(); + for (SetVector<const Type *>::const_iterator I = UT.begin(), E = UT.end(); I != E; ++I) if ((*I)->isStructTy() || (*I)->isArrayTy()) { while (M.addTypeName("unnamed"+utostr(RenameCounter), *I)) @@ -661,7 +660,7 @@ void CWriter::printConstantArray(ConstantArray *CPA, bool Static) { if (isString) { Out << '\"'; - // Keep track of whether the last number was a hexadecimal escape + // Keep track of whether the last number was a hexadecimal escape. bool LastWasHex = false; // Do not include the last character, which we know is null diff --git a/lib/Target/CellSPU/SPU64InstrInfo.td b/lib/Target/CellSPU/SPU64InstrInfo.td index 5ef5716..f340edf 100644 --- a/lib/Target/CellSPU/SPU64InstrInfo.td +++ b/lib/Target/CellSPU/SPU64InstrInfo.td @@ -24,7 +24,7 @@ // 5. The code sequences for r64 and v2i64 are probably overly conservative, // compared to the code that gcc produces. // -// M00$E B!tes Kan be Pretty N@sTi!!!!! (appologies to Monty!) +// M00$E B!tes Kan be Pretty N@sTi!!!!! 
(apologies to Monty!) //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ // selb instruction definition for i64. Note that the selection mask is diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index 743a4d7..f9b5041 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -445,6 +445,8 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); + setMinFunctionAlignment(3); + computeRegisterProperties(); // Set pre-RA register scheduler default to BURR, which produces slightly @@ -489,11 +491,6 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const return ((i != node_names.end()) ? i->second : 0); } -/// getFunctionAlignment - Return the Log2 alignment of this function. -unsigned SPUTargetLowering::getFunctionAlignment(const Function *) const { - return 3; -} - //===----------------------------------------------------------------------===// // Return the Cell SPU's SETCC result type //===----------------------------------------------------------------------===// @@ -705,7 +702,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { offset )); - // Shift the low similarily + // Shift the low similarly // TODO: add SPUISD::SHL_BYTES low = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, low, offset ); @@ -1120,8 +1117,8 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, - *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); // FIXME: allow for other calling conventions CCInfo.AnalyzeFormalArguments(Ins, CCC_SPU); @@ -1218,7 +1215,7 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, FuncInfo->setVarArgsFrameIndex( MFI->CreateFixedObject(StackSlotSize, ArgOffset, true)); SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); - unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::R32CRegClass); + unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::VECREGRegClass); SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8); SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, MachinePointerInfo(), false, false, 0); @@ -1267,8 +1264,8 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, unsigned StackSlotSize = SPUFrameLowering::stackSlotSize(); SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, - *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); // FIXME: allow for other calling conventions CCInfo.AnalyzeCallOperands(Outs, CCC_SPU); @@ -1428,8 +1425,8 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Now handle the return value(s) SmallVector<CCValAssign, 16> RVLocs; - CCState CCRetInfo(CallConv, isVarArg, getTargetMachine(), - RVLocs, *DAG.getContext()); + CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); CCRetInfo.AnalyzeCallResult(Ins, CCC_SPU); @@ -1455,8 +1452,8 @@ SPUTargetLowering::LowerReturn(SDValue Chain, DebugLoc dl, SelectionDAG &DAG) const { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - RVLocs, *DAG.getContext()); + CCState 
CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_SPU); // If this is the first return lowered for this function, add the regs to the @@ -3207,11 +3204,11 @@ SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, // LowerAsmOperandForConstraint void SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op, - char ConstraintLetter, + std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const { // Default, for the time being, to the base class handler - TargetLowering::LowerAsmOperandForConstraint(Op, ConstraintLetter, Ops, DAG); + TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } /// isLegalAddressImmediate - Return true if the integer value can be used diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h index cf883e2..d23f6cc 100644 --- a/lib/Target/CellSPU/SPUISelLowering.h +++ b/lib/Target/CellSPU/SPUISelLowering.h @@ -141,7 +141,7 @@ namespace llvm { getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const; - void LowerAsmOperandForConstraint(SDValue Op, char ConstraintLetter, + void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const; @@ -152,9 +152,6 @@ namespace llvm { virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; - /// getFunctionAlignment - Return the Log2 alignment of this function. - virtual unsigned getFunctionAlignment(const Function *F) const; - virtual SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, diff --git a/lib/Target/CellSPU/SPURegisterInfo.cpp b/lib/Target/CellSPU/SPURegisterInfo.cpp index 0bdd50a..623ae76 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.cpp +++ b/lib/Target/CellSPU/SPURegisterInfo.cpp @@ -328,6 +328,10 @@ SPURegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const { return SPUGenRegisterInfo::getDwarfRegNumFull(RegNum, 0); } +int SPURegisterInfo::getLLVMRegNum(unsigned RegNum, bool isEH) const { + return SPUGenRegisterInfo::getLLVMRegNumFull(RegNum, 0); +} + int SPURegisterInfo::convertDFormToXForm(int dFormOpcode) const { diff --git a/lib/Target/CellSPU/SPURegisterInfo.h b/lib/Target/CellSPU/SPURegisterInfo.h index 1708c59..6ecf0f2 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.h +++ b/lib/Target/CellSPU/SPURegisterInfo.h @@ -83,6 +83,7 @@ namespace llvm { //! Get DWARF debugging register number int getDwarfRegNum(unsigned RegNum, bool isEH) const; + int getLLVMRegNum(unsigned RegNum, bool isEH) const; //! Convert D-form load/store to X-form load/store /*! diff --git a/lib/Target/CellSPU/SPURegisterInfo.td b/lib/Target/CellSPU/SPURegisterInfo.td index 3e8f097..e16f51f 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.td +++ b/lib/Target/CellSPU/SPURegisterInfo.td @@ -155,275 +155,29 @@ def R127 : SPUVecReg<127, "$127">, DwarfRegNum<[127]>; // The SPU's registers as 128-bit wide entities, and can function as general // purpose registers, where the operands are in the "preferred slot": +// The non-volatile registers are allocated in reverse order, like PPC does it. 
def GPRC : RegisterClass<"SPU", [i128], 128, - [ - /* volatile register */ - R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, - R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, - R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, - R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, - R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, - R77, R78, R79, - /* non-volatile register: take hint from PPC and allocate in reverse order */ - R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115, - R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102, - R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87, - R86, R85, R84, R83, R82, R81, R80, - /* environment ptr, SP, LR */ - R2, R1, R0 ]> -{ - let MethodProtos = [{ - iterator allocation_order_begin(const MachineFunction &MF) const; - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GPRCClass::iterator - GPRCClass::allocation_order_begin(const MachineFunction &MF) const { - return begin(); - } - GPRCClass::iterator - GPRCClass::allocation_order_end(const MachineFunction &MF) const { - return end()-3; // don't allocate R2, R1, or R0 (envp, sp, lr) - } - }]; -} + (add (sequence "R%u", 0, 79), + (sequence "R%u", 127, 80))>; // The SPU's registers as 64-bit wide (double word integer) "preferred slot": -def R64C : RegisterClass<"SPU", [i64], 128, - [ - /* volatile register */ - R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, - R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, - R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, - R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, - R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, - R77, R78, R79, - /* non-volatile register: take hint from PPC and allocate in reverse order */ - R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115, - R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102, - R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87, - R86, R85, R84, R83, R82, R81, R80, - /* environment ptr, SP, LR */ - R2, R1, R0 ]> -{ - let MethodProtos = [{ - iterator allocation_order_begin(const MachineFunction &MF) const; - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - R64CClass::iterator - R64CClass::allocation_order_begin(const MachineFunction &MF) const { - return begin(); - } - R64CClass::iterator - R64CClass::allocation_order_end(const MachineFunction &MF) const { - return end()-3; // don't allocate R2, R1, or R0 (envp, sp, lr) - } - }]; -} +def R64C : RegisterClass<"SPU", [i64], 128, (add GPRC)>; // The SPU's registers as 64-bit wide (double word) FP "preferred slot": -def R64FP : RegisterClass<"SPU", [f64], 128, - [ - /* volatile register */ - R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, - R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, - R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, - R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, - R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, - R77, R78, R79, - /* non-volatile register: take hint from PPC and allocate in reverse order */ - R127, R126, R125, R124, R123, R122, R121, 
R120, R119, R118, R117, R116, R115, - R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102, - R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87, - R86, R85, R84, R83, R82, R81, R80, - /* environment ptr, SP, LR */ - R2, R1, R0 ]> -{ - let MethodProtos = [{ - iterator allocation_order_begin(const MachineFunction &MF) const; - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - R64FPClass::iterator - R64FPClass::allocation_order_begin(const MachineFunction &MF) const { - return begin(); - } - R64FPClass::iterator - R64FPClass::allocation_order_end(const MachineFunction &MF) const { - return end()-3; // don't allocate R2, R1, or R0 (envp, sp, lr) - } - }]; -} +def R64FP : RegisterClass<"SPU", [f64], 128, (add GPRC)>; // The SPU's registers as 32-bit wide (word) "preferred slot": -def R32C : RegisterClass<"SPU", [i32], 128, - [ - /* volatile register */ - R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, - R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, - R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, - R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, - R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, - R77, R78, R79, - /* non-volatile register: take hint from PPC and allocate in reverse order */ - R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115, - R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102, - R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87, - R86, R85, R84, R83, R82, R81, R80, - /* environment ptr, SP, LR */ - R2, R1, R0 ]> -{ - let MethodProtos = [{ - iterator allocation_order_begin(const MachineFunction &MF) const; - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - R32CClass::iterator - R32CClass::allocation_order_begin(const MachineFunction &MF) const { - return begin(); - } - R32CClass::iterator - R32CClass::allocation_order_end(const MachineFunction &MF) const { - return end()-3; // don't allocate R2, R1, or R0 (envp, sp, lr) - } - }]; -} +def R32C : RegisterClass<"SPU", [i32], 128, (add GPRC)>; // The SPU's registers as single precision floating point "preferred slot": -def R32FP : RegisterClass<"SPU", [f32], 128, - [ - /* volatile register */ - R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, - R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, - R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, - R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, - R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, - R77, R78, R79, - /* non-volatile register: take hint from PPC and allocate in reverse order */ - R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115, - R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102, - R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87, - R86, R85, R84, R83, R82, R81, R80, - /* environment ptr, SP, LR */ - R2, R1, R0 ]> -{ - let MethodProtos = [{ - iterator allocation_order_begin(const MachineFunction &MF) const; - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - R32FPClass::iterator - R32FPClass::allocation_order_begin(const MachineFunction &MF) const { - return begin(); - } - 
R32FPClass::iterator - R32FPClass::allocation_order_end(const MachineFunction &MF) const { - return end()-3; // don't allocate R2, R1, or R0 (envp, sp, lr) - } - }]; -} +def R32FP : RegisterClass<"SPU", [f32], 128, (add GPRC)>; // The SPU's registers as 16-bit wide (halfword) "preferred slot": -def R16C : RegisterClass<"SPU", [i16], 128, - [ - /* volatile register */ - R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, - R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, - R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, - R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, - R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, - R77, R78, R79, - /* non-volatile register: take hint from PPC and allocate in reverse order */ - R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115, - R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102, - R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87, - R86, R85, R84, R83, R82, R81, R80, - /* environment ptr, SP, LR */ - R2, R1, R0 ]> -{ - let MethodProtos = [{ - iterator allocation_order_begin(const MachineFunction &MF) const; - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - R16CClass::iterator - R16CClass::allocation_order_begin(const MachineFunction &MF) const { - return begin(); - } - R16CClass::iterator - R16CClass::allocation_order_end(const MachineFunction &MF) const { - return end()-3; // don't allocate R2, R1, or R0 (envp, sp, lr) - } - }]; -} +def R16C : RegisterClass<"SPU", [i16], 128, (add GPRC)>; // The SPU's registers as 8-bit wide (byte) "preferred slot": -def R8C : RegisterClass<"SPU", [i8], 128, - [ - /* volatile register */ - R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, - R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, - R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, - R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, - R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, - R77, R78, R79, - /* non-volatile register: take hint from PPC and allocate in reverse order */ - R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115, - R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102, - R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87, - R86, R85, R84, R83, R82, R81, R80, - /* environment ptr, SP, LR */ - R2, R1, R0 ]> -{ - let MethodProtos = [{ - iterator allocation_order_begin(const MachineFunction &MF) const; - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - R8CClass::iterator - R8CClass::allocation_order_begin(const MachineFunction &MF) const { - return begin(); - } - R8CClass::iterator - R8CClass::allocation_order_end(const MachineFunction &MF) const { - return end()-3; // don't allocate R2, R1, or R0 (envp, sp, lr) - } - }]; -} +def R8C : RegisterClass<"SPU", [i8], 128, (add GPRC)>; // The SPU's registers as vector registers: -def VECREG : RegisterClass<"SPU", - [v16i8,v8i16,v4i32,v4f32,v2i64,v2f64], - 128, - [ - /* volatile register */ - R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, - R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, - R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, - R47, R48, R49, R50, 
R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, - R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, - R77, R78, R79, - /* non-volatile register: take hint from PPC and allocate in reverse order */ - R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115, - R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102, - R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87, - R86, R85, R84, R83, R82, R81, R80, - /* environment ptr, SP, LR */ - R2, R1, R0 ]> -{ - let MethodProtos = [{ - iterator allocation_order_begin(const MachineFunction &MF) const; - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - VECREGClass::iterator - VECREGClass::allocation_order_begin(const MachineFunction &MF) const { - return begin(); - } - VECREGClass::iterator - VECREGClass::allocation_order_end(const MachineFunction &MF) const { - return end()-3; // don't allocate R2, R1, or R0 (envp, sp, lr) - } - }]; -} +def VECREG : RegisterClass<"SPU", [v16i8,v8i16,v4i32,v4f32,v2i64,v2f64], 128, + (add GPRC)>; diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp index 38de3b6..797cfd5 100644 --- a/lib/Target/CppBackend/CPPBackend.cpp +++ b/lib/Target/CppBackend/CPPBackend.cpp @@ -1348,7 +1348,7 @@ void CppWriter::printInstruction(const Instruction *I, const PHINode* phi = cast<PHINode>(I); Out << "PHINode* " << iName << " = PHINode::Create(" - << getCppName(phi->getType()) << ", \"" + << getCppName(phi->getType()) << ", " << phi->getNumIncomingValues() << ", \""; printEscapedString(phi->getName()); Out << "\", " << bbname << ");"; diff --git a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp index 3379ac2..060a87b 100644 --- a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp +++ b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp @@ -57,18 +57,26 @@ static unsigned mblazeBinary2Opcode[] = { }; static unsigned getRD(uint32_t insn) { + if (!MBlazeRegisterInfo::isRegister((insn>>21)&0x1F)) + return UNSUPPORTED; return MBlazeRegisterInfo::getRegisterFromNumbering((insn>>21)&0x1F); } static unsigned getRA(uint32_t insn) { + if (!MBlazeRegisterInfo::isRegister((insn>>16)&0x1F)) + return UNSUPPORTED; return MBlazeRegisterInfo::getRegisterFromNumbering((insn>>16)&0x1F); } static unsigned getRB(uint32_t insn) { + if (!MBlazeRegisterInfo::isRegister((insn>>11)&0x1F)) + return UNSUPPORTED; return MBlazeRegisterInfo::getRegisterFromNumbering((insn>>11)&0x1F); } static int64_t getRS(uint32_t insn) { + if (!MBlazeRegisterInfo::isSpecialRegister(insn&0x3FFF)) + return UNSUPPORTED; return MBlazeRegisterInfo::getSpecialRegisterFromNumbering(insn&0x3FFF); } @@ -489,13 +497,14 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr, raw_ostream &vStream) const { // The machine instruction. uint32_t insn; + uint64_t read; uint8_t bytes[4]; - // We always consume 4 bytes of data - size = 4; + // By default we consume 1 byte on failure + size = 1; // We want to read exactly 4 bytes of data. - if (region.readBytes(address, 4, (uint8_t*)bytes, NULL) == -1) + if (region.readBytes(address, 4, (uint8_t*)bytes, &read) == -1 || read < 4) return false; // Encoded as a big-endian 32-bit word in the stream. 
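The hunk above changes getInstruction's failure contract: on a short read it now reports a single consumed byte instead of four, so a driver can resynchronize byte by byte. A standalone sketch of that contract follows; it is not the LLVM MemoryObject API, and decodeWord with its buffer-based signature is illustrative only:

#include <cstddef>
#include <cstdint>

// Consume 1 byte on a short read, 4 only once a full big-endian
// word has been fetched; mirrors the size handling in the hunk above.
static bool decodeWord(const uint8_t *Buf, size_t Avail,
                       uint32_t &Insn, uint64_t &Size) {
  Size = 1;                          // default: consume 1 byte on failure
  if (Avail < 4)
    return false;                    // truncated stream
  Insn = (uint32_t(Buf[0]) << 24) | (uint32_t(Buf[1]) << 16) |
         (uint32_t(Buf[2]) << 8)  |  uint32_t(Buf[3]);  // big-endian word
  Size = 4;                          // success: consumed the whole word
  return true;
}
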
@@ -509,44 +518,63 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr, instr.setOpcode(opcode); + unsigned RD = getRD(insn); + unsigned RA = getRA(insn); + unsigned RB = getRB(insn); + unsigned RS = getRS(insn); + uint64_t tsFlags = MBlazeInsts[opcode].TSFlags; switch ((tsFlags & MBlazeII::FormMask)) { - default: llvm_unreachable("unknown instruction encoding"); + default: + return false; case MBlazeII::FRRRR: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); - instr.addOperand(MCOperand::CreateReg(getRB(insn))); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + if (RD == UNSUPPORTED || RA == UNSUPPORTED || RB == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); + instr.addOperand(MCOperand::CreateReg(RB)); + instr.addOperand(MCOperand::CreateReg(RA)); break; case MBlazeII::FRRR: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); - instr.addOperand(MCOperand::CreateReg(getRB(insn))); + if (RD == UNSUPPORTED || RA == UNSUPPORTED || RB == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); + instr.addOperand(MCOperand::CreateReg(RA)); + instr.addOperand(MCOperand::CreateReg(RB)); break; case MBlazeII::FRI: switch (opcode) { - default: llvm_unreachable("unknown instruction encoding"); + default: + return false; case MBlaze::MFS: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); + if (RD == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); instr.addOperand(MCOperand::CreateImm(insn&0x3FFF)); break; case MBlaze::MTS: + if (RA == UNSUPPORTED) + return false; instr.addOperand(MCOperand::CreateImm(insn&0x3FFF)); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + instr.addOperand(MCOperand::CreateReg(RA)); break; case MBlaze::MSRSET: case MBlaze::MSRCLR: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); + if (RD == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); instr.addOperand(MCOperand::CreateImm(insn&0x7FFF)); break; } break; case MBlazeII::FRRI: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + if (RD == UNSUPPORTED || RA == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); + instr.addOperand(MCOperand::CreateReg(RA)); switch (opcode) { default: instr.addOperand(MCOperand::CreateImm(getIMM(insn))); @@ -560,27 +588,37 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr, break; case MBlazeII::FCRR: - instr.addOperand(MCOperand::CreateReg(getRA(insn))); - instr.addOperand(MCOperand::CreateReg(getRB(insn))); + if (RA == UNSUPPORTED || RB == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RA)); + instr.addOperand(MCOperand::CreateReg(RB)); break; case MBlazeII::FCRI: - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + if (RA == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RA)); instr.addOperand(MCOperand::CreateImm(getIMM(insn))); break; case MBlazeII::FRCR: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); - instr.addOperand(MCOperand::CreateReg(getRB(insn))); + if (RD == UNSUPPORTED || RB == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); + instr.addOperand(MCOperand::CreateReg(RB)); break; case MBlazeII::FRCI: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); + if (RD == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); instr.addOperand(MCOperand::CreateImm(getIMM(insn))); break; case MBlazeII::FCCR: - 
instr.addOperand(MCOperand::CreateReg(getRB(insn))); + if (RB == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RB)); break; case MBlazeII::FCCI: @@ -588,33 +626,45 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr, break; case MBlazeII::FRRCI: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + if (RD == UNSUPPORTED || RA == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); + instr.addOperand(MCOperand::CreateReg(RA)); instr.addOperand(MCOperand::CreateImm(getSHT(insn))); break; case MBlazeII::FRRC: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + if (RD == UNSUPPORTED || RA == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); + instr.addOperand(MCOperand::CreateReg(RA)); break; case MBlazeII::FRCX: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); + if (RD == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); instr.addOperand(MCOperand::CreateImm(getFSL(insn))); break; case MBlazeII::FRCS: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); - instr.addOperand(MCOperand::CreateReg(getRS(insn))); + if (RD == UNSUPPORTED || RS == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); + instr.addOperand(MCOperand::CreateReg(RS)); break; case MBlazeII::FCRCS: - instr.addOperand(MCOperand::CreateReg(getRS(insn))); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + if (RS == UNSUPPORTED || RA == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RS)); + instr.addOperand(MCOperand::CreateReg(RA)); break; case MBlazeII::FCRCX: - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + if (RA == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RA)); instr.addOperand(MCOperand::CreateImm(getFSL(insn))); break; @@ -623,16 +673,23 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr, break; case MBlazeII::FCR: - instr.addOperand(MCOperand::CreateReg(getRB(insn))); + if (RB == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RB)); break; case MBlazeII::FRIR: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); + if (RD == UNSUPPORTED || RA == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); instr.addOperand(MCOperand::CreateImm(getIMM(insn))); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + instr.addOperand(MCOperand::CreateReg(RA)); break; } + // We always consume 4 bytes of data on success + size = 4; + return true; } diff --git a/lib/Target/MBlaze/MBlaze.td b/lib/Target/MBlaze/MBlaze.td index 1fa1e4d..1245658 100644 --- a/lib/Target/MBlaze/MBlaze.td +++ b/lib/Target/MBlaze/MBlaze.td @@ -31,49 +31,28 @@ def MBlazeInstrInfo : InstrInfo; // Microblaze Subtarget features // //===----------------------------------------------------------------------===// -def FeaturePipe3 : SubtargetFeature<"pipe3", "HasPipe3", "true", - "Implements 3-stage pipeline">; def FeatureBarrel : SubtargetFeature<"barrel", "HasBarrel", "true", "Implements barrel shifter">; def FeatureDiv : SubtargetFeature<"div", "HasDiv", "true", "Implements hardware divider">; def FeatureMul : SubtargetFeature<"mul", "HasMul", "true", "Implements hardware multiplier">; -def FeatureFSL : SubtargetFeature<"fsl", "HasFSL", "true", - "Implements FSL instructions">; -def FeatureEFSL : SubtargetFeature<"efsl", "HasEFSL", "true", - "Implements extended FSL instructions">; -def FeatureMSRSet : 
SubtargetFeature<"msrset", "HasMSRSet", "true", - "Implements MSR register set and clear">; -def FeatureException : SubtargetFeature<"exception", "HasException", "true", - "Implements hardware exception support">; def FeaturePatCmp : SubtargetFeature<"patcmp", "HasPatCmp", "true", "Implements pattern compare instruction">; def FeatureFPU : SubtargetFeature<"fpu", "HasFPU", "true", "Implements floating point unit">; -def FeatureESR : SubtargetFeature<"esr", "HasESR", "true", - "Implements ESR and EAR registers">; -def FeaturePVR : SubtargetFeature<"pvr", "HasPVR", "true", - "Implements processor version register">; def FeatureMul64 : SubtargetFeature<"mul64", "HasMul64", "true", "Implements multiplier with 64-bit result">; def FeatureSqrt : SubtargetFeature<"sqrt", "HasSqrt", "true", "Implements sqrt and floating point convert">; -def FeatureMMU : SubtargetFeature<"mmu", "HasMMU", "true", - "Implements memory management unit">; //===----------------------------------------------------------------------===// // MBlaze processors supported. //===----------------------------------------------------------------------===// -class Proc<string Name, list<SubtargetFeature> Features> - : Processor<Name, MBlazeGenericItineraries, Features>; - -def : Proc<"v400", []>; -def : Proc<"v500", []>; -def : Proc<"v600", []>; -def : Proc<"v700", []>; -def : Proc<"v710", []>; +def : Processor<"mblaze", MBlazeGenericItineraries, []>; +def : Processor<"mblaze3", MBlazePipe3Itineraries, []>; +def : Processor<"mblaze5", MBlazePipe5Itineraries, []>; //===----------------------------------------------------------------------===// // Instruction Descriptions diff --git a/lib/Target/MBlaze/MBlazeAsmBackend.cpp b/lib/Target/MBlaze/MBlazeAsmBackend.cpp index a4b21af..08f14c3 100644 --- a/lib/Target/MBlaze/MBlazeAsmBackend.cpp +++ b/lib/Target/MBlaze/MBlazeAsmBackend.cpp @@ -150,14 +150,13 @@ void ELFMBlazeAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data, TargetAsmBackend *llvm::createMBlazeAsmBackend(const Target &T, const std::string &TT) { - switch (Triple(TT).getOS()) { - case Triple::Darwin: + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin()) assert(0 && "Mac not supported on MBlaze"); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: + + if (TheTriple.isOSWindows()) assert(0 && "Windows not supported on MBlaze"); - default: - return new ELFMBlazeAsmBackend(T, Triple(TT).getOS()); - } + + return new ELFMBlazeAsmBackend(T, TheTriple.getOS()); } diff --git a/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp b/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp index 4399ee2..973e968 100644 --- a/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp +++ b/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp @@ -77,7 +77,7 @@ static bool hasImmInstruction(MachineBasicBlock::iterator &candidate) { // We must assume that unknown immediate values require more than // 16-bits to represent. 
- if (mop.isGlobal() || mop.isSymbol()) + if (mop.isGlobal() || mop.isSymbol() || mop.isJTI() || mop.isCPI()) return true; // FIXME: we could probably check to see if the FP value happens diff --git a/lib/Target/MBlaze/MBlazeISelLowering.cpp b/lib/Target/MBlaze/MBlazeISelLowering.cpp index f39826b..c5e0a89 100644 --- a/lib/Target/MBlaze/MBlazeISelLowering.cpp +++ b/lib/Target/MBlaze/MBlazeISelLowering.cpp @@ -180,6 +180,8 @@ MBlazeTargetLowering::MBlazeTargetLowering(MBlazeTargetMachine &TM) setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); + setMinFunctionAlignment(2); + setStackPointerRegisterToSaveRestore(MBlaze::R1); computeRegisterProperties(); } @@ -188,11 +190,6 @@ MVT::SimpleValueType MBlazeTargetLowering::getSetCCResultType(EVT VT) const { return MVT::i32; } -/// getFunctionAlignment - Return the Log2 alignment of this function. -unsigned MBlazeTargetLowering::getFunctionAlignment(const Function *) const { - return 2; -} - SDValue MBlazeTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) @@ -274,7 +271,7 @@ MBlazeTargetLowering::EmitCustomShift(MachineInstr *MI, F->insert(It, loop); F->insert(It, finish); - // Update machine-CFG edges by transfering adding all successors and + // Update machine-CFG edges by transferring all successors and // remaining instructions from the current block to the new block which // will contain the Phi node for the select. finish->splice(finish->begin(), MBB, @@ -420,7 +417,7 @@ MBlazeTargetLowering::EmitCustomAtomic(MachineInstr *MI, // All atomic instructions on the Microblaze are implemented using the // load-linked / store-conditional style atomic instruction sequences. // Thus, all operations will look something like the following: - // + // // start: // lwx RV, RP, 0 // <do stuff> @@ -456,7 +453,7 @@ MBlazeTargetLowering::EmitCustomAtomic(MachineInstr *MI, F->insert(It, start); F->insert(It, exit); - // Update machine-CFG edges by transfering adding all successors and + // Update machine-CFG edges by transferring all successors and // remaining instructions from the current block to the new block which // will contain the Phi node for the select. exit->splice(exit->begin(), MBB, llvm::next(MachineBasicBlock::iterator(MI)), @@ -701,8 +698,8 @@ LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, - *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_MBlaze); // Get a count of how many bytes are to be pushed on the stack. @@ -778,7 +775,7 @@ LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, // Build a sequence of copy-to-reg nodes chained together with token // chain and flag operands which copy the outgoing args into registers. - // The InFlag in necessary since all emited instructions must be + // The InFlag is necessary since all emitted instructions must be // stuck together. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { @@ -840,8 +837,8 @@ LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, SmallVectorImpl<SDValue> &InVals) const { // Assign locations to each value returned by this call. 
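// The CCState rewrite below recurs at every argument- and return-analysis
// site in this patch: the constructor gains a MachineFunction parameter
// (presumably so calling-convention analysis can reach per-function state),
// with the remaining arguments keeping their order. Shape of the updated
// call, as a sketch mirroring the + lines:
//
//   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
//                  getTargetMachine(), RVLocs, *DAG.getContext());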
SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_MBlaze); @@ -883,8 +880,8 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, CC_MBlaze); SDValue StackPtr; @@ -1015,8 +1012,8 @@ LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, SmallVector<CCValAssign, 16> RVLocs; // CCState - Info about the registers and stack slot. - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); // Analize return values. CCInfo.AnalyzeReturn(Outs, RetCC_MBlaze); @@ -1046,9 +1043,9 @@ LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, // If this function is using the interrupt_handler calling convention // then use "rtid r14, 0" otherwise use "rtsd r15, 8" - unsigned Ret = (CallConv == llvm::CallingConv::MBLAZE_INTR) ? MBlazeISD::IRet + unsigned Ret = (CallConv == llvm::CallingConv::MBLAZE_INTR) ? MBlazeISD::IRet : MBlazeISD::Ret; - unsigned Reg = (CallConv == llvm::CallingConv::MBLAZE_INTR) ? MBlaze::R14 + unsigned Reg = (CallConv == llvm::CallingConv::MBLAZE_INTR) ? MBlaze::R14 : MBlaze::R15; SDValue DReg = DAG.getRegister(Reg, MVT::i32); @@ -1103,7 +1100,7 @@ MBlazeTargetLowering::getSingleConstraintMatchWeight( switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); - break;
+ break; case 'd': case 'y': if (type->isIntegerTy()) diff --git a/lib/Target/MBlaze/MBlazeISelLowering.h b/lib/Target/MBlaze/MBlazeISelLowering.h index 91649bc..265c1a7 100644 --- a/lib/Target/MBlaze/MBlazeISelLowering.h +++ b/lib/Target/MBlaze/MBlazeISelLowering.h @@ -104,7 +104,6 @@ namespace llvm { /// getSetCCResultType - get the ISD::SETCC result ValueType MVT::SimpleValueType getSetCCResultType(EVT VT) const; - virtual unsigned getFunctionAlignment(const Function *F) const; private: // Subtarget Info const MBlazeSubtarget *Subtarget; diff --git a/lib/Target/MBlaze/MBlazeInstrFPU.td b/lib/Target/MBlaze/MBlazeInstrFPU.td index 094de5c..4acdcfd 100644 --- a/lib/Target/MBlaze/MBlazeInstrFPU.td +++ b/lib/Target/MBlaze/MBlazeInstrFPU.td @@ -21,22 +21,22 @@ class LoadFM<bits<6> op, string instr_asm, PatFrag OpNode> : TA<op, 0x000, (outs GPR:$dst), (ins memrr:$addr), !strconcat(instr_asm, " $dst, $addr"), - [(set (f32 GPR:$dst), (OpNode xaddr:$addr))], IILoad>; + [(set (f32 GPR:$dst), (OpNode xaddr:$addr))], IIC_MEMl>; class LoadFMI<bits<6> op, string instr_asm, PatFrag OpNode> : TB<op, (outs GPR:$dst), (ins memri:$addr), !strconcat(instr_asm, " $dst, $addr"), - [(set (f32 GPR:$dst), (OpNode iaddr:$addr))], IILoad>; + [(set (f32 GPR:$dst), (OpNode iaddr:$addr))], IIC_MEMl>; class StoreFM<bits<6> op, string instr_asm, PatFrag OpNode> : TA<op, 0x000, (outs), (ins GPR:$dst, memrr:$addr), !strconcat(instr_asm, " $dst, $addr"), - [(OpNode (f32 GPR:$dst), xaddr:$addr)], IIStore>; + [(OpNode (f32 GPR:$dst), xaddr:$addr)], IIC_MEMs>; class StoreFMI<bits<6> op, string instr_asm, PatFrag OpNode> : TB<op, (outs), (ins GPR:$dst, memrr:$addr), !strconcat(instr_asm, " $dst, $addr"), - [(OpNode (f32 GPR:$dst), iaddr:$addr)], IIStore>; + [(OpNode (f32 GPR:$dst), iaddr:$addr)], IIC_MEMs>; class ArithF<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode, InstrItinClass itin> : @@ -56,15 +56,10 @@ class ArithFR<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode, !strconcat(instr_asm, " $dst, $c, $b"), [(set GPR:$dst, (OpNode GPR:$b, GPR:$c))], itin>; -class LogicF<bits<6> op, string instr_asm> : - TB<op, (outs GPR:$dst), (ins GPR:$b, GPR:$c), - !strconcat(instr_asm, " $dst, $b, $c"), - [], IIAlu>; - class LogicFI<bits<6> op, string instr_asm> : TB<op, (outs GPR:$dst), (ins GPR:$b, fimm:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [], IIAlu>; + [], IIC_ALU>; let rb=0 in { class ArithF2<bits<6> op, bits<11> flags, string instr_asm, @@ -95,10 +90,10 @@ let rb=0 in { //===----------------------------------------------------------------------===// let Predicates=[HasFPU] in { def FORI : LogicFI<0x28, "ori ">; - def FADD : ArithF<0x16, 0x000, "fadd ", fadd, IIAlu>; - def FRSUB : ArithFR<0x16, 0x080, "frsub ", fsub, IIAlu>; - def FMUL : ArithF<0x16, 0x100, "fmul ", fmul, IIAlu>; - def FDIV : ArithF<0x16, 0x180, "fdiv ", fdiv, IIAlu>; + def FADD : ArithF<0x16, 0x000, "fadd ", fadd, IIC_FPU>; + def FRSUB : ArithFR<0x16, 0x080, "frsub ", fsub, IIC_FPU>; + def FMUL : ArithF<0x16, 0x100, "fmul ", fmul, IIC_FPU>; + def FDIV : ArithF<0x16, 0x180, "fdiv ", fdiv, IIC_FPUd>; } let Predicates=[HasFPU], isCodeGenOnly=1 in { @@ -110,19 +105,19 @@ let Predicates=[HasFPU], isCodeGenOnly=1 in { } let Predicates=[HasFPU,HasSqrt] in { - def FLT : ArithIF<0x16, 0x280, "flt ", IIAlu>; - def FINT : ArithFI<0x16, 0x300, "fint ", IIAlu>; - def FSQRT : ArithF2<0x16, 0x380, "fsqrt ", IIAlu>; + def FLT : ArithIF<0x16, 0x280, "flt ", IIC_FPUf>; + def FINT : ArithFI<0x16, 0x300, "fint ", IIC_FPUi>; + def FSQRT : 
ArithF2<0x16, 0x380, "fsqrt ", IIC_FPUs>; } let isAsCheapAsAMove = 1 in { - def FCMP_UN : CmpFN<0x16, 0x200, "fcmp.un", IIAlu>; - def FCMP_LT : CmpFN<0x16, 0x210, "fcmp.lt", IIAlu>; - def FCMP_EQ : CmpFN<0x16, 0x220, "fcmp.eq", IIAlu>; - def FCMP_LE : CmpFN<0x16, 0x230, "fcmp.le", IIAlu>; - def FCMP_GT : CmpFN<0x16, 0x240, "fcmp.gt", IIAlu>; - def FCMP_NE : CmpFN<0x16, 0x250, "fcmp.ne", IIAlu>; - def FCMP_GE : CmpFN<0x16, 0x260, "fcmp.ge", IIAlu>; + def FCMP_UN : CmpFN<0x16, 0x200, "fcmp.un", IIC_FPUc>; + def FCMP_LT : CmpFN<0x16, 0x210, "fcmp.lt", IIC_FPUc>; + def FCMP_EQ : CmpFN<0x16, 0x220, "fcmp.eq", IIC_FPUc>; + def FCMP_LE : CmpFN<0x16, 0x230, "fcmp.le", IIC_FPUc>; + def FCMP_GT : CmpFN<0x16, 0x240, "fcmp.gt", IIC_FPUc>; + def FCMP_NE : CmpFN<0x16, 0x250, "fcmp.ne", IIC_FPUc>; + def FCMP_GE : CmpFN<0x16, 0x260, "fcmp.ge", IIC_FPUc>; } diff --git a/lib/Target/MBlaze/MBlazeInstrFSL.td b/lib/Target/MBlaze/MBlazeInstrFSL.td index 3209845..3082a7e 100644 --- a/lib/Target/MBlaze/MBlazeInstrFSL.td +++ b/lib/Target/MBlaze/MBlazeInstrFSL.td @@ -13,7 +13,7 @@ class FSLGet<bits<6> op, bits<5> flags, string instr_asm, Intrinsic OpNode> : MBlazeInst<op, FRCX, (outs GPR:$dst), (ins fslimm:$b), !strconcat(instr_asm, " $dst, $b"), - [(set GPR:$dst, (OpNode immZExt4:$b))],IIAlu> + [(set GPR:$dst, (OpNode immZExt4:$b))],IIC_FSLg> { bits<5> rd; bits<4> fslno; @@ -29,7 +29,7 @@ class FSLGet<bits<6> op, bits<5> flags, string instr_asm, Intrinsic OpNode> : class FSLGetD<bits<6> op, bits<5> flags, string instr_asm, Intrinsic OpNode> : MBlazeInst<op, FRCR, (outs GPR:$dst), (ins GPR:$b), !strconcat(instr_asm, " $dst, $b"), - [(set GPR:$dst, (OpNode GPR:$b))], IIAlu> + [(set GPR:$dst, (OpNode GPR:$b))], IIC_FSLg> { bits<5> rd; bits<5> rb; @@ -45,7 +45,7 @@ class FSLGetD<bits<6> op, bits<5> flags, string instr_asm, Intrinsic OpNode> : class FSLPut<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> : MBlazeInst<op, FCRCX, (outs), (ins GPR:$v, fslimm:$b), !strconcat(instr_asm, " $v, $b"), - [(OpNode GPR:$v, immZExt4:$b)], IIAlu> + [(OpNode GPR:$v, immZExt4:$b)], IIC_FSLp> { bits<5> ra; bits<4> fslno; @@ -61,7 +61,7 @@ class FSLPut<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> : class FSLPutD<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> : MBlazeInst<op, FCRR, (outs), (ins GPR:$v, GPR:$b), !strconcat(instr_asm, " $v, $b"), - [(OpNode GPR:$v, GPR:$b)], IIAlu> + [(OpNode GPR:$v, GPR:$b)], IIC_FSLp> { bits<5> ra; bits<5> rb; @@ -77,7 +77,7 @@ class FSLPutD<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> : class FSLPutT<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> : MBlazeInst<op, FCX, (outs), (ins fslimm:$b), !strconcat(instr_asm, " $b"), - [(OpNode immZExt4:$b)], IIAlu> + [(OpNode immZExt4:$b)], IIC_FSLp> { bits<4> fslno; @@ -92,7 +92,7 @@ class FSLPutT<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> : class FSLPutTD<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> : MBlazeInst<op, FCR, (outs), (ins GPR:$b), !strconcat(instr_asm, " $b"), - [(OpNode GPR:$b)], IIAlu> + [(OpNode GPR:$b)], IIC_FSLp> { bits<5> rb; diff --git a/lib/Target/MBlaze/MBlazeInstrFormats.td b/lib/Target/MBlaze/MBlazeInstrFormats.td index d62574d..54f605f 100644 --- a/lib/Target/MBlaze/MBlazeInstrFormats.td +++ b/lib/Target/MBlaze/MBlazeInstrFormats.td @@ -81,7 +81,7 @@ class MBlazeInst<bits<6> op, Format form, dag outs, dag ins, string asmstr, // Pseudo instruction class 
//===----------------------------------------------------------------------===// class MBlazePseudo<dag outs, dag ins, string asmstr, list<dag> pattern>: - MBlazeInst<0x0, FPseudo, outs, ins, asmstr, pattern, IIPseudo>; + MBlazeInst<0x0, FPseudo, outs, ins, asmstr, pattern, IIC_Pseudo>; //===----------------------------------------------------------------------===// // Type A instruction class in MBlaze : <|opcode|rd|ra|rb|flags|> diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.cpp b/lib/Target/MBlaze/MBlazeInstrInfo.cpp index b353dcd..794ebed 100644 --- a/lib/Target/MBlaze/MBlazeInstrInfo.cpp +++ b/lib/Target/MBlaze/MBlazeInstrInfo.cpp @@ -17,6 +17,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/ScoreboardHazardRecognizer.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "MBlazeGenInstrInfo.inc" diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.h b/lib/Target/MBlaze/MBlazeInstrInfo.h index b7300c1..b717da8 100644 --- a/lib/Target/MBlaze/MBlazeInstrInfo.h +++ b/lib/Target/MBlaze/MBlazeInstrInfo.h @@ -261,7 +261,6 @@ public: virtual bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; - virtual void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.td b/lib/Target/MBlaze/MBlazeInstrInfo.td index 7b8f70a..950f2d7 100644 --- a/lib/Target/MBlaze/MBlazeInstrInfo.td +++ b/lib/Target/MBlaze/MBlazeInstrInfo.td @@ -47,22 +47,22 @@ def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_MBCallSeqEnd, //===----------------------------------------------------------------------===// // MBlaze Instruction Predicate Definitions. //===----------------------------------------------------------------------===// -def HasPipe3 : Predicate<"Subtarget.hasPipe3()">; +// def HasPipe3 : Predicate<"Subtarget.hasPipe3()">; def HasBarrel : Predicate<"Subtarget.hasBarrel()">; -def NoBarrel : Predicate<"!Subtarget.hasBarrel()">; +// def NoBarrel : Predicate<"!Subtarget.hasBarrel()">; def HasDiv : Predicate<"Subtarget.hasDiv()">; def HasMul : Predicate<"Subtarget.hasMul()">; -def HasFSL : Predicate<"Subtarget.hasFSL()">; -def HasEFSL : Predicate<"Subtarget.hasEFSL()">; -def HasMSRSet : Predicate<"Subtarget.hasMSRSet()">; -def HasException : Predicate<"Subtarget.hasException()">; +// def HasFSL : Predicate<"Subtarget.hasFSL()">; +// def HasEFSL : Predicate<"Subtarget.hasEFSL()">; +// def HasMSRSet : Predicate<"Subtarget.hasMSRSet()">; +// def HasException : Predicate<"Subtarget.hasException()">; def HasPatCmp : Predicate<"Subtarget.hasPatCmp()">; def HasFPU : Predicate<"Subtarget.hasFPU()">; -def HasESR : Predicate<"Subtarget.hasESR()">; -def HasPVR : Predicate<"Subtarget.hasPVR()">; +// def HasESR : Predicate<"Subtarget.hasESR()">; +// def HasPVR : Predicate<"Subtarget.hasPVR()">; def HasMul64 : Predicate<"Subtarget.hasMul64()">; def HasSqrt : Predicate<"Subtarget.hasSqrt()">; -def HasMMU : Predicate<"Subtarget.hasMMU()">; +// def HasMMU : Predicate<"Subtarget.hasMMU()">; //===----------------------------------------------------------------------===// // MBlaze Operand, Complex Patterns and Transformations Definitions. 
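The renames in the next hunks (IIAlu to IIC_ALU, IILoad to IIC_MEMl, and so on) split the old catch-all itinerary classes into finer-grained ones so the mblaze3/mblaze5 itineraries defined earlier can give them different stage timings. A self-contained model of the mechanism, with made-up cycle counts rather than MBlaze's actual tables:

#include <cstdio>

// Each instruction names an itinerary class; a per-CPU table maps the class
// to pipeline stages, and latency is the sum of the stage cycles.
enum ItinClass { IIC_ALU, IIC_ALUm, IIC_MEMl, IIC_MEMs, IIC_BR, NumItin };

struct Stage     { unsigned Cycles; };
struct Itinerary { Stage Stages[3]; unsigned NumStages; };

// Hypothetical 5-stage-pipe timings, purely illustrative.
static const Itinerary Pipe5[NumItin] = {
  /*IIC_ALU */ {{{1}},      1},
  /*IIC_ALUm*/ {{{1}, {2}}, 2},   // multiply occupies an extra stage
  /*IIC_MEMl*/ {{{1}, {2}}, 2},   // loads carry extra use latency
  /*IIC_MEMs*/ {{{1}},      1},
  /*IIC_BR  */ {{{1}},      1},
};

static unsigned stageLatency(ItinClass C) {
  unsigned Latency = 0;
  for (unsigned i = 0; i != Pipe5[C].NumStages; ++i)
    Latency += Pipe5[C].Stages[i].Cycles;
  return Latency;
}

int main() {
  std::printf("IIC_ALUm latency: %u\n", stageLatency(IIC_ALUm)); // prints 3
}
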
@@ -170,18 +170,18 @@ class ArithI<bits<6> op, string instr_asm, SDNode OpNode, Operand Od, PatLeaf imm_type> : TB<op, (outs GPR:$dst), (ins GPR:$b, Od:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [(set GPR:$dst, (OpNode GPR:$b, imm_type:$c))], IIAlu>; + [(set GPR:$dst, (OpNode GPR:$b, imm_type:$c))], IIC_ALU>; class ArithI32<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> : TB<op, (outs GPR:$dst), (ins GPR:$b, Od:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [], IIAlu>; + [], IIC_ALU>; class ShiftI<bits<6> op, bits<2> flags, string instr_asm, SDNode OpNode, Operand Od, PatLeaf imm_type> : SHT<op, flags, (outs GPR:$dst), (ins GPR:$b, Od:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [(set GPR:$dst, (OpNode GPR:$b, imm_type:$c))], IIAlu>; + [(set GPR:$dst, (OpNode GPR:$b, imm_type:$c))], IIC_SHT>; class ArithR<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode, InstrItinClass itin> : @@ -193,7 +193,7 @@ class ArithRI<bits<6> op, string instr_asm, SDNode OpNode, Operand Od, PatLeaf imm_type> : TBR<op, (outs GPR:$dst), (ins Od:$b, GPR:$c), !strconcat(instr_asm, " $dst, $c, $b"), - [(set GPR:$dst, (OpNode imm_type:$b, GPR:$c))], IIAlu>; + [(set GPR:$dst, (OpNode imm_type:$b, GPR:$c))], IIC_ALU>; class ArithN<bits<6> op, bits<11> flags, string instr_asm, InstrItinClass itin> : @@ -204,7 +204,7 @@ class ArithN<bits<6> op, bits<11> flags, string instr_asm, class ArithNI<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> : TB<op, (outs GPR:$dst), (ins GPR:$b, Od:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [], IIAlu>; + [], IIC_ALU>; class ArithRN<bits<6> op, bits<11> flags, string instr_asm, InstrItinClass itin> : @@ -215,7 +215,7 @@ class ArithRN<bits<6> op, bits<11> flags, string instr_asm, class ArithRNI<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> : TBR<op, (outs GPR:$dst), (ins Od:$c, GPR:$b), !strconcat(instr_asm, " $dst, $b, $c"), - [], IIAlu>; + [], IIC_ALU>; //===----------------------------------------------------------------------===// // Misc Arithmetic Instructions @@ -224,46 +224,51 @@ class ArithRNI<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> : class Logic<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode> : TA<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [(set GPR:$dst, (OpNode GPR:$b, GPR:$c))], IIAlu>; + [(set GPR:$dst, (OpNode GPR:$b, GPR:$c))], IIC_ALU>; class LogicI<bits<6> op, string instr_asm, SDNode OpNode> : TB<op, (outs GPR:$dst), (ins GPR:$b, uimm16:$c), !strconcat(instr_asm, " $dst, $b, $c"), [(set GPR:$dst, (OpNode GPR:$b, immZExt16:$c))], - IIAlu>; + IIC_ALU>; class LogicI32<bits<6> op, string instr_asm> : TB<op, (outs GPR:$dst), (ins GPR:$b, uimm16:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [], IIAlu>; + [], IIC_ALU>; class PatCmp<bits<6> op, bits<11> flags, string instr_asm> : TA<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [], IIAlu>; + [], IIC_ALU>; //===----------------------------------------------------------------------===// // Memory Access Instructions //===----------------------------------------------------------------------===// + +let mayLoad = 1 in { class LoadM<bits<6> op, bits<11> flags, string instr_asm> : TA<op, flags, (outs GPR:$dst), (ins memrr:$addr), !strconcat(instr_asm, " $dst, $addr"), - [], IILoad>; + [], IIC_MEMl>; +} class LoadMI<bits<6> op, string instr_asm, PatFrag OpNode> : TB<op, (outs GPR:$dst), (ins memri:$addr), !strconcat(instr_asm, " $dst, $addr"), - [(set (i32 
GPR:$dst), (OpNode iaddr:$addr))], IILoad>; + [(set (i32 GPR:$dst), (OpNode iaddr:$addr))], IIC_MEMl>; +let mayStore = 1 in { class StoreM<bits<6> op, bits<11> flags, string instr_asm> : TA<op, flags, (outs), (ins GPR:$dst, memrr:$addr), !strconcat(instr_asm, " $dst, $addr"), - [], IIStore>; + [], IIC_MEMs>; +} class StoreMI<bits<6> op, string instr_asm, PatFrag OpNode> : TB<op, (outs), (ins GPR:$dst, memri:$addr), !strconcat(instr_asm, " $dst, $addr"), - [(OpNode (i32 GPR:$dst), iaddr:$addr)], IIStore>; + [(OpNode (i32 GPR:$dst), iaddr:$addr)], IIC_MEMs>; //===----------------------------------------------------------------------===// // Branch Instructions @@ -271,7 +276,7 @@ class StoreMI<bits<6> op, string instr_asm, PatFrag OpNode> : class Branch<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> : TA<op, flags, (outs), (ins GPR:$target), !strconcat(instr_asm, " $target"), - [], IIBranch> { + [], IIC_BR> { let rd = 0x0; let ra = br; let Form = FCCR; @@ -280,7 +285,7 @@ class Branch<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> : class BranchI<bits<6> op, bits<5> br, string instr_asm> : TB<op, (outs), (ins brtarget:$target), !strconcat(instr_asm, " $target"), - [], IIBranch> { + [], IIC_BR> { let rd = 0; let ra = br; let Form = FCCI; @@ -292,7 +297,7 @@ class BranchI<bits<6> op, bits<5> br, string instr_asm> : class BranchL<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> : TA<op, flags, (outs), (ins GPR:$link, GPR:$target, variable_ops), !strconcat(instr_asm, " $link, $target"), - [], IIBranch> { + [], IIC_BRl> { let ra = br; let Form = FRCR; } @@ -300,7 +305,7 @@ class BranchL<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> : class BranchLI<bits<6> op, bits<5> br, string instr_asm> : TB<op, (outs), (ins GPR:$link, calltarget:$target, variable_ops), !strconcat(instr_asm, " $link, $target"), - [], IIBranch> { + [], IIC_BRl> { let ra = br; let Form = FRCI; } @@ -312,7 +317,7 @@ class BranchC<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> : TA<op, flags, (outs), (ins GPR:$a, GPR:$b), !strconcat(instr_asm, " $a, $b"), - [], IIBranch> { + [], IIC_BRc> { let rd = br; let Form = FCRR; } @@ -320,7 +325,7 @@ class BranchC<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> : class BranchCI<bits<6> op, bits<5> br, string instr_asm> : TB<op, (outs), (ins GPR:$a, brtarget:$offset), !strconcat(instr_asm, " $a, $offset"), - [], IIBranch> { + [], IIC_BRc> { let rd = br; let Form = FCRI; } @@ -330,71 +335,74 @@ class BranchCI<bits<6> op, bits<5> br, string instr_asm> : //===----------------------------------------------------------------------===// let isCommutable = 1, isAsCheapAsAMove = 1 in { - def ADDK : Arith<0x04, 0x000, "addk ", add, IIAlu>; + def ADDK : Arith<0x04, 0x000, "addk ", add, IIC_ALU>; def AND : Logic<0x21, 0x000, "and ", and>; def OR : Logic<0x20, 0x000, "or ", or>; def XOR : Logic<0x22, 0x000, "xor ", xor>; - def PCMPBF : PatCmp<0x20, 0x400, "pcmpbf ">; - def PCMPEQ : PatCmp<0x22, 0x400, "pcmpeq ">; - def PCMPNE : PatCmp<0x23, 0x400, "pcmpne ">; + + let Predicates=[HasPatCmp] in { + def PCMPBF : PatCmp<0x20, 0x400, "pcmpbf ">; + def PCMPEQ : PatCmp<0x22, 0x400, "pcmpeq ">; + def PCMPNE : PatCmp<0x23, 0x400, "pcmpne ">; + } let Defs = [CARRY] in { - def ADD : Arith<0x00, 0x000, "add ", addc, IIAlu>; + def ADD : Arith<0x00, 0x000, "add ", addc, IIC_ALU>; let Uses = [CARRY] in { - def ADDC : Arith<0x02, 0x000, "addc ", adde, IIAlu>; + def ADDC : Arith<0x02, 0x000, "addc ", adde, IIC_ALU>; } } let Uses = [CARRY] in { - def ADDKC : 
ArithN<0x06, 0x000, "addkc ", IIAlu>; + def ADDKC : ArithN<0x06, 0x000, "addkc ", IIC_ALU>; } } let isAsCheapAsAMove = 1 in { - def ANDN : ArithN<0x23, 0x000, "andn ", IIAlu>; - def CMP : ArithN<0x05, 0x001, "cmp ", IIAlu>; - def CMPU : ArithN<0x05, 0x003, "cmpu ", IIAlu>; - def RSUBK : ArithR<0x05, 0x000, "rsubk ", sub, IIAlu>; + def ANDN : ArithN<0x23, 0x000, "andn ", IIC_ALU>; + def CMP : ArithN<0x05, 0x001, "cmp ", IIC_ALU>; + def CMPU : ArithN<0x05, 0x003, "cmpu ", IIC_ALU>; + def RSUBK : ArithR<0x05, 0x000, "rsubk ", sub, IIC_ALU>; let Defs = [CARRY] in { - def RSUB : ArithR<0x01, 0x000, "rsub ", subc, IIAlu>; + def RSUB : ArithR<0x01, 0x000, "rsub ", subc, IIC_ALU>; let Uses = [CARRY] in { - def RSUBC : ArithR<0x03, 0x000, "rsubc ", sube, IIAlu>; + def RSUBC : ArithR<0x03, 0x000, "rsubc ", sube, IIC_ALU>; } } let Uses = [CARRY] in { - def RSUBKC : ArithRN<0x07, 0x000, "rsubkc ", IIAlu>; + def RSUBKC : ArithRN<0x07, 0x000, "rsubkc ", IIC_ALU>; } } let isCommutable = 1, Predicates=[HasMul] in { - def MUL : Arith<0x10, 0x000, "mul ", mul, IIAlu>; + def MUL : Arith<0x10, 0x000, "mul ", mul, IIC_ALUm>; } let isCommutable = 1, Predicates=[HasMul,HasMul64] in { - def MULH : Arith<0x10, 0x001, "mulh ", mulhs, IIAlu>; - def MULHU : Arith<0x10, 0x003, "mulhu ", mulhu, IIAlu>; + def MULH : Arith<0x10, 0x001, "mulh ", mulhs, IIC_ALUm>; + def MULHU : Arith<0x10, 0x003, "mulhu ", mulhu, IIC_ALUm>; } let Predicates=[HasMul,HasMul64] in { - def MULHSU : ArithN<0x10, 0x002, "mulhsu ", IIAlu>; + def MULHSU : ArithN<0x10, 0x002, "mulhsu ", IIC_ALUm>; } let Predicates=[HasBarrel] in { - def BSRL : Arith<0x11, 0x000, "bsrl ", srl, IIAlu>; - def BSRA : Arith<0x11, 0x200, "bsra ", sra, IIAlu>; - def BSLL : Arith<0x11, 0x400, "bsll ", shl, IIAlu>; + def BSRL : Arith<0x11, 0x000, "bsrl ", srl, IIC_SHT>; + def BSRA : Arith<0x11, 0x200, "bsra ", sra, IIC_SHT>; + def BSLL : Arith<0x11, 0x400, "bsll ", shl, IIC_SHT>; def BSRLI : ShiftI<0x19, 0x0, "bsrli ", srl, uimm5, immZExt5>; def BSRAI : ShiftI<0x19, 0x1, "bsrai ", sra, uimm5, immZExt5>; def BSLLI : ShiftI<0x19, 0x2, "bslli ", shl, uimm5, immZExt5>; } let Predicates=[HasDiv] in { - def IDIV : ArithR<0x12, 0x000, "idiv ", sdiv, IIAlu>; - def IDIVU : ArithR<0x12, 0x002, "idivu ", udiv, IIAlu>; + def IDIV : ArithR<0x12, 0x000, "idiv ", sdiv, IIC_ALUd>; + def IDIVU : ArithR<0x12, 0x002, "idivu ", udiv, IIC_ALUd>; } //===----------------------------------------------------------------------===// @@ -552,7 +560,7 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, def RTSD : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm), "rtsd $target, $imm", [], - IIBranch>; + IIC_BR>; } let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, @@ -560,7 +568,7 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, def RTID : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm), "rtid $target, $imm", [], - IIBranch>; + IIC_BR>; } let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, @@ -568,7 +576,7 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, def RTBD : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm), "rtbd $target, $imm", [], - IIBranch>; + IIC_BR>; } let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, @@ -576,7 +584,7 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, def RTED : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm), "rted $target, $imm", [], - IIBranch>; + IIC_BR>; } //===----------------------------------------------------------------------===// @@ -584,7 +592,7 @@ let isReturn=1, isTerminator=1, 
hasDelaySlot=1, isBarrier=1, //===----------------------------------------------------------------------===// let neverHasSideEffects = 1 in { - def NOP : MBlazeInst< 0x20, FC, (outs), (ins), "nop ", [], IIAlu>; + def NOP : MBlazeInst< 0x20, FC, (outs), (ins), "nop ", [], IIC_ALU>; } let usesCustomInserter = 1 in { @@ -611,17 +619,17 @@ let usesCustomInserter = 1 in { let rb = 0 in { def SEXT16 : TA<0x24, 0x061, (outs GPR:$dst), (ins GPR:$src), - "sext16 $dst, $src", [], IIAlu>; + "sext16 $dst, $src", [], IIC_ALU>; def SEXT8 : TA<0x24, 0x060, (outs GPR:$dst), (ins GPR:$src), - "sext8 $dst, $src", [], IIAlu>; + "sext8 $dst, $src", [], IIC_ALU>; let Defs = [CARRY] in { def SRL : TA<0x24, 0x041, (outs GPR:$dst), (ins GPR:$src), - "srl $dst, $src", [], IIAlu>; + "srl $dst, $src", [], IIC_ALU>; def SRA : TA<0x24, 0x001, (outs GPR:$dst), (ins GPR:$src), - "sra $dst, $src", [], IIAlu>; + "sra $dst, $src", [], IIC_ALU>; let Uses = [CARRY] in { def SRC : TA<0x24, 0x021, (outs GPR:$dst), (ins GPR:$src), - "src $dst, $src", [], IIAlu>; + "src $dst, $src", [], IIC_ALU>; } } } @@ -637,36 +645,36 @@ let isCodeGenOnly=1 in { //===----------------------------------------------------------------------===// let Form=FRCS in { def MFS : SPC<0x25, 0x2, (outs GPR:$dst), (ins SPR:$src), - "mfs $dst, $src", [], IIAlu>; + "mfs $dst, $src", [], IIC_ALU>; } let Form=FCRCS in { def MTS : SPC<0x25, 0x3, (outs SPR:$dst), (ins GPR:$src), - "mts $dst, $src", [], IIAlu>; + "mts $dst, $src", [], IIC_ALU>; } def MSRSET : MSR<0x25, 0x20, (outs GPR:$dst), (ins uimm15:$set), - "msrset $dst, $set", [], IIAlu>; + "msrset $dst, $set", [], IIC_ALU>; def MSRCLR : MSR<0x25, 0x22, (outs GPR:$dst), (ins uimm15:$clr), - "msrclr $dst, $clr", [], IIAlu>; + "msrclr $dst, $clr", [], IIC_ALU>; let rd=0x0, Form=FCRR in { def WDC : TA<0x24, 0x64, (outs), (ins GPR:$a, GPR:$b), - "wdc $a, $b", [], IIAlu>; + "wdc $a, $b", [], IIC_WDC>; def WDCF : TA<0x24, 0x74, (outs), (ins GPR:$a, GPR:$b), - "wdc.flush $a, $b", [], IIAlu>; + "wdc.flush $a, $b", [], IIC_WDC>; def WDCC : TA<0x24, 0x66, (outs), (ins GPR:$a, GPR:$b), - "wdc.clear $a, $b", [], IIAlu>; + "wdc.clear $a, $b", [], IIC_WDC>; def WIC : TA<0x24, 0x68, (outs), (ins GPR:$a, GPR:$b), - "wic $a, $b", [], IIAlu>; + "wic $a, $b", [], IIC_WDC>; } def BRK : BranchL<0x26, 0x0C, 0x000, "brk ">; def BRKI : BranchLI<0x2E, 0x0C, "brki ">; def IMM : MBlazeInst<0x2C, FCCI, (outs), (ins simm16:$imm), - "imm $imm", [], IIAlu>; + "imm $imm", [], IIC_ALU>; //===----------------------------------------------------------------------===// // Pseudo instructions for atomic operations @@ -848,11 +856,6 @@ def : Pat<(MBWrapper tconstpool:$in), (ORI (i32 R0), tconstpool:$in)>; // Misc instructions def : Pat<(and (i32 GPR:$lh), (not (i32 GPR:$rh))),(ANDN GPR:$lh, GPR:$rh)>; -// Arithmetic with immediates -def : Pat<(add (i32 GPR:$in), imm:$imm),(ADDIK GPR:$in, imm:$imm)>; -def : Pat<(or (i32 GPR:$in), imm:$imm),(ORI GPR:$in, imm:$imm)>; -def : Pat<(xor (i32 GPR:$in), imm:$imm),(XORI GPR:$in, imm:$imm)>; - // Convert any extend loads into zero extend loads def : Pat<(extloadi8 iaddr:$src), (i32 (LBUI iaddr:$src))>; def : Pat<(extloadi16 iaddr:$src), (i32 (LHUI iaddr:$src))>; diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp index fa9140d..517279f 100644 --- a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp +++ b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp @@ -181,6 +181,26 @@ unsigned MBlazeRegisterInfo::getSpecialRegisterFromNumbering(unsigned Reg) { return 0; // Not reached } 
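The register-numbering hunk above, together with the isRegister/isSpecialRegister predicates added just below, lets decoder-style code validate a raw register field before mapping it back to a register enum. A minimal sketch of the intended use, assuming only the MBlazeRegisterInfo declarations visible in this diff; decodeSPRField is a hypothetical helper, not part of the patch:

#include "MBlazeRegisterInfo.h"
using namespace llvm;

// Hypothetical helper: map the raw special-register number encoded in an
// MFS/MTS instruction to the backend's register enum, or return 0 when the
// field does not name a special register the backend models.
static unsigned decodeSPRField(unsigned RawField) {
  if (!MBlazeRegisterInfo::isSpecialRegister(RawField))
    return 0; // not a modeled special register
  return MBlazeRegisterInfo::getSpecialRegisterFromNumbering(RawField);
}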
+bool MBlazeRegisterInfo::isRegister(unsigned Reg) { + return Reg <= 31; +} + +bool MBlazeRegisterInfo::isSpecialRegister(unsigned Reg) { + switch (Reg) { + case 0x0000 : case 0x0001 : case 0x0003 : case 0x0005 : + case 0x0007 : case 0x000B : case 0x000D : case 0x1000 : + case 0x1001 : case 0x1002 : case 0x1003 : case 0x1004 : + case 0x2000 : case 0x2001 : case 0x2002 : case 0x2003 : + case 0x2004 : case 0x2005 : case 0x2006 : case 0x2007 : + case 0x2008 : case 0x2009 : case 0x200A : case 0x200B : + return true; + + default: + return false; + } + return false; // Not reached +} + unsigned MBlazeRegisterInfo::getPICCallReg() { return MBlaze::R20; } @@ -336,5 +356,9 @@ int MBlazeRegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const { return MBlazeGenRegisterInfo::getDwarfRegNumFull(RegNo,0); } +int MBlazeRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const { + return MBlazeGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0); +} + #include "MBlazeGenRegisterInfo.inc" diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.h b/lib/Target/MBlaze/MBlazeRegisterInfo.h index 839536d..3807839 100644 --- a/lib/Target/MBlaze/MBlazeRegisterInfo.h +++ b/lib/Target/MBlaze/MBlazeRegisterInfo.h @@ -45,6 +45,8 @@ struct MBlazeRegisterInfo : public MBlazeGenRegisterInfo { static unsigned getRegisterNumbering(unsigned RegEnum); static unsigned getRegisterFromNumbering(unsigned RegEnum); static unsigned getSpecialRegisterFromNumbering(unsigned RegEnum); + static bool isRegister(unsigned RegEnum); + static bool isSpecialRegister(unsigned RegEnum); /// Get PIC indirect call register static unsigned getPICCallReg(); @@ -73,6 +75,7 @@ struct MBlazeRegisterInfo : public MBlazeGenRegisterInfo { unsigned getEHHandlerRegister() const; int getDwarfRegNum(unsigned RegNum, bool isEH) const; + int getLLVMRegNum(unsigned RegNum, bool isEH) const; }; } // end namespace llvm diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.td b/lib/Target/MBlaze/MBlazeRegisterInfo.td index fbefb22..13c46ba 100644 --- a/lib/Target/MBlaze/MBlazeRegisterInfo.td +++ b/lib/Target/MBlaze/MBlazeRegisterInfo.td @@ -43,7 +43,7 @@ let Namespace = "MBlaze" in { def R1 : MBlazeGPRReg< 1, "r1">, DwarfRegNum<[1]>; def R2 : MBlazeGPRReg< 2, "r2">, DwarfRegNum<[2]>; def R3 : MBlazeGPRReg< 3, "r3">, DwarfRegNum<[3]>; - def R4 : MBlazeGPRReg< 4, "r4">, DwarfRegNum<[5]>; + def R4 : MBlazeGPRReg< 4, "r4">, DwarfRegNum<[4]>; def R5 : MBlazeGPRReg< 5, "r5">, DwarfRegNum<[5]>; def R6 : MBlazeGPRReg< 6, "r6">, DwarfRegNum<[6]>; def R7 : MBlazeGPRReg< 7, "r7">, DwarfRegNum<[7]>; @@ -85,67 +85,33 @@ let Namespace = "MBlaze" in { def RTLBX : MBlazeSPRReg<0x1002, "rtlbx">, DwarfRegNum<[41]>; def RTLBLO : MBlazeSPRReg<0x1003, "rtlblo">, DwarfRegNum<[42]>; def RTLBHI : MBlazeSPRReg<0x1004, "rtlbhi">, DwarfRegNum<[43]>; - def RPVR0 : MBlazeSPRReg<0x2000, "rpvr0">, DwarfRegNum<[44]>; - def RPVR1 : MBlazeSPRReg<0x2001, "rpvr1">, DwarfRegNum<[45]>; - def RPVR2 : MBlazeSPRReg<0x2002, "rpvr2">, DwarfRegNum<[46]>; - def RPVR3 : MBlazeSPRReg<0x2003, "rpvr3">, DwarfRegNum<[47]>; - def RPVR4 : MBlazeSPRReg<0x2004, "rpvr4">, DwarfRegNum<[48]>; - def RPVR5 : MBlazeSPRReg<0x2005, "rpvr5">, DwarfRegNum<[49]>; - def RPVR6 : MBlazeSPRReg<0x2006, "rpvr6">, DwarfRegNum<[50]>; - def RPVR7 : MBlazeSPRReg<0x2007, "rpvr7">, DwarfRegNum<[51]>; - def RPVR8 : MBlazeSPRReg<0x2008, "rpvr8">, DwarfRegNum<[52]>; - def RPVR9 : MBlazeSPRReg<0x2009, "rpvr9">, DwarfRegNum<[53]>; - def RPVR10 : MBlazeSPRReg<0x200A, "rpvr10">, DwarfRegNum<[54]>; - def RPVR11 : MBlazeSPRReg<0x200B, "rpvr11">, 
DwarfRegNum<[55]>; + def RTLBSX : MBlazeSPRReg<0x1004, "rtlbsx">, DwarfRegNum<[44]>; + def RPVR0 : MBlazeSPRReg<0x2000, "rpvr0">, DwarfRegNum<[45]>; + def RPVR1 : MBlazeSPRReg<0x2001, "rpvr1">, DwarfRegNum<[46]>; + def RPVR2 : MBlazeSPRReg<0x2002, "rpvr2">, DwarfRegNum<[47]>; + def RPVR3 : MBlazeSPRReg<0x2003, "rpvr3">, DwarfRegNum<[48]>; + def RPVR4 : MBlazeSPRReg<0x2004, "rpvr4">, DwarfRegNum<[49]>; + def RPVR5 : MBlazeSPRReg<0x2005, "rpvr5">, DwarfRegNum<[50]>; + def RPVR6 : MBlazeSPRReg<0x2006, "rpvr6">, DwarfRegNum<[51]>; + def RPVR7 : MBlazeSPRReg<0x2007, "rpvr7">, DwarfRegNum<[52]>; + def RPVR8 : MBlazeSPRReg<0x2008, "rpvr8">, DwarfRegNum<[53]>; + def RPVR9 : MBlazeSPRReg<0x2009, "rpvr9">, DwarfRegNum<[54]>; + def RPVR10 : MBlazeSPRReg<0x200A, "rpvr10">, DwarfRegNum<[55]>; + def RPVR11 : MBlazeSPRReg<0x200B, "rpvr11">, DwarfRegNum<[56]>; // The carry bit. In the Microblaze this is really bit 29 of the // MSR register but this is the only bit of that register that we // are interested in modeling. - def CARRY : MBlazeSPRReg<0x0000, "rmsr[c]">, DwarfRegNum<[33]>; + def CARRY : MBlazeSPRReg<0x0000, "rmsr[c]">; } //===----------------------------------------------------------------------===// // Register Classes //===----------------------------------------------------------------------===// -def GPR : RegisterClass<"MBlaze", [i32,f32], 32, - [ - // Return Values and Arguments - R3, R4, R5, R6, R7, R8, R9, R10, +def GPR : RegisterClass<"MBlaze", [i32,f32], 32, (sequence "R%u", 0, 31)>; - // Not preserved across procedure calls - R11, R12, - - // Callee save - R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, - - // Reserved - R0, // Always zero - R1, // The stack pointer - R2, // Read-only small data area anchor - R13, // Read-write small data area anchor - R14, // Return address for interrupts - R15, // Return address for sub-routines - R16, // Return address for trap - R17, // Return address for exceptions - R18, // Reserved for assembler - R19 // The frame-pointer - ]> -{ - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GPRClass::iterator - GPRClass::allocation_order_end(const MachineFunction &MF) const { - // The last 10 registers on the list above are reserved - return end()-10; - } - }]; -} - -def SPR : RegisterClass<"MBlaze", [i32], 32, - [ +def SPR : RegisterClass<"MBlaze", [i32], 32, (add // Reserved RPC, RMSR, @@ -171,20 +137,12 @@ def SPR : RegisterClass<"MBlaze", [i32], 32, RPVR9, RPVR10, RPVR11 - ]> + )> { - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - SPRClass::iterator - SPRClass::allocation_order_end(const MachineFunction &MF) const { - // None of the special purpose registers are allocatable. - return end()-24; - } - }]; + // None of the special purpose registers are allocatable. + let isAllocatable = 0; } -def CRC : RegisterClass<"MBlaze", [i32], 32, [CARRY]> { +def CRC : RegisterClass<"MBlaze", [i32], 32, (add CARRY)> { let CopyCost = -1; } diff --git a/lib/Target/MBlaze/MBlazeSchedule.td b/lib/Target/MBlaze/MBlazeSchedule.td index ac4d98c..4662f25 100644 --- a/lib/Target/MBlaze/MBlazeSchedule.td +++ b/lib/Target/MBlaze/MBlazeSchedule.td @@ -8,57 +8,48 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// Functional units across MBlaze chips sets. Based on GCC/MBlaze backend files. 
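Before the scheduling rewrite continues below, note what the MBlazeRegisterInfo.td hunk above buys: the hand-written allocation_order_end() MethodBodies are replaced by the declarative set syntax ((sequence "R%u", 0, 31), (add ...)) and by the isAllocatable = 0 bit on SPR. A rough sketch of the effect on the C++ side, assuming the generic TargetRegisterClass::isAllocatable() accessor that this TableGen flag feeds; the counting helper itself is illustrative only:

#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;

// Illustrative: generic code can now see that a class such as SPR is
// off-limits to the allocator without target-specific order trimming.
static unsigned countAllocatableClasses(const TargetRegisterInfo *TRI) {
  unsigned N = 0;
  for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
         E = TRI->regclass_end(); I != E; ++I)
    if ((*I)->isAllocatable()) // SPR sets isAllocatable = 0 and is skipped
      ++N;
  return N;
}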
+// MBlaze functional units. //===----------------------------------------------------------------------===// -def ALU : FuncUnit; -def IMULDIV : FuncUnit; +def IF : FuncUnit; +def ID : FuncUnit; +def EX : FuncUnit; +def MA : FuncUnit; +def WB : FuncUnit; //===----------------------------------------------------------------------===// // Instruction Itinerary classes used for MBlaze //===----------------------------------------------------------------------===// -def IIAlu : InstrItinClass; -def IILoad : InstrItinClass; -def IIStore : InstrItinClass; -def IIXfer : InstrItinClass; -def IIBranch : InstrItinClass; -def IIHiLo : InstrItinClass; -def IIImul : InstrItinClass; -def IIIdiv : InstrItinClass; -def IIFcvt : InstrItinClass; -def IIFmove : InstrItinClass; -def IIFcmp : InstrItinClass; -def IIFadd : InstrItinClass; -def IIFmulSingle : InstrItinClass; -def IIFmulDouble : InstrItinClass; -def IIFdivSingle : InstrItinClass; -def IIFdivDouble : InstrItinClass; -def IIFsqrtSingle : InstrItinClass; -def IIFsqrtDouble : InstrItinClass; -def IIFrecipFsqrtStep : InstrItinClass; -def IIPseudo : InstrItinClass; +def IIC_ALU : InstrItinClass; +def IIC_ALUm : InstrItinClass; +def IIC_ALUd : InstrItinClass; +def IIC_SHT : InstrItinClass; +def IIC_FSLg : InstrItinClass; +def IIC_FSLp : InstrItinClass; +def IIC_MEMs : InstrItinClass; +def IIC_MEMl : InstrItinClass; +def IIC_FPU : InstrItinClass; +def IIC_FPUd : InstrItinClass; +def IIC_FPUf : InstrItinClass; +def IIC_FPUi : InstrItinClass; +def IIC_FPUs : InstrItinClass; +def IIC_FPUc : InstrItinClass; +def IIC_BR : InstrItinClass; +def IIC_BRc : InstrItinClass; +def IIC_BRl : InstrItinClass; +def IIC_WDC : InstrItinClass; +def IIC_Pseudo : InstrItinClass; //===----------------------------------------------------------------------===// -// MBlaze Generic instruction itineraries. +// MBlaze generic instruction itineraries. //===----------------------------------------------------------------------===// -def MBlazeGenericItineraries : ProcessorItineraries< - [ALU, IMULDIV], [], [ - InstrItinData<IIAlu , [InstrStage<1, [ALU]>]>, - InstrItinData<IILoad , [InstrStage<3, [ALU]>]>, - InstrItinData<IIStore , [InstrStage<1, [ALU]>]>, - InstrItinData<IIXfer , [InstrStage<2, [ALU]>]>, - InstrItinData<IIBranch , [InstrStage<1, [ALU]>]>, - InstrItinData<IIHiLo , [InstrStage<1, [IMULDIV]>]>, - InstrItinData<IIImul , [InstrStage<17, [IMULDIV]>]>, - InstrItinData<IIIdiv , [InstrStage<38, [IMULDIV]>]>, - InstrItinData<IIFcvt , [InstrStage<1, [ALU]>]>, - InstrItinData<IIFmove , [InstrStage<2, [ALU]>]>, - InstrItinData<IIFcmp , [InstrStage<3, [ALU]>]>, - InstrItinData<IIFadd , [InstrStage<4, [ALU]>]>, - InstrItinData<IIFmulSingle , [InstrStage<7, [ALU]>]>, - InstrItinData<IIFmulDouble , [InstrStage<8, [ALU]>]>, - InstrItinData<IIFdivSingle , [InstrStage<23, [ALU]>]>, - InstrItinData<IIFdivDouble , [InstrStage<36, [ALU]>]>, - InstrItinData<IIFsqrtSingle , [InstrStage<54, [ALU]>]>, - InstrItinData<IIFsqrtDouble , [InstrStage<12, [ALU]>]>, - InstrItinData<IIFrecipFsqrtStep , [InstrStage<5, [ALU]>]> -]>; +def MBlazeGenericItineraries : ProcessorItineraries<[], [], []>; + +//===----------------------------------------------------------------------===// +// MBlaze instruction itineraries for three stage pipeline. +//===----------------------------------------------------------------------===// +include "MBlazeSchedule3.td" + +//===----------------------------------------------------------------------===// +// MBlaze instruction itineraries for five stage pipeline. 
+//===----------------------------------------------------------------------===// +include "MBlazeSchedule5.td" diff --git a/lib/Target/MBlaze/MBlazeSchedule3.td b/lib/Target/MBlaze/MBlazeSchedule3.td new file mode 100644 index 0000000..ccbf99d --- /dev/null +++ b/lib/Target/MBlaze/MBlazeSchedule3.td @@ -0,0 +1,236 @@ +//===- MBlazeSchedule3.td - MBlaze Scheduling Definitions --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MBlaze instruction itineraries for the three stage pipeline. +//===----------------------------------------------------------------------===// +def MBlazePipe3Itineraries : ProcessorItineraries< + [IF,ID,EX], [], [ + + // ALU instruction with one destination register and either two register + // source operands or one register source operand and one immediate operand. + // The instruction takes one cycle to execute in each of the stages. The + // two source operands are read during the decode stage and the result is + // ready after the execute stage. + InstrItinData< IIC_ALU, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]>], // one cycle in execute stage + [ 2 // result ready after two cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // ALU multiply instruction with one destination register and either two + // register source operands or one register source operand and one immediate + // operand. The instruction takes one cycle to execute in each of the + // pipeline stages except the execute stage, which takes three cycles. The + // two source operands are read during the decode stage and the result is + // ready after the execute stage. + InstrItinData< IIC_ALUm, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<3,[EX]>], // three cycles in execute stage + [ 4 // result ready after four cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // ALU divide instruction with one destination register and two register + // source operands. The instruction takes one cycle to execute in each of the + // pipeline stages except the execute stage, which takes 34 cycles. The two + // source operands are read during the decode stage and the result is ready + // after the execute stage. + InstrItinData< IIC_ALUd, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<34,[EX]>], // 34 cycles in execute stage + [ 35 // result ready after 35 cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Shift instruction with one destination register and either two register + // source operands or one register source operand and one immediate operand. + // The instruction takes one cycle to execute in each of the pipeline stages + // except the execute stage, which takes two cycles. The two source operands + // are read during the decode stage and the result is ready after the execute + // stage.
+ InstrItinData< IIC_SHT, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<2,[EX]>], // two cycles in execute stage + [ 3 // result ready after three cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Branch instruction with one source operand register. The instruction takes + // one cycle to execute in each of the pipeline stages. The source operand is + // read during the decode stage. + InstrItinData< IIC_BR, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]>], // one cycle in execute stage + [ 1 ]>, // first operand read after one cycle + + // Conditional branch instruction with two source operand registers. The + // instruction takes one cycle to execute in each of the pipeline stages. The + // two source operands are read during the decode stage. + InstrItinData< IIC_BRc, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]>], // one cycle in execute stage + [ 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Branch and link instruction with one destination register and one source + // operand register. The instruction takes one cycle to execute in each of + // the pipeline stages. The source operand is read during the decode stage + // and the destination register is ready after the execute stage. + InstrItinData< IIC_BRl, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]>], // one cycle in execute stage + [ 2 // result ready after two cycles + , 1 ]>, // first operand read after one cycle + + // Cache control instruction with two source operand registers. The + // instruction takes one cycle to execute in each of the pipeline stages + // except the execute stage, which takes two cycles. The source + // operands are read during the decode stage. + InstrItinData< IIC_WDC, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<2,[EX]>], // two cycles in execute stage + [ 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Floating point instruction with one destination register and two source + // operand registers. The instruction takes one cycle to execute in each of + // the pipeline stages except the execute stage, which takes six cycles. The + // source operands are read during the decode stage and the results are ready + // after the execute stage. + InstrItinData< IIC_FPU, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<6,[EX]>], // six cycles in execute stage + [ 7 // result ready after seven cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Floating point divide instruction with one destination register and two + // source operand registers. The instruction takes one cycle to execute in + // each of the pipeline stages except the execute stage, which takes 30 + // cycles. The source operands are read during the decode stage and the + // results are ready after the execute stage.
+ InstrItinData< IIC_FPUd, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<30,[EX]>], // 30 cycles in execute stage + [ 31 // result ready after 31 cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Convert floating point to integer instruction with one destination + // register and one source operand register. The instruction takes one cycle + // to execute in each of the pipeline stages except the execute stage, + // which takes seven cycles. The source operands are read during the decode + // stage and the results are ready after the execute stage. + InstrItinData< IIC_FPUi, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<7,[EX]>], // seven cycles in execute stage + [ 8 // result ready after eight cycles + , 1 ]>, // first operand read after one cycle + + // Convert integer to floating point instruction with one destination + // register and one source operand register. The instruction takes one cycle + // to execute in each of the pipeline stages except the execute stage, + // which takes six cycles. The source operands are read during the decode + // stage and the results are ready after the execute stage. + InstrItinData< IIC_FPUf, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<6,[EX]>], // six cycles in execute stage + [ 7 // result ready after seven cycles + , 1 ]>, // first operand read after one cycle + + // Floating point square root instruction with one destination register and + // one source operand register. The instruction takes one cycle to execute in + // each of the pipeline stages except the execute stage, which takes 29 + // cycles. The source operands are read during the decode stage and the + // results are ready after the execute stage. + InstrItinData< IIC_FPUs, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<29,[EX]>], // 29 cycles in execute stage + [ 30 // result ready after 30 cycles + , 1 ]>, // first operand read after one cycle + + // Floating point comparison instruction with one destination register and + // two source operand registers. The instruction takes one cycle to execute + // in each of the pipeline stages except the execute stage, which takes three + // cycles. The source operands are read during the decode stage and the + // results are ready after the execute stage. + InstrItinData< IIC_FPUc, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<3,[EX]>], // three cycles in execute stage + [ 4 // result ready after four cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // FSL get instruction with one register or immediate source operand and one + // destination register. The instruction takes one cycle to execute in each + // of the pipeline stages except the execute stage, which takes two cycles. + // The one source operand is read during the decode stage and the result is + // ready after the execute stage.
+ InstrItinData< IIC_FSLg, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<2,[EX]>], // two cycles in execute stage + [ 3 // result ready after three cycles + , 1 ]>, // first operand read after one cycle + + // FSL put instruction with either two register source operands or one + // register source operand and one immediate operand. There is no result + // produced by the instruction. The instruction takes one cycle to execute in + // each of the pipeline stages except the execute stage, which takes two + // cycles. The two source operands are read during the decode stage. + InstrItinData< IIC_FSLp, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<2,[EX]>], // two cycles in execute stage + [ 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Memory store instruction with either three register source operands or two + // register source operands and one immediate operand. There is no result + // produced by the instruction. The instruction takes one cycle to execute in + // each of the pipeline stages except the execute stage, which takes two + // cycles. All of the source operands are read during the decode stage. + InstrItinData< IIC_MEMs, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<2,[EX]>], // two cycles in execute stage + [ 1 // first operand read after one cycle + , 1 // second operand read after one cycle + , 1 ]>, // third operand read after one cycle + + // Memory load instruction with one destination register and either two + // register source operands or one register source operand and one immediate + // operand. The instruction takes one cycle to execute in each of the + // pipeline stages except the execute stage, which takes two cycles. All of + // the source operands are read during the decode stage and the result is + // ready after the execute stage. + InstrItinData< IIC_MEMl, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<2,[EX]>], // two cycles in execute stage + [ 3 // result ready after three cycles + , 1 // second operand read after one cycle + , 1 ]> // third operand read after one cycle +]>; diff --git a/lib/Target/MBlaze/MBlazeSchedule5.td b/lib/Target/MBlaze/MBlazeSchedule5.td new file mode 100644 index 0000000..fa88766 --- /dev/null +++ b/lib/Target/MBlaze/MBlazeSchedule5.td @@ -0,0 +1,267 @@ +//===- MBlazeSchedule5.td - MBlaze Scheduling Definitions --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MBlaze instruction itineraries for the five stage pipeline. +//===----------------------------------------------------------------------===// +def MBlazePipe5Itineraries : ProcessorItineraries< + [IF,ID,EX,MA,WB], [], [ + + // ALU instruction with one destination register and either two register + // source operands or one register source operand and one immediate operand. + // The instruction takes one cycle to execute in each of the stages.
The + // two source operands are read during the decode stage and the result is + // ready after the execute stage. + InstrItinData< IIC_ALU, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 2 // result ready after two cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // ALU multiply instruction with one destination register and either two + // register source operands or one register source operand and one immediate + // operand. The instruction takes one cycle to execute in each of the + // pipeline stages. The two source operands are read during the decode stage + // and the result is ready after the execute stage. + InstrItinData< IIC_ALUm, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 2 // result ready after two cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // ALU divide instruction with one destination register and two register + // source operands. The instruction takes one cycle to execute in each of the + // pipeline stages except the memory access stage, which takes 31 cycles. The two + // source operands are read during the decode stage and the result is ready + // after the memory access stage. + InstrItinData< IIC_ALUd, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<31,[MA]> // 31 cycles in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 33 // result ready after 33 cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Shift instruction with one destination register and either two register + // source operands or one register source operand and one immediate operand. + // The instruction takes one cycle to execute in each of the pipeline stages. + // The two source operands are read during the decode stage and the result is + // ready after the memory access stage. + InstrItinData< IIC_SHT, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 3 // result ready after three cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Branch instruction with one source operand register. The instruction takes + // one cycle to execute in each of the pipeline stages. The source operand is + // read during the decode stage. + InstrItinData< IIC_BR, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 1 ]>, // first operand read after one cycle + + // Conditional branch instruction with two source operand registers.
The + // instruction takes one cycle to execute in each of the pipeline stages. The + // two source operands are read during the decode stage. + InstrItinData< IIC_BRc, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Branch and link instruction with one destination register and one source + // operand register. The instruction takes one cycle to execute in each of + // the pipeline stages. The source operand is read during the decode stage + // and the destination register is ready after the writeback stage. + InstrItinData< IIC_BRl, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 4 // result ready after four cycles + , 1 ]>, // first operand read after one cycle + + // Cache control instruction with two source operand registers. The + // instruction takes one cycle to execute in each of the pipeline stages + // except the memory access stage, which takes two cycles. The source + // operands are read during the decode stage. + InstrItinData< IIC_WDC, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<2,[MA]> // two cycles in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Floating point instruction with one destination register and two source + // operand registers. The instruction takes one cycle to execute in each of + // the pipeline stages except the memory access stage, which takes two + // cycles. The source operands are read during the decode stage and the + // results are ready after the writeback stage. + InstrItinData< IIC_FPU, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<2,[MA]> // two cycles in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 5 // result ready after five cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Floating point divide instruction with one destination register and two + // source operand registers. The instruction takes one cycle to execute in + // each of the pipeline stages except the memory access stage, which takes 26 + // cycles. The source operands are read during the decode stage and the + // results are ready after the writeback stage. 
+ InstrItinData< IIC_FPUd, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<26,[MA]> // 26 cycles in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 29 // result ready after 29 cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Convert floating point to integer instruction with one destination + // register and one source operand register. The instruction takes one cycle + // to execute in each of the pipeline stages except the memory access stage, + // which takes three cycles. The source operands are read during the decode + // stage and the results are ready after the writeback stage. + InstrItinData< IIC_FPUi, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<3,[MA]> // three cycles in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 6 // result ready after six cycles + , 1 ]>, // first operand read after one cycle + + // Convert integer to floating point instruction with one destination + // register and one source operand register. The instruction takes one cycle + // to execute in each of the pipeline stages except the memory access stage, + // which takes two cycles. The source operands are read during the decode + // stage and the results are ready after the writeback stage. + InstrItinData< IIC_FPUf, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<2,[MA]> // two cycles in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 5 // result ready after five cycles + , 1 ]>, // first operand read after one cycle + + // Floating point square root instruction with one destination register and + // one source operand register. The instruction takes one cycle to execute in + // each of the pipeline stages except the memory access stage, which takes 25 + // cycles. The source operands are read during the decode stage and the + // results are ready after the writeback stage. + InstrItinData< IIC_FPUs, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<25,[MA]> // 25 cycles in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 28 // result ready after 28 cycles + , 1 ]>, // first operand read after one cycle + + // Floating point comparison instruction with one destination register and + // two source operand registers. The instruction takes one cycle to execute + // in each of the pipeline stages. The source operands are read during the + // decode stage and the results are ready after the execute stage. + InstrItinData< IIC_FPUc, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 2 // result ready after two cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // FSL get instruction with one register or immediate source operand and one + // destination register. 
The instruction takes one cycle to execute in each + // of the pipeline stages. The one source operand is read during the decode + // stage and the result is ready after the execute stage. + InstrItinData< IIC_FSLg, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 2 // result ready after two cycles + , 1 ]>, // first operand read after one cycle + + // FSL put instruction with either two register source operands or one + // register source operand and one immediate operand. There is no result + // produced by the instruction. The instruction takes one cycle to execute in + // each of the pipeline stages. The two source operands are read during the + // decode stage. + InstrItinData< IIC_FSLp, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Memory store instruction with either three register source operands or two + // register source operands and one immediate operand. There is no result + // produced by the instruction. The instruction takes one cycle to execute in + // each of the pipeline stages. All of the source operands are read during + // the decode stage. + InstrItinData< IIC_MEMs, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 1 // first operand read after one cycle + , 1 // second operand read after one cycle + , 1 ]>, // third operand read after one cycle + + // Memory load instruction with one destination register and either two + // register source operands or one register source operand and one immediate + // operand. The instruction takes one cycle to execute in each of the + // pipeline stages. All of the source operands are read during the decode + // stage and the result is ready after the writeback stage. 
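The IIC_MEMl record that the comment above introduces follows right after this sketch. To make the bracketed operand-cycle lists in these records concrete: index 0 is the def (result) cycle and the later indices are the use cycles of the source operands. A minimal latency query, assuming the InstrItineraryData accessors of this era (llvm/Target/TargetInstrItineraries.h); the helper name is made up:

#include "llvm/Target/TargetInstrItineraries.h"
using namespace llvm;

// Illustrative: recover "result ready after N cycles" for a schedule class
// the way the scheduler does. ItinClass is an itinerary class index such as
// the generated value for IIC_MEMl; getOperandCycle returns -1 when the
// class carries no cycle information for that operand.
static int resultReadyCycle(const InstrItineraryData &Itins,
                            unsigned ItinClass) {
  if (Itins.isEmpty())
    return 1; // no itineraries (the default CPU): assume unit latency
  return Itins.getOperandCycle(ItinClass, 0); // operand 0 is the def
}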
+ InstrItinData< IIC_MEMl, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 4 // result ready after four cycles + , 1 // second operand read after one cycle + , 1 ]> // third operand read after one cycle +]>; diff --git a/lib/Target/MBlaze/MBlazeSubtarget.cpp b/lib/Target/MBlaze/MBlazeSubtarget.cpp index 3440521..a80744a 100644 --- a/lib/Target/MBlaze/MBlazeSubtarget.cpp +++ b/lib/Target/MBlaze/MBlazeSubtarget.cpp @@ -13,19 +13,39 @@ #include "MBlazeSubtarget.h" #include "MBlaze.h" +#include "MBlazeRegisterInfo.h" #include "MBlazeGenSubtarget.inc" #include "llvm/Support/CommandLine.h" using namespace llvm; MBlazeSubtarget::MBlazeSubtarget(const std::string &TT, const std::string &FS): - HasPipe3(false), HasBarrel(false), HasDiv(false), HasMul(false), - HasFSL(false), HasEFSL(false), HasMSRSet(false), HasException(false), - HasPatCmp(false), HasFPU(false), HasESR(false), HasPVR(false), - HasMul64(false), HasSqrt(false), HasMMU(false) + HasBarrel(false), HasDiv(false), HasMul(false), HasPatCmp(false), + HasFPU(false), HasMul64(false), HasSqrt(false) { - std::string CPU = "v400"; - MBlazeArchVersion = V400; - // Parse features string. - ParseSubtargetFeatures(FS, CPU); + std::string CPU = "mblaze"; + CPU = ParseSubtargetFeatures(FS, CPU); + + // Only use instruction scheduling if the selected CPU has an instruction + // itinerary (the default CPU is the only one that doesn't). + HasItin = CPU != "mblaze"; + DEBUG(dbgs() << "CPU " << CPU << "(" << HasItin << ")\n"); + + // Compute the issue width of the MBlaze itineraries + computeIssueWidth(); +} + +void MBlazeSubtarget::computeIssueWidth() { + InstrItins.IssueWidth = 1; +} + +bool MBlazeSubtarget:: +enablePostRAScheduler(CodeGenOpt::Level OptLevel, + TargetSubtarget::AntiDepBreakMode& Mode, + RegClassVector& CriticalPathRCs) const { + Mode = TargetSubtarget::ANTIDEP_CRITICAL; + CriticalPathRCs.clear(); + CriticalPathRCs.push_back(&MBlaze::GPRRegClass); + return HasItin && OptLevel >= CodeGenOpt::Default; } + diff --git a/lib/Target/MBlaze/MBlazeSubtarget.h b/lib/Target/MBlaze/MBlazeSubtarget.h index bebb3f7..2255b28 100644 --- a/lib/Target/MBlaze/MBlazeSubtarget.h +++ b/lib/Target/MBlaze/MBlazeSubtarget.h @@ -24,29 +24,14 @@ namespace llvm { class MBlazeSubtarget : public TargetSubtarget { protected: - - enum MBlazeArchEnum { - V400, V500, V600, V700, V710 - }; - - // MBlaze architecture version - MBlazeArchEnum MBlazeArchVersion; - - bool HasPipe3; bool HasBarrel; bool HasDiv; bool HasMul; - bool HasFSL; - bool HasEFSL; - bool HasMSRSet; - bool HasException; bool HasPatCmp; bool HasFPU; - bool HasESR; - bool HasPVR; bool HasMul64; bool HasSqrt; - bool HasMMU; + bool HasItin; InstrItineraryData InstrItins; @@ -61,18 +46,26 @@ public: std::string ParseSubtargetFeatures(const std::string &FS, const std::string &CPU); + /// Compute the maximum number of issues per cycle for the + /// MBlaze scheduling itineraries. + void computeIssueWidth(); + + /// enablePostRAScheduler - True at 'More' optimization. + bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, + TargetSubtarget::AntiDepBreakMode& Mode, + RegClassVector& CriticalPathRCs) const; +
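The MBlazeSubtarget changes above gate post-RA scheduling on both the optimization level and on the selected CPU actually having an itinerary (HasItin); with the default "mblaze" CPU the hook declines. A sketch of how a caller exercises it, using only the signature shown in this diff; the wrapper is illustrative, not part of the patch:

#include "MBlazeSubtarget.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Illustrative: with the default "mblaze" CPU, HasItin is false and this
// returns false at every optimization level.
static bool wantsPostRASched(const MBlazeSubtarget &ST,
                             CodeGenOpt::Level OptLevel) {
  TargetSubtarget::AntiDepBreakMode Mode;
  SmallVector<TargetRegisterClass*, 2> CriticalRCs; // RegClassVector storage
  return ST.enablePostRAScheduler(OptLevel, Mode, CriticalRCs);
}

+ /// getInstrItineraryData - Return the instruction itineraries based on subtarget.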
+ const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } + + bool hasItin() const { return HasItin; } + bool hasPCMP() const { return HasPatCmp; } bool hasFPU() const { return HasFPU; } bool hasSqrt() const { return HasSqrt; } bool hasMul() const { return HasMul; } bool hasMul64() const { return HasMul64; } bool hasDiv() const { return HasDiv; } bool hasBarrel() const { return HasBarrel; } - - bool isV400() const { return MBlazeArchVersion == V400; } - bool isV500() const { return MBlazeArchVersion == V500; } - bool isV600() const { return MBlazeArchVersion == V600; } - bool isV700() const { return MBlazeArchVersion == V700; } - bool isV710() const { return MBlazeArchVersion == V710; } }; } // End llvm namespace diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.cpp b/lib/Target/MBlaze/MBlazeTargetMachine.cpp index cd949e1..df34a83 100644 --- a/lib/Target/MBlaze/MBlazeTargetMachine.cpp +++ b/lib/Target/MBlaze/MBlazeTargetMachine.cpp @@ -36,19 +36,18 @@ static MCStreamer *createMCStreamer(const Target &T, const std::string &TT, bool RelaxAll, bool NoExecStack) { Triple TheTriple(TT); - switch (TheTriple.getOS()) { - case Triple::Darwin: + + if (TheTriple.isOSDarwin()) { llvm_unreachable("MBlaze does not support Darwin MACH-O format"); return NULL; - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: + } + + if (TheTriple.isOSWindows()) { llvm_unreachable("MBlaze does not support Windows COFF format"); return NULL; - default: - return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll, - NoExecStack); } + + return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll, NoExecStack); } @@ -87,7 +86,8 @@ MBlazeTargetMachine(const Target &T, const std::string &TT, DataLayout("E-p:32:32:32-i8:8:8-i16:16:16"), InstrInfo(*this), FrameLowering(Subtarget), - TLInfo(*this), TSInfo(*this), ELFWriterInfo(*this) { + TLInfo(*this), TSInfo(*this), ELFWriterInfo(*this), + InstrItins(Subtarget.getInstrItineraryData()) { if (getRelocationModel() == Reloc::Default) { setRelocationModel(Reloc::Static); } diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.h b/lib/Target/MBlaze/MBlazeTargetMachine.h index 45ad078..48ce37a 100644 --- a/lib/Target/MBlaze/MBlazeTargetMachine.h +++ b/lib/Target/MBlaze/MBlazeTargetMachine.h @@ -38,13 +38,18 @@ namespace llvm { MBlazeSelectionDAGInfo TSInfo; MBlazeIntrinsicInfo IntrinsicInfo; MBlazeELFWriterInfo ELFWriterInfo; + InstrItineraryData InstrItins; + public: MBlazeTargetMachine(const Target &T, const std::string &TT, - const std::string &FS); + const std::string &FS); virtual const MBlazeInstrInfo *getInstrInfo() const { return &InstrInfo; } + virtual const InstrItineraryData *getInstrItineraryData() const + { return &InstrItins; } + virtual const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; } diff --git a/lib/Target/MBlaze/TODO b/lib/Target/MBlaze/TODO index 2e613eb..317d7c0 100644 --- a/lib/Target/MBlaze/TODO +++ b/lib/Target/MBlaze/TODO @@ -9,8 +9,6 @@ needs to be examined more closely: - The stack layout needs to be examined to make sure it meets the standard, especially in regards to var arg functions. - - The processor itineraries are copied from a different backend - and need to be updated to model the MicroBlaze correctly. - Look at the MBlazeGenFastISel.inc stuff and make use of it if appropriate. @@ -18,9 +16,6 @@ There are a few things that need to be looked at: - There are some instructions that are not generated by the backend and have not been tested as far as the parser is concerned. 
- - The assembly parser does not use any MicroBlaze specific directives. + - The assembly parser does not use many MicroBlaze specific directives. I should investigate if there are MicroBlaze specific directives and, if there are, add them. - - The instruction MFS and MTS use special names for some of the - special registers that can be accessed. These special register - names should be parsed by the assembly parser. diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index a95d59c..0a3eab1 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -170,6 +170,9 @@ MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) : setLibcallName(RTLIB::MUL_I8, "__mulqi3hw_noint"); setLibcallName(RTLIB::MUL_I16, "__mulhi3hw_noint"); } + + setMinFunctionAlignment(1); + setPrefFunctionAlignment(2); } SDValue MSP430TargetLowering::LowerOperation(SDValue Op, @@ -193,11 +196,6 @@ SDValue MSP430TargetLowering::LowerOperation(SDValue Op, } } -/// getFunctionAlignment - Return the Log2 alignment of this function. -unsigned MSP430TargetLowering::getFunctionAlignment(const Function *F) const { - return F->hasFnAttr(Attribute::OptimizeForSize) ? 1 : 2; -} - //===----------------------------------------------------------------------===// // MSP430 Inline Assembly Support //===----------------------------------------------------------------------===// @@ -314,8 +312,8 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain, // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, CC_MSP430); assert(!isVarArg && "Varargs not supported yet"); @@ -397,8 +395,8 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, } // CCState - Info about the registers and stack slot. - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); // Analyze return values. CCInfo.AnalyzeReturn(Outs, RetCC_MSP430); @@ -451,8 +449,8 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, SmallVectorImpl<SDValue> &InVals) const { // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_MSP430); @@ -515,7 +513,7 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, // Build a sequence of copy-to-reg nodes chained together with token chain and // flag operands which copy the outgoing args into registers. The InFlag is - // necessary since all emited instructions must be stuck together. + // necessary since all emitted instructions must be stuck together. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, @@ -574,8 +572,8 @@ MSP430TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, // Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_MSP430); diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h index 19c9eac..bd660a0 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.h +++ b/lib/Target/MSP430/MSP430ISelLowering.h @@ -82,9 +82,6 @@ namespace llvm { /// DAG node. virtual const char *getTargetNodeName(unsigned Opcode) const; - /// getFunctionAlignment - Return the Log2 alignment of this function. - virtual unsigned getFunctionAlignment(const Function *F) const; - SDValue LowerShifts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp index 1da6d8d..53f4c2e 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -76,7 +76,11 @@ BitVector MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - // Mark 4 special registers as reserved. + // Mark 4 special registers with subregisters as reserved. + Reserved.set(MSP430::PCB); + Reserved.set(MSP430::SPB); + Reserved.set(MSP430::SRB); + Reserved.set(MSP430::CGB); Reserved.set(MSP430::PCW); Reserved.set(MSP430::SPW); Reserved.set(MSP430::SRW); @@ -242,4 +246,9 @@ int MSP430RegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const { return 0; } +int MSP430RegisterInfo::getLLVMRegNum(unsigned RegNum, bool isEH) const { + llvm_unreachable("Not implemented yet!"); + return 0; +} + #include "MSP430GenRegisterInfo.inc" diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h index 56744fa..e820558 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.h +++ b/lib/Target/MSP430/MSP430RegisterInfo.h @@ -39,6 +39,13 @@ public: BitVector getReservedRegs(const MachineFunction &MF) const; const TargetRegisterClass* getPointerRegClass(unsigned Kind = 0) const; + const TargetRegisterClass * + getMatchingSuperRegClass(const TargetRegisterClass *A, + const TargetRegisterClass *B, unsigned Idx) const { + // No sub-classes makes this really easy. + return A; + } + void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; @@ -54,6 +61,7 @@ public: //! 
Get DWARF debugging register number int getDwarfRegNum(unsigned RegNum, bool isEH) const; + int getLLVMRegNum(unsigned RegNum, bool isEH) const; }; } // end namespace llvm diff --git a/lib/Target/MSP430/MSP430RegisterInfo.td b/lib/Target/MSP430/MSP430RegisterInfo.td index ab7b59b..d1c2e3f 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.td +++ b/lib/Target/MSP430/MSP430RegisterInfo.td @@ -66,54 +66,20 @@ def R15W : MSP430RegWithSubregs<15, "r15", [R15B]>; def GR8 : RegisterClass<"MSP430", [i8], 8, // Volatile registers - [R12B, R13B, R14B, R15B, R11B, R10B, R9B, R8B, R7B, R6B, R5B, + (add R12B, R13B, R14B, R15B, R11B, R10B, R9B, R8B, R7B, R6B, R5B, // Frame pointer, sometimes allocable FPB, // Volatile, but not allocable - PCB, SPB, SRB, CGB]> -{ - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GR8Class::iterator - GR8Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - // Depending on whether the function uses frame pointer or not, last 5 or 4 - // registers on the list above are reserved - if (TFI->hasFP(MF)) - return end()-5; - else - return end()-4; - } - }]; -} + PCB, SPB, SRB, CGB)>; def GR16 : RegisterClass<"MSP430", [i16], 16, // Volatile registers - [R12W, R13W, R14W, R15W, R11W, R10W, R9W, R8W, R7W, R6W, R5W, + (add R12W, R13W, R14W, R15W, R11W, R10W, R9W, R8W, R7W, R6W, R5W, // Frame pointer, sometimes allocable FPW, // Volatile, but not allocable - PCW, SPW, SRW, CGW]> + PCW, SPW, SRW, CGW)> { let SubRegClasses = [(GR8 subreg_8bit)]; - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GR16Class::iterator - GR16Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - // Depending on whether the function uses frame pointer or not, last 5 or 4 - // registers on the list above are reserved - if (TFI->hasFP(MF)) - return end()-5; - else - return end()-4; - } - }]; } diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt index 26df1a0..fd16516 100644 --- a/lib/Target/Mips/CMakeLists.txt +++ b/lib/Target/Mips/CMakeLists.txt @@ -13,6 +13,8 @@ tablegen(MipsGenSubtarget.inc -gen-subtarget) add_llvm_target(MipsCodeGen MipsAsmPrinter.cpp MipsDelaySlotFiller.cpp + MipsEmitGPRestore.cpp + MipsExpandPseudo.cpp MipsInstrInfo.cpp MipsISelDAGToDAG.cpp MipsISelLowering.cpp diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h index fb3c492..76a26a9 100644 --- a/lib/Target/Mips/Mips.h +++ b/lib/Target/Mips/Mips.h @@ -25,6 +25,8 @@ namespace llvm { FunctionPass *createMipsISelDag(MipsTargetMachine &TM); FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM); + FunctionPass *createMipsExpandPseudoPass(MipsTargetMachine &TM); + FunctionPass *createMipsEmitGPRestorePass(MipsTargetMachine &TM); extern Target TheMipsTarget; extern Target TheMipselTarget; diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td index 5102c69..b79016d 100644 --- a/lib/Target/Mips/Mips.td +++ b/lib/Target/Mips/Mips.td @@ -81,7 +81,7 @@ def : Proc<"r6000", [FeatureMips2]>; def : Proc<"4ke", [FeatureMips32r2]>; -// Allegrex is a 32bit subset of r4000, both for interger and fp registers, +// Allegrex is a 32bit subset of r4000, both for integer and fp registers, // but much more similar to Mips2 than Mips3. 
It also contains some of // Mips32/Mips32r2 instructions and a custom vector fpu processor. def : Proc<"allegrex", [FeatureMips2, FeatureSingleFloat, FeatureEABI, diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index 718110f..8caa7cd 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -77,7 +77,8 @@ namespace { } virtual void EmitFunctionBodyStart(); virtual void EmitFunctionBodyEnd(); - virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const; + virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock* + MBB) const; static const char *getRegisterName(unsigned RegNo); virtual void EmitFunctionEntryLabel(); @@ -125,44 +126,60 @@ namespace { // Create a bitmask with all callee saved registers for CPU or Floating Point // registers. For CPU registers consider RA, GP and FP for saving if necessary. void MipsAsmPrinter::printSavedRegsBitmask(raw_ostream &O) { - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const TargetRegisterInfo *RI = TM.getRegisterInfo(); - const MipsFunctionInfo *MipsFI = MF->getInfo<MipsFunctionInfo>(); - // CPU and FPU Saved Registers Bitmasks - unsigned int CPUBitmask = 0; - unsigned int FPUBitmask = 0; + unsigned CPUBitmask = 0, FPUBitmask = 0; + int CPUTopSavedRegOff, FPUTopSavedRegOff; // Set the CPU and FPU Bitmasks const MachineFrameInfo *MFI = MF->getFrameInfo(); const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + // size of stack area to which FP callee-saved regs are saved. + unsigned CPURegSize = Mips::CPURegsRegisterClass->getSize(); + unsigned FGR32RegSize = Mips::FGR32RegisterClass->getSize(); + unsigned AFGR64RegSize = Mips::AFGR64RegisterClass->getSize(); + bool HasAFGR64Reg = false; + unsigned CSFPRegsSize = 0; + unsigned i, e = CSI.size(); + + // Set FPU Bitmask. + for (i = 0; i != e; ++i) { unsigned Reg = CSI[i].getReg(); - unsigned RegNum = MipsRegisterInfo::getRegisterNumbering(Reg); if (Mips::CPURegsRegisterClass->contains(Reg)) - CPUBitmask |= (1 << RegNum); - else - FPUBitmask |= (1 << RegNum); + break; + + unsigned RegNum = MipsRegisterInfo::getRegisterNumbering(Reg); + if (Mips::AFGR64RegisterClass->contains(Reg)) { + FPUBitmask |= (3 << RegNum); + CSFPRegsSize += AFGR64RegSize; + HasAFGR64Reg = true; + continue; + } + + FPUBitmask |= (1 << RegNum); + CSFPRegsSize += FGR32RegSize; + } + + // Set CPU Bitmask. + for (; i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + unsigned RegNum = MipsRegisterInfo::getRegisterNumbering(Reg); + CPUBitmask |= (1 << RegNum); } - // Return Address and Frame registers must also be set in CPUBitmask. - // FIXME: Do we really need hasFP() call here? When no FP is present SP is - // just returned -- will it be ok? - if (TFI->hasFP(*MF)) - CPUBitmask |= (1 << MipsRegisterInfo:: - getRegisterNumbering(RI->getFrameRegister(*MF))); + // FP Regs are saved right below where the virtual frame pointer points to. + FPUTopSavedRegOff = FPUBitmask ? + (HasAFGR64Reg ? -AFGR64RegSize : -FGR32RegSize) : 0; - if (MFI->adjustsStack()) - CPUBitmask |= (1 << MipsRegisterInfo:: - getRegisterNumbering(RI->getRARegister())); + // CPU Regs are saved below FP Regs. + CPUTopSavedRegOff = CPUBitmask ? 
-CSFPRegsSize - CPURegSize : 0; // Print CPUBitmask O << "\t.mask \t"; printHex32(CPUBitmask, O); - O << ',' << MipsFI->getCPUTopSavedRegOff() << '\n'; + O << ',' << CPUTopSavedRegOff << '\n'; // Print FPUBitmask - O << "\t.fmask\t"; printHex32(FPUBitmask, O); O << "," - << MipsFI->getFPUTopSavedRegOff() << '\n'; + O << "\t.fmask\t"; printHex32(FPUBitmask, O); + O << "," << FPUTopSavedRegOff << '\n'; } // Print a 32 bit hex number with all numbers. @@ -236,8 +253,8 @@ void MipsAsmPrinter::EmitFunctionBodyEnd() { /// isBlockOnlyReachableByFallthough - Return true if the basic block has /// exactly one predecessor and the control transfer mechanism between /// the predecessor and this block is a fall-through. -bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) - const { +bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock* + MBB) const { // The predecessor has to be immediately before this block. const MachineBasicBlock *Pred = *MBB->pred_begin(); @@ -301,6 +318,10 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum, case MipsII::MO_GOT: O << "%got("; break; case MipsII::MO_ABS_HI: O << "%hi("; break; case MipsII::MO_ABS_LO: O << "%lo("; break; + case MipsII::MO_TLSGD: O << "%tlsgd("; break; + case MipsII::MO_GOTTPREL: O << "%gottprel("; break; + case MipsII::MO_TPREL_HI: O << "%tprel_hi("; break; + case MipsII::MO_TPREL_LO: O << "%tprel_lo("; break; } switch (MO.getType()) { @@ -309,7 +330,7 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum, break; case MachineOperand::MO_Immediate: - O << (short int)MO.getImm(); + O << MO.getImm(); break; case MachineOperand::MO_MachineBasicBlock: diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td index 8e4b216..57aeb1d 100644 --- a/lib/Target/Mips/MipsCallingConv.td +++ b/lib/Target/Mips/MipsCallingConv.td @@ -48,7 +48,7 @@ def CC_MipsEABI : CallingConv<[ CCIfType<[f32], CCIfSubtarget<"isNotSingleFloat()", CCAssignToReg<[F12, F14, F16, F18]>>>, - // The first 4 doubl fp arguments are passed in single fp registers. + // The first 4 double fp arguments are passed in single fp registers. CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", CCAssignToReg<[D6, D7, D8, D9]>>>, diff --git a/lib/Target/Mips/MipsEmitGPRestore.cpp b/lib/Target/Mips/MipsEmitGPRestore.cpp new file mode 100644 index 0000000..f49d490 --- /dev/null +++ b/lib/Target/Mips/MipsEmitGPRestore.cpp @@ -0,0 +1,94 @@ +//===-- MipsEmitGPRestore.cpp - Emit GP restore instruction----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass emits instructions that restore $gp right +// after jalr instructions. 
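For context on the new pass above: in PIC code a call through jalr can clobber $gp, so the pass reloads it from its stack save slot after every jalr (and after the EH_LABEL at landing-pad entry). A toy, self-contained model of that traversal — an opcode vector standing in for a MachineBasicBlock; emitGPRestores and the Op enum are illustrative, not the LLVM API:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // LW_GP models "lw $gp, <gp save slot>($sp)".
    enum Op { ADD, JALR, LW_GP, RET };

    static void emitGPRestores(std::vector<Op> &MBB) {
      for (std::size_t I = 0; I < MBB.size(); ++I)
        if (MBB[I] == JALR)
          MBB.insert(MBB.begin() + ++I, LW_GP); // restore $gp right after the call
    }

    int main() {
      std::vector<Op> MBB = {ADD, JALR, ADD, JALR, RET};
      emitGPRestores(MBB);
      for (Op O : MBB)
        std::printf("%d ", (int)O); // prints: 0 1 2 0 1 2 3
      return 0;
    }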
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "emit-gp-restore" + +#include "Mips.h" +#include "MipsTargetMachine.h" +#include "MipsMachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/ADT/Statistic.h" + +using namespace llvm; + +namespace { + struct Inserter : public MachineFunctionPass { + + TargetMachine &TM; + const TargetInstrInfo *TII; + + static char ID; + Inserter(TargetMachine &tm) + : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { } + + virtual const char *getPassName() const { + return "Mips Emit GP Restore"; + } + + bool runOnMachineFunction(MachineFunction &F); + }; + char Inserter::ID = 0; +} // end of anonymous namespace + +bool Inserter::runOnMachineFunction(MachineFunction &F) { + if (TM.getRelocationModel() != Reloc::PIC_) + return false; + + bool Changed = false; + int FI = F.getInfo<MipsFunctionInfo>()->getGPFI(); + + for (MachineFunction::iterator MFI = F.begin(), MFE = F.end(); + MFI != MFE; ++MFI) { + MachineBasicBlock& MBB = *MFI; + MachineBasicBlock::iterator I = MFI->begin(); + + // If MBB is a landing pad, insert instruction that restores $gp after + // EH_LABEL. + if (MBB.isLandingPad()) { + // Find EH_LABEL first. + for (; I->getOpcode() != TargetOpcode::EH_LABEL; ++I) ; + + // Insert lw. + ++I; + DebugLoc dl = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); + BuildMI(MBB, I, dl, TII->get(Mips::LW), Mips::GP).addImm(0) + .addFrameIndex(FI); + Changed = true; + } + + while (I != MFI->end()) { + if (I->getOpcode() != Mips::JALR) { + ++I; + continue; + } + + DebugLoc dl = I->getDebugLoc(); + // emit lw $gp, ($gp save slot on stack) after jalr + BuildMI(MBB, ++I, dl, TII->get(Mips::LW), Mips::GP).addImm(0) + .addFrameIndex(FI); + Changed = true; + } + } + + return Changed; +} + +/// createMipsEmitGPRestorePass - Returns a pass that emits instructions that +/// restores $gp clobbered by jalr instructions. +FunctionPass *llvm::createMipsEmitGPRestorePass(MipsTargetMachine &tm) { + return new Inserter(tm); +} + diff --git a/lib/Target/Mips/MipsExpandPseudo.cpp b/lib/Target/Mips/MipsExpandPseudo.cpp new file mode 100644 index 0000000..4423f51 --- /dev/null +++ b/lib/Target/Mips/MipsExpandPseudo.cpp @@ -0,0 +1,117 @@ +//===-- MipsExpandPseudo.cpp - Expand pseudo instructions ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass expands pseudo instructions into target instructions after register +// allocation but before post-RA scheduling. 
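The expansion that follows lowers BuildPairF64/ExtractElementF64 into mtc1/mfc1 on the two 32-bit halves of an even/odd FPR pair. A standalone sketch of the halving arithmetic the expansion relies on — buildPair/extractElement are illustrative names modeling the value flow, not the emitted instructions:

    #include <cstdint>
    #include <cstdio>

    // Model of BuildPairF64: a 64-bit value materialized from two 32-bit
    // halves, as the two mtc1 instructions do for $fN (lo) and $fN+1 (hi).
    static uint64_t buildPair(uint32_t Lo, uint32_t Hi) {
      return ((uint64_t)Hi << 32) | Lo;
    }

    // Model of ExtractElementF64: pick half N (0 = lo, 1 = hi), as mfc1 does.
    static uint32_t extractElement(uint64_t V, unsigned N) {
      return (uint32_t)(V >> (32 * N));
    }

    int main() {
      uint64_t D = buildPair(0xdeadbeef, 0x01234567);
      std::printf("%08x %08x\n", extractElement(D, 0), extractElement(D, 1));
      // prints: deadbeef 01234567
      return 0;
    }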
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mips-expand-pseudo" + +#include "Mips.h" +#include "MipsTargetMachine.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/ADT/Statistic.h" + +using namespace llvm; + +namespace { + struct MipsExpandPseudo : public MachineFunctionPass { + + TargetMachine &TM; + const TargetInstrInfo *TII; + + static char ID; + MipsExpandPseudo(TargetMachine &tm) + : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { } + + virtual const char *getPassName() const { + return "Mips PseudoInstrs Expansion"; + } + + bool runOnMachineFunction(MachineFunction &F); + bool runOnMachineBasicBlock(MachineBasicBlock &MBB); + + private: + void ExpandBuildPairF64(MachineBasicBlock&, MachineBasicBlock::iterator); + void ExpandExtractElementF64(MachineBasicBlock&, + MachineBasicBlock::iterator); + }; + char MipsExpandPseudo::ID = 0; +} // end of anonymous namespace + +bool MipsExpandPseudo::runOnMachineFunction(MachineFunction& F) { + bool Changed = false; + + for (MachineFunction::iterator I = F.begin(); I != F.end(); ++I) + Changed |= runOnMachineBasicBlock(*I); + + return Changed; +} + +bool MipsExpandPseudo::runOnMachineBasicBlock(MachineBasicBlock& MBB) { + + bool Changed = false; + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end();) { + const TargetInstrDesc& Tid = I->getDesc(); + + switch(Tid.getOpcode()) { + default: + ++I; + continue; + case Mips::BuildPairF64: + ExpandBuildPairF64(MBB, I); + break; + case Mips::ExtractElementF64: + ExpandExtractElementF64(MBB, I); + break; + } + + // delete original instr + MBB.erase(I++); + Changed = true; + } + + return Changed; +} + +void MipsExpandPseudo::ExpandBuildPairF64(MachineBasicBlock& MBB, + MachineBasicBlock::iterator I) { + unsigned DstReg = I->getOperand(0).getReg(); + unsigned LoReg = I->getOperand(1).getReg(), HiReg = I->getOperand(2).getReg(); + const TargetInstrDesc& Mtc1Tdd = TII->get(Mips::MTC1); + DebugLoc dl = I->getDebugLoc(); + const unsigned* SubReg = + TM.getRegisterInfo()->getSubRegisters(DstReg); + + // mtc1 Lo, $fp + // mtc1 Hi, $fp + 1 + BuildMI(MBB, I, dl, Mtc1Tdd, *SubReg).addReg(LoReg); + BuildMI(MBB, I, dl, Mtc1Tdd, *(SubReg + 1)).addReg(HiReg); +} + +void MipsExpandPseudo::ExpandExtractElementF64(MachineBasicBlock& MBB, + MachineBasicBlock::iterator I) { + unsigned DstReg = I->getOperand(0).getReg(); + unsigned SrcReg = I->getOperand(1).getReg(); + unsigned N = I->getOperand(2).getImm(); + const TargetInstrDesc& Mfc1Tdd = TII->get(Mips::MFC1); + DebugLoc dl = I->getDebugLoc(); + const unsigned* SubReg = TM.getRegisterInfo()->getSubRegisters(SrcReg); + + BuildMI(MBB, I, dl, Mfc1Tdd, DstReg).addReg(*(SubReg + N)); +} + +/// createMipsMipsExpandPseudoPass - Returns a pass that expands pseudo +/// instrs into real instrs +FunctionPass *llvm::createMipsExpandPseudoPass(MipsTargetMachine &tm) { + return new MipsExpandPseudo(tm); +} diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp index 5e4a7da..a0f90a0 100644 --- a/lib/Target/Mips/MipsFrameLowering.cpp +++ b/lib/Target/Mips/MipsFrameLowering.cpp @@ -84,128 +84,20 @@ using namespace llvm; // if frame pointer elimination is disabled. 
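The hasFP change just below adds MFI->isFrameAddressTaken() to the conditions that force a frame pointer. A one-line illustration of code that now keeps $fp live (compilable with clang/gcc; the builtin is real, the function name is illustrative):

    #include <cstdio>

    // Taking the frame address makes hasFP() return true, so the prologue
    // sets up $fp instead of eliminating it.
    void *current_frame(void) { return __builtin_frame_address(0); }

    int main() {
      std::printf("%p\n", __builtin_frame_address(0));
      return 0;
    }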
bool MipsFrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); - return DisableFramePointerElim(MF) || MFI->hasVarSizedObjects(); + return DisableFramePointerElim(MF) || MFI->hasVarSizedObjects() + || MFI->isFrameAddressTaken(); } -void MipsFrameLowering::adjustMipsStackFrame(MachineFunction &MF) const { - MachineFrameInfo *MFI = MF.getFrameInfo(); - MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); - const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); - unsigned StackAlign = getStackAlignment(); - unsigned RegSize = STI.isGP32bit() ? 4 : 8; - bool HasGP = MipsFI->needGPSaveRestore(); - - // Min and Max CSI FrameIndex. - int MinCSFI = -1, MaxCSFI = -1; - - // See the description at MipsMachineFunction.h - int TopCPUSavedRegOff = -1, TopFPUSavedRegOff = -1; - - // Replace the dummy '0' SPOffset by the negative offsets, as explained on - // LowerFormalArguments. Leaving '0' for while is necessary to avoid the - // approach done by calculateFrameObjectOffsets to the stack frame. - MipsFI->adjustLoadArgsFI(MFI); - MipsFI->adjustStoreVarArgsFI(MFI); - - // It happens that the default stack frame allocation order does not directly - // map to the convention used for mips. So we must fix it. We move the callee - // save register slots after the local variables area, as described in the - // stack frame above. - unsigned CalleeSavedAreaSize = 0; - if (!CSI.empty()) { - MinCSFI = CSI[0].getFrameIdx(); - MaxCSFI = CSI[CSI.size()-1].getFrameIdx(); - } - for (unsigned i = 0, e = CSI.size(); i != e; ++i) - CalleeSavedAreaSize += MFI->getObjectAlignment(CSI[i].getFrameIdx()); - - unsigned StackOffset = HasGP ? (MipsFI->getGPStackOffset()+RegSize) - : (STI.isABI_O32() ? 16 : 0); - - // Adjust local variables. They should come on the stack right - // after the arguments. - int LastOffsetFI = -1; - for (int i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) { - if (i >= MinCSFI && i <= MaxCSFI) - continue; - if (MFI->isDeadObjectIndex(i)) - continue; - unsigned Offset = - StackOffset + MFI->getObjectOffset(i) - CalleeSavedAreaSize; - if (LastOffsetFI == -1) - LastOffsetFI = i; - if (Offset > MFI->getObjectOffset(LastOffsetFI)) - LastOffsetFI = i; - MFI->setObjectOffset(i, Offset); - } - - // Adjust CPU Callee Saved Registers Area. Registers RA and FP must - // be saved in this CPU Area. This whole area must be aligned to the - // default Stack Alignment requirements. - if (LastOffsetFI >= 0) - StackOffset = MFI->getObjectOffset(LastOffsetFI)+ - MFI->getObjectSize(LastOffsetFI); - StackOffset = ((StackOffset+StackAlign-1)/StackAlign*StackAlign); - - for (unsigned i = 0, e = CSI.size(); i != e ; ++i) { - unsigned Reg = CSI[i].getReg(); - if (!Mips::CPURegsRegisterClass->contains(Reg)) - break; - MFI->setObjectOffset(CSI[i].getFrameIdx(), StackOffset); - TopCPUSavedRegOff = StackOffset; - StackOffset += MFI->getObjectAlignment(CSI[i].getFrameIdx()); - } - - // Stack locations for FP and RA. If only one of them is used, - // the space must be allocated for both, otherwise no space at all. 
- if (hasFP(MF) || MFI->adjustsStack()) { - // FP stack location - MFI->setObjectOffset(MFI->CreateStackObject(RegSize, RegSize, true), - StackOffset); - MipsFI->setFPStackOffset(StackOffset); - TopCPUSavedRegOff = StackOffset; - StackOffset += RegSize; - - // SP stack location - MFI->setObjectOffset(MFI->CreateStackObject(RegSize, RegSize, true), - StackOffset); - MipsFI->setRAStackOffset(StackOffset); - StackOffset += RegSize; - - if (MFI->adjustsStack()) - TopCPUSavedRegOff += RegSize; - } - - StackOffset = ((StackOffset+StackAlign-1)/StackAlign*StackAlign); - - // Adjust FPU Callee Saved Registers Area. This Area must be - // aligned to the default Stack Alignment requirements. - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned Reg = CSI[i].getReg(); - if (Mips::CPURegsRegisterClass->contains(Reg)) - continue; - MFI->setObjectOffset(CSI[i].getFrameIdx(), StackOffset); - TopFPUSavedRegOff = StackOffset; - StackOffset += MFI->getObjectAlignment(CSI[i].getFrameIdx()); - } - StackOffset = ((StackOffset+StackAlign-1)/StackAlign*StackAlign); - - // Update frame info - MFI->setStackSize(StackOffset); - - // Recalculate the final tops offset. The final values must be '0' - // if there isn't a callee saved register for CPU or FPU, otherwise - // a negative offset is needed. - if (TopCPUSavedRegOff >= 0) - MipsFI->setCPUTopSavedRegOff(TopCPUSavedRegOff-StackOffset); - - if (TopFPUSavedRegOff >= 0) - MipsFI->setFPUTopSavedRegOff(TopFPUSavedRegOff-StackOffset); +bool MipsFrameLowering::targetHandlesStackFrameRounding() const { + return true; } +static unsigned AlignOffset(unsigned Offset, unsigned Align) { + return (Offset + Align - 1) / Align * Align; +} -// expand pair of register and immediate if the immediate doesn't fit in the 16-bit -// offset field. +// expand pair of register and immediate if the immediate doesn't fit in the +// 16-bit offset field. // e.g. // if OrigImm = 0x10000, OrigReg = $sp: // generate the following sequence of instrs: @@ -216,7 +108,8 @@ void MipsFrameLowering::adjustMipsStackFrame(MachineFunction &MF) const { // return true static bool expandRegLargeImmPair(unsigned OrigReg, int OrigImm, unsigned& NewReg, int& NewImm, - MachineBasicBlock& MBB, MachineBasicBlock::iterator I) { + MachineBasicBlock& MBB, + MachineBasicBlock::iterator I) { // OrigImm fits in the 16-bit field if (OrigImm < 0x8000 && OrigImm >= -0x8000) { NewReg = OrigReg; @@ -227,13 +120,15 @@ static bool expandRegLargeImmPair(unsigned OrigReg, int OrigImm, MachineFunction* MF = MBB.getParent(); const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); DebugLoc DL = I->getDebugLoc(); - int ImmLo = OrigImm & 0xffff; - int ImmHi = (((unsigned)OrigImm & 0xffff0000) >> 16) + ((OrigImm & 0x8000) != 0); + int ImmLo = (short)(OrigImm & 0xffff); + int ImmHi = (((unsigned)OrigImm & 0xffff0000) >> 16) + + ((OrigImm & 0x8000) != 0); // FIXME: change this when mips goes MC". BuildMI(MBB, I, DL, TII->get(Mips::NOAT)); BuildMI(MBB, I, DL, TII->get(Mips::LUi), Mips::AT).addImm(ImmHi); - BuildMI(MBB, I, DL, TII->get(Mips::ADDu), Mips::AT).addReg(OrigReg).addReg(Mips::AT); + BuildMI(MBB, I, DL, TII->get(Mips::ADDu), Mips::AT).addReg(OrigReg) + .addReg(Mips::AT); NewReg = Mips::AT; NewImm = ImmLo; @@ -255,18 +150,18 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { int NewImm = 0; bool ATUsed; - // Get the right frame order for Mips. - adjustMipsStackFrame(MF); - - // Get the number of bytes to allocate from the FrameInfo. 
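A note on expandRegLargeImmPair above: because lw/sw sign-extend their 16-bit offset, ImmLo is taken as a signed short and ImmHi absorbs a +1 carry whenever bit 15 of the original immediate is set; the pair then reconstructs the immediate modulo 2^32, matching 32-bit MIPS address arithmetic. A standalone check of that identity for values outside the 16-bit fast path (splitImm mirrors the computation; the name is illustrative):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Mirror of the split in expandRegLargeImmPair: Lo is sign-extended by
    // the hardware, so Hi takes a +1 carry when bit 15 is set.
    static void splitImm(int32_t Orig, int32_t &Hi, int32_t &Lo) {
      Lo = (int16_t)(Orig & 0xffff);
      Hi = (((uint32_t)Orig & 0xffff0000) >> 16) + ((Orig & 0x8000) != 0);
    }

    int main() {
      const int32_t Tests[] = {0x10000, 0x18000, -0x20000, 0x7fff1234};
      for (int32_t Orig : Tests) {
        int32_t Hi, Lo;
        splitImm(Orig, Hi, Lo);
        // lui Hi; addu base,base,AT; then Lo as the memory offset:
        assert((uint32_t)(((uint32_t)Hi << 16) + (uint32_t)Lo) == (uint32_t)Orig);
        std::printf("0x%x -> hi=0x%x lo=%d\n", (unsigned)Orig, (unsigned)Hi, Lo);
      }
      return 0;
    }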
- unsigned StackSize = MFI->getStackSize(); - - // No need to allocate space on the stack. - if (StackSize == 0 && !MFI->adjustsStack()) return; - - int FPOffset = MipsFI->getFPStackOffset(); - int RAOffset = MipsFI->getRAStackOffset(); - + // First, compute final stack size. + unsigned RegSize = STI.isGP32bit() ? 4 : 8; + unsigned StackAlign = getStackAlignment(); + unsigned LocalVarAreaOffset = MipsFI->needGPSaveRestore() ? + (MFI->getObjectOffset(MipsFI->getGPFI()) + RegSize) : + MipsFI->getMaxCallFrameSize(); + unsigned StackSize = AlignOffset(LocalVarAreaOffset, StackAlign) + + AlignOffset(MFI->getStackSize(), StackAlign); + + // Update stack size + MFI->setStackSize(StackSize); + BuildMI(MBB, MBBI, dl, TII.get(Mips::NOREORDER)); // TODO: check need from GP here. @@ -275,6 +170,13 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { .addReg(RegInfo->getPICCallReg()); BuildMI(MBB, MBBI, dl, TII.get(Mips::NOMACRO)); + // No need to allocate space on the stack. + if (StackSize == 0 && !MFI->adjustsStack()) return; + + MachineModuleInfo &MMI = MF.getMMI(); + std::vector<MachineMove> &Moves = MMI.getFrameMoves(); + MachineLocation DstML, SrcML; + // Adjust stack : addi sp, sp, (-imm) ATUsed = expandRegLargeImmPair(Mips::SP, -StackSize, NewReg, NewImm, MBB, MBBI); @@ -285,97 +187,109 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { if (ATUsed) BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO)); - // Save the return address only if the function isnt a leaf one. - // sw $ra, stack_loc($sp) - if (MFI->adjustsStack()) { - ATUsed = expandRegLargeImmPair(Mips::SP, RAOffset, NewReg, NewImm, MBB, - MBBI); - BuildMI(MBB, MBBI, dl, TII.get(Mips::SW)) - .addReg(Mips::RA).addImm(NewImm).addReg(NewReg); + // emit ".cfi_def_cfa_offset StackSize" + MCSymbol *AdjustSPLabel = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, dl, + TII.get(TargetOpcode::PROLOG_LABEL)).addSym(AdjustSPLabel); + DstML = MachineLocation(MachineLocation::VirtualFP); + SrcML = MachineLocation(MachineLocation::VirtualFP, -StackSize); + Moves.push_back(MachineMove(AdjustSPLabel, DstML, SrcML)); - // FIXME: change this when mips goes MC". - if (ATUsed) - BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO)); - } + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); - // if framepointer enabled, save it and set it - // to point to the stack pointer + if (CSI.size()) { + // Find the instruction past the last instruction that saves a callee-saved + // register to the stack. + for (unsigned i = 0; i < CSI.size(); ++i) + ++MBBI; + + // Iterate over list of callee-saved registers and emit .cfi_offset + // directives. + MCSymbol *CSLabel = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, dl, + TII.get(TargetOpcode::PROLOG_LABEL)).addSym(CSLabel); + + for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(), + E = CSI.end(); I != E; ++I) { + int64_t Offset = MFI->getObjectOffset(I->getFrameIdx()); + unsigned Reg = I->getReg(); + + // If Reg is a double precision register, emit two cfa_offsets, + // one for each of the paired single precision registers. 
+ if (Mips::AFGR64RegisterClass->contains(Reg)) { + const unsigned *SubRegs = RegInfo->getSubRegisters(Reg); + MachineLocation DstML0(MachineLocation::VirtualFP, Offset); + MachineLocation DstML1(MachineLocation::VirtualFP, Offset + 4); + MachineLocation SrcML0(*SubRegs); + MachineLocation SrcML1(*(SubRegs + 1)); + + if (!STI.isLittle()) + std::swap(SrcML0, SrcML1); + + Moves.push_back(MachineMove(CSLabel, DstML0, SrcML0)); + Moves.push_back(MachineMove(CSLabel, DstML1, SrcML1)); + } + else { + // Reg is either in CPURegs or FGR32. + DstML = MachineLocation(MachineLocation::VirtualFP, Offset); + SrcML = MachineLocation(Reg); + Moves.push_back(MachineMove(CSLabel, DstML, SrcML)); + } + } + } + + // if framepointer enabled, set it to point to the stack pointer. if (hasFP(MF)) { - // sw $fp,stack_loc($sp) - ATUsed = expandRegLargeImmPair(Mips::SP, FPOffset, NewReg, NewImm, MBB, - MBBI); - BuildMI(MBB, MBBI, dl, TII.get(Mips::SW)) - .addReg(Mips::FP).addImm(NewImm).addReg(NewReg); - - // FIXME: change this when mips goes MC". - if (ATUsed) - BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO)); - - // move $fp, $sp + // Insert instruction "move $fp, $sp" at this location. BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDu), Mips::FP) .addReg(Mips::SP).addReg(Mips::ZERO); + + // emit ".cfi_def_cfa_register $fp" + MCSymbol *SetFPLabel = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, dl, + TII.get(TargetOpcode::PROLOG_LABEL)).addSym(SetFPLabel); + DstML = MachineLocation(Mips::FP); + SrcML = MachineLocation(MachineLocation::VirtualFP); + Moves.push_back(MachineMove(SetFPLabel, DstML, SrcML)); } // Restore GP from the saved stack location if (MipsFI->needGPSaveRestore()) BuildMI(MBB, MBBI, dl, TII.get(Mips::CPRESTORE)) - .addImm(MipsFI->getGPStackOffset()); + .addImm(MFI->getObjectOffset(MipsFI->getGPFI())); } void MipsFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); MachineFrameInfo *MFI = MF.getFrameInfo(); - MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); const MipsInstrInfo &TII = *static_cast<const MipsInstrInfo*>(MF.getTarget().getInstrInfo()); DebugLoc dl = MBBI->getDebugLoc(); // Get the number of bytes from FrameInfo - int NumBytes = (int) MFI->getStackSize(); - - // Get the FI's where RA and FP are saved. - int FPOffset = MipsFI->getFPStackOffset(); - int RAOffset = MipsFI->getRAStackOffset(); + unsigned StackSize = MFI->getStackSize(); unsigned NewReg = 0; int NewImm = 0; bool ATUsed = false; - // if framepointer enabled, restore it and restore the - // stack pointer + // if framepointer enabled, restore the stack pointer. if (hasFP(MF)) { - // move $sp, $fp - BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDu), Mips::SP) + // Find the first instruction that restores a callee-saved register. + MachineBasicBlock::iterator I = MBBI; + + for (unsigned i = 0; i < MFI->getCalleeSavedInfo().size(); ++i) + --I; + + // Insert instruction "move $sp, $fp" at this location. + BuildMI(MBB, I, dl, TII.get(Mips::ADDu), Mips::SP) .addReg(Mips::FP).addReg(Mips::ZERO); - - // lw $fp,stack_loc($sp) - ATUsed = expandRegLargeImmPair(Mips::SP, FPOffset, NewReg, NewImm, MBB, - MBBI); - BuildMI(MBB, MBBI, dl, TII.get(Mips::LW), Mips::FP) - .addImm(NewImm).addReg(NewReg); - - // FIXME: change this when mips goes MC". - if (ATUsed) - BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO)); - } - - // Restore the return address only if the function isnt a leaf one. 
- // lw $ra, stack_loc($sp) - if (MFI->adjustsStack()) { - ATUsed = expandRegLargeImmPair(Mips::SP, RAOffset, NewReg, NewImm, MBB, - MBBI); - BuildMI(MBB, MBBI, dl, TII.get(Mips::LW), Mips::RA) - .addImm(NewImm).addReg(NewReg); - - // FIXME: change this when mips goes MC". - if (ATUsed) - BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO)); } // adjust stack : insert addi sp, sp, (imm) - if (NumBytes) { - ATUsed = expandRegLargeImmPair(Mips::SP, NumBytes, NewReg, NewImm, MBB, + if (StackSize) { + ATUsed = expandRegLargeImmPair(Mips::SP, StackSize, NewReg, NewImm, MBB, MBBI); BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDiu), Mips::SP) .addReg(NewReg).addImm(NewImm); @@ -386,9 +300,32 @@ void MipsFrameLowering::emitEpilogue(MachineFunction &MF, } } +void +MipsFrameLowering::getInitialFrameState(std::vector<MachineMove> &Moves) const { + MachineLocation Dst(MachineLocation::VirtualFP); + MachineLocation Src(Mips::SP, 0); + Moves.push_back(MachineMove(0, Dst, Src)); +} + void MipsFrameLowering:: -processFunctionBeforeFrameFinalized(MachineFunction &MF) const { - const MipsRegisterInfo *RegInfo = - static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo()); - RegInfo->processFunctionBeforeFrameFinalized(MF); +processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS) const { + MachineRegisterInfo& MRI = MF.getRegInfo(); + + // FIXME: remove this code if register allocator can correctly mark + // $fp and $ra used or unused. + + // Mark $fp and $ra as used or unused. + if (hasFP(MF)) + MRI.setPhysRegUsed(Mips::FP); + + // The register allocator might determine $ra is used after seeing + // instruction "jr $ra", but we do not want PrologEpilogInserter to insert + // instructions to save/restore $ra unless there is a function call. + // To correct this, $ra is explicitly marked unused if there is no + // function call. + if (MF.getFrameInfo()->hasCalls()) + MRI.setPhysRegUsed(Mips::RA); + else + MRI.setPhysRegUnused(Mips::RA); } diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h index 34647df..78c78ee 100644 --- a/lib/Target/Mips/MipsFrameLowering.h +++ b/lib/Target/Mips/MipsFrameLowering.h @@ -27,11 +27,10 @@ protected: public: explicit MipsFrameLowering(const MipsSubtarget &sti) - // FIXME: Is this correct at all? - : TargetFrameLowering(StackGrowsUp, 8, 0), STI(sti) { + : TargetFrameLowering(StackGrowsDown, 8, 0), STI(sti) { } - void adjustMipsStackFrame(MachineFunction &MF) const; + bool targetHandlesStackFrameRounding() const; /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. 
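Since targetHandlesStackFrameRounding() now returns true, the target rounds the frame itself with the AlignOffset helper introduced above — the usual round-up-to-a-multiple formula. A standalone check, including the bitwise form it reduces to for power-of-two alignments:

    #include <cassert>

    // Same round-up as MipsFrameLowering's AlignOffset.
    static unsigned AlignOffset(unsigned Offset, unsigned Align) {
      return (Offset + Align - 1) / Align * Align;
    }

    int main() {
      assert(AlignOffset(0, 8)  == 0);
      assert(AlignOffset(1, 8)  == 8);
      assert(AlignOffset(24, 8) == 24);
      assert(AlignOffset(25, 8) == 32);
      // For power-of-two Align this equals (Offset + Align - 1) & ~(Align - 1).
      for (unsigned O = 0; O < 64; ++O)
        assert(AlignOffset(O, 8) == ((O + 7) & ~7u));
      return 0;
    }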
@@ -40,7 +39,10 @@ public: bool hasFP(const MachineFunction &MF) const; - void processFunctionBeforeFrameFinalized(MachineFunction &MF) const; + void getInitialFrameState(std::vector<MachineMove> &Moves) const; + + void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS) const; }; } // End llvm namespace diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp index 0382964..d8a84ce 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -119,39 +119,41 @@ SelectAddr(SDValue Addr, SDValue &Offset, SDValue &Base) { // on PIC code Load GA if (TM.getRelocationModel() == Reloc::PIC_) { - if ((Addr.getOpcode() == ISD::TargetGlobalAddress) || - (Addr.getOpcode() == ISD::TargetConstantPool) || - (Addr.getOpcode() == ISD::TargetJumpTable) || - (Addr.getOpcode() == ISD::TargetBlockAddress) || - (Addr.getOpcode() == ISD::TargetExternalSymbol)) { + if (Addr.getOpcode() == MipsISD::WrapperPIC) { Base = CurDAG->getRegister(Mips::GP, MVT::i32); - Offset = Addr; + Offset = Addr.getOperand(0); return true; } } else { if ((Addr.getOpcode() == ISD::TargetExternalSymbol || Addr.getOpcode() == ISD::TargetGlobalAddress)) return false; + else if (Addr.getOpcode() == ISD::TargetGlobalTLSAddress) { + Base = CurDAG->getRegister(Mips::GP, MVT::i32); + Offset = Addr; + return true; + } } - // Operand is a result from an ADD. - if (Addr.getOpcode() == ISD::ADD) { - if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) { - if (isInt<16>(CN->getSExtValue())) { - - // If the first operand is a FI, get the TargetFI Node - if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode> - (Addr.getOperand(0))) { - Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); - } else { - Base = Addr.getOperand(0); - } - - Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32); - return true; - } + // Addresses of the form FI+const or FI|const + if (CurDAG->isBaseWithConstantOffset(Addr)) { + ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)); + if (isInt<16>(CN->getSExtValue())) { + + // If the first operand is a FI, get the TargetFI Node + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode> + (Addr.getOperand(0))) + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + else + Base = Addr.getOperand(0); + + Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32); + return true; } + } + // Operand is a result from an ADD. + if (Addr.getOpcode() == ISD::ADD) { // When loading from constant pools, load the lower address part in // the instruction itself. Example, instead of: // lui $2, %hi($CPI1_0) @@ -321,7 +323,6 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { // tablegen selection should be handled here. /// switch(Opcode) { - default: break; case ISD::SUBE: @@ -355,10 +356,7 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { LHS, SDValue(AddCarry,0)); } - /// Mul/Div with two results - case ISD::SDIVREM: - case ISD::UDIVREM: - break; + /// Mul with two results case ISD::SMUL_LOHI: case ISD::UMUL_LOHI: { SDValue Op1 = Node->getOperand(0); @@ -405,13 +403,6 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { return CurDAG->getMachineNode(Mips::MFHI, dl, MVT::i32, InFlag); } - /// Div/Rem operations - case ISD::SREM: - case ISD::UREM: - case ISD::SDIV: - case ISD::UDIV: - break; - // Get target GOT address. 
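On the SelectAddr change above: isBaseWithConstantOffset also matches `or` nodes whose constant has no bits in common with the base (roughly, where the base's low bits are known zero), in which case `or` behaves exactly like `add` — hence the "FI+const or FI|const" comment. A standalone check of that equivalence plus the signed-16-bit range test applied to the folded offset (fitsInSigned16 mirrors llvm::isInt<16>; both helper names are illustrative):

    #include <cassert>
    #include <cstdint>

    // or == add when the operands share no set bits: no carries can occur.
    static bool disjointBits(uint32_t A, uint32_t B) { return (A & B) == 0; }

    // The range check applied to the folded constant offset.
    static bool fitsInSigned16(int64_t V) { return V >= -32768 && V <= 32767; }

    int main() {
      uint32_t Base = 0xabcd0000, Off = 0x1234; // aligned base, small offset
      assert(disjointBits(Base, Off));
      assert((Base | Off) == Base + Off);
      assert(fitsInSigned16(0x1234) && !fitsInSigned16(0x10000));
      return 0;
    }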
case ISD::GLOBAL_OFFSET_TABLE: return getGlobalBaseReg(); @@ -445,6 +436,18 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { return ResNode; // Other cases are autogenerated. break; + + case MipsISD::ThreadPointer: { + unsigned SrcReg = Mips::HWR29; + unsigned DestReg = Mips::V1; + SDNode *Rdhwr = CurDAG->getMachineNode(Mips::RDHWR, Node->getDebugLoc(), + Node->getValueType(0), CurDAG->getRegister(SrcReg, MVT::i32)); + SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, DestReg, + SDValue(Rdhwr, 0)); + SDValue ResNode = CurDAG->getCopyFromReg(Chain, dl, DestReg, MVT::i32); + ReplaceUses(SDValue(Node, 0), ResNode); + return ResNode.getNode(); + } } // Select the default instruction diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 0e193f2..c42054e 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -36,23 +36,30 @@ using namespace llvm; const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { - case MipsISD::JmpLink : return "MipsISD::JmpLink"; - case MipsISD::Hi : return "MipsISD::Hi"; - case MipsISD::Lo : return "MipsISD::Lo"; - case MipsISD::GPRel : return "MipsISD::GPRel"; - case MipsISD::Ret : return "MipsISD::Ret"; - case MipsISD::FPBrcond : return "MipsISD::FPBrcond"; - case MipsISD::FPCmp : return "MipsISD::FPCmp"; - case MipsISD::CMovFP_T : return "MipsISD::CMovFP_T"; - case MipsISD::CMovFP_F : return "MipsISD::CMovFP_F"; - case MipsISD::FPRound : return "MipsISD::FPRound"; - case MipsISD::MAdd : return "MipsISD::MAdd"; - case MipsISD::MAddu : return "MipsISD::MAddu"; - case MipsISD::MSub : return "MipsISD::MSub"; - case MipsISD::MSubu : return "MipsISD::MSubu"; - case MipsISD::DivRem : return "MipsISD::DivRem"; - case MipsISD::DivRemU : return "MipsISD::DivRemU"; - default : return NULL; + case MipsISD::JmpLink: return "MipsISD::JmpLink"; + case MipsISD::Hi: return "MipsISD::Hi"; + case MipsISD::Lo: return "MipsISD::Lo"; + case MipsISD::GPRel: return "MipsISD::GPRel"; + case MipsISD::TlsGd: return "MipsISD::TlsGd"; + case MipsISD::TprelHi: return "MipsISD::TprelHi"; + case MipsISD::TprelLo: return "MipsISD::TprelLo"; + case MipsISD::ThreadPointer: return "MipsISD::ThreadPointer"; + case MipsISD::Ret: return "MipsISD::Ret"; + case MipsISD::FPBrcond: return "MipsISD::FPBrcond"; + case MipsISD::FPCmp: return "MipsISD::FPCmp"; + case MipsISD::CMovFP_T: return "MipsISD::CMovFP_T"; + case MipsISD::CMovFP_F: return "MipsISD::CMovFP_F"; + case MipsISD::FPRound: return "MipsISD::FPRound"; + case MipsISD::MAdd: return "MipsISD::MAdd"; + case MipsISD::MAddu: return "MipsISD::MAddu"; + case MipsISD::MSub: return "MipsISD::MSub"; + case MipsISD::MSubu: return "MipsISD::MSubu"; + case MipsISD::DivRem: return "MipsISD::DivRem"; + case MipsISD::DivRemU: return "MipsISD::DivRemU"; + case MipsISD::BuildPairF64: return "MipsISD::BuildPairF64"; + case MipsISD::ExtractElementF64: return "MipsISD::ExtractElementF64"; + case MipsISD::WrapperPIC: return "MipsISD::WrapperPIC"; + default: return NULL; } } @@ -100,7 +107,6 @@ MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::SELECT, MVT::i32, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::VASTART, MVT::Other, Custom); setOperationAction(ISD::SDIV, MVT::i32, Expand); @@ -125,20 +131,22 @@ MipsTargetLowering(MipsTargetMachine &TM) 
setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FSIN, MVT::f32, Expand); setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FCOS, MVT::f32, Expand); setOperationAction(ISD::FCOS, MVT::f64, Expand); setOperationAction(ISD::FPOWI, MVT::f32, Expand); setOperationAction(ISD::FPOW, MVT::f32, Expand); + setOperationAction(ISD::FPOW, MVT::f64, Expand); setOperationAction(ISD::FLOG, MVT::f32, Expand); setOperationAction(ISD::FLOG2, MVT::f32, Expand); setOperationAction(ISD::FLOG10, MVT::f32, Expand); setOperationAction(ISD::FEXP, MVT::f32, Expand); - setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); + setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); + setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); setOperationAction(ISD::VAARG, MVT::Other, Expand); setOperationAction(ISD::VACOPY, MVT::Other, Expand); @@ -169,19 +177,19 @@ MipsTargetLowering(MipsTargetMachine &TM) setTargetDAGCombine(ISD::UDIVREM); setTargetDAGCombine(ISD::SETCC); + setMinFunctionAlignment(2); + setStackPointerRegisterToSaveRestore(Mips::SP); computeRegisterProperties(); + + setExceptionPointerRegister(Mips::A0); + setExceptionSelectorRegister(Mips::A1); } MVT::SimpleValueType MipsTargetLowering::getSetCCResultType(EVT VT) const { return MVT::i32; } -/// getFunctionAlignment - Return the Log2 alignment of this function. -unsigned MipsTargetLowering::getFunctionAlignment(const Function *) const { - return 2; -} - // SelectMadd - // Transforms a subgraph in CurDAG if the following pattern is found: // (addc multLo, Lo0), (adde multHi, Hi0), @@ -381,7 +389,7 @@ static SDValue PerformDivRemCombine(SDNode *N, SelectionDAG& DAG, // insert MFHI if (N->hasAnyUseOfValue(1)) { SDValue CopyFromHi = DAG.getCopyFromReg(InChain, dl, - Mips::HI, MVT::i32, InGlue); + Mips::HI, MVT::i32, InGlue); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), CopyFromHi); } @@ -442,8 +450,8 @@ static SDValue CreateFPCmp(SelectionDAG& DAG, const SDValue& Op) { SDValue RHS = Op.getOperand(1); DebugLoc dl = Op.getDebugLoc(); - // Assume the 3rd operand is a CondCodeSDNode. Add code to check the type of node - // if necessary. + // Assume the 3rd operand is a CondCodeSDNode. Add code to check the type of + // node if necessary. 
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); return DAG.getNode(MipsISD::FPCmp, dl, MVT::Glue, LHS, RHS, @@ -507,13 +515,14 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); - case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); + case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); + case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); } return SDValue(); } @@ -545,45 +554,16 @@ static Mips::FPBranchCode GetFPBranchCodeFromCond(Mips::CondCode CC) { return Mips::BRANCH_INVALID; } -MachineBasicBlock * -MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *BB) const { +static MachineBasicBlock* ExpandCondMov(MachineInstr *MI, MachineBasicBlock *BB, + DebugLoc dl, + const MipsSubtarget* Subtarget, + const TargetInstrInfo *TII, + bool isFPCmp, unsigned Opc) { // There is no need to expand CMov instructions if target has // conditional moves. if (Subtarget->hasCondMov()) return BB; - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - bool isFPCmp = false; - DebugLoc dl = MI->getDebugLoc(); - unsigned Opc; - - switch (MI->getOpcode()) { - default: assert(false && "Unexpected instr type to insert"); - case Mips::MOVT: - case Mips::MOVT_S: - case Mips::MOVT_D: - isFPCmp = true; - Opc = Mips::BC1F; - break; - case Mips::MOVF: - case Mips::MOVF_S: - case Mips::MOVF_D: - isFPCmp = true; - Opc = Mips::BC1T; - break; - case Mips::MOVZ_I: - case Mips::MOVZ_S: - case Mips::MOVZ_D: - Opc = Mips::BNE; - break; - case Mips::MOVN_I: - case Mips::MOVN_S: - case Mips::MOVN_D: - Opc = Mips::BEQ; - break; - } - // To "insert" a SELECT_CC instruction, we actually have to insert the // diamond control-flow pattern. The incoming instruction knows the // destination vreg to set, the condition code register to branch on, the @@ -622,7 +602,6 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, BuildMI(BB, dl, TII->get(Opc)).addReg(MI->getOperand(2).getReg()) .addReg(Mips::ZERO).addMBB(sinkMBB); - // copy0MBB: // %FalseValue = ... 
// # fallthrough to sinkMBB @@ -651,46 +630,572 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, return BB; } -//===----------------------------------------------------------------------===// -// Misc Lower Operation implementation -//===----------------------------------------------------------------------===// +MachineBasicBlock * +MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc dl = MI->getDebugLoc(); -SDValue MipsTargetLowering:: -LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const -{ - if (!Subtarget->isMips1()) - return Op; + switch (MI->getOpcode()) { + default: + assert(false && "Unexpected instr type to insert"); + return NULL; + case Mips::MOVT: + case Mips::MOVT_S: + case Mips::MOVT_D: + return ExpandCondMov(MI, BB, dl, Subtarget, TII, true, Mips::BC1F); + case Mips::MOVF: + case Mips::MOVF_S: + case Mips::MOVF_D: + return ExpandCondMov(MI, BB, dl, Subtarget, TII, true, Mips::BC1T); + case Mips::MOVZ_I: + case Mips::MOVZ_S: + case Mips::MOVZ_D: + return ExpandCondMov(MI, BB, dl, Subtarget, TII, false, Mips::BNE); + case Mips::MOVN_I: + case Mips::MOVN_S: + case Mips::MOVN_D: + return ExpandCondMov(MI, BB, dl, Subtarget, TII, false, Mips::BEQ); + + case Mips::ATOMIC_LOAD_ADD_I8: + return EmitAtomicBinaryPartword(MI, BB, 1, Mips::ADDu); + case Mips::ATOMIC_LOAD_ADD_I16: + return EmitAtomicBinaryPartword(MI, BB, 2, Mips::ADDu); + case Mips::ATOMIC_LOAD_ADD_I32: + return EmitAtomicBinary(MI, BB, 4, Mips::ADDu); + + case Mips::ATOMIC_LOAD_AND_I8: + return EmitAtomicBinaryPartword(MI, BB, 1, Mips::AND); + case Mips::ATOMIC_LOAD_AND_I16: + return EmitAtomicBinaryPartword(MI, BB, 2, Mips::AND); + case Mips::ATOMIC_LOAD_AND_I32: + return EmitAtomicBinary(MI, BB, 4, Mips::AND); + + case Mips::ATOMIC_LOAD_OR_I8: + return EmitAtomicBinaryPartword(MI, BB, 1, Mips::OR); + case Mips::ATOMIC_LOAD_OR_I16: + return EmitAtomicBinaryPartword(MI, BB, 2, Mips::OR); + case Mips::ATOMIC_LOAD_OR_I32: + return EmitAtomicBinary(MI, BB, 4, Mips::OR); + + case Mips::ATOMIC_LOAD_XOR_I8: + return EmitAtomicBinaryPartword(MI, BB, 1, Mips::XOR); + case Mips::ATOMIC_LOAD_XOR_I16: + return EmitAtomicBinaryPartword(MI, BB, 2, Mips::XOR); + case Mips::ATOMIC_LOAD_XOR_I32: + return EmitAtomicBinary(MI, BB, 4, Mips::XOR); + + case Mips::ATOMIC_LOAD_NAND_I8: + return EmitAtomicBinaryPartword(MI, BB, 1, 0, true); + case Mips::ATOMIC_LOAD_NAND_I16: + return EmitAtomicBinaryPartword(MI, BB, 2, 0, true); + case Mips::ATOMIC_LOAD_NAND_I32: + return EmitAtomicBinary(MI, BB, 4, 0, true); + + case Mips::ATOMIC_LOAD_SUB_I8: + return EmitAtomicBinaryPartword(MI, BB, 1, Mips::SUBu); + case Mips::ATOMIC_LOAD_SUB_I16: + return EmitAtomicBinaryPartword(MI, BB, 2, Mips::SUBu); + case Mips::ATOMIC_LOAD_SUB_I32: + return EmitAtomicBinary(MI, BB, 4, Mips::SUBu); + + case Mips::ATOMIC_SWAP_I8: + return EmitAtomicBinaryPartword(MI, BB, 1, 0); + case Mips::ATOMIC_SWAP_I16: + return EmitAtomicBinaryPartword(MI, BB, 2, 0); + case Mips::ATOMIC_SWAP_I32: + return EmitAtomicBinary(MI, BB, 4, 0); + + case Mips::ATOMIC_CMP_SWAP_I8: + return EmitAtomicCmpSwapPartword(MI, BB, 1); + case Mips::ATOMIC_CMP_SWAP_I16: + return EmitAtomicCmpSwapPartword(MI, BB, 2); + case Mips::ATOMIC_CMP_SWAP_I32: + return EmitAtomicCmpSwap(MI, BB, 4); + } +} - MachineFunction &MF = DAG.getMachineFunction(); - unsigned CCReg = AddLiveIn(MF, Mips::FCR31, Mips::CCRRegisterClass); +// This function also handles 
Mips::ATOMIC_SWAP_I32 (when BinOpcode == 0), and +// Mips::ATOMIC_LOAD_NAND_I32 (when Nand == true) +MachineBasicBlock * +MipsTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, + unsigned Size, unsigned BinOpcode, + bool Nand) const { + assert(Size == 4 && "Unsupported size for EmitAtomicBinary."); + + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &RegInfo = MF->getRegInfo(); + const TargetRegisterClass *RC = getRegClassFor(MVT::i32); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc dl = MI->getDebugLoc(); - SDValue Chain = DAG.getEntryNode(); - DebugLoc dl = Op.getDebugLoc(); - SDValue Src = Op.getOperand(0); - - // Set the condition register - SDValue CondReg = DAG.getCopyFromReg(Chain, dl, CCReg, MVT::i32); - CondReg = DAG.getCopyToReg(Chain, dl, Mips::AT, CondReg); - CondReg = DAG.getCopyFromReg(CondReg, dl, Mips::AT, MVT::i32); - - SDValue Cst = DAG.getConstant(3, MVT::i32); - SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i32, CondReg, Cst); - Cst = DAG.getConstant(2, MVT::i32); - SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i32, Or, Cst); - - SDValue InFlag(0, 0); - CondReg = DAG.getCopyToReg(Chain, dl, Mips::FCR31, Xor, InFlag); - - // Emit the round instruction and bit convert to integer - SDValue Trunc = DAG.getNode(MipsISD::FPRound, dl, MVT::f32, - Src, CondReg.getValue(1)); - SDValue BitCvt = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Trunc); - return BitCvt; + unsigned Dest = MI->getOperand(0).getReg(); + unsigned Ptr = MI->getOperand(1).getReg(); + unsigned Incr = MI->getOperand(2).getReg(); + + unsigned Oldval = RegInfo.createVirtualRegister(RC); + unsigned Tmp1 = RegInfo.createVirtualRegister(RC); + unsigned Tmp2 = RegInfo.createVirtualRegister(RC); + + // insert new blocks after the current block + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineFunction::iterator It = BB; + ++It; + MF->insert(It, loopMBB); + MF->insert(It, exitMBB); + + // Transfer the remainder of BB and its successor edges to exitMBB. + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); + + // thisMBB: + // ... + // sw incr, fi(sp) // store incr to stack (when BinOpcode == 0) + // fallthrough --> loopMBB + + // Note: for atomic.swap (when BinOpcode == 0), storing incr to stack before + // the loop and then loading it from stack in block loopMBB is necessary to + // prevent MachineLICM pass to hoist "or" instruction out of the block + // loopMBB. + + int fi = 0; + if (BinOpcode == 0 && !Nand) { + // Get or create a temporary stack location. 
+ MipsFunctionInfo *MipsFI = MF->getInfo<MipsFunctionInfo>(); + fi = MipsFI->getAtomicFrameIndex(); + if (fi == -1) { + fi = MF->getFrameInfo()->CreateStackObject(Size, Size, false); + MipsFI->setAtomicFrameIndex(fi); + } + + BuildMI(BB, dl, TII->get(Mips::SW)) + .addReg(Incr).addImm(0).addFrameIndex(fi); + } + BB->addSuccessor(loopMBB); + + // loopMBB: + // ll oldval, 0(ptr) + // or dest, $0, oldval + // <binop> tmp1, oldval, incr + // sc tmp1, 0(ptr) + // beq tmp1, $0, loopMBB + BB = loopMBB; + BuildMI(BB, dl, TII->get(Mips::LL), Oldval).addImm(0).addReg(Ptr); + BuildMI(BB, dl, TII->get(Mips::OR), Dest).addReg(Mips::ZERO).addReg(Oldval); + if (Nand) { + // and tmp2, oldval, incr + // nor tmp1, $0, tmp2 + BuildMI(BB, dl, TII->get(Mips::AND), Tmp2).addReg(Oldval).addReg(Incr); + BuildMI(BB, dl, TII->get(Mips::NOR), Tmp1).addReg(Mips::ZERO).addReg(Tmp2); + } else if (BinOpcode) { + // <binop> tmp1, oldval, incr + BuildMI(BB, dl, TII->get(BinOpcode), Tmp1).addReg(Oldval).addReg(Incr); + } else { + // lw tmp2, fi(sp) // load incr from stack + // or tmp1, $zero, tmp2 + BuildMI(BB, dl, TII->get(Mips::LW), Tmp2).addImm(0).addFrameIndex(fi);; + BuildMI(BB, dl, TII->get(Mips::OR), Tmp1).addReg(Mips::ZERO).addReg(Tmp2); + } + BuildMI(BB, dl, TII->get(Mips::SC), Tmp1).addReg(Tmp1).addImm(0).addReg(Ptr); + BuildMI(BB, dl, TII->get(Mips::BEQ)) + .addReg(Tmp1).addReg(Mips::ZERO).addMBB(loopMBB); + BB->addSuccessor(loopMBB); + BB->addSuccessor(exitMBB); + + MI->eraseFromParent(); // The instruction is gone now. + + return BB; +} + +MachineBasicBlock * +MipsTargetLowering::EmitAtomicBinaryPartword(MachineInstr *MI, + MachineBasicBlock *BB, + unsigned Size, unsigned BinOpcode, + bool Nand) const { + assert((Size == 1 || Size == 2) && + "Unsupported size for EmitAtomicBinaryPartial."); + + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &RegInfo = MF->getRegInfo(); + const TargetRegisterClass *RC = getRegClassFor(MVT::i32); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc dl = MI->getDebugLoc(); + + unsigned Dest = MI->getOperand(0).getReg(); + unsigned Ptr = MI->getOperand(1).getReg(); + unsigned Incr = MI->getOperand(2).getReg(); + + unsigned Addr = RegInfo.createVirtualRegister(RC); + unsigned Shift = RegInfo.createVirtualRegister(RC); + unsigned Mask = RegInfo.createVirtualRegister(RC); + unsigned Mask2 = RegInfo.createVirtualRegister(RC); + unsigned Newval = RegInfo.createVirtualRegister(RC); + unsigned Oldval = RegInfo.createVirtualRegister(RC); + unsigned Incr2 = RegInfo.createVirtualRegister(RC); + unsigned Tmp1 = RegInfo.createVirtualRegister(RC); + unsigned Tmp2 = RegInfo.createVirtualRegister(RC); + unsigned Tmp3 = RegInfo.createVirtualRegister(RC); + unsigned Tmp4 = RegInfo.createVirtualRegister(RC); + unsigned Tmp5 = RegInfo.createVirtualRegister(RC); + unsigned Tmp6 = RegInfo.createVirtualRegister(RC); + unsigned Tmp7 = RegInfo.createVirtualRegister(RC); + unsigned Tmp8 = RegInfo.createVirtualRegister(RC); + unsigned Tmp9 = RegInfo.createVirtualRegister(RC); + unsigned Tmp10 = RegInfo.createVirtualRegister(RC); + unsigned Tmp11 = RegInfo.createVirtualRegister(RC); + unsigned Tmp12 = RegInfo.createVirtualRegister(RC); + + // insert new blocks after the current block + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineFunction::iterator It = BB; + ++It; + MF->insert(It, loopMBB); + MF->insert(It, exitMBB); + + 
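The loop block emitted above is the classic ll/sc retry idiom: load-linked, compute, store-conditional, branch back if the store failed. A portable analogue of its semantics using std::atomic — compare_exchange_weak may fail spuriously, just as sc may, hence the loop. This illustrates the semantics only, not the generated code; atomicAdd is an illustrative name:

    #include <atomic>
    #include <cassert>

    // Analogue of loopMBB:
    //   ll oldval,0(ptr); <binop> tmp1,oldval,incr; sc tmp1,0(ptr); beq tmp1,$0,loop
    static int atomicAdd(std::atomic<int> &Ptr, int Incr) {
      int Old = Ptr.load();
      while (!Ptr.compare_exchange_weak(Old, Old + Incr))
        ; // Old is refreshed on failure; retry like "beq tmp1, $0, loopMBB"
      return Old; // pre-op value, matching "or dest, $0, oldval"
    }

    int main() {
      std::atomic<int> V(40);
      assert(atomicAdd(V, 2) == 40);
      assert(V.load() == 42);
      return 0;
    }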
// Transfer the remainder of BB and its successor edges to exitMBB. + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); + + // thisMBB: + // addiu tmp1,$0,-4 # 0xfffffffc + // and addr,ptr,tmp1 + // andi tmp2,ptr,3 + // sll shift,tmp2,3 + // ori tmp3,$0,255 # 0xff + // sll mask,tmp3,shift + // nor mask2,$0,mask + // andi tmp4,incr,255 + // sll incr2,tmp4,shift + // sw incr2, fi(sp) // store incr2 to stack (when BinOpcode == 0) + + // Note: for atomic.swap (when BinOpcode == 0), storing incr2 to stack before + // the loop and then loading it from stack in block loopMBB is necessary to + // prevent MachineLICM pass to hoist "or" instruction out of the block + // loopMBB. + + int64_t MaskImm = (Size == 1) ? 255 : 65535; + BuildMI(BB, dl, TII->get(Mips::ADDiu), Tmp1).addReg(Mips::ZERO).addImm(-4); + BuildMI(BB, dl, TII->get(Mips::AND), Addr).addReg(Ptr).addReg(Tmp1); + BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp2).addReg(Ptr).addImm(3); + BuildMI(BB, dl, TII->get(Mips::SLL), Shift).addReg(Tmp2).addImm(3); + BuildMI(BB, dl, TII->get(Mips::ORi), Tmp3).addReg(Mips::ZERO).addImm(MaskImm); + BuildMI(BB, dl, TII->get(Mips::SLL), Mask).addReg(Tmp3).addReg(Shift); + BuildMI(BB, dl, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask); + if (BinOpcode != Mips::SUBu) { + BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp4).addReg(Incr).addImm(MaskImm); + BuildMI(BB, dl, TII->get(Mips::SLL), Incr2).addReg(Tmp4).addReg(Shift); + } else { + BuildMI(BB, dl, TII->get(Mips::SUBu), Tmp4).addReg(Mips::ZERO).addReg(Incr); + BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp5).addReg(Tmp4).addImm(MaskImm); + BuildMI(BB, dl, TII->get(Mips::SLL), Incr2).addReg(Tmp5).addReg(Shift); + } + + int fi = 0; + if (BinOpcode == 0 && !Nand) { + // Get or create a temporary stack location. 
+ MipsFunctionInfo *MipsFI = MF->getInfo<MipsFunctionInfo>(); + fi = MipsFI->getAtomicFrameIndex(); + if (fi == -1) { + fi = MF->getFrameInfo()->CreateStackObject(Size, Size, false); + MipsFI->setAtomicFrameIndex(fi); + } + + BuildMI(BB, dl, TII->get(Mips::SW)) + .addReg(Incr2).addImm(0).addFrameIndex(fi); + } + BB->addSuccessor(loopMBB); + + // loopMBB: + // ll oldval,0(addr) + // binop tmp7,oldval,incr2 + // and newval,tmp7,mask + // and tmp8,oldval,mask2 + // or tmp9,tmp8,newval + // sc tmp9,0(addr) + // beq tmp9,$0,loopMBB + BB = loopMBB; + BuildMI(BB, dl, TII->get(Mips::LL), Oldval).addImm(0).addReg(Addr); + if (Nand) { + // and tmp6, oldval, incr2 + // nor tmp7, $0, tmp6 + BuildMI(BB, dl, TII->get(Mips::AND), Tmp6).addReg(Oldval).addReg(Incr2); + BuildMI(BB, dl, TII->get(Mips::NOR), Tmp7).addReg(Mips::ZERO).addReg(Tmp6); + } else if (BinOpcode == Mips::SUBu) { + // addu tmp7, oldval, incr2 + BuildMI(BB, dl, TII->get(Mips::ADDu), Tmp7).addReg(Oldval).addReg(Incr2); + } else if (BinOpcode) { + // <binop> tmp7, oldval, incr2 + BuildMI(BB, dl, TII->get(BinOpcode), Tmp7).addReg(Oldval).addReg(Incr2); + } else { + // lw tmp6, fi(sp) // load incr2 from stack + // or tmp7, $zero, tmp6 + BuildMI(BB, dl, TII->get(Mips::LW), Tmp6).addImm(0).addFrameIndex(fi);; + BuildMI(BB, dl, TII->get(Mips::OR), Tmp7).addReg(Mips::ZERO).addReg(Tmp6); + } + BuildMI(BB, dl, TII->get(Mips::AND), Newval).addReg(Tmp7).addReg(Mask); + BuildMI(BB, dl, TII->get(Mips::AND), Tmp8).addReg(Oldval).addReg(Mask2); + BuildMI(BB, dl, TII->get(Mips::OR), Tmp9).addReg(Tmp8).addReg(Newval); + BuildMI(BB, dl, TII->get(Mips::SC), Tmp9).addReg(Tmp9).addImm(0).addReg(Addr); + BuildMI(BB, dl, TII->get(Mips::BEQ)) + .addReg(Tmp9).addReg(Mips::ZERO).addMBB(loopMBB); + BB->addSuccessor(loopMBB); + BB->addSuccessor(exitMBB); + + // exitMBB: + // and tmp10,oldval,mask + // srl tmp11,tmp10,shift + // sll tmp12,tmp11,24 + // sra dest,tmp12,24 + BB = exitMBB; + int64_t ShiftImm = (Size == 1) ? 24 : 16; + // reverse order + BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SRA), Dest) + .addReg(Tmp12).addImm(ShiftImm); + BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SLL), Tmp12) + .addReg(Tmp11).addImm(ShiftImm); + BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SRL), Tmp11) + .addReg(Tmp10).addReg(Shift); + BuildMI(*BB, BB->begin(), dl, TII->get(Mips::AND), Tmp10) + .addReg(Oldval).addReg(Mask); + + MI->eraseFromParent(); // The instruction is gone now. 
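The partword expansion above aligns the pointer down to its containing word, builds a byte or halfword mask at the computed shift, performs the operation on the whole word, splices the result back in, and finally sign-extends the extracted field with the sll/sra pair in exitMBB. A standalone model of that arithmetic for the byte case, assuming little-endian byte numbering within the word (which is what shift = (ptr & 3) * 8 encodes):

    #include <cassert>
    #include <cstdint>

    int main() {
      uintptr_t Ptr = 0x1003;                   // unaligned byte address
      uintptr_t Addr = Ptr & ~(uintptr_t)3;     // and addr, ptr, -4
      unsigned Shift = (unsigned)(Ptr & 3) * 8; // andi tmp2,ptr,3; sll shift,tmp2,3
      uint32_t Mask = 0xffu << Shift;           // ori 0xff; sll mask, ., shift
      uint32_t Mask2 = ~Mask;                   // nor mask2, $0, mask
      assert(Addr == 0x1000 && Shift == 24);

      // Read-modify-write of one byte inside the word, as in loopMBB:
      uint32_t Word = 0x11223344, Incr = 0x7f;
      uint32_t Incr2 = (Incr & 0xff) << Shift;           // andi/sll on the operand
      uint32_t Sum = Word + Incr2;                       // <binop> on the whole word
      uint32_t NewWord = (Sum & Mask) | (Word & Mask2);  // splice the byte back in
      assert(NewWord == 0x90223344);

      // exitMBB: recover the old byte and sign-extend it (sll 24; sra 24).
      int32_t Dest = (int32_t)(((Word & Mask) >> Shift) << 24) >> 24;
      assert(Dest == 0x11);
      int32_t NewByte = (int32_t)(((NewWord & Mask) >> Shift) << 24) >> 24;
      assert(NewByte == -0x70); // 0x90 sign-extends to -112
      return 0;
    }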
+
+MachineBasicBlock *
+MipsTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
+                                      MachineBasicBlock *BB,
+                                      unsigned Size) const {
+  assert(Size == 4 && "Unsupported size for EmitAtomicCmpSwap.");
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+
+  unsigned Dest   = MI->getOperand(0).getReg();
+  unsigned Ptr    = MI->getOperand(1).getReg();
+  unsigned Oldval = MI->getOperand(2).getReg();
+  unsigned Newval = MI->getOperand(3).getReg();
+
+  unsigned Tmp1 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp2 = RegInfo.createVirtualRegister(RC);
+
+  // insert new blocks after the current block
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = BB;
+  ++It;
+  MF->insert(It, loop1MBB);
+  MF->insert(It, loop2MBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Get or create a temporary stack location.
+  MipsFunctionInfo *MipsFI = MF->getInfo<MipsFunctionInfo>();
+  int fi = MipsFI->getAtomicFrameIndex();
+  if (fi == -1) {
+    fi = MF->getFrameInfo()->CreateStackObject(Size, Size, false);
+    MipsFI->setAtomicFrameIndex(fi);
+  }
+
+  // thisMBB:
+  //   ...
+  //   sw newval, fi(sp)               // store newval to stack
+  //   fallthrough --> loop1MBB
+
+  // Note: storing newval to stack before the loop and then loading it from
+  // stack in block loop2MBB is necessary to prevent the MachineLICM pass
+  // from hoisting the "or" instruction out of block loop2MBB.
+
+  BuildMI(BB, dl, TII->get(Mips::SW))
+    .addReg(Newval).addImm(0).addFrameIndex(fi);
+  BB->addSuccessor(loop1MBB);
+
+  // loop1MBB:
+  //   ll dest, 0(ptr)
+  //   bne dest, oldval, exitMBB
+  BB = loop1MBB;
+  BuildMI(BB, dl, TII->get(Mips::LL), Dest).addImm(0).addReg(Ptr);
+  BuildMI(BB, dl, TII->get(Mips::BNE))
+    .addReg(Dest).addReg(Oldval).addMBB(exitMBB);
+  BB->addSuccessor(exitMBB);
+  BB->addSuccessor(loop2MBB);
+
+  // loop2MBB:
+  //   lw tmp2, fi(sp)                 // load newval from stack
+  //   or tmp1, $0, tmp2
+  //   sc tmp1, 0(ptr)
+  //   beq tmp1, $0, loop1MBB
+  BB = loop2MBB;
+  BuildMI(BB, dl, TII->get(Mips::LW), Tmp2).addImm(0).addFrameIndex(fi);
+  BuildMI(BB, dl, TII->get(Mips::OR), Tmp1).addReg(Mips::ZERO).addReg(Tmp2);
+  BuildMI(BB, dl, TII->get(Mips::SC), Tmp1).addReg(Tmp1).addImm(0).addReg(Ptr);
+  BuildMI(BB, dl, TII->get(Mips::BEQ))
+    .addReg(Tmp1).addReg(Mips::ZERO).addMBB(loop1MBB);
+  BB->addSuccessor(loop1MBB);
+  BB->addSuccessor(exitMBB);
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
+}
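The two-block loop just emitted is a textbook LL/SC compare-and-swap. Its semantics, expressed with the C++11 atomics library purely as an illustration (the patch itself predates C++11 and emits the loop directly):

    #include <atomic>
    #include <cstdint>

    // What loop1MBB/loop2MBB implement for a 32-bit word: a strong
    // compare-exchange. compare_exchange_strong retries internally on
    // spurious sc failures, which corresponds to the beq back-edge to
    // loop1MBB above.
    uint32_t cmpSwapWord(std::atomic<uint32_t> &loc, uint32_t oldval,
                         uint32_t newval) {
      uint32_t expected = oldval;
      loc.compare_exchange_strong(expected, newval);
      return expected; // the value observed by ll; this is what lands in dest
    }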
+
+MachineBasicBlock *
+MipsTargetLowering::EmitAtomicCmpSwapPartword(MachineInstr *MI,
+                                              MachineBasicBlock *BB,
+                                              unsigned Size) const {
+  assert((Size == 1 || Size == 2) &&
+         "Unsupported size for EmitAtomicCmpSwapPartword.");
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+
+  unsigned Dest   = MI->getOperand(0).getReg();
+  unsigned Ptr    = MI->getOperand(1).getReg();
+  unsigned Oldval = MI->getOperand(2).getReg();
+  unsigned Newval = MI->getOperand(3).getReg();
+
+  unsigned Addr = RegInfo.createVirtualRegister(RC);
+  unsigned Shift = RegInfo.createVirtualRegister(RC);
+  unsigned Mask = RegInfo.createVirtualRegister(RC);
+  unsigned Mask2 = RegInfo.createVirtualRegister(RC);
+  unsigned Oldval2 = RegInfo.createVirtualRegister(RC);
+  unsigned Oldval3 = RegInfo.createVirtualRegister(RC);
+  unsigned Oldval4 = RegInfo.createVirtualRegister(RC);
+  unsigned Newval2 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp1 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp2 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp3 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp4 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp5 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp6 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp7 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp8 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp9 = RegInfo.createVirtualRegister(RC);
+
+  // insert new blocks after the current block
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = BB;
+  ++It;
+  MF->insert(It, loop1MBB);
+  MF->insert(It, loop2MBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // thisMBB:
+  //   addiu  tmp1,$0,-4               # 0xfffffffc
+  //   and    addr,ptr,tmp1
+  //   andi   tmp2,ptr,3
+  //   sll    shift,tmp2,3
+  //   ori    tmp3,$0,255              # 0xff
+  //   sll    mask,tmp3,shift
+  //   nor    mask2,$0,mask
+  //   andi   tmp4,oldval,255
+  //   sll    oldval2,tmp4,shift
+  //   andi   tmp5,newval,255
+  //   sll    newval2,tmp5,shift
+  int64_t MaskImm = (Size == 1) ?
255 : 65535; + BuildMI(BB, dl, TII->get(Mips::ADDiu), Tmp1).addReg(Mips::ZERO).addImm(-4); + BuildMI(BB, dl, TII->get(Mips::AND), Addr).addReg(Ptr).addReg(Tmp1); + BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp2).addReg(Ptr).addImm(3); + BuildMI(BB, dl, TII->get(Mips::SLL), Shift).addReg(Tmp2).addImm(3); + BuildMI(BB, dl, TII->get(Mips::ORi), Tmp3).addReg(Mips::ZERO).addImm(MaskImm); + BuildMI(BB, dl, TII->get(Mips::SLL), Mask).addReg(Tmp3).addReg(Shift); + BuildMI(BB, dl, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask); + BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp4).addReg(Oldval).addImm(MaskImm); + BuildMI(BB, dl, TII->get(Mips::SLL), Oldval2).addReg(Tmp4).addReg(Shift); + BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp5).addReg(Newval).addImm(MaskImm); + BuildMI(BB, dl, TII->get(Mips::SLL), Newval2).addReg(Tmp5).addReg(Shift); + BB->addSuccessor(loop1MBB); + + // loop1MBB: + // ll oldval3,0(addr) + // and oldval4,oldval3,mask + // bne oldval4,oldval2,exitMBB + BB = loop1MBB; + BuildMI(BB, dl, TII->get(Mips::LL), Oldval3).addImm(0).addReg(Addr); + BuildMI(BB, dl, TII->get(Mips::AND), Oldval4).addReg(Oldval3).addReg(Mask); + BuildMI(BB, dl, TII->get(Mips::BNE)) + .addReg(Oldval4).addReg(Oldval2).addMBB(exitMBB); + BB->addSuccessor(exitMBB); + BB->addSuccessor(loop2MBB); + + // loop2MBB: + // and tmp6,oldval3,mask2 + // or tmp7,tmp6,newval2 + // sc tmp7,0(addr) + // beq tmp7,$0,loop1MBB + BB = loop2MBB; + BuildMI(BB, dl, TII->get(Mips::AND), Tmp6).addReg(Oldval3).addReg(Mask2); + BuildMI(BB, dl, TII->get(Mips::OR), Tmp7).addReg(Tmp6).addReg(Newval2); + BuildMI(BB, dl, TII->get(Mips::SC), Tmp7) + .addReg(Tmp7).addImm(0).addReg(Addr); + BuildMI(BB, dl, TII->get(Mips::BEQ)) + .addReg(Tmp7).addReg(Mips::ZERO).addMBB(loop1MBB); + BB->addSuccessor(loop1MBB); + BB->addSuccessor(exitMBB); + + // exitMBB: + // srl tmp8,oldval4,shift + // sll tmp9,tmp8,24 + // sra dest,tmp9,24 + BB = exitMBB; + int64_t ShiftImm = (Size == 1) ? 24 : 16; + // reverse order + BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SRA), Dest) + .addReg(Tmp9).addImm(ShiftImm); + BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SLL), Tmp9) + .addReg(Tmp8).addImm(ShiftImm); + BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SRL), Tmp8) + .addReg(Oldval4).addReg(Shift); + + MI->eraseFromParent(); // The instruction is gone now. + + return BB; +} + +//===----------------------------------------------------------------------===// +// Misc Lower Operation implementation +//===----------------------------------------------------------------------===// SDValue MipsTargetLowering:: LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { + unsigned StackAlignment = + getTargetMachine().getFrameLowering()->getStackAlignment(); + assert(StackAlignment >= + cast<ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue() && + "Cannot lower if the alignment of the allocated space is larger than \ + that of the stack."); + SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); DebugLoc dl = Op.getDebugLoc(); @@ -704,11 +1209,25 @@ LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const // The Sub result contains the new stack start address, so it // must be placed in the stack pointer register. - Chain = DAG.getCopyToReg(StackPointer.getValue(1), dl, Mips::SP, Sub); + Chain = DAG.getCopyToReg(StackPointer.getValue(1), dl, Mips::SP, Sub, + SDValue()); + // Retrieve updated $sp. There is a glue input to prevent instructions that + // clobber $sp from being inserted between copytoreg and copyfromreg. 
+  SDValue NewSP = DAG.getCopyFromReg(Chain, dl, Mips::SP, MVT::i32,
+                                     Chain.getValue(1));
+
+  // The stack space reserved by alloca is located right above the argument
+  // area. It is aligned on a boundary that is a multiple of StackAlignment.
+  MachineFunction &MF = DAG.getMachineFunction();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+  unsigned SPOffset = (MipsFI->getMaxCallFrameSize() + StackAlignment - 1) /
+                      StackAlignment * StackAlignment;
+  SDValue AllocPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, NewSP,
+                                 DAG.getConstant(SPOffset, MVT::i32));
 
   // This node always has two return values: a new stack pointer
   // value and a chain
-  SDValue Ops[2] = { Sub, Chain };
+  SDValue Ops[2] = { AllocPtr, NewSP.getValue(1) };
 
   return DAG.getMergeValues(Ops, 2, dl);
 }
 
@@ -723,7 +1242,7 @@ LowerBRCOND(SDValue Op, SelectionDAG &DAG) const
 
   SDValue CondRes = CreateFPCmp(DAG, Op.getOperand(1));
 
-  // Return if flag is not set by a floating point comparision.
+  // Return if flag is not set by a floating point comparison.
   if (CondRes.getOpcode() != MipsISD::FPCmp)
     return Op;
 
@@ -741,7 +1260,7 @@ LowerSELECT(SDValue Op, SelectionDAG &DAG) const
 {
   SDValue Cond = CreateFPCmp(DAG, Op.getOperand(0));
 
-  // Return if flag is not set by a floating point comparision.
+  // Return if flag is not set by a floating point comparison.
   if (Cond.getOpcode() != MipsISD::FPCmp)
     return Op;
 
@@ -776,54 +1295,111 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op,
     SDValue HiPart = DAG.getNode(MipsISD::Hi, dl, VTs, &GAHi, 1);
     SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GALo);
     return DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, Lo);
-  } else {
-    SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
-                                            MipsII::MO_GOT);
-    SDValue ResNode = DAG.getLoad(MVT::i32, dl,
-                                  DAG.getEntryNode(), GA, MachinePointerInfo(),
-                                  false, false, 0);
-    // On functions and global targets not internal linked only
-    // a load from got/GP is necessary for PIC to work.
-    if (!GV->hasInternalLinkage() &&
-        (!GV->hasLocalLinkage() || isa<Function>(GV)))
-      return ResNode;
-    SDValue GALo = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
-                                              MipsII::MO_ABS_LO);
-    SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GALo);
-    return DAG.getNode(ISD::ADD, dl, MVT::i32, ResNode, Lo);
   }
 
-  llvm_unreachable("Dont know how to handle GlobalAddress");
-  return SDValue(0,0);
+  SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
+                                          MipsII::MO_GOT);
+  GA = DAG.getNode(MipsISD::WrapperPIC, dl, MVT::i32, GA);
+  SDValue ResNode = DAG.getLoad(MVT::i32, dl,
+                                DAG.getEntryNode(), GA, MachinePointerInfo(),
+                                false, false, 0);
+  // For functions and global values that are not internally linked, only
+  // a load from the GOT/$gp is necessary for PIC to work.
+ if (!GV->hasInternalLinkage() && + (!GV->hasLocalLinkage() || isa<Function>(GV))) + return ResNode; + SDValue GALo = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0, + MipsII::MO_ABS_LO); + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GALo); + return DAG.getNode(ISD::ADD, dl, MVT::i32, ResNode, Lo); } SDValue MipsTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { + const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); + // FIXME there isn't actually debug info here + DebugLoc dl = Op.getDebugLoc(); + if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { - assert(false && "implement LowerBlockAddress for -static"); - return SDValue(0, 0); - } - else { - // FIXME there isn't actually debug info here - DebugLoc dl = Op.getDebugLoc(); - const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); - SDValue BAGOTOffset = DAG.getBlockAddress(BA, MVT::i32, true, - MipsII::MO_GOT); - SDValue BALOOffset = DAG.getBlockAddress(BA, MVT::i32, true, - MipsII::MO_ABS_LO); - SDValue Load = DAG.getLoad(MVT::i32, dl, - DAG.getEntryNode(), BAGOTOffset, - MachinePointerInfo(), false, false, 0); - SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, BALOOffset); - return DAG.getNode(ISD::ADD, dl, MVT::i32, Load, Lo); + // %hi/%lo relocation + SDValue BAHi = DAG.getBlockAddress(BA, MVT::i32, true, + MipsII::MO_ABS_HI); + SDValue BALo = DAG.getBlockAddress(BA, MVT::i32, true, + MipsII::MO_ABS_LO); + SDValue Hi = DAG.getNode(MipsISD::Hi, dl, MVT::i32, BAHi); + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, BALo); + return DAG.getNode(ISD::ADD, dl, MVT::i32, Hi, Lo); } + + SDValue BAGOTOffset = DAG.getBlockAddress(BA, MVT::i32, true, + MipsII::MO_GOT); + BAGOTOffset = DAG.getNode(MipsISD::WrapperPIC, dl, MVT::i32, BAGOTOffset); + SDValue BALOOffset = DAG.getBlockAddress(BA, MVT::i32, true, + MipsII::MO_ABS_LO); + SDValue Load = DAG.getLoad(MVT::i32, dl, + DAG.getEntryNode(), BAGOTOffset, + MachinePointerInfo(), false, false, 0); + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, BALOOffset); + return DAG.getNode(ISD::ADD, dl, MVT::i32, Load, Lo); } SDValue MipsTargetLowering:: LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { - llvm_unreachable("TLS not implemented for MIPS."); - return SDValue(); // Not reached + // If the relocation model is PIC, use the General Dynamic TLS Model, + // otherwise use the Initial Exec or Local Exec TLS Model. 
+ // TODO: implement Local Dynamic TLS model + + GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + DebugLoc dl = GA->getDebugLoc(); + const GlobalValue *GV = GA->getGlobal(); + EVT PtrVT = getPointerTy(); + + if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { + // General Dynamic TLS Model + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, + 0, MipsII::MO_TLSGD); + SDValue Tlsgd = DAG.getNode(MipsISD::TlsGd, dl, MVT::i32, TGA); + SDValue GP = DAG.getRegister(Mips::GP, MVT::i32); + SDValue Argument = DAG.getNode(ISD::ADD, dl, MVT::i32, GP, Tlsgd); + + ArgListTy Args; + ArgListEntry Entry; + Entry.Node = Argument; + Entry.Ty = (const Type *) Type::getInt32Ty(*DAG.getContext()); + Args.push_back(Entry); + std::pair<SDValue, SDValue> CallResult = + LowerCallTo(DAG.getEntryNode(), + (const Type *) Type::getInt32Ty(*DAG.getContext()), + false, false, false, false, + 0, CallingConv::C, false, true, + DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl); + + return CallResult.first; + } else { + SDValue Offset; + if (GV->isDeclaration()) { + // Initial Exec TLS Model + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0, + MipsII::MO_GOTTPREL); + Offset = DAG.getLoad(MVT::i32, dl, + DAG.getEntryNode(), TGA, MachinePointerInfo(), + false, false, 0); + } else { + // Local Exec TLS Model + SDVTList VTs = DAG.getVTList(MVT::i32); + SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0, + MipsII::MO_TPREL_HI); + SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0, + MipsII::MO_TPREL_LO); + SDValue Hi = DAG.getNode(MipsISD::TprelHi, dl, VTs, &TGAHi, 1); + SDValue Lo = DAG.getNode(MipsISD::TprelLo, dl, MVT::i32, TGALo); + Offset = DAG.getNode(ISD::ADD, dl, MVT::i32, Hi, Lo); + } + + SDValue ThreadPointer = DAG.getNode(MipsISD::ThreadPointer, dl, PtrVT); + return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); + } } SDValue MipsTargetLowering:: @@ -844,12 +1420,15 @@ LowerJumpTable(SDValue Op, SelectionDAG &DAG) const if (!IsPIC) { SDValue Ops[] = { JTI }; HiPart = DAG.getNode(MipsISD::Hi, dl, DAG.getVTList(MVT::i32), Ops, 1); - } else // Emit Load from Global Pointer + } else {// Emit Load from Global Pointer + JTI = DAG.getNode(MipsISD::WrapperPIC, dl, MVT::i32, JTI); HiPart = DAG.getLoad(MVT::i32, dl, DAG.getEntryNode(), JTI, MachinePointerInfo(), false, false, 0); + } - SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MipsII::MO_ABS_LO); + SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, + MipsII::MO_ABS_LO); SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, JTILo); ResNode = DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, Lo); @@ -867,7 +1446,7 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG) const // gp_rel relocation // FIXME: we should reference the constant pool using small data sections, - // but the asm printer currently doens't support this feature without + // but the asm printer currently doesn't support this feature without // hacking it. This feature should come soon so we can uncomment the // stuff below. 
//if (IsInSmallSection(C->getType())) { @@ -886,6 +1465,7 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG) const } else { SDValue CP = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(), N->getOffset(), MipsII::MO_GOT); + CP = DAG.getNode(MipsISD::WrapperPIC, dl, MVT::i32, CP); SDValue Load = DAG.getLoad(MVT::i32, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(), false, false, 0); @@ -914,6 +1494,74 @@ SDValue MipsTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { false, false, 0); } +static SDValue LowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG) { + // FIXME: Use ext/ins instructions if target architecture is Mips32r2. + DebugLoc dl = Op.getDebugLoc(); + SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op.getOperand(0)); + SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op.getOperand(1)); + SDValue And0 = DAG.getNode(ISD::AND, dl, MVT::i32, Op0, + DAG.getConstant(0x7fffffff, MVT::i32)); + SDValue And1 = DAG.getNode(ISD::AND, dl, MVT::i32, Op1, + DAG.getConstant(0x80000000, MVT::i32)); + SDValue Result = DAG.getNode(ISD::OR, dl, MVT::i32, And0, And1); + return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Result); +} + +static SDValue LowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG, bool isLittle) { + // FIXME: + // Use ext/ins instructions if target architecture is Mips32r2. + // Eliminate redundant mfc1 and mtc1 instructions. + unsigned LoIdx = 0, HiIdx = 1; + + if (!isLittle) + std::swap(LoIdx, HiIdx); + + DebugLoc dl = Op.getDebugLoc(); + SDValue Word0 = DAG.getNode(MipsISD::ExtractElementF64, dl, MVT::i32, + Op.getOperand(0), + DAG.getConstant(LoIdx, MVT::i32)); + SDValue Hi0 = DAG.getNode(MipsISD::ExtractElementF64, dl, MVT::i32, + Op.getOperand(0), DAG.getConstant(HiIdx, MVT::i32)); + SDValue Hi1 = DAG.getNode(MipsISD::ExtractElementF64, dl, MVT::i32, + Op.getOperand(1), DAG.getConstant(HiIdx, MVT::i32)); + SDValue And0 = DAG.getNode(ISD::AND, dl, MVT::i32, Hi0, + DAG.getConstant(0x7fffffff, MVT::i32)); + SDValue And1 = DAG.getNode(ISD::AND, dl, MVT::i32, Hi1, + DAG.getConstant(0x80000000, MVT::i32)); + SDValue Word1 = DAG.getNode(ISD::OR, dl, MVT::i32, And0, And1); + + if (!isLittle) + std::swap(Word0, Word1); + + return DAG.getNode(MipsISD::BuildPairF64, dl, MVT::f64, Word0, Word1); +} + +SDValue MipsTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) + const { + EVT Ty = Op.getValueType(); + + assert(Ty == MVT::f32 || Ty == MVT::f64); + + if (Ty == MVT::f32) + return LowerFCOPYSIGN32(Op, DAG); + else + return LowerFCOPYSIGN64(Op, DAG, Subtarget->isLittle()); +} + +SDValue MipsTargetLowering:: +LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { + // check the depth + assert((cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0) && + "Frame address can only be determined for current frame."); + + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, Mips::FP, VT); + return FrameAddr; +} + //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// @@ -931,6 +1579,8 @@ SDValue MipsTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { // yet to hold an argument. Otherwise, use A2, A3 and stack. If A1 is // not used, it must be shadowed. If only A3 is avaiable, shadow it and // go to stack. 
+// +// For vararg functions, all arguments are passed in A0, A1, A2, A3 and stack. //===----------------------------------------------------------------------===// static bool CC_MipsO32(unsigned ValNo, MVT ValVT, @@ -949,90 +1599,17 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, Mips::D6, Mips::D7 }; - unsigned Reg = 0; - static bool IntRegUsed = false; - - // This must be the first arg of the call if no regs have been allocated. - // Initialize IntRegUsed in that case. - if (IntRegs[State.getFirstUnallocated(IntRegs, IntRegsSize)] == Mips::A0 && - F32Regs[State.getFirstUnallocated(F32Regs, FloatRegsSize)] == Mips::F12 && - F64Regs[State.getFirstUnallocated(F64Regs, FloatRegsSize)] == Mips::D6) - IntRegUsed = false; - - // Promote i8 and i16 - if (LocVT == MVT::i8 || LocVT == MVT::i16) { - LocVT = MVT::i32; - if (ArgFlags.isSExt()) - LocInfo = CCValAssign::SExt; - else if (ArgFlags.isZExt()) - LocInfo = CCValAssign::ZExt; - else - LocInfo = CCValAssign::AExt; + // ByVal Args + if (ArgFlags.isByVal()) { + State.HandleByVal(ValNo, ValVT, LocVT, LocInfo, + 1 /*MinSize*/, 4 /*MinAlign*/, ArgFlags); + unsigned NextReg = (State.getNextStackOffset() + 3) / 4; + for (unsigned r = State.getFirstUnallocated(IntRegs, IntRegsSize); + r < std::min(IntRegsSize, NextReg); ++r) + State.AllocateReg(IntRegs[r]); + return false; } - if (ValVT == MVT::i32) { - Reg = State.AllocateReg(IntRegs, IntRegsSize); - IntRegUsed = true; - } else if (ValVT == MVT::f32) { - // An int reg has to be marked allocated regardless of whether or not - // IntRegUsed is true. - Reg = State.AllocateReg(IntRegs, IntRegsSize); - - if (IntRegUsed) { - if (Reg) // Int reg is available - LocVT = MVT::i32; - } else { - unsigned FReg = State.AllocateReg(F32Regs, FloatRegsSize); - if (FReg) // F32 reg is available - Reg = FReg; - else if (Reg) // No F32 regs are available, but an int reg is available. - LocVT = MVT::i32; - } - } else if (ValVT == MVT::f64) { - // Int regs have to be marked allocated regardless of whether or not - // IntRegUsed is true. - Reg = State.AllocateReg(IntRegs, IntRegsSize); - if (Reg == Mips::A1) - Reg = State.AllocateReg(IntRegs, IntRegsSize); - else if (Reg == Mips::A3) - Reg = 0; - State.AllocateReg(IntRegs, IntRegsSize); - - // At this point, Reg is A0, A2 or 0, and all the unavailable integer regs - // are marked as allocated. - if (IntRegUsed) { - if (Reg)// if int reg is available - LocVT = MVT::i32; - } else { - unsigned FReg = State.AllocateReg(F64Regs, FloatRegsSize); - if (FReg) // F64 reg is available. - Reg = FReg; - else if (Reg) // No F64 regs are available, but an int reg is available. 
-        LocVT = MVT::i32;
-    }
-  } else
-    assert(false && "cannot handle this ValVT");
-
-  if (!Reg) {
-    unsigned SizeInBytes = ValVT.getSizeInBits() >> 3;
-    unsigned Offset = State.AllocateStack(SizeInBytes, SizeInBytes);
-    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
-  } else
-    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
-
-  return false; // CC must always match
-}
-
-static bool CC_MipsO32_VarArgs(unsigned ValNo, MVT ValVT,
-                               MVT LocVT, CCValAssign::LocInfo LocInfo,
-                               ISD::ArgFlagsTy ArgFlags, CCState &State) {
-
-  static const unsigned IntRegsSize=4;
-
-  static const unsigned IntRegs[] = {
-      Mips::A0, Mips::A1, Mips::A2, Mips::A3
-  };
-
   // Promote i8 and i16
   if (LocVT == MVT::i8 || LocVT == MVT::i16) {
     LocVT = MVT::i32;
@@ -1046,23 +1623,52 @@ static bool CC_MipsO32_VarArgs(unsigned ValNo, MVT ValVT,
 
   unsigned Reg;
 
-  if (ValVT == MVT::i32 || ValVT == MVT::f32) {
+  // f32 and f64 are allocated in A0, A1, A2, A3 when any of the following
+  // is true: the function is vararg, the argument is the 3rd or higher, or
+  // a previous argument is not f32 or f64.
+  bool AllocateFloatsInIntReg = State.isVarArg() || ValNo > 1
+      || State.getFirstUnallocated(F32Regs, FloatRegsSize) != ValNo;
+  unsigned OrigAlign = ArgFlags.getOrigAlign();
+  bool isI64 = (ValVT == MVT::i32 && OrigAlign == 8);
+
+  if (ValVT == MVT::i32 || (ValVT == MVT::f32 && AllocateFloatsInIntReg)) {
     Reg = State.AllocateReg(IntRegs, IntRegsSize);
+    // If this is the first part of an i64 arg,
+    // the allocated register must be either A0 or A2.
+    if (isI64 && (Reg == Mips::A1 || Reg == Mips::A3))
+      Reg = State.AllocateReg(IntRegs, IntRegsSize);
     LocVT = MVT::i32;
-  } else if (ValVT == MVT::f64) {
+  } else if (ValVT == MVT::f64 && AllocateFloatsInIntReg) {
+    // Allocate int register and shadow next int register. If first
+    // available register is Mips::A1 or Mips::A3, shadow it too.
Reg = State.AllocateReg(IntRegs, IntRegsSize); if (Reg == Mips::A1 || Reg == Mips::A3) Reg = State.AllocateReg(IntRegs, IntRegsSize); State.AllocateReg(IntRegs, IntRegsSize); LocVT = MVT::i32; + } else if (ValVT.isFloatingPoint() && !AllocateFloatsInIntReg) { + // we are guaranteed to find an available float register + if (ValVT == MVT::f32) { + Reg = State.AllocateReg(F32Regs, FloatRegsSize); + // Shadow int register + State.AllocateReg(IntRegs, IntRegsSize); + } else { + Reg = State.AllocateReg(F64Regs, FloatRegsSize); + // Shadow int registers + unsigned Reg2 = State.AllocateReg(IntRegs, IntRegsSize); + if (Reg2 == Mips::A1 || Reg2 == Mips::A3) + State.AllocateReg(IntRegs, IntRegsSize); + State.AllocateReg(IntRegs, IntRegsSize); + } } else llvm_unreachable("Cannot handle this ValVT."); - if (!Reg) { - unsigned SizeInBytes = ValVT.getSizeInBits() >> 3; - unsigned Offset = State.AllocateStack(SizeInBytes, SizeInBytes); + unsigned SizeInBytes = ValVT.getSizeInBits() >> 3; + unsigned Offset = State.AllocateStack(SizeInBytes, OrigAlign); + + if (!Reg) State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); - } else + else State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; // CC must always match @@ -1072,6 +1678,56 @@ static bool CC_MipsO32_VarArgs(unsigned ValNo, MVT ValVT, // Call Calling Convention Implementation //===----------------------------------------------------------------------===// +static const unsigned O32IntRegsSize = 4; + +static const unsigned O32IntRegs[] = { + Mips::A0, Mips::A1, Mips::A2, Mips::A3 +}; + +// Write ByVal Arg to arg registers and stack. +static void +WriteByValArg(SDValue& Chain, DebugLoc dl, + SmallVector<std::pair<unsigned, SDValue>, 16>& RegsToPass, + SmallVector<SDValue, 8>& MemOpChains, int& LastFI, + MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg, + const CCValAssign &VA, const ISD::ArgFlagsTy& Flags, + MVT PtrType) { + unsigned FirstWord = VA.getLocMemOffset() / 4; + unsigned NumWords = (Flags.getByValSize() + 3) / 4; + unsigned LastWord = FirstWord + NumWords; + unsigned CurWord; + + // copy the first 4 words of byval arg to registers A0 - A3 + for (CurWord = FirstWord; CurWord < std::min(LastWord, O32IntRegsSize); + ++CurWord) { + SDValue LoadPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, Arg, + DAG.getConstant((CurWord - FirstWord) * 4, + MVT::i32)); + SDValue LoadVal = DAG.getLoad(MVT::i32, dl, Chain, LoadPtr, + MachinePointerInfo(), + false, false, 0); + MemOpChains.push_back(LoadVal.getValue(1)); + unsigned DstReg = O32IntRegs[CurWord]; + RegsToPass.push_back(std::make_pair(DstReg, LoadVal)); + } + + // copy remaining part of byval arg to stack. + if (CurWord < LastWord) { + unsigned SizeInBytes = (LastWord - CurWord) * 4; + SDValue Src = DAG.getNode(ISD::ADD, dl, MVT::i32, Arg, + DAG.getConstant((CurWord - FirstWord) * 4, + MVT::i32)); + LastFI = MFI->CreateFixedObject(SizeInBytes, CurWord * 4, true); + SDValue Dst = DAG.getFrameIndex(LastFI, PtrType); + Chain = DAG.getMemcpy(Chain, dl, Dst, Src, + DAG.getConstant(SizeInBytes, MVT::i32), + /*Align*/4, + /*isVolatile=*/false, /*AlwaysInline=*/false, + MachinePointerInfo(0), MachinePointerInfo(0)); + MemOpChains.push_back(Chain); + } +} + /// LowerCall - functions arguments are copied from virtual regs to /// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted. /// TODO: isTailCall. 
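WriteByValArg above splits a byval aggregate at word granularity: whichever of its first words fall inside A0-A3 are loaded and passed in registers, and the tail is memcpy'd to the outgoing-argument area. A small C++ sketch of the index arithmetic (a hypothetical helper for illustration, not a function in the patch):

    #include <algorithm>
    #include <cstddef>

    // Given the O32 location offset of a byval argument and its size,
    // compute how many words go in registers (starting at A0 + FirstWord)
    // and how many bytes spill to the stack.
    void splitByVal(unsigned LocMemOffset, size_t ByValSize,
                    unsigned &RegWords, unsigned &StackBytes) {
      unsigned FirstWord = LocMemOffset / 4;       // index of the first A register
      unsigned NumWords  = (ByValSize + 3) / 4;    // size rounded up to words
      unsigned LastWord  = FirstWord + NumWords;
      unsigned RegEnd    = std::min(LastWord, 4u); // A0..A3 are four registers
      RegWords   = RegEnd > FirstWord ? RegEnd - FirstWord : 0;
      StackBytes = (LastWord - RegEnd) * 4;        // tail copied via memcpy
    }

    // e.g. a 12-byte struct at offset 8 occupies A2 and A3 plus 4 stack bytes.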
@@ -1089,35 +1745,57 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee, MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); + const TargetFrameLowering *TFL = MF.getTarget().getFrameLowering(); bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_; + MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, - *DAG.getContext()); - - // To meet O32 ABI, Mips must always allocate 16 bytes on - // the stack (even if less than 4 are used as arguments) - if (Subtarget->isABI_O32()) { - int VTsize = MVT(MVT::i32).getSizeInBits()/8; - MFI->CreateFixedObject(VTsize, (VTsize*3), true); - CCInfo.AnalyzeCallOperands(Outs, - isVarArg ? CC_MipsO32_VarArgs : CC_MipsO32); - } else + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + if (Subtarget->isABI_O32()) + CCInfo.AnalyzeCallOperands(Outs, CC_MipsO32); + else CCInfo.AnalyzeCallOperands(Outs, CC_Mips); // Get a count of how many bytes are to be pushed on the stack. - unsigned NumBytes = CCInfo.getNextStackOffset(); - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); + unsigned NextStackOffset = CCInfo.getNextStackOffset(); + + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NextStackOffset, + true)); + + // If this is the first call, create a stack frame object that points to + // a location to which .cprestore saves $gp. + if (IsPIC && !MipsFI->getGPFI()) + MipsFI->setGPFI(MFI->CreateFixedObject(4, 0, true)); + + // Update size of the maximum argument space. + // For O32, a minimum of four words (16 bytes) of argument space is + // allocated. + if (Subtarget->isABI_O32()) + NextStackOffset = std::max(NextStackOffset, (unsigned)16); + + unsigned MaxCallFrameSize = MipsFI->getMaxCallFrameSize(); + + if (MaxCallFrameSize < NextStackOffset) { + MipsFI->setMaxCallFrameSize(NextStackOffset); + + if (IsPIC) { + // $gp restore slot must be aligned. + unsigned StackAlignment = TFL->getStackAlignment(); + NextStackOffset = (NextStackOffset + StackAlignment - 1) / + StackAlignment * StackAlignment; + int GPFI = MipsFI->getGPFI(); + MFI->setObjectOffset(GPFI, NextStackOffset); + } + } // With EABI is it possible to have 16 args on registers. SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass; SmallVector<SDValue, 8> MemOpChains; - // First/LastArgStackLoc contains the first/last - // "at stack" argument location. - int LastArgStackLoc = 0; - unsigned FirstStackArgLoc = (Subtarget->isABI_EABI() ? 0 : 16); + int FirstFI = -MFI->getNumFixedObjects() - 1, LastFI = 0; // Walk the register/memloc assignments, inserting copies/loads. 
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
@@ -1132,11 +1810,12 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
       if (VA.getValVT() == MVT::f32 && VA.getLocVT() == MVT::i32)
         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
       if (VA.getValVT() == MVT::f64 && VA.getLocVT() == MVT::i32) {
-        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
-        SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Arg,
-                                 DAG.getConstant(0, getPointerTy()));
-        SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Arg,
-                                 DAG.getConstant(1, getPointerTy()));
+        SDValue Lo = DAG.getNode(MipsISD::ExtractElementF64, dl, MVT::i32,
+                                 Arg, DAG.getConstant(0, MVT::i32));
+        SDValue Hi = DAG.getNode(MipsISD::ExtractElementF64, dl, MVT::i32,
+                                 Arg, DAG.getConstant(1, MVT::i32));
+        if (!Subtarget->isLittle())
+          std::swap(Lo, Hi);
         RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
         RegsToPass.push_back(std::make_pair(VA.getLocReg()+1, Hi));
         continue;
@@ -1164,15 +1843,22 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     // Register can't get to this point...
     assert(VA.isMemLoc());
 
-    // Create the frame index object for this incoming parameter
-    // This guarantees that when allocating Local Area the firsts
-    // 16 bytes which are alwayes reserved won't be overwritten
-    // if O32 ABI is used. For EABI the first address is zero.
-    LastArgStackLoc = (FirstStackArgLoc + VA.getLocMemOffset());
-    int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
-                                    LastArgStackLoc, true);
+    // ByVal Arg.
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    if (Flags.isByVal()) {
+      assert(Subtarget->isABI_O32() &&
+             "No support for ByVal args by ABIs other than O32 yet.");
+      assert(Flags.getByValSize() &&
+             "ByVal args of size 0 should have been ignored by front-end.");
+      WriteByValArg(Chain, dl, RegsToPass, MemOpChains, LastFI, MFI, DAG, Arg,
+                    VA, Flags, getPointerTy());
+      continue;
+    }
 
-    SDValue PtrOff = DAG.getFrameIndex(FI,getPointerTy());
+    // Create the frame index object for this incoming parameter
+    LastFI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
+                                    VA.getLocMemOffset(), true);
+    SDValue PtrOff = DAG.getFrameIndex(LastFI, getPointerTy());
 
     // emit ISD::STORE which stores the
     // parameter value to a stack location
@@ -1181,23 +1867,18 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                                        false, false, 0));
   }
 
+  // Extend range of indices of frame objects for outgoing arguments that were
+  // created during this function call. Skip this step if no such objects were
+  // created.
+  if (LastFI)
+    MipsFI->extendOutArgFIRange(FirstFI, LastFI);
+
   // Transform all store nodes into one single node because all store
   // nodes are independent of each other.
   if (!MemOpChains.empty())
     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                         &MemOpChains[0], MemOpChains.size());
 
-  // Build a sequence of copy-to-reg nodes chained together with token
-  // chain and flag operands which copy the outgoing args into registers.
-  // The InFlag in necessary since all emited instructions must be
-  // stuck together.
-  SDValue InFlag;
-  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
-    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
-                             RegsToPass[i].second, InFlag);
-    InFlag = Chain.getValue(1);
-  }
-
   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
   // node so that legalize doesn't hack it.
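Earlier in LowerCall, NextStackOffset is rounded up so the $gp restore slot sits on a stack-alignment boundary, and LowerDYNAMIC_STACKALLOC rounds MaxCallFrameSize the same way for SPOffset. Spelled out as a standalone C++ helper (an illustration of the idiom, not a function in the patch):

    #include <cassert>

    // Round `offset` up to the next multiple of `align`. The divide/multiply
    // form used in the patch works for any non-zero alignment, not just
    // powers of two.
    unsigned roundUpToAlignment(unsigned offset, unsigned align) {
      assert(align != 0 && "alignment must be non-zero");
      return (offset + align - 1) / align * align;
    }

    // e.g. roundUpToAlignment(20, 8) == 24; roundUpToAlignment(16, 8) == 16.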
@@ -1224,10 +1905,13 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
       LoadSymAddr = true;
     }
 
+  SDValue InFlag;
+
   // Create nodes that load address of callee and copy it to T9
   if (IsPIC) {
     if (LoadSymAddr) {
       // Load callee address
+      Callee = DAG.getNode(MipsISD::WrapperPIC, dl, MVT::i32, Callee);
       SDValue LoadValue = DAG.getLoad(MVT::i32, dl, Chain, Callee,
                                       MachinePointerInfo::getGOT(),
                                       false, false, 0);
@@ -1239,7 +1923,7 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
       } else
         Callee = LoadValue;
 
-      // Use chain output from LoadValue 
+      // Use chain output from LoadValue
       Chain = LoadValue.getValue(1);
     }
 
@@ -1249,6 +1933,16 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     Callee = DAG.getRegister(Mips::T9, MVT::i32);
   }
 
+  // Build a sequence of copy-to-reg nodes chained together with token
+  // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag is necessary since all emitted instructions must be
+  // stuck together.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
   // MipsJmpLink = #chain, #target_address, #opt_in_flags...
   //             = Chain, Callee, Reg#1, Reg#2, ...
   //
@@ -1270,39 +1964,8 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   Chain  = DAG.getNode(MipsISD::JmpLink, dl, NodeTys, &Ops[0], Ops.size());
   InFlag = Chain.getValue(1);
 
-  // Create a stack location to hold GP when PIC is used. This stack
-  // location is used on function prologue to save GP and also after all
-  // emited CALL's to restore GP.
-  if (IsPIC) {
-    // Function can have an arbitrary number of calls, so
-    // hold the LastArgStackLoc with the biggest offset.
-    int FI;
-    MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
-    if (LastArgStackLoc >= MipsFI->getGPStackOffset()) {
-      LastArgStackLoc = (!LastArgStackLoc) ? (16) : (LastArgStackLoc+4);
-      // Create the frame index only once. SPOffset here can be anything
-      // (this will be fixed on processFunctionBeforeFrameFinalized)
-      if (MipsFI->getGPStackOffset() == -1) {
-        FI = MFI->CreateFixedObject(4, 0, true);
-        MipsFI->setGPFI(FI);
-      }
-      MipsFI->setGPStackOffset(LastArgStackLoc);
-    }
-
-    // Reload GP value.
-    FI = MipsFI->getGPFI();
-    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
-    SDValue GPLoad = DAG.getLoad(MVT::i32, dl, Chain, FIN,
-                                 MachinePointerInfo::getFixedStack(FI),
-                                 false, false, 0);
-    Chain = GPLoad.getValue(1);
-    Chain = DAG.getCopyToReg(Chain, dl, DAG.getRegister(Mips::GP, MVT::i32),
-                             GPLoad, SDValue(0,0));
-    InFlag = Chain.getValue(1);
-  }
-
   // Create the CALLSEQ_END node.
-  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NextStackOffset, true),
                              DAG.getIntPtrConstant(0, true), InFlag);
   InFlag = Chain.getValue(1);
 
@@ -1320,11 +1983,10 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
                                     DebugLoc dl, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &InVals) const {
-  // Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_Mips); @@ -1342,18 +2004,41 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, //===----------------------------------------------------------------------===// // Formal Arguments Calling Convention Implementation //===----------------------------------------------------------------------===// +static void ReadByValArg(MachineFunction &MF, SDValue Chain, DebugLoc dl, + std::vector<SDValue>& OutChains, + SelectionDAG &DAG, unsigned NumWords, SDValue FIN, + const CCValAssign &VA, const ISD::ArgFlagsTy& Flags) { + unsigned LocMem = VA.getLocMemOffset(); + unsigned FirstWord = LocMem / 4; + + // copy register A0 - A3 to frame object + for (unsigned i = 0; i < NumWords; ++i) { + unsigned CurWord = FirstWord + i; + if (CurWord >= O32IntRegsSize) + break; + + unsigned SrcReg = O32IntRegs[CurWord]; + unsigned Reg = AddLiveIn(MF, SrcReg, Mips::CPURegsRegisterClass); + SDValue StorePtr = DAG.getNode(ISD::ADD, dl, MVT::i32, FIN, + DAG.getConstant(i * 4, MVT::i32)); + SDValue Store = DAG.getStore(Chain, dl, DAG.getRegister(Reg, MVT::i32), + StorePtr, MachinePointerInfo(), false, + false, 0); + OutChains.push_back(Store); + } +} /// LowerFormalArguments - transform physical registers into virtual registers /// and generate load operations for arguments places on the stack. SDValue MipsTargetLowering::LowerFormalArguments(SDValue Chain, - CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> - &Ins, - DebugLoc dl, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> + &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { - MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); @@ -1363,23 +2048,17 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, // Used with vargs to acumulate store chains. std::vector<SDValue> OutChains; - // Keep track of the last register used for arguments - unsigned ArgRegEnd = 0; - // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); if (Subtarget->isABI_O32()) - CCInfo.AnalyzeFormalArguments(Ins, - isVarArg ? CC_MipsO32_VarArgs : CC_MipsO32); + CCInfo.AnalyzeFormalArguments(Ins, CC_MipsO32); else CCInfo.AnalyzeFormalArguments(Ins, CC_Mips); - unsigned FirstStackArgLoc = (Subtarget->isABI_EABI() ? 0 : 16); - unsigned LastStackArgEndOffset = 0; - EVT LastRegArgValVT; + int LastFI = 0;// MipsFI->LastInArgFI is 0 at the entry of this function. 
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -1387,8 +2066,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, // Arguments stored on registers if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); - ArgRegEnd = VA.getLocReg(); - LastRegArgValVT = VA.getValVT(); + unsigned ArgReg = VA.getLocReg(); TargetRegisterClass *RC = 0; if (RegVT == MVT::i32) @@ -1403,7 +2081,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, // Transform the arguments stored on // physical registers into virtual ones - unsigned Reg = AddLiveIn(DAG.getMachineFunction(), ArgRegEnd, RC); + unsigned Reg = AddLiveIn(DAG.getMachineFunction(), ArgReg, RC); SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); // If this is an 8 or 16-bit value, it has been passed promoted @@ -1429,9 +2107,10 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, unsigned Reg2 = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg()+1, RC); SDValue ArgValue2 = DAG.getCopyFromReg(Chain, dl, Reg2, RegVT); - SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, ArgValue, - ArgValue2); - ArgValue = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Pair); + if (!Subtarget->isLittle()) + std::swap(ArgValue, ArgValue2); + ArgValue = DAG.getNode(MipsISD::BuildPairF64, dl, MVT::f64, + ArgValue, ArgValue2); } } @@ -1441,26 +2120,31 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, // sanity check assert(VA.isMemLoc()); - // The last argument is not a register anymore - ArgRegEnd = 0; + ISD::ArgFlagsTy Flags = Ins[i].Flags; + + if (Flags.isByVal()) { + assert(Subtarget->isABI_O32() && + "No support for ByVal args by ABIs other than O32 yet."); + assert(Flags.getByValSize() && + "ByVal args of size 0 should have been ignored by front-end."); + unsigned NumWords = (Flags.getByValSize() + 3) / 4; + LastFI = MFI->CreateFixedObject(NumWords * 4, VA.getLocMemOffset(), + true); + SDValue FIN = DAG.getFrameIndex(LastFI, getPointerTy()); + InVals.push_back(FIN); + ReadByValArg(MF, Chain, dl, OutChains, DAG, NumWords, FIN, VA, Flags); + + continue; + } // The stack pointer offset is relative to the caller stack frame. - // Since the real stack size is unknown here, a negative SPOffset - // is used so there's a way to adjust these offsets when the stack - // size get known (on EliminateFrameIndex). A dummy SPOffset is - // used instead of a direct negative address (which is recorded to - // be used on emitPrologue) to avoid mis-calc of the first stack - // offset on PEI::calculateFrameObjectOffsets. - unsigned ArgSize = VA.getValVT().getSizeInBits()/8; - LastStackArgEndOffset = FirstStackArgLoc + VA.getLocMemOffset() + ArgSize; - int FI = MFI->CreateFixedObject(ArgSize, 0, true); - MipsFI->recordLoadArgsFI(FI, -(4 + - (FirstStackArgLoc + VA.getLocMemOffset()))); + LastFI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8, + VA.getLocMemOffset(), true); // Create load nodes to retrieve arguments from the stack - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(LastFI, getPointerTy()); InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), + MachinePointerInfo::getFixedStack(LastFI), false, false, 0)); } } @@ -1478,58 +2162,33 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); } - // To meet ABI, when VARARGS are passed on registers, the registers - // must have their values written to the caller stack frame. 
If the last - // argument was placed in the stack, there's no need to save any register. if (isVarArg && Subtarget->isABI_O32()) { - if (ArgRegEnd) { - // Last named formal argument is passed in register. - - // The last register argument that must be saved is Mips::A3 + // Record the frame index of the first variable argument + // which is a value necessary to VASTART. + unsigned NextStackOffset = CCInfo.getNextStackOffset(); + assert(NextStackOffset % 4 == 0 && + "NextStackOffset must be aligned to 4-byte boundaries."); + LastFI = MFI->CreateFixedObject(4, NextStackOffset, true); + MipsFI->setVarArgsFrameIndex(LastFI); + + // If NextStackOffset is smaller than o32's 16-byte reserved argument area, + // copy the integer registers that have not been used for argument passing + // to the caller's stack frame. + for (; NextStackOffset < 16; NextStackOffset += 4) { TargetRegisterClass *RC = Mips::CPURegsRegisterClass; - if (LastRegArgValVT == MVT::f64) - ArgRegEnd++; - - if (ArgRegEnd < Mips::A3) { - // Both the last named formal argument and the first variable - // argument are passed in registers. - for (++ArgRegEnd; ArgRegEnd <= Mips::A3; ++ArgRegEnd) { - unsigned Reg = AddLiveIn(DAG.getMachineFunction(), ArgRegEnd, RC); - SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, MVT::i32); - - int FI = MFI->CreateFixedObject(4, 0, true); - MipsFI->recordStoreVarArgsFI(FI, -(4+(ArgRegEnd-Mips::A0)*4)); - SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy()); - OutChains.push_back(DAG.getStore(Chain, dl, ArgValue, PtrOff, - MachinePointerInfo(), - false, false, 0)); - - // Record the frame index of the first variable argument - // which is a value necessary to VASTART. - if (!MipsFI->getVarArgsFrameIndex()) { - MFI->setObjectAlignment(FI, 4); - MipsFI->setVarArgsFrameIndex(FI); - } - } - } else { - // Last named formal argument is in register Mips::A3, and the first - // variable argument is on stack. Record the frame index of the first - // variable argument. - int FI = MFI->CreateFixedObject(4, 0, true); - MFI->setObjectAlignment(FI, 4); - MipsFI->recordStoreVarArgsFI(FI, -20); - MipsFI->setVarArgsFrameIndex(FI); - } - } else { - // Last named formal argument and all the variable arguments are passed - // on stack. Record the frame index of the first variable argument. - int FI = MFI->CreateFixedObject(4, 0, true); - MFI->setObjectAlignment(FI, 4); - MipsFI->recordStoreVarArgsFI(FI, -(4+LastStackArgEndOffset)); - MipsFI->setVarArgsFrameIndex(FI); + unsigned Idx = NextStackOffset / 4; + unsigned Reg = AddLiveIn(DAG.getMachineFunction(), O32IntRegs[Idx], RC); + SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, MVT::i32); + LastFI = MFI->CreateFixedObject(4, NextStackOffset, true); + SDValue PtrOff = DAG.getFrameIndex(LastFI, getPointerTy()); + OutChains.push_back(DAG.getStore(Chain, dl, ArgValue, PtrOff, + MachinePointerInfo(), + false, false, 0)); } } + MipsFI->setLastInArgFI(LastFI); + // All stores are grouped in one node to allow the matching between // the size of Ins and InVals. This only happens when on varg functions if (!OutChains.empty()) { @@ -1557,8 +2216,8 @@ MipsTargetLowering::LowerReturn(SDValue Chain, SmallVector<CCValAssign, 16> RVLocs; // CCState - Info about the registers and stack slot. - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); // Analize return values. 
CCInfo.AnalyzeReturn(Outs, RetCC_Mips); diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index 95f8d77..fbcedfd 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -40,6 +40,16 @@ namespace llvm { // Handle gp_rel (small data/bss sections) relocation. GPRel, + // General Dynamic TLS + TlsGd, + + // Local Exec TLS + TprelHi, + TprelLo, + + // Thread Pointer + ThreadPointer, + // Floating Point Branch Conditional FPBrcond, @@ -64,7 +74,12 @@ namespace llvm { // DivRem(u) DivRem, - DivRemU + DivRemU, + + BuildPairF64, + ExtractElementF64, + + WrapperPIC }; } @@ -86,9 +101,6 @@ namespace llvm { /// getSetCCResultType - get the ISD::SETCC result ValueType MVT::SimpleValueType getSetCCResultType(EVT VT) const; - /// getFunctionAlignment - Return the Log2 alignment of this function. - virtual unsigned getFunctionAlignment(const Function *F) const; - virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; private: // Subtarget Info @@ -106,13 +118,14 @@ namespace llvm { SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; virtual SDValue LowerFormalArguments(SDValue Chain, @@ -164,6 +177,16 @@ namespace llvm { /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const; + + MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, + unsigned Size, unsigned BinOpcode, bool Nand = false) const; + MachineBasicBlock *EmitAtomicBinaryPartword(MachineInstr *MI, + MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode, + bool Nand = false) const; + MachineBasicBlock *EmitAtomicCmpSwap(MachineInstr *MI, + MachineBasicBlock *BB, unsigned Size) const; + MachineBasicBlock *EmitAtomicCmpSwapPartword(MachineInstr *MI, + MachineBasicBlock *BB, unsigned Size) const; }; } diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index 251f377..021c167 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file contains the Mips implementation of the TargetInstrInfo class. +// This file describes the Mips FPU instruction set. 
// //===----------------------------------------------------------------------===// @@ -30,6 +30,12 @@ def SDT_MipsFPCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>, SDTCisFP<1>, SDTCisInt<2>]>; def SDT_MipsCMovFP : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>]>; +def SDT_MipsBuildPairF64 : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, + SDTCisVT<1, i32>, + SDTCisSameAs<1, 2>]>; +def SDT_MipsExtractElementF64 : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, + SDTCisVT<1, f64>, + SDTCisVT<0, i32>]>; def MipsFPCmp : SDNode<"MipsISD::FPCmp", SDT_MipsFPCmp, [SDNPOutGlue]>; def MipsCMovFP_T : SDNode<"MipsISD::CMovFP_T", SDT_MipsCMovFP, [SDNPInGlue]>; @@ -37,6 +43,9 @@ def MipsCMovFP_F : SDNode<"MipsISD::CMovFP_F", SDT_MipsCMovFP, [SDNPInGlue]>; def MipsFPRound : SDNode<"MipsISD::FPRound", SDTFPRoundOp, [SDNPOptInGlue]>; def MipsFPBrcond : SDNode<"MipsISD::FPBrcond", SDT_MipsFPBrcond, [SDNPHasChain, SDNPOptInGlue]>; +def MipsBuildPairF64 : SDNode<"MipsISD::BuildPairF64", SDT_MipsBuildPairF64>; +def MipsExtractElementF64 : SDNode<"MipsISD::ExtractElementF64", + SDT_MipsExtractElementF64>; // Operand for printing out a condition code. let PrintMethod = "printFCCOperand" in @@ -68,40 +77,42 @@ def IsNotMipsI : Predicate<"!Subtarget.isMips1()">; multiclass FFR1_1<bits<6> funct, string asmstr> { def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), (ins FGR32:$fs), - !strconcat(asmstr, ".s $fd, $fs"), []>; + !strconcat(asmstr, ".s\t$fd, $fs"), []>; def _D32 : FFR<0x11, funct, 0x1, (outs FGR32:$fd), (ins AFGR64:$fs), - !strconcat(asmstr, ".d $fd, $fs"), []>, Requires<[In32BitMode]>; + !strconcat(asmstr, ".d\t$fd, $fs"), []>, Requires<[In32BitMode]>; } multiclass FFR1_2<bits<6> funct, string asmstr, SDNode FOp> { def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), (ins FGR32:$fs), - !strconcat(asmstr, ".s $fd, $fs"), + !strconcat(asmstr, ".s\t$fd, $fs"), [(set FGR32:$fd, (FOp FGR32:$fs))]>; def _D32 : FFR<0x11, funct, 0x1, (outs AFGR64:$fd), (ins AFGR64:$fs), - !strconcat(asmstr, ".d $fd, $fs"), + !strconcat(asmstr, ".d\t$fd, $fs"), [(set AFGR64:$fd, (FOp AFGR64:$fs))]>, Requires<[In32BitMode]>; } class FFR1_3<bits<6> funct, bits<5> fmt, RegisterClass RcSrc, RegisterClass RcDst, string asmstr>: FFR<0x11, funct, fmt, (outs RcSrc:$fd), (ins RcDst:$fs), - !strconcat(asmstr, " $fd, $fs"), []>; + !strconcat(asmstr, "\t$fd, $fs"), []>; -multiclass FFR1_4<bits<6> funct, string asmstr, SDNode FOp> { +multiclass FFR1_4<bits<6> funct, string asmstr, SDNode FOp, bit isComm = 0> { + let isCommutable = isComm in { def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), (ins FGR32:$fs, FGR32:$ft), - !strconcat(asmstr, ".s $fd, $fs, $ft"), + !strconcat(asmstr, ".s\t$fd, $fs, $ft"), [(set FGR32:$fd, (FOp FGR32:$fs, FGR32:$ft))]>; def _D32 : FFR<0x11, funct, 0x1, (outs AFGR64:$fd), (ins AFGR64:$fs, AFGR64:$ft), - !strconcat(asmstr, ".d $fd, $fs, $ft"), + !strconcat(asmstr, ".d\t$fd, $fs, $ft"), [(set AFGR64:$fd, (FOp AFGR64:$fs, AFGR64:$ft))]>, Requires<[In32BitMode]>; + } } //===----------------------------------------------------------------------===// @@ -161,42 +172,42 @@ let ft = 0 in { let fd = 0 in { /// Move Control Registers From/To CPU Registers def CFC1 : FFR<0x11, 0x0, 0x2, (outs CPURegs:$rt), (ins CCR:$fs), - "cfc1 $rt, $fs", []>; + "cfc1\t$rt, $fs", []>; def CTC1 : FFR<0x11, 0x0, 0x6, (outs CCR:$rt), (ins CPURegs:$fs), - "ctc1 $fs, $rt", []>; + "ctc1\t$fs, $rt", []>; def MFC1 : FFR<0x11, 0x00, 0x00, (outs CPURegs:$rt), (ins FGR32:$fs), - "mfc1 $rt, $fs", []>; + "mfc1\t$rt, $fs", []>; def MTC1 : FFR<0x11, 0x00, 0x04, 
(outs FGR32:$fs), (ins CPURegs:$rt), - "mtc1 $rt, $fs", []>; + "mtc1\t$rt, $fs", []>; } def FMOV_S32 : FFR<0x11, 0b000110, 0x0, (outs FGR32:$fd), (ins FGR32:$fs), - "mov.s $fd, $fs", []>; + "mov.s\t$fd, $fs", []>; def FMOV_D32 : FFR<0x11, 0b000110, 0x1, (outs AFGR64:$fd), (ins AFGR64:$fs), - "mov.d $fd, $fs", []>; + "mov.d\t$fd, $fs", []>; /// Floating Point Memory Instructions let Predicates = [IsNotSingleFloat, IsNotMipsI] in { def LDC1 : FFI<0b110101, (outs AFGR64:$ft), (ins mem:$addr), - "ldc1 $ft, $addr", [(set AFGR64:$ft, (load addr:$addr))]>; + "ldc1\t$ft, $addr", [(set AFGR64:$ft, (load addr:$addr))]>; def SDC1 : FFI<0b111101, (outs), (ins AFGR64:$ft, mem:$addr), - "sdc1 $ft, $addr", [(store AFGR64:$ft, addr:$addr)]>; + "sdc1\t$ft, $addr", [(store AFGR64:$ft, addr:$addr)]>; } -// LWC1 and SWC1 can always be emited with odd registers. -def LWC1 : FFI<0b110001, (outs FGR32:$ft), (ins mem:$addr), "lwc1 $ft, $addr", +// LWC1 and SWC1 can always be emitted with odd registers. +def LWC1 : FFI<0b110001, (outs FGR32:$ft), (ins mem:$addr), "lwc1\t$ft, $addr", [(set FGR32:$ft, (load addr:$addr))]>; -def SWC1 : FFI<0b111001, (outs), (ins FGR32:$ft, mem:$addr), "swc1 $ft, $addr", - [(store FGR32:$ft, addr:$addr)]>; +def SWC1 : FFI<0b111001, (outs), (ins FGR32:$ft, mem:$addr), + "swc1\t$ft, $addr", [(store FGR32:$ft, addr:$addr)]>; /// Floating-point Aritmetic -defm FADD : FFR1_4<0x10, "add", fadd>; +defm FADD : FFR1_4<0x10, "add", fadd, 1>; defm FDIV : FFR1_4<0x03, "div", fdiv>; -defm FMUL : FFR1_4<0x02, "mul", fmul>; +defm FMUL : FFR1_4<0x02, "mul", fmul, 1>; defm FSUB : FFR1_4<0x01, "sub", fsub>; //===----------------------------------------------------------------------===// @@ -212,7 +223,7 @@ def MIPS_BRANCH_TL : PatLeaf<(i32 3)>; /// Floating Point Branch of False/True (Likely) let isBranch=1, isTerminator=1, hasDelaySlot=1, base=0x8, Uses=[FCR31] in class FBRANCH<PatLeaf op, string asmstr> : FFI<0x11, (outs), - (ins brtarget:$dst), !strconcat(asmstr, " $dst"), + (ins brtarget:$dst), !strconcat(asmstr, "\t$dst"), [(MipsFPBrcond op, bb:$dst)]>; def BC1F : FBRANCH<MIPS_BRANCH_F, "bc1f">; @@ -245,19 +256,20 @@ def MIPS_FCOND_NGT : PatLeaf<(i32 15)>; /// Floating Point Compare let hasDelaySlot = 1, Defs=[FCR31] in { def FCMP_S32 : FCC<0x0, (outs), (ins FGR32:$fs, FGR32:$ft, condcode:$cc), - "c.$cc.s $fs, $ft", + "c.$cc.s\t$fs, $ft", [(MipsFPCmp FGR32:$fs, FGR32:$ft, imm:$cc)]>; def FCMP_D32 : FCC<0x1, (outs), (ins AFGR64:$fs, AFGR64:$ft, condcode:$cc), - "c.$cc.d $fs, $ft", + "c.$cc.d\t$fs, $ft", [(MipsFPCmp AFGR64:$fs, AFGR64:$ft, imm:$cc)]>, Requires<[In32BitMode]>; } // Conditional moves: -// These instructions are expanded in MipsISelLowering::EmitInstrWithCustomInserter -// if target does not have conditional move instructions. +// These instructions are expanded in +// MipsISelLowering::EmitInstrWithCustomInserter if target does not have +// conditional move instructions. // flag:int, data:float let usesCustomInserter = 1, Constraints = "$F = $dst" in class CondMovIntFP<RegisterClass RC, bits<5> fmt, bits<6> func, @@ -312,6 +324,23 @@ let Predicates = [In32BitMode] in { def MOVCCRToCCR : MipsPseudo<(outs CCR:$dst), (ins CCR:$src), "# MOVCCRToCCR", []>; +// This pseudo instr gets expanded into 2 mtc1 instrs after register +// allocation. +def BuildPairF64 : + MipsPseudo<(outs AFGR64:$dst), + (ins CPURegs:$lo, CPURegs:$hi), "", + [(set AFGR64:$dst, (MipsBuildPairF64 CPURegs:$lo, CPURegs:$hi))]>; + +// This pseudo instr gets expanded into 2 mfc1 instrs after register +// allocation. 
+// if n is 0, lower part of src is extracted. +// if n is 1, higher part of src is extracted. +def ExtractElementF64 : + MipsPseudo<(outs CPURegs:$dst), + (ins AFGR64:$src, i32imm:$n), "", + [(set CPURegs:$dst, + (MipsExtractElementF64 AFGR64:$src, imm:$n))]>; + //===----------------------------------------------------------------------===// // Floating Point Patterns //===----------------------------------------------------------------------===// @@ -330,6 +359,7 @@ def : Pat<(f32 (sint_to_fp CPURegs:$src)), (CVTS_W32 (MTC1 CPURegs:$src))>; def : Pat<(f64 (sint_to_fp CPURegs:$src)), (CVTD_W32 (MTC1 CPURegs:$src))>; def : Pat<(i32 (fp_to_sint FGR32:$src)), (MFC1 (TRUNC_W_S32 FGR32:$src))>; +def : Pat<(i32 (fp_to_sint AFGR64:$src)), (MFC1 (TRUNC_W_D32 AFGR64:$src))>; def : Pat<(i32 (bitconvert FGR32:$src)), (MFC1 FGR32:$src)>; def : Pat<(f32 (bitconvert CPURegs:$src)), (MTC1 CPURegs:$src)>; diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td index 9dfcdfb..9f55fb3 100644 --- a/lib/Target/Mips/MipsInstrFormats.td +++ b/lib/Target/Mips/MipsInstrFormats.td @@ -1,4 +1,4 @@ -//===- MipsRegisterInfo.td - Mips Register defs ------------*- tablegen -*-===// +//===- MipsInstrFormats.td - Mips Instruction Formats ------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h index 5fdbf1f..abf6773 100644 --- a/lib/Target/Mips/MipsInstrInfo.h +++ b/lib/Target/Mips/MipsInstrInfo.h @@ -146,7 +146,21 @@ namespace MipsII { /// MO_ABS_HI/LO - Represents the hi or low part of an absolute symbol /// address. MO_ABS_HI, - MO_ABS_LO + MO_ABS_LO, + + /// MO_TLSGD - Represents the offset into the global offset table at which + // the module ID and TLS block offset reside during execution (General + // Dynamic TLS). + MO_TLSGD, + + /// MO_GOTTPREL - Represents the offset from the thread pointer (Initial + // Exec TLS). + MO_GOTTPREL, + + /// MO_TPREL_HI/LO - Represents the hi and low part of the offset from + // the thread pointer (Local Exec TLS). + MO_TPREL_HI, + MO_TPREL_LO }; } diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index c14dc9c..329a002 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -1,4 +1,4 @@ -//===- MipsInstrInfo.td - Mips Register defs ---------------*- tablegen -*-===// +//===- MipsInstrInfo.td - Target Description for Mips Target -*- tablegen -*-=// // // The LLVM Compiler Infrastructure // @@ -6,6 +6,10 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// +// +// This file contains the Mips implementation of the TargetInstrInfo class.
+// +//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // Instruction format superclass @@ -20,8 +24,9 @@ include "MipsInstrFormats.td" def SDT_MipsRet : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_MipsJmpLink : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>; def SDT_MipsCMov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, - SDTCisSameAs<1, 2>, SDTCisSameAs<3, 4>, - SDTCisInt<4>]>; + SDTCisSameAs<1, 2>, + SDTCisSameAs<3, 4>, + SDTCisInt<4>]>; def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; def SDT_MipsCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; def SDT_MipsMAddMSub : SDTypeProfile<0, 4, @@ -32,6 +37,8 @@ def SDT_MipsDivRem : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>]>; +def SDT_MipsThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>; + // Call def MipsJmpLink : SDNode<"MipsISD::JmpLink",SDT_MipsJmpLink, [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, @@ -44,6 +51,16 @@ def MipsHi : SDNode<"MipsISD::Hi", SDTIntUnaryOp>; def MipsLo : SDNode<"MipsISD::Lo", SDTIntUnaryOp>; def MipsGPRel : SDNode<"MipsISD::GPRel", SDTIntUnaryOp>; +// TlsGd node is used to handle General Dynamic TLS +def MipsTlsGd : SDNode<"MipsISD::TlsGd", SDTIntUnaryOp>; + +// TprelHi and TprelLo nodes are used to handle Local Exec TLS +def MipsTprelHi : SDNode<"MipsISD::TprelHi", SDTIntUnaryOp>; +def MipsTprelLo : SDNode<"MipsISD::TprelLo", SDTIntUnaryOp>; + +// Thread pointer +def MipsThreadPointer: SDNode<"MipsISD::ThreadPointer", SDT_MipsThreadPointer>; + // Return def MipsRet : SDNode<"MipsISD::Ret", SDT_MipsRet, [SDNPHasChain, SDNPOptInGlue]>; @@ -70,6 +87,18 @@ def MipsDivRem : SDNode<"MipsISD::DivRem", SDT_MipsDivRem, def MipsDivRemU : SDNode<"MipsISD::DivRemU", SDT_MipsDivRem, [SDNPOutGlue]>; +// Target constant nodes that are not part of any isel patterns and remain +// unchanged can cause instructions with illegal operands to be emitted. +// Wrapper node patterns give the instruction selector a chance to replace +// target constant nodes that would otherwise remain unchanged with ADDiu +// nodes. Without these wrapper node patterns, the following conditional move +// instruction is emitted when function cmov2 in test/CodeGen/Mips/cmov.ll is +// compiled: +// movn %got(d)($gp), %got(c)($gp), $4 +// This instruction is illegal since movn can take only register operands. + +def MipsWrapperPIC : SDNode<"MipsISD::WrapperPIC", SDTIntUnaryOp>; + //===----------------------------------------------------------------------===// // Mips Instruction Predicate Definitions.
//===----------------------------------------------------------------------===// @@ -140,17 +169,20 @@ def addr : ComplexPattern<iPTR, 2, "SelectAddr", [frameindex], []>; //===----------------------------------------------------------------------===// // Arithmetic 3 register operands -let isCommutable = 1 in class ArithR<bits<6> op, bits<6> func, string instr_asm, SDNode OpNode, - InstrItinClass itin>: + InstrItinClass itin, bit isComm = 0>: FR<op, func, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c), !strconcat(instr_asm, "\t$dst, $b, $c"), - [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))], itin>; + [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))], itin> { + let isCommutable = isComm; +} -let isCommutable = 1 in -class ArithOverflowR<bits<6> op, bits<6> func, string instr_asm>: +class ArithOverflowR<bits<6> op, bits<6> func, string instr_asm, + bit isComm = 0>: FR<op, func, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c), - !strconcat(instr_asm, "\t$dst, $b, $c"), [], IIAlu>; + !strconcat(instr_asm, "\t$dst, $b, $c"), [], IIAlu> { + let isCommutable = isComm; +} // Arithmetic 2 register operands class ArithI<bits<6> op, string instr_asm, SDNode OpNode, @@ -166,12 +198,15 @@ class ArithOverflowI<bits<6> op, string instr_asm, SDNode OpNode, // Arithmetic Multiply ADD/SUB let rd = 0, shamt = 0, Defs = [HI, LO], Uses = [HI, LO] in -class MArithR<bits<6> func, string instr_asm, SDNode op> : +class MArithR<bits<6> func, string instr_asm, SDNode op, bit isComm = 0> : FR<0x1c, func, (outs), (ins CPURegs:$rs, CPURegs:$rt), !strconcat(instr_asm, "\t$rs, $rt"), - [(op CPURegs:$rs, CPURegs:$rt, LO, HI)], IIImul>; + [(op CPURegs:$rs, CPURegs:$rt, LO, HI)], IIImul> { + let isCommutable = isComm; +} // Logical +let isCommutable = 1 in class LogicR<bits<6> func, string instr_asm, SDNode OpNode>: FR<0x00, func, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c), !strconcat(instr_asm, "\t$dst, $b, $c"), @@ -182,6 +217,7 @@ class LogicI<bits<6> op, string instr_asm, SDNode OpNode>: !strconcat(instr_asm, "\t$dst, $b, $c"), [(set CPURegs:$dst, (OpNode CPURegs:$b, immZExt16:$c))], IIAlu>; +let isCommutable = 1 in class LogicNOR<bits<6> op, bits<6> func, string instr_asm>: FR<op, func, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c), !strconcat(instr_asm, "\t$dst, $b, $c"), @@ -287,6 +323,7 @@ let isCall=1, hasDelaySlot=1, // Mul, Div let Defs = [HI, LO] in { + let isCommutable = 1 in class Mul<bits<6> func, string instr_asm, InstrItinClass itin>: FR<0x00, func, (outs), (ins CPURegs:$a, CPURegs:$b), !strconcat(instr_asm, "\t$a, $b"), [], itin>; @@ -337,6 +374,13 @@ class CondMov<bits<6> func, string instr_asm, PatLeaf MovCode>: CPURegs:$cond), !strconcat(instr_asm, "\t$dst, $T, $cond"), [], NoItinerary>; +// Read Hardware +class ReadHardware: FR<0x1f, 0x3b, (outs CPURegs:$dst), (ins HWRegs:$src), + "rdhwr\t$dst, $src", [], IIAlu> { + let rs = 0; + let shamt = 0; +} + //===----------------------------------------------------------------------===// // Pseudo instructions //===----------------------------------------------------------------------===// @@ -368,7 +412,116 @@ def ATMACRO : MipsPseudo<(outs), (ins), ".set\tat", []>; // are used, we have the same behavior, but get also a bunch of warnings // from the assembler. 
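// For reference, a hedged sketch of what the assembler conventionally
// expands these directives to under the o32 PIC ABI (exact sequences are
// assembler-dependent; $25/$t9 is the usual PIC call register):
//   .cpload $25    =>  lui   $gp, %hi(_gp_disp)
//                      addiu $gp, $gp, %lo(_gp_disp)
//                      addu  $gp, $gp, $25
//   .cprestore 16  =>  sw    $gp, 16($sp), plus a reload of $gp after
//                      each jal while the directive is in effect.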
def CPLOAD : MipsPseudo<(outs), (ins CPURegs:$picreg), ".cpload\t$picreg", []>; -def CPRESTORE : MipsPseudo<(outs), (ins uimm16:$loc), ".cprestore\t$loc\n", []>; +def CPRESTORE : MipsPseudo<(outs), (ins i32imm:$loc), ".cprestore\t$loc\n", []>; + +let usesCustomInserter = 1 in { + def ATOMIC_LOAD_ADD_I8 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr), + "atomic_load_add_8\t$dst, $ptr, $incr", + [(set CPURegs:$dst, (atomic_load_add_8 CPURegs:$ptr, CPURegs:$incr))]>; + def ATOMIC_LOAD_ADD_I16 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr), + "atomic_load_add_16\t$dst, $ptr, $incr", + [(set CPURegs:$dst, (atomic_load_add_16 CPURegs:$ptr, CPURegs:$incr))]>; + def ATOMIC_LOAD_ADD_I32 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr), + "atomic_load_add_32\t$dst, $ptr, $incr", + [(set CPURegs:$dst, (atomic_load_add_32 CPURegs:$ptr, CPURegs:$incr))]>; + + def ATOMIC_LOAD_SUB_I8 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr), + "atomic_load_sub_8\t$dst, $ptr, $incr", + [(set CPURegs:$dst, (atomic_load_sub_8 CPURegs:$ptr, CPURegs:$incr))]>; + def ATOMIC_LOAD_SUB_I16 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr), + "atomic_load_sub_16\t$dst, $ptr, $incr", + [(set CPURegs:$dst, (atomic_load_sub_16 CPURegs:$ptr, CPURegs:$incr))]>; + def ATOMIC_LOAD_SUB_I32 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr), + "atomic_load_sub_32\t$dst, $ptr, $incr", + [(set CPURegs:$dst, (atomic_load_sub_32 CPURegs:$ptr, CPURegs:$incr))]>; + + def ATOMIC_LOAD_AND_I8 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr), + "atomic_load_and_8\t$dst, $ptr, $incr", + [(set CPURegs:$dst, (atomic_load_and_8 CPURegs:$ptr, CPURegs:$incr))]>; + def ATOMIC_LOAD_AND_I16 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr), + "atomic_load_and_16\t$dst, $ptr, $incr", + [(set CPURegs:$dst, (atomic_load_and_16 CPURegs:$ptr, CPURegs:$incr))]>; + def ATOMIC_LOAD_AND_I32 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr), + "atomic_load_and_32\t$dst, $ptr, $incr", + [(set CPURegs:$dst, (atomic_load_and_32 CPURegs:$ptr, CPURegs:$incr))]>; + + def ATOMIC_LOAD_OR_I8 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr), + "atomic_load_or_8\t$dst, $ptr, $incr", + [(set CPURegs:$dst, (atomic_load_or_8 CPURegs:$ptr, CPURegs:$incr))]>; + def ATOMIC_LOAD_OR_I16 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr), + "atomic_load_or_16\t$dst, $ptr, $incr", + [(set CPURegs:$dst, (atomic_load_or_16 CPURegs:$ptr, CPURegs:$incr))]>; + def ATOMIC_LOAD_OR_I32 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr), + "atomic_load_or_32\t$dst, $ptr, $incr", + [(set CPURegs:$dst, (atomic_load_or_32 CPURegs:$ptr, CPURegs:$incr))]>; + + def ATOMIC_LOAD_XOR_I8 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr), + "atomic_load_xor_8\t$dst, $ptr, $incr", + [(set CPURegs:$dst, (atomic_load_xor_8 CPURegs:$ptr, CPURegs:$incr))]>; + def ATOMIC_LOAD_XOR_I16 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr), + "atomic_load_xor_16\t$dst, $ptr, $incr", + [(set CPURegs:$dst, (atomic_load_xor_16 CPURegs:$ptr, CPURegs:$incr))]>; + def ATOMIC_LOAD_XOR_I32 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr), + "atomic_load_xor_32\t$dst, $ptr, $incr", + [(set CPURegs:$dst, (atomic_load_xor_32 CPURegs:$ptr, CPURegs:$incr))]>; + + def ATOMIC_LOAD_NAND_I8 : MipsPseudo< + (outs 
CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr), + "atomic_load_nand_8\t$dst, $ptr, $incr", + [(set CPURegs:$dst, (atomic_load_nand_8 CPURegs:$ptr, CPURegs:$incr))]>; + def ATOMIC_LOAD_NAND_I16 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr), + "atomic_load_nand_16\t$dst, $ptr, $incr", + [(set CPURegs:$dst, (atomic_load_nand_16 CPURegs:$ptr, CPURegs:$incr))]>; + def ATOMIC_LOAD_NAND_I32 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr), + "atomic_load_nand_32\t$dst, $ptr, $incr", + [(set CPURegs:$dst, (atomic_load_nand_32 CPURegs:$ptr, CPURegs:$incr))]>; + + def ATOMIC_SWAP_I8 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$val), + "atomic_swap_8\t$dst, $ptr, $val", + [(set CPURegs:$dst, (atomic_swap_8 CPURegs:$ptr, CPURegs:$val))]>; + def ATOMIC_SWAP_I16 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$val), + "atomic_swap_16\t$dst, $ptr, $val", + [(set CPURegs:$dst, (atomic_swap_16 CPURegs:$ptr, CPURegs:$val))]>; + def ATOMIC_SWAP_I32 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$val), + "atomic_swap_32\t$dst, $ptr, $val", + [(set CPURegs:$dst, (atomic_swap_32 CPURegs:$ptr, CPURegs:$val))]>; + + def ATOMIC_CMP_SWAP_I8 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval), + "atomic_cmp_swap_8\t$dst, $ptr, $oldval, $newval", + [(set CPURegs:$dst, + (atomic_cmp_swap_8 CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval))]>; + def ATOMIC_CMP_SWAP_I16 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval), + "atomic_cmp_swap_16\t$dst, $ptr, $oldval, $newval", + [(set CPURegs:$dst, + (atomic_cmp_swap_16 CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval))]>; + def ATOMIC_CMP_SWAP_I32 : MipsPseudo< + (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval), + "atomic_cmp_swap_32\t$dst, $ptr, $oldval, $newval", + [(set CPURegs:$dst, + (atomic_cmp_swap_32 CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval))]>; +} //===----------------------------------------------------------------------===// // Instruction definition @@ -389,9 +542,9 @@ def XORi : LogicI<0x0e, "xori", xor>; def LUi : LoadUpper<0x0f, "lui">; /// Arithmetic Instructions (3-Operand, R-Type) -def ADDu : ArithR<0x00, 0x21, "addu", add, IIAlu>; +def ADDu : ArithR<0x00, 0x21, "addu", add, IIAlu, 1>; def SUBu : ArithR<0x00, 0x23, "subu", sub, IIAlu>; -def ADD : ArithOverflowR<0x00, 0x20, "add">; +def ADD : ArithOverflowR<0x00, 0x20, "add", 1>; def SUB : ArithOverflowR<0x00, 0x22, "sub">; def SLT : SetCC_R<0x00, 0x2a, "slt", setlt>; def SLTu : SetCC_R<0x00, 0x2b, "sltu", setult>; @@ -424,6 +577,14 @@ def SB : StoreM<0x28, "sb", truncstorei8>; def SH : StoreM<0x29, "sh", truncstorei16>; def SW : StoreM<0x2b, "sw", store>; +/// Load-linked, Store-conditional +let hasDelaySlot = 1 in + def LL : FI<0x30, (outs CPURegs:$dst), (ins mem:$addr), + "ll\t$dst, $addr", [], IILoad>; +let Constraints = "$src = $dst" in + def SC : FI<0x38, (outs CPURegs:$dst), (ins CPURegs:$src, mem:$addr), + "sc\t$src, $addr", [], IIStore>; + /// Jump and Branch Instructions def J : JumpFJ<0x02, "j">; def JR : JumpFR<0x00, 0x08, "jr">; @@ -491,8 +652,9 @@ def MIPS_CMOV_ZERO : PatLeaf<(i32 0)>; def MIPS_CMOV_NZERO : PatLeaf<(i32 1)>; // Conditional moves: -// These instructions are expanded in MipsISelLowering::EmitInstrWithCustomInserter -// if target does not have conditional move instructions. 
+// These instructions are expanded in +// MipsISelLowering::EmitInstrWithCustomInserter if target does not have +// conditional move instructions. // flag:int, data:int let usesCustomInserter = 1, shamt = 0, Constraints = "$F = $dst" in class CondMovIntInt<bits<6> funct, string instr_asm> : @@ -514,14 +676,16 @@ let addr=0 in def LEA_ADDiu : EffectiveAddress<"addiu\t$dst, ${addr:stackloc}">; // MADD*/MSUB* -def MADD : MArithR<0, "madd", MipsMAdd>; -def MADDU : MArithR<1, "maddu", MipsMAddu>; +def MADD : MArithR<0, "madd", MipsMAdd, 1>; +def MADDU : MArithR<1, "maddu", MipsMAddu, 1>; def MSUB : MArithR<4, "msub", MipsMSub>; def MSUBU : MArithR<5, "msubu", MipsMSubu>; // MUL is an assembly macro in the currently used ISAs. In recent ISAs // it is a real instruction. -def MUL : ArithR<0x1c, 0x02, "mul", mul, IIImul>, Requires<[IsMips32]>; +def MUL : ArithR<0x1c, 0x02, "mul", mul, IIImul, 1>, Requires<[IsMips32]>; + +def RDHWR : ReadHardware; //===----------------------------------------------------------------------===// // Arbitrary patterns that map to one or more instructions @@ -555,6 +719,7 @@ def : Pat<(MipsJmpLink (i32 texternalsym:$dst)), // hi/lo relocs def : Pat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>; +def : Pat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>; def : Pat<(add CPURegs:$hi, (MipsLo tglobaladdr:$lo)), (ADDiu CPURegs:$hi, tglobaladdr:$lo)>; def : Pat<(add CPURegs:$hi, (MipsLo tblockaddress:$lo)), @@ -574,6 +739,26 @@ def : Pat<(add CPURegs:$gp, (MipsGPRel tglobaladdr:$in)), def : Pat<(add CPURegs:$gp, (MipsGPRel tconstpool:$in)), (ADDiu CPURegs:$gp, tconstpool:$in)>; +// tlsgd +def : Pat<(add CPURegs:$gp, (MipsTlsGd tglobaltlsaddr:$in)), + (ADDiu CPURegs:$gp, tglobaltlsaddr:$in)>; + +// tprel hi/lo +def : Pat<(MipsTprelHi tglobaltlsaddr:$in), (LUi tglobaltlsaddr:$in)>; +def : Pat<(add CPURegs:$hi, (MipsTprelLo tglobaltlsaddr:$lo)), + (ADDiu CPURegs:$hi, tglobaltlsaddr:$lo)>; + +// wrapper_pic +class WrapperPICPat<SDNode node>: + Pat<(MipsWrapperPIC node:$in), + (ADDiu GP, node:$in)>; + +def : WrapperPICPat<tglobaladdr>; +def : WrapperPICPat<tconstpool>; +def : WrapperPICPat<texternalsym>; +def : WrapperPICPat<tblockaddress>; +def : WrapperPICPat<tjumptable>; + // Mips does not have "not", so we expand our way def : Pat<(not CPURegs:$in), (NOR CPURegs:$in, ZERO)>; @@ -641,13 +826,6 @@ multiclass MovnPats<RegisterClass RC, Instruction MOVNInst> { defm : MovzPats<CPURegs, MOVZ_I>; defm : MovnPats<CPURegs, MOVN_I>; -// select patterns with got access -let AddedComplexity = 10 in - def : Pat<(select (setne CPURegs:$lhs, CPURegs:$rhs), - (i32 tglobaladdr:$T), CPURegs:$F), - (MOVN_I CPURegs:$F, (ADDiu GP, tglobaladdr:$T), - (XOR CPURegs:$lhs, CPURegs:$rhs))>; - // setcc patterns def : Pat<(seteq CPURegs:$lhs, CPURegs:$rhs), (SLTu (XOR CPURegs:$lhs, CPURegs:$rhs), 1)>; diff --git a/lib/Target/Mips/MipsMCAsmInfo.cpp b/lib/Target/Mips/MipsMCAsmInfo.cpp index fe48ab7..c86bf40 100644 --- a/lib/Target/Mips/MipsMCAsmInfo.cpp +++ b/lib/Target/Mips/MipsMCAsmInfo.cpp @@ -17,11 +17,15 @@ using namespace llvm; MipsMCAsmInfo::MipsMCAsmInfo(const Target &T, StringRef TT) { AlignmentIsInBytes = false; Data16bitsDirective = "\t.half\t"; - Data32bitsDirective = "\t.word\t"; + Data32bitsDirective = "\t.4byte\t"; Data64bitsDirective = 0; PrivateGlobalPrefix = "$"; CommentString = "#"; ZeroDirective = "\t.space\t"; GPRel32Directive = "\t.gpword\t"; - HasSetDirective = false; + WeakRefDirective = "\t.weak\t"; + + SupportsDebugInformation = true; + ExceptionsType =
ExceptionHandling::DwarfCFI; + HasLEB128 = true; } diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h index 1e8e4fe..df40e6c 100644 --- a/lib/Target/Mips/MipsMachineFunction.h +++ b/lib/Target/Mips/MipsMachineFunction.h @@ -14,6 +14,7 @@ #ifndef MIPS_MACHINE_FUNCTION_INFO_H #define MIPS_MACHINE_FUNCTION_INFO_H +#include <utility> #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/VectorExtras.h" #include "llvm/CodeGen/MachineFunction.h" @@ -26,50 +27,6 @@ namespace llvm { class MipsFunctionInfo : public MachineFunctionInfo { private: - /// Holds for each function where on the stack the Frame Pointer must be - /// saved. This is used on Prologue and Epilogue to emit FP save/restore - int FPStackOffset; - - /// Holds for each function where on the stack the Return Address must be - /// saved. This is used on Prologue and Epilogue to emit RA save/restore - int RAStackOffset; - - /// At each function entry, two special bitmask directives must be emitted - /// to help debugging, for CPU and FPU callee saved registers. Both need - /// the negative offset from the final stack size and its higher registers - /// location on the stack. - int CPUTopSavedRegOff; - int FPUTopSavedRegOff; - - /// MipsFIHolder - Holds a FrameIndex and it's Stack Pointer Offset - struct MipsFIHolder { - - int FI; - int SPOffset; - - MipsFIHolder(int FrameIndex, int StackPointerOffset) - : FI(FrameIndex), SPOffset(StackPointerOffset) {} - }; - - /// When PIC is used the GP must be saved on the stack on the function - /// prologue and must be reloaded from this stack location after every - /// call. A reference to its stack location and frame index must be kept - /// to be used on emitPrologue and processFunctionBeforeFrameFinalized. - MipsFIHolder GPHolder; - - /// On LowerFormalArguments the stack size is unknown, so the Stack - /// Pointer Offset calculation of "not in register arguments" must be - /// postponed to emitPrologue. - SmallVector<MipsFIHolder, 16> FnLoadArgs; - bool HasLoadArgs; - - // When VarArgs, we must write registers back to caller stack, preserving - // on register arguments. Since the stack size is unknown on - // LowerFormalArguments, the Stack Pointer Offset calculation must be - // postponed to emitPrologue. - SmallVector<MipsFIHolder, 4> FnStoreVarArgs; - bool HasStoreVarArgs; - /// SRetReturnReg - Some subtargets require that sret lowering includes /// returning the value of the returned struct in a register. This field /// holds the virtual register into which the sret argument is passed. @@ -83,55 +40,47 @@ private: /// VarArgsFrameIndex - FrameIndex for start of varargs area. int VarArgsFrameIndex; + // Range of frame object indices. + // InArgFIRange: Range of indices of all frame objects created during call to + // LowerFormalArguments. + // OutArgFIRange: Range of indices of all frame objects created during call to + // LowerCall except for the frame object for restoring $gp. + std::pair<int, int> InArgFIRange, OutArgFIRange; + int GPFI; // Index of the frame object for restoring $gp + unsigned MaxCallFrameSize; + + /// AtomicFrameIndex - To implement atomic.swap and atomic.cmp.swap + /// intrinsics, it is necessary to use a temporary stack location. + /// This field holds the frame index of this location. 
+ int AtomicFrameIndex; public: MipsFunctionInfo(MachineFunction& MF) - : FPStackOffset(0), RAStackOffset(0), CPUTopSavedRegOff(0), - FPUTopSavedRegOff(0), GPHolder(-1,-1), HasLoadArgs(false), - HasStoreVarArgs(false), SRetReturnReg(0), GlobalBaseReg(0), - VarArgsFrameIndex(0) + : SRetReturnReg(0), GlobalBaseReg(0), + VarArgsFrameIndex(0), InArgFIRange(std::make_pair(-1, 0)), + OutArgFIRange(std::make_pair(-1, 0)), GPFI(0), MaxCallFrameSize(0), + AtomicFrameIndex(-1) {} - int getFPStackOffset() const { return FPStackOffset; } - void setFPStackOffset(int Off) { FPStackOffset = Off; } - - int getRAStackOffset() const { return RAStackOffset; } - void setRAStackOffset(int Off) { RAStackOffset = Off; } - - int getCPUTopSavedRegOff() const { return CPUTopSavedRegOff; } - void setCPUTopSavedRegOff(int Off) { CPUTopSavedRegOff = Off; } - - int getFPUTopSavedRegOff() const { return FPUTopSavedRegOff; } - void setFPUTopSavedRegOff(int Off) { FPUTopSavedRegOff = Off; } - - int getGPStackOffset() const { return GPHolder.SPOffset; } - int getGPFI() const { return GPHolder.FI; } - void setGPStackOffset(int Off) { GPHolder.SPOffset = Off; } - void setGPFI(int FI) { GPHolder.FI = FI; } - bool needGPSaveRestore() const { return GPHolder.SPOffset != -1; } - - bool hasLoadArgs() const { return HasLoadArgs; } - bool hasStoreVarArgs() const { return HasStoreVarArgs; } - - void recordLoadArgsFI(int FI, int SPOffset) { - if (!HasLoadArgs) HasLoadArgs=true; - FnLoadArgs.push_back(MipsFIHolder(FI, SPOffset)); - } - void recordStoreVarArgsFI(int FI, int SPOffset) { - if (!HasStoreVarArgs) HasStoreVarArgs=true; - FnStoreVarArgs.push_back(MipsFIHolder(FI, SPOffset)); + bool isInArgFI(int FI) const { + return FI <= InArgFIRange.first && FI >= InArgFIRange.second; } + void setLastInArgFI(int FI) { InArgFIRange.second = FI; } - void adjustLoadArgsFI(MachineFrameInfo *MFI) const { - if (!hasLoadArgs()) return; - for (unsigned i = 0, e = FnLoadArgs.size(); i != e; ++i) - MFI->setObjectOffset( FnLoadArgs[i].FI, FnLoadArgs[i].SPOffset ); + bool isOutArgFI(int FI) const { + return FI <= OutArgFIRange.first && FI >= OutArgFIRange.second; } - void adjustStoreVarArgsFI(MachineFrameInfo *MFI) const { - if (!hasStoreVarArgs()) return; - for (unsigned i = 0, e = FnStoreVarArgs.size(); i != e; ++i) - MFI->setObjectOffset( FnStoreVarArgs[i].FI, FnStoreVarArgs[i].SPOffset ); + void extendOutArgFIRange(int FirstFI, int LastFI) { + if (!OutArgFIRange.second) + // this must be the first time this function was called. 
+ OutArgFIRange.first = FirstFI; + OutArgFIRange.second = LastFI; } + int getGPFI() const { return GPFI; } + void setGPFI(int FI) { GPFI = FI; } + bool needGPSaveRestore() const { return getGPFI(); } + bool isGPFI(int FI) const { return GPFI && GPFI == FI; } + unsigned getSRetReturnReg() const { return SRetReturnReg; } void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; } @@ -140,6 +89,12 @@ public: int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; } + + unsigned getMaxCallFrameSize() const { return MaxCallFrameSize; } + void setMaxCallFrameSize(unsigned S) { MaxCallFrameSize = S; } + + int getAtomicFrameIndex() const { return AtomicFrameIndex; } + void setAtomicFrameIndex(int Index) { AtomicFrameIndex = Index; } }; } // end of namespace llvm diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp index acea7da..b0984af 100644 --- a/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/lib/Target/Mips/MipsRegisterInfo.cpp @@ -65,16 +65,16 @@ getRegisterNumbering(unsigned RegEnum) case Mips::T5 : case Mips::F13: return 13; case Mips::T6 : case Mips::F14: case Mips::D7: return 14; case Mips::T7 : case Mips::F15: return 15; - case Mips::T8 : case Mips::F16: case Mips::D8: return 16; - case Mips::T9 : case Mips::F17: return 17; - case Mips::S0 : case Mips::F18: case Mips::D9: return 18; - case Mips::S1 : case Mips::F19: return 19; - case Mips::S2 : case Mips::F20: case Mips::D10: return 20; - case Mips::S3 : case Mips::F21: return 21; - case Mips::S4 : case Mips::F22: case Mips::D11: return 22; - case Mips::S5 : case Mips::F23: return 23; - case Mips::S6 : case Mips::F24: case Mips::D12: return 24; - case Mips::S7 : case Mips::F25: return 25; + case Mips::S0 : case Mips::F16: case Mips::D8: return 16; + case Mips::S1 : case Mips::F17: return 17; + case Mips::S2 : case Mips::F18: case Mips::D9: return 18; + case Mips::S3 : case Mips::F19: return 19; + case Mips::S4 : case Mips::F20: case Mips::D10: return 20; + case Mips::S5 : case Mips::F21: return 21; + case Mips::S6 : case Mips::F22: case Mips::D11: return 22; + case Mips::S7 : case Mips::F23: return 23; + case Mips::T8 : case Mips::F24: case Mips::D12: return 24; + case Mips::T9 : case Mips::F25: return 25; case Mips::K0 : case Mips::F26: case Mips::D13: return 26; case Mips::K1 : case Mips::F27: return 27; case Mips::GP : case Mips::F28: case Mips::D14: return 28; @@ -98,22 +98,22 @@ getCalleeSavedRegs(const MachineFunction *MF) const { // Mips callee-save register range is $16-$23, $f20-$f30 static const unsigned SingleFloatOnlyCalleeSavedRegs[] = { - Mips::S0, Mips::S1, Mips::S2, Mips::S3, - Mips::S4, Mips::S5, Mips::S6, Mips::S7, - Mips::F20, Mips::F21, Mips::F22, Mips::F23, Mips::F24, Mips::F25, - Mips::F26, Mips::F27, Mips::F28, Mips::F29, Mips::F30, 0 + Mips::F30, Mips::F29, Mips::F28, Mips::F27, Mips::F26, + Mips::F25, Mips::F24, Mips::F23, Mips::F22, Mips::F21, Mips::F20, + Mips::RA, Mips::FP, Mips::S7, Mips::S6, Mips::S5, Mips::S4, + Mips::S3, Mips::S2, Mips::S1, Mips::S0, 0 }; - static const unsigned BitMode32CalleeSavedRegs[] = { - Mips::S0, Mips::S1, Mips::S2, Mips::S3, - Mips::S4, Mips::S5, Mips::S6, Mips::S7, - Mips::F20, Mips::F22, Mips::F24, Mips::F26, Mips::F28, Mips::F30, 0 + static const unsigned Mips32CalleeSavedRegs[] = { + Mips::D15, Mips::D14, Mips::D13, Mips::D12, Mips::D11, Mips::D10, + Mips::RA, Mips::FP, Mips::S7, Mips::S6, Mips::S5, Mips::S4, + Mips::S3, Mips::S2, Mips::S1, Mips::S0, 0 }; if 
(Subtarget.isSingleFloat()) return SingleFloatOnlyCalleeSavedRegs; else - return BitMode32CalleeSavedRegs; + return Mips32CalleeSavedRegs; } BitVector MipsRegisterInfo:: @@ -127,9 +127,11 @@ getReservedRegs(const MachineFunction &MF) const { Reserved.set(Mips::SP); Reserved.set(Mips::FP); Reserved.set(Mips::RA); + Reserved.set(Mips::F31); + Reserved.set(Mips::D15); // SVR4 requires that odd registers can't be used. - if (!Subtarget.isSingleFloat()) + if (!Subtarget.isSingleFloat() && !Subtarget.isMips32()) for (unsigned FReg=(Mips::F0)+1; FReg < Mips::F30; FReg+=2) Reserved.set(FReg); @@ -153,6 +155,8 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, RegScavenger *RS) const { MachineInstr &MI = *II; MachineFunction &MF = *MI.getParent()->getParent(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); unsigned i = 0; while (!MI.getOperand(i).isFI()) { @@ -172,9 +176,19 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, << "spOffset : " << spOffset << "\n" << "stackSize : " << stackSize << "\n"); - // as explained on LowerFormalArguments, detect negative offsets - // and adjust SPOffsets considering the final stack size. - int Offset = ((spOffset < 0) ? (stackSize + (-(spOffset+4))) : (spOffset)); + int Offset; + + // Calculate final offset. + // - There is no need to change the offset if the frame object is an outgoing + // argument or a $gp restore location, + // - If the frame object is any of the following, its offset must be adjusted + // by adding the size of the stack: + // incoming argument, callee-saved register location or local variable. + if (MipsFI->isOutArgFI(FrameIndex) || MipsFI->isGPFI(FrameIndex)) + Offset = spOffset; + else + Offset = spOffset + stackSize; + Offset += MI.getOperand(i-1).getImm(); DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n"); @@ -183,25 +197,46 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, int NewImm = 0; MachineBasicBlock &MBB = *MI.getParent(); bool ATUsed; - unsigned OrigReg = getFrameRegister(MF); - int OrigImm = Offset; + unsigned FrameReg; + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + int MinCSFI = 0; + int MaxCSFI = -1; + + if (CSI.size()) { + MinCSFI = CSI[0].getFrameIdx(); + MaxCSFI = CSI[CSI.size() - 1].getFrameIdx(); + } -// OrigImm fits in the 16-bit field - if (OrigImm < 0x8000 && OrigImm >= -0x8000) { - NewReg = OrigReg; - NewImm = OrigImm; + // The following stack frame objects are always referenced relative to $sp: + // 1. Outgoing arguments. + // 2. Pointer to dynamically allocated stack space. + // 3. Locations for callee-saved registers. + // Everything else is referenced relative to whatever register + // getFrameRegister() returns. + if (MipsFI->isOutArgFI(FrameIndex) || + (FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI)) + FrameReg = Mips::SP; + else + FrameReg = getFrameRegister(MF); + + // Offset fits in the 16-bit field + if (Offset < 0x8000 && Offset >= -0x8000) { + NewReg = FrameReg; + NewImm = Offset; ATUsed = false; } else { const TargetInstrInfo *TII = MF.getTarget().getInstrInfo(); DebugLoc DL = II->getDebugLoc(); - int ImmLo = OrigImm & 0xffff; - int ImmHi = (((unsigned)OrigImm & 0xffff0000) >> 16) + ((OrigImm & 0x8000) != 0); + int ImmLo = (short)(Offset & 0xffff); + int ImmHi = (((unsigned)Offset & 0xffff0000) >> 16) + + ((Offset & 0x8000) != 0); // FIXME: change this when mips goes MC.
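// Worked example of the ImmLo/ImmHi split above (illustration only): for
// Offset = 0x18000, ImmLo = (short)0x8000 = -32768 and ImmHi = 1 + 1 = 2,
// so the code below emits roughly
//   lui  $at, 2              # $at = 0x20000
//   addu $at, <frame reg>, $at
// and the rewritten memory operand then uses immediate -32768, since
// 0x20000 - 0x8000 == 0x18000.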
BuildMI(MBB, II, DL, TII->get(Mips::NOAT)); BuildMI(MBB, II, DL, TII->get(Mips::LUi), Mips::AT).addImm(ImmHi); - BuildMI(MBB, II, DL, TII->get(Mips::ADDu), Mips::AT).addReg(OrigReg).addReg(Mips::AT); + BuildMI(MBB, II, DL, TII->get(Mips::ADDu), Mips::AT).addReg(FrameReg) + .addReg(Mips::AT); NewReg = Mips::AT; NewImm = ImmLo; @@ -216,15 +251,6 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, MI.getOperand(i-1).ChangeToImmediate(NewImm); } -void MipsRegisterInfo:: -processFunctionBeforeFrameFinalized(MachineFunction &MF) const { - // Set the stack offset where GP must be saved/loaded from. - MachineFrameInfo *MFI = MF.getFrameInfo(); - MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); - if (MipsFI->needGPSaveRestore()) - MFI->setObjectOffset(MipsFI->getGPFI(), MipsFI->getGPStackOffset()); -} - unsigned MipsRegisterInfo:: getRARegister() const { return Mips::RA; @@ -251,8 +277,11 @@ getEHHandlerRegister() const { int MipsRegisterInfo:: getDwarfRegNum(unsigned RegNum, bool isEH) const { - llvm_unreachable("What is the dwarf register number"); - return -1; + return MipsGenRegisterInfo::getDwarfRegNumFull(RegNum, 0); +} + +int MipsRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const { + return MipsGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0); } #include "MipsGenRegisterInfo.inc" diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h index 767359f..76b0035 100644 --- a/lib/Target/Mips/MipsRegisterInfo.h +++ b/lib/Target/Mips/MipsRegisterInfo.h @@ -63,6 +63,7 @@ struct MipsRegisterInfo : public MipsGenRegisterInfo { unsigned getEHHandlerRegister() const; int getDwarfRegNum(unsigned RegNum, bool isEH) const; + int getLLVMRegNum(unsigned RegNum, bool isEH) const; }; } // end namespace llvm diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td index 9f9cae7..f0db518 100644 --- a/lib/Target/Mips/MipsRegisterInfo.td +++ b/lib/Target/Mips/MipsRegisterInfo.td @@ -44,6 +44,11 @@ class AFPR<bits<5> num, string n, list<Register> subregs> let SubRegIndices = [sub_fpeven, sub_fpodd]; } +// Mips Hardware Registers +class HWR<bits<5> num, string n> : MipsReg<n> { + let Num = num; +} + //===----------------------------------------------------------------------===// // Registers //===----------------------------------------------------------------------===// @@ -55,7 +60,7 @@ let Namespace = "Mips" in { def AT : MipsGPRReg< 1, "AT">, DwarfRegNum<[1]>; def V0 : MipsGPRReg< 2, "2">, DwarfRegNum<[2]>; def V1 : MipsGPRReg< 3, "3">, DwarfRegNum<[3]>; - def A0 : MipsGPRReg< 4, "4">, DwarfRegNum<[5]>; + def A0 : MipsGPRReg< 4, "4">, DwarfRegNum<[4]>; def A1 : MipsGPRReg< 5, "5">, DwarfRegNum<[5]>; def A2 : MipsGPRReg< 6, "6">, DwarfRegNum<[6]>; def A3 : MipsGPRReg< 7, "7">, DwarfRegNum<[7]>; @@ -120,22 +125,22 @@ let Namespace = "Mips" in { /// Mips Double point precision FPU Registers (aliased /// with the single precision to hold 64 bit values) - def D0 : AFPR< 0, "F0", [F0, F1]>, DwarfRegNum<[32]>; - def D1 : AFPR< 2, "F2", [F2, F3]>, DwarfRegNum<[34]>; - def D2 : AFPR< 4, "F4", [F4, F5]>, DwarfRegNum<[36]>; - def D3 : AFPR< 6, "F6", [F6, F7]>, DwarfRegNum<[38]>; - def D4 : AFPR< 8, "F8", [F8, F9]>, DwarfRegNum<[40]>; - def D5 : AFPR<10, "F10", [F10, F11]>, DwarfRegNum<[42]>; - def D6 : AFPR<12, "F12", [F12, F13]>, DwarfRegNum<[44]>; - def D7 : AFPR<14, "F14", [F14, F15]>, DwarfRegNum<[46]>; - def D8 : AFPR<16, "F16", [F16, F17]>, DwarfRegNum<[48]>; - def D9 : AFPR<18, "F18", [F18, F19]>, DwarfRegNum<[50]>; - 
def D10 : AFPR<20, "F20", [F20, F21]>, DwarfRegNum<[52]>; - def D11 : AFPR<22, "F22", [F22, F23]>, DwarfRegNum<[54]>; - def D12 : AFPR<24, "F24", [F24, F25]>, DwarfRegNum<[56]>; - def D13 : AFPR<26, "F26", [F26, F27]>, DwarfRegNum<[58]>; - def D14 : AFPR<28, "F28", [F28, F29]>, DwarfRegNum<[60]>; - def D15 : AFPR<30, "F30", [F30, F31]>, DwarfRegNum<[62]>; + def D0 : AFPR< 0, "F0", [F0, F1]>; + def D1 : AFPR< 2, "F2", [F2, F3]>; + def D2 : AFPR< 4, "F4", [F4, F5]>; + def D3 : AFPR< 6, "F6", [F6, F7]>; + def D4 : AFPR< 8, "F8", [F8, F9]>; + def D5 : AFPR<10, "F10", [F10, F11]>; + def D6 : AFPR<12, "F12", [F12, F13]>; + def D7 : AFPR<14, "F14", [F14, F15]>; + def D8 : AFPR<16, "F16", [F16, F17]>; + def D9 : AFPR<18, "F18", [F18, F19]>; + def D10 : AFPR<20, "F20", [F20, F21]>; + def D11 : AFPR<22, "F22", [F22, F23]>; + def D12 : AFPR<24, "F24", [F24, F25]>; + def D13 : AFPR<26, "F26", [F26, F27]>; + def D14 : AFPR<28, "F28", [F28, F29]>; + def D15 : AFPR<30, "F30", [F30, F31]>; // Hi/Lo registers def HI : Register<"hi">, DwarfRegNum<[64]>; @@ -143,33 +148,24 @@ let Namespace = "Mips" in { // Status flags register def FCR31 : Register<"31">; + + // Hardware register $29 + def HWR29 : Register<"29">; } //===----------------------------------------------------------------------===// // Register Classes //===----------------------------------------------------------------------===// -def CPURegs : RegisterClass<"Mips", [i32], 32, +def CPURegs : RegisterClass<"Mips", [i32], 32, (add // Return Values and Arguments - [V0, V1, A0, A1, A2, A3, + V0, V1, A0, A1, A2, A3, // Not preserved across procedure calls T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, // Callee save S0, S1, S2, S3, S4, S5, S6, S7, // Reserved - ZERO, AT, K0, K1, GP, SP, FP, RA]> -{ - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - CPURegsClass::iterator - CPURegsClass::allocation_order_end(const MachineFunction &MF) const { - // The last 8 registers on the list above are reserved - return end()-8; - } - }]; -} + ZERO, AT, K0, K1, GP, SP, FP, RA)>; // 64bit fp: // * FGR64 - 32 64-bit registers @@ -178,87 +174,25 @@ def CPURegs : RegisterClass<"Mips", [i32], 32, // 32bit fp: // * FGR32 - 16 32-bit even registers // * FGR32 - 32 32-bit registers (single float only mode) -def FGR32 : RegisterClass<"Mips", [f32], 32, - // Return Values and Arguments - [F0, F1, F2, F3, F12, F13, F14, F15, - // Not preserved across procedure calls - F4, F5, F6, F7, F8, F9, F10, F11, F16, F17, F18, F19, - // Callee save - F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30, - // Reserved - F31]> -{ - let MethodProtos = [{ - iterator allocation_order_begin(const MachineFunction &MF) const; - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - - static const unsigned MIPS_FGR32[] = { - Mips::F0, Mips::F1, Mips::F2, Mips::F3, Mips::F12, Mips::F13, - Mips::F14, Mips::F15, Mips::F4, Mips::F5, Mips::F6, Mips::F7, - Mips::F8, Mips::F9, Mips::F10, Mips::F11, Mips::F16, Mips::F17, - Mips::F18, Mips::F19, Mips::F20, Mips::F21, Mips::F22, Mips::F23, - Mips::F24, Mips::F25, Mips::F26, Mips::F27, Mips::F28, Mips::F29, - Mips::F30 - }; - - static const unsigned MIPS_SVR4_FGR32[] = { - Mips::F0, Mips::F2, Mips::F12, Mips::F14, Mips::F4, - Mips::F6, Mips::F8, Mips::F10, Mips::F16, Mips::F18, - Mips::F20, Mips::F22, Mips::F24, Mips::F26, Mips::F28, Mips::F30, - }; - - FGR32Class::iterator - FGR32Class::allocation_order_begin(const MachineFunction &MF) const { - 
const TargetMachine &TM = MF.getTarget(); - const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>(); - - if (Subtarget.isSingleFloat()) - return MIPS_FGR32; - else - return MIPS_SVR4_FGR32; - } - - FGR32Class::iterator - FGR32Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>(); - - if (Subtarget.isSingleFloat()) - return MIPS_FGR32 + (sizeof(MIPS_FGR32) / sizeof(unsigned)); - else - return MIPS_SVR4_FGR32 + (sizeof(MIPS_SVR4_FGR32) / sizeof(unsigned)); - } - }]; -} +def FGR32 : RegisterClass<"Mips", [f32], 32, (sequence "F%u", 0, 31)>; -def AFGR64 : RegisterClass<"Mips", [f64], 64, +def AFGR64 : RegisterClass<"Mips", [f64], 64, (add // Return Values and Arguments - [D0, D1, D6, D7, + D0, D1, D6, D7, // Not preserved across procedure calls D2, D3, D4, D5, D8, D9, // Callee save D10, D11, D12, D13, D14, // Reserved - D15]> -{ + D15)> { let SubRegClasses = [(FGR32 sub_fpeven, sub_fpodd)]; - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - AFGR64Class::iterator - AFGR64Class::allocation_order_end(const MachineFunction &MF) const { - // The last register on the list above is reserved - return end()-1; - } - }]; } // Condition Register for floating point operations -def CCR : RegisterClass<"Mips", [i32], 32, [FCR31]>; +def CCR : RegisterClass<"Mips", [i32], 32, (add FCR31)>; // Hi/Lo Registers -def HILO : RegisterClass<"Mips", [i32], 32, [HI, LO]>; +def HILO : RegisterClass<"Mips", [i32], 32, (add HI, LO)>; +// Hardware registers +def HWRegs : RegisterClass<"Mips", [i32], 32, (add HWR29)>; diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index 7a2dd1f..cfbb92c 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -38,8 +38,9 @@ MipsTargetMachine(const Target &T, const std::string &TT, const std::string &FS, bool isLittle=false): LLVMTargetMachine(T, TT), Subtarget(TT, FS, isLittle), - DataLayout(isLittle ? std::string("e-p:32:32:32-i8:8:32-i16:16:32-n32") : - std::string("E-p:32:32:32-i8:8:32-i16:16:32-n32")), + DataLayout(isLittle ? + std::string("e-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32") : + std::string("E-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32")), InstrInfo(*this), FrameLowering(Subtarget), TLInfo(*this), TSInfo(*this) { @@ -75,3 +76,15 @@ addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel) PM.add(createMipsDelaySlotFillerPass(*this)); return true; } + +bool MipsTargetMachine:: +addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel) { + PM.add(createMipsEmitGPRestorePass(*this)); + return true; +} + +bool MipsTargetMachine:: +addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel) { + PM.add(createMipsExpandPseudoPass(*this)); + return true; +} diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h index 43ab798..102dd85 100644 --- a/lib/Target/Mips/MipsTargetMachine.h +++ b/lib/Target/Mips/MipsTargetMachine.h @@ -63,6 +63,9 @@ namespace llvm { CodeGenOpt::Level OptLevel); virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addPreRegAlloc(PassManagerBase &PM, + CodeGenOpt::Level OptLevel); + virtual bool addPostRegAlloc(PassManagerBase &, CodeGenOpt::Level); }; /// MipselTargetMachine - Mipsel target machine. 
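The MipsExpandPseudo pass registered in addPostRegAlloc above is where post-RA pseudos such as BuildPairF64 and ExtractElementF64 from MipsInstrFPU.td become real instructions. That pass is not part of this diff, so the helper below is only a minimal sketch of one such expansion, assuming the sub_fpeven/sub_fpodd subregister indices from MipsRegisterInfo.td and that the generated Mips instruction enums are in scope:

#include "MipsInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"

// Sketch: expand "ExtractElementF64 $dst, $src, n" into a single MFC1 that
// reads the even (n == 0) or odd (n == 1) half of the 64-bit FPR pair.
static void expandExtractElementF64(llvm::MachineBasicBlock &MBB,
                                    llvm::MachineBasicBlock::iterator I,
                                    const llvm::TargetInstrInfo *TII,
                                    const llvm::TargetRegisterInfo *TRI) {
  using namespace llvm;
  unsigned Dst = I->getOperand(0).getReg();
  unsigned Src = I->getOperand(1).getReg();
  int64_t  N   = I->getOperand(2).getImm();
  // n selects the low (even) or high (odd) 32-bit subregister of $src.
  unsigned SubIdx = (N == 0) ? Mips::sub_fpeven : Mips::sub_fpodd;
  BuildMI(MBB, I, I->getDebugLoc(), TII->get(Mips::MFC1), Dst)
    .addReg(TRI->getSubReg(Src, SubIdx));
  MBB.erase(I); // the pseudo is fully replaced
}

BuildPairF64 is the mirror image: two MTC1s, one into each half of the destination register pair.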
diff --git a/lib/Target/PTX/CMakeLists.txt b/lib/Target/PTX/CMakeLists.txt index 331266d..c4448d6 100644 --- a/lib/Target/PTX/CMakeLists.txt +++ b/lib/Target/PTX/CMakeLists.txt @@ -1,6 +1,7 @@ set(LLVM_TARGET_DEFINITIONS PTX.td) tablegen(PTXGenAsmWriter.inc -gen-asm-writer) +tablegen(PTXGenCallingConv.inc -gen-callingconv) tablegen(PTXGenDAGISel.inc -gen-dag-isel) tablegen(PTXGenInstrInfo.inc -gen-instr-desc) tablegen(PTXGenInstrNames.inc -gen-instr-enums) diff --git a/lib/Target/PTX/Makefile b/lib/Target/PTX/Makefile index 2c40d69..844480f 100644 --- a/lib/Target/PTX/Makefile +++ b/lib/Target/PTX/Makefile @@ -13,6 +13,7 @@ TARGET = PTX # Make sure that tblgen is run, first thing. BUILT_SOURCES = PTXGenAsmWriter.inc \ + PTXGenCallingConv.inc \ PTXGenDAGISel.inc \ PTXGenInstrInfo.inc \ PTXGenInstrNames.inc \ diff --git a/lib/Target/PTX/PTX.h b/lib/Target/PTX/PTX.h index 49045cd..ec2be92 100644 --- a/lib/Target/PTX/PTX.h +++ b/lib/Target/PTX/PTX.h @@ -42,7 +42,8 @@ namespace llvm { FunctionPass *createPTXMFInfoExtract(PTXTargetMachine &TM, CodeGenOpt::Level OptLevel); - extern Target ThePTXTarget; + extern Target ThePTX32Target; + extern Target ThePTX64Target; } // namespace llvm; // Defines symbolic names for PTX registers. diff --git a/lib/Target/PTX/PTX.td b/lib/Target/PTX/PTX.td index dbc6f57..2c7bd3b 100644 --- a/lib/Target/PTX/PTX.td +++ b/lib/Target/PTX/PTX.td @@ -24,8 +24,8 @@ include "llvm/Target/Target.td" def FeatureDouble : SubtargetFeature<"double", "SupportsDouble", "true", "Do not demote .f64 to .f32">; -def Feature64Bit : SubtargetFeature<"64bit", "Use64BitAddresses", "true", - "Use 64-bit integer types for addresses.">; +def FeatureNoFMA : SubtargetFeature<"no-fma","SupportsFMA", "false", + "Disable Fused-Multiply Add">; //===- PTX Version --------------------------------------------------------===// @@ -41,6 +41,10 @@ def FeaturePTX22 : SubtargetFeature<"ptx22", "PTXVersion", "PTX_VERSION_2_2", "Use PTX Language Version 2.2", [FeaturePTX21]>; +def FeaturePTX23 : SubtargetFeature<"ptx23", "PTXVersion", "PTX_VERSION_2_3", + "Use PTX Language Version 2.3", + [FeaturePTX22]>; + //===- PTX Shader Model ---------------------------------------------------===// def FeatureSM10 : SubtargetFeature<"sm10", "PTXShaderModel", "PTX_SM_1_0", @@ -68,6 +72,12 @@ def : Proc<"generic", []>; include "PTXRegisterInfo.td" //===----------------------------------------------------------------------===// +// Calling Conventions +//===----------------------------------------------------------------------===// + +include "PTXCallingConv.td" + +//===----------------------------------------------------------------------===// // Instruction Descriptions //===----------------------------------------------------------------------===// diff --git a/lib/Target/PTX/PTXAsmPrinter.cpp b/lib/Target/PTX/PTXAsmPrinter.cpp index 27c9605..1142144 100644 --- a/lib/Target/PTX/PTXAsmPrinter.cpp +++ b/lib/Target/PTX/PTXAsmPrinter.cpp @@ -79,12 +79,12 @@ static const char PARAM_PREFIX[] = "__param_"; static const char *getRegisterTypeName(unsigned RegNo) { #define TEST_REGCLS(cls, clsstr) \ if (PTX::cls ## RegisterClass->contains(RegNo)) return # clsstr; - TEST_REGCLS(Preds, pred); - TEST_REGCLS(RRegu16, u16); - TEST_REGCLS(RRegu32, u32); - TEST_REGCLS(RRegu64, u64); - TEST_REGCLS(RRegf32, f32); - TEST_REGCLS(RRegf64, f64); + TEST_REGCLS(RegPred, pred); + TEST_REGCLS(RegI16, b16); + TEST_REGCLS(RegI32, b32); + TEST_REGCLS(RegI64, b64); + TEST_REGCLS(RegF32, b32); + TEST_REGCLS(RegF64, b64); #undef TEST_REGCLS 
llvm_unreachable("Not in any register class!"); @@ -226,7 +226,7 @@ void PTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum, OS << *Mang->getSymbol(MO.getGlobal()); break; case MachineOperand::MO_Immediate: - OS << (int) MO.getImm(); + OS << (long) MO.getImm(); break; case MachineOperand::MO_MachineBasicBlock: OS << *MO.getMBB()->getSymbol(); @@ -308,34 +308,60 @@ void PTXAsmPrinter::EmitVariableDeclaration(const GlobalVariable *gv) { const PointerType* pointerTy = dyn_cast<const PointerType>(gv->getType()); const Type* elementTy = pointerTy->getElementType(); - assert(elementTy->isArrayTy() && "Only pointers to arrays are supported"); + decl += ".b8 "; + decl += gvsym->getName(); + decl += "["; - const ArrayType* arrayTy = dyn_cast<const ArrayType>(elementTy); - elementTy = arrayTy->getElementType(); + if (elementTy->isArrayTy()) + { + assert(elementTy->isArrayTy() && "Only pointers to arrays are supported"); - unsigned numElements = arrayTy->getNumElements(); + const ArrayType* arrayTy = dyn_cast<const ArrayType>(elementTy); + elementTy = arrayTy->getElementType(); - while (elementTy->isArrayTy()) { + unsigned numElements = arrayTy->getNumElements(); - arrayTy = dyn_cast<const ArrayType>(elementTy); - elementTy = arrayTy->getElementType(); + while (elementTy->isArrayTy()) { - numElements *= arrayTy->getNumElements(); - } + arrayTy = dyn_cast<const ArrayType>(elementTy); + elementTy = arrayTy->getElementType(); - // FIXME: isPrimitiveType() == false for i16? - assert(elementTy->isSingleValueType() && - "Non-primitive types are not handled"); + numElements *= arrayTy->getNumElements(); + } - // Compute the size of the array, in bytes. - uint64_t arraySize = (elementTy->getPrimitiveSizeInBits() >> 3) - * numElements; + // FIXME: isPrimitiveType() == false for i16? + assert(elementTy->isSingleValueType() && + "Non-primitive types are not handled"); + + // Compute the size of the array, in bytes. + uint64_t arraySize = (elementTy->getPrimitiveSizeInBits() >> 3) + * numElements; + + decl += utostr(arraySize); + } - decl += ".b8 "; - decl += gvsym->getName(); - decl += "["; - decl += utostr(arraySize); decl += "]"; + + // handle string constants (assume ConstantArray means string) + + if (gv->hasInitializer()) + { + Constant *C = gv->getInitializer(); + if (const ConstantArray *CA = dyn_cast<ConstantArray>(C)) + { + decl += " = {"; + + for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) + { + if (i > 0) decl += ","; + + decl += "0x" + + utohexstr(cast<ConstantInt>(CA->getOperand(i))->getZExtValue()); + } + + decl += "}"; + } + } } else { // Note: this is currently the fall-through case and most likely generates @@ -368,17 +394,23 @@ void PTXAsmPrinter::EmitFunctionDeclaration() { const PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>(); const bool isKernel = MFI->isKernel(); - unsigned reg; std::string decl = isKernel ? ".entry" : ".func"; - // Print return register - reg = MFI->retReg(); - if (!isKernel && reg != PTX::NoRegister) { - decl += " (.reg ."; // FIXME: could it return in .param space? 
- decl += getRegisterTypeName(reg); - decl += " "; - decl += getRegisterName(reg); + if (!isKernel) { + decl += " ("; + + for (PTXMachineFunctionInfo::ret_iterator + i = MFI->retRegBegin(), e = MFI->retRegEnd(), b = i; + i != e; ++i) { + if (i != b) { + decl += ", "; + } + decl += ".reg ."; + decl += getRegisterTypeName(*i); + decl += " "; + decl += getRegisterName(*i); + } decl += ")"; } @@ -386,40 +418,66 @@ void PTXAsmPrinter::EmitFunctionDeclaration() { decl += " "; decl += CurrentFnSym->getName().str(); - // Print parameter list - if (!MFI->argRegEmpty()) { - decl += " ("; + decl += " ("; + + unsigned cnt = 0; + + // Print parameters + for (PTXMachineFunctionInfo::reg_iterator + i = MFI->argRegBegin(), e = MFI->argRegEnd(), b = i; + i != e; ++i) { + if (i != b) { + decl += ", "; + } if (isKernel) { - unsigned cnt = 0; - for(PTXMachineFunctionInfo::reg_iterator - i = MFI->argRegBegin(), e = MFI->argRegEnd(), b = i; - i != e; ++i) { - reg = *i; - assert(reg != PTX::NoRegister && "Not a valid register!"); - if (i != b) - decl += ", "; - decl += ".param ."; - decl += getRegisterTypeName(reg); - decl += " "; - decl += PARAM_PREFIX; - decl += utostr(++cnt); - } + decl += ".param .b"; + decl += utostr(*i); + decl += " "; + decl += PARAM_PREFIX; + decl += utostr(++cnt); } else { - for (PTXMachineFunctionInfo::reg_iterator - i = MFI->argRegBegin(), e = MFI->argRegEnd(), b = i; - i != e; ++i) { - reg = *i; - assert(reg != PTX::NoRegister && "Not a valid register!"); - if (i != b) - decl += ", "; - decl += ".reg ."; - decl += getRegisterTypeName(reg); - decl += " "; - decl += getRegisterName(reg); - } + decl += ".reg ."; + decl += getRegisterTypeName(*i); + decl += " "; + decl += getRegisterName(*i); } - decl += ")"; } + decl += ")"; + + // // Print parameter list + // if (!MFI->argRegEmpty()) { + // decl += " ("; + // if (isKernel) { + // unsigned cnt = 0; + // for(PTXMachineFunctionInfo::reg_iterator + // i = MFI->argRegBegin(), e = MFI->argRegEnd(), b = i; + // i != e; ++i) { + // reg = *i; + // assert(reg != PTX::NoRegister && "Not a valid register!"); + // if (i != b) + // decl += ", "; + // decl += ".param ."; + // decl += getRegisterTypeName(reg); + // decl += " "; + // decl += PARAM_PREFIX; + // decl += utostr(++cnt); + // } + // } else { + // for (PTXMachineFunctionInfo::reg_iterator + // i = MFI->argRegBegin(), e = MFI->argRegEnd(), b = i; + // i != e; ++i) { + // reg = *i; + // assert(reg != PTX::NoRegister && "Not a valid register!"); + // if (i != b) + // decl += ", "; + // decl += ".reg ."; + // decl += getRegisterTypeName(reg); + // decl += " "; + // decl += getRegisterName(reg); + // } + // } + // decl += ")"; + // } OutStreamer.EmitRawText(Twine(decl)); } @@ -447,5 +505,6 @@ printPredicateOperand(const MachineInstr *MI, raw_ostream &O) { // Force static initialization. extern "C" void LLVMInitializePTXAsmPrinter() { - RegisterAsmPrinter<PTXAsmPrinter> X(ThePTXTarget); + RegisterAsmPrinter<PTXAsmPrinter> X(ThePTX32Target); + RegisterAsmPrinter<PTXAsmPrinter> Y(ThePTX64Target); } diff --git a/lib/Target/PTX/PTXCallingConv.td b/lib/Target/PTX/PTXCallingConv.td new file mode 100644 index 0000000..4d7759b --- /dev/null +++ b/lib/Target/PTX/PTXCallingConv.td @@ -0,0 +1,36 @@ +//===--- PTXCallingConv.td - Calling Conventions -----------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This describes the calling conventions for the PTX architecture. +// +//===----------------------------------------------------------------------===// + +// Currently, we reserve one register of each type for return values and let +// the rest be used for parameters. This is a dirty hack, but I am not sure +// how to tell LLVM that registers used for parameter passing cannot be used +// for return values. + +// PTX Calling Conventions +def CC_PTX : CallingConv<[ + CCIfType<[i1], CCAssignToReg<[P1, P2, P3, P4, P5, P6, P7]>>, + CCIfType<[i16], CCAssignToReg<[RH1, RH2, RH3, RH4, RH5, RH6, RH7]>>, + CCIfType<[i32, f32], CCAssignToReg<[R1, R2, R3, R4, R5, R6, R7]>>, + CCIfType<[i64, f64], CCAssignToReg<[RD1, RD2, RD3, RD4, RD5, RD6, RD7]>> +]>; + +//===----------------------------------------------------------------------===// +// Return Value Calling Conventions +//===----------------------------------------------------------------------===// + +def RetCC_PTX : CallingConv<[ + CCIfType<[i1], CCAssignToReg<[P0]>>, + CCIfType<[i16], CCAssignToReg<[RH0]>>, + CCIfType<[i32, f32], CCAssignToReg<[R0]>>, + CCIfType<[i64, f64], CCAssignToReg<[RD0]>> +]>; diff --git a/lib/Target/PTX/PTXISelLowering.cpp b/lib/Target/PTX/PTXISelLowering.cpp index 7187518..c3cdaba 100644 --- a/lib/Target/PTX/PTXISelLowering.cpp +++ b/lib/Target/PTX/PTXISelLowering.cpp @@ -16,6 +16,7 @@ #include "PTXMachineFunctionInfo.h" #include "PTXRegisterInfo.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" @@ -24,21 +25,43 @@ using namespace llvm; +//===----------------------------------------------------------------------===// +// Calling Convention Implementation +//===----------------------------------------------------------------------===// + +#include "PTXGenCallingConv.inc" + +//===----------------------------------------------------------------------===// +// TargetLowering Implementation +//===----------------------------------------------------------------------===// + PTXTargetLowering::PTXTargetLowering(TargetMachine &TM) : TargetLowering(TM, new TargetLoweringObjectFileELF()) { // Set up the register classes. 
- addRegisterClass(MVT::i1, PTX::PredsRegisterClass); - addRegisterClass(MVT::i16, PTX::RRegu16RegisterClass); - addRegisterClass(MVT::i32, PTX::RRegu32RegisterClass); - addRegisterClass(MVT::i64, PTX::RRegu64RegisterClass); - addRegisterClass(MVT::f32, PTX::RRegf32RegisterClass); - addRegisterClass(MVT::f64, PTX::RRegf64RegisterClass); + addRegisterClass(MVT::i1, PTX::RegPredRegisterClass); + addRegisterClass(MVT::i16, PTX::RegI16RegisterClass); + addRegisterClass(MVT::i32, PTX::RegI32RegisterClass); + addRegisterClass(MVT::i64, PTX::RegI64RegisterClass); + addRegisterClass(MVT::f32, PTX::RegF32RegisterClass); + addRegisterClass(MVT::f64, PTX::RegF64RegisterClass); + + setBooleanContents(ZeroOrOneBooleanContent); setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); setOperationAction(ISD::ConstantFP, MVT::f32, Legal); setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + // Turn i16 (z)extload into load + (z)extend + setLoadExtAction(ISD::EXTLOAD, MVT::i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Expand); + + // Turn f32 extload into load + fextend + setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + + // Turn f64 truncstore into trunc + store. + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + // Customize translation of memory addresses setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); @@ -46,14 +69,30 @@ PTXTargetLowering::PTXTargetLowering(TargetMachine &TM) // Expand BR_CC into BRCOND setOperationAction(ISD::BR_CC, MVT::Other, Expand); + // Expand SELECT_CC into SETCC + setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + + // need to lower SETCC of RegPred into bitwise logic + setOperationAction(ISD::SETCC, MVT::i1, Custom); + + setMinFunctionAlignment(2); + // Compute derived properties from the register classes computeRegisterProperties(); } +MVT::SimpleValueType PTXTargetLowering::getSetCCResultType(EVT VT) const { + return MVT::i1; +} + SDValue PTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Unimplemented operand"); + case ISD::SETCC: + return LowerSETCC(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); } @@ -78,6 +117,28 @@ const char *PTXTargetLowering::getTargetNodeName(unsigned Opcode) const { // Custom Lower Operation //===----------------------------------------------------------------------===// +SDValue PTXTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { + assert(Op.getValueType() == MVT::i1 && "SetCC type must be 1-bit integer"); + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue Op2 = Op.getOperand(2); + DebugLoc dl = Op.getDebugLoc(); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + + // Look for X == 0, X == 1, X != 0, or X != 1 + // We can simplify these to bitwise logic + + if (Op1.getOpcode() == ISD::Constant && + (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || + cast<ConstantSDNode>(Op1)->isNullValue()) && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + + return DAG.getNode(ISD::AND, dl, MVT::i1, Op0, Op1); + } + + return DAG.getNode(ISD::SETCC, dl, MVT::i1, Op0, Op1, Op2); +} + SDValue PTXTargetLowering:: LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(); @@ -111,12 +172,12 @@ struct argmap_entry { void reset() { loc = RC->begin(); } bool 
operator==(MVT::SimpleValueType _VT) const { return VT == _VT; } } argmap[] = { - argmap_entry(MVT::i1, PTX::PredsRegisterClass), - argmap_entry(MVT::i16, PTX::RRegu16RegisterClass), - argmap_entry(MVT::i32, PTX::RRegu32RegisterClass), - argmap_entry(MVT::i64, PTX::RRegu64RegisterClass), - argmap_entry(MVT::f32, PTX::RRegf32RegisterClass), - argmap_entry(MVT::f64, PTX::RRegf64RegisterClass) + argmap_entry(MVT::i1, PTX::RegPredRegisterClass), + argmap_entry(MVT::i16, PTX::RegI16RegisterClass), + argmap_entry(MVT::i32, PTX::RegI32RegisterClass), + argmap_entry(MVT::i64, PTX::RegI64RegisterClass), + argmap_entry(MVT::f32, PTX::RegF32RegisterClass), + argmap_entry(MVT::f64, PTX::RegF64RegisterClass) }; } // end anonymous namespace @@ -145,44 +206,72 @@ SDValue PTXTargetLowering:: break; } - // Make sure we don't add argument registers twice - if (MFI->isDoneAddArg()) - llvm_unreachable("cannot add argument registers twice"); - - // Reset argmap before allocation - for (struct argmap_entry *i = argmap, *e = argmap + array_lengthof(argmap); - i != e; ++ i) - i->reset(); - - for (int i = 0, e = Ins.size(); i != e; ++ i) { - MVT::SimpleValueType VT = Ins[i].VT.SimpleTy; + if (MFI->isKernel()) { + // For kernel functions, we just need to emit the proper READ_PARAM ISDs + for (unsigned i = 0, e = Ins.size(); i != e; ++i) { - struct argmap_entry *entry = std::find(argmap, - argmap + array_lengthof(argmap), VT); - if (entry == argmap + array_lengthof(argmap)) - llvm_unreachable("Type of argument is not supported"); + assert(Ins[i].VT != MVT::i1 && "Kernels cannot take pred operands"); - if (MFI->isKernel() && entry->RC == PTX::PredsRegisterClass) - llvm_unreachable("cannot pass preds to kernel"); + SDValue ArgValue = DAG.getNode(PTXISD::READ_PARAM, dl, Ins[i].VT, Chain, + DAG.getTargetConstant(i, MVT::i32)); + InVals.push_back(ArgValue); - MachineRegisterInfo &RegInfo = DAG.getMachineFunction().getRegInfo(); - - unsigned preg = *++(entry->loc); // allocate start from register 1 - unsigned vreg = RegInfo.createVirtualRegister(entry->RC); - RegInfo.addLiveIn(preg, vreg); - - MFI->addArgReg(preg); - - SDValue inval; - if (MFI->isKernel()) - inval = DAG.getNode(PTXISD::READ_PARAM, dl, VT, Chain, - DAG.getTargetConstant(i, MVT::i32)); - else - inval = DAG.getCopyFromReg(Chain, dl, vreg, VT); - InVals.push_back(inval); + // Instead of storing a physical register in our argument list, we just + // store the total size of the parameter, in bits. The ASM printer + // knows how to process this. 
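
[The kernel path above drops the old argmap allocator entirely: each formal argument becomes a PTXISD::READ_PARAM node keyed by its index, and addArgReg now records the parameter's size in bits rather than a physical register. A rough standalone model of that bookkeeping — struct and member names invented for illustration:]

#include <cstdio>
#include <vector>
// Models MFI->addArgReg(Ins[i].VT.getStoreSizeInBits()) for kernels.
struct KernelParamInfo {
  std::vector<unsigned> ParamBits; // per-parameter size in bits
  void addParam(unsigned Bits) { ParamBits.push_back(Bits); }
};
int main() {
  KernelParamInfo Info;
  Info.addParam(32); // an i32 kernel parameter
  Info.addParam(64); // an f64 kernel parameter
  for (size_t i = 0; i < Info.ParamBits.size(); ++i)
    printf("param %zu: %u bits\n", i, Info.ParamBits[i]);
  return 0;
}
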
+ MFI->addArgReg(Ins[i].VT.getStoreSizeInBits()); + } + } + else { + // For device functions, we use the PTX calling convention to do register + // assignments then create CopyFromReg ISDs for the allocated registers + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), ArgLocs, + *DAG.getContext()); + + CCInfo.AnalyzeFormalArguments(Ins, CC_PTX); + + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + + CCValAssign& VA = ArgLocs[i]; + EVT RegVT = VA.getLocVT(); + TargetRegisterClass* TRC = 0; + + assert(VA.isRegLoc() && "CCValAssign must be RegLoc"); + + // Determine which register class we need + if (RegVT == MVT::i1) { + TRC = PTX::RegPredRegisterClass; + } + else if (RegVT == MVT::i16) { + TRC = PTX::RegI16RegisterClass; + } + else if (RegVT == MVT::i32) { + TRC = PTX::RegI32RegisterClass; + } + else if (RegVT == MVT::i64) { + TRC = PTX::RegI64RegisterClass; + } + else if (RegVT == MVT::f32) { + TRC = PTX::RegF32RegisterClass; + } + else if (RegVT == MVT::f64) { + TRC = PTX::RegF64RegisterClass; + } + else { + llvm_unreachable("Unknown parameter type"); + } + + unsigned Reg = MF.getRegInfo().createVirtualRegister(TRC); + MF.getRegInfo().addLiveIn(VA.getLocReg(), Reg); + + SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); + InVals.push_back(ArgValue); + + MFI->addArgReg(VA.getLocReg()); + } } - - MFI->doneAddArg(); return Chain; } @@ -204,51 +293,43 @@ SDValue PTXTargetLowering:: assert(Outs.size() == 0 && "Kernel must return void."); return DAG.getNode(PTXISD::EXIT, dl, MVT::Other, Chain); case CallingConv::PTX_Device: - assert(Outs.size() <= 1 && "Can at most return one value."); + //assert(Outs.size() <= 1 && "Can at most return one value."); break; } - // PTX_Device - - // return void - if (Outs.size() == 0) - return DAG.getNode(PTXISD::RET, dl, MVT::Other, Chain); + MachineFunction& MF = DAG.getMachineFunction(); + PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>(); + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); SDValue Flag; - unsigned reg; - if (Outs[0].VT == MVT::i16) { - reg = PTX::RH0; - } - else if (Outs[0].VT == MVT::i32) { - reg = PTX::R0; - } - else if (Outs[0].VT == MVT::i64) { - reg = PTX::RD0; - } - else if (Outs[0].VT == MVT::f32) { - reg = PTX::F0; - } - else { - assert(Outs[0].VT == MVT::f64 && "Can return only basic types"); - reg = PTX::FD0; - } + CCInfo.AnalyzeReturn(Outs, RetCC_PTX); - MachineFunction &MF = DAG.getMachineFunction(); - PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>(); - MFI->setRetReg(reg); + for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { + + CCValAssign& VA = RVLocs[i]; + + assert(VA.isRegLoc() && "CCValAssign must be RegLoc"); + + unsigned Reg = VA.getLocReg(); - // If this is the first return lowered for this function, add the regs to the - // liveout set for the function - if (DAG.getMachineFunction().getRegInfo().liveout_empty()) - DAG.getMachineFunction().getRegInfo().addLiveOut(reg); + DAG.getMachineFunction().getRegInfo().addLiveOut(Reg); - // Copy the result values into the output registers - Chain = DAG.getCopyToReg(Chain, dl, reg, OutVals[0], Flag); + Chain = DAG.getCopyToReg(Chain, dl, Reg, OutVals[i], Flag); - // Guarantee that all emitted copies are stuck together, - // avoiding something bad - Flag = Chain.getValue(1); + // Guarantee that all emitted copies are stuck together, + // avoiding something bad + Flag = 
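
[Two things worth noting in the device-function paths above: the RegVT dispatch picks one register class per value type (a lookup table would serve equally well; sketch below with plain enums standing in for MVT), and in LowerReturn the Flag value glues each CopyToReg to the next so the scheduler cannot separate the return-value copies from the final RET.]

#include <cstdio>
// Standalone model of the RegVT -> register-class dispatch (sketch; the
// enum mirrors the MVT cases the patch handles).
enum SimpleVT { vt_i1, vt_i16, vt_i32, vt_i64, vt_f32, vt_f64 };
static const char *regClassFor(SimpleVT VT) {
  switch (VT) {
  case vt_i1:  return "RegPred";
  case vt_i16: return "RegI16";
  case vt_i32: return "RegI32";
  case vt_i64: return "RegI64";
  case vt_f32: return "RegF32";
  case vt_f64: return "RegF64";
  }
  return "<unsupported>"; // the real code hits llvm_unreachable here
}
int main() { printf("%s\n", regClassFor(vt_f32)); return 0; }
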
Chain.getValue(1); - return DAG.getNode(PTXISD::RET, dl, MVT::Other, Chain, Flag); + MFI->addRetReg(Reg); + } + + if (Flag.getNode() == 0) { + return DAG.getNode(PTXISD::RET, dl, MVT::Other, Chain); + } + else { + return DAG.getNode(PTXISD::RET, dl, MVT::Other, Chain, Flag); + } } diff --git a/lib/Target/PTX/PTXISelLowering.h b/lib/Target/PTX/PTXISelLowering.h index c69c416..ead17ed 100644 --- a/lib/Target/PTX/PTXISelLowering.h +++ b/lib/Target/PTX/PTXISelLowering.h @@ -37,11 +37,10 @@ class PTXTargetLowering : public TargetLowering { virtual const char *getTargetNodeName(unsigned Opcode) const; - virtual unsigned getFunctionAlignment(const Function *F) const { - return 2; } - virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + virtual SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + virtual SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, @@ -60,6 +59,8 @@ class PTXTargetLowering : public TargetLowering { DebugLoc dl, SelectionDAG &DAG) const; + virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const; + private: SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; }; // class PTXTargetLowering diff --git a/lib/Target/PTX/PTXInstrFormats.td b/lib/Target/PTX/PTXInstrFormats.td index e4e0999..8cee351 100644 --- a/lib/Target/PTX/PTXInstrFormats.td +++ b/lib/Target/PTX/PTXInstrFormats.td @@ -9,7 +9,7 @@ // PTX Predicate operand, default to (0, 0) = (zero-reg, always). // Leave PrintMethod empty; predicate printing is defined elsewhere. -def pred : PredicateOperand<OtherVT, (ops Preds, i32imm), +def pred : PredicateOperand<OtherVT, (ops RegPred, i32imm), (ops (i1 zero_reg), (i32 0))>; let Namespace = "PTX" in { diff --git a/lib/Target/PTX/PTXInstrInfo.cpp b/lib/Target/PTX/PTXInstrInfo.cpp index a12a6d0..c305c05 100644 --- a/lib/Target/PTX/PTXInstrInfo.cpp +++ b/lib/Target/PTX/PTXInstrInfo.cpp @@ -33,12 +33,12 @@ static const struct map_entry { const TargetRegisterClass *cls; const int opcode; } map[] = { - { &PTX::RRegu16RegClass, PTX::MOVU16rr }, - { &PTX::RRegu32RegClass, PTX::MOVU32rr }, - { &PTX::RRegu64RegClass, PTX::MOVU64rr }, - { &PTX::RRegf32RegClass, PTX::MOVF32rr }, - { &PTX::RRegf64RegClass, PTX::MOVF64rr }, - { &PTX::PredsRegClass, PTX::MOVPREDrr } + { &PTX::RegI16RegClass, PTX::MOVU16rr }, + { &PTX::RegI32RegClass, PTX::MOVU32rr }, + { &PTX::RegI64RegClass, PTX::MOVU64rr }, + { &PTX::RegF32RegClass, PTX::MOVF32rr }, + { &PTX::RegF64RegClass, PTX::MOVF64rr }, + { &PTX::RegPredRegClass, PTX::MOVPREDrr } }; void PTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB, @@ -155,7 +155,7 @@ DefinesPredicate(MachineInstr *MI, const MachineOperand &MO = MI->getOperand(0); - if (!MO.isReg() || RI.getRegClass(MO.getReg()) != &PTX::PredsRegClass) + if (!MO.isReg() || RI.getRegClass(MO.getReg()) != &PTX::RegPredRegClass) return false; Pred.push_back(MO); diff --git a/lib/Target/PTX/PTXInstrInfo.td b/lib/Target/PTX/PTXInstrInfo.td index c075512..71f7cc3 100644 --- a/lib/Target/PTX/PTXInstrInfo.td +++ b/lib/Target/PTX/PTXInstrInfo.td @@ -22,8 +22,8 @@ include "PTXInstrFormats.td" //===----------------------------------------------------------------------===// // Addressing -def Use32BitAddresses : Predicate<"!getSubtarget().use64BitAddresses()">; -def Use64BitAddresses : Predicate<"getSubtarget().use64BitAddresses()">; +def Use32BitAddresses : Predicate<"!getSubtarget().is64Bit()">; +def Use64BitAddresses : Predicate<"getSubtarget().is64Bit()">; // Shader Model Support def SupportsSM13 : Predicate<"getSubtarget().supportsSM13()">; @@ 
-36,6 +36,12 @@ def SupportsPTX21 : Predicate<"getSubtarget().supportsPTX21()">; def DoesNotSupportPTX21 : Predicate<"!getSubtarget().supportsPTX21()">; def SupportsPTX22 : Predicate<"getSubtarget().supportsPTX22()">; def DoesNotSupportPTX22 : Predicate<"!getSubtarget().supportsPTX22()">; +def SupportsPTX23 : Predicate<"getSubtarget().supportsPTX23()">; +def DoesNotSupportPTX23 : Predicate<"!getSubtarget().supportsPTX23()">; + +// Fused-Multiply Add +def SupportsFMA : Predicate<"getSubtarget().supportsFMA()">; +def DoesNotSupportFMA : Predicate<"!getSubtarget().supportsFMA()">; //===----------------------------------------------------------------------===// // Instruction Pattern Stuff @@ -137,11 +143,11 @@ def ADDRii64 : ComplexPattern<i64, 2, "SelectADDRii", [], []>; // Address operands def MEMri32 : Operand<i32> { let PrintMethod = "printMemOperand"; - let MIOperandInfo = (ops RRegu32, i32imm); + let MIOperandInfo = (ops RegI32, i32imm); } def MEMri64 : Operand<i64> { let PrintMethod = "printMemOperand"; - let MIOperandInfo = (ops RRegu64, i64imm); + let MIOperandInfo = (ops RegI64, i64imm); } def MEMii32 : Operand<i32> { let PrintMethod = "printMemOperand"; @@ -182,209 +188,312 @@ def PTXcopyaddress // Instruction Class Templates //===----------------------------------------------------------------------===// +//===- Floating-Point Instructions - 2 Operand Form -----------------------===// +multiclass PTX_FLOAT_2OP<string opcstr, SDNode opnode> { + def rr32 : InstPTX<(outs RegF32:$d), + (ins RegF32:$a), + !strconcat(opcstr, ".f32\t$d, $a"), + [(set RegF32:$d, (opnode RegF32:$a))]>; + def ri32 : InstPTX<(outs RegF32:$d), + (ins f32imm:$a), + !strconcat(opcstr, ".f32\t$d, $a"), + [(set RegF32:$d, (opnode fpimm:$a))]>; + def rr64 : InstPTX<(outs RegF64:$d), + (ins RegF64:$a), + !strconcat(opcstr, ".f64\t$d, $a"), + [(set RegF64:$d, (opnode RegF64:$a))]>; + def ri64 : InstPTX<(outs RegF64:$d), + (ins f64imm:$a), + !strconcat(opcstr, ".f64\t$d, $a"), + [(set RegF64:$d, (opnode fpimm:$a))]>; +} + //===- Floating-Point Instructions - 3 Operand Form -----------------------===// multiclass PTX_FLOAT_3OP<string opcstr, SDNode opnode> { - def rr32 : InstPTX<(outs RRegf32:$d), - (ins RRegf32:$a, RRegf32:$b), + def rr32 : InstPTX<(outs RegF32:$d), + (ins RegF32:$a, RegF32:$b), !strconcat(opcstr, ".f32\t$d, $a, $b"), - [(set RRegf32:$d, (opnode RRegf32:$a, RRegf32:$b))]>; - def ri32 : InstPTX<(outs RRegf32:$d), - (ins RRegf32:$a, f32imm:$b), + [(set RegF32:$d, (opnode RegF32:$a, RegF32:$b))]>; + def ri32 : InstPTX<(outs RegF32:$d), + (ins RegF32:$a, f32imm:$b), !strconcat(opcstr, ".f32\t$d, $a, $b"), - [(set RRegf32:$d, (opnode RRegf32:$a, fpimm:$b))]>; - def rr64 : InstPTX<(outs RRegf64:$d), - (ins RRegf64:$a, RRegf64:$b), + [(set RegF32:$d, (opnode RegF32:$a, fpimm:$b))]>; + def rr64 : InstPTX<(outs RegF64:$d), + (ins RegF64:$a, RegF64:$b), !strconcat(opcstr, ".f64\t$d, $a, $b"), - [(set RRegf64:$d, (opnode RRegf64:$a, RRegf64:$b))]>; - def ri64 : InstPTX<(outs RRegf64:$d), - (ins RRegf64:$a, f64imm:$b), + [(set RegF64:$d, (opnode RegF64:$a, RegF64:$b))]>; + def ri64 : InstPTX<(outs RegF64:$d), + (ins RegF64:$a, f64imm:$b), !strconcat(opcstr, ".f64\t$d, $a, $b"), - [(set RRegf64:$d, (opnode RRegf64:$a, fpimm:$b))]>; + [(set RegF64:$d, (opnode RegF64:$a, fpimm:$b))]>; } //===- Floating-Point Instructions - 4 Operand Form -----------------------===// multiclass PTX_FLOAT_4OP<string opcstr, SDNode opnode1, SDNode opnode2> { - def rrr32 : InstPTX<(outs RRegf32:$d), - (ins RRegf32:$a, RRegf32:$b, 
RRegf32:$c), + def rrr32 : InstPTX<(outs RegF32:$d), + (ins RegF32:$a, RegF32:$b, RegF32:$c), !strconcat(opcstr, ".f32\t$d, $a, $b, $c"), - [(set RRegf32:$d, (opnode2 (opnode1 RRegf32:$a, - RRegf32:$b), - RRegf32:$c))]>; - def rri32 : InstPTX<(outs RRegf32:$d), - (ins RRegf32:$a, RRegf32:$b, f32imm:$c), + [(set RegF32:$d, (opnode2 (opnode1 RegF32:$a, + RegF32:$b), + RegF32:$c))]>; + def rri32 : InstPTX<(outs RegF32:$d), + (ins RegF32:$a, RegF32:$b, f32imm:$c), !strconcat(opcstr, ".f32\t$d, $a, $b, $c"), - [(set RRegf32:$d, (opnode2 (opnode1 RRegf32:$a, - RRegf32:$b), + [(set RegF32:$d, (opnode2 (opnode1 RegF32:$a, + RegF32:$b), fpimm:$c))]>; - def rrr64 : InstPTX<(outs RRegf64:$d), - (ins RRegf64:$a, RRegf64:$b, RRegf64:$c), + def rrr64 : InstPTX<(outs RegF64:$d), + (ins RegF64:$a, RegF64:$b, RegF64:$c), !strconcat(opcstr, ".f64\t$d, $a, $b, $c"), - [(set RRegf64:$d, (opnode2 (opnode1 RRegf64:$a, - RRegf64:$b), - RRegf64:$c))]>; - def rri64 : InstPTX<(outs RRegf64:$d), - (ins RRegf64:$a, RRegf64:$b, f64imm:$c), + [(set RegF64:$d, (opnode2 (opnode1 RegF64:$a, + RegF64:$b), + RegF64:$c))]>; + def rri64 : InstPTX<(outs RegF64:$d), + (ins RegF64:$a, RegF64:$b, f64imm:$c), !strconcat(opcstr, ".f64\t$d, $a, $b, $c"), - [(set RRegf64:$d, (opnode2 (opnode1 RRegf64:$a, - RRegf64:$b), + [(set RegF64:$d, (opnode2 (opnode1 RegF64:$a, + RegF64:$b), fpimm:$c))]>; } multiclass INT3<string opcstr, SDNode opnode> { - def rr16 : InstPTX<(outs RRegu16:$d), - (ins RRegu16:$a, RRegu16:$b), + def rr16 : InstPTX<(outs RegI16:$d), + (ins RegI16:$a, RegI16:$b), !strconcat(opcstr, ".u16\t$d, $a, $b"), - [(set RRegu16:$d, (opnode RRegu16:$a, RRegu16:$b))]>; - def ri16 : InstPTX<(outs RRegu16:$d), - (ins RRegu16:$a, i16imm:$b), + [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>; + def ri16 : InstPTX<(outs RegI16:$d), + (ins RegI16:$a, i16imm:$b), !strconcat(opcstr, ".u16\t$d, $a, $b"), - [(set RRegu16:$d, (opnode RRegu16:$a, imm:$b))]>; - def rr32 : InstPTX<(outs RRegu32:$d), - (ins RRegu32:$a, RRegu32:$b), + [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>; + def rr32 : InstPTX<(outs RegI32:$d), + (ins RegI32:$a, RegI32:$b), !strconcat(opcstr, ".u32\t$d, $a, $b"), - [(set RRegu32:$d, (opnode RRegu32:$a, RRegu32:$b))]>; - def ri32 : InstPTX<(outs RRegu32:$d), - (ins RRegu32:$a, i32imm:$b), + [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>; + def ri32 : InstPTX<(outs RegI32:$d), + (ins RegI32:$a, i32imm:$b), !strconcat(opcstr, ".u32\t$d, $a, $b"), - [(set RRegu32:$d, (opnode RRegu32:$a, imm:$b))]>; - def rr64 : InstPTX<(outs RRegu64:$d), - (ins RRegu64:$a, RRegu64:$b), + [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>; + def rr64 : InstPTX<(outs RegI64:$d), + (ins RegI64:$a, RegI64:$b), !strconcat(opcstr, ".u64\t$d, $a, $b"), - [(set RRegu64:$d, (opnode RRegu64:$a, RRegu64:$b))]>; - def ri64 : InstPTX<(outs RRegu64:$d), - (ins RRegu64:$a, i64imm:$b), + [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>; + def ri64 : InstPTX<(outs RegI64:$d), + (ins RegI64:$a, i64imm:$b), !strconcat(opcstr, ".u64\t$d, $a, $b"), - [(set RRegu64:$d, (opnode RRegu64:$a, imm:$b))]>; + [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>; } multiclass PTX_LOGIC<string opcstr, SDNode opnode> { - def rr16 : InstPTX<(outs RRegu16:$d), - (ins RRegu16:$a, RRegu16:$b), + def ripreds : InstPTX<(outs RegPred:$d), + (ins RegPred:$a, i1imm:$b), + !strconcat(opcstr, ".pred\t$d, $a, $b"), + [(set RegPred:$d, (opnode RegPred:$a, imm:$b))]>; + def rrpreds : InstPTX<(outs RegPred:$d), + (ins RegPred:$a, RegPred:$b), + !strconcat(opcstr, ".pred\t$d, $a, $b"), + 
[(set RegPred:$d, (opnode RegPred:$a, RegPred:$b))]>; + def rr16 : InstPTX<(outs RegI16:$d), + (ins RegI16:$a, RegI16:$b), !strconcat(opcstr, ".b16\t$d, $a, $b"), - [(set RRegu16:$d, (opnode RRegu16:$a, RRegu16:$b))]>; - def ri16 : InstPTX<(outs RRegu16:$d), - (ins RRegu16:$a, i16imm:$b), + [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>; + def ri16 : InstPTX<(outs RegI16:$d), + (ins RegI16:$a, i16imm:$b), !strconcat(opcstr, ".b16\t$d, $a, $b"), - [(set RRegu16:$d, (opnode RRegu16:$a, imm:$b))]>; - def rr32 : InstPTX<(outs RRegu32:$d), - (ins RRegu32:$a, RRegu32:$b), + [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>; + def rr32 : InstPTX<(outs RegI32:$d), + (ins RegI32:$a, RegI32:$b), !strconcat(opcstr, ".b32\t$d, $a, $b"), - [(set RRegu32:$d, (opnode RRegu32:$a, RRegu32:$b))]>; - def ri32 : InstPTX<(outs RRegu32:$d), - (ins RRegu32:$a, i32imm:$b), + [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>; + def ri32 : InstPTX<(outs RegI32:$d), + (ins RegI32:$a, i32imm:$b), !strconcat(opcstr, ".b32\t$d, $a, $b"), - [(set RRegu32:$d, (opnode RRegu32:$a, imm:$b))]>; - def rr64 : InstPTX<(outs RRegu64:$d), - (ins RRegu64:$a, RRegu64:$b), + [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>; + def rr64 : InstPTX<(outs RegI64:$d), + (ins RegI64:$a, RegI64:$b), !strconcat(opcstr, ".b64\t$d, $a, $b"), - [(set RRegu64:$d, (opnode RRegu64:$a, RRegu64:$b))]>; - def ri64 : InstPTX<(outs RRegu64:$d), - (ins RRegu64:$a, i64imm:$b), + [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>; + def ri64 : InstPTX<(outs RegI64:$d), + (ins RegI64:$a, i64imm:$b), !strconcat(opcstr, ".b64\t$d, $a, $b"), - [(set RRegu64:$d, (opnode RRegu64:$a, imm:$b))]>; + [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>; } multiclass INT3ntnc<string opcstr, SDNode opnode> { - def rr16 : InstPTX<(outs RRegu16:$d), - (ins RRegu16:$a, RRegu16:$b), + def rr16 : InstPTX<(outs RegI16:$d), + (ins RegI16:$a, RegI16:$b), !strconcat(opcstr, "16\t$d, $a, $b"), - [(set RRegu16:$d, (opnode RRegu16:$a, RRegu16:$b))]>; - def rr32 : InstPTX<(outs RRegu32:$d), - (ins RRegu32:$a, RRegu32:$b), + [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>; + def rr32 : InstPTX<(outs RegI32:$d), + (ins RegI32:$a, RegI32:$b), !strconcat(opcstr, "32\t$d, $a, $b"), - [(set RRegu32:$d, (opnode RRegu32:$a, RRegu32:$b))]>; - def rr64 : InstPTX<(outs RRegu64:$d), - (ins RRegu64:$a, RRegu64:$b), + [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>; + def rr64 : InstPTX<(outs RegI64:$d), + (ins RegI64:$a, RegI64:$b), !strconcat(opcstr, "64\t$d, $a, $b"), - [(set RRegu64:$d, (opnode RRegu64:$a, RRegu64:$b))]>; - def ri16 : InstPTX<(outs RRegu16:$d), - (ins RRegu16:$a, i16imm:$b), + [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>; + def ri16 : InstPTX<(outs RegI16:$d), + (ins RegI16:$a, i16imm:$b), !strconcat(opcstr, "16\t$d, $a, $b"), - [(set RRegu16:$d, (opnode RRegu16:$a, imm:$b))]>; - def ri32 : InstPTX<(outs RRegu32:$d), - (ins RRegu32:$a, i32imm:$b), + [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>; + def ri32 : InstPTX<(outs RegI32:$d), + (ins RegI32:$a, i32imm:$b), !strconcat(opcstr, "32\t$d, $a, $b"), - [(set RRegu32:$d, (opnode RRegu32:$a, imm:$b))]>; - def ri64 : InstPTX<(outs RRegu64:$d), - (ins RRegu64:$a, i64imm:$b), + [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>; + def ri64 : InstPTX<(outs RegI64:$d), + (ins RegI64:$a, i64imm:$b), !strconcat(opcstr, "64\t$d, $a, $b"), - [(set RRegu64:$d, (opnode RRegu64:$a, imm:$b))]>; - def ir16 : InstPTX<(outs RRegu16:$d), - (ins i16imm:$a, RRegu16:$b), + [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>; + def ir16 : InstPTX<(outs 
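
[The _and_r/_or_r/_xor_r variants in PTX_SETP_I above let instruction selection fold a setcc followed by a boolean op on an existing predicate into a single setp. A standalone model of the semantics being matched, e.g. setp.lt.and.u32 p, a, b, c:]

#include <cstdint>
#include <cstdio>
// Model of setp.lt.and.u32 p, a, b, c: p = (a < b) & c (sketch).
static bool setp_lt_and_u32(uint32_t a, uint32_t b, bool c) {
  return (a < b) && c;
}
// The *_not_r forms negate the incoming predicate: p = (a < b) & !c.
static bool setp_lt_and_not_u32(uint32_t a, uint32_t b, bool c) {
  return (a < b) && !c;
}
int main() {
  printf("%d %d\n", setp_lt_and_u32(1, 2, true),       // 1
                    setp_lt_and_not_u32(1, 2, true));  // 0
  return 0;
}
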
RegI16:$d), + (ins i16imm:$a, RegI16:$b), !strconcat(opcstr, "16\t$d, $a, $b"), - [(set RRegu16:$d, (opnode imm:$a, RRegu16:$b))]>; - def ir32 : InstPTX<(outs RRegu32:$d), - (ins i32imm:$a, RRegu32:$b), + [(set RegI16:$d, (opnode imm:$a, RegI16:$b))]>; + def ir32 : InstPTX<(outs RegI32:$d), + (ins i32imm:$a, RegI32:$b), !strconcat(opcstr, "32\t$d, $a, $b"), - [(set RRegu32:$d, (opnode imm:$a, RRegu32:$b))]>; - def ir64 : InstPTX<(outs RRegu64:$d), - (ins i64imm:$a, RRegu64:$b), + [(set RegI32:$d, (opnode imm:$a, RegI32:$b))]>; + def ir64 : InstPTX<(outs RegI64:$d), + (ins i64imm:$a, RegI64:$b), !strconcat(opcstr, "64\t$d, $a, $b"), - [(set RRegu64:$d, (opnode imm:$a, RRegu64:$b))]>; + [(set RegI64:$d, (opnode imm:$a, RegI64:$b))]>; } -multiclass PTX_SETP<RegisterClass RC, string regclsname, Operand immcls, +multiclass PTX_SETP_I<RegisterClass RC, string regclsname, Operand immcls, CondCode cmp, string cmpstr> { - // TODO 1. support floating-point 2. support 5-operand format: p|q, a, b, c + // TODO support 5-operand format: p|q, a, b, c def rr - : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b), + : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b), !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"), - [(set Preds:$p, (setcc RC:$a, RC:$b, cmp))]>; + [(set RegPred:$p, (setcc RC:$a, RC:$b, cmp))]>; def ri - : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b), + : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b), !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"), - [(set Preds:$p, (setcc RC:$a, imm:$b, cmp))]>; + [(set RegPred:$p, (setcc RC:$a, imm:$b, cmp))]>; def rr_and_r - : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, $c"), - [(set Preds:$p, (and (setcc RC:$a, RC:$b, cmp), Preds:$c))]>; + [(set RegPred:$p, (and (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>; def ri_and_r - : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c), + : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c), !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, $c"), - [(set Preds:$p, (and (setcc RC:$a, imm:$b, cmp), Preds:$c))]>; + [(set RegPred:$p, (and (setcc RC:$a, imm:$b, cmp), RegPred:$c))]>; def rr_or_r - : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, $c"), - [(set Preds:$p, (or (setcc RC:$a, RC:$b, cmp), Preds:$c))]>; + [(set RegPred:$p, (or (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>; def ri_or_r - : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c), + : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c), !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, $c"), - [(set Preds:$p, (or (setcc RC:$a, imm:$b, cmp), Preds:$c))]>; + [(set RegPred:$p, (or (setcc RC:$a, imm:$b, cmp), RegPred:$c))]>; def rr_xor_r - : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, $c"), - [(set Preds:$p, (xor (setcc RC:$a, RC:$b, cmp), Preds:$c))]>; + [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>; def ri_xor_r - : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c), + : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c), !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, $c"), - [(set Preds:$p, (xor (setcc RC:$a, imm:$b, 
cmp), Preds:$c))]>; + [(set RegPred:$p, (xor (setcc RC:$a, imm:$b, cmp), RegPred:$c))]>; def rr_and_not_r - : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, !$c"), - [(set Preds:$p, (and (setcc RC:$a, RC:$b, cmp), (not Preds:$c)))]>; + [(set RegPred:$p, (and (setcc RC:$a, RC:$b, cmp), (not RegPred:$c)))]>; def ri_and_not_r - : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c), + : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c), !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, !$c"), - [(set Preds:$p, (and (setcc RC:$a, imm:$b, cmp), (not Preds:$c)))]>; + [(set RegPred:$p, (and (setcc RC:$a, imm:$b, cmp), (not RegPred:$c)))]>; def rr_or_not_r - : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, !$c"), - [(set Preds:$p, (or (setcc RC:$a, RC:$b, cmp), (not Preds:$c)))]>; + [(set RegPred:$p, (or (setcc RC:$a, RC:$b, cmp), (not RegPred:$c)))]>; def ri_or_not_r - : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c), + : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c), !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, !$c"), - [(set Preds:$p, (or (setcc RC:$a, imm:$b, cmp), (not Preds:$c)))]>; + [(set RegPred:$p, (or (setcc RC:$a, imm:$b, cmp), (not RegPred:$c)))]>; def rr_xor_not_r - : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, !$c"), - [(set Preds:$p, (xor (setcc RC:$a, RC:$b, cmp), (not Preds:$c)))]>; + [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, cmp), (not RegPred:$c)))]>; def ri_xor_not_r - : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c), + : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c), + !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, !$c"), + [(set RegPred:$p, (xor (setcc RC:$a, imm:$b, cmp), (not RegPred:$c)))]>; +} + +multiclass PTX_SETP_FP<RegisterClass RC, string regclsname, + CondCode ucmp, CondCode ocmp, string cmpstr> { + // TODO support 5-operand format: p|q, a, b, c + + def rr_u + : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b), + !strconcat("setp.", cmpstr, "u.", regclsname, "\t$p, $a, $b"), + [(set RegPred:$p, (setcc RC:$a, RC:$b, ucmp))]>; + def rr_o + : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b), + !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"), + [(set RegPred:$p, (setcc RC:$a, RC:$b, ocmp))]>; + + def rr_and_r_u + : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), + !strconcat("setp.", cmpstr, "u.and.", regclsname, "\t$p, $a, $b, $c"), + [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ucmp), RegPred:$c))]>; + def rr_and_r_o + : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), + !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, $c"), + [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ocmp), RegPred:$c))]>; + + def rr_or_r_u + : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), + !strconcat("setp.", cmpstr, "u.or.", regclsname, "\t$p, $a, $b, $c"), + [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ucmp), RegPred:$c))]>; + def rr_or_r_o + : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), + !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, $c"), + [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ocmp), 
RegPred:$c))]>; + + def rr_xor_r_u + : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), + !strconcat("setp.", cmpstr, "u.xor.", regclsname, "\t$p, $a, $b, $c"), + [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ucmp), RegPred:$c))]>; + def rr_xor_r_o + : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), + !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, $c"), + [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ocmp), RegPred:$c))]>; + + def rr_and_not_r_u + : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), + !strconcat("setp.", cmpstr, "u.and.", regclsname, "\t$p, $a, $b, !$c"), + [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ucmp), (not RegPred:$c)))]>; + def rr_and_not_r_o + : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), + !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, !$c"), + [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ocmp), (not RegPred:$c)))]>; + + def rr_or_not_r_u + : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), + !strconcat("setp.", cmpstr, "u.or.", regclsname, "\t$p, $a, $b, !$c"), + [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ucmp), (not RegPred:$c)))]>; + def rr_or_not_r_o + : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), + !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, !$c"), + [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ocmp), (not RegPred:$c)))]>; + + def rr_xor_not_r_u + : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), + !strconcat("setp.", cmpstr, "u.xor.", regclsname, "\t$p, $a, $b, !$c"), + [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ucmp), (not RegPred:$c)))]>; + def rr_xor_not_r_o + : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, !$c"), - [(set Preds:$p, (xor (setcc RC:$a, imm:$b, cmp), (not Preds:$c)))]>; + [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ocmp), (not RegPred:$c)))]>; +} + +multiclass PTX_SELP<RegisterClass RC, string regclsname> { + def rr + : InstPTX<(outs RC:$r), (ins RegPred:$a, RC:$b, RC:$c), + !strconcat("selp.", regclsname, "\t$r, $b, $c, $a"), + [(set RC:$r, (select RegPred:$a, RC:$b, RC:$c))]>; } multiclass PTX_LD<string opstr, string typestr, RegisterClass RC, PatFrag pat_load> { @@ -415,11 +524,11 @@ multiclass PTX_LD<string opstr, string typestr, RegisterClass RC, PatFrag pat_lo } multiclass PTX_LD_ALL<string opstr, PatFrag pat_load> { - defm u16 : PTX_LD<opstr, ".u16", RRegu16, pat_load>; - defm u32 : PTX_LD<opstr, ".u32", RRegu32, pat_load>; - defm u64 : PTX_LD<opstr, ".u64", RRegu64, pat_load>; - defm f32 : PTX_LD<opstr, ".f32", RRegf32, pat_load>; - defm f64 : PTX_LD<opstr, ".f64", RRegf64, pat_load>; + defm u16 : PTX_LD<opstr, ".u16", RegI16, pat_load>; + defm u32 : PTX_LD<opstr, ".u32", RegI32, pat_load>; + defm u64 : PTX_LD<opstr, ".u64", RegI64, pat_load>; + defm f32 : PTX_LD<opstr, ".f32", RegF32, pat_load>; + defm f64 : PTX_LD<opstr, ".f64", RegF64, pat_load>; } multiclass PTX_ST<string opstr, string typestr, RegisterClass RC, PatFrag pat_store> { @@ -450,11 +559,11 @@ multiclass PTX_ST<string opstr, string typestr, RegisterClass RC, PatFrag pat_st } multiclass PTX_ST_ALL<string opstr, PatFrag pat_store> { - defm u16 : PTX_ST<opstr, ".u16", RRegu16, pat_store>; - defm u32 : PTX_ST<opstr, ".u32", RRegu32, pat_store>; - defm u64 : PTX_ST<opstr, ".u64", RRegu64, pat_store>; - defm f32 : PTX_ST<opstr, ".f32", RRegf32, pat_store>; - defm f64 : PTX_ST<opstr, ".f64", RRegf64, pat_store>; + defm u16 : PTX_ST<opstr, ".u16", RegI16, pat_store>; + defm 
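
[PTX_SETP_FP above pairs each comparison with an ordered and an unordered PTX form: the plain spelling (setp.lt.f32) is false when either input is NaN, while the u-suffixed spelling (setp.ltu.f32) is true. PTX_SELP, in turn, maps LLVM's select onto selp. Standalone models of both semantics (sketch):]

#include <cmath>
#include <cstdint>
#include <cstdio>
// Unordered compare: true if either operand is NaN (setp.ltu.f32).
static bool setp_ltu_f32(float a, float b) {
  return std::isnan(a) || std::isnan(b) || a < b;
}
// selp.u32 d, a, b, c: d = c ? a : b.
static uint32_t selp_u32(uint32_t a, uint32_t b, bool c) {
  return c ? a : b;
}
int main() {
  printf("%d\n", setp_ltu_f32(NAN, 1.0f)); // 1: NaN makes it true
  printf("%u\n", selp_u32(10, 20, false)); // 20
  return 0;
}
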
u32 : PTX_ST<opstr, ".u32", RegI32, pat_store>; + defm u64 : PTX_ST<opstr, ".u64", RegI64, pat_store>; + defm f32 : PTX_ST<opstr, ".f32", RegF32, pat_store>; + defm f64 : PTX_ST<opstr, ".f64", RegF64, pat_store>; } //===----------------------------------------------------------------------===// @@ -466,9 +575,14 @@ multiclass PTX_ST_ALL<string opstr, PatFrag pat_store> { defm ADD : INT3<"add", add>; defm SUB : INT3<"sub", sub>; defm MUL : INT3<"mul.lo", mul>; // FIXME: Allow 32x32 -> 64 multiplies +defm DIV : INT3<"div", udiv>; +defm REM : INT3<"rem", urem>; ///===- Floating-Point Arithmetic Instructions ----------------------------===// +// Standard Unary Operations +defm FNEG : PTX_FLOAT_2OP<"neg", fneg>; + // Standard Binary Operations defm FADD : PTX_FLOAT_3OP<"add", fadd>; defm FSUB : PTX_FLOAT_3OP<"sub", fsub>; @@ -478,35 +592,35 @@ defm FMUL : PTX_FLOAT_3OP<"mul", fmul>; // For division, we need to have f32 and f64 differently. // For f32, we just always use .approx since it is supported on all hardware // for PTX 1.4+, which is our minimum target. -def FDIVrr32 : InstPTX<(outs RRegf32:$d), - (ins RRegf32:$a, RRegf32:$b), +def FDIVrr32 : InstPTX<(outs RegF32:$d), + (ins RegF32:$a, RegF32:$b), "div.approx.f32\t$d, $a, $b", - [(set RRegf32:$d, (fdiv RRegf32:$a, RRegf32:$b))]>; -def FDIVri32 : InstPTX<(outs RRegf32:$d), - (ins RRegf32:$a, f32imm:$b), + [(set RegF32:$d, (fdiv RegF32:$a, RegF32:$b))]>; +def FDIVri32 : InstPTX<(outs RegF32:$d), + (ins RegF32:$a, f32imm:$b), "div.approx.f32\t$d, $a, $b", - [(set RRegf32:$d, (fdiv RRegf32:$a, fpimm:$b))]>; + [(set RegF32:$d, (fdiv RegF32:$a, fpimm:$b))]>; // For f64, we must specify a rounding for sm 1.3+ but *not* for sm 1.0. -def FDIVrr64SM13 : InstPTX<(outs RRegf64:$d), - (ins RRegf64:$a, RRegf64:$b), +def FDIVrr64SM13 : InstPTX<(outs RegF64:$d), + (ins RegF64:$a, RegF64:$b), "div.rn.f64\t$d, $a, $b", - [(set RRegf64:$d, (fdiv RRegf64:$a, RRegf64:$b))]>, + [(set RegF64:$d, (fdiv RegF64:$a, RegF64:$b))]>, Requires<[SupportsSM13]>; -def FDIVri64SM13 : InstPTX<(outs RRegf64:$d), - (ins RRegf64:$a, f64imm:$b), +def FDIVri64SM13 : InstPTX<(outs RegF64:$d), + (ins RegF64:$a, f64imm:$b), "div.rn.f64\t$d, $a, $b", - [(set RRegf64:$d, (fdiv RRegf64:$a, fpimm:$b))]>, + [(set RegF64:$d, (fdiv RegF64:$a, fpimm:$b))]>, Requires<[SupportsSM13]>; -def FDIVrr64SM10 : InstPTX<(outs RRegf64:$d), - (ins RRegf64:$a, RRegf64:$b), +def FDIVrr64SM10 : InstPTX<(outs RegF64:$d), + (ins RegF64:$a, RegF64:$b), "div.f64\t$d, $a, $b", - [(set RRegf64:$d, (fdiv RRegf64:$a, RRegf64:$b))]>, + [(set RegF64:$d, (fdiv RegF64:$a, RegF64:$b))]>, Requires<[DoesNotSupportSM13]>; -def FDIVri64SM10 : InstPTX<(outs RRegf64:$d), - (ins RRegf64:$a, f64imm:$b), +def FDIVri64SM10 : InstPTX<(outs RegF64:$d), + (ins RegF64:$a, f64imm:$b), "div.f64\t$d, $a, $b", - [(set RRegf64:$d, (fdiv RRegf64:$a, fpimm:$b))]>, + [(set RegF64:$d, (fdiv RegF64:$a, fpimm:$b))]>, Requires<[DoesNotSupportSM13]>; @@ -519,56 +633,98 @@ def FDIVri64SM10 : InstPTX<(outs RRegf64:$d), // In the short term, mad is supported on all PTX versions and we use a // default rounding mode no matter what shader model or PTX version. // TODO: Allow the rounding mode to be selectable through llc. 
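
[On the mad selection discussed in the comment above: PTX_FLOAT_4OP with fmul and fadd matches a multiply feeding an add, i.e. d = a * b + c, with the rounding suffix fixed at .rn on sm_13+ for now. A standalone model of the fused operation (sketch; note that real mad.f32 on pre-sm_20 hardware may truncate the intermediate product, which this host-side computation does not capture):]

#include <cstdio>
// d = a * b + c, the DAG pattern PTX_FLOAT_4OP fuses into one mad (sketch).
static float mad_f32(float a, float b, float c) { return a * b + c; }
int main() {
  printf("%f\n", mad_f32(2.0f, 3.0f, 1.0f)); // 7.000000
  return 0;
}
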
-defm FMADSM13 : PTX_FLOAT_4OP<"mad.rn", fmul, fadd>, Requires<[SupportsSM13]>; -defm FMAD : PTX_FLOAT_4OP<"mad", fmul, fadd>, Requires<[DoesNotSupportSM13]>; +defm FMADSM13 : PTX_FLOAT_4OP<"mad.rn", fmul, fadd>, Requires<[SupportsSM13, SupportsFMA]>; +defm FMAD : PTX_FLOAT_4OP<"mad", fmul, fadd>, Requires<[DoesNotSupportSM13, SupportsFMA]>; ///===- Floating-Point Intrinsic Instructions -----------------------------===// -def FSQRT32 : InstPTX<(outs RRegf32:$d), - (ins RRegf32:$a), +def FSQRT32 : InstPTX<(outs RegF32:$d), + (ins RegF32:$a), "sqrt.rn.f32\t$d, $a", - [(set RRegf32:$d, (fsqrt RRegf32:$a))]>; + [(set RegF32:$d, (fsqrt RegF32:$a))]>; -def FSQRT64 : InstPTX<(outs RRegf64:$d), - (ins RRegf64:$a), +def FSQRT64 : InstPTX<(outs RegF64:$d), + (ins RegF64:$a), "sqrt.rn.f64\t$d, $a", - [(set RRegf64:$d, (fsqrt RRegf64:$a))]>; + [(set RegF64:$d, (fsqrt RegF64:$a))]>; -def FSIN32 : InstPTX<(outs RRegf32:$d), - (ins RRegf32:$a), +def FSIN32 : InstPTX<(outs RegF32:$d), + (ins RegF32:$a), "sin.approx.f32\t$d, $a", - [(set RRegf32:$d, (fsin RRegf32:$a))]>; + [(set RegF32:$d, (fsin RegF32:$a))]>; -def FSIN64 : InstPTX<(outs RRegf64:$d), - (ins RRegf64:$a), +def FSIN64 : InstPTX<(outs RegF64:$d), + (ins RegF64:$a), "sin.approx.f64\t$d, $a", - [(set RRegf64:$d, (fsin RRegf64:$a))]>; + [(set RegF64:$d, (fsin RegF64:$a))]>; -def FCOS32 : InstPTX<(outs RRegf32:$d), - (ins RRegf32:$a), +def FCOS32 : InstPTX<(outs RegF32:$d), + (ins RegF32:$a), "cos.approx.f32\t$d, $a", - [(set RRegf32:$d, (fcos RRegf32:$a))]>; + [(set RegF32:$d, (fcos RegF32:$a))]>; -def FCOS64 : InstPTX<(outs RRegf64:$d), - (ins RRegf64:$a), +def FCOS64 : InstPTX<(outs RegF64:$d), + (ins RegF64:$a), "cos.approx.f64\t$d, $a", - [(set RRegf64:$d, (fcos RRegf64:$a))]>; + [(set RegF64:$d, (fcos RegF64:$a))]>; ///===- Comparison and Selection Instructions -----------------------------===// -defm SETPEQu32 : PTX_SETP<RRegu32, "u32", i32imm, SETEQ, "eq">; -defm SETPNEu32 : PTX_SETP<RRegu32, "u32", i32imm, SETNE, "ne">; -defm SETPLTu32 : PTX_SETP<RRegu32, "u32", i32imm, SETULT, "lt">; -defm SETPLEu32 : PTX_SETP<RRegu32, "u32", i32imm, SETULE, "le">; -defm SETPGTu32 : PTX_SETP<RRegu32, "u32", i32imm, SETUGT, "gt">; -defm SETPGEu32 : PTX_SETP<RRegu32, "u32", i32imm, SETUGE, "ge">; -defm SETPEQu64 : PTX_SETP<RRegu64, "u64", i64imm, SETEQ, "eq">; -defm SETPNEu64 : PTX_SETP<RRegu64, "u64", i64imm, SETNE, "ne">; -defm SETPLTu64 : PTX_SETP<RRegu64, "u64", i64imm, SETULT, "lt">; -defm SETPLEu64 : PTX_SETP<RRegu64, "u64", i64imm, SETULE, "le">; -defm SETPGTu64 : PTX_SETP<RRegu64, "u64", i64imm, SETUGT, "gt">; -defm SETPGEu64 : PTX_SETP<RRegu64, "u64", i64imm, SETUGE, "ge">; +// .setp + +// Compare u16 + +defm SETPEQu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETEQ, "eq">; +defm SETPNEu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETNE, "ne">; +defm SETPLTu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETULT, "lt">; +defm SETPLEu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETULE, "le">; +defm SETPGTu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETUGT, "gt">; +defm SETPGEu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETUGE, "ge">; + +// Compare u32 + +defm SETPEQu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETEQ, "eq">; +defm SETPNEu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETNE, "ne">; +defm SETPLTu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETULT, "lt">; +defm SETPLEu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETULE, "le">; +defm SETPGTu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETUGT, "gt">; +defm SETPGEu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETUGE, "ge">; + +// Compare u64 + +defm SETPEQu64 : 
PTX_SETP_I<RegI64, "u64", i64imm, SETEQ, "eq">; +defm SETPNEu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETNE, "ne">; +defm SETPLTu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETULT, "lt">; +defm SETPLEu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETULE, "le">; +defm SETPGTu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETUGT, "gt">; +defm SETPGEu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETUGE, "ge">; + +// Compare f32 + +defm SETPEQf32 : PTX_SETP_FP<RegF32, "f32", SETUEQ, SETOEQ, "eq">; +defm SETPNEf32 : PTX_SETP_FP<RegF32, "f32", SETUNE, SETONE, "ne">; +defm SETPLTf32 : PTX_SETP_FP<RegF32, "f32", SETULT, SETOLT, "lt">; +defm SETPLEf32 : PTX_SETP_FP<RegF32, "f32", SETULE, SETOLE, "le">; +defm SETPGTf32 : PTX_SETP_FP<RegF32, "f32", SETUGT, SETOGT, "gt">; +defm SETPGEf32 : PTX_SETP_FP<RegF32, "f32", SETUGE, SETOGE, "ge">; + +// Compare f64 + +defm SETPEQf64 : PTX_SETP_FP<RegF64, "f64", SETUEQ, SETOEQ, "eq">; +defm SETPNEf64 : PTX_SETP_FP<RegF64, "f64", SETUNE, SETONE, "ne">; +defm SETPLTf64 : PTX_SETP_FP<RegF64, "f64", SETULT, SETOLT, "lt">; +defm SETPLEf64 : PTX_SETP_FP<RegF64, "f64", SETULE, SETOLE, "le">; +defm SETPGTf64 : PTX_SETP_FP<RegF64, "f64", SETUGT, SETOGT, "gt">; +defm SETPGEf64 : PTX_SETP_FP<RegF64, "f64", SETUGE, SETOGE, "ge">; + +// .selp + +defm PTX_SELPu16 : PTX_SELP<RegI16, "u16">; +defm PTX_SELPu32 : PTX_SELP<RegI32, "u32">; +defm PTX_SELPu64 : PTX_SELP<RegI64, "u64">; +defm PTX_SELPf32 : PTX_SELP<RegF32, "f32">; +defm PTX_SELPf64 : PTX_SELP<RegF64, "f64">; ///===- Logic and Shift Instructions --------------------------------------===// @@ -584,47 +740,47 @@ defm XOR : PTX_LOGIC<"xor", xor>; let neverHasSideEffects = 1 in { def MOVPREDrr - : InstPTX<(outs Preds:$d), (ins Preds:$a), "mov.pred\t$d, $a", []>; + : InstPTX<(outs RegPred:$d), (ins RegPred:$a), "mov.pred\t$d, $a", []>; def MOVU16rr - : InstPTX<(outs RRegu16:$d), (ins RRegu16:$a), "mov.u16\t$d, $a", []>; + : InstPTX<(outs RegI16:$d), (ins RegI16:$a), "mov.u16\t$d, $a", []>; def MOVU32rr - : InstPTX<(outs RRegu32:$d), (ins RRegu32:$a), "mov.u32\t$d, $a", []>; + : InstPTX<(outs RegI32:$d), (ins RegI32:$a), "mov.u32\t$d, $a", []>; def MOVU64rr - : InstPTX<(outs RRegu64:$d), (ins RRegu64:$a), "mov.u64\t$d, $a", []>; + : InstPTX<(outs RegI64:$d), (ins RegI64:$a), "mov.u64\t$d, $a", []>; def MOVF32rr - : InstPTX<(outs RRegf32:$d), (ins RRegf32:$a), "mov.f32\t$d, $a", []>; + : InstPTX<(outs RegF32:$d), (ins RegF32:$a), "mov.f32\t$d, $a", []>; def MOVF64rr - : InstPTX<(outs RRegf64:$d), (ins RRegf64:$a), "mov.f64\t$d, $a", []>; + : InstPTX<(outs RegF64:$d), (ins RegF64:$a), "mov.f64\t$d, $a", []>; } let isReMaterializable = 1, isAsCheapAsAMove = 1 in { def MOVPREDri - : InstPTX<(outs Preds:$d), (ins i1imm:$a), "mov.pred\t$d, $a", - [(set Preds:$d, imm:$a)]>; + : InstPTX<(outs RegPred:$d), (ins i1imm:$a), "mov.pred\t$d, $a", + [(set RegPred:$d, imm:$a)]>; def MOVU16ri - : InstPTX<(outs RRegu16:$d), (ins i16imm:$a), "mov.u16\t$d, $a", - [(set RRegu16:$d, imm:$a)]>; + : InstPTX<(outs RegI16:$d), (ins i16imm:$a), "mov.u16\t$d, $a", + [(set RegI16:$d, imm:$a)]>; def MOVU32ri - : InstPTX<(outs RRegu32:$d), (ins i32imm:$a), "mov.u32\t$d, $a", - [(set RRegu32:$d, imm:$a)]>; - def MOVU164ri - : InstPTX<(outs RRegu64:$d), (ins i64imm:$a), "mov.u64\t$d, $a", - [(set RRegu64:$d, imm:$a)]>; + : InstPTX<(outs RegI32:$d), (ins i32imm:$a), "mov.u32\t$d, $a", + [(set RegI32:$d, imm:$a)]>; + def MOVU64ri + : InstPTX<(outs RegI64:$d), (ins i64imm:$a), "mov.u64\t$d, $a", + [(set RegI64:$d, imm:$a)]>; def MOVF32ri - : InstPTX<(outs RRegf32:$d), (ins f32imm:$a), 
"mov.f32\t$d, $a", - [(set RRegf32:$d, fpimm:$a)]>; + : InstPTX<(outs RegF32:$d), (ins f32imm:$a), "mov.f32\t$d, $a", + [(set RegF32:$d, fpimm:$a)]>; def MOVF64ri - : InstPTX<(outs RRegf64:$d), (ins f64imm:$a), "mov.f64\t$d, $a", - [(set RRegf64:$d, fpimm:$a)]>; + : InstPTX<(outs RegF64:$d), (ins f64imm:$a), "mov.f64\t$d, $a", + [(set RegF64:$d, fpimm:$a)]>; } let isReMaterializable = 1, isAsCheapAsAMove = 1 in { def MOVaddr32 - : InstPTX<(outs RRegu32:$d), (ins i32imm:$a), "mov.u32\t$d, $a", - [(set RRegu32:$d, (PTXcopyaddress tglobaladdr:$a))]>; + : InstPTX<(outs RegI32:$d), (ins i32imm:$a), "mov.u32\t$d, $a", + [(set RegI32:$d, (PTXcopyaddress tglobaladdr:$a))]>; def MOVaddr64 - : InstPTX<(outs RRegu64:$d), (ins i64imm:$a), "mov.u64\t$d, $a", - [(set RRegu64:$d, (PTXcopyaddress tglobaladdr:$a))]>; + : InstPTX<(outs RegI64:$d), (ins i64imm:$a), "mov.u64\t$d, $a", + [(set RegI64:$d, (PTXcopyaddress tglobaladdr:$a))]>; } // Loads @@ -634,15 +790,15 @@ defm LDl : PTX_LD_ALL<"ld.local", load_local>; defm LDs : PTX_LD_ALL<"ld.shared", load_shared>; // This is a special instruction that is manually inserted for kernel parameters -def LDpiU16 : InstPTX<(outs RRegu16:$d), (ins MEMpi:$a), +def LDpiU16 : InstPTX<(outs RegI16:$d), (ins MEMpi:$a), "ld.param.u16\t$d, [$a]", []>; -def LDpiU32 : InstPTX<(outs RRegu32:$d), (ins MEMpi:$a), +def LDpiU32 : InstPTX<(outs RegI32:$d), (ins MEMpi:$a), "ld.param.u32\t$d, [$a]", []>; -def LDpiU64 : InstPTX<(outs RRegu64:$d), (ins MEMpi:$a), +def LDpiU64 : InstPTX<(outs RegI64:$d), (ins MEMpi:$a), "ld.param.u64\t$d, [$a]", []>; -def LDpiF32 : InstPTX<(outs RRegf32:$d), (ins MEMpi:$a), +def LDpiF32 : InstPTX<(outs RegF32:$d), (ins MEMpi:$a), "ld.param.f32\t$d, [$a]", []>; -def LDpiF64 : InstPTX<(outs RRegf64:$d), (ins MEMpi:$a), +def LDpiF64 : InstPTX<(outs RegF64:$d), (ins MEMpi:$a), "ld.param.f64\t$d, [$a]", []>; // Stores @@ -654,17 +810,137 @@ defm STs : PTX_ST_ALL<"st.shared", store_shared>; // defm LDp : PTX_LD_ALL<"ld.param", load_parameter>; // TODO: Do something with st.param if/when it is needed. 
+// Conversion to pred + +def CVT_pred_u16 + : InstPTX<(outs RegPred:$d), (ins RegI16:$a), "cvt.pred.u16\t$d, $a", + [(set RegPred:$d, (trunc RegI16:$a))]>; + def CVT_pred_u32 - : InstPTX<(outs Preds:$d), (ins RRegu32:$a), "cvt.pred.u32\t$d, $a", - [(set Preds:$d, (trunc RRegu32:$a))]>; + : InstPTX<(outs RegPred:$d), (ins RegI32:$a), "cvt.pred.u32\t$d, $a", + [(set RegPred:$d, (trunc RegI32:$a))]>; + +def CVT_pred_u64 + : InstPTX<(outs RegPred:$d), (ins RegI64:$a), "cvt.pred.u64\t$d, $a", + [(set RegPred:$d, (trunc RegI64:$a))]>; + +def CVT_pred_f32 + : InstPTX<(outs RegPred:$d), (ins RegF32:$a), "cvt.rni.pred.f32\t$d, $a", + [(set RegPred:$d, (fp_to_uint RegF32:$a))]>; + +def CVT_pred_f64 + : InstPTX<(outs RegPred:$d), (ins RegF64:$a), "cvt.rni.pred.f64\t$d, $a", + [(set RegPred:$d, (fp_to_uint RegF64:$a))]>; + +// Conversion to u16 + +def CVT_u16_pred + : InstPTX<(outs RegI16:$d), (ins RegPred:$a), "cvt.u16.pred\t$d, $a", + [(set RegI16:$d, (zext RegPred:$a))]>; + +def CVT_u16_u32 + : InstPTX<(outs RegI16:$d), (ins RegI32:$a), "cvt.u16.u32\t$d, $a", + [(set RegI16:$d, (trunc RegI32:$a))]>; + +def CVT_u16_u64 + : InstPTX<(outs RegI16:$d), (ins RegI64:$a), "cvt.u16.u64\t$d, $a", + [(set RegI16:$d, (trunc RegI64:$a))]>; + +def CVT_u16_f32 + : InstPTX<(outs RegI16:$d), (ins RegF32:$a), "cvt.rni.u16.f32\t$d, $a", + [(set RegI16:$d, (fp_to_uint RegF32:$a))]>; + +def CVT_u16_f64 + : InstPTX<(outs RegI16:$d), (ins RegF64:$a), "cvt.rni.u16.f64\t$d, $a", + [(set RegI16:$d, (fp_to_uint RegF64:$a))]>; + +// Conversion to u32 def CVT_u32_pred - : InstPTX<(outs RRegu32:$d), (ins Preds:$a), "cvt.u32.pred\t$d, $a", - [(set RRegu32:$d, (zext Preds:$a))]>; + : InstPTX<(outs RegI32:$d), (ins RegPred:$a), "cvt.u32.pred\t$d, $a", + [(set RegI32:$d, (zext RegPred:$a))]>; + +def CVT_u32_u16 + : InstPTX<(outs RegI32:$d), (ins RegI16:$a), "cvt.u32.u16\t$d, $a", + [(set RegI32:$d, (zext RegI16:$a))]>; + +def CVT_u32_u64 + : InstPTX<(outs RegI32:$d), (ins RegI64:$a), "cvt.u32.u64\t$d, $a", + [(set RegI32:$d, (trunc RegI64:$a))]>; + +def CVT_u32_f32 + : InstPTX<(outs RegI32:$d), (ins RegF32:$a), "cvt.rni.u32.f32\t$d, $a", + [(set RegI32:$d, (fp_to_uint RegF32:$a))]>; + +def CVT_u32_f64 + : InstPTX<(outs RegI32:$d), (ins RegF64:$a), "cvt.rni.u32.f64\t$d, $a", + [(set RegI32:$d, (fp_to_uint RegF64:$a))]>; + +// Conversion to u64 + +def CVT_u64_pred + : InstPTX<(outs RegI64:$d), (ins RegPred:$a), "cvt.u64.pred\t$d, $a", + [(set RegI64:$d, (zext RegPred:$a))]>; + +def CVT_u64_u16 + : InstPTX<(outs RegI64:$d), (ins RegI16:$a), "cvt.u64.u16\t$d, $a", + [(set RegI64:$d, (zext RegI16:$a))]>; def CVT_u64_u32 - : InstPTX<(outs RRegu64:$d), (ins RRegu32:$a), "cvt.u64.u32\t$d, $a", - [(set RRegu64:$d, (zext RRegu32:$a))]>; + : InstPTX<(outs RegI64:$d), (ins RegI32:$a), "cvt.u64.u32\t$d, $a", + [(set RegI64:$d, (zext RegI32:$a))]>; + +def CVT_u64_f32 + : InstPTX<(outs RegI64:$d), (ins RegF32:$a), "cvt.rni.u64.f32\t$d, $a", + [(set RegI64:$d, (fp_to_uint RegF32:$a))]>; + +def CVT_u64_f64 + : InstPTX<(outs RegI64:$d), (ins RegF64:$a), "cvt.rni.u64.f64\t$d, $a", + [(set RegI64:$d, (fp_to_uint RegF64:$a))]>; + +// Conversion to f32 + +def CVT_f32_pred + : InstPTX<(outs RegF32:$d), (ins RegPred:$a), "cvt.rn.f32.pred\t$d, $a", + [(set RegF32:$d, (uint_to_fp RegPred:$a))]>; + +def CVT_f32_u16 + : InstPTX<(outs RegF32:$d), (ins RegI16:$a), "cvt.rn.f32.u16\t$d, $a", + [(set RegF32:$d, (uint_to_fp RegI16:$a))]>; + +def CVT_f32_u32 + : InstPTX<(outs RegF32:$d), (ins RegI32:$a), "cvt.rn.f32.u32\t$d, $a", + [(set RegF32:$d, (uint_to_fp 
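
[One thing to double-check in the conversions above: LLVM's fp_to_uint is defined to truncate toward zero, while the .rni modifier used on these cvt forms requests round-to-nearest-integer; PTX also has a .rzi (toward-zero) modifier that would match the node's semantics more directly. A standalone illustration of the difference:]

#include <cmath>
#include <cstdint>
#include <cstdio>
int main() {
  float f = 2.7f;
  uint32_t TowardZero = (uint32_t)f;           // 2: fp_to_uint semantics
  uint32_t Nearest = (uint32_t)std::lrintf(f); // 3: what a .rni cvt computes
  printf("%u %u\n", TowardZero, Nearest);
  return 0;
}
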
RegI32:$a))]>; + +def CVT_f32_u64 + : InstPTX<(outs RegF32:$d), (ins RegI64:$a), "cvt.rn.f32.u64\t$d, $a", + [(set RegF32:$d, (uint_to_fp RegI64:$a))]>; + +def CVT_f32_f64 + : InstPTX<(outs RegF32:$d), (ins RegF64:$a), "cvt.rn.f32.f64\t$d, $a", + [(set RegF32:$d, (fround RegF64:$a))]>; + +// Conversion to f64 + +def CVT_f64_pred + : InstPTX<(outs RegF64:$d), (ins RegPred:$a), "cvt.rn.f64.pred\t$d, $a", + [(set RegF64:$d, (uint_to_fp RegPred:$a))]>; + +def CVT_f64_u16 + : InstPTX<(outs RegF64:$d), (ins RegI16:$a), "cvt.rn.f64.u16\t$d, $a", + [(set RegF64:$d, (uint_to_fp RegI16:$a))]>; + +def CVT_f64_u32 + : InstPTX<(outs RegF64:$d), (ins RegI32:$a), "cvt.rn.f64.u32\t$d, $a", + [(set RegF64:$d, (uint_to_fp RegI32:$a))]>; + +def CVT_f64_u64 + : InstPTX<(outs RegF64:$d), (ins RegI64:$a), "cvt.rn.f64.u64\t$d, $a", + [(set RegF64:$d, (uint_to_fp RegI64:$a))]>; + +def CVT_f64_f32 + : InstPTX<(outs RegF64:$d), (ins RegF32:$a), "cvt.f64.f32\t$d, $a", + [(set RegF64:$d, (fextend RegF32:$a))]>; ///===- Control Flow Instructions -----------------------------------------===// @@ -675,7 +951,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in { let isBranch = 1, isTerminator = 1 in { // FIXME: The pattern part is blank because I cannot (or do not yet know - // how to) use the first operand of PredicateOperand (a Preds register) here + // how to) use the first operand of PredicateOperand (a RegPred register) here def BRAdp : InstPTX<(outs), (ins brtarget:$d), "bra\t$d", [/*(brcond pred:$_p, bb:$d)*/]>; diff --git a/lib/Target/PTX/PTXIntrinsicInstrInfo.td b/lib/Target/PTX/PTXIntrinsicInstrInfo.td index 320934a..8d97909 100644 --- a/lib/Target/PTX/PTXIntrinsicInstrInfo.td +++ b/lib/Target/PTX/PTXIntrinsicInstrInfo.td @@ -14,14 +14,14 @@ // PTX Special Purpose Register Accessor Intrinsics class PTX_READ_SPECIAL_REGISTER_R64<string regname, Intrinsic intop> - : InstPTX<(outs RRegu64:$d), (ins), + : InstPTX<(outs RegI64:$d), (ins), !strconcat("mov.u64\t$d, %", regname), - [(set RRegu64:$d, (intop))]>; + [(set RegI64:$d, (intop))]>; class PTX_READ_SPECIAL_REGISTER_R32<string regname, Intrinsic intop> - : InstPTX<(outs RRegu32:$d), (ins), + : InstPTX<(outs RegI32:$d), (ins), !strconcat("mov.u32\t$d, %", regname), - [(set RRegu32:$d, (intop))]>; + [(set RegI32:$d, (intop))]>; // TODO Add read vector-version of special registers diff --git a/lib/Target/PTX/PTXMCAsmStreamer.cpp b/lib/Target/PTX/PTXMCAsmStreamer.cpp index cf743e4..1574670 100644 --- a/lib/Target/PTX/PTXMCAsmStreamer.cpp +++ b/lib/Target/PTX/PTXMCAsmStreamer.cpp @@ -143,9 +143,9 @@ public: virtual void EmitBytes(StringRef Data, unsigned AddrSpace); virtual void EmitValueImpl(const MCExpr *Value, unsigned Size, - bool isPCRel, unsigned AddrSpace); - virtual void EmitULEB128Value(const MCExpr *Value, unsigned AddrSpace = 0); - virtual void EmitSLEB128Value(const MCExpr *Value, unsigned AddrSpace = 0); + unsigned AddrSpace); + virtual void EmitULEB128Value(const MCExpr *Value); + virtual void EmitSLEB128Value(const MCExpr *Value); virtual void EmitGPRel32Value(const MCExpr *Value); @@ -352,9 +352,8 @@ void PTXMCAsmStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) { } void PTXMCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, - bool isPCRel, unsigned AddrSpace) { + unsigned AddrSpace) { assert(getCurrentSection() && "Cannot emit contents before setting section!"); - assert(!isPCRel && "Cannot emit pc relative relocations!"); const char *Directive = 0; switch (Size) { default: break; @@ -383,15 +382,13 @@ void 
PTXMCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, EmitEOL(); } -void PTXMCAsmStreamer::EmitULEB128Value(const MCExpr *Value, - unsigned AddrSpace) { +void PTXMCAsmStreamer::EmitULEB128Value(const MCExpr *Value) { assert(MAI.hasLEB128() && "Cannot print a .uleb"); OS << ".uleb128 " << *Value; EmitEOL(); } -void PTXMCAsmStreamer::EmitSLEB128Value(const MCExpr *Value, - unsigned AddrSpace) { +void PTXMCAsmStreamer::EmitSLEB128Value(const MCExpr *Value) { assert(MAI.hasLEB128() && "Cannot print a .sleb"); OS << ".sleb128 " << *Value; EmitEOL(); @@ -533,7 +530,7 @@ void PTXMCAsmStreamer::Finish() {} namespace llvm { MCStreamer *createPTXAsmStreamer(MCContext &Context, formatted_raw_ostream &OS, - bool isVerboseAsm, bool useLoc, + bool isVerboseAsm, bool useLoc, bool useCFI, MCInstPrinter *IP, MCCodeEmitter *CE, TargetAsmBackend *TAB, bool ShowInst) { diff --git a/lib/Target/PTX/PTXMFInfoExtract.cpp b/lib/Target/PTX/PTXMFInfoExtract.cpp index c5e1910..6fe9e6c 100644 --- a/lib/Target/PTX/PTXMFInfoExtract.cpp +++ b/lib/Target/PTX/PTXMFInfoExtract.cpp @@ -54,8 +54,6 @@ bool PTXMFInfoExtract::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "******** PTX FUNCTION LOCAL VAR REG DEF ********\n"); - unsigned retreg = MFI->retReg(); - DEBUG(dbgs() << "PTX::NoRegister == " << PTX::NoRegister << "\n" << "PTX::NUM_TARGET_REGS == " << PTX::NUM_TARGET_REGS << "\n"); @@ -68,15 +66,13 @@ bool PTXMFInfoExtract::runOnMachineFunction(MachineFunction &MF) { // FIXME: This is a slow linear scanning for (unsigned reg = PTX::NoRegister + 1; reg < PTX::NUM_TARGET_REGS; ++reg) if (MRI.isPhysRegUsed(reg) && - reg != retreg && + !MFI->isRetReg(reg) && (MFI->isKernel() || !MFI->isArgReg(reg))) MFI->addLocalVarReg(reg); // Notify MachineFunctionInfo that I've done adding local var reg MFI->doneAddLocalVar(); - DEBUG(dbgs() << "Return Reg: " << retreg << "\n"); - DEBUG(for (PTXMachineFunctionInfo::reg_iterator i = MFI->argRegBegin(), e = MFI->argRegEnd(); i != e; ++i) diff --git a/lib/Target/PTX/PTXMachineFunctionInfo.h b/lib/Target/PTX/PTXMachineFunctionInfo.h index b5b3c3b..1da4b5d 100644 --- a/lib/Target/PTX/PTXMachineFunctionInfo.h +++ b/lib/Target/PTX/PTXMachineFunctionInfo.h @@ -15,6 +15,7 @@ #define PTX_MACHINE_FUNCTION_INFO_H #include "PTX.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/CodeGen/MachineFunction.h" namespace llvm { @@ -25,7 +26,7 @@ class PTXMachineFunctionInfo : public MachineFunctionInfo { private: bool is_kernel; std::vector<unsigned> reg_arg, reg_local_var; - unsigned reg_ret; + DenseSet<unsigned> reg_ret; bool _isDoneAddArg; public: @@ -39,22 +40,18 @@ public: void addArgReg(unsigned reg) { reg_arg.push_back(reg); } void addLocalVarReg(unsigned reg) { reg_local_var.push_back(reg); } - void setRetReg(unsigned reg) { reg_ret = reg; } + void addRetReg(unsigned reg) { reg_ret.insert(reg); } void doneAddArg(void) { - std::sort(reg_arg.begin(), reg_arg.end()); _isDoneAddArg = true; } - void doneAddLocalVar(void) { - std::sort(reg_local_var.begin(), reg_local_var.end()); - } - - bool isDoneAddArg(void) { return _isDoneAddArg; } + void doneAddLocalVar(void) {} bool isKernel() const { return is_kernel; } typedef std::vector<unsigned>::const_iterator reg_iterator; typedef std::vector<unsigned>::const_reverse_iterator reg_reverse_iterator; + typedef DenseSet<unsigned>::const_iterator ret_iterator; bool argRegEmpty() const { return reg_arg.empty(); } int getNumArg() const { return reg_arg.size(); } @@ -67,14 +64,22 @@ public: reg_iterator localVarRegBegin() const { return 
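
[The PTXMachineFunctionInfo change above replaces the single reg_ret field with a set so that more than one return register can be tracked (matching the commented-out single-return assert in LowerReturn); isRetReg then becomes a membership test. A standalone model — std::set stands in for llvm::DenseSet in this sketch:]

#include <cstdio>
#include <set>
int main() {
  std::set<unsigned> RetRegs; // was: unsigned reg_ret;
  RetRegs.insert(3);          // addRetReg(...) for each return register
  RetRegs.insert(7);
  bool IsRet = RetRegs.count(3) != 0; // isRetReg(3)
  printf("%d %zu\n", IsRet, RetRegs.size());
  return 0;
}
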
reg_local_var.begin(); } reg_iterator localVarRegEnd() const { return reg_local_var.end(); } - unsigned retReg() const { return reg_ret; } + bool retRegEmpty() const { return reg_ret.empty(); } + int getNumRet() const { return reg_ret.size(); } + ret_iterator retRegBegin() const { return reg_ret.begin(); } + ret_iterator retRegEnd() const { return reg_ret.end(); } bool isArgReg(unsigned reg) const { - return std::binary_search(reg_arg.begin(), reg_arg.end(), reg); + return std::find(reg_arg.begin(), reg_arg.end(), reg) != reg_arg.end(); + } + + bool isRetReg(unsigned reg) const { + return std::find(reg_ret.begin(), reg_ret.end(), reg) != reg_ret.end(); } bool isLocalVarReg(unsigned reg) const { - return std::binary_search(reg_local_var.begin(), reg_local_var.end(), reg); + return std::find(reg_local_var.begin(), reg_local_var.end(), reg) + != reg_local_var.end(); } }; // class PTXMachineFunctionInfo } // namespace llvm diff --git a/lib/Target/PTX/PTXRegisterInfo.h b/lib/Target/PTX/PTXRegisterInfo.h index 67e130f..dc56352 100644 --- a/lib/Target/PTX/PTXRegisterInfo.h +++ b/lib/Target/PTX/PTXRegisterInfo.h @@ -57,6 +57,9 @@ struct PTXRegisterInfo : public PTXGenRegisterInfo { virtual int getDwarfRegNum(unsigned RegNum, bool isEH) const { return PTXGenRegisterInfo::getDwarfRegNumFull(RegNum, 0); } + virtual int getLLVMRegNum(unsigned RegNum, bool isEH) const { + return PTXGenRegisterInfo::getLLVMRegNumFull(RegNum, 0); + } }; // struct PTXRegisterInfo } // namespace llvm diff --git a/lib/Target/PTX/PTXRegisterInfo.td b/lib/Target/PTX/PTXRegisterInfo.td index 548e3bb..08a39a8 100644 --- a/lib/Target/PTX/PTXRegisterInfo.td +++ b/lib/Target/PTX/PTXRegisterInfo.td @@ -29,30 +29,6 @@ def P4 : PTXReg<"p4">; def P5 : PTXReg<"p5">; def P6 : PTXReg<"p6">; def P7 : PTXReg<"p7">; -def P8 : PTXReg<"p8">; -def P9 : PTXReg<"p9">; -def P10 : PTXReg<"p10">; -def P11 : PTXReg<"p11">; -def P12 : PTXReg<"p12">; -def P13 : PTXReg<"p13">; -def P14 : PTXReg<"p14">; -def P15 : PTXReg<"p15">; -def P16 : PTXReg<"p16">; -def P17 : PTXReg<"p17">; -def P18 : PTXReg<"p18">; -def P19 : PTXReg<"p19">; -def P20 : PTXReg<"p20">; -def P21 : PTXReg<"p21">; -def P22 : PTXReg<"p22">; -def P23 : PTXReg<"p23">; -def P24 : PTXReg<"p24">; -def P25 : PTXReg<"p25">; -def P26 : PTXReg<"p26">; -def P27 : PTXReg<"p27">; -def P28 : PTXReg<"p28">; -def P29 : PTXReg<"p29">; -def P30 : PTXReg<"p30">; -def P31 : PTXReg<"p31">; ///===- 16-bit Integer Registers ------------------------------------------===// @@ -64,30 +40,6 @@ def RH4 : PTXReg<"rh4">; def RH5 : PTXReg<"rh5">; def RH6 : PTXReg<"rh6">; def RH7 : PTXReg<"rh7">; -def RH8 : PTXReg<"rh8">; -def RH9 : PTXReg<"rh9">; -def RH10 : PTXReg<"rh10">; -def RH11 : PTXReg<"rh11">; -def RH12 : PTXReg<"rh12">; -def RH13 : PTXReg<"rh13">; -def RH14 : PTXReg<"rh14">; -def RH15 : PTXReg<"rh15">; -def RH16 : PTXReg<"rh16">; -def RH17 : PTXReg<"rh17">; -def RH18 : PTXReg<"rh18">; -def RH19 : PTXReg<"rh19">; -def RH20 : PTXReg<"rh20">; -def RH21 : PTXReg<"rh21">; -def RH22 : PTXReg<"rh22">; -def RH23 : PTXReg<"rh23">; -def RH24 : PTXReg<"rh24">; -def RH25 : PTXReg<"rh25">; -def RH26 : PTXReg<"rh26">; -def RH27 : PTXReg<"rh27">; -def RH28 : PTXReg<"rh28">; -def RH29 : PTXReg<"rh29">; -def RH30 : PTXReg<"rh30">; -def RH31 : PTXReg<"rh31">; ///===- 32-bit Integer Registers ------------------------------------------===// @@ -99,30 +51,6 @@ def R4 : PTXReg<"r4">; def R5 : PTXReg<"r5">; def R6 : PTXReg<"r6">; def R7 : PTXReg<"r7">; -def R8 : PTXReg<"r8">; -def R9 : PTXReg<"r9">; -def R10 : PTXReg<"r10">; -def 
R11 : PTXReg<"r11">; -def R12 : PTXReg<"r12">; -def R13 : PTXReg<"r13">; -def R14 : PTXReg<"r14">; -def R15 : PTXReg<"r15">; -def R16 : PTXReg<"r16">; -def R17 : PTXReg<"r17">; -def R18 : PTXReg<"r18">; -def R19 : PTXReg<"r19">; -def R20 : PTXReg<"r20">; -def R21 : PTXReg<"r21">; -def R22 : PTXReg<"r22">; -def R23 : PTXReg<"r23">; -def R24 : PTXReg<"r24">; -def R25 : PTXReg<"r25">; -def R26 : PTXReg<"r26">; -def R27 : PTXReg<"r27">; -def R28 : PTXReg<"r28">; -def R29 : PTXReg<"r29">; -def R30 : PTXReg<"r30">; -def R31 : PTXReg<"r31">; ///===- 64-bit Integer Registers ------------------------------------------===// @@ -134,138 +62,14 @@ def RD4 : PTXReg<"rd4">; def RD5 : PTXReg<"rd5">; def RD6 : PTXReg<"rd6">; def RD7 : PTXReg<"rd7">; -def RD8 : PTXReg<"rd8">; -def RD9 : PTXReg<"rd9">; -def RD10 : PTXReg<"rd10">; -def RD11 : PTXReg<"rd11">; -def RD12 : PTXReg<"rd12">; -def RD13 : PTXReg<"rd13">; -def RD14 : PTXReg<"rd14">; -def RD15 : PTXReg<"rd15">; -def RD16 : PTXReg<"rd16">; -def RD17 : PTXReg<"rd17">; -def RD18 : PTXReg<"rd18">; -def RD19 : PTXReg<"rd19">; -def RD20 : PTXReg<"rd20">; -def RD21 : PTXReg<"rd21">; -def RD22 : PTXReg<"rd22">; -def RD23 : PTXReg<"rd23">; -def RD24 : PTXReg<"rd24">; -def RD25 : PTXReg<"rd25">; -def RD26 : PTXReg<"rd26">; -def RD27 : PTXReg<"rd27">; -def RD28 : PTXReg<"rd28">; -def RD29 : PTXReg<"rd29">; -def RD30 : PTXReg<"rd30">; -def RD31 : PTXReg<"rd31">; - -///===- 32-bit Floating-Point Registers -----------------------------------===// - -def F0 : PTXReg<"f0">; -def F1 : PTXReg<"f1">; -def F2 : PTXReg<"f2">; -def F3 : PTXReg<"f3">; -def F4 : PTXReg<"f4">; -def F5 : PTXReg<"f5">; -def F6 : PTXReg<"f6">; -def F7 : PTXReg<"f7">; -def F8 : PTXReg<"f8">; -def F9 : PTXReg<"f9">; -def F10 : PTXReg<"f10">; -def F11 : PTXReg<"f11">; -def F12 : PTXReg<"f12">; -def F13 : PTXReg<"f13">; -def F14 : PTXReg<"f14">; -def F15 : PTXReg<"f15">; -def F16 : PTXReg<"f16">; -def F17 : PTXReg<"f17">; -def F18 : PTXReg<"f18">; -def F19 : PTXReg<"f19">; -def F20 : PTXReg<"f20">; -def F21 : PTXReg<"f21">; -def F22 : PTXReg<"f22">; -def F23 : PTXReg<"f23">; -def F24 : PTXReg<"f24">; -def F25 : PTXReg<"f25">; -def F26 : PTXReg<"f26">; -def F27 : PTXReg<"f27">; -def F28 : PTXReg<"f28">; -def F29 : PTXReg<"f29">; -def F30 : PTXReg<"f30">; -def F31 : PTXReg<"f31">; - -///===- 64-bit Floating-Point Registers -----------------------------------===// - -def FD0 : PTXReg<"fd0">; -def FD1 : PTXReg<"fd1">; -def FD2 : PTXReg<"fd2">; -def FD3 : PTXReg<"fd3">; -def FD4 : PTXReg<"fd4">; -def FD5 : PTXReg<"fd5">; -def FD6 : PTXReg<"fd6">; -def FD7 : PTXReg<"fd7">; -def FD8 : PTXReg<"fd8">; -def FD9 : PTXReg<"fd9">; -def FD10 : PTXReg<"fd10">; -def FD11 : PTXReg<"fd11">; -def FD12 : PTXReg<"fd12">; -def FD13 : PTXReg<"fd13">; -def FD14 : PTXReg<"fd14">; -def FD15 : PTXReg<"fd15">; -def FD16 : PTXReg<"fd16">; -def FD17 : PTXReg<"fd17">; -def FD18 : PTXReg<"fd18">; -def FD19 : PTXReg<"fd19">; -def FD20 : PTXReg<"fd20">; -def FD21 : PTXReg<"fd21">; -def FD22 : PTXReg<"fd22">; -def FD23 : PTXReg<"fd23">; -def FD24 : PTXReg<"fd24">; -def FD25 : PTXReg<"fd25">; -def FD26 : PTXReg<"fd26">; -def FD27 : PTXReg<"fd27">; -def FD28 : PTXReg<"fd28">; -def FD29 : PTXReg<"fd29">; -def FD30 : PTXReg<"fd30">; -def FD31 : PTXReg<"fd31">; - //===----------------------------------------------------------------------===// // Register classes //===----------------------------------------------------------------------===// -def Preds : RegisterClass<"PTX", [i1], 8, - [P0, P1, P2, P3, P4, P5, P6, P7, - P8, P9, P10, P11, 
P12, P13, P14, P15, - P16, P17, P18, P19, P20, P21, P22, P23, - P24, P25, P26, P27, P28, P29, P30, P31]>; - -def RRegu16 : RegisterClass<"PTX", [i16], 16, - [RH0, RH1, RH2, RH3, RH4, RH5, RH6, RH7, - RH8, RH9, RH10, RH11, RH12, RH13, RH14, RH15, - RH16, RH17, RH18, RH19, RH20, RH21, RH22, RH23, - RH24, RH25, RH26, RH27, RH28, RH29, RH30, RH31]>; - -def RRegu32 : RegisterClass<"PTX", [i32], 32, - [R0, R1, R2, R3, R4, R5, R6, R7, - R8, R9, R10, R11, R12, R13, R14, R15, - R16, R17, R18, R19, R20, R21, R22, R23, - R24, R25, R26, R27, R28, R29, R30, R31]>; - -def RRegu64 : RegisterClass<"PTX", [i64], 64, - [RD0, RD1, RD2, RD3, RD4, RD5, RD6, RD7, - RD8, RD9, RD10, RD11, RD12, RD13, RD14, RD15, - RD16, RD17, RD18, RD19, RD20, RD21, RD22, RD23, - RD24, RD25, RD26, RD27, RD28, RD29, RD30, RD31]>; - -def RRegf32 : RegisterClass<"PTX", [f32], 32, - [F0, F1, F2, F3, F4, F5, F6, F7, - F8, F9, F10, F11, F12, F13, F14, F15, - F16, F17, F18, F19, F20, F21, F22, F23, - F24, F25, F26, F27, F28, F29, F30, F31]>; - -def RRegf64 : RegisterClass<"PTX", [f64], 64, - [FD0, FD1, FD2, FD3, FD4, FD5, FD6, FD7, - FD8, FD9, FD10, FD11, FD12, FD13, FD14, FD15, - FD16, FD17, FD18, FD19, FD20, FD21, FD22, FD23, - FD24, FD25, FD26, FD27, FD28, FD29, FD30, FD31]>; +def RegPred : RegisterClass<"PTX", [i1], 8, (sequence "P%u", 0, 7)>; +def RegI16 : RegisterClass<"PTX", [i16], 16, (sequence "RH%u", 0, 7)>; +def RegI32 : RegisterClass<"PTX", [i32], 32, (sequence "R%u", 0, 7)>; +def RegI64 : RegisterClass<"PTX", [i64], 64, (sequence "RD%u", 0, 7)>; +def RegF32 : RegisterClass<"PTX", [f32], 32, (sequence "R%u", 0, 7)>; +def RegF64 : RegisterClass<"PTX", [f64], 64, (sequence "RD%u", 0, 7)>; diff --git a/lib/Target/PTX/PTXSubtarget.cpp b/lib/Target/PTX/PTXSubtarget.cpp index 527622d..e8a1dfe 100644 --- a/lib/Target/PTX/PTXSubtarget.cpp +++ b/lib/Target/PTX/PTXSubtarget.cpp @@ -16,11 +16,13 @@ using namespace llvm; -PTXSubtarget::PTXSubtarget(const std::string &TT, const std::string &FS) +PTXSubtarget::PTXSubtarget(const std::string &TT, const std::string &FS, + bool is64Bit) : PTXShaderModel(PTX_SM_1_0), PTXVersion(PTX_VERSION_2_0), SupportsDouble(false), - Use64BitAddresses(false) { + SupportsFMA(true), + Is64Bit(is64Bit) { std::string TARGET = "generic"; ParseSubtargetFeatures(FS, TARGET); } @@ -40,6 +42,7 @@ std::string PTXSubtarget::getPTXVersionString() const { case PTX_VERSION_2_0: return "2.0"; case PTX_VERSION_2_1: return "2.1"; case PTX_VERSION_2_2: return "2.2"; + case PTX_VERSION_2_3: return "2.3"; } } diff --git a/lib/Target/PTX/PTXSubtarget.h b/lib/Target/PTX/PTXSubtarget.h index 57cd43d..c8f8c3b 100644 --- a/lib/Target/PTX/PTXSubtarget.h +++ b/lib/Target/PTX/PTXSubtarget.h @@ -37,7 +37,8 @@ namespace llvm { enum PTXVersionEnum { PTX_VERSION_2_0, /*< PTX Version 2.0 */ PTX_VERSION_2_1, /*< PTX Version 2.1 */ - PTX_VERSION_2_2 /*< PTX Version 2.2 */ + PTX_VERSION_2_2, /*< PTX Version 2.2 */ + PTX_VERSION_2_3 /*< PTX Version 2.3 */ }; /// Shader Model supported on the target GPU. @@ -49,11 +50,15 @@ namespace llvm { // The native .f64 type is supported on the hardware. bool SupportsDouble; + // Support the fused multiply-add (FMA) and multiply-add (MAD) + // instructions + bool SupportsFMA; + // Use .u64 instead of .u32 for addresses. 
- bool Use64BitAddresses; + bool Is64Bit; public: - PTXSubtarget(const std::string &TT, const std::string &FS); + PTXSubtarget(const std::string &TT, const std::string &FS, bool is64Bit); std::string getTargetString() const; @@ -61,7 +66,9 @@ namespace llvm { bool supportsDouble() const { return SupportsDouble; } - bool use64BitAddresses() const { return Use64BitAddresses; } + bool is64Bit() const { return Is64Bit; } + + bool supportsFMA() const { return SupportsFMA; } bool supportsSM13() const { return PTXShaderModel >= PTX_SM_1_3; } @@ -71,6 +78,8 @@ namespace llvm { bool supportsPTX22() const { return PTXVersion >= PTX_VERSION_2_2; } + bool supportsPTX23() const { return PTXVersion >= PTX_VERSION_2_3; } + std::string ParseSubtargetFeatures(const std::string &FS, const std::string &CPU); }; // class PTXSubtarget diff --git a/lib/Target/PTX/PTXTargetMachine.cpp b/lib/Target/PTX/PTXTargetMachine.cpp index 4701a94..1b737c9 100644 --- a/lib/Target/PTX/PTXTargetMachine.cpp +++ b/lib/Target/PTX/PTXTargetMachine.cpp @@ -23,6 +23,7 @@ using namespace llvm; namespace llvm { MCStreamer *createPTXAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, bool isVerboseAsm, bool useLoc, + bool useCFI, MCInstPrinter *InstPrint, MCCodeEmitter *CE, TargetAsmBackend *TAB, @@ -30,9 +31,15 @@ namespace llvm { } extern "C" void LLVMInitializePTXTarget() { - RegisterTargetMachine<PTXTargetMachine> X(ThePTXTarget); - RegisterAsmInfo<PTXMCAsmInfo> Y(ThePTXTarget); - TargetRegistry::RegisterAsmStreamer(ThePTXTarget, createPTXAsmStreamer); + + RegisterTargetMachine<PTX32TargetMachine> X(ThePTX32Target); + RegisterTargetMachine<PTX64TargetMachine> Y(ThePTX64Target); + + RegisterAsmInfo<PTXMCAsmInfo> Z(ThePTX32Target); + RegisterAsmInfo<PTXMCAsmInfo> W(ThePTX64Target); + + TargetRegistry::RegisterAsmStreamer(ThePTX32Target, createPTXAsmStreamer); + TargetRegistry::RegisterAsmStreamer(ThePTX64Target, createPTXAsmStreamer); } namespace { @@ -45,18 +52,28 @@ namespace { // DataLayout and FrameLowering are filled with dummy data PTXTargetMachine::PTXTargetMachine(const Target &T, const std::string &TT, - const std::string &FS) + const std::string &FS, + bool is64Bit) : LLVMTargetMachine(T, TT), - // FIXME: This feels like a dirty hack, but Subtarget does not appear to be - // initialized at this point, and we need to finish initialization of - // DataLayout. - DataLayout((FS.find("64bit") != FS.npos) ? DataLayout64 : DataLayout32), - Subtarget(TT, FS), + DataLayout(is64Bit ? 
DataLayout64 : DataLayout32), + Subtarget(TT, FS, is64Bit), FrameLowering(Subtarget), InstrInfo(*this), TLInfo(*this) { } +PTX32TargetMachine::PTX32TargetMachine(const Target &T, + const std::string& TT, + const std::string& FS) + : PTXTargetMachine(T, TT, FS, false) { +} + +PTX64TargetMachine::PTX64TargetMachine(const Target &T, + const std::string& TT, + const std::string& FS) + : PTXTargetMachine(T, TT, FS, true) { +} + bool PTXTargetMachine::addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel) { PM.add(createPTXISelDag(*this, OptLevel)); diff --git a/lib/Target/PTX/PTXTargetMachine.h b/lib/Target/PTX/PTXTargetMachine.h index a5dba53..149be8e 100644 --- a/lib/Target/PTX/PTXTargetMachine.h +++ b/lib/Target/PTX/PTXTargetMachine.h @@ -33,7 +33,7 @@ class PTXTargetMachine : public LLVMTargetMachine { public: PTXTargetMachine(const Target &T, const std::string &TT, - const std::string &FS); + const std::string &FS, bool is64Bit); virtual const TargetData *getTargetData() const { return &DataLayout; } @@ -55,6 +55,22 @@ class PTXTargetMachine : public LLVMTargetMachine { virtual bool addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); }; // class PTXTargetMachine + + +class PTX32TargetMachine : public PTXTargetMachine { +public: + + PTX32TargetMachine(const Target &T, const std::string &TT, + const std::string& FS); +}; // class PTX32TargetMachine + +class PTX64TargetMachine : public PTXTargetMachine { +public: + + PTX64TargetMachine(const Target &T, const std::string &TT, + const std::string& FS); +}; // class PTX64TargetMachine + } // namespace llvm #endif // PTX_TARGET_MACHINE_H diff --git a/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp b/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp index a577d77..9df6c75 100644 --- a/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp +++ b/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp @@ -13,9 +13,13 @@ using namespace llvm; -Target llvm::ThePTXTarget; +Target llvm::ThePTX32Target; +Target llvm::ThePTX64Target; extern "C" void LLVMInitializePTXTargetInfo() { // see llvm/ADT/Triple.h - RegisterTarget<Triple::ptx> X(ThePTXTarget, "ptx", "PTX"); + RegisterTarget<Triple::ptx32> X32(ThePTX32Target, "ptx32", + "PTX (32-bit) [Experimental]"); + RegisterTarget<Triple::ptx64> X64(ThePTX64Target, "ptx64", + "PTX (64-bit) [Experimental]"); } diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp index c8db0c4..1a9bd76 100644 --- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp +++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp @@ -26,6 +26,9 @@ StringRef PPCInstPrinter::getOpcodeName(unsigned Opcode) const { return getInstructionName(Opcode); } +void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + OS << getRegisterName(RegNo); +} void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O) { // Check for slwi/srwi mnemonics. 
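
Note on the PTX hunks above: the single "ptx" target is split into "ptx32" and "ptx64" variants that share one PTXTargetMachine implementation and differ only in an is64Bit flag, which selects the data layout and the subtarget's address size; the register-file trim leans on TableGen's sequence operator, where (sequence "P%u", 0, 7) expands to P0 through P7. Below is a minimal, self-contained C++ sketch of the two-targets-one-implementation pattern; it is not LLVM code, and the layout strings are placeholders rather than the backend's real ones.

  // Sketch: two thin subclasses forward a boolean into a shared base that
  // fixes pointer width once, mirroring PTX32/PTX64TargetMachine above.
  #include <iostream>
  #include <string>
  #include <utility>

  class ToyTargetMachine {
  public:
    ToyTargetMachine(std::string TT, bool Is64Bit)
        : Triple(std::move(TT)),
          // Placeholder layout strings, chosen only to show the 32/64 split.
          DataLayout(Is64Bit ? "e-p:64:64:64" : "e-p:32:32:32") {}
    std::string Triple;
    std::string DataLayout;
  };

  class ToyPTX32TM : public ToyTargetMachine {
  public:
    explicit ToyPTX32TM(std::string TT) : ToyTargetMachine(std::move(TT), false) {}
  };

  class ToyPTX64TM : public ToyTargetMachine {
  public:
    explicit ToyPTX64TM(std::string TT) : ToyTargetMachine(std::move(TT), true) {}
  };

  int main() {
    std::cout << ToyPTX32TM("ptx32--").DataLayout << "\n";  // 32-bit pointers
    std::cout << ToyPTX64TM("ptx64--").DataLayout << "\n";  // 64-bit pointers
    return 0;
  }

With both flavours registered under their own names, tools that accept a target name should be able to pick either one (e.g. llc -march=ptx32 versus -march=ptx64), though nothing in this diff shows which front ends emit which triple.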
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h index 9cf9db9..adfa0aa 100644 --- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h +++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h @@ -33,6 +33,7 @@ public: return SyntaxVariant == 1; } + virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; virtual void printInst(const MCInst *MI, raw_ostream &O); virtual StringRef getOpcodeName(unsigned Opcode) const; diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h index 7242f3a..92672b5 100644 --- a/lib/Target/PowerPC/PPC.h +++ b/lib/Target/PowerPC/PPC.h @@ -43,7 +43,7 @@ namespace llvm { TargetAsmBackend *createPPCAsmBackend(const Target &, const std::string &); void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, - AsmPrinter &AP); + AsmPrinter &AP, bool isDarwin); extern Target ThePPC32Target; extern Target ThePPC64Target; diff --git a/lib/Target/PowerPC/PPCAsmBackend.cpp b/lib/Target/PowerPC/PPCAsmBackend.cpp index c4d4ac9..f562a3f 100644 --- a/lib/Target/PowerPC/PPCAsmBackend.cpp +++ b/lib/Target/PowerPC/PPCAsmBackend.cpp @@ -110,10 +110,8 @@ namespace { TargetAsmBackend *llvm::createPPCAsmBackend(const Target &T, const std::string &TT) { - switch (Triple(TT).getOS()) { - case Triple::Darwin: + if (Triple(TT).isOSDarwin()) return new DarwinPPCAsmBackend(T); - default: - return 0; - } + + return 0; } diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index 09a9be9..b795db9 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -344,7 +344,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { } case PPC::LDtoc: { // Transform %X3 = LDtoc <ga:@min1>, %X2 - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this); + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin()); // Change the opcode to LD, and the global address operand to be a // reference to the TOC entry we will synthesize later. @@ -376,7 +376,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this); + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin()); OutStreamer.EmitInstruction(TmpInst); } diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index 6aca6b0..375e000 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -259,8 +259,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { MachineModuleInfo &MMI = MF.getMMI(); DebugLoc dl; bool needsFrameMoves = MMI.hasDebugInfo() || - !MF.getFunction()->doesNotThrow() || - UnwindTablesMandatory; + MF.getFunction()->needsUnwindTableEntry(); // Prepare for frame info. MCSymbol *FrameLabel = 0; @@ -488,6 +487,14 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx()); unsigned Reg = CSI[I].getReg(); if (Reg == PPC::LR || Reg == PPC::LR8 || Reg == PPC::RM) continue; + + // This is a bit of a hack: CR2LT, CR2GT, CR2EQ and CR2UN are just + // subregisters of CR2. We just need to emit a move of CR2. 
+ if (Reg == PPC::CR2LT || Reg == PPC::CR2GT || Reg == PPC::CR2EQ) + continue; + if (Reg == PPC::CR2UN) + Reg = PPC::CR2; + MachineLocation CSDst(MachineLocation::VirtualFP, Offset); MachineLocation CSSrc(Reg); Moves.push_back(MachineMove(Label, CSDst, CSSrc)); diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp index 0de5844..74ecff5 100644 --- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp +++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp @@ -233,7 +233,7 @@ void PPCHazardRecognizer970::EmitInstruction(SUnit *SU) { unsigned Opcode = Node->getMachineOpcode(); // Update structural hazard information. - if (Opcode == PPC::MTCTR) HasCTRSet = true; + if (Opcode == PPC::MTCTR || Opcode == PPC::MTCTR8) HasCTRSet = true; // Track the address stored to. if (isStore) { diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index faae9b2..511bb22 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -240,11 +240,11 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { if (PPCLowering.getPointerTy() == MVT::i32) { GlobalBaseReg = RegInfo->createVirtualRegister(PPC::GPRCRegisterClass); - BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR), PPC::LR); + BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR)); BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg); } else { GlobalBaseReg = RegInfo->createVirtualRegister(PPC::G8RCRegisterClass); - BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8), PPC::LR8); + BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8)); BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR8), GlobalBaseReg); } } @@ -1057,9 +1057,10 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { SDValue Chain = N->getOperand(0); SDValue Target = N->getOperand(1); unsigned Opc = Target.getValueType() == MVT::i32 ? PPC::MTCTR : PPC::MTCTR8; + unsigned Reg = Target.getValueType() == MVT::i32 ? PPC::BCTR : PPC::BCTR8; Chain = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Target, Chain), 0); - return CurDAG->SelectNodeTo(N, PPC::BCTR, MVT::Other, Chain); + return CurDAG->SelectNodeTo(N, Reg, MVT::Other, Chain); } } diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 46b97e1..55c15ec 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -394,6 +394,10 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128"); } + setMinFunctionAlignment(2); + if (PPCSubTarget.isDarwin()) + setPrefFunctionAlignment(4); + computeRegisterProperties(); } @@ -460,14 +464,6 @@ MVT::SimpleValueType PPCTargetLowering::getSetCCResultType(EVT VT) const { return MVT::i32; } -/// getFunctionAlignment - Return the Log2 alignment of this function. -unsigned PPCTargetLowering::getFunctionAlignment(const Function *F) const { - if (getTargetMachine().getSubtarget<PPCSubtarget>().isDarwin()) - return F->hasFnAttr(Attribute::OptimizeForSize) ? 2 : 4; - else - return 2; -} - //===----------------------------------------------------------------------===// // Node matching predicates, for use by the tblgen matching code. 
//===----------------------------------------------------------------------===// @@ -1014,7 +1010,8 @@ bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp, short Imm; if (isIntS16Immediate(CN, Imm)) { Disp = DAG.getTargetConstant((unsigned short)Imm >> 2, getPointerTy()); - Base = DAG.getRegister(PPC::R0, CN->getValueType(0)); + Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::X0 : PPC::R0, + CN->getValueType(0)); return true; } @@ -1561,8 +1558,8 @@ PPCTargetLowering::LowerFormalArguments_SVR4( // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, - *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); // Reserve space for the linkage area on the stack. CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize); @@ -1622,8 +1619,8 @@ PPCTargetLowering::LowerFormalArguments_SVR4( // Aggregates passed by value are stored in the local variable space of the // caller's stack frame, right above the parameter list area. SmallVector<CCValAssign, 16> ByValArgLocs; - CCState CCByValInfo(CallConv, isVarArg, getTargetMachine(), - ByValArgLocs, *DAG.getContext()); + CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ByValArgLocs, *DAG.getContext()); // Reserve stack space for the allocations in CCInfo. CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); @@ -2155,7 +2152,7 @@ CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG, } /// CalculateTailCallSPDiff - Get the amount the stack pointer has to be -/// adjusted to accomodate the arguments for the tailcall. +/// adjusted to accommodate the arguments for the tailcall. static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall, unsigned ParamSize) { @@ -2396,7 +2393,7 @@ void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain, // Emit a sequence of copyto/copyfrom virtual registers for arguments that // might overwrite each other in case of tail call optimization. SmallVector<SDValue, 8> MemOpChains2; - // Do not flag preceeding copytoreg stuff together with the following stuff. + // Do not flag preceding copytoreg stuff together with the following stuff. InFlag = SDValue(); StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments, MemOpChains2, dl); @@ -2444,7 +2441,8 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, if (!DAG.getTarget().getSubtarget<PPCSubtarget>().isJITCodeModel()) { unsigned OpFlags = 0; if (DAG.getTarget().getRelocationModel() != Reloc::Static && - PPCSubTarget.getDarwinVers() < 9 && + (!PPCSubTarget.getTargetTriple().isMacOSX() || + PPCSubTarget.getTargetTriple().isMacOSXVersionLT(10, 5)) && (G->getGlobal()->isDeclaration() || G->getGlobal()->isWeakForLinker())) { // PC-relative references to external symbols should go through $stub, @@ -2467,7 +2465,8 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, unsigned char OpFlags = 0; if (DAG.getTarget().getRelocationModel() != Reloc::Static && - PPCSubTarget.getDarwinVers() < 9) { + (!PPCSubTarget.getTargetTriple().isMacOSX() || + PPCSubTarget.getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. 
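
Note on the $stub logic just above: the old guard PPCSubTarget.getDarwinVers() < 9 and the new form !isMacOSX() || isMacOSXVersionLT(10, 5) encode the same cutoff, since Darwin 9 is Mac OS X 10.5 (Leopard), the first release whose linker synthesizes these stubs automatically. A small self-contained C++ sketch of just the version test follows, with a toy struct standing in for llvm::Triple; the real condition is additionally gated on the relocation model and, for globals, on the symbol being a declaration or weak.

  // Sketch: PC-relative references to external symbols need a $stub only
  // on Mac OS X releases older than 10.5.
  #include <cstdio>

  struct ToyTriple {
    bool IsMacOSX;
    int Major, Minor;                        // e.g. 10 and 4 for Tiger
    bool isMacOSXVersionLT(int Maj, int Min) const {
      return Major < Maj || (Major == Maj && Minor < Min);
    }
  };

  static bool needsDarwinStub(const ToyTriple &T) {
    // Same shape as the new condition in the hunks above.
    return !T.IsMacOSX || T.isMacOSXVersionLT(10, 5);
  }

  int main() {
    ToyTriple Tiger = {true, 10, 4}, Leopard = {true, 10, 5};
    std::printf("10.4 needs stub: %d\n", needsDarwinStub(Tiger));   // 1
    std::printf("10.5 needs stub: %d\n", needsDarwinStub(Leopard)); // 0
    return 0;
  }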
@@ -2563,7 +2562,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, Callee.setNode(0); // Add CTR register as callee so a bctr can be emitted later. if (isTailCall) - Ops.push_back(DAG.getRegister(PPC::CTR, PtrVT)); + Ops.push_back(DAG.getRegister(isPPC64 ? PPC::CTR8 : PPC::CTR, PtrVT)); } // If this is a direct call, pass the chain and the callee. @@ -2592,8 +2591,8 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SmallVectorImpl<SDValue> &InVals) const { SmallVector<CCValAssign, 16> RVLocs; - CCState CCRetInfo(CallConv, isVarArg, getTargetMachine(), - RVLocs, *DAG.getContext()); + CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC); // Copy all of the result registers out of their specified physreg. @@ -2642,8 +2641,8 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl, // to the liveout set for the function. if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs, - *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_PPC); for (unsigned i = 0; i != RVLocs.size(); ++i) DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); @@ -2756,8 +2755,8 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee, // Assign locations to all of the outgoing arguments. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); // Reserve space for the linkage area on the stack. CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize); @@ -2796,8 +2795,8 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee, // Assign locations to all of the outgoing aggregate by value arguments. SmallVector<CCValAssign, 16> ByValArgLocs; - CCState CCByValInfo(CallConv, isVarArg, getTargetMachine(), ByValArgLocs, - *DAG.getContext()); + CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ByValArgLocs, *DAG.getContext()); // Reserve stack space for the allocations in CCInfo. CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); @@ -2903,6 +2902,12 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee, Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOpChains[0], MemOpChains.size()); + // Set CR6 to true if this is a vararg call. + if (isVarArg) { + SDValue SetCR(DAG.getMachineNode(PPC::CRSET, dl, MVT::i32), 0); + RegsToPass.push_back(std::make_pair(unsigned(PPC::CR1EQ), SetCR)); + } + // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; @@ -2912,13 +2917,6 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee, InFlag = Chain.getValue(1); } - // Set CR6 to true if this is a vararg call. 
- if (isVarArg) { - SDValue SetCR(DAG.getMachineNode(PPC::CRSET, dl, MVT::i32), 0); - Chain = DAG.getCopyToReg(Chain, dl, PPC::CR1EQ, SetCR, InFlag); - InFlag = Chain.getValue(1); - } - if (isTailCall) PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp, false, TailCallArguments); @@ -3304,8 +3302,8 @@ PPCTargetLowering::LowerReturn(SDValue Chain, DebugLoc dl, SelectionDAG &DAG) const { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_PPC); // If this is the first return lowered for this function, add the regs to the @@ -5440,10 +5438,16 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. -void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, char Letter, +void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, std::vector<SDValue>&Ops, SelectionDAG &DAG) const { SDValue Result(0,0); + + // Only support length 1 constraints. + if (Constraint.length() > 1) return; + + char Letter = Constraint[0]; switch (Letter) { default: break; case 'I': @@ -5499,7 +5503,7 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, char Letter, } // Handle standard constraint letters. - TargetLowering::LowerAsmOperandForConstraint(Op, Letter, Ops, DAG); + TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } // isLegalAddressingMode - Return true if the addressing mode represented diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 33daae9..986b4e7 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -328,7 +328,7 @@ namespace llvm { /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. virtual void LowerAsmOperandForConstraint(SDValue Op, - char ConstraintLetter, + std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const; @@ -364,9 +364,6 @@ namespace llvm { bool NonScalarIntSafe, bool MemcpyStrSrc, MachineFunction &MF) const; - /// getFunctionAlignment - Return the Log2 alignment of this function. - virtual unsigned getFunctionAlignment(const Function *F) const; - private: SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const; SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const; diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 9f0fae5..e88ad37 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -60,7 +60,7 @@ def HI48_64 : SDNodeXForm<imm, [{ // let Defs = [LR8] in - def MovePCtoLR8 : Pseudo<(outs), (ins piclabel:$label), "", []>, + def MovePCtoLR8 : Pseudo<(outs), (ins), "", []>, PPC970_Unit_BRU; // Darwin ABI Calls. 
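
Note on the LowerAsmOperandForConstraint hunks a few files back: the hook now receives the whole constraint string rather than a single letter, so the PPC override bails out on anything longer than one character before switching on Constraint[0], and defers to the target-independent handler at the end. A compilable toy version of that control flow follows; int stands in for SDValue, and the range test for 'I' (a small signed immediate on PPC, going by the case label above) is illustrative only.

  #include <cstdio>
  #include <string>
  #include <vector>

  static void lowerAsmOperandSketch(int Op, const std::string &Constraint,
                                    std::vector<int> &Ops) {
    // Only single-letter constraints are handled here; longer ones are
    // left for other handlers, as in the PPC change above.
    if (Constraint.length() > 1) return;
    switch (Constraint[0]) {
    default:
      break;                               // fall back to generic handling
    case 'I':
      if (Op >= -32768 && Op <= 32767)     // fits in a signed 16-bit field
        Ops.push_back(Op);
      break;
    }
  }

  int main() {
    std::vector<int> Ops;
    lowerAsmOperandSketch(42, "I", Ops);   // accepted
    lowerAsmOperandSketch(42, "es", Ops);  // multi-character: skipped here
    std::printf("lowered %zu operand(s)\n", Ops.size());  // prints 1
    return 0;
  }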
@@ -190,10 +190,15 @@ def TCRETURNri8 : Pseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset, variable_ops) let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1, - isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR, RM] in -def TAILBCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>, - Requires<[In64BitMode]>; + isIndirectBranch = 1, isCall = 1, Uses = [CTR8, RM] in { + let isReturn = 1 in { + def TAILBCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>, + Requires<[In64BitMode]>; + } + def BCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>, + Requires<[In64BitMode]>; +} let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7, diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 24071b7..773578c 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -300,7 +300,6 @@ def calltarget : Operand<iPTR> { def aaddr : Operand<iPTR> { let PrintMethod = "printAbsAddrOperand"; } -def piclabel: Operand<iPTR> {} def symbolHi: Operand<i32> { let PrintMethod = "printSymbolHi"; let EncoderMethod = "getHA16Encoding"; @@ -413,7 +412,7 @@ let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in { } let Defs = [LR] in - def MovePCtoLR : Pseudo<(outs), (ins piclabel:$label), "", []>, + def MovePCtoLR : Pseudo<(outs), (ins), "", []>, PPC970_Unit_BRU; let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in { diff --git a/lib/Target/PowerPC/PPCJITInfo.cpp b/lib/Target/PowerPC/PPCJITInfo.cpp index 78383e0..4590f00 100644 --- a/lib/Target/PowerPC/PPCJITInfo.cpp +++ b/lib/Target/PowerPC/PPCJITInfo.cpp @@ -87,7 +87,7 @@ asm( // FIXME: could shrink frame // Set up a proper stack frame // FIXME Layout - // PowerPC64 ABI linkage - 24 bytes + // PowerPC32 ABI linkage - 24 bytes // parameters - 32 bytes // 13 double registers - 104 bytes // 8 int registers - 32 bytes @@ -205,11 +205,27 @@ void PPC32CompilationCallback() { #if (defined(__POWERPC__) || defined (__ppc__) || defined(_POWER)) && \ defined(__ppc64__) +#ifdef __ELF__ +asm( + ".text\n" + ".align 2\n" + ".globl PPC64CompilationCallback\n" + ".section \".opd\",\"aw\"\n" + ".align 3\n" +"PPC64CompilationCallback:\n" + ".quad .L.PPC64CompilationCallback,.TOC.@tocbase,0\n" + ".size PPC64CompilationCallback,24\n" + ".previous\n" + ".align 4\n" + ".type PPC64CompilationCallback,@function\n" +".L.PPC64CompilationCallback:\n" +#else asm( ".text\n" ".align 2\n" ".globl _PPC64CompilationCallback\n" "_PPC64CompilationCallback:\n" +#endif // Make space for 8 ints r[3-10] and 13 doubles f[1-13] and the // FIXME: need to save v[0-19] for altivec? // Set up a proper stack frame @@ -218,49 +234,55 @@ asm( // parameters - 64 bytes // 13 double registers - 104 bytes // 8 int registers - 64 bytes - "mflr r0\n" - "std r0, 16(r1)\n" - "stdu r1, -280(r1)\n" + "mflr 0\n" + "std 0, 16(1)\n" + "stdu 1, -280(1)\n" // Save all int arg registers - "std r10, 272(r1)\n" "std r9, 264(r1)\n" - "std r8, 256(r1)\n" "std r7, 248(r1)\n" - "std r6, 240(r1)\n" "std r5, 232(r1)\n" - "std r4, 224(r1)\n" "std r3, 216(r1)\n" + "std 10, 272(1)\n" "std 9, 264(1)\n" + "std 8, 256(1)\n" "std 7, 248(1)\n" + "std 6, 240(1)\n" "std 5, 232(1)\n" + "std 4, 224(1)\n" "std 3, 216(1)\n" // Save all call-clobbered FP regs. 
- "stfd f13, 208(r1)\n" "stfd f12, 200(r1)\n" - "stfd f11, 192(r1)\n" "stfd f10, 184(r1)\n" - "stfd f9, 176(r1)\n" "stfd f8, 168(r1)\n" - "stfd f7, 160(r1)\n" "stfd f6, 152(r1)\n" - "stfd f5, 144(r1)\n" "stfd f4, 136(r1)\n" - "stfd f3, 128(r1)\n" "stfd f2, 120(r1)\n" - "stfd f1, 112(r1)\n" + "stfd 13, 208(1)\n" "stfd 12, 200(1)\n" + "stfd 11, 192(1)\n" "stfd 10, 184(1)\n" + "stfd 9, 176(1)\n" "stfd 8, 168(1)\n" + "stfd 7, 160(1)\n" "stfd 6, 152(1)\n" + "stfd 5, 144(1)\n" "stfd 4, 136(1)\n" + "stfd 3, 128(1)\n" "stfd 2, 120(1)\n" + "stfd 1, 112(1)\n" // Arguments to Compilation Callback: // r3 - our lr (address of the call instruction in stub plus 4) // r4 - stub's lr (address of instruction that called the stub plus 4) // r5 - is64Bit - always 1. - "mr r3, r0\n" - "ld r2, 280(r1)\n" // stub's frame - "ld r4, 16(r2)\n" // stub's lr - "li r5, 1\n" // 1 == 64 bit + "mr 3, 0\n" // return address (still in r0) + "ld 5, 280(1)\n" // stub's frame + "ld 4, 16(5)\n" // stub's lr + "li 5, 1\n" // 1 == 64 bit +#ifdef __ELF__ + "bl PPCCompilationCallbackC\n" + "nop\n" +#else "bl _PPCCompilationCallbackC\n" - "mtctr r3\n" +#endif + "mtctr 3\n" // Restore all int arg registers - "ld r10, 272(r1)\n" "ld r9, 264(r1)\n" - "ld r8, 256(r1)\n" "ld r7, 248(r1)\n" - "ld r6, 240(r1)\n" "ld r5, 232(r1)\n" - "ld r4, 224(r1)\n" "ld r3, 216(r1)\n" + "ld 10, 272(1)\n" "ld 9, 264(1)\n" + "ld 8, 256(1)\n" "ld 7, 248(1)\n" + "ld 6, 240(1)\n" "ld 5, 232(1)\n" + "ld 4, 224(1)\n" "ld 3, 216(1)\n" // Restore all FP arg registers - "lfd f13, 208(r1)\n" "lfd f12, 200(r1)\n" - "lfd f11, 192(r1)\n" "lfd f10, 184(r1)\n" - "lfd f9, 176(r1)\n" "lfd f8, 168(r1)\n" - "lfd f7, 160(r1)\n" "lfd f6, 152(r1)\n" - "lfd f5, 144(r1)\n" "lfd f4, 136(r1)\n" - "lfd f3, 128(r1)\n" "lfd f2, 120(r1)\n" - "lfd f1, 112(r1)\n" + "lfd 13, 208(1)\n" "lfd 12, 200(1)\n" + "lfd 11, 192(1)\n" "lfd 10, 184(1)\n" + "lfd 9, 176(1)\n" "lfd 8, 168(1)\n" + "lfd 7, 160(1)\n" "lfd 6, 152(1)\n" + "lfd 5, 144(1)\n" "lfd 4, 136(1)\n" + "lfd 3, 128(1)\n" "lfd 2, 120(1)\n" + "lfd 1, 112(1)\n" // Pop 3 frames off the stack and branch to target - "ld r1, 280(r1)\n" - "ld r2, 16(r1)\n" - "mtlr r2\n" + "ld 1, 280(1)\n" + "ld 0, 16(1)\n" + "mtlr 0\n" + // XXX: any special TOC handling in the ELF case for JIT? "bctr\n" ); #else diff --git a/lib/Target/PowerPC/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/PPCMCAsmInfo.cpp index d1178dd..2d5c880 100644 --- a/lib/Target/PowerPC/PPCMCAsmInfo.cpp +++ b/lib/Target/PowerPC/PPCMCAsmInfo.cpp @@ -17,7 +17,7 @@ using namespace llvm; PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit) { PCSymbol = "."; CommentString = ";"; - ExceptionsType = ExceptionHandling::DwarfTable; + ExceptionsType = ExceptionHandling::DwarfCFI; if (!is64Bit) Data64bitsDirective = 0; // We can't emit a 64-bit unit in PPC32 mode. @@ -48,7 +48,7 @@ PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit) { // Exceptions handling if (!is64Bit) - ExceptionsType = ExceptionHandling::DwarfTable; + ExceptionsType = ExceptionHandling::DwarfCFI; ZeroDirective = "\t.space\t"; Data64bitsDirective = is64Bit ? 
"\t.quad\t" : 0; diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp index 6082587..33af426 100644 --- a/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -95,14 +95,14 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){ } static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, - AsmPrinter &Printer) { + AsmPrinter &Printer, bool isDarwin) { MCContext &Ctx = Printer.OutContext; MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; if (MO.getTargetFlags() & PPCII::MO_LO16) - RefKind = MCSymbolRefExpr::VK_PPC_LO16; + RefKind = isDarwin ? MCSymbolRefExpr::VK_PPC_DARWIN_LO16 : MCSymbolRefExpr::VK_PPC_GAS_LO16; else if (MO.getTargetFlags() & PPCII::MO_HA16) - RefKind = MCSymbolRefExpr::VK_PPC_HA16; + RefKind = isDarwin ? MCSymbolRefExpr::VK_PPC_DARWIN_HA16 : MCSymbolRefExpr::VK_PPC_GAS_HA16; // FIXME: This isn't right, but we don't have a good way to express this in // the MC Level, see below. @@ -130,7 +130,7 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, } void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, - AsmPrinter &AP) { + AsmPrinter &AP, bool isDarwin) { OutMI.setOpcode(MI->getOpcode()); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { @@ -154,16 +154,17 @@ void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, break; case MachineOperand::MO_GlobalAddress: case MachineOperand::MO_ExternalSymbol: - MCOp = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP); + MCOp = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, isDarwin); break; case MachineOperand::MO_JumpTableIndex: - MCOp = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP); + MCOp = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, isDarwin); break; case MachineOperand::MO_ConstantPoolIndex: - MCOp = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP); + MCOp = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, isDarwin); break; case MachineOperand::MO_BlockAddress: - MCOp = GetSymbolRef(MO,AP.GetBlockAddressSymbol(MO.getBlockAddress()),AP); + MCOp = GetSymbolRef(MO,AP.GetBlockAddressSymbol(MO.getBlockAddress()),AP, + isDarwin); break; } diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 45d8b6b..3374e9b 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -686,9 +686,28 @@ unsigned PPCRegisterInfo::getEHHandlerRegister() const { return !Subtarget.isPPC64() ? PPC::R4 : PPC::X4; } +/// DWARFFlavour - Flavour of dwarf regnumbers +/// +namespace DWARFFlavour { + enum { + PPC64 = 0, PPC32 = 1 + }; +} + int PPCRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const { // FIXME: Most probably dwarf numbers differs for Linux and Darwin - return PPCGenRegisterInfo::getDwarfRegNumFull(RegNum, 0); + unsigned Flavour = Subtarget.isPPC64() ? + DWARFFlavour::PPC64 : DWARFFlavour::PPC32; + + return PPCGenRegisterInfo::getDwarfRegNumFull(RegNum, Flavour); +} + +int PPCRegisterInfo::getLLVMRegNum(unsigned RegNum, bool isEH) const { + // FIXME: Most probably dwarf numbers differs for Linux and Darwin + unsigned Flavour = Subtarget.isPPC64() ? 
+ DWARFFlavour::PPC64 : DWARFFlavour::PPC32; + + return PPCGenRegisterInfo::getLLVMRegNumFull(RegNum, Flavour); } #include "PPCGenRegisterInfo.inc" diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h index aa29ffe..48c2562 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/lib/Target/PowerPC/PPCRegisterInfo.h @@ -68,6 +68,7 @@ public: unsigned getEHHandlerRegister() const; int getDwarfRegNum(unsigned RegNum, bool isEH) const; + int getLLVMRegNum(unsigned RegNum, bool isEH) const; }; } // end namespace llvm diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td index 2639165..1acdf4e 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/lib/Target/PowerPC/PPCRegisterInfo.td @@ -65,203 +65,203 @@ class CRBIT<bits<5> num, string n> : PPCReg<n> { // General-purpose registers -def R0 : GPR< 0, "r0">, DwarfRegNum<[0]>; -def R1 : GPR< 1, "r1">, DwarfRegNum<[1]>; -def R2 : GPR< 2, "r2">, DwarfRegNum<[2]>; -def R3 : GPR< 3, "r3">, DwarfRegNum<[3]>; -def R4 : GPR< 4, "r4">, DwarfRegNum<[4]>; -def R5 : GPR< 5, "r5">, DwarfRegNum<[5]>; -def R6 : GPR< 6, "r6">, DwarfRegNum<[6]>; -def R7 : GPR< 7, "r7">, DwarfRegNum<[7]>; -def R8 : GPR< 8, "r8">, DwarfRegNum<[8]>; -def R9 : GPR< 9, "r9">, DwarfRegNum<[9]>; -def R10 : GPR<10, "r10">, DwarfRegNum<[10]>; -def R11 : GPR<11, "r11">, DwarfRegNum<[11]>; -def R12 : GPR<12, "r12">, DwarfRegNum<[12]>; -def R13 : GPR<13, "r13">, DwarfRegNum<[13]>; -def R14 : GPR<14, "r14">, DwarfRegNum<[14]>; -def R15 : GPR<15, "r15">, DwarfRegNum<[15]>; -def R16 : GPR<16, "r16">, DwarfRegNum<[16]>; -def R17 : GPR<17, "r17">, DwarfRegNum<[17]>; -def R18 : GPR<18, "r18">, DwarfRegNum<[18]>; -def R19 : GPR<19, "r19">, DwarfRegNum<[19]>; -def R20 : GPR<20, "r20">, DwarfRegNum<[20]>; -def R21 : GPR<21, "r21">, DwarfRegNum<[21]>; -def R22 : GPR<22, "r22">, DwarfRegNum<[22]>; -def R23 : GPR<23, "r23">, DwarfRegNum<[23]>; -def R24 : GPR<24, "r24">, DwarfRegNum<[24]>; -def R25 : GPR<25, "r25">, DwarfRegNum<[25]>; -def R26 : GPR<26, "r26">, DwarfRegNum<[26]>; -def R27 : GPR<27, "r27">, DwarfRegNum<[27]>; -def R28 : GPR<28, "r28">, DwarfRegNum<[28]>; -def R29 : GPR<29, "r29">, DwarfRegNum<[29]>; -def R30 : GPR<30, "r30">, DwarfRegNum<[30]>; -def R31 : GPR<31, "r31">, DwarfRegNum<[31]>; +def R0 : GPR< 0, "r0">, DwarfRegNum<[-2, 0]>; +def R1 : GPR< 1, "r1">, DwarfRegNum<[-2, 1]>; +def R2 : GPR< 2, "r2">, DwarfRegNum<[-2, 2]>; +def R3 : GPR< 3, "r3">, DwarfRegNum<[-2, 3]>; +def R4 : GPR< 4, "r4">, DwarfRegNum<[-2, 4]>; +def R5 : GPR< 5, "r5">, DwarfRegNum<[-2, 5]>; +def R6 : GPR< 6, "r6">, DwarfRegNum<[-2, 6]>; +def R7 : GPR< 7, "r7">, DwarfRegNum<[-2, 7]>; +def R8 : GPR< 8, "r8">, DwarfRegNum<[-2, 8]>; +def R9 : GPR< 9, "r9">, DwarfRegNum<[-2, 9]>; +def R10 : GPR<10, "r10">, DwarfRegNum<[-2, 10]>; +def R11 : GPR<11, "r11">, DwarfRegNum<[-2, 11]>; +def R12 : GPR<12, "r12">, DwarfRegNum<[-2, 12]>; +def R13 : GPR<13, "r13">, DwarfRegNum<[-2, 13]>; +def R14 : GPR<14, "r14">, DwarfRegNum<[-2, 14]>; +def R15 : GPR<15, "r15">, DwarfRegNum<[-2, 15]>; +def R16 : GPR<16, "r16">, DwarfRegNum<[-2, 16]>; +def R17 : GPR<17, "r17">, DwarfRegNum<[-2, 17]>; +def R18 : GPR<18, "r18">, DwarfRegNum<[-2, 18]>; +def R19 : GPR<19, "r19">, DwarfRegNum<[-2, 19]>; +def R20 : GPR<20, "r20">, DwarfRegNum<[-2, 20]>; +def R21 : GPR<21, "r21">, DwarfRegNum<[-2, 21]>; +def R22 : GPR<22, "r22">, DwarfRegNum<[-2, 22]>; +def R23 : GPR<23, "r23">, DwarfRegNum<[-2, 23]>; +def R24 : GPR<24, "r24">, DwarfRegNum<[-2, 24]>; +def R25 : GPR<25, "r25">, 
DwarfRegNum<[-2, 25]>; +def R26 : GPR<26, "r26">, DwarfRegNum<[-2, 26]>; +def R27 : GPR<27, "r27">, DwarfRegNum<[-2, 27]>; +def R28 : GPR<28, "r28">, DwarfRegNum<[-2, 28]>; +def R29 : GPR<29, "r29">, DwarfRegNum<[-2, 29]>; +def R30 : GPR<30, "r30">, DwarfRegNum<[-2, 30]>; +def R31 : GPR<31, "r31">, DwarfRegNum<[-2, 31]>; // 64-bit General-purpose registers -def X0 : GP8< R0, "r0">, DwarfRegNum<[0]>; -def X1 : GP8< R1, "r1">, DwarfRegNum<[1]>; -def X2 : GP8< R2, "r2">, DwarfRegNum<[2]>; -def X3 : GP8< R3, "r3">, DwarfRegNum<[3]>; -def X4 : GP8< R4, "r4">, DwarfRegNum<[4]>; -def X5 : GP8< R5, "r5">, DwarfRegNum<[5]>; -def X6 : GP8< R6, "r6">, DwarfRegNum<[6]>; -def X7 : GP8< R7, "r7">, DwarfRegNum<[7]>; -def X8 : GP8< R8, "r8">, DwarfRegNum<[8]>; -def X9 : GP8< R9, "r9">, DwarfRegNum<[9]>; -def X10 : GP8<R10, "r10">, DwarfRegNum<[10]>; -def X11 : GP8<R11, "r11">, DwarfRegNum<[11]>; -def X12 : GP8<R12, "r12">, DwarfRegNum<[12]>; -def X13 : GP8<R13, "r13">, DwarfRegNum<[13]>; -def X14 : GP8<R14, "r14">, DwarfRegNum<[14]>; -def X15 : GP8<R15, "r15">, DwarfRegNum<[15]>; -def X16 : GP8<R16, "r16">, DwarfRegNum<[16]>; -def X17 : GP8<R17, "r17">, DwarfRegNum<[17]>; -def X18 : GP8<R18, "r18">, DwarfRegNum<[18]>; -def X19 : GP8<R19, "r19">, DwarfRegNum<[19]>; -def X20 : GP8<R20, "r20">, DwarfRegNum<[20]>; -def X21 : GP8<R21, "r21">, DwarfRegNum<[21]>; -def X22 : GP8<R22, "r22">, DwarfRegNum<[22]>; -def X23 : GP8<R23, "r23">, DwarfRegNum<[23]>; -def X24 : GP8<R24, "r24">, DwarfRegNum<[24]>; -def X25 : GP8<R25, "r25">, DwarfRegNum<[25]>; -def X26 : GP8<R26, "r26">, DwarfRegNum<[26]>; -def X27 : GP8<R27, "r27">, DwarfRegNum<[27]>; -def X28 : GP8<R28, "r28">, DwarfRegNum<[28]>; -def X29 : GP8<R29, "r29">, DwarfRegNum<[29]>; -def X30 : GP8<R30, "r30">, DwarfRegNum<[30]>; -def X31 : GP8<R31, "r31">, DwarfRegNum<[31]>; +def X0 : GP8< R0, "r0">, DwarfRegNum<[0, -2]>; +def X1 : GP8< R1, "r1">, DwarfRegNum<[1, -2]>; +def X2 : GP8< R2, "r2">, DwarfRegNum<[2, -2]>; +def X3 : GP8< R3, "r3">, DwarfRegNum<[3, -2]>; +def X4 : GP8< R4, "r4">, DwarfRegNum<[4, -2]>; +def X5 : GP8< R5, "r5">, DwarfRegNum<[5, -2]>; +def X6 : GP8< R6, "r6">, DwarfRegNum<[6, -2]>; +def X7 : GP8< R7, "r7">, DwarfRegNum<[7, -2]>; +def X8 : GP8< R8, "r8">, DwarfRegNum<[8, -2]>; +def X9 : GP8< R9, "r9">, DwarfRegNum<[9, -2]>; +def X10 : GP8<R10, "r10">, DwarfRegNum<[10, -2]>; +def X11 : GP8<R11, "r11">, DwarfRegNum<[11, -2]>; +def X12 : GP8<R12, "r12">, DwarfRegNum<[12, -2]>; +def X13 : GP8<R13, "r13">, DwarfRegNum<[13, -2]>; +def X14 : GP8<R14, "r14">, DwarfRegNum<[14, -2]>; +def X15 : GP8<R15, "r15">, DwarfRegNum<[15, -2]>; +def X16 : GP8<R16, "r16">, DwarfRegNum<[16, -2]>; +def X17 : GP8<R17, "r17">, DwarfRegNum<[17, -2]>; +def X18 : GP8<R18, "r18">, DwarfRegNum<[18, -2]>; +def X19 : GP8<R19, "r19">, DwarfRegNum<[19, -2]>; +def X20 : GP8<R20, "r20">, DwarfRegNum<[20, -2]>; +def X21 : GP8<R21, "r21">, DwarfRegNum<[21, -2]>; +def X22 : GP8<R22, "r22">, DwarfRegNum<[22, -2]>; +def X23 : GP8<R23, "r23">, DwarfRegNum<[23, -2]>; +def X24 : GP8<R24, "r24">, DwarfRegNum<[24, -2]>; +def X25 : GP8<R25, "r25">, DwarfRegNum<[25, -2]>; +def X26 : GP8<R26, "r26">, DwarfRegNum<[26, -2]>; +def X27 : GP8<R27, "r27">, DwarfRegNum<[27, -2]>; +def X28 : GP8<R28, "r28">, DwarfRegNum<[28, -2]>; +def X29 : GP8<R29, "r29">, DwarfRegNum<[29, -2]>; +def X30 : GP8<R30, "r30">, DwarfRegNum<[30, -2]>; +def X31 : GP8<R31, "r31">, DwarfRegNum<[31, -2]>; // Floating-point registers -def F0 : FPR< 0, "f0">, DwarfRegNum<[32]>; -def F1 : FPR< 1, "f1">, DwarfRegNum<[33]>; 
-def F2 : FPR< 2, "f2">, DwarfRegNum<[34]>; -def F3 : FPR< 3, "f3">, DwarfRegNum<[35]>; -def F4 : FPR< 4, "f4">, DwarfRegNum<[36]>; -def F5 : FPR< 5, "f5">, DwarfRegNum<[37]>; -def F6 : FPR< 6, "f6">, DwarfRegNum<[38]>; -def F7 : FPR< 7, "f7">, DwarfRegNum<[39]>; -def F8 : FPR< 8, "f8">, DwarfRegNum<[40]>; -def F9 : FPR< 9, "f9">, DwarfRegNum<[41]>; -def F10 : FPR<10, "f10">, DwarfRegNum<[42]>; -def F11 : FPR<11, "f11">, DwarfRegNum<[43]>; -def F12 : FPR<12, "f12">, DwarfRegNum<[44]>; -def F13 : FPR<13, "f13">, DwarfRegNum<[45]>; -def F14 : FPR<14, "f14">, DwarfRegNum<[46]>; -def F15 : FPR<15, "f15">, DwarfRegNum<[47]>; -def F16 : FPR<16, "f16">, DwarfRegNum<[48]>; -def F17 : FPR<17, "f17">, DwarfRegNum<[49]>; -def F18 : FPR<18, "f18">, DwarfRegNum<[50]>; -def F19 : FPR<19, "f19">, DwarfRegNum<[51]>; -def F20 : FPR<20, "f20">, DwarfRegNum<[52]>; -def F21 : FPR<21, "f21">, DwarfRegNum<[53]>; -def F22 : FPR<22, "f22">, DwarfRegNum<[54]>; -def F23 : FPR<23, "f23">, DwarfRegNum<[55]>; -def F24 : FPR<24, "f24">, DwarfRegNum<[56]>; -def F25 : FPR<25, "f25">, DwarfRegNum<[57]>; -def F26 : FPR<26, "f26">, DwarfRegNum<[58]>; -def F27 : FPR<27, "f27">, DwarfRegNum<[59]>; -def F28 : FPR<28, "f28">, DwarfRegNum<[60]>; -def F29 : FPR<29, "f29">, DwarfRegNum<[61]>; -def F30 : FPR<30, "f30">, DwarfRegNum<[62]>; -def F31 : FPR<31, "f31">, DwarfRegNum<[63]>; +def F0 : FPR< 0, "f0">, DwarfRegNum<[32, 32]>; +def F1 : FPR< 1, "f1">, DwarfRegNum<[33, 33]>; +def F2 : FPR< 2, "f2">, DwarfRegNum<[34, 34]>; +def F3 : FPR< 3, "f3">, DwarfRegNum<[35, 35]>; +def F4 : FPR< 4, "f4">, DwarfRegNum<[36, 36]>; +def F5 : FPR< 5, "f5">, DwarfRegNum<[37, 37]>; +def F6 : FPR< 6, "f6">, DwarfRegNum<[38, 38]>; +def F7 : FPR< 7, "f7">, DwarfRegNum<[39, 39]>; +def F8 : FPR< 8, "f8">, DwarfRegNum<[40, 40]>; +def F9 : FPR< 9, "f9">, DwarfRegNum<[41, 41]>; +def F10 : FPR<10, "f10">, DwarfRegNum<[42, 42]>; +def F11 : FPR<11, "f11">, DwarfRegNum<[43, 43]>; +def F12 : FPR<12, "f12">, DwarfRegNum<[44, 44]>; +def F13 : FPR<13, "f13">, DwarfRegNum<[45, 45]>; +def F14 : FPR<14, "f14">, DwarfRegNum<[46, 46]>; +def F15 : FPR<15, "f15">, DwarfRegNum<[47, 47]>; +def F16 : FPR<16, "f16">, DwarfRegNum<[48, 48]>; +def F17 : FPR<17, "f17">, DwarfRegNum<[49, 49]>; +def F18 : FPR<18, "f18">, DwarfRegNum<[50, 50]>; +def F19 : FPR<19, "f19">, DwarfRegNum<[51, 51]>; +def F20 : FPR<20, "f20">, DwarfRegNum<[52, 52]>; +def F21 : FPR<21, "f21">, DwarfRegNum<[53, 53]>; +def F22 : FPR<22, "f22">, DwarfRegNum<[54, 54]>; +def F23 : FPR<23, "f23">, DwarfRegNum<[55, 55]>; +def F24 : FPR<24, "f24">, DwarfRegNum<[56, 56]>; +def F25 : FPR<25, "f25">, DwarfRegNum<[57, 57]>; +def F26 : FPR<26, "f26">, DwarfRegNum<[58, 58]>; +def F27 : FPR<27, "f27">, DwarfRegNum<[59, 59]>; +def F28 : FPR<28, "f28">, DwarfRegNum<[60, 60]>; +def F29 : FPR<29, "f29">, DwarfRegNum<[61, 61]>; +def F30 : FPR<30, "f30">, DwarfRegNum<[62, 62]>; +def F31 : FPR<31, "f31">, DwarfRegNum<[63, 63]>; // Vector registers -def V0 : VR< 0, "v0">, DwarfRegNum<[77]>; -def V1 : VR< 1, "v1">, DwarfRegNum<[78]>; -def V2 : VR< 2, "v2">, DwarfRegNum<[79]>; -def V3 : VR< 3, "v3">, DwarfRegNum<[80]>; -def V4 : VR< 4, "v4">, DwarfRegNum<[81]>; -def V5 : VR< 5, "v5">, DwarfRegNum<[82]>; -def V6 : VR< 6, "v6">, DwarfRegNum<[83]>; -def V7 : VR< 7, "v7">, DwarfRegNum<[84]>; -def V8 : VR< 8, "v8">, DwarfRegNum<[85]>; -def V9 : VR< 9, "v9">, DwarfRegNum<[86]>; -def V10 : VR<10, "v10">, DwarfRegNum<[87]>; -def V11 : VR<11, "v11">, DwarfRegNum<[88]>; -def V12 : VR<12, "v12">, DwarfRegNum<[89]>; -def V13 : VR<13, "v13">, 
DwarfRegNum<[90]>; -def V14 : VR<14, "v14">, DwarfRegNum<[91]>; -def V15 : VR<15, "v15">, DwarfRegNum<[92]>; -def V16 : VR<16, "v16">, DwarfRegNum<[93]>; -def V17 : VR<17, "v17">, DwarfRegNum<[94]>; -def V18 : VR<18, "v18">, DwarfRegNum<[95]>; -def V19 : VR<19, "v19">, DwarfRegNum<[96]>; -def V20 : VR<20, "v20">, DwarfRegNum<[97]>; -def V21 : VR<21, "v21">, DwarfRegNum<[98]>; -def V22 : VR<22, "v22">, DwarfRegNum<[99]>; -def V23 : VR<23, "v23">, DwarfRegNum<[100]>; -def V24 : VR<24, "v24">, DwarfRegNum<[101]>; -def V25 : VR<25, "v25">, DwarfRegNum<[102]>; -def V26 : VR<26, "v26">, DwarfRegNum<[103]>; -def V27 : VR<27, "v27">, DwarfRegNum<[104]>; -def V28 : VR<28, "v28">, DwarfRegNum<[105]>; -def V29 : VR<29, "v29">, DwarfRegNum<[106]>; -def V30 : VR<30, "v30">, DwarfRegNum<[107]>; -def V31 : VR<31, "v31">, DwarfRegNum<[108]>; +def V0 : VR< 0, "v0">, DwarfRegNum<[77, 77]>; +def V1 : VR< 1, "v1">, DwarfRegNum<[78, 78]>; +def V2 : VR< 2, "v2">, DwarfRegNum<[79, 79]>; +def V3 : VR< 3, "v3">, DwarfRegNum<[80, 80]>; +def V4 : VR< 4, "v4">, DwarfRegNum<[81, 81]>; +def V5 : VR< 5, "v5">, DwarfRegNum<[82, 82]>; +def V6 : VR< 6, "v6">, DwarfRegNum<[83, 83]>; +def V7 : VR< 7, "v7">, DwarfRegNum<[84, 84]>; +def V8 : VR< 8, "v8">, DwarfRegNum<[85, 85]>; +def V9 : VR< 9, "v9">, DwarfRegNum<[86, 86]>; +def V10 : VR<10, "v10">, DwarfRegNum<[87, 87]>; +def V11 : VR<11, "v11">, DwarfRegNum<[88, 88]>; +def V12 : VR<12, "v12">, DwarfRegNum<[89, 89]>; +def V13 : VR<13, "v13">, DwarfRegNum<[90, 90]>; +def V14 : VR<14, "v14">, DwarfRegNum<[91, 91]>; +def V15 : VR<15, "v15">, DwarfRegNum<[92, 92]>; +def V16 : VR<16, "v16">, DwarfRegNum<[93, 93]>; +def V17 : VR<17, "v17">, DwarfRegNum<[94, 94]>; +def V18 : VR<18, "v18">, DwarfRegNum<[95, 95]>; +def V19 : VR<19, "v19">, DwarfRegNum<[96, 96]>; +def V20 : VR<20, "v20">, DwarfRegNum<[97, 97]>; +def V21 : VR<21, "v21">, DwarfRegNum<[98, 98]>; +def V22 : VR<22, "v22">, DwarfRegNum<[99, 99]>; +def V23 : VR<23, "v23">, DwarfRegNum<[100, 100]>; +def V24 : VR<24, "v24">, DwarfRegNum<[101, 101]>; +def V25 : VR<25, "v25">, DwarfRegNum<[102, 102]>; +def V26 : VR<26, "v26">, DwarfRegNum<[103, 103]>; +def V27 : VR<27, "v27">, DwarfRegNum<[104, 104]>; +def V28 : VR<28, "v28">, DwarfRegNum<[105, 105]>; +def V29 : VR<29, "v29">, DwarfRegNum<[106, 106]>; +def V30 : VR<30, "v30">, DwarfRegNum<[107, 107]>; +def V31 : VR<31, "v31">, DwarfRegNum<[108, 108]>; // Condition register bits -def CR0LT : CRBIT< 0, "0">, DwarfRegNum<[0]>; -def CR0GT : CRBIT< 1, "1">, DwarfRegNum<[0]>; -def CR0EQ : CRBIT< 2, "2">, DwarfRegNum<[0]>; -def CR0UN : CRBIT< 3, "3">, DwarfRegNum<[0]>; -def CR1LT : CRBIT< 4, "4">, DwarfRegNum<[0]>; -def CR1GT : CRBIT< 5, "5">, DwarfRegNum<[0]>; -def CR1EQ : CRBIT< 6, "6">, DwarfRegNum<[0]>; -def CR1UN : CRBIT< 7, "7">, DwarfRegNum<[0]>; -def CR2LT : CRBIT< 8, "8">, DwarfRegNum<[0]>; -def CR2GT : CRBIT< 9, "9">, DwarfRegNum<[0]>; -def CR2EQ : CRBIT<10, "10">, DwarfRegNum<[0]>; -def CR2UN : CRBIT<11, "11">, DwarfRegNum<[0]>; -def CR3LT : CRBIT<12, "12">, DwarfRegNum<[0]>; -def CR3GT : CRBIT<13, "13">, DwarfRegNum<[0]>; -def CR3EQ : CRBIT<14, "14">, DwarfRegNum<[0]>; -def CR3UN : CRBIT<15, "15">, DwarfRegNum<[0]>; -def CR4LT : CRBIT<16, "16">, DwarfRegNum<[0]>; -def CR4GT : CRBIT<17, "17">, DwarfRegNum<[0]>; -def CR4EQ : CRBIT<18, "18">, DwarfRegNum<[0]>; -def CR4UN : CRBIT<19, "19">, DwarfRegNum<[0]>; -def CR5LT : CRBIT<20, "20">, DwarfRegNum<[0]>; -def CR5GT : CRBIT<21, "21">, DwarfRegNum<[0]>; -def CR5EQ : CRBIT<22, "22">, DwarfRegNum<[0]>; -def CR5UN : CRBIT<23, "23">, 
DwarfRegNum<[0]>; -def CR6LT : CRBIT<24, "24">, DwarfRegNum<[0]>; -def CR6GT : CRBIT<25, "25">, DwarfRegNum<[0]>; -def CR6EQ : CRBIT<26, "26">, DwarfRegNum<[0]>; -def CR6UN : CRBIT<27, "27">, DwarfRegNum<[0]>; -def CR7LT : CRBIT<28, "28">, DwarfRegNum<[0]>; -def CR7GT : CRBIT<29, "29">, DwarfRegNum<[0]>; -def CR7EQ : CRBIT<30, "30">, DwarfRegNum<[0]>; -def CR7UN : CRBIT<31, "31">, DwarfRegNum<[0]>; +def CR0LT : CRBIT< 0, "0">; +def CR0GT : CRBIT< 1, "1">; +def CR0EQ : CRBIT< 2, "2">; +def CR0UN : CRBIT< 3, "3">; +def CR1LT : CRBIT< 4, "4">; +def CR1GT : CRBIT< 5, "5">; +def CR1EQ : CRBIT< 6, "6">; +def CR1UN : CRBIT< 7, "7">; +def CR2LT : CRBIT< 8, "8">; +def CR2GT : CRBIT< 9, "9">; +def CR2EQ : CRBIT<10, "10">; +def CR2UN : CRBIT<11, "11">; +def CR3LT : CRBIT<12, "12">; +def CR3GT : CRBIT<13, "13">; +def CR3EQ : CRBIT<14, "14">; +def CR3UN : CRBIT<15, "15">; +def CR4LT : CRBIT<16, "16">; +def CR4GT : CRBIT<17, "17">; +def CR4EQ : CRBIT<18, "18">; +def CR4UN : CRBIT<19, "19">; +def CR5LT : CRBIT<20, "20">; +def CR5GT : CRBIT<21, "21">; +def CR5EQ : CRBIT<22, "22">; +def CR5UN : CRBIT<23, "23">; +def CR6LT : CRBIT<24, "24">; +def CR6GT : CRBIT<25, "25">; +def CR6EQ : CRBIT<26, "26">; +def CR6UN : CRBIT<27, "27">; +def CR7LT : CRBIT<28, "28">; +def CR7GT : CRBIT<29, "29">; +def CR7EQ : CRBIT<30, "30">; +def CR7UN : CRBIT<31, "31">; // Condition registers let SubRegIndices = [sub_lt, sub_gt, sub_eq, sub_un] in { -def CR0 : CR<0, "cr0", [CR0LT, CR0GT, CR0EQ, CR0UN]>, DwarfRegNum<[68]>; -def CR1 : CR<1, "cr1", [CR1LT, CR1GT, CR1EQ, CR1UN]>, DwarfRegNum<[69]>; -def CR2 : CR<2, "cr2", [CR2LT, CR2GT, CR2EQ, CR2UN]>, DwarfRegNum<[70]>; -def CR3 : CR<3, "cr3", [CR3LT, CR3GT, CR3EQ, CR3UN]>, DwarfRegNum<[71]>; -def CR4 : CR<4, "cr4", [CR4LT, CR4GT, CR4EQ, CR4UN]>, DwarfRegNum<[72]>; -def CR5 : CR<5, "cr5", [CR5LT, CR5GT, CR5EQ, CR5UN]>, DwarfRegNum<[73]>; -def CR6 : CR<6, "cr6", [CR6LT, CR6GT, CR6EQ, CR6UN]>, DwarfRegNum<[74]>; -def CR7 : CR<7, "cr7", [CR7LT, CR7GT, CR7EQ, CR7UN]>, DwarfRegNum<[75]>; +def CR0 : CR<0, "cr0", [CR0LT, CR0GT, CR0EQ, CR0UN]>, DwarfRegNum<[68, 68]>; +def CR1 : CR<1, "cr1", [CR1LT, CR1GT, CR1EQ, CR1UN]>, DwarfRegNum<[69, 69]>; +def CR2 : CR<2, "cr2", [CR2LT, CR2GT, CR2EQ, CR2UN]>, DwarfRegNum<[70, 70]>; +def CR3 : CR<3, "cr3", [CR3LT, CR3GT, CR3EQ, CR3UN]>, DwarfRegNum<[71, 71]>; +def CR4 : CR<4, "cr4", [CR4LT, CR4GT, CR4EQ, CR4UN]>, DwarfRegNum<[72, 72]>; +def CR5 : CR<5, "cr5", [CR5LT, CR5GT, CR5EQ, CR5UN]>, DwarfRegNum<[73, 73]>; +def CR6 : CR<6, "cr6", [CR6LT, CR6GT, CR6EQ, CR6UN]>, DwarfRegNum<[74, 74]>; +def CR7 : CR<7, "cr7", [CR7LT, CR7GT, CR7EQ, CR7UN]>, DwarfRegNum<[75, 75]>; } // Link register -def LR : SPR<8, "lr">, DwarfRegNum<[65]>; +def LR : SPR<8, "lr">, DwarfRegNum<[-2, 65]>; //let Aliases = [LR] in -def LR8 : SPR<8, "lr">, DwarfRegNum<[65]>; +def LR8 : SPR<8, "lr">, DwarfRegNum<[65, -2]>; // Count register -def CTR : SPR<9, "ctr">, DwarfRegNum<[66]>; -def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66]>; +def CTR : SPR<9, "ctr">, DwarfRegNum<[-2, 66]>; +def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66, -2]>; // VRsave register -def VRSAVE: SPR<256, "VRsave">, DwarfRegNum<[107]>; +def VRSAVE: SPR<256, "VRsave">, DwarfRegNum<[109]>; // Carry bit. In the architecture this is really bit 0 of the XER register // (which really is SPR register 1); this is the only bit interesting to a // compiler. 
-def CARRY: SPR<1, "ca">, DwarfRegNum<[0]>;
+def CARRY: SPR<1, "ca">;

 // FP rounding mode:  bits 30 and 31 of the FP status and control register
 // This is not allocated as a normal register; it appears only in
@@ -271,76 +271,18 @@ def CARRY: SPR<1, "ca">, DwarfRegNum<[0]>;
 // return and call instructions are described as Uses of RM, so instructions
 // that do nothing but change RM will not get deleted.
 // Also, in the architecture it is not really a SPR; 512 is arbitrary.
-def RM: SPR<512, "**ROUNDING MODE**">, DwarfRegNum<[0]>;
+def RM: SPR<512, "**ROUNDING MODE**">;

 /// Register classes
 // Allocate volatiles first
 // then nonvolatiles in reverse order since stmw/lmw save from rN to r31
-def GPRC : RegisterClass<"PPC", [i32], 32,
-     [R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12,
-      R30, R29, R28, R27, R26, R25, R24, R23, R22, R21, R20, R19, R18, R17,
-      R16, R15, R14, R13, R31, R0, R1, LR]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GPRCClass::iterator
-    GPRCClass::allocation_order_begin(const MachineFunction &MF) const {
-      // 32-bit SVR4 ABI: r2 is reserved for the OS.
-      // 64-bit SVR4 ABI: r2 is reserved for the TOC pointer.
-      // Darwin: R2 is reserved for CR save/restore sequence.
-      return begin()+1;
-    }
-    GPRCClass::iterator
-    GPRCClass::allocation_order_end(const MachineFunction &MF) const {
-      // On PPC64, r13 is the thread pointer.  Never allocate this register.
-      // Note that this is overconservative, as it also prevents allocation of
-      // R31 when the FP is not needed.
-      // When using the 32-bit SVR4 ABI, r13 is reserved for the Small Data Area
-      // pointer.
-      const PPCSubtarget &Subtarget = MF.getTarget().getSubtarget<PPCSubtarget>();
-      const PPCFrameLowering *PPCFI =
-        static_cast<const PPCFrameLowering*>(MF.getTarget().getFrameLowering());
-
-      if (Subtarget.isPPC64() || Subtarget.isSVR4ABI())
-        return end()-5;  // don't allocate R13, R31, R0, R1, LR
-
-      if (PPCFI->needsFP(MF))
-        return end()-4;  // don't allocate R31, R0, R1, LR
-      else
-        return end()-3;  // don't allocate R0, R1, LR
-    }
-  }];
-}
-def G8RC : RegisterClass<"PPC", [i64], 64,
-     [X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12,
-      X30, X29, X28, X27, X26, X25, X24, X23, X22, X21, X20, X19, X18, X17,
-      X16, X15, X14, X31, X13, X0, X1, LR8]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    G8RCClass::iterator
-    G8RCClass::allocation_order_begin(const MachineFunction &MF) const {
-      // 64-bit SVR4 ABI: r2 is reserved for the TOC pointer.
-      // Darwin: r2 is reserved for CR save/restore sequence.
-      return begin()+1;
-    }
-    G8RCClass::iterator
-    G8RCClass::allocation_order_end(const MachineFunction &MF) const {
-      const PPCFrameLowering *PPCFI =
-        static_cast<const PPCFrameLowering*>(MF.getTarget().getFrameLowering());
-      if (PPCFI->needsFP(MF))
-        return end()-5;
-      else
-        return end()-4;
-    }
-  }];
-}
+def GPRC : RegisterClass<"PPC", [i32], 32, (add (sequence "R%u", 2, 12),
+                                                (sequence "R%u", 30, 13),
+                                                R31, R0, R1, LR)>;
+
+def G8RC : RegisterClass<"PPC", [i64], 64, (add (sequence "X%u", 2, 12),
+                                                (sequence "X%u", 30, 14),
+                                                X31, X13, X0, X1, LR8)>;

 // Allocate volatiles first, then non-volatiles in reverse order. With the SVR4
 // ABI the size of the Floating-point register save area is determined by the
@@ -349,41 +291,36 @@ def G8RC : RegisterClass<"PPC", [i64], 64,
 // previous stack frame. By allocating non-volatiles in reverse order we make
 // sure that the Floating-point register save area is always as small as
 // possible because there aren't any unused spill slots.
-def F8RC : RegisterClass<"PPC", [f64], 64, [F0, F1, F2, F3, F4, F5, F6, F7,
-  F8, F9, F10, F11, F12, F13, F31, F30, F29, F28, F27, F26, F25, F24, F23,
-  F22, F21, F20, F19, F18, F17, F16, F15, F14]>;
-def F4RC : RegisterClass<"PPC", [f32], 32, [F0, F1, F2, F3, F4, F5, F6, F7,
-  F8, F9, F10, F11, F12, F13, F31, F30, F29, F28, F27, F26, F25, F24, F23,
-  F22, F21, F20, F19, F18, F17, F16, F15, F14]>;
+def F8RC : RegisterClass<"PPC", [f64], 64, (add (sequence "F%u", 0, 13),
+                                                (sequence "F%u", 31, 14))>;
+def F4RC : RegisterClass<"PPC", [f32], 32, (add F8RC)>;

 def VRRC : RegisterClass<"PPC", [v16i8,v8i16,v4i32,v4f32], 128,
-  [V2, V3, V4, V5, V0, V1,
-   V6, V7, V8, V9, V10, V11, V12, V13, V14, V15, V16, V17, V18, V19, V31, V30,
-   V29, V28, V27, V26, V25, V24, V23, V22, V21, V20]>;
+                        (add V2, V3, V4, V5, V0, V1, V6, V7, V8, V9, V10, V11,
+                             V12, V13, V14, V15, V16, V17, V18, V19, V31, V30,
+                             V29, V28, V27, V26, V25, V24, V23, V22, V21, V20)>;

 def CRBITRC : RegisterClass<"PPC", [i32], 32,
-  [CR0LT, CR0GT, CR0EQ, CR0UN,
-   CR1LT, CR1GT, CR1EQ, CR1UN,
-   CR2LT, CR2GT, CR2EQ, CR2UN,
-   CR3LT, CR3GT, CR3EQ, CR3UN,
-   CR4LT, CR4GT, CR4EQ, CR4UN,
-   CR5LT, CR5GT, CR5EQ, CR5UN,
-   CR6LT, CR6GT, CR6EQ, CR6UN,
-   CR7LT, CR7GT, CR7EQ, CR7UN
-  ]>
+                        (add CR0LT, CR0GT, CR0EQ, CR0UN,
+                             CR1LT, CR1GT, CR1EQ, CR1UN,
+                             CR2LT, CR2GT, CR2EQ, CR2UN,
+                             CR3LT, CR3GT, CR3EQ, CR3UN,
+                             CR4LT, CR4GT, CR4EQ, CR4UN,
+                             CR5LT, CR5GT, CR5EQ, CR5UN,
+                             CR6LT, CR6GT, CR6EQ, CR6UN,
+                             CR7LT, CR7GT, CR7EQ, CR7UN)>
 {
   let CopyCost = -1;
 }

-def CRRC : RegisterClass<"PPC", [i32], 32, [CR0, CR1, CR5, CR6, CR7, CR2,
-  CR3, CR4]>
-{
+def CRRC : RegisterClass<"PPC", [i32], 32, (add CR0, CR1, CR5, CR6,
+                                                CR7, CR2, CR3, CR4)> {
   let SubRegClasses = [(CRBITRC sub_lt, sub_gt, sub_eq, sub_un)];
 }

-def CTRRC : RegisterClass<"PPC", [i32], 32, [CTR]>;
-def CTRRC8 : RegisterClass<"PPC", [i64], 64, [CTR8]>;
-def VRSAVERC : RegisterClass<"PPC", [i32], 32, [VRSAVE]>;
-def CARRYRC : RegisterClass<"PPC", [i32], 32, [CARRY]> {
+def CTRRC : RegisterClass<"PPC", [i32], 32, (add CTR)>;
+def CTRRC8 : RegisterClass<"PPC", [i64], 64, (add CTR8)>;
+def VRSAVERC : RegisterClass<"PPC", [i32], 32, (add VRSAVE)>;
+def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY)> {
   let CopyCost = -1;
 }
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index 72a1dee..5f3aa23 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -70,7 +70,7 @@ PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &FS,
   , HasSTFIWX(false)
   , HasLazyResolverStubs(false)
   , IsJITCodeModel(false)
-  , DarwinVers(0) {
+  , TargetTriple(TT) {

   // Determine default and user specified characteristics
   std::string CPU = "generic";
@@ -92,19 +92,6 @@ PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &FS,
   // support it, ignore.
   if (use64BitRegs() && !has64BitSupport())
     Use64BitRegs = false;
-
-  // Set the boolean corresponding to the current target triple, or the default
-  // if one cannot be determined, to true.
-  if (TT.length() > 7) {
-    // Determine which version of darwin this is.
-    size_t DarwinPos = TT.find("-darwin");
-    if (DarwinPos != std::string::npos) {
-      if (isdigit(TT[DarwinPos+7]))
-        DarwinVers = atoi(&TT[DarwinPos+7]);
-      else
-        DarwinVers = 8;  // Minimum supported darwin is Tiger.
-    }
-  }

   // Set up darwin-specific properties.
   if (isDarwin())
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index 00ec747..8fd1a44 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -14,6 +14,7 @@
 #ifndef POWERPCSUBTARGET_H
 #define POWERPCSUBTARGET_H

+#include "llvm/ADT/Triple.h"
 #include "llvm/Target/TargetInstrItineraries.h"
 #include "llvm/Target/TargetSubtarget.h"

@@ -65,9 +66,9 @@ protected:
   bool HasLazyResolverStubs;
   bool IsJITCodeModel;

-  /// DarwinVers - Nonzero if this is a darwin platform.  Otherwise, the numeric
-  /// version of the platform, e.g. 8 = 10.4 (Tiger), 9 = 10.5 (Leopard), etc.
-  unsigned char DarwinVers; // Is any darwin-ppc platform.
+  /// TargetTriple - What processor and OS we're targeting.
+  Triple TargetTriple;
+
 public:
   /// This constructor initializes the data members to match that
   /// of the specified triple.
@@ -134,13 +135,10 @@ public:
   bool hasAltivec() const { return HasAltivec; }
   bool isGigaProcessor() const { return IsGigaProcessor; }

-  /// isDarwin - True if this is any darwin platform.
-  bool isDarwin() const { return DarwinVers != 0; }
-  /// isDarwin - True if this is darwin9 (leopard, 10.5) or above.
-  bool isDarwin9() const { return DarwinVers >= 9; }
+  const Triple &getTargetTriple() const { return TargetTriple; }

-  /// getDarwinVers - Return the darwin version number, 8 = tiger, 9 = leopard.
-  unsigned getDarwinVers() const { return DarwinVers; }
+  /// isDarwin - True if this is any darwin platform.
+  bool isDarwin() const { return TargetTriple.isMacOSX(); }

   bool isDarwinABI() const { return isDarwin(); }
   bool isSVR4ABI() const { return !isDarwin(); }
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index 212b450..d27e54e 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -24,7 +24,7 @@ using namespace llvm;
 static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
   Triple TheTriple(TT);
   bool isPPC64 = TheTriple.getArch() == Triple::ppc64;
-  if (TheTriple.getOS() == Triple::Darwin)
+  if (TheTriple.isOSDarwin())
     return new PPCMCAsmInfoDarwin(isPPC64);
   return new PPCLinuxMCAsmInfo(isPPC64);

@@ -37,12 +37,10 @@ static MCStreamer *createMCStreamer(const Target &T, const std::string &TT,
                                     MCCodeEmitter *Emitter,
                                     bool RelaxAll,
                                     bool NoExecStack) {
-  switch (Triple(TT).getOS()) {
-  case Triple::Darwin:
+  if (Triple(TT).isOSDarwin())
     return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll);
-  default:
-    return NULL;
-  }
+
+  return NULL;
 }

 extern "C" void LLVMInitializePowerPCTarget() {
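The PPCSubtarget and PPCTargetMachine hunks above replace the hand-rolled "-darwin<N>" string scan with queries on llvm::Triple. A minimal sketch of the new-style query, using only Triple APIs that appear in this patch; the driver program itself is illustrative, not part of the patch:

#include "llvm/ADT/Triple.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  // Keep the whole triple around and ask it questions, instead of parsing
  // version digits out of the string by hand as the removed code did.
  Triple T("powerpc-apple-darwin9");
  outs() << (T.isMacOSX() ? "darwin" : "not darwin") << '\n';
  outs() << (T.isMacOSXVersionLT(10, 5) ? "pre-10.5" : "10.5 or later") << '\n';
  return 0;
}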
diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index 99616b4..4e382e8 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -870,11 +870,6 @@ rshift_gt (unsigned int a)
     bar ();
 }

-void neg_eq_cst(unsigned int a) {
-if (-a == 123)
-bar();
-}
-
 All should simplify to a single comparison.  All of these are currently
 not optimized with "clang -emit-llvm-bc | opt -std-compile-opts".

@@ -2258,3 +2253,103 @@ SimplifyDemandedBits shrinks the "and" constant to 2
 but instcombine misses the icmp transform.

 //===---------------------------------------------------------------------===//
+
+This code:
+
+typedef struct {
+int f1:1;
+int f2:1;
+int f3:1;
+int f4:29;
+} t1;
+
+typedef struct {
+int f1:1;
+int f2:1;
+int f3:30;
+} t2;
+
+t1 s1;
+t2 s2;
+
+void func1(void)
+{
+s1.f1 = s2.f1;
+s1.f2 = s2.f2;
+}
+
+Compiles into this IR (on x86-64 at least):
+
+%struct.t1 = type { i8, [3 x i8] }
+@s2 = global %struct.t1 zeroinitializer, align 4
+@s1 = global %struct.t1 zeroinitializer, align 4
+define void @func1() nounwind ssp noredzone {
+entry:
+  %0 = load i32* bitcast (%struct.t1* @s2 to i32*), align 4
+  %bf.val.sext5 = and i32 %0, 1
+  %1 = load i32* bitcast (%struct.t1* @s1 to i32*), align 4
+  %2 = and i32 %1, -4
+  %3 = or i32 %2, %bf.val.sext5
+  %bf.val.sext26 = and i32 %0, 2
+  %4 = or i32 %3, %bf.val.sext26
+  store i32 %4, i32* bitcast (%struct.t1* @s1 to i32*), align 4
+  ret void
+}
+
+The two or/and's should be merged into one each.
+
+//===---------------------------------------------------------------------===//
+
+Machine level code hoisting can be useful in some cases.  For example, PR9408
+is about:
+
+typedef union {
+ void (*f1)(int);
+ void (*f2)(long);
+} funcs;
+
+void foo(funcs f, int which) {
+ int a = 5;
+ if (which) {
+   f.f1(a);
+ } else {
+   f.f2(a);
+ }
+}
+
+which we compile to:
+
+foo:                                    # @foo
+# BB#0:                                 # %entry
+       pushq   %rbp
+       movq    %rsp, %rbp
+       testl   %esi, %esi
+       movq    %rdi, %rax
+       je      .LBB0_2
+# BB#1:                                 # %if.then
+       movl    $5, %edi
+       callq   *%rax
+       popq    %rbp
+       ret
+.LBB0_2:                                # %if.else
+       movl    $5, %edi
+       callq   *%rax
+       popq    %rbp
+       ret
+
+Note that bb1 and bb2 are the same.  This doesn't happen at the IR level
+because one call is passing an i32 and the other is passing an i64.
+
+//===---------------------------------------------------------------------===//
+
+I see this sort of pattern in 176.gcc in a few places (e.g. the start of
+store_bit_field).  The rem should be replaced with a multiply and subtract:
+
+  %3 = sdiv i32 %A, %B
+  %4 = srem i32 %A, %B
+
+Similarly for udiv/urem.  Note that this shouldn't be done on X86 or ARM,
+which can do this in a single operation (instruction or libcall).  It is
+probably best to do this in the code generator.
+
+//===---------------------------------------------------------------------===//
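A sketch of the div/rem rewrite suggested in the last README entry above, in plain C++. The real transform would run on SelectionDAG nodes in the code generator; this just demonstrates the identity, and assumes B is nonzero:

#include <cassert>

// Compute quotient and remainder with a single division:
// srem == A - (A/B)*B, so the second division becomes a multiply and subtract.
int div_and_rem(int A, int B, int &Rem) {
  assert(B != 0 && "division by zero");
  int Quot = A / B;    // the sdiv
  Rem = A - Quot * B;  // the srem, now a mul and a sub
  return Quot;
}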
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 70574c3..0b4612d 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -91,8 +91,8 @@ SparcTargetLowering::LowerReturn(SDValue Chain,
   SmallVector<CCValAssign, 16> RVLocs;

   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, isVarArg, DAG.getTarget(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 DAG.getTarget(), RVLocs, *DAG.getContext());

   // Analize return values.
   CCInfo.AnalyzeReturn(Outs, RetCC_Sparc32);
@@ -139,7 +139,7 @@ SparcTargetLowering::LowerReturn(SDValue Chain,
   if (Flag.getNode())
     return DAG.getNode(SPISD::RET_FLAG, dl, MVT::Other, Chain,
                        RetAddrOffsetNode, Flag);
-  return DAG.getNode(SPISD::RET_FLAG, dl, MVT::Other, Chain, 
+  return DAG.getNode(SPISD::RET_FLAG, dl, MVT::Other, Chain,
                      RetAddrOffsetNode);
 }

@@ -161,8 +161,8 @@ SparcTargetLowering::LowerFormalArguments(SDValue Chain,

   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
   CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc32);

   const unsigned StackOffset = 92;
@@ -182,8 +182,6 @@ SparcTargetLowering::LowerFormalArguments(SDValue Chain,
     }

     if (VA.isRegLoc()) {
-      EVT RegVT = VA.getLocVT();
-
       if (VA.needsCustom()) {
         assert(VA.getLocVT() == MVT::f64);
         unsigned VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
@@ -362,8 +360,8 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee,

   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getTarget(), ArgLocs,
-                 *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 DAG.getTarget(), ArgLocs, *DAG.getContext());
   CCInfo.AnalyzeCallOperands(Outs, CC_Sparc32);

   // Get the size of the outgoing arguments stack space requirement.
@@ -544,7 +542,7 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee,

   // Build a sequence of copy-to-reg nodes chained together with token
   // chain and flag operands which copy the outgoing args into registers.
-  // The InFlag in necessary since all emited instructions must be
+  // The InFlag is necessary since all emitted instructions must be
   // stuck together.
   SDValue InFlag;
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
@@ -593,8 +591,8 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee,

   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState RVInfo(CallConv, isVarArg, DAG.getTarget(),
-                 RVLocs, *DAG.getContext());
+  CCState RVInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 DAG.getTarget(), RVLocs, *DAG.getContext());

   RVInfo.AnalyzeCallResult(Ins, RetCC_Sparc32);

@@ -801,6 +799,8 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
   if (TM.getSubtarget<SparcSubtarget>().isV9())
     setOperationAction(ISD::CTPOP, MVT::i32, Legal);

+  setMinFunctionAlignment(2);
+
   computeRegisterProperties();
 }

@@ -1290,8 +1290,3 @@ SparcTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   // The Sparc target isn't yet aware of offsets.
   return false;
 }
-
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned SparcTargetLowering::getFunctionAlignment(const Function *) const {
-  return 2;
-}
diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
index 7d02df8..9ea6e16 100644
--- a/lib/Target/Sparc/SparcISelLowering.h
+++ b/lib/Target/Sparc/SparcISelLowering.h
@@ -71,9 +71,6 @@ namespace llvm {

     virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;

-    /// getFunctionAlignment - Return the Log2 alignment of this function.
-    virtual unsigned getFunctionAlignment(const Function *F) const;
-
     virtual SDValue
       LowerFormalArguments(SDValue Chain,
                            CallingConv::ID CallConv,
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
index b010d04..9fcf028 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -39,6 +39,8 @@ const unsigned* SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
 BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
+  // FIXME: G1 reserved for now for large imm generation by frame code.
+  Reserved.set(SP::G1);
   Reserved.set(SP::G2);
   Reserved.set(SP::G3);
   Reserved.set(SP::G4);
@@ -130,5 +132,9 @@ int SparcRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
   return SparcGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
 }

+int SparcRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
+  return SparcGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0);
+}
+
 #include "SparcGenRegisterInfo.inc"
diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h
index d930b53..56c8068 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.h
+++ b/lib/Target/Sparc/SparcRegisterInfo.h
@@ -52,6 +52,7 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo {
   unsigned getEHHandlerRegister() const;

   int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };

 } // end namespace llvm
diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td
index 5ef4dae..cf92829 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.td
+++ b/lib/Target/Sparc/SparcRegisterInfo.td
@@ -117,59 +117,43 @@ def F30 : Rf<30, "F30">, DwarfRegNum<[62]>;
 def F31 : Rf<31, "F31">, DwarfRegNum<[63]>;

 // Aliases of the F* registers used to hold 64-bit fp values (doubles)
-def D0  : Rd< 0, "F0",  [F0,  F1]>,  DwarfRegNum<[32]>;
-def D1  : Rd< 2, "F2",  [F2,  F3]>,  DwarfRegNum<[34]>;
-def D2  : Rd< 4, "F4",  [F4,  F5]>,  DwarfRegNum<[36]>;
-def D3  : Rd< 6, "F6",  [F6,  F7]>,  DwarfRegNum<[38]>;
-def D4  : Rd< 8, "F8",  [F8,  F9]>,  DwarfRegNum<[40]>;
-def D5  : Rd<10, "F10", [F10, F11]>, DwarfRegNum<[42]>;
-def D6  : Rd<12, "F12", [F12, F13]>, DwarfRegNum<[44]>;
-def D7  : Rd<14, "F14", [F14, F15]>, DwarfRegNum<[46]>;
-def D8  : Rd<16, "F16", [F16, F17]>, DwarfRegNum<[48]>;
-def D9  : Rd<18, "F18", [F18, F19]>, DwarfRegNum<[50]>;
-def D10 : Rd<20, "F20", [F20, F21]>, DwarfRegNum<[52]>;
-def D11 : Rd<22, "F22", [F22, F23]>, DwarfRegNum<[54]>;
-def D12 : Rd<24, "F24", [F24, F25]>, DwarfRegNum<[56]>;
-def D13 : Rd<26, "F26", [F26, F27]>, DwarfRegNum<[58]>;
-def D14 : Rd<28, "F28", [F28, F29]>, DwarfRegNum<[60]>;
-def D15 : Rd<30, "F30", [F30, F31]>, DwarfRegNum<[62]>;
+def D0  : Rd< 0, "F0",  [F0,  F1]>,  DwarfRegNum<[72]>;
+def D1  : Rd< 2, "F2",  [F2,  F3]>,  DwarfRegNum<[73]>;
+def D2  : Rd< 4, "F4",  [F4,  F5]>,  DwarfRegNum<[74]>;
+def D3  : Rd< 6, "F6",  [F6,  F7]>,  DwarfRegNum<[75]>;
+def D4  : Rd< 8, "F8",  [F8,  F9]>,  DwarfRegNum<[76]>;
+def D5  : Rd<10, "F10", [F10, F11]>, DwarfRegNum<[77]>;
+def D6  : Rd<12, "F12", [F12, F13]>, DwarfRegNum<[78]>;
+def D7  : Rd<14, "F14", [F14, F15]>, DwarfRegNum<[79]>;
+def D8  : Rd<16, "F16", [F16, F17]>, DwarfRegNum<[80]>;
+def D9  : Rd<18, "F18", [F18, F19]>, DwarfRegNum<[81]>;
+def D10 : Rd<20, "F20", [F20, F21]>, DwarfRegNum<[82]>;
+def D11 : Rd<22, "F22", [F22, F23]>, DwarfRegNum<[83]>;
+def D12 : Rd<24, "F24", [F24, F25]>, DwarfRegNum<[84]>;
+def D13 : Rd<26, "F26", [F26, F27]>, DwarfRegNum<[85]>;
+def D14 : Rd<28, "F28", [F28, F29]>, DwarfRegNum<[86]>;
+def D15 : Rd<30, "F30", [F30, F31]>, DwarfRegNum<[87]>;

 // Register classes.
 //
 // FIXME: the register order should be defined in terms of the preferred
 // allocation order...
 //
-def IntRegs : RegisterClass<"SP", [i32], 32, [L0, L1, L2, L3, L4, L5, L6, L7,
-                                     I0, I1, I2, I3, I4, I5,
-                                     O0, O1, O2, O3, O4, O5, O7,
+def IntRegs : RegisterClass<"SP", [i32], 32,
+    (add L0, L1, L2, L3, L4, L5, L6,
+         L7, I0, I1, I2, I3, I4, I5,
+         O0, O1, O2, O3, O4, O5, O7,
+         G1,
+         // Non-allocatable regs:
+         G2, G3, G4, // FIXME: OK for use only in
+                     // applications, not libraries.
+         O6, // stack ptr
+         I6, // frame ptr
+         I7, // return address
+         G0, // constant zero
+         G5, G6, G7 // reserved for kernel
+    )>;

-                                     // FIXME: G1 reserved for now for large imm generation by frame code.
-                                     G1,
-                                     // Non-allocatable regs:
-                                     G2, G3, G4, // FIXME: OK for use only in
-                                                 // applications, not libraries.
-                                     O6, // stack ptr
-                                     I6, // frame ptr
-                                     I7, // return address
-                                     G0, // constant zero
-                                     G5, G6, G7 // reserved for kernel
-                                     ]> {
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    IntRegsClass::iterator
-    IntRegsClass::allocation_order_end(const MachineFunction &MF) const {
-      // FIXME: These special regs should be taken out of the regclass!
-      return end()-10  // Don't allocate special registers
-         -1;  // FIXME: G1 reserved for large imm generation by frame code.
-    }
-  }];
-}
-
-def FPRegs : RegisterClass<"SP", [f32], 32, [F0, F1, F2, F3, F4, F5, F6, F7, F8,
-  F9, F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, F20, F21, F22,
-  F23, F24, F25, F26, F27, F28, F29, F30, F31]>;
+def FPRegs : RegisterClass<"SP", [f32], 32, (sequence "F%u", 0, 31)>;

-def DFPRegs : RegisterClass<"SP", [f64], 64, [D0, D1, D2, D3, D4, D5, D6, D7,
-  D8, D9, D10, D11, D12, D13, D14, D15]>;
+def DFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 15)>;
diff --git a/lib/Target/SubtargetFeature.cpp b/lib/Target/SubtargetFeature.cpp
index 3cf95b5..e0a9de8 100644
--- a/lib/Target/SubtargetFeature.cpp
+++ b/lib/Target/SubtargetFeature.cpp
@@ -211,7 +211,7 @@ const std::string & SubtargetFeatures::getCPU() const {
 /// feature, set it.
 ///
 static
-void SetImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry,
+void SetImpliedBits(uint64_t &Bits, const SubtargetFeatureKV *FeatureEntry,
                     const SubtargetFeatureKV *FeatureTable,
                     size_t FeatureTableSize) {
   for (size_t i = 0; i < FeatureTableSize; ++i) {
@@ -230,7 +230,7 @@ void SetImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry,
 /// feature, clear it.
 ///
 static
-void ClearImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry,
+void ClearImpliedBits(uint64_t &Bits, const SubtargetFeatureKV *FeatureEntry,
                       const SubtargetFeatureKV *FeatureTable,
                       size_t FeatureTableSize) {
   for (size_t i = 0; i < FeatureTableSize; ++i) {
@@ -247,7 +247,7 @@ void ClearImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry,

 /// getBits - Get feature bits.
 ///
-uint32_t SubtargetFeatures::getBits(const SubtargetFeatureKV *CPUTable,
+uint64_t SubtargetFeatures::getBits(const SubtargetFeatureKV *CPUTable,
                                     size_t CPUTableSize,
                                     const SubtargetFeatureKV *FeatureTable,
                                     size_t FeatureTableSize) {
@@ -263,7 +263,7 @@ uint32_t SubtargetFeatures::getBits(const SubtargetFeatureKV *CPUTable,
            "CPU features table is not sorted");
   }
 #endif
-  uint32_t Bits = 0;                    // Resulting bits
+  uint64_t Bits = 0;                    // Resulting bits

   // Check if help is needed
   if (Features[0] == "help")
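The SubtargetFeature widening above is mechanical: feature masks hold one bit per feature, so a uint32_t caps a target at 32 features. A small self-contained sketch of the bit arithmetic, with hypothetical feature bits rather than the real tables:

#include <cstdint>
#include <cstdio>

enum : uint64_t {
  FeatureMMX   = 1ULL << 0,
  Feature3DNow = 1ULL << 40,  // bit 40 would not fit in a uint32_t mask
};

// Mirror of the SetImpliedBits idea: enabling a feature also enables the
// features it implies (3dnow implies mmx per the X86.td hunk further below).
static uint64_t setFeature(uint64_t Bits, uint64_t F) {
  Bits |= F;
  if (F == Feature3DNow)
    Bits |= FeatureMMX;
  return Bits;
}

int main() {
  std::printf("%#llx\n", (unsigned long long)setFeature(0, Feature3DNow));
  return 0;
}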
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 90939c3..af85df5 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -153,6 +153,8 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm) :
   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);

   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  setMinFunctionAlignment(1);
 }

 SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
@@ -289,8 +291,8 @@ SystemZTargetLowering::LowerCCCArguments(SDValue Chain,

   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
   CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ);

   if (isVarArg)
@@ -382,8 +384,8 @@ SystemZTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,

   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());

   CCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);

@@ -451,7 +453,7 @@ SystemZTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,

   // Build a sequence of copy-to-reg nodes chained together with token chain and
   // flag operands which copy the outgoing args into registers.  The InFlag in
-  // necessary since all emited instructions must be stuck together.
+  // necessary since all emitted instructions must be stuck together.
   SDValue InFlag;
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
@@ -511,8 +513,8 @@ SystemZTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,

   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
-                 *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());

   CCInfo.AnalyzeCallResult(Ins, RetCC_SystemZ);

@@ -556,8 +558,8 @@ SystemZTargetLowering::LowerReturn(SDValue Chain,
   SmallVector<CCValAssign, 16> RVLocs;

   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());

   // Analize return values.
   CCInfo.AnalyzeReturn(Outs, RetCC_SystemZ);
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 3019242..bab3dc2 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -66,11 +66,6 @@ namespace llvm {
     /// DAG node.
     virtual const char *getTargetNodeName(unsigned Opcode) const;

-    /// getFunctionAlignment - Return the Log2 alignment of this function.
-    virtual unsigned getFunctionAlignment(const Function *F) const {
-      return 1;
-    }
-
     std::pair<unsigned, const TargetRegisterClass*>
     getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const;
     TargetLowering::ConstraintType
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index 28f94f4..d5c165f 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -51,13 +51,37 @@ BitVector SystemZRegisterInfo::getReservedRegs(const MachineFunction &MF) const
   BitVector Reserved(getNumRegs());
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();

-  if (TFI->hasFP(MF))
+  if (TFI->hasFP(MF)) {
+    // R11D is the frame pointer. Reserve all aliases.
     Reserved.set(SystemZ::R11D);
+    Reserved.set(SystemZ::R11W);
+    Reserved.set(SystemZ::R10P);
+    Reserved.set(SystemZ::R10Q);
+  }
+
   Reserved.set(SystemZ::R14D);
   Reserved.set(SystemZ::R15D);
+  Reserved.set(SystemZ::R14W);
+  Reserved.set(SystemZ::R15W);
+  Reserved.set(SystemZ::R14P);
+  Reserved.set(SystemZ::R14Q);
   return Reserved;
 }

+const TargetRegisterClass*
+SystemZRegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
+                                              const TargetRegisterClass *B,
+                                              unsigned Idx) const {
+  switch(Idx) {
+  // Exact sub-classes don't exist for the other sub-register indexes.
+  default: return 0;
+  case SystemZ::subreg_32bit:
+    if (B == SystemZ::ADDR32RegisterClass)
+      return A->getSize() == 8 ? SystemZ::ADDR64RegisterClass : 0;
+    return A;
+  }
+}
+
 void SystemZRegisterInfo::
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
@@ -125,4 +149,10 @@ int SystemZRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
   return -1;
 }

+int SystemZRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
+  assert(0 && "What is the dwarf register number");
+  return -1;
+}
+
+
 #include "SystemZGenRegisterInfo.inc"
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h
index b450798..cd8f20f 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -34,6 +34,10 @@ struct SystemZRegisterInfo : public SystemZGenRegisterInfo {

   BitVector getReservedRegs(const MachineFunction &MF) const;

+  const TargetRegisterClass*
+  getMatchingSuperRegClass(const TargetRegisterClass *A,
+                           const TargetRegisterClass *B, unsigned Idx) const;
+
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
                                      MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator I) const;
@@ -50,6 +54,7 @@ struct SystemZRegisterInfo : public SystemZGenRegisterInfo {
   unsigned getEHHandlerRegister() const;

   int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };

 } // end namespace llvm
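The getReservedRegs hunk above reserves not just the frame pointer R11D but everything overlapping it: if an alias such as R11W or the R10P/R10Q pairs stayed allocatable, the allocator could still clobber the frame pointer through them. A minimal sketch of the pattern, with hypothetical register numbers (the real values come from tblgen):

#include "llvm/ADT/BitVector.h"
using namespace llvm;

enum { R11D = 11, R11W = 27, R10P = 43, R10Q = 59, NumRegs = 64 };

BitVector getReservedRegsSketch(bool HasFP) {
  BitVector Reserved(NumRegs);
  if (HasFP) {
    Reserved.set(R11D);  // the frame pointer itself
    Reserved.set(R11W);  // ...and every register that aliases it
    Reserved.set(R10P);
    Reserved.set(R10Q);
  }
  return Reserved;
}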
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td
index 0028c85..a24cbcf 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -61,22 +61,22 @@ def subreg_odd : SubRegIndex;
 }

 // General-purpose registers
-def R0W  : GPR32< 0,  "r0">, DwarfRegNum<[0]>;
-def R1W  : GPR32< 1,  "r1">, DwarfRegNum<[1]>;
-def R2W  : GPR32< 2,  "r2">, DwarfRegNum<[2]>;
-def R3W  : GPR32< 3,  "r3">, DwarfRegNum<[3]>;
-def R4W  : GPR32< 4,  "r4">, DwarfRegNum<[4]>;
-def R5W  : GPR32< 5,  "r5">, DwarfRegNum<[5]>;
-def R6W  : GPR32< 6,  "r6">, DwarfRegNum<[6]>;
-def R7W  : GPR32< 7,  "r7">, DwarfRegNum<[7]>;
-def R8W  : GPR32< 8,  "r8">, DwarfRegNum<[8]>;
-def R9W  : GPR32< 9,  "r9">, DwarfRegNum<[9]>;
-def R10W : GPR32<10, "r10">, DwarfRegNum<[10]>;
-def R11W : GPR32<11, "r11">, DwarfRegNum<[11]>;
-def R12W : GPR32<12, "r12">, DwarfRegNum<[12]>;
-def R13W : GPR32<13, "r13">, DwarfRegNum<[13]>;
-def R14W : GPR32<14, "r14">, DwarfRegNum<[14]>;
-def R15W : GPR32<15, "r15">, DwarfRegNum<[15]>;
+def R0W  : GPR32< 0,  "r0">;
+def R1W  : GPR32< 1,  "r1">;
+def R2W  : GPR32< 2,  "r2">;
+def R3W  : GPR32< 3,  "r3">;
+def R4W  : GPR32< 4,  "r4">;
+def R5W  : GPR32< 5,  "r5">;
+def R6W  : GPR32< 6,  "r6">;
+def R7W  : GPR32< 7,  "r7">;
+def R8W  : GPR32< 8,  "r8">;
+def R9W  : GPR32< 9,  "r9">;
+def R10W : GPR32<10, "r10">;
+def R11W : GPR32<11, "r11">;
+def R12W : GPR32<12, "r12">;
+def R13W : GPR32<13, "r13">;
+def R14W : GPR32<14, "r14">;
+def R15W : GPR32<15, "r15">;

 let SubRegIndices = [subreg_32bit] in {
 def R0D  : GPR64< 0,  "r0", [R0W]>, DwarfRegNum<[0]>;
@@ -99,26 +99,26 @@ def R15D : GPR64<15, "r15", [R15W]>, DwarfRegNum<[15]>;

 // Register pairs
 let SubRegIndices = [subreg_32bit, subreg_odd32] in {
-def R0P  : GPR64< 0,  "r0", [R0W,  R1W],  [R0D,  R1D]>,  DwarfRegNum<[0]>;
-def R2P  : GPR64< 2,  "r2", [R2W,  R3W],  [R2D,  R3D]>,  DwarfRegNum<[2]>;
-def R4P  : GPR64< 4,  "r4", [R4W,  R5W],  [R4D,  R5D]>,  DwarfRegNum<[4]>;
-def R6P  : GPR64< 6,  "r6", [R6W,  R7W],  [R6D,  R7D]>,  DwarfRegNum<[6]>;
-def R8P  : GPR64< 8,  "r8", [R8W,  R9W],  [R8D,  R9D]>,  DwarfRegNum<[8]>;
-def R10P : GPR64<10, "r10", [R10W, R11W], [R10D, R11D]>, DwarfRegNum<[10]>;
-def R12P : GPR64<12, "r12", [R12W, R13W], [R12D, R13D]>, DwarfRegNum<[12]>;
-def R14P : GPR64<14, "r14", [R14W, R15W], [R14D, R15D]>, DwarfRegNum<[14]>;
+def R0P  : GPR64< 0,  "r0", [R0W,  R1W],  [R0D,  R1D]>;
+def R2P  : GPR64< 2,  "r2", [R2W,  R3W],  [R2D,  R3D]>;
+def R4P  : GPR64< 4,  "r4", [R4W,  R5W],  [R4D,  R5D]>;
+def R6P  : GPR64< 6,  "r6", [R6W,  R7W],  [R6D,  R7D]>;
+def R8P  : GPR64< 8,  "r8", [R8W,  R9W],  [R8D,  R9D]>;
+def R10P : GPR64<10, "r10", [R10W, R11W], [R10D, R11D]>;
+def R12P : GPR64<12, "r12", [R12W, R13W], [R12D, R13D]>;
+def R14P : GPR64<14, "r14", [R14W, R15W], [R14D, R15D]>;
 }

 let SubRegIndices = [subreg_even, subreg_odd],
     CompositeIndices = [(subreg_odd32 subreg_odd, subreg_32bit)] in {
-def R0Q  : GPR128< 0,  "r0", [R0D,  R1D],  [R0P]>,  DwarfRegNum<[0]>;
-def R2Q  : GPR128< 2,  "r2", [R2D,  R3D],  [R2P]>,  DwarfRegNum<[2]>;
-def R4Q  : GPR128< 4,  "r4", [R4D,  R5D],  [R4P]>,  DwarfRegNum<[4]>;
-def R6Q  : GPR128< 6,  "r6", [R6D,  R7D],  [R6P]>,  DwarfRegNum<[6]>;
-def R8Q  : GPR128< 8,  "r8", [R8D,  R9D],  [R8P]>,  DwarfRegNum<[8]>;
-def R10Q : GPR128<10, "r10", [R10D, R11D], [R10P]>, DwarfRegNum<[10]>;
-def R12Q : GPR128<12, "r12", [R12D, R13D], [R12P]>, DwarfRegNum<[12]>;
-def R14Q : GPR128<14, "r14", [R14D, R15D], [R14P]>, DwarfRegNum<[14]>;
+def R0Q  : GPR128< 0,  "r0", [R0D,  R1D],  [R0P]>;
+def R2Q  : GPR128< 2,  "r2", [R2D,  R3D],  [R2P]>;
+def R4Q  : GPR128< 4,  "r4", [R4D,  R5D],  [R4P]>;
+def R6Q  : GPR128< 6,  "r6", [R6D,  R7D],  [R6P]>;
+def R8Q  : GPR128< 8,  "r8", [R8D,  R9D],  [R8P]>;
+def R10Q : GPR128<10, "r10", [R10D, R11D], [R10P]>;
+def R12Q : GPR128<12, "r12", [R12D, R13D], [R12P]>;
+def R14Q : GPR128<14, "r14", [R14D, R15D], [R14P]>;
 }

 // Floating-point registers
@@ -140,339 +140,66 @@ def F14S : FPRS<14, "f14">, DwarfRegNum<[30]>;
 def F15S : FPRS<15, "f15">, DwarfRegNum<[31]>;

 let SubRegIndices = [subreg_32bit] in {
-def F0L  : FPRL< 0,  "f0", [F0S]>, DwarfRegNum<[16]>;
-def F1L  : FPRL< 1,  "f1", [F1S]>, DwarfRegNum<[17]>;
-def F2L  : FPRL< 2,  "f2", [F2S]>, DwarfRegNum<[18]>;
-def F3L  : FPRL< 3,  "f3", [F3S]>, DwarfRegNum<[19]>;
-def F4L  : FPRL< 4,  "f4", [F4S]>, DwarfRegNum<[20]>;
-def F5L  : FPRL< 5,  "f5", [F5S]>, DwarfRegNum<[21]>;
-def F6L  : FPRL< 6,  "f6", [F6S]>, DwarfRegNum<[22]>;
-def F7L  : FPRL< 7,  "f7", [F7S]>, DwarfRegNum<[23]>;
-def F8L  : FPRL< 8,  "f8", [F8S]>, DwarfRegNum<[24]>;
-def F9L  : FPRL< 9,  "f9", [F9S]>, DwarfRegNum<[25]>;
-def F10L : FPRL<10, "f10", [F10S]>, DwarfRegNum<[26]>;
-def F11L : FPRL<11, "f11", [F11S]>, DwarfRegNum<[27]>;
-def F12L : FPRL<12, "f12", [F12S]>, DwarfRegNum<[28]>;
-def F13L : FPRL<13, "f13", [F13S]>, DwarfRegNum<[29]>;
-def F14L : FPRL<14, "f14", [F14S]>, DwarfRegNum<[30]>;
-def F15L : FPRL<15, "f15", [F15S]>, DwarfRegNum<[31]>;
+def F0L  : FPRL< 0,  "f0", [F0S]>;
+def F1L  : FPRL< 1,  "f1", [F1S]>;
+def F2L  : FPRL< 2,  "f2", [F2S]>;
+def F3L  : FPRL< 3,  "f3", [F3S]>;
+def F4L  : FPRL< 4,  "f4", [F4S]>;
+def F5L  : FPRL< 5,  "f5", [F5S]>;
+def F6L  : FPRL< 6,  "f6", [F6S]>;
+def F7L  : FPRL< 7,  "f7", [F7S]>;
+def F8L  : FPRL< 8,  "f8", [F8S]>;
+def F9L  : FPRL< 9,  "f9", [F9S]>;
+def F10L : FPRL<10, "f10", [F10S]>;
+def F11L : FPRL<11, "f11", [F11S]>;
+def F12L : FPRL<12, "f12", [F12S]>;
+def F13L : FPRL<13, "f13", [F13S]>;
+def F14L : FPRL<14, "f14", [F14S]>;
+def F15L : FPRL<15, "f15", [F15S]>;
 }

 // Status register
 def PSW : SystemZReg<"psw">;

-/// Register classes
-def GR32 : RegisterClass<"SystemZ", [i32], 32,
-   // Volatile registers
-  [R0W, R1W, R2W, R3W, R4W, R5W, R6W, R7W, R8W, R9W, R10W, R12W, R13W,
-   // Frame pointer, sometimes allocable
-   R11W,
-   // Volatile, but not allocable
-   R14W, R15W]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned SystemZ_REG32[] = {
-      SystemZ::R1W,  SystemZ::R2W,  SystemZ::R3W,  SystemZ::R4W,
-      SystemZ::R5W,  SystemZ::R0W,  SystemZ::R12W, SystemZ::R11W,
-      SystemZ::R10W, SystemZ::R9W,  SystemZ::R8W,  SystemZ::R7W,
-      SystemZ::R6W,  SystemZ::R14W, SystemZ::R13W
-    };
-    static const unsigned SystemZ_REG32_nofp[] = {
-      SystemZ::R1W,  SystemZ::R2W,  SystemZ::R3W,  SystemZ::R4W,
-      SystemZ::R5W,  SystemZ::R0W,  SystemZ::R12W, /* No R11W */
-      SystemZ::R10W, SystemZ::R9W,  SystemZ::R8W,  SystemZ::R7W,
-      SystemZ::R6W,  SystemZ::R14W, SystemZ::R13W
-    };
-    GR32Class::iterator
-    GR32Class::allocation_order_begin(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_REG32_nofp;
-      else
-        return SystemZ_REG32;
-    }
-    GR32Class::iterator
-    GR32Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_REG32_nofp + (sizeof(SystemZ_REG32_nofp) / sizeof(unsigned));
-      else
-        return SystemZ_REG32 + (sizeof(SystemZ_REG32) / sizeof(unsigned));
-    }
-  }];
-}
+/// Register classes.
+/// Allocate the callee-saved R6-R12 backwards. That way they can be saved
+/// together with R14 and R15 in one prolog instruction.
+def GR32 : RegisterClass<"SystemZ", [i32], 32, (add (sequence "R%uW",  0,  5),
+                                                    (sequence "R%uW", 15,  6))>;

 /// Registers used to generate address. Everything except R0.
-def ADDR32 : RegisterClass<"SystemZ", [i32], 32,
-   // Volatile registers
-  [R1W, R2W, R3W, R4W, R5W, R6W, R7W, R8W, R9W, R10W, R12W, R13W,
-   // Frame pointer, sometimes allocable
-   R11W,
-   // Volatile, but not allocable
-   R14W, R15W]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned SystemZ_ADDR32[] = {
-      SystemZ::R1W,  SystemZ::R2W,  SystemZ::R3W,  SystemZ::R4W,
-      SystemZ::R5W,  /* No R0W */   SystemZ::R12W, SystemZ::R11W,
-      SystemZ::R10W, SystemZ::R9W,  SystemZ::R8W,  SystemZ::R7W,
-      SystemZ::R6W,  SystemZ::R14W, SystemZ::R13W
-    };
-    static const unsigned SystemZ_ADDR32_nofp[] = {
-      SystemZ::R1W,  SystemZ::R2W,  SystemZ::R3W,  SystemZ::R4W,
-      SystemZ::R5W,  /* No R0W */   SystemZ::R12W, /* No R11W */
-      SystemZ::R10W, SystemZ::R9W,  SystemZ::R8W,  SystemZ::R7W,
-      SystemZ::R6W,  SystemZ::R14W, SystemZ::R13W
-    };
-    ADDR32Class::iterator
-    ADDR32Class::allocation_order_begin(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_ADDR32_nofp;
-      else
-        return SystemZ_ADDR32;
-    }
-    ADDR32Class::iterator
-    ADDR32Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_ADDR32_nofp + (sizeof(SystemZ_ADDR32_nofp) / sizeof(unsigned));
-      else
-        return SystemZ_ADDR32 + (sizeof(SystemZ_ADDR32) / sizeof(unsigned));
-    }
-  }];
-}
+def ADDR32 : RegisterClass<"SystemZ", [i32], 32, (sub GR32, R0W)>;

-def GR64 : RegisterClass<"SystemZ", [i64], 64,
-   // Volatile registers
-  [R0D, R1D, R2D, R3D, R4D, R5D, R6D, R7D, R8D, R9D, R10D, R12D, R13D,
-   // Frame pointer, sometimes allocable
-   R11D,
-   // Volatile, but not allocable
-   R14D, R15D]>
-{
+def GR64 : RegisterClass<"SystemZ", [i64], 64, (add (sequence "R%uD",  0,  5),
+                                                    (sequence "R%uD", 15,  6))> {
   let SubRegClasses = [(GR32 subreg_32bit)];
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned SystemZ_REG64[] = {
-      SystemZ::R1D,  SystemZ::R2D,  SystemZ::R3D,  SystemZ::R4D,
-      SystemZ::R5D,  SystemZ::R0D,  SystemZ::R12D, SystemZ::R11D,
-      SystemZ::R10D, SystemZ::R9D,  SystemZ::R8D,  SystemZ::R7D,
-      SystemZ::R6D,  SystemZ::R14D, SystemZ::R13D
-    };
-    static const unsigned SystemZ_REG64_nofp[] = {
-      SystemZ::R1D,  SystemZ::R2D,  SystemZ::R3D,  SystemZ::R4D,
-      SystemZ::R5D,  SystemZ::R0D,  SystemZ::R12D, /* No R11D */
-      SystemZ::R10D, SystemZ::R9D,  SystemZ::R8D,  SystemZ::R7D,
-      SystemZ::R6D,  SystemZ::R14D, SystemZ::R13D
-    };
-    GR64Class::iterator
-    GR64Class::allocation_order_begin(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_REG64_nofp;
-      else
-        return SystemZ_REG64;
-    }
-    GR64Class::iterator
-    GR64Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_REG64_nofp + (sizeof(SystemZ_REG64_nofp) / sizeof(unsigned));
-      else
-        return SystemZ_REG64 + (sizeof(SystemZ_REG64) / sizeof(unsigned));
-    }
-  }];
 }

-def ADDR64 : RegisterClass<"SystemZ", [i64], 64,
-   // Volatile registers
-  [R1D, R2D, R3D, R4D, R5D, R6D, R7D, R8D, R9D, R10D, R12D, R13D,
-   // Frame pointer, sometimes allocable
-   R11D,
-   // Volatile, but not allocable
-   R14D, R15D]>
-{
+def ADDR64 : RegisterClass<"SystemZ", [i64], 64, (sub GR64, R0D)> {
   let SubRegClasses = [(ADDR32 subreg_32bit)];
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned SystemZ_ADDR64[] = {
-      SystemZ::R1D,  SystemZ::R2D,  SystemZ::R3D,  SystemZ::R4D,
-      SystemZ::R5D,  /* No R0D */   SystemZ::R12D, SystemZ::R11D,
-      SystemZ::R10D, SystemZ::R9D,  SystemZ::R8D,  SystemZ::R7D,
-      SystemZ::R6D,  SystemZ::R14D, SystemZ::R13D
-    };
-    static const unsigned SystemZ_ADDR64_nofp[] = {
-      SystemZ::R1D,  SystemZ::R2D,  SystemZ::R3D,  SystemZ::R4D,
-      SystemZ::R5D,  /* No R0D */   SystemZ::R12D, /* No R11D */
-      SystemZ::R10D, SystemZ::R9D,  SystemZ::R8D,  SystemZ::R7D,
-      SystemZ::R6D,  SystemZ::R14D, SystemZ::R13D
-    };
-    ADDR64Class::iterator
-    ADDR64Class::allocation_order_begin(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_ADDR64_nofp;
-      else
-        return SystemZ_ADDR64;
-    }
-    ADDR64Class::iterator
-    ADDR64Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_ADDR64_nofp + (sizeof(SystemZ_ADDR64_nofp) / sizeof(unsigned));
-      else
-        return SystemZ_ADDR64 + (sizeof(SystemZ_ADDR64) / sizeof(unsigned));
-    }
-  }];
 }

 // Even-odd register pairs
-def GR64P : RegisterClass<"SystemZ", [v2i32], 64,
-  [R0P, R2P, R4P, R6P, R8P, R10P, R12P, R14P]>
-{
+def GR64P : RegisterClass<"SystemZ", [v2i32], 64, (add R0P, R2P, R4P,
+                                                       R12P, R10P, R8P, R6P,
+                                                       R14P)> {
   let SubRegClasses = [(GR32 subreg_32bit, subreg_odd32)];
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned SystemZ_REG64P[] = {
-      SystemZ::R0P, SystemZ::R2P, SystemZ::R4P, SystemZ::R10P,
-      SystemZ::R8P, SystemZ::R6P };
-    static const unsigned SystemZ_REG64P_nofp[] = {
-      SystemZ::R0P, SystemZ::R2P, SystemZ::R4P, /* NO R10P */
-      SystemZ::R8P, SystemZ::R6P };
-    GR64PClass::iterator
-    GR64PClass::allocation_order_begin(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_REG64P_nofp;
-      else
-        return SystemZ_REG64P;
-    }
-    GR64PClass::iterator
-    GR64PClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_REG64P_nofp + (sizeof(SystemZ_REG64P_nofp) / sizeof(unsigned));
-      else
-        return SystemZ_REG64P + (sizeof(SystemZ_REG64P) / sizeof(unsigned));
-    }
-  }];
 }

-def GR128 : RegisterClass<"SystemZ", [v2i64], 128,
-  [R0Q, R2Q, R4Q, R6Q, R8Q, R10Q, R12Q, R14Q]>
-{
+def GR128 : RegisterClass<"SystemZ", [v2i64], 128, (add R0Q, R2Q, R4Q,
+                                                        R12Q, R10Q, R8Q, R6Q,
+                                                        R14Q)> {
   let SubRegClasses = [(GR32 subreg_32bit, subreg_odd32),
-                       (GR64 subreg_even, subreg_odd)];
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned SystemZ_REG128[] = {
-      SystemZ::R0Q, SystemZ::R2Q, SystemZ::R4Q, SystemZ::R10Q,
-      SystemZ::R8Q, SystemZ::R6Q };
-    static const unsigned SystemZ_REG128_nofp[] = {
-      SystemZ::R0Q, SystemZ::R2Q, SystemZ::R4Q, /* NO R10Q */
-      SystemZ::R8Q, SystemZ::R6Q };
-    GR128Class::iterator
-    GR128Class::allocation_order_begin(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_REG128_nofp;
-      else
-        return SystemZ_REG128;
-    }
-    GR128Class::iterator
-    GR128Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_REG128_nofp + (sizeof(SystemZ_REG128_nofp) / sizeof(unsigned));
-      else
-        return SystemZ_REG128 + (sizeof(SystemZ_REG128) / sizeof(unsigned));
-    }
-  }];
+                        (GR64 subreg_even, subreg_odd)];
 }

-def FP32 : RegisterClass<"SystemZ", [f32], 32,
-  [F0S, F1S, F2S, F3S, F4S, F5S, F6S, F7S,
-   F8S, F9S, F10S, F11S, F12S, F13S, F14S, F15S]> {
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned SystemZ_REGFP32[] = {
-      SystemZ::F0S,  SystemZ::F2S,  SystemZ::F4S,  SystemZ::F6S,
-      SystemZ::F1S,  SystemZ::F3S,  SystemZ::F5S,  SystemZ::F7S,
-      SystemZ::F8S,  SystemZ::F9S,  SystemZ::F10S, SystemZ::F11S,
-      SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S };
-    FP32Class::iterator
-    FP32Class::allocation_order_begin(const MachineFunction &MF) const {
-      return SystemZ_REGFP32;
-    }
-    FP32Class::iterator
-    FP32Class::allocation_order_end(const MachineFunction &MF) const {
-      return SystemZ_REGFP32 + (sizeof(SystemZ_REGFP32) / sizeof(unsigned));
-    }
-  }];
-}
+def FP32 : RegisterClass<"SystemZ", [f32], 32, (sequence "F%uS", 0, 15)>;

-def FP64 : RegisterClass<"SystemZ", [f64], 64,
-  [F0L, F1L, F2L, F3L, F4L, F5L, F6L, F7L,
-   F8L, F9L, F10L, F11L, F12L, F13L, F14L, F15L]> {
+def FP64 : RegisterClass<"SystemZ", [f64], 64, (sequence "F%uL", 0, 15)> {
   let SubRegClasses = [(FP32 subreg_32bit)];
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned SystemZ_REGFP64[] = {
-      SystemZ::F0L,  SystemZ::F2L,  SystemZ::F4L,  SystemZ::F6L,
-      SystemZ::F1L,  SystemZ::F3L,  SystemZ::F5L,  SystemZ::F7L,
-      SystemZ::F8L,  SystemZ::F9L,  SystemZ::F10L, SystemZ::F11L,
-      SystemZ::F12L, SystemZ::F13L, SystemZ::F14L, SystemZ::F15L };
-    FP64Class::iterator
-    FP64Class::allocation_order_begin(const MachineFunction &MF) const {
-      return SystemZ_REGFP64;
-    }
-    FP64Class::iterator
-    FP64Class::allocation_order_end(const MachineFunction &MF) const {
-      return SystemZ_REGFP64 + (sizeof(SystemZ_REGFP64) / sizeof(unsigned));
-    }
-  }];
 }

 // Status flags registers.
-def CCR : RegisterClass<"SystemZ", [i64], 64, [PSW]> {
+def CCR : RegisterClass<"SystemZ", [i64], 64, (add PSW)> {
   let CopyCost = -1;  // Don't allow copying of status registers.
 }
diff --git a/lib/Target/TargetData.cpp b/lib/Target/TargetData.cpp
index c628df0..1990bc7 100644
--- a/lib/Target/TargetData.cpp
+++ b/lib/Target/TargetData.cpp
@@ -617,10 +617,14 @@ uint64_t TargetData::getIndexedOffset(const Type *ptrTy, Value* const* Indices,
 unsigned TargetData::getPreferredAlignment(const GlobalVariable *GV) const {
   const Type *ElemType = GV->getType()->getElementType();
   unsigned Alignment = getPrefTypeAlignment(ElemType);
-  if (GV->getAlignment() > Alignment)
-    Alignment = GV->getAlignment();
+  unsigned GVAlignment = GV->getAlignment();
+  if (GVAlignment >= Alignment) {
+    Alignment = GVAlignment;
+  } else if (GVAlignment != 0) {
+    Alignment = std::max(GVAlignment, getABITypeAlignment(ElemType));
+  }

-  if (GV->hasInitializer()) {
+  if (GV->hasInitializer() && GVAlignment == 0) {
     if (Alignment < 16) {
       // If the global is not external, see if it is large.  If so, give it a
       // larger alignment.
diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp
index 90ea343..709dfd2 100644
--- a/lib/Target/TargetLibraryInfo.cpp
+++ b/lib/Target/TargetLibraryInfo.cpp
@@ -28,11 +28,18 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T) {

   // memset_pattern16 is only available on iOS 3.0 and Mac OS/X 10.5 and later.
-  if (T.getOS() != Triple::Darwin || T.getDarwinMajorNumber() < 9)
+  if (T.isMacOSX()) {
+    if (T.isMacOSXVersionLT(10, 5))
+      TLI.setUnavailable(LibFunc::memset_pattern16);
+  } else if (T.getOS() == Triple::IOS) {
+    if (T.isOSVersionLT(3, 0))
+      TLI.setUnavailable(LibFunc::memset_pattern16);
+  } else {
     TLI.setUnavailable(LibFunc::memset_pattern16);
+  }

-  // iprintf and friends are only available on XCore.
-  if (T.getArch() != Triple::xcore) {
+  // iprintf and friends are only available on XCore and TCE.
+  if (T.getArch() != Triple::xcore && T.getArch() != Triple::tce) {
     TLI.setUnavailable(LibFunc::iprintf);
     TLI.setUnavailable(LibFunc::siprintf);
     TLI.setUnavailable(LibFunc::fiprintf);
@@ -54,6 +61,12 @@ TargetLibraryInfo::TargetLibraryInfo(const Triple &T) : ImmutablePass(ID) {
   initialize(*this, T);
 }

+TargetLibraryInfo::TargetLibraryInfo(const TargetLibraryInfo &TLI)
+  : ImmutablePass(ID) {
+  memcpy(AvailableArray, TLI.AvailableArray, sizeof(AvailableArray));
+}
+
+
 /// disableAllFunctions - This disables all builtins, which is used for options
 /// like -fno-builtin.
 void TargetLibraryInfo::disableAllFunctions() {
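The TargetLibraryInfo hunk above splits a single Darwin test into per-OS version checks. A sketch of the predicate it encodes, using only the Triple methods that appear in this patch (memset_pattern16 exists on Mac OS X 10.5+ and iOS 3.0+, nowhere else):

#include "llvm/ADT/Triple.h"
using namespace llvm;

static bool hasMemsetPattern16(const Triple &T) {
  if (T.isMacOSX())
    return !T.isMacOSXVersionLT(10, 5);
  if (T.getOS() == Triple::IOS)
    return !T.isOSVersionLT(3, 0);
  return false;  // every other OS lacks it
}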
diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
index 5d34c7d..3343384 100644
--- a/lib/Target/TargetLoweringObjectFile.cpp
+++ b/lib/Target/TargetLoweringObjectFile.cpp
@@ -58,7 +58,6 @@ TargetLoweringObjectFile::TargetLoweringObjectFile() : Ctx(0) {
   DwarfRangesSection = 0;
   DwarfMacroInfoSection = 0;

-  IsFunctionEHSymbolGlobal = false;
   IsFunctionEHFrameSymbolPrivate = true;
   SupportsWeakOmittedEHFrame = true;
 }
@@ -120,6 +119,18 @@ static bool IsNullTerminatedString(const Constant *C) {
   return false;
 }

+MCSymbol *TargetLoweringObjectFile::
+getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
+                        MachineModuleInfo *MMI) const {
+  return Mang->getSymbol(GV);
+}
+
+void TargetLoweringObjectFile::emitPersonalityValue(MCStreamer &Streamer,
+                                                    const TargetMachine &TM,
+                                                    const MCSymbol *Sym) const {
+}
+
+
 /// getKindForGlobal - This is a top-level target-independent classifier for
 /// a global variable.  Given an global variable and information from TM, it
 /// classifies the global in a variety of ways that make various target
@@ -305,16 +316,15 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
                                MachineModuleInfo *MMI, unsigned Encoding,
                                MCStreamer &Streamer) const {
   const MCSymbol *Sym = Mang->getSymbol(GV);
-  return getExprForDwarfReference(Sym, Mang, MMI, Encoding, Streamer);
+  return getExprForDwarfReference(Sym, Encoding, Streamer);
 }

 const MCExpr *TargetLoweringObjectFile::
-getExprForDwarfReference(const MCSymbol *Sym, Mangler *Mang,
-                         MachineModuleInfo *MMI, unsigned Encoding,
+getExprForDwarfReference(const MCSymbol *Sym, unsigned Encoding,
                          MCStreamer &Streamer) const {
   const MCExpr *Res = MCSymbolRefExpr::Create(Sym, getContext());

-  switch (Encoding & 0xF0) {
+  switch (Encoding & 0x70) {
   default:
     report_fatal_error("We do not support this DWARF encoding yet!");
   case dwarf::DW_EH_PE_absptr:
@@ -339,7 +349,7 @@ unsigned TargetLoweringObjectFile::getLSDAEncoding() const {
   return dwarf::DW_EH_PE_absptr;
 }

-unsigned TargetLoweringObjectFile::getFDEEncoding() const {
+unsigned TargetLoweringObjectFile::getFDEEncoding(bool CFI) const {
   return dwarf::DW_EH_PE_absptr;
 }
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index 8c7330a..863b811 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -40,7 +40,6 @@ namespace llvm {
   bool JITExceptionHandling;
   bool JITEmitDebugInfo;
   bool JITEmitDebugInfoToDisk;
-  bool UnwindTablesMandatory;
   Reloc::Model RelocationModel;
   CodeModel::Model CMModel;
   bool GuaranteedTailCallOpt;
@@ -143,11 +142,6 @@ EmitJitDebugInfoToDisk("jit-emit-debug-to-disk",
   cl::desc("Emit debug info objfiles to disk"),
   cl::location(JITEmitDebugInfoToDisk),
   cl::init(false));
-static cl::opt<bool, true>
-EnableUnwindTables("unwind-tables",
-  cl::desc("Generate unwinding tables for all functions"),
-  cl::location(UnwindTablesMandatory),
-  cl::init(false));

 static cl::opt<llvm::Reloc::Model, true>
 DefRelocationModel("relocation-model",
@@ -206,11 +200,10 @@ EnableStrongPHIElim(cl::Hidden, "strong-phi-elim",
   cl::desc("Use strong PHI elimination."),
   cl::location(StrongPHIElim),
   cl::init(false));
-static cl::opt<bool, true>
-UseDivMod("use-divmod-libcall",
-  cl::desc("Use __{u}divmod libcalls for div / rem pairs"),
-  cl::location(HasDivModLibcall),
-  cl::init(false));
+static cl::opt<std::string>
+TrapFuncName("trap-func", cl::Hidden,
+  cl::desc("Emit a call to trap function rather than a trap instruction"),
+  cl::init(""));
 static cl::opt<bool>
 DataSections("fdata-sections",
   cl::desc("Emit data into separate sections"),
@@ -228,7 +221,8 @@ TargetMachine::TargetMachine(const Target &T)
     MCRelaxAll(false),
     MCNoExecStack(false),
     MCSaveTempLabels(false),
-    MCUseLoc(true) {
+    MCUseLoc(true),
+    MCUseCFI(true) {
   // Typically it will be subtargets that will adjust FloatABIType from Default
   // to Soft or Hard.
   if (UseSoftFloat)
@@ -310,4 +304,11 @@ namespace llvm {
   bool HonorSignDependentRoundingFPMath() {
     return !UnsafeFPMath && HonorSignDependentRoundingFPMathOption;
   }
+
+  /// getTrapFunctionName - If this returns a non-empty string, this means isel
+  /// should lower Intrinsic::trap to a call to the specified function name
+  /// instead of an ISD::TRAP node.
+  StringRef getTrapFunctionName() {
+    return TrapFuncName;
+  }
 }
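The -trap-func option added above is a plain cl::opt wired through a free function. A hedged sketch of the plumbing; the if/else at the bottom is an assumption about how isel would consume the getter, not code from this patch:

#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static cl::opt<std::string>
TrapFuncName("trap-func", cl::Hidden,
             cl::desc("Emit a call to trap function rather than a trap instruction"),
             cl::init(""));

static StringRef getTrapFunctionName() { return TrapFuncName; }

int main(int argc, char **argv) {
  cl::ParseCommandLineOptions(argc, argv);
  if (!getTrapFunctionName().empty())  // assumed isel-side check
    outs() << "lower llvm.trap to a call to " << getTrapFunctionName() << '\n';
  else
    outs() << "lower llvm.trap to ISD::TRAP\n";
  return 0;
}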
diff --git a/lib/Target/TargetRegisterInfo.cpp b/lib/Target/TargetRegisterInfo.cpp
index 4811ba5..e36e136 100644
--- a/lib/Target/TargetRegisterInfo.cpp
+++ b/lib/Target/TargetRegisterInfo.cpp
@@ -23,12 +23,8 @@ using namespace llvm;
 TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterDesc *D, unsigned NR,
                              regclass_iterator RCB, regclass_iterator RCE,
                              const char *const *subregindexnames,
-                             int CFSO, int CFDO,
-                             const unsigned* subregs, const unsigned subregsize,
-                             const unsigned* aliases, const unsigned aliasessize)
-  : SubregHash(subregs), SubregHashSize(subregsize),
-    AliasesHash(aliases), AliasesHashSize(aliasessize),
-    Desc(D), SubRegIndexNames(subregindexnames), NumRegs(NR),
+                             int CFSO, int CFDO)
+  : Desc(D), SubRegIndexNames(subregindexnames), NumRegs(NR),
     RegClassBegin(RCB), RegClassEnd(RCE) {
   assert(isPhysicalRegister(NumRegs) &&
          "Target has too many physical registers!");
@@ -96,7 +92,8 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF,
   } else {
     for (TargetRegisterInfo::regclass_iterator I = regclass_begin(),
          E = regclass_end(); I != E; ++I)
-      getAllocatableSetForRC(MF, *I, Allocatable);
+      if ((*I)->isAllocatable())
+        getAllocatableSetForRC(MF, *I, Allocatable);
   }

   // Mask out the reserved registers
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index e0989b0..c352bfc 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -928,6 +928,18 @@ ParseInstruction(StringRef Name, SMLoc NameLoc,
       Operands.erase(Operands.begin() + 1);
     }
   }
+
+  // Transforms "int $3" into "int3" as a size optimization.  We can't write an
+  // instalias with an immediate operand yet.
+  if (Name == "int" && Operands.size() == 2) {
+    X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]);
+    if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
+        cast<MCConstantExpr>(Op1->getImm())->getValue() == 3) {
+      delete Operands[1];
+      Operands.erase(Operands.begin() + 1);
+      static_cast<X86Operand*>(Operands[0])->setTokenValue("int3");
+    }
+  }

   return false;
 }
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index d4a88d7..a9c90f8 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -485,7 +485,7 @@ struct InternalInstruction {
      consumed___ indicates that the byte was already consumed and does not
      need to be consumed again */

-  /* The VEX.vvvv field, which contains a thrid register operand for some AVX
+  /* The VEX.vvvv field, which contains a third register operand for some AVX
      instructions */
   Reg                           vvvv;
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index d006eca..68247d2 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -41,8 +41,15 @@ X86ATTInstPrinter::X86ATTInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI)
                                   &TM.getSubtarget<X86Subtarget>()));
 }

+void X86ATTInstPrinter::printRegName(raw_ostream &OS,
+                                     unsigned RegNo) const {
+  OS << '%' << getRegisterName(RegNo);
+}
+
 void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS) {
-  printInstruction(MI, OS);
+  // Try to print any aliases first.
+  if (!printAliasInstr(MI, OS))
+    printInstruction(MI, OS);

   // If verbose assembly is enabled, we can print some informative comments.
   if (CommentStream)
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index f24674f..5f939b6 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -26,11 +26,14 @@ class X86ATTInstPrinter : public MCInstPrinter {
 public:
   X86ATTInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI);

+  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
   virtual void printInst(const MCInst *MI, raw_ostream &OS);
   virtual StringRef getOpcodeName(unsigned Opcode) const;

   // Methods used to print the alias of an instruction.
   unsigned ComputeAvailableFeatures(const X86Subtarget *Subtarget) const;
+  // Autogenerated by tblgen, returns true if we successfully printed an
+  // alias.
   bool printAliasInstr(const MCInst *MI, raw_ostream &OS);

   // Autogenerated by tblgen.
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 47253eb..5f581ba 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -29,6 +29,10 @@ using namespace llvm;
 #define GET_INSTRUCTION_NAME
 #include "X86GenAsmWriter1.inc"

+void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << getRegisterName(RegNo);
+}
+
 void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS) {
   printInstruction(MI, OS);
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
index ca99dc0..c8030c3 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -27,6 +27,7 @@ public:
   X86IntelInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI)
     : MCInstPrinter(MAI) {}

+  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
   virtual void printInst(const MCInst *MI, raw_ostream &OS);
   virtual StringRef getOpcodeName(unsigned Opcode) const;
diff --git a/lib/Target/X86/README-X86-64.txt b/lib/Target/X86/README-X86-64.txt
index e21d69a..bcfdf0b 100644
--- a/lib/Target/X86/README-X86-64.txt
+++ b/lib/Target/X86/README-X86-64.txt
@@ -36,7 +36,7 @@ _conv:
 	cmovb %rcx, %rax
 	ret

-Seems like the jb branch has high likelyhood of being taken. It would have
+Seems like the jb branch has high likelihood of being taken. It would have
 saved a few instructions.

 //===---------------------------------------------------------------------===//
@@ -124,51 +124,6 @@ if we have whole-function selectiondags.

 //===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===// -Take the following C code -(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43640): - -struct u1 -{ - float x; - float y; -}; - -float foo(struct u1 u) -{ - return u.x + u.y; -} - -Optimizes to the following IR: -define float @foo(double %u.0) nounwind readnone { -entry: - %tmp8 = bitcast double %u.0 to i64 ; <i64> [#uses=2] - %tmp6 = trunc i64 %tmp8 to i32 ; <i32> [#uses=1] - %tmp7 = bitcast i32 %tmp6 to float ; <float> [#uses=1] - %tmp2 = lshr i64 %tmp8, 32 ; <i64> [#uses=1] - %tmp3 = trunc i64 %tmp2 to i32 ; <i32> [#uses=1] - %tmp4 = bitcast i32 %tmp3 to float ; <float> [#uses=1] - %0 = fadd float %tmp7, %tmp4 ; <float> [#uses=1] - ret float %0 -} - -And current llvm-gcc/clang output: - movd %xmm0, %rax - movd %eax, %xmm1 - shrq $32, %rax - movd %eax, %xmm0 - addss %xmm1, %xmm0 - ret - -We really shouldn't move the floats to RAX, only to immediately move them -straight back to the XMM registers. - -There really isn't any good way to handle this purely in IR optimizers; it -could possibly be handled by changing the output of the fronted, though. It -would also be feasible to add a x86-specific DAGCombine to optimize the -bitcast+trunc+(lshr+)bitcast combination. - -//===---------------------------------------------------------------------===// - Take the following code (from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653): extern unsigned long table[]; diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index 1ac2305..560947a 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -7,14 +7,6 @@ copy (3-addr bswap + memory support?) This is available on Atom processors. //===---------------------------------------------------------------------===// -CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86 -backend knows how to three-addressify this shift, but it appears the register -allocator isn't even asking it to do so in this case. We should investigate -why this isn't happening, it could have significant impact on other important -cases for X86 as well. - -//===---------------------------------------------------------------------===// - This should be one DIV/IDIV instruction, not a libcall: unsigned test(unsigned long long X, unsigned Y) { @@ -1572,7 +1564,7 @@ Implement processor-specific optimizations for parity with GCC on these processors. GCC does two optimizations: 1. ix86_pad_returns inserts a noop before ret instructions if immediately - preceeded by a conditional branch or is the target of a jump. + preceded by a conditional branch or is the target of a jump. 2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of code contains more than 3 branches. @@ -1736,26 +1728,6 @@ are functionally identical. //===---------------------------------------------------------------------===// Take the following C code: -int x(int y) { return (y & 63) << 14; } - -Code produced by gcc: - andl $63, %edi - sall $14, %edi - movl %edi, %eax - ret - -Code produced by clang: - shll $14, %edi - movl %edi, %eax - andl $1032192, %eax - ret - -The code produced by gcc is 3 bytes shorter. This sort of construct often -shows up with bitfields. 
- -//===---------------------------------------------------------------------===// - -Take the following C code: int f(int a, int b) { return (unsigned char)a == (unsigned char)b; } We generate the following IR with clang: @@ -2016,3 +1988,81 @@ We currently generate: We could save an instruction here by commuting the addss. //===---------------------------------------------------------------------===// + +This (from PR9661): + +float clamp_float(float a) { + if (a > 1.0f) + return 1.0f; + else if (a < 0.0f) + return 0.0f; + else + return a; +} + +Could compile to: + +clamp_float: # @clamp_float + movss .LCPI0_0(%rip), %xmm1 + minss %xmm1, %xmm0 + pxor %xmm1, %xmm1 + maxss %xmm1, %xmm0 + ret + +with -ffast-math. + +//===---------------------------------------------------------------------===// + +This function (from PR9803): + +int clamp2(int a) { + if (a > 5) + a = 5; + if (a < 0) + return 0; + return a; +} + +Compiles to: + +_clamp2: ## @clamp2 + pushq %rbp + movq %rsp, %rbp + cmpl $5, %edi + movl $5, %ecx + cmovlel %edi, %ecx + testl %ecx, %ecx + movl $0, %eax + cmovnsl %ecx, %eax + popq %rbp + ret + +The move of 0 could be scheduled above the test to make it an xor reg,reg. + +//===---------------------------------------------------------------------===// + +GCC PR48986. We currently compile this: + +void bar(void); +void yyy(int* p) { + if (__sync_fetch_and_add(p, -1) == 1) + bar(); +} + +into: + movl $-1, %eax + lock + xaddl %eax, (%rdi) + cmpl $1, %eax + je LBB0_2 + +Instead we could generate: + + lock + dec %rdi + je LBB0_2 + +The trick is to match "fetch_and_add(X, -C) == C". + +//===---------------------------------------------------------------------===// + diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index efb6c8c..7bb9676 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -1,13 +1,13 @@ //===- X86.td - Target definition file for the Intel X86 ---*- tablegen -*-===// -// + // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// + // //===----------------------------------------------------------------------===// // -// This is a target description file for the Intel i386 architecture, refered to +// This is a target description file for the Intel i386 architecture, referred to // here as the "X86" architecture. // //===----------------------------------------------------------------------===// @@ -32,7 +32,7 @@ def FeatureMMX : SubtargetFeature<"mmx","X86SSELevel", "MMX", def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1", "Enable SSE instructions", // SSE codegen depends on cmovs, and all - // SSE1+ processors support them. + // SSE1+ processors support them. [FeatureMMX, FeatureCMOV]>; def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2", "Enable SSE2 instructions", @@ -50,7 +50,8 @@ def FeatureSSE42 : SubtargetFeature<"sse42", "X86SSELevel", "SSE42", "Enable SSE 4.2 instructions", [FeatureSSE41, FeaturePOPCNT]>; def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow", - "Enable 3DNow! instructions">; + "Enable 3DNow! instructions", + [FeatureMMX]>; def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA", "Enable 3DNow!
Athlon instructions", [Feature3DNow]>; @@ -100,8 +101,10 @@ def : Proc<"i686", []>; def : Proc<"pentiumpro", [FeatureCMOV]>; def : Proc<"pentium2", [FeatureMMX, FeatureCMOV]>; def : Proc<"pentium3", [FeatureSSE1]>; +def : Proc<"pentium3m", [FeatureSSE1, FeatureSlowBTMem]>; def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>; def : Proc<"pentium4", [FeatureSSE2]>; +def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>; def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem]>; def : Proc<"yonah", [FeatureSSE3, FeatureSlowBTMem]>; def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>; @@ -121,14 +124,14 @@ def : Proc<"westmere", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem, // SSE is not listed here since llvm treats AVX as a reimplementation of SSE, // rather than a superset. // FIXME: Disabling AVX for now since it's not ready. -def : Proc<"sandybridge", [FeatureSSE42, Feature64Bit, +def : Proc<"corei7-avx", [FeatureSSE42, Feature64Bit, FeatureAES, FeatureCLMUL]>; def : Proc<"k6", [FeatureMMX]>; -def : Proc<"k6-2", [FeatureMMX, Feature3DNow]>; -def : Proc<"k6-3", [FeatureMMX, Feature3DNow]>; -def : Proc<"athlon", [FeatureMMX, Feature3DNowA, FeatureSlowBTMem]>; -def : Proc<"athlon-tbird", [FeatureMMX, Feature3DNowA, FeatureSlowBTMem]>; +def : Proc<"k6-2", [Feature3DNow]>; +def : Proc<"k6-3", [Feature3DNow]>; +def : Proc<"athlon", [Feature3DNowA, FeatureSlowBTMem]>; +def : Proc<"athlon-tbird", [Feature3DNowA, FeatureSlowBTMem]>; def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>; def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>; def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>; @@ -156,8 +159,8 @@ def : Proc<"shanghai", [Feature3DNowA, Feature64Bit, FeatureSSE4A, Feature3DNowA]>; def : Proc<"winchip-c6", [FeatureMMX]>; -def : Proc<"winchip2", [FeatureMMX, Feature3DNow]>; -def : Proc<"c3", [FeatureMMX, Feature3DNow]>; +def : Proc<"winchip2", [Feature3DNow]>; +def : Proc<"c3", [Feature3DNow]>; def : Proc<"c3-2", [FeatureSSE1]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86AsmBackend.cpp b/lib/Target/X86/X86AsmBackend.cpp index a7581eb..4d7d96d 100644 --- a/lib/Target/X86/X86AsmBackend.cpp +++ b/lib/Target/X86/X86AsmBackend.cpp @@ -21,6 +21,7 @@ #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/Object/MachOFormat.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -28,6 +29,13 @@ #include "llvm/Target/TargetAsmBackend.h" using namespace llvm; +// Option to allow disabling arithmetic relaxation to workaround PR9807, which +// is useful when running bitwise comparison experiments on Darwin. We should be +// able to remove this once PR9807 is resolved. +static cl::opt<bool> +MCDisableArithRelaxation("mc-x86-disable-arith-relaxation", + cl::desc("Disable relaxation of arithmetic instruction for X86")); + static unsigned getFixupKindLog2Size(unsigned Kind) { switch (Kind) { default: assert(0 && "invalid fixup kind!"); @@ -201,6 +209,9 @@ bool X86AsmBackend::MayNeedRelaxation(const MCInst &Inst) const { if (getRelaxedOpcodeBranch(Inst.getOpcode()) != Inst.getOpcode()) return true; + if (MCDisableArithRelaxation) + return false; + // Check if this instruction is ever relaxable. 
if (getRelaxedOpcodeArith(Inst.getOpcode()) == Inst.getOpcode()) return false; @@ -414,34 +425,26 @@ public: TargetAsmBackend *llvm::createX86_32AsmBackend(const Target &T, const std::string &TT) { - switch (Triple(TT).getOS()) { - case Triple::Darwin: + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) return new DarwinX86_32AsmBackend(T); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: - if (Triple(TT).getEnvironment() == Triple::MachO) - return new DarwinX86_32AsmBackend(T); - else - return new WindowsX86AsmBackend(T, false); - default: - return new ELFX86_32AsmBackend(T, Triple(TT).getOS()); - } + + if (TheTriple.isOSWindows()) + return new WindowsX86AsmBackend(T, false); + + return new ELFX86_32AsmBackend(T, TheTriple.getOS()); } TargetAsmBackend *llvm::createX86_64AsmBackend(const Target &T, const std::string &TT) { - switch (Triple(TT).getOS()) { - case Triple::Darwin: + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) return new DarwinX86_64AsmBackend(T); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: - if (Triple(TT).getEnvironment() == Triple::MachO) - return new DarwinX86_64AsmBackend(T); - else - return new WindowsX86AsmBackend(T, true); - default: - return new ELFX86_64AsmBackend(T, Triple(TT).getOS()); - } + + if (TheTriple.isOSWindows()) + return new WindowsX86AsmBackend(T, true); + + return new ELFX86_64AsmBackend(T, TheTriple.getOS()); } diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 8874486..f1b9972 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -23,6 +23,7 @@ #include "llvm/GlobalVariable.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" +#include "llvm/Operator.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" @@ -77,10 +78,8 @@ private: bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR); - bool X86FastEmitStore(EVT VT, const Value *Val, - const X86AddressMode &AM); - bool X86FastEmitStore(EVT VT, unsigned Val, - const X86AddressMode &AM); + bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM); + bool X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM); bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, unsigned &ResultReg); @@ -109,11 +108,11 @@ private: bool X86SelectFPExt(const Instruction *I); bool X86SelectFPTrunc(const Instruction *I); - bool X86SelectExtractValue(const Instruction *I); - bool X86VisitIntrinsicCall(const IntrinsicInst &I); bool X86SelectCall(const Instruction *I); + bool DoSelectCall(const Instruction *I, const char *MemIntName); + const X86InstrInfo *getInstrInfo() const { return getTargetMachine()->getInstrInfo(); } @@ -125,6 +124,8 @@ private: unsigned TargetMaterializeAlloca(const AllocaInst *C); + unsigned TargetMaterializeFloatZero(const ConstantFP *CF); + /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is /// computed in an SSE register, not on the X87 floating point stack. bool isScalarFPTypeInSSEReg(EVT VT) const { @@ -133,6 +134,11 @@ private: } bool isTypeLegal(const Type *Ty, MVT &VT, bool AllowI1 = false); + + bool IsMemcpySmall(uint64_t Len); + + bool TryEmitSmallMemcpy(X86AddressMode DestAM, + X86AddressMode SrcAM, uint64_t Len); }; } // end anonymous namespace. 
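The IsMemcpySmall/TryEmitSmallMemcpy pair declared above drives the new inline-memcpy support: rather than lowering a short constant-length copy to a libcall, fast-isel peels it into integer load/store pairs, widest type first, with i64 chunks used only on 64-bit subtargets. A minimal standalone sketch of that schedule (illustrative names only, not the actual FastISel API):

#include <cstdint>
#include <cstring>

// Widest-first expansion of a small constant-length copy. 'Have64Bit'
// stands in for Subtarget->is64Bit(); each std::memcpy of 'Size' bytes
// models the single load/store pair fast-isel emits for that chunk.
static void EmitSmallMemcpy(char *Dst, const char *Src, uint64_t Len,
                            bool Have64Bit) {
  while (Len) {
    uint64_t Size;
    if (Len >= 8 && Have64Bit)
      Size = 8;   // one i64 load + store
    else if (Len >= 4)
      Size = 4;   // i32
    else if (Len >= 2)
      Size = 2;   // i16
    else
      Size = 1;   // i8
    std::memcpy(Dst, Src, Size);
    Dst += Size;
    Src += Size;
    Len -= Size;
  }
}

Under the 32-byte (64-bit) and 16-byte (32-bit) caps that IsMemcpySmall enforces, this expands to at most six such pairs, which is why the implementation further down can simply assert that every load and store was emitted.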
@@ -224,8 +230,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, /// and a displacement offset, or a GlobalAddress, /// i.e. V. Return true if it is possible. bool -X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, - const X86AddressMode &AM) { +X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) { // Get opcode and regclass of the output for the given store instruction. unsigned Opc = 0; switch (VT.getSimpleVT().SimpleTy) { @@ -395,43 +400,45 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { const Value *Op = *i; if (const StructType *STy = dyn_cast<StructType>(*GTI)) { const StructLayout *SL = TD.getStructLayout(STy); - unsigned Idx = cast<ConstantInt>(Op)->getZExtValue(); - Disp += SL->getElementOffset(Idx); - } else { - uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType()); - for (;;) { - if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { - // Constant-offset addressing. - Disp += CI->getSExtValue() * S; - break; - } - if (isa<AddOperator>(Op) && - (!isa<Instruction>(Op) || - FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()] - == FuncInfo.MBB) && - isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) { - // An add (in the same block) with a constant operand. Fold the - // constant. - ConstantInt *CI = - cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); - Disp += CI->getSExtValue() * S; - // Iterate on the other operand. - Op = cast<AddOperator>(Op)->getOperand(0); - continue; - } - if (IndexReg == 0 && - (!AM.GV || !Subtarget->isPICStyleRIPRel()) && - (S == 1 || S == 2 || S == 4 || S == 8)) { - // Scaled-index addressing. - Scale = S; - IndexReg = getRegForGEPIndex(Op).first; - if (IndexReg == 0) - return false; - break; - } - // Unsupported. - goto unsupported_gep; + Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue()); + continue; + } + + // An array/variable index is always of the form i*S where S is the + // constant scale size. See if we can push the scale into immediates. + uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType()); + for (;;) { + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { + // Constant-offset addressing. + Disp += CI->getSExtValue() * S; + break; + } + if (isa<AddOperator>(Op) && + (!isa<Instruction>(Op) || + FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()] + == FuncInfo.MBB) && + isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) { + // An add (in the same block) with a constant operand. Fold the + // constant. + ConstantInt *CI = + cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); + Disp += CI->getSExtValue() * S; + // Iterate on the other operand. + Op = cast<AddOperator>(Op)->getOperand(0); + continue; + } + if (IndexReg == 0 && + (!AM.GV || !Subtarget->isPICStyleRIPRel()) && + (S == 1 || S == 2 || S == 4 || S == 8)) { + // Scaled-index addressing. + Scale = S; + IndexReg = getRegForGEPIndex(Op).first; + if (IndexReg == 0) + return false; + break; } + // Unsupported. + goto unsupported_gep; } } // Check for displacement overflow. @@ -445,7 +452,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { if (X86SelectAddress(U->getOperand(0), AM)) return true; - // If we couldn't merge the sub value into this addr mode, revert back to + // If we couldn't merge the gep value into this addr mode, revert back to // our address and just match the value instead of completely failing.
AM = SavedAM; break; @@ -457,91 +464,91 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { // Handle constant address. if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) { - // Can't handle alternate code models yet. + // Can't handle alternate code models or TLS yet. if (TM.getCodeModel() != CodeModel::Small) return false; - // RIP-relative addresses can't have additional register operands. - if (Subtarget->isPICStyleRIPRel() && - (AM.Base.Reg != 0 || AM.IndexReg != 0)) - return false; - - // Can't handle TLS yet. if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) if (GVar->isThreadLocal()) return false; - // Okay, we've committed to selecting this global. Set up the basic address. - AM.GV = GV; - - // Allow the subtarget to classify the global. - unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM); - - // If this reference is relative to the pic base, set it now. - if (isGlobalRelativeToPICBase(GVFlags)) { - // FIXME: How do we know Base.Reg is free?? - AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); - } + // RIP-relative addresses can't have additional register operands, so if + // we've already folded stuff into the addressing mode, just force the + // global value into its own register, which we can use as the basereg. + if (!Subtarget->isPICStyleRIPRel() || + (AM.Base.Reg == 0 && AM.IndexReg == 0)) { + // Okay, we've committed to selecting this global. Set up the address. + AM.GV = GV; + + // Allow the subtarget to classify the global. + unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM); + + // If this reference is relative to the pic base, set it now. + if (isGlobalRelativeToPICBase(GVFlags)) { + // FIXME: How do we know Base.Reg is free?? + AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + } - // Unless the ABI requires an extra load, return a direct reference to - // the global. - if (!isGlobalStubReference(GVFlags)) { - if (Subtarget->isPICStyleRIPRel()) { - // Use rip-relative addressing if we can. Above we verified that the - // base and index registers are unused. - assert(AM.Base.Reg == 0 && AM.IndexReg == 0); - AM.Base.Reg = X86::RIP; + // Unless the ABI requires an extra load, return a direct reference to + // the global. + if (!isGlobalStubReference(GVFlags)) { + if (Subtarget->isPICStyleRIPRel()) { + // Use rip-relative addressing if we can. Above we verified that the + // base and index registers are unused. + assert(AM.Base.Reg == 0 && AM.IndexReg == 0); + AM.Base.Reg = X86::RIP; + } + AM.GVOpFlags = GVFlags; + return true; } - AM.GVOpFlags = GVFlags; - return true; - } - // Ok, we need to do a load from a stub. If we've already loaded from this - // stub, reuse the loaded pointer, otherwise emit the load now. - DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V); - unsigned LoadReg; - if (I != LocalValueMap.end() && I->second != 0) { - LoadReg = I->second; - } else { - // Issue load from stub. - unsigned Opc = 0; - const TargetRegisterClass *RC = NULL; - X86AddressMode StubAM; - StubAM.Base.Reg = AM.Base.Reg; - StubAM.GV = GV; - StubAM.GVOpFlags = GVFlags; - - // Prepare for inserting code in the local-value area. - SavePoint SaveInsertPt = enterLocalValueArea(); - - if (TLI.getPointerTy() == MVT::i64) { - Opc = X86::MOV64rm; - RC = X86::GR64RegisterClass; - - if (Subtarget->isPICStyleRIPRel()) - StubAM.Base.Reg = X86::RIP; + // Ok, we need to do a load from a stub. 
If we've already loaded from + // this stub, reuse the loaded pointer, otherwise emit the load now. + DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V); + unsigned LoadReg; + if (I != LocalValueMap.end() && I->second != 0) { + LoadReg = I->second; } else { - Opc = X86::MOV32rm; - RC = X86::GR32RegisterClass; - } + // Issue load from stub. + unsigned Opc = 0; + const TargetRegisterClass *RC = NULL; + X86AddressMode StubAM; + StubAM.Base.Reg = AM.Base.Reg; + StubAM.GV = GV; + StubAM.GVOpFlags = GVFlags; + + // Prepare for inserting code in the local-value area. + SavePoint SaveInsertPt = enterLocalValueArea(); + + if (TLI.getPointerTy() == MVT::i64) { + Opc = X86::MOV64rm; + RC = X86::GR64RegisterClass; + + if (Subtarget->isPICStyleRIPRel()) + StubAM.Base.Reg = X86::RIP; + } else { + Opc = X86::MOV32rm; + RC = X86::GR32RegisterClass; + } - LoadReg = createResultReg(RC); - MachineInstrBuilder LoadMI = - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg); - addFullAddress(LoadMI, StubAM); + LoadReg = createResultReg(RC); + MachineInstrBuilder LoadMI = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg); + addFullAddress(LoadMI, StubAM); - // Ok, back to normal mode. - leaveLocalValueArea(SaveInsertPt); + // Ok, back to normal mode. + leaveLocalValueArea(SaveInsertPt); - // Prevent loading GV stub multiple times in same MBB. - LocalValueMap[V] = LoadReg; - } + // Prevent loading GV stub multiple times in same MBB. + LocalValueMap[V] = LoadReg; + } - // Now construct the final address. Note that the Disp, Scale, - // and Index values may already be set here. - AM.Base.Reg = LoadReg; - AM.GV = 0; - return true; + // Now construct the final address. Note that the Disp, Scale, + // and Index values may already be set here. + AM.Base.Reg = LoadReg; + AM.GV = 0; + return true; + } } // If all else fails, try to materialize the value in a register. @@ -699,7 +706,8 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ValLocs; - CCState CCInfo(CC, F.isVarArg(), TM, ValLocs, I->getContext()); + CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, + I->getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); const Value *RV = Ret->getOperand(0); @@ -719,18 +727,38 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { // Only handle register returns for now. if (!VA.isRegLoc()) return false; - // TODO: For now, don't try to handle cases where getLocInfo() - // says Full but the types don't match. - if (TLI.getValueType(RV->getType()) != VA.getValVT()) - return false; // The calling-convention tables for x87 returns don't tell // the whole story. if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) return false; - // Make the copy. unsigned SrcReg = Reg + VA.getValNo(); + EVT SrcVT = TLI.getValueType(RV->getType()); + EVT DstVT = VA.getValVT(); + // Special handling for extended integers. + if (SrcVT != DstVT) { + if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16) + return false; + + if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt()) + return false; + + assert(DstVT == MVT::i32 && "X86 should always ext to i32"); + + if (SrcVT == MVT::i1) { + if (Outs[0].Flags.isSExt()) + return false; + SrcReg = FastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false); + SrcVT = MVT::i8; + } + unsigned Op = Outs[0].Flags.isZExt() ? 
ISD::ZERO_EXTEND : + ISD::SIGN_EXTEND; + SrcReg = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, + SrcReg, /*TODO: Kill=*/false); + } + + // Make the copy. unsigned DstReg = VA.getLocReg(); const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg); // Avoid a cross-class copy. This is very unlikely. @@ -862,12 +890,9 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { unsigned NEReg = createResultReg(&X86::GR8RegClass); unsigned PReg = createResultReg(&X86::GR8RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(X86::SETNEr), NEReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(X86::SETPr), PReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(X86::OR8rr), ResultReg) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETNEr), NEReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETPr), PReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::OR8rr),ResultReg) .addReg(PReg).addReg(NEReg); UpdateValueMap(I, ResultReg); return true; @@ -914,18 +939,31 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { bool X86FastISel::X86SelectZExt(const Instruction *I) { // Handle zero-extension from i1 to i8, which is common. - if (I->getType()->isIntegerTy(8) && - I->getOperand(0)->getType()->isIntegerTy(1)) { - unsigned ResultReg = getRegForValue(I->getOperand(0)); - if (ResultReg == 0) return false; - // Set the high bits to zero. - ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); - if (ResultReg == 0) return false; - UpdateValueMap(I, ResultReg); - return true; + if (!I->getOperand(0)->getType()->isIntegerTy(1)) + return false; + + EVT DstVT = TLI.getValueType(I->getType()); + if (!TLI.isTypeLegal(DstVT)) + return false; + + unsigned ResultReg = getRegForValue(I->getOperand(0)); + if (ResultReg == 0) + return false; + + // Set the high bits to zero. + ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); + if (ResultReg == 0) + return false; + + if (DstVT != MVT::i8) { + ResultReg = FastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND, + ResultReg, /*Kill=*/true); + if (ResultReg == 0) + return false; } - return false; + UpdateValueMap(I, ResultReg); + return true; } @@ -1008,71 +1046,49 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { FuncInfo.MBB->addSuccessor(TrueMBB); return true; } - } else if (ExtractValueInst *EI = - dyn_cast<ExtractValueInst>(BI->getCondition())) { - // Check to see if the branch instruction is from an "arithmetic with - // overflow" intrinsic. The main way these intrinsics are used is: - // - // %t = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) - // %sum = extractvalue { i32, i1 } %t, 0 - // %obit = extractvalue { i32, i1 } %t, 1 - // br i1 %obit, label %overflow, label %normal - // - // The %sum and %obit are converted in an ADD and a SETO/SETB before - // reaching the branch. Therefore, we search backwards through the MBB - // looking for the SETO/SETB instruction. If an instruction modifies the - // EFLAGS register before we reach the SETO/SETB instruction, then we can't - // convert the branch into a JO/JB instruction. 
- if (const IntrinsicInst *CI = - dyn_cast<IntrinsicInst>(EI->getAggregateOperand())){ - if (CI->getIntrinsicID() == Intrinsic::sadd_with_overflow || - CI->getIntrinsicID() == Intrinsic::uadd_with_overflow) { - const MachineInstr *SetMI = 0; - unsigned Reg = getRegForValue(EI); - - for (MachineBasicBlock::const_reverse_iterator - RI = FuncInfo.MBB->rbegin(), RE = FuncInfo.MBB->rend(); - RI != RE; ++RI) { - const MachineInstr &MI = *RI; - - if (MI.definesRegister(Reg)) { - if (MI.isCopy()) { - Reg = MI.getOperand(1).getReg(); - continue; - } - - SetMI = &MI; - break; - } - - const TargetInstrDesc &TID = MI.getDesc(); - if (TID.hasImplicitDefOfPhysReg(X86::EFLAGS) || - MI.hasUnmodeledSideEffects()) - break; - } + } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { + // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which + // typically happen for _Bool and C++ bools. + MVT SourceVT; + if (TI->hasOneUse() && TI->getParent() == I->getParent() && + isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) { + unsigned TestOpc = 0; + switch (SourceVT.SimpleTy) { + default: break; + case MVT::i8: TestOpc = X86::TEST8ri; break; + case MVT::i16: TestOpc = X86::TEST16ri; break; + case MVT::i32: TestOpc = X86::TEST32ri; break; + case MVT::i64: TestOpc = X86::TEST64ri32; break; + } + if (TestOpc) { + unsigned OpReg = getRegForValue(TI->getOperand(0)); + if (OpReg == 0) return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TestOpc)) + .addReg(OpReg).addImm(1); - if (SetMI) { - unsigned OpCode = SetMI->getOpcode(); - - if (OpCode == X86::SETOr || OpCode == X86::SETBr) { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(OpCode == X86::SETOr ? X86::JO_4 : X86::JB_4)) - .addMBB(TrueMBB); - FastEmitBranch(FalseMBB, DL); - FuncInfo.MBB->addSuccessor(TrueMBB); - return true; - } + unsigned JmpOpc = X86::JNE_4; + if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { + std::swap(TrueMBB, FalseMBB); + JmpOpc = X86::JE_4; } + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(JmpOpc)) + .addMBB(TrueMBB); + FastEmitBranch(FalseMBB, DL); + FuncInfo.MBB->addSuccessor(TrueMBB); + return true; } } } // Otherwise do a clumsy setcc and re-test it. + // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used + // in an explicit cast, so make sure to handle that correctly. 
unsigned OpReg = getRegForValue(BI->getCondition()); if (OpReg == 0) return false; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TEST8rr)) - .addReg(OpReg).addReg(OpReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TEST8ri)) + .addReg(OpReg).addImm(1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::JNE_4)) .addMBB(TrueMBB); FastEmitBranch(FalseMBB, DL); @@ -1081,42 +1097,42 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { } bool X86FastISel::X86SelectShift(const Instruction *I) { - unsigned CReg = 0, OpReg = 0, OpImm = 0; + unsigned CReg = 0, OpReg = 0; const TargetRegisterClass *RC = NULL; if (I->getType()->isIntegerTy(8)) { CReg = X86::CL; RC = &X86::GR8RegClass; switch (I->getOpcode()) { - case Instruction::LShr: OpReg = X86::SHR8rCL; OpImm = X86::SHR8ri; break; - case Instruction::AShr: OpReg = X86::SAR8rCL; OpImm = X86::SAR8ri; break; - case Instruction::Shl: OpReg = X86::SHL8rCL; OpImm = X86::SHL8ri; break; + case Instruction::LShr: OpReg = X86::SHR8rCL; break; + case Instruction::AShr: OpReg = X86::SAR8rCL; break; + case Instruction::Shl: OpReg = X86::SHL8rCL; break; default: return false; } } else if (I->getType()->isIntegerTy(16)) { CReg = X86::CX; RC = &X86::GR16RegClass; switch (I->getOpcode()) { - case Instruction::LShr: OpReg = X86::SHR16rCL; OpImm = X86::SHR16ri; break; - case Instruction::AShr: OpReg = X86::SAR16rCL; OpImm = X86::SAR16ri; break; - case Instruction::Shl: OpReg = X86::SHL16rCL; OpImm = X86::SHL16ri; break; + case Instruction::LShr: OpReg = X86::SHR16rCL; break; + case Instruction::AShr: OpReg = X86::SAR16rCL; break; + case Instruction::Shl: OpReg = X86::SHL16rCL; break; default: return false; } } else if (I->getType()->isIntegerTy(32)) { CReg = X86::ECX; RC = &X86::GR32RegClass; switch (I->getOpcode()) { - case Instruction::LShr: OpReg = X86::SHR32rCL; OpImm = X86::SHR32ri; break; - case Instruction::AShr: OpReg = X86::SAR32rCL; OpImm = X86::SAR32ri; break; - case Instruction::Shl: OpReg = X86::SHL32rCL; OpImm = X86::SHL32ri; break; + case Instruction::LShr: OpReg = X86::SHR32rCL; break; + case Instruction::AShr: OpReg = X86::SAR32rCL; break; + case Instruction::Shl: OpReg = X86::SHL32rCL; break; default: return false; } } else if (I->getType()->isIntegerTy(64)) { CReg = X86::RCX; RC = &X86::GR64RegClass; switch (I->getOpcode()) { - case Instruction::LShr: OpReg = X86::SHR64rCL; OpImm = X86::SHR64ri; break; - case Instruction::AShr: OpReg = X86::SAR64rCL; OpImm = X86::SAR64ri; break; - case Instruction::Shl: OpReg = X86::SHL64rCL; OpImm = X86::SHL64ri; break; + case Instruction::LShr: OpReg = X86::SHR64rCL; break; + case Instruction::AShr: OpReg = X86::SAR64rCL; break; + case Instruction::Shl: OpReg = X86::SHL64rCL; break; default: return false; } } else { @@ -1130,15 +1146,6 @@ bool X86FastISel::X86SelectShift(const Instruction *I) { unsigned Op0Reg = getRegForValue(I->getOperand(0)); if (Op0Reg == 0) return false; - // Fold immediate in shl(x,3). 
- if (const ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) { - unsigned ResultReg = createResultReg(RC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpImm), - ResultReg).addReg(Op0Reg).addImm(CI->getZExtValue() & 0xff); - UpdateValueMap(I, ResultReg); - return true; - } - unsigned Op1Reg = getRegForValue(I->getOperand(1)); if (Op1Reg == 0) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), @@ -1238,18 +1245,13 @@ bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { } bool X86FastISel::X86SelectTrunc(const Instruction *I) { - if (Subtarget->is64Bit()) - // All other cases should be handled by the tblgen generated code. - return false; EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); EVT DstVT = TLI.getValueType(I->getType()); - // This code only handles truncation to byte right now. + // This code only handles truncation to byte. if (DstVT != MVT::i8 && DstVT != MVT::i1) - // All other cases should be handled by the tblgen generated code. return false; - if (SrcVT != MVT::i16 && SrcVT != MVT::i32) - // All other cases should be handled by the tblgen generated code. + if (!TLI.isTypeLegal(SrcVT)) return false; unsigned InputReg = getRegForValue(I->getOperand(0)); @@ -1257,16 +1259,26 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) { // Unhandled operand. Halt "fast" selection and bail. return false; - // First issue a copy to GR16_ABCD or GR32_ABCD. - const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16) - ? X86::GR16_ABCDRegisterClass : X86::GR32_ABCDRegisterClass; - unsigned CopyReg = createResultReg(CopyRC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), - CopyReg).addReg(InputReg); + if (SrcVT == MVT::i8) { + // Truncate from i8 to i1; no code needed. + UpdateValueMap(I, InputReg); + return true; + } - // Then issue an extract_subreg. + if (!Subtarget->is64Bit()) { + // If we're on x86-32, we can't extract an i8 from a general register. + // First issue a copy to GR16_ABCD or GR32_ABCD. + const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16) + ? X86::GR16_ABCDRegisterClass : X86::GR32_ABCDRegisterClass; + unsigned CopyReg = createResultReg(CopyRC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + CopyReg).addReg(InputReg); + InputReg = CopyReg; + } + + // Issue an extract_subreg. unsigned ResultReg = FastEmitInst_extractsubreg(MVT::i8, - CopyReg, /*Kill=*/true, + InputReg, /*Kill=*/true, X86::sub_8bit); if (!ResultReg) return false; @@ -1275,35 +1287,92 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) { return true; } -bool X86FastISel::X86SelectExtractValue(const Instruction *I) { - const ExtractValueInst *EI = cast<ExtractValueInst>(I); - const Value *Agg = EI->getAggregateOperand(); +bool X86FastISel::IsMemcpySmall(uint64_t Len) { + return Len <= (Subtarget->is64Bit() ? 32 : 16); +} - if (const IntrinsicInst *CI = dyn_cast<IntrinsicInst>(Agg)) { - switch (CI->getIntrinsicID()) { - default: break; - case Intrinsic::sadd_with_overflow: - case Intrinsic::uadd_with_overflow: { - // Cheat a little. We know that the registers for "add" and "seto" are - // allocated sequentially. However, we only keep track of the register - // for "add" in the value map. Use extractvalue's index to get the - // correct register for "seto".
- unsigned OpReg = getRegForValue(Agg); - if (OpReg == 0) - return false; - UpdateValueMap(I, OpReg + *EI->idx_begin()); - return true; - } +bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, + X86AddressMode SrcAM, uint64_t Len) { + + // Make sure we don't bloat code by inlining very large memcpy's. + if (!IsMemcpySmall(Len)) + return false; + + bool i64Legal = Subtarget->is64Bit(); + + // We don't care about alignment here since we just emit integer accesses. + while (Len) { + MVT VT; + if (Len >= 8 && i64Legal) + VT = MVT::i64; + else if (Len >= 4) + VT = MVT::i32; + else if (Len >= 2) + VT = MVT::i16; + else { + assert(Len == 1); + VT = MVT::i8; } + + unsigned Reg; + bool RV = X86FastEmitLoad(VT, SrcAM, Reg); + RV &= X86FastEmitStore(VT, Reg, DestAM); + assert(RV && "Failed to emit load or store??"); + + unsigned Size = VT.getSizeInBits()/8; + Len -= Size; + DestAM.Disp += Size; + SrcAM.Disp += Size; } - return false; + return true; } bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { // FIXME: Handle more intrinsics. switch (I.getIntrinsicID()) { default: return false; + case Intrinsic::memcpy: { + const MemCpyInst &MCI = cast<MemCpyInst>(I); + // Don't handle volatile or variable length memcpys. + if (MCI.isVolatile()) + return false; + + if (isa<ConstantInt>(MCI.getLength())) { + // Small memcpy's are common enough that we want to do them + // without a call if possible. + uint64_t Len = cast<ConstantInt>(MCI.getLength())->getZExtValue(); + if (IsMemcpySmall(Len)) { + X86AddressMode DestAM, SrcAM; + if (!X86SelectAddress(MCI.getRawDest(), DestAM) || + !X86SelectAddress(MCI.getRawSource(), SrcAM)) + return false; + TryEmitSmallMemcpy(DestAM, SrcAM, Len); + return true; + } + } + + unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32; + if (!MCI.getLength()->getType()->isIntegerTy(SizeWidth)) + return false; + + if (MCI.getSourceAddressSpace() > 255 || MCI.getDestAddressSpace() > 255) + return false; + + return DoSelectCall(&I, "memcpy"); + } + case Intrinsic::memset: { + const MemSetInst &MSI = cast<MemSetInst>(I); + + unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32; + if (!MSI.getLength()->getType()->isIntegerTy(SizeWidth)) + return false; + + if (MSI.getDestAddressSpace() > 255) + return false; + + return DoSelectCall(&I, "memset"); + } case Intrinsic::stackprotector: { // Emit code inline code to store the stack guard onto the stack. EVT PtrTy = TLI.getPointerTy(); @@ -1314,33 +1383,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { // Grab the frame index. X86AddressMode AM; if (!X86SelectAddress(Slot, AM)) return false; - if (!X86FastEmitStore(PtrTy, Op1, AM)) return false; - - return true; - } - case Intrinsic::objectsize: { - ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(1)); - const Type *Ty = I.getCalledFunction()->getReturnType(); - - assert(CI && "Non-constant type in Intrinsic::objectsize?"); - - MVT VT; - if (!isTypeLegal(Ty, VT)) - return false; - - unsigned OpC = 0; - if (VT == MVT::i32) - OpC = X86::MOV32ri; - else if (VT == MVT::i64) - OpC = X86::MOV64ri; - else - return false; - - unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpC), ResultReg). - addImm(CI->isZero() ? 
-1ULL : 0); - UpdateValueMap(&I, ResultReg); return true; } case Intrinsic::dbg_declare: { @@ -1362,11 +1405,10 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { } case Intrinsic::sadd_with_overflow: case Intrinsic::uadd_with_overflow: { + // FIXME: Should fold immediates. + // Replace "add with overflow" intrinsics with an "add" instruction followed - // by a seto/setc instruction. Later on, when the "extractvalue" - // instructions are encountered, we use the fact that two registers were - // created sequentially to get the correct registers for the "sum" and the - // "overflow bit". + // by a seto/setc instruction. const Function *Callee = I.getCalledFunction(); const Type *RetTy = cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0)); @@ -1392,27 +1434,18 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { else return false; - unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + // The call to CreateRegs builds two sequential registers, to store + // both of the returned values. + unsigned ResultReg = FuncInfo.CreateRegs(I.getType()); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpC), ResultReg) .addReg(Reg1).addReg(Reg2); - unsigned DestReg1 = UpdateValueMap(&I, ResultReg); - - // If the add with overflow is an intra-block value then we just want to - // create temporaries for it like normal. If it is a cross-block value then - // UpdateValueMap will return the cross-block register used. Since we - // *really* want the value to be live in the register pair known by - // UpdateValueMap, we have to use DestReg1+1 as the destination register in - // the cross block case. In the non-cross-block case, we should just make - // another register for the value. - if (DestReg1 != ResultReg) - ResultReg = DestReg1+1; - else - ResultReg = createResultReg(TLI.getRegClassFor(MVT::i8)); unsigned Opc = X86::SETBr; if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow) Opc = X86::SETOr; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg+1); + + UpdateValueMap(&I, ResultReg, 2); return true; } } @@ -1430,11 +1463,18 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) return X86VisitIntrinsicCall(*II); + return DoSelectCall(I, 0); +} + +// Select either a call, or an llvm.memcpy/memmove/memset intrinsic +bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { + const CallInst *CI = cast<CallInst>(I); + const Value *Callee = CI->getCalledValue(); + // Handle only C and fastcc calling conventions for now. ImmutableCallSite CS(CI); CallingConv::ID CC = CS.getCallingConv(); - if (CC != CallingConv::C && - CC != CallingConv::Fast && + if (CC != CallingConv::C && CC != CallingConv::Fast && CC != CallingConv::X86_FastCall) return false; @@ -1443,22 +1483,28 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { if (CC == CallingConv::Fast && GuaranteedTailCallOpt) return false; - // Let SDISel handle vararg functions. const PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType()); const FunctionType *FTy = cast<FunctionType>(PT->getElementType()); - if (FTy->isVarArg()) + bool isVarArg = FTy->isVarArg(); + + // Don't know how to handle Win64 varargs yet. Nothing special needed for + // x86-32. Special handling for x86-64 is implemented.
+ if (isVarArg && Subtarget->isTargetWin64()) return false; // Fast-isel doesn't know about callee-pop yet. - if (Subtarget->IsCalleePop(FTy->isVarArg(), CC)) + if (Subtarget->IsCalleePop(isVarArg, CC)) return false; - // Handle *simple* calls for now. - const Type *RetTy = CS.getType(); - MVT RetVT; - if (RetTy->isVoidTy()) - RetVT = MVT::isVoid; - else if (!isTypeLegal(RetTy, RetVT, true)) + // Check whether the function can return without sret-demotion. + SmallVector<ISD::OutputArg, 4> Outs; + SmallVector<uint64_t, 4> Offsets; + GetReturnInfo(I->getType(), CS.getAttributes().getRetAttributes(), + Outs, TLI, &Offsets); + bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(), + *FuncInfo.MF, FTy->isVarArg(), + Outs, FTy->getContext()); + if (!CanLowerReturn) return false; // Materialize callee address in a register. FIXME: GV address can be @@ -1475,13 +1521,6 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { } else return false; - // Allow calls which produce i1 results. - bool AndToI1 = false; - if (RetVT == MVT::i1) { - RetVT = MVT::i8; - AndToI1 = true; - } - // Deal with call operands first. SmallVector<const Value *, 8> ArgVals; SmallVector<unsigned, 8> Args; @@ -1493,9 +1532,11 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { ArgFlags.reserve(CS.arg_size()); for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i) { - unsigned Arg = getRegForValue(*i); - if (Arg == 0) - return false; + // If we're lowering a mem intrinsic instead of a regular call, skip the + // last two arguments, which should not be passed to the underlying functions. + if (MemIntName && e-i <= 2) + break; + Value *ArgVal = *i; ISD::ArgFlagsTy Flags; unsigned AttrInd = i - CS.arg_begin() + 1; if (CS.paramHasAttr(AttrInd, Attribute::SExt)) @@ -1503,34 +1544,83 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { if (CS.paramHasAttr(AttrInd, Attribute::ZExt)) Flags.setZExt(); - // FIXME: Only handle *easy* calls for now. - if (CS.paramHasAttr(AttrInd, Attribute::InReg) || - CS.paramHasAttr(AttrInd, Attribute::StructRet) || - CS.paramHasAttr(AttrInd, Attribute::Nest) || - CS.paramHasAttr(AttrInd, Attribute::ByVal)) - return false; + if (CS.paramHasAttr(AttrInd, Attribute::ByVal)) { + const PointerType *Ty = cast<PointerType>(ArgVal->getType()); + const Type *ElementTy = Ty->getElementType(); + unsigned FrameSize = TD.getTypeAllocSize(ElementTy); + unsigned FrameAlign = CS.getParamAlignment(AttrInd); + if (!FrameAlign) + FrameAlign = TLI.getByValTypeAlignment(ElementTy); + Flags.setByVal(); + Flags.setByValSize(FrameSize); + Flags.setByValAlign(FrameAlign); + if (!IsMemcpySmall(FrameSize)) + return false; + } + + if (CS.paramHasAttr(AttrInd, Attribute::InReg)) + Flags.setInReg(); + if (CS.paramHasAttr(AttrInd, Attribute::Nest)) + Flags.setNest(); + + // If this is an i1/i8/i16 argument, promote to i32 to avoid an extra + // instruction. This is safe because it is common to all fastisel supported + // calling conventions on x86. + if (ConstantInt *CI = dyn_cast<ConstantInt>(ArgVal)) { + if (CI->getBitWidth() == 1 || CI->getBitWidth() == 8 || + CI->getBitWidth() == 16) { + if (Flags.isSExt()) + ArgVal = ConstantExpr::getSExt(CI,Type::getInt32Ty(CI->getContext())); + else + ArgVal = ConstantExpr::getZExt(CI,Type::getInt32Ty(CI->getContext())); + } + } + + unsigned ArgReg; - const Type *ArgTy = (*i)->getType(); + // Passing bools around ends up doing a trunc to i1 and passing it. + // Codegen this as an argument + "and 1".
+ if (ArgVal->getType()->isIntegerTy(1) && isa<TruncInst>(ArgVal) && + cast<TruncInst>(ArgVal)->getParent() == I->getParent() && + ArgVal->hasOneUse()) { + ArgVal = cast<TruncInst>(ArgVal)->getOperand(0); + ArgReg = getRegForValue(ArgVal); + if (ArgReg == 0) return false; + + MVT ArgVT; + if (!isTypeLegal(ArgVal->getType(), ArgVT)) return false; + + ArgReg = FastEmit_ri(ArgVT, ArgVT, ISD::AND, ArgReg, + ArgVal->hasOneUse(), 1); + } else { + ArgReg = getRegForValue(ArgVal); + } + + if (ArgReg == 0) return false; + + const Type *ArgTy = ArgVal->getType(); MVT ArgVT; if (!isTypeLegal(ArgTy, ArgVT)) return false; + if (ArgVT == MVT::x86mmx) + return false; unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy); Flags.setOrigAlign(OriginalAlignment); - Args.push_back(Arg); - ArgVals.push_back(*i); + Args.push_back(ArgReg); + ArgVals.push_back(ArgVal); ArgVTs.push_back(ArgVT); ArgFlags.push_back(Flags); } // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CC, false, TM, ArgLocs, I->getParent()->getContext()); + CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, ArgLocs, + I->getParent()->getContext()); // Allocate shadow area for Win64 - if (Subtarget->isTargetWin64()) { + if (Subtarget->isTargetWin64()) CCInfo.AllocateStack(32, 8); - } CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_X86); @@ -1555,6 +1645,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: { + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), Arg, ArgVT, Arg); assert(Emitted && "Failed to emit a sext!"); (void)Emitted; @@ -1562,6 +1654,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { break; } case CCValAssign::ZExt: { + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), Arg, ArgVT, Arg); assert(Emitted && "Failed to emit a zext!"); (void)Emitted; @@ -1569,9 +1663,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { break; } case CCValAssign::AExt: { - // We don't handle MMX parameters yet. - if (VA.getLocVT().isVector() && VA.getLocVT().getSizeInBits() == 128) - return false; + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), Arg, ArgVT, Arg); if (!Emitted) @@ -1605,14 +1698,21 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { AM.Base.Reg = StackPtr; AM.Disp = LocMemOffset; const Value *ArgVal = ArgVals[VA.getValNo()]; - - // If this is a really simple value, emit this with the Value* version of - // X86FastEmitStore. If it isn't simple, we don't want to do this, as it - // can cause us to reevaluate the argument. - if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) + ISD::ArgFlagsTy Flags = ArgFlags[VA.getValNo()]; + + if (Flags.isByVal()) { + X86AddressMode SrcAM; + SrcAM.Base.Reg = Arg; + bool Res = TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()); + assert(Res && "memcpy length already checked!"); (void)Res; + } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) { + // If this is a really simple value, emit this with the Value* version + // of X86FastEmitStore. If it isn't simple, we don't want to do this, + // as it can cause us to reevaluate the argument.
X86FastEmitStore(ArgVT, ArgVal, AM); - else + } else { X86FastEmitStore(ArgVT, Arg, AM); + } } } @@ -1624,6 +1724,17 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { X86::EBX).addReg(Base); } + if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) { + // Count the number of XMM registers allocated. + static const unsigned XMMArgRegs[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; + unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::MOV8ri), + X86::AL).addImm(NumXMMRegs); + } + // Issue the call. MachineInstrBuilder MIB; if (CalleeOp) { @@ -1662,7 +1773,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && (GV->isDeclaration() || GV->isWeakForLinker()) && - Subtarget->getDarwinVers() < 9) { + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. @@ -1670,80 +1782,100 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { } - MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc)) - .addGlobalAddress(GV, 0, OpFlags); + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc)); + if (MemIntName) + MIB.addExternalSymbol(MemIntName, OpFlags); + else + MIB.addGlobalAddress(GV, 0, OpFlags); } // Add an implicit use GOT pointer in EBX. if (Subtarget->isPICStyleGOT()) MIB.addReg(X86::EBX); + if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) + MIB.addReg(X86::AL); + // Add implicit physical register uses to the call. for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) MIB.addReg(RegArgs[i]); // Issue CALLSEQ_END unsigned AdjStackUp = TM.getRegisterInfo()->getCallFrameDestroyOpcode(); + unsigned NumBytesCallee = 0; + if (!Subtarget->is64Bit() && CS.paramHasAttr(1, Attribute::StructRet)) + NumBytesCallee = 4; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackUp)) - .addImm(NumBytes).addImm(0); + .addImm(NumBytes).addImm(NumBytesCallee); + + // Build info for return calling conv lowering code. + // FIXME: This is practically a copy-paste from TargetLowering::LowerCallTo. + SmallVector<ISD::InputArg, 32> Ins; + SmallVector<EVT, 4> RetTys; + ComputeValueVTs(TLI, I->getType(), RetTys); + for (unsigned i = 0, e = RetTys.size(); i != e; ++i) { + EVT VT = RetTys[i]; + EVT RegisterVT = TLI.getRegisterType(I->getParent()->getContext(), VT); + unsigned NumRegs = TLI.getNumRegisters(I->getParent()->getContext(), VT); + for (unsigned j = 0; j != NumRegs; ++j) { + ISD::InputArg MyFlags; + MyFlags.VT = RegisterVT.getSimpleVT(); + MyFlags.Used = !CS.getInstruction()->use_empty(); + if (CS.paramHasAttr(0, Attribute::SExt)) + MyFlags.Flags.setSExt(); + if (CS.paramHasAttr(0, Attribute::ZExt)) + MyFlags.Flags.setZExt(); + if (CS.paramHasAttr(0, Attribute::InReg)) + MyFlags.Flags.setInReg(); + Ins.push_back(MyFlags); + } + } - // Now handle call return value (if any). + // Now handle call return values. SmallVector<unsigned, 4> UsedRegs; - if (RetVT != MVT::isVoid) { - SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CC, false, TM, RVLocs, I->getParent()->getContext()); - CCInfo.AnalyzeCallResult(RetVT, RetCC_X86); - - // Copy all of the result registers out of their specified physreg. 
- assert(RVLocs.size() == 1 && "Can't handle multi-value calls!"); - EVT CopyVT = RVLocs[0].getValVT(); - TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT); + SmallVector<CCValAssign, 16> RVLocs; + CCState CCRetInfo(CC, false, *FuncInfo.MF, TM, RVLocs, + I->getParent()->getContext()); + unsigned ResultReg = FuncInfo.CreateRegs(I->getType()); + CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86); + for (unsigned i = 0; i != RVLocs.size(); ++i) { + EVT CopyVT = RVLocs[i].getValVT(); + unsigned CopyReg = ResultReg + i; // If this is a call to a function that returns an fp value on the x87 fp // stack, but where we prefer to use the value in xmm registers, copy it // out as F80 and use a truncate to move it from fp stack reg to xmm reg. - if ((RVLocs[0].getLocReg() == X86::ST0 || - RVLocs[0].getLocReg() == X86::ST1) && + if ((RVLocs[i].getLocReg() == X86::ST0 || + RVLocs[i].getLocReg() == X86::ST1) && isScalarFPTypeInSSEReg(RVLocs[0].getValVT())) { CopyVT = MVT::f80; - DstRC = X86::RFP80RegisterClass; + CopyReg = createResultReg(X86::RFP80RegisterClass); } - unsigned ResultReg = createResultReg(DstRC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), - ResultReg).addReg(RVLocs[0].getLocReg()); - UsedRegs.push_back(RVLocs[0].getLocReg()); + CopyReg).addReg(RVLocs[i].getLocReg()); + UsedRegs.push_back(RVLocs[i].getLocReg()); - if (CopyVT != RVLocs[0].getValVT()) { + if (CopyVT != RVLocs[i].getValVT()) { // Round the F80 the right size, which also moves to the appropriate xmm // register. This is accomplished by storing the F80 value in memory and // then loading it back. Ewww... - EVT ResVT = RVLocs[0].getValVT(); + EVT ResVT = RVLocs[i].getValVT(); unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64; unsigned MemSize = ResVT.getSizeInBits()/8; int FI = MFI.CreateStackObject(MemSize, MemSize, false); addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc)), FI) - .addReg(ResultReg); - DstRC = ResVT == MVT::f32 - ? X86::FR32RegisterClass : X86::FR64RegisterClass; + .addReg(CopyReg); Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm; - ResultReg = createResultReg(DstRC); addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(Opc), ResultReg), FI); - } - - if (AndToI1) { - // Mask out all but lowest bit for some call which produces an i1. - unsigned AndResult = createResultReg(X86::GR8RegisterClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(X86::AND8ri), AndResult).addReg(ResultReg).addImm(1); - ResultReg = AndResult; + TII.get(Opc), ResultReg + i), FI); } - - UpdateValueMap(I, ResultReg); } + if (RVLocs.size()) + UpdateValueMap(I, ResultReg, RVLocs.size()); + // Set all unused physreg defs as dead. static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); @@ -1782,8 +1914,6 @@ X86FastISel::TargetSelectInstruction(const Instruction *I) { return X86SelectFPExt(I); case Instruction::FPTrunc: return X86SelectFPTrunc(I); - case Instruction::ExtractValue: - return X86SelectExtractValue(I); case Instruction::IntToPtr: // Deliberate fall-through. case Instruction::PtrToInt: { EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); @@ -1856,10 +1986,13 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { if (isa<GlobalValue>(C)) { X86AddressMode AM; if (X86SelectAddress(C, AM)) { - if (TLI.getPointerTy() == MVT::i32) - Opc = X86::LEA32r; - else - Opc = X86::LEA64r; + // If the expression is just a basereg, then we're done, otherwise we need + // to emit an LEA. 
+ if (AM.BaseType == X86AddressMode::RegBase && + AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == 0) + return AM.Base.Reg; + + Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; unsigned ResultReg = createResultReg(RC); addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg), AM); @@ -1921,6 +2054,45 @@ unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) { return ResultReg; } +unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { + MVT VT; + if (!isTypeLegal(CF->getType(), VT)) + return false; + + // Get opcode and regclass for the given zero. + unsigned Opc = 0; + const TargetRegisterClass *RC = NULL; + switch (VT.SimpleTy) { + default: return false; + case MVT::f32: + if (Subtarget->hasSSE1()) { + Opc = X86::FsFLD0SS; + RC = X86::FR32RegisterClass; + } else { + Opc = X86::LD_Fp032; + RC = X86::RFP32RegisterClass; + } + break; + case MVT::f64: + if (Subtarget->hasSSE2()) { + Opc = X86::FsFLD0SD; + RC = X86::FR64RegisterClass; + } else { + Opc = X86::LD_Fp064; + RC = X86::RFP64RegisterClass; + } + break; + case MVT::f80: + // No f80 support yet. + return false; + } + + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg); + return ResultReg; +} + + /// TryToFoldLoad - The specified machine instr operand is a vreg, and that /// vreg is being provided by the specified load instruction. If possible, /// try to fold the load as an operand to the instruction, returning true if diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 3aaa693..325d061 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -1307,7 +1307,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { // set up by FpSET_ST0, and our StackTop is off by one because of it. unsigned Op0 = getFPReg(MI->getOperand(0)); // Restore the actual StackTop from before Fp_SET_ST0. - // Note we can't handle Fp_SET_ST1 without a preceeding Fp_SET_ST0, and we + // Note we can't handle Fp_SET_ST1 without a preceding Fp_SET_ST0, and we // are not enforcing the constraint. ++StackTop; unsigned RegOnTop = getStackEntry(0); // This reg must remain in st(0). diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 071fbe0..cd4e954 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -1,4 +1,4 @@ -//=======- X86FrameLowering.cpp - X86 Frame Information ------------*- C++ -*-====// +//=======- X86FrameLowering.cpp - X86 Frame Information --------*- C++ -*-====// // // The LLVM Compiler Infrastructure // @@ -22,6 +22,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Support/CommandLine.h" @@ -159,8 +160,10 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, Opc = isSub ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r) : (Is64Bit ? 
X86::POP64r : X86::POP32r); - BuildMI(MBB, MBBI, DL, TII.get(Opc)) + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc)) .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub)); + if (isSub) + MI->setFlag(MachineInstr::FrameSetup); Offset -= ThisVal; continue; } @@ -170,6 +173,8 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(StackPtr) .addImm(ThisVal); + if (isSub) + MI->setFlag(MachineInstr::FrameSetup); MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. Offset -= ThisVal; } @@ -296,7 +301,7 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF, // FIXME: This is dirty hack. The code itself is pretty mess right now. // It should be rewritten from scratch and generalized sometimes. - // Determine maximum offset (minumum due to stack growth). + // Determine maximum offset (minimum due to stack growth). int64_t MaxOffset = 0; for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(), E = CSI.end(); I != E; ++I) @@ -354,7 +359,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { MachineModuleInfo &MMI = MF.getMMI(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); bool needsFrameMoves = MMI.hasDebugInfo() || - !Fn->doesNotThrow() || UnwindTablesMandatory; + Fn->needsUnwindTableEntry(); uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. bool HasFP = hasFP(MF); @@ -408,7 +413,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { TII.get(getSUBriOpcode(Is64Bit, -TailCallReturnAddrDelta)), StackPtr) .addReg(StackPtr) - .addImm(-TailCallReturnAddrDelta); + .addImm(-TailCallReturnAddrDelta) + .setMIFlag(MachineInstr::FrameSetup); MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. } @@ -446,7 +452,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Save EBP/RBP into the appropriate stack slot. BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) - .addReg(FramePtr, RegState::Kill); + .addReg(FramePtr, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); if (needsFrameMoves) { // Mark the place where EBP/RBP was saved. @@ -473,7 +480,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Update EBP with the new base value... BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr) - .addReg(StackPtr); + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); if (needsFrameMoves) { // Mark effective beginning of when frame pointer becomes valid. @@ -615,7 +623,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, TII, *RegInfo); - if ((NumBytes || PushedRegs) && needsFrameMoves) { + if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) { // Mark end of stack pointer adjustment. 
MCSymbol *Label = MMI.getContext().CreateTempSymbol(); BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(Label); @@ -641,7 +649,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { } void X86FrameLowering::emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const { + MachineBasicBlock &MBB) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); @@ -785,7 +793,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, assert(Offset >= 0 && "Offset should never be negative"); if (Offset) { - // Check for possible merge with preceeding ADD instruction. + // Check for possible merge with preceding ADD instruction. Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true); emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, TII, *RegInfo); } @@ -829,7 +837,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, int delta = -1*X86FI->getTCReturnAddrDelta(); MBBI = MBB.getLastNonDebugInstr(); - // Check for possible merge with preceeding ADD instruction. + // Check for possible merge with preceding ADD instruction. delta += mergeSPUpdates(MBB, MBBI, StackPtr, true); emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, TII, *RegInfo); } @@ -918,7 +926,8 @@ bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, // X86RegisterInfo::emitPrologue will handle spilling of frame register. continue; CalleeFrameSize += SlotSize; - BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill); + BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); } X86FI->setCalleeSavedFrameSize(CalleeFrameSize); diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 9b0ec6e..1fcc274 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -189,6 +189,7 @@ namespace { SDNode *Select(SDNode *N); SDNode *SelectAtomic64(SDNode *Node, unsigned Opc); SDNode *SelectAtomicLoadAdd(SDNode *Node, EVT NVT); + SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT); bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); bool MatchWrapper(SDValue N, X86ISelAddressMode &AM); @@ -1329,6 +1330,8 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) { return ResNode; } +// FIXME: Figure out some way to unify this with the 'or' and other code +// below. 
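Before the two selection routines here, it may help to see the source-level shape they both target: an atomic read-modify-write whose fetched result is never used. A minimal C++ sketch, assuming the GCC/Clang __sync builtins (set_flag is an illustrative name, not from the patch):

// With the result unused, selection can use the memory-destination form
// ("lock orq $1, (%rdi)") instead of a cmpxchg or xadd loop.
void set_flag(volatile long *flags) {
  __sync_fetch_and_or(flags, 1);
}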
SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) { if (Node->hasAnyUseOfValue(0)) return 0; @@ -1479,6 +1482,158 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) { } } +enum AtomicOpc { + OR, + AND, + XOR, + AtomicOpcEnd +}; + +enum AtomicSz { + ConstantI8, + I8, + SextConstantI16, + ConstantI16, + I16, + SextConstantI32, + ConstantI32, + I32, + SextConstantI64, + ConstantI64, + I64, + AtomicSzEnd +}; + +static const unsigned int AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = { + { + X86::LOCK_OR8mi, + X86::LOCK_OR8mr, + X86::LOCK_OR16mi8, + X86::LOCK_OR16mi, + X86::LOCK_OR16mr, + X86::LOCK_OR32mi8, + X86::LOCK_OR32mi, + X86::LOCK_OR32mr, + X86::LOCK_OR64mi8, + X86::LOCK_OR64mi32, + X86::LOCK_OR64mr + }, + { + X86::LOCK_AND8mi, + X86::LOCK_AND8mr, + X86::LOCK_AND16mi8, + X86::LOCK_AND16mi, + X86::LOCK_AND16mr, + X86::LOCK_AND32mi8, + X86::LOCK_AND32mi, + X86::LOCK_AND32mr, + X86::LOCK_AND64mi8, + X86::LOCK_AND64mi32, + X86::LOCK_AND64mr + }, + { + X86::LOCK_XOR8mi, + X86::LOCK_XOR8mr, + X86::LOCK_XOR16mi8, + X86::LOCK_XOR16mi, + X86::LOCK_XOR16mr, + X86::LOCK_XOR32mi8, + X86::LOCK_XOR32mi, + X86::LOCK_XOR32mr, + X86::LOCK_XOR64mi8, + X86::LOCK_XOR64mi32, + X86::LOCK_XOR64mr + } +}; + +SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) { + if (Node->hasAnyUseOfValue(0)) + return 0; + + // Optimize common patterns for __sync_or_and_fetch and similar arith + // operations where the result is not used. This allows us to use the "lock" + // version of the arithmetic instruction. + // FIXME: Same as for 'add' and 'sub', try to merge those down here. + SDValue Chain = Node->getOperand(0); + SDValue Ptr = Node->getOperand(1); + SDValue Val = Node->getOperand(2); + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) + return 0; + + // Which index into the table. 
+ enum AtomicOpc Op; + switch (Node->getOpcode()) { + case ISD::ATOMIC_LOAD_OR: + Op = OR; + break; + case ISD::ATOMIC_LOAD_AND: + Op = AND; + break; + case ISD::ATOMIC_LOAD_XOR: + Op = XOR; + break; + default: + return 0; + } + + bool isCN = false; + ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val); + if (CN) { + isCN = true; + Val = CurDAG->getTargetConstant(CN->getSExtValue(), NVT); + } + + unsigned Opc = 0; + switch (NVT.getSimpleVT().SimpleTy) { + default: return 0; + case MVT::i8: + if (isCN) + Opc = AtomicOpcTbl[Op][ConstantI8]; + else + Opc = AtomicOpcTbl[Op][I8]; + break; + case MVT::i16: + if (isCN) { + if (immSext8(Val.getNode())) + Opc = AtomicOpcTbl[Op][SextConstantI16]; + else + Opc = AtomicOpcTbl[Op][ConstantI16]; + } else + Opc = AtomicOpcTbl[Op][I16]; + break; + case MVT::i32: + if (isCN) { + if (immSext8(Val.getNode())) + Opc = AtomicOpcTbl[Op][SextConstantI32]; + else + Opc = AtomicOpcTbl[Op][ConstantI32]; + } else + Opc = AtomicOpcTbl[Op][I32]; + break; + case MVT::i64: + if (isCN) { + if (immSext8(Val.getNode())) + Opc = AtomicOpcTbl[Op][SextConstantI64]; + else if (i64immSExt32(Val.getNode())) + Opc = AtomicOpcTbl[Op][ConstantI64]; + } else + Opc = AtomicOpcTbl[Op][I64]; + break; + } + + DebugLoc dl = Node->getDebugLoc(); + SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, NVT), 0); + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain }; + SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 7), 0); + cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1); + SDValue RetVals[] = { Undef, Ret }; + return CurDAG->getMergeValues(RetVals, 2, dl).getNode(); +} + /// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has /// any uses which require the SF or OF bits to be accurate. static bool HasNoSignedComparisonUses(SDNode *N) { @@ -1580,6 +1735,89 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { return RetVal; break; } + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: { + SDNode *RetVal = SelectAtomicLoadArith(Node, NVT); + if (RetVal) + return RetVal; + break; + } + case ISD::AND: + case ISD::OR: + case ISD::XOR: { + // For operations of the form (x << C1) op C2, check if we can use a smaller + // encoding for C2 by transforming it into (x op (C2>>C1)) << C1. + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse()) + break; + + // i8 is unshrinkable, i16 should be promoted to i32. + if (NVT != MVT::i32 && NVT != MVT::i64) + break; + + ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1); + ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1)); + if (!Cst || !ShlCst) + break; + + int64_t Val = Cst->getSExtValue(); + uint64_t ShlVal = ShlCst->getZExtValue(); + + // Make sure that we don't change the operation by removing bits. + // This only matters for OR and XOR, AND is unaffected. + if (Opcode != ISD::AND && ((Val >> ShlVal) << ShlVal) != Val) + break; + + unsigned ShlOp, Op = 0; + EVT CstVT = NVT; + + // Check the minimum bitwidth for the new constant. + // TODO: AND32ri is the same as AND64ri32 with zext imm. + // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr + // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32. 
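// A worked instance of the transform described above, with illustrative
// constants: for (x << 8) | 0x4000, no set bit of 0x4000 lies below bit 8,
// so the expression can become (x | 0x40) << 8; 0x40 fits the sign-extended
// imm8 form (OR32ri8) where 0x4000 needed a full 32-bit immediate.
static_assert(((0x4000 >> 8) << 8) == 0x4000,
              "the guard above: shifting C2 down and back up loses no bits");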
+      if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal))
+        CstVT = MVT::i8;
+      else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal))
+        CstVT = MVT::i32;
+
+      // Bail if there is no smaller encoding.
+      if (NVT == CstVT)
+        break;
+
+      switch (NVT.getSimpleVT().SimpleTy) {
+      default: llvm_unreachable("Unsupported VT!");
+      case MVT::i32:
+        assert(CstVT == MVT::i8);
+        ShlOp = X86::SHL32ri;
+
+        switch (Opcode) {
+        case ISD::AND: Op = X86::AND32ri8; break;
+        case ISD::OR:  Op = X86::OR32ri8; break;
+        case ISD::XOR: Op = X86::XOR32ri8; break;
+        }
+        break;
+      case MVT::i64:
+        assert(CstVT == MVT::i8 || CstVT == MVT::i32);
+        ShlOp = X86::SHL64ri;
+
+        switch (Opcode) {
+        case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break;
+        case ISD::OR:  Op = CstVT==MVT::i8? X86::OR64ri8 : X86::OR64ri32; break;
+        case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break;
+        }
+        break;
+      }
+
+      // Emit the smaller op and the shift.
+      SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, CstVT);
+      SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst);
+      return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
+                                  getI8Imm(ShlVal));
+      break;
+    }
   case X86ISD::UMUL: {
     SDValue N0 = Node->getOperand(0);
     SDValue N1 = Node->getOperand(1);
@@ -1768,17 +2006,17 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
       if (TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
         SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
         Move =
-          SDValue(CurDAG->getMachineNode(X86::MOVZX16rm8, dl, MVT::i16,
+          SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
                                          MVT::Other, Ops,
                                          array_lengthof(Ops)), 0);
         Chain = Move.getValue(1);
         ReplaceUses(N0.getValue(1), Chain);
       } else {
         Move =
-          SDValue(CurDAG->getMachineNode(X86::MOVZX16rr8, dl, MVT::i16, N0),0);
+          SDValue(CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0),0);
         Chain = CurDAG->getEntryNode();
       }
-      Chain  = CurDAG->getCopyToReg(Chain, dl, X86::AX, Move, SDValue());
+      Chain  = CurDAG->getCopyToReg(Chain, dl, X86::EAX, Move, SDValue());
       InFlag = Chain.getValue(1);
     } else {
       InFlag =
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index cd1d201..1cdf2b6 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -222,7 +222,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
 
   // X86 is weird, it always uses i8 for shift amounts and setcc results.
   setBooleanContents(ZeroOrOneBooleanContent);
-  
+
   // For 64-bit since we have so many registers use the ILP scheduler, for
   // 32-bit code use the register pressure specific scheduling.
   if (Subtarget->is64Bit())
@@ -574,6 +574,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 
+    // Lower this to FGETSIGNx86 plus an AND.
+    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
+    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
+
     // We don't support sin/cos/fmod
     setOperationAction(ISD::FSIN , MVT::f64, Expand);
     setOperationAction(ISD::FCOS , MVT::f64, Expand);
@@ -927,7 +931,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     // Can turn SHL into an integer multiply.
     setOperationAction(ISD::SHL, MVT::v4i32, Custom);
     setOperationAction(ISD::SHL, MVT::v16i8, Custom);
-    setOperationAction(ISD::SRL, MVT::v4i32, Legal);
 
     // i8 and i16 vectors are custom , because the source register and source
    // memory operand types are not the same width.  f32 vectors are custom
@@ -949,6 +952,19 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     }
   }
 
+  if (Subtarget->hasSSE2()) {
+    setOperationAction(ISD::SRL, MVT::v2i64, Custom);
+    setOperationAction(ISD::SRL, MVT::v4i32, Custom);
+    setOperationAction(ISD::SRL, MVT::v16i8, Custom);
+
+    setOperationAction(ISD::SHL, MVT::v2i64, Custom);
+    setOperationAction(ISD::SHL, MVT::v4i32, Custom);
+    setOperationAction(ISD::SHL, MVT::v8i16, Custom);
+
+    setOperationAction(ISD::SRA, MVT::v4i32, Custom);
+    setOperationAction(ISD::SRA, MVT::v8i16, Custom);
+  }
+
   if (Subtarget->hasSSE42())
     setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
 
@@ -1081,6 +1097,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setTargetDAGCombine(ISD::SUB);
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::SINT_TO_FP);
   if (Subtarget->is64Bit())
     setTargetDAGCombine(ISD::MUL);
 
@@ -1096,6 +1113,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
   setPrefLoopAlignment(16);
   benefitFromCodePlacementOpt = true;
+
+  setPrefFunctionAlignment(4);
 }
 
 
@@ -1247,11 +1266,6 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
 }
 
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
-  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
-}
-
 // FIXME: Why this routine is here? Move to RegInfo!
 std::pair<const TargetRegisterClass*, uint8_t>
 X86TargetLowering::findRepresentativeClass(EVT VT) const{
@@ -1306,11 +1320,12 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
 #include "X86GenCallingConv.inc"
 
 bool
-X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
+X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+                                  MachineFunction &MF, bool isVarArg,
                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                                   LLVMContext &Context) const {
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
                  RVLocs, Context);
   return CCInfo.CheckReturn(Outs, RetCC_X86);
 }
@@ -1325,7 +1340,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
 
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
                  RVLocs, *DAG.getContext());
   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
 
@@ -1476,8 +1491,8 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
   bool Is64Bit = Subtarget->is64Bit();
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
 
   // Copy all of the result registers out of their specified physreg.
@@ -1518,20 +1533,6 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                         // This truncation won't change the value.
                         DAG.getIntPtrConstant(1));
-    } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
-      // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
- if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { - Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), - MVT::v2i64, InFlag).getValue(1); - Val = Chain.getValue(0); - Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, - Val, DAG.getConstant(0, MVT::i64)); - } else { - Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), - MVT::i64, InFlag).getValue(1); - Val = Chain.getValue(0); - } - Val = DAG.getNode(ISD::BITCAST, dl, CopyVT, Val); } else { Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag).getValue(1); @@ -1680,7 +1681,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 @@ -1952,7 +1953,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, return SDValue(OutRetAddr.getNode(), 1); } -/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call +/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call /// optimization is performed and it is required (FPDiff!=0). static SDValue EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, @@ -2007,7 +2008,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 @@ -2043,7 +2044,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); SDValue RetAddrFrIdx; - // Load return adress for tail calls. + // Load return address for tail calls. if (isTailCall && FPDiff) Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit, FPDiff, dl); @@ -2200,7 +2201,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, SmallVector<SDValue, 8> MemOpChains2; SDValue FIN; int FI = 0; - // Do not flag preceeding copytoreg stuff together with the following stuff. + // Do not flag preceding copytoreg stuff together with the following stuff. InFlag = SDValue(); if (GuaranteedTailCallOpt) { for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { @@ -2270,6 +2271,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, const GlobalValue *GV = G->getGlobal(); if (!GV->hasDLLImportLinkage()) { unsigned char OpFlags = 0; + bool ExtraLoad = false; + unsigned WrapperKind = ISD::DELETED_NODE; // On ELF targets, in both X86-64 and X86-32 mode, direct calls to // external symbols most go through the PLT in PIC mode. If the symbol @@ -2281,15 +2284,34 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && (GV->isDeclaration() || GV->isWeakForLinker()) && - Subtarget->getDarwinVers() < 9) { + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. 
OpFlags = X86II::MO_DARWIN_STUB; + } else if (Subtarget->isPICStyleRIPRel() && + isa<Function>(GV) && + cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) { + // If the function is marked as non-lazy, generate an indirect call + // which loads from the GOT directly. This avoids runtime overhead + // at the cost of eager binding (and one extra byte of encoding). + OpFlags = X86II::MO_GOTPCREL; + WrapperKind = X86ISD::WrapperRIP; + ExtraLoad = true; } Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), G->getOffset(), OpFlags); + + // Add a wrapper if needed. + if (WrapperKind != ISD::DELETED_NODE) + Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee); + // Add extra indirection if needed. + if (ExtraLoad) + Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, + MachinePointerInfo::getGOT(), + false, false, 0); } } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { unsigned char OpFlags = 0; @@ -2300,7 +2322,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, getTargetMachine().getRelocationModel() == Reloc::PIC_) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && - Subtarget->getDarwinVers() < 9) { + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. @@ -2528,16 +2551,30 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (RegInfo->needsStackRealignment(MF)) return false; - // Do not sibcall optimize vararg calls unless the call site is not passing - // any arguments. - if (isVarArg && !Outs.empty()) - return false; - // Also avoid sibcall optimization if either caller or callee uses struct // return semantics. if (isCalleeStructRet || isCallerStructRet) return false; + // Do not sibcall optimize vararg calls unless all arguments are passed via + // registers. + if (isVarArg && !Outs.empty()) { + + // Optimizing for varargs on Win64 is unlikely to be safe without + // additional testing. + if (Subtarget->isTargetWin64()) + return false; + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + CCInfo.AnalyzeCallOperands(Outs, CC_X86); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) + if (!ArgLocs[i].isRegLoc()) + return false; + } + // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. // Therefore if it's not used by the call it is not safe to optimize this into // a sibcall. @@ -2550,8 +2587,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, } if (Unused) { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CalleeCC, false, getTargetMachine(), - RVLocs, *DAG.getContext()); + CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; @@ -2564,13 +2601,13 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // results are returned in the same way as what the caller expects. 
if (!CCMatch) { SmallVector<CCValAssign, 16> RVLocs1; - CCState CCInfo1(CalleeCC, false, getTargetMachine(), - RVLocs1, *DAG.getContext()); + CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs1, *DAG.getContext()); CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); SmallVector<CCValAssign, 16> RVLocs2; - CCState CCInfo2(CallerCC, false, getTargetMachine(), - RVLocs2, *DAG.getContext()); + CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs2, *DAG.getContext()); CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); if (RVLocs1.size() != RVLocs2.size()) @@ -2596,8 +2633,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Check if stack adjustment is needed. For now, do not do this if any // argument is passed on the stack. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), - ArgLocs, *DAG.getContext()); + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (Subtarget->isTargetWin64()) { @@ -4018,7 +4055,7 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, /// getNumOfConsecutiveZeros - Return the number of elements of a vector /// shuffle operation which come from a consecutively from a zero. The -/// search can start in two diferent directions, from left or right. +/// search can start in two different directions, from left or right. static unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems, bool ZerosFromLeft, SelectionDAG &DAG) { @@ -6617,9 +6654,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { } -/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and +/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and /// take a 2 x i32 value to shift plus a shift amount. -SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); @@ -6708,12 +6745,18 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, unsigned ByteSize = SrcVT.getSizeInBits()/8; - int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); - MachineMemOperand *MMO = - DAG.getMachineFunction() - .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOLoad, ByteSize, ByteSize); - + FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); + MachineMemOperand *MMO; + if (FI) { + int SSFI = FI->getIndex(); + MMO = + DAG.getMachineFunction() + .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), + MachineMemOperand::MOLoad, ByteSize, ByteSize); + } else { + MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); + StackSlot = StackSlot.getOperand(1); + } SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL, @@ -7204,6 +7247,17 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); } +SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const { + SDValue N0 = Op.getOperand(0); + DebugLoc dl = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + + // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). 
+ SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, + DAG.getConstant(1, VT)); + return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); +} + /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, @@ -8779,16 +8833,71 @@ SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { return Res; } -SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); LLVMContext *Context = DAG.getContext(); - assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later"); + // Must have SSE2. + if (!Subtarget->hasSSE2()) return SDValue(); + + // Optimize shl/srl/sra with constant shift amount. + if (isSplatVector(Amt.getNode())) { + SDValue SclrAmt = Amt->getOperand(0); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { + uint64_t ShiftAmt = C->getZExtValue(); + + if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + } + } + + // Lower SHL with variable shift amount. + // Cannot lower SHL without SSE4.1 or later. 
+  if (!Subtarget->hasSSE41()) return SDValue();
 
-  if (VT == MVT::v4i32) {
+  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
     Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                      DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
                      Op.getOperand(1), DAG.getConstant(23, MVT::i32));
@@ -8807,7 +8916,7 @@ SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const {
     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
   }
-  if (VT == MVT::v16i8) {
+  if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
     // a = a << 5;
     Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                      DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
@@ -9112,7 +9221,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
   case ISD::SHL_PARTS:
   case ISD::SRA_PARTS:
-  case ISD::SRL_PARTS:          return LowerShift(Op, DAG);
+  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
@@ -9120,6 +9229,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FABS:               return LowerFABS(Op, DAG);
   case ISD::FNEG:               return LowerFNEG(Op, DAG);
   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
+  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
   case ISD::SETCC:              return LowerSETCC(Op, DAG);
   case ISD::VSETCC:             return LowerVSETCC(Op, DAG);
   case ISD::SELECT:             return LowerSELECT(Op, DAG);
@@ -9140,7 +9250,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
   case ISD::MUL:                return LowerMUL_V2I64(Op, DAG);
-  case ISD::SHL:                return LowerSHL(Op, DAG);
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::SHL:                return LowerShift(Op, DAG);
   case ISD::SADDO:
   case ISD::UADDO:
   case ISD::SSUBO:
@@ -9307,6 +9419,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
   case X86ISD::SETCC:              return "X86ISD::SETCC";
   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
+  case X86ISD::FSETCCsd:           return "X86ISD::FSETCCsd";
+  case X86ISD::FSETCCss:           return "X86ISD::FSETCCss";
   case X86ISD::CMOV:               return "X86ISD::CMOV";
   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
@@ -10984,14 +11098,14 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
        UE = Uses.end(); UI != UE; ++UI) {
     SDNode *Extract = *UI;
 
     // Compute the element's address.
     SDValue Idx = Extract->getOperand(1);
     unsigned EltSize =
         InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
     uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
     SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
 
-    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(),
+    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
                                      StackPtr, OffsetVal);
 
     // Load the scalar.
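To make the new constant-splat shift lowering concrete: a uniform shift of a <4 x i32> value now maps straight to the immediate form of the SSE2 shift (Intrinsic::x86_sse2_pslli_d, i.e. PSLLD by immediate). A sketch in intrinsics of roughly what the code above emits (shl_by_3 is an illustrative name):

#include <emmintrin.h>

__m128i shl_by_3(__m128i v) {
  return _mm_slli_epi32(v, 3); // what (shl X, <3,3,3,3>) becomes
}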
@@ -11264,15 +11378,28 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) return SDValue(); + SDValue FalseOp = N->getOperand(0); + SDValue TrueOp = N->getOperand(1); + X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); + SDValue Cond = N->getOperand(3); + if (CC == X86::COND_E || CC == X86::COND_NE) { + switch (Cond.getOpcode()) { + default: break; + case X86ISD::BSR: + case X86ISD::BSF: + // If operand of BSR / BSF are proven never zero, then ZF cannot be set. + if (DAG.isKnownNeverZero(Cond.getOperand(0))) + return (CC == X86::COND_E) ? FalseOp : TrueOp; + } + } + // If this is a select between two integer constants, try to do some // optimizations. Note that the operands are ordered the opposite of SELECT // operands. - if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) { - if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) { + if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { + if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is // larger than FalseC (the false value). - X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); - if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { CC = X86::GetOppositeBranchCondition(CC); std::swap(TrueC, FalseC); @@ -11282,7 +11409,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, // This is efficient for any integer data type (including i8/i16) and // shift amount. if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { - SDValue Cond = N->getOperand(3); Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, DAG.getConstant(CC, MVT::i8), Cond); @@ -11300,7 +11426,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient // for any integer data type, including i8/i16. if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { - SDValue Cond = N->getOperand(3); Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, DAG.getConstant(CC, MVT::i8), Cond); @@ -11339,7 +11464,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, if (isFastMultiplier) { APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); - SDValue Cond = N->getOperand(3); Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, DAG.getConstant(CC, MVT::i8), Cond); // Zero extend the condition if needed. @@ -11574,12 +11698,94 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, } +// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) +// where both setccs reference the same FP CMP, and rewrite for CMPEQSS +// and friends. Likewise for OR -> CMPNEQSS. +static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + unsigned opcode; + + // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but + // we're requiring SSE2 for both. + if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue CMP0 = N0->getOperand(1); + SDValue CMP1 = N1->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + + // The SETCCs should both refer to the same CMP. 
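// As a sketch of the source this combine improves: an FP equality whose
// result is used as a value rather than branched on, e.g.
//   int is_equal(float a, float b) { return a == b; }
// Ordered equality sets ZF and clears PF, i.e. the (AND (setcc E)
// (setcc NP)) shape matched here, which can then become
// CMPEQSS + MOVD + AND $1 instead of UCOMISS + SETE + SETNP + AND.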
+ if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) + return SDValue(); + + SDValue CMP00 = CMP0->getOperand(0); + SDValue CMP01 = CMP0->getOperand(1); + EVT VT = CMP00.getValueType(); + + if (VT == MVT::f32 || VT == MVT::f64) { + bool ExpectingFlags = false; + // Check for any users that want flags: + for (SDNode::use_iterator UI = N->use_begin(), + UE = N->use_end(); + !ExpectingFlags && UI != UE; ++UI) + switch (UI->getOpcode()) { + default: + case ISD::BR_CC: + case ISD::BRCOND: + case ISD::SELECT: + ExpectingFlags = true; + break; + case ISD::CopyToReg: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + break; + } + + if (!ExpectingFlags) { + enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); + enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); + + if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { + X86::CondCode tmp = cc0; + cc0 = cc1; + cc1 = tmp; + } + + if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || + (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { + bool is64BitFP = (CMP00.getValueType() == MVT::f64); + X86ISD::NodeType NTOperator = is64BitFP ? + X86ISD::FSETCCsd : X86ISD::FSETCCss; + // FIXME: need symbolic constants for these magic numbers. + // See X86ATTInstPrinter.cpp:printSSECC(). + unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; + SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01, + DAG.getConstant(x86cc, MVT::i8)); + SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32, + OnesOrZeroesF); + SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI, + DAG.getConstant(1, MVT::i32)); + SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed); + return OneBitOfTruth; + } + } + } + } + return SDValue(); +} + static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); + SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); + if (R.getNode()) + return R; + // Want to form PANDN nodes, in the hopes of then easily combining them with // OR and AND nodes to form PBLEND/PSIGN. EVT VT = N->getValueType(0); @@ -11609,6 +11815,10 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); + SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); + if (R.getNode()) + return R; + EVT VT = N->getValueType(0); if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64) return SDValue(); @@ -11976,6 +12186,26 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86TargetLowering *XTLI) { + DebugLoc dl = N->getDebugLoc(); + SDValue Op0 = N->getOperand(0); + // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have + // a 32-bit target where SSE doesn't support i64->FP operations. 
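// A sketch of the i386 case this combine targets: SSE has no direct
// i64 -> FP conversion on a 32-bit target, but the x87 FILD instruction
// loads a 64-bit integer directly, so
//   double to_double(const long long *p) { return (double)*p; }
// folds the load and the sint_to_fp into a single "fildll".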
+ if (Op0.getOpcode() == ISD::LOAD) { + LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); + EVT VT = Ld->getValueType(0); + if (!Ld->isVolatile() && !N->getValueType(0).isVector() && + ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && + !XTLI->getSubtarget()->is64Bit() && + !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { + SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG); + DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); + return FILDChain; + } + } + return SDValue(); +} + // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, X86TargetLowering::DAGCombinerInfo &DCI) { @@ -12060,6 +12290,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); + case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); case X86ISD::FXOR: case X86ISD::FOR: return PerformFORCombine(N, DAG); case X86ISD::FAND: return PerformFANDCombine(N, DAG); @@ -12216,7 +12447,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { AsmPieces.clear(); SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. - // FIXME: this should verify that we are targetting a 486 or better. If not, + // FIXME: this should verify that we are targeting a 486 or better. If not, // we will turn this bswap into something that will be lowered to logical ops // instead of emitting the bswap asm. For now, we don't support 486 or lower // so don't worry about this. @@ -12489,12 +12720,16 @@ LowerXConstraint(EVT ConstraintVT) const { /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, - char Constraint, + std::string &Constraint, std::vector<SDValue>&Ops, SelectionDAG &DAG) const { SDValue Result(0, 0); - switch (Constraint) { + // Only support length 1 constraints for now. + if (Constraint.length() > 1) return; + + char ConstraintLetter = Constraint[0]; + switch (ConstraintLetter) { default: break; case 'I': if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { @@ -12686,7 +12921,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, return std::make_pair(0U, X86::GR8RegisterClass); if (VT == MVT::i16) return std::make_pair(0U, X86::GR16RegisterClass); - if (VT == MVT::i32 || !Subtarget->is64Bit()) + if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit()) return std::make_pair(0U, X86::GR32RegisterClass); return std::make_pair(0U, X86::GR64RegisterClass); case 'R': // LEGACY_REGS diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 6301057..d61a125 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -94,6 +94,15 @@ namespace llvm { // one's or all zero's. SETCC_CARRY, // R = carry_bit ? ~0 : 0 + /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD. + /// Operands are two FP values to compare; result is a mask of + /// 0s or 1s. Generally DTRT for C/C++ with NaNs. + FSETCCss, FSETCCsd, + + /// X86 MOVMSK{pd|ps}, extracts sign bits of two or four FP values, + /// result in an integer GPR. Needs masking for scalar result. + FGETSIGNx86, + /// X86 conditional moves. Operand 0 and operand 1 are the two values /// to select from. 
Operand 2 is the condition code, and operand 3 is the /// flag operand produced by a CMP or TEST instruction. It also writes a @@ -592,7 +601,7 @@ namespace llvm { /// true it means one of the asm constraint of the inline asm instruction /// being processed is 'm'. virtual void LowerAsmOperandForConstraint(SDValue Op, - char ConstraintLetter, + std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const; @@ -674,15 +683,15 @@ namespace llvm { /// or null if the target does not support "fast" ISel. virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo) const; - /// getFunctionAlignment - Return the Log2 alignment of this function. - virtual unsigned getFunctionAlignment(const Function *F) const; - /// getStackCookieLocation - Return true if the target stores stack /// protector cookies at a fixed offset in some non-standard address /// space, and populates the address space and offset as /// appropriate. virtual bool getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const; + SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, + SelectionDAG &DAG) const; + protected: std::pair<const TargetRegisterClass*, uint8_t> findRepresentativeClass(EVT VT) const; @@ -773,9 +782,7 @@ namespace llvm { SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const; - SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, - SelectionDAG &DAG) const; + SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBITCAST(SDValue op, SelectionDAG &DAG) const; SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; @@ -786,6 +793,7 @@ namespace llvm { SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerToBT(SDValue And, ISD::CondCode CC, DebugLoc dl, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; @@ -808,7 +816,7 @@ namespace llvm { SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSHL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const; SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const; @@ -850,9 +858,10 @@ namespace llvm { ISD::NodeType ExtendKind) const; virtual bool - CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const; + CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const; void ReplaceATOMIC_BINARY_64(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG, unsigned NewOp) const; diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td index 45d1c6b..dd4f6a5 100644 --- a/lib/Target/X86/X86Instr3DNow.td +++ b/lib/Target/X86/X86Instr3DNow.td @@ -12,66 +12,91 @@ // //===----------------------------------------------------------------------===// -// FIXME: We don't support 
any intrinsics for these instructions yet. +class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat> + : I<o, F, outs, ins, asm, pat>, TB, Requires<[Has3DNow]> { +} -class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, TB, Requires<[Has3DNow]> { +class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat> + : I3DNow<o, F, (outs VR64:$dst), ins, + !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat>, + Has3DNow0F0FOpcode { + // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet. + let isAsmParserOnly = 1; + let Constraints = "$src1 = $dst"; } -class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic> - : I<o, F, (outs VR64:$dst), ins, - !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), []>, - TB, Requires<[Has3DNow]>, Has3DNow0F0FOpcode { +class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat> + : I3DNow<o, F, (outs VR64:$dst), ins, + !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>, + Has3DNow0F0FOpcode { // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet. let isAsmParserOnly = 1; } +multiclass I3DNow_binop_rm<bits<8> opc, string Mn> { + def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, []>; + def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, []>; +} + +multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, string Ver = ""> { + def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>; + def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, + (bitconvert (load_mmx addr:$src2))))]>; +} + +multiclass I3DNow_conv_rm<bits<8> opc, string Mn> { + def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src1), Mn, []>; + def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src1), Mn, []>; +} -let Constraints = "$src1 = $dst" in { - // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic. - // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp. 
-  multiclass I3DNow_binop_rm<bits<8> opc, string Mn> {
-    def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn>;
-    def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn>;
-  }
+multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+  def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>;
+  def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn))
+        (bitconvert (load_mmx addr:$src))))]>;
 }
 
-defm PAVGUSB  : I3DNow_binop_rm<0xBF, "pavgusb">;
-defm PF2ID    : I3DNow_binop_rm<0x1D, "pf2id">;
-defm PFACC    : I3DNow_binop_rm<0xAE, "pfacc">;
-defm PFADD    : I3DNow_binop_rm<0x9E, "pfadd">;
-defm PFCMPEQ  : I3DNow_binop_rm<0xB0, "pfcmpeq">;
-defm PFCMPGE  : I3DNow_binop_rm<0x90, "pfcmpge">;
-defm PFCMPGT  : I3DNow_binop_rm<0xA0, "pfcmpgt">;
-defm PFMAX    : I3DNow_binop_rm<0xA4, "pfmax">;
-defm PFMIN    : I3DNow_binop_rm<0x94, "pfmin">;
-defm PFMUL    : I3DNow_binop_rm<0xB4, "pfmul">;
-defm PFRCP    : I3DNow_binop_rm<0x96, "pfrcp">;
-defm PFRCPIT1 : I3DNow_binop_rm<0xA6, "pfrcpit1">;
-defm PFRCPIT2 : I3DNow_binop_rm<0xB6, "pfrcpit2">;
-defm PFRSQIT1 : I3DNow_binop_rm<0xA7, "pfrsqit1">;
-defm PFRSQRT  : I3DNow_binop_rm<0x97, "pfrsqrt">;
-defm PFSUB    : I3DNow_binop_rm<0x9A, "pfsub">;
-defm PFSUBR   : I3DNow_binop_rm<0xAA, "pfsubr">;
-defm PI2FD    : I3DNow_binop_rm<0x0D, "pi2fd">;
-defm PMULHRW  : I3DNow_binop_rm<0xB7, "pmulhrw">;
+defm PAVGUSB  : I3DNow_binop_rm_int<0xBF, "pavgusb">;
+defm PF2ID    : I3DNow_conv_rm_int<0x1D, "pf2id">;
+defm PFACC    : I3DNow_binop_rm_int<0xAE, "pfacc">;
+defm PFADD    : I3DNow_binop_rm_int<0x9E, "pfadd">;
+defm PFCMPEQ  : I3DNow_binop_rm_int<0xB0, "pfcmpeq">;
+defm PFCMPGE  : I3DNow_binop_rm_int<0x90, "pfcmpge">;
+defm PFCMPGT  : I3DNow_binop_rm_int<0xA0, "pfcmpgt">;
+defm PFMAX    : I3DNow_binop_rm_int<0xA4, "pfmax">;
+defm PFMIN    : I3DNow_binop_rm_int<0x94, "pfmin">;
+defm PFMUL    : I3DNow_binop_rm_int<0xB4, "pfmul">;
+defm PFRCP    : I3DNow_conv_rm_int<0x96, "pfrcp">;
+defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1">;
+defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2">;
+defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1">;
+defm PFRSQRT  : I3DNow_conv_rm_int<0x97, "pfrsqrt">;
+defm PFSUB    : I3DNow_binop_rm_int<0x9A, "pfsub">;
+defm PFSUBR   : I3DNow_binop_rm_int<0xAA, "pfsubr">;
+defm PI2FD    : I3DNow_conv_rm_int<0x0D, "pi2fd">;
+defm PMULHRW  : I3DNow_binop_rm_int<0xB7, "pmulhrw">;
 
 def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
                    [(int_x86_mmx_femms)]>;
 def PREFETCH  : I3DNow<0x0D, MRM0m, (outs), (ins i32mem:$addr),
                        "prefetch $addr", []>;
-                 
+// FIXME: Disassembler gets a bogus decode conflict.
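The practical effect of the _int multiclasses introduced above is that the 3DNow! builtins can now select directly to these instructions. A sketch of a use, assuming Clang/GCC's mm3dnow.h wrappers and -m3dnow (add2f is an illustrative name):

#include <mm3dnow.h>

__m64 add2f(__m64 a, __m64 b) {
  return _m_pfadd(a, b); // int_x86_3dnow_pfadd, selected as PFADDrr
}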
-let isAsmParserOnly = 1 in { +let isAsmParserOnly = 1 in def PREFETCHW : I3DNow<0x0D, MRM1m, (outs), (ins i16mem:$addr), "prefetchw $addr", []>; -} // "3DNowA" instructions -defm PF2IW : I3DNow_binop_rm<0x1C, "pf2iw">; -defm PI2FW : I3DNow_binop_rm<0x0C, "pi2fw">; -defm PFNACC : I3DNow_binop_rm<0x8A, "pfnacc">; -defm PFPNACC : I3DNow_binop_rm<0x8E, "pfpnacc">; -defm PSWAPD : I3DNow_binop_rm<0xBB, "pswapd">; +defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">; +defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", "a">; +defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", "a">; +defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", "a">; +defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", "a">; diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index f0ea068..9f7a4b0 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -163,7 +163,7 @@ def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst), } // Defs = [EFLAGS] -// Suprisingly enough, these are not two address instructions! +// Surprisingly enough, these are not two address instructions! let Defs = [EFLAGS] in { // Register-Integer Signed Integer Multiply def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16 diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 4c915d9..adcc747 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -214,6 +214,30 @@ def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), (SETBr)>; +// (add OP, SETB) -> (adc OP, 0) +def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op), + (ADC8ri GR8:$op, 0)>; +def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op), + (ADC32ri8 GR32:$op, 0)>; +def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op), + (ADC64ri8 GR64:$op, 0)>; + +// (sub OP, SETB) -> (sbb OP, 0) +def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB8ri GR8:$op, 0)>; +def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB32ri8 GR32:$op, 0)>; +def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB64ri8 GR64:$op, 0)>; + +// (sub OP, SETCC_CARRY) -> (adc OP, 0) +def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC8ri GR8:$op, 0)>; +def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC32ri8 GR32:$op, 0)>; +def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC64ri8 GR64:$op, 0)>; + //===----------------------------------------------------------------------===// // String Pseudo Instructions // @@ -519,85 +543,98 @@ def Int_MemBarrierNoSSE64 : RI<0x09, MRM1r, (outs), (ins GR64:$zero), Requires<[In64BitMode]>, LOCK; -// Optimized codegen when the non-memory output is not used. 
+// RegOpc corresponds to the mr version of the instruction +// ImmOpc corresponds to the mi version of the instruction +// ImmOpc8 corresponds to the mi8 version of the instruction +// ImmMod corresponds to the instruction format of the mi and mi8 versions +multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8, + Format ImmMod, string mnemonic> { let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in { -def LOCK_ADD8mr : I<0x00, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2), - "lock\n\t" - "add{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_ADD16mr : I<0x01, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), - "lock\n\t" - "add{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK; -def LOCK_ADD32mr : I<0x01, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), - "lock\n\t" - "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_ADD64mr : RI<0x01, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), - "lock\n\t" - "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_ADD8mi : Ii8<0x80, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src2), - "lock\n\t" - "add{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_ADD16mi : Ii16<0x81, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src2), - "lock\n\t" - "add{w}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_ADD32mi : Ii32<0x81, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src2), - "lock\n\t" - "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_ADD64mi32 : RIi32<0x81, MRM0m, (outs), - (ins i64mem:$dst, i64i32imm :$src2), - "lock\n\t" - "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK; - -def LOCK_ADD16mi8 : Ii8<0x83, MRM0m, (outs), (ins i16mem:$dst, i16i8imm :$src2), - "lock\n\t" - "add{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK; -def LOCK_ADD32mi8 : Ii8<0x83, MRM0m, (outs), (ins i32mem:$dst, i32i8imm :$src2), - "lock\n\t" - "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_ADD64mi8 : RIi8<0x83, MRM0m, (outs), - (ins i64mem:$dst, i64i8imm :$src2), - "lock\n\t" - "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK; - -def LOCK_SUB8mr : I<0x28, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src2), - "lock\n\t" - "sub{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_SUB16mr : I<0x29, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), - "lock\n\t" - "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK; -def LOCK_SUB32mr : I<0x29, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), - "lock\n\t" - "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_SUB64mr : RI<0x29, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), - "lock\n\t" - "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK; +def #NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 }, + MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2), + !strconcat("lock\n\t", mnemonic, "{b}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; +def #NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), + !strconcat("lock\n\t", mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + []>, OpSize, LOCK; +def #NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), + !strconcat("lock\n\t", mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; +def #NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), + !strconcat("lock\n\t", 
mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; + +def #NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 }, + ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2), + !strconcat("lock\n\t", mnemonic, "{b}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; + +def #NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2), + !strconcat("lock\n\t", mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; + +def #NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2), + !strconcat("lock\n\t", mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; + +def #NAME#64mi32 : RIi32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2), + !strconcat("lock\n\t", mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; + +def #NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2), + !strconcat("lock\n\t", mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; +def #NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2), + !strconcat("lock\n\t", mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; +def #NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2), + !strconcat("lock\n\t", mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; +} -def LOCK_SUB8mi : Ii8<0x80, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src2), - "lock\n\t" - "sub{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_SUB16mi : Ii16<0x81, MRM5m, (outs), (ins i16mem:$dst, i16imm:$src2), - "lock\n\t" - "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK; -def LOCK_SUB32mi : Ii32<0x81, MRM5m, (outs), (ins i32mem:$dst, i32imm:$src2), - "lock\n\t" - "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_SUB64mi32 : RIi32<0x81, MRM5m, (outs), - (ins i64mem:$dst, i64i32imm:$src2), - "lock\n\t" - "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK; +} +defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">; +defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">; +defm LOCK_OR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">; +defm LOCK_AND : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM4m, "and">; +defm LOCK_XOR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM6m, "xor">; -def LOCK_SUB16mi8 : Ii8<0x83, MRM5m, (outs), (ins i16mem:$dst, i16i8imm :$src2), - "lock\n\t" - "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK; -def LOCK_SUB32mi8 : Ii8<0x83, MRM5m, (outs), (ins i32mem:$dst, i32i8imm :$src2), - "lock\n\t" - "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_SUB64mi8 : RIi8<0x83, MRM5m, (outs), - (ins i64mem:$dst, i64i8imm :$src2), - "lock\n\t" - "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK; +// Optimized codegen when the non-memory output is not used. +let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in { def LOCK_INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "lock\n\t" @@ -960,7 +997,8 @@ def : Pat<(extloadi64i32 addr:$src), // anyext. 
Define these to do an explicit zero-extend to // avoid partial-register updates. -def : Pat<(i16 (anyext GR8 :$src)), (MOVZX16rr8 GR8 :$src)>; +def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG + (MOVZX32rr8 GR8 :$src), sub_16bit)>; def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>; // Except for i16 -> i32 since isel expect i16 ops to be promoted to i32. @@ -1127,9 +1165,9 @@ def : Pat<(and GR32:$src1, 0xff), Requires<[In32BitMode]>; // r & (2^8-1) ==> movz def : Pat<(and GR16:$src1, 0xff), - (MOVZX16rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src1, - GR16_ABCD)), - sub_8bit))>, + (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG + (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)), + sub_16bit)>, Requires<[In32BitMode]>; // r & (2^32-1) ==> movz @@ -1147,7 +1185,8 @@ def : Pat<(and GR32:$src1, 0xff), Requires<[In64BitMode]>; // r & (2^8-1) ==> movz def : Pat<(and GR16:$src1, 0xff), - (MOVZX16rr8 (i8 (EXTRACT_SUBREG GR16:$src1, sub_8bit)))>, + (EXTRACT_SUBREG (MOVZX32rr8 (i8 + (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>, Requires<[In64BitMode]>; @@ -1159,10 +1198,11 @@ def : Pat<(sext_inreg GR32:$src, i8), GR32_ABCD)), sub_8bit))>, Requires<[In32BitMode]>; + def : Pat<(sext_inreg GR16:$src, i8), - (MOVSX16rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, - GR16_ABCD)), - sub_8bit))>, + (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG + (i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))), + sub_16bit)>, Requires<[In32BitMode]>; def : Pat<(sext_inreg GR64:$src, i32), @@ -1175,9 +1215,19 @@ def : Pat<(sext_inreg GR32:$src, i8), (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>, Requires<[In64BitMode]>; def : Pat<(sext_inreg GR16:$src, i8), - (MOVSX16rr8 (i8 (EXTRACT_SUBREG GR16:$src, sub_8bit)))>, + (EXTRACT_SUBREG (MOVSX32rr8 + (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>, Requires<[In64BitMode]>; +// sext, sext_load, zext, zext_load +def: Pat<(i16 (sext GR8:$src)), + (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>; +def: Pat<(sextloadi16i8 addr:$src), + (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>; +def: Pat<(i16 (zext GR8:$src)), + (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>; +def: Pat<(zextloadi16i8 addr:$src), + (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>; // trunc patterns def : Pat<(i16 (trunc GR32:$src)), @@ -1318,6 +1368,11 @@ def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst), // (shl x, 1) ==> (add x, x) +// Note that if x is undef (immediate or otherwise), we could theoretically +// end up with the two uses of x getting different values, producing a result +// where the least significant bit is not 0. However, the probability of this +// happening is considered low enough that this is officially not a +// "real problem". def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr GR8 :$src1, GR8 :$src1)>; def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>; def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>; @@ -1474,12 +1529,6 @@ def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2), def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2), (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>; -// Optimize multiply by 2 with EFLAGS result. -let AddedComplexity = 2 in { -def : Pat<(X86smul_flag GR16:$src1, 2), (ADD16rr GR16:$src1, GR16:$src1)>; -def : Pat<(X86smul_flag GR32:$src1, 2), (ADD32rr GR32:$src1, GR32:$src1)>; -} - // Patterns for nodes that do not produce flags, for instructions that do. 
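The (shl x, 1) ==> (add x, x) rewrite above is a plain strength reduction: shifting left by one and adding a value to itself both compute 2*x on every defined bit pattern, and the undef caveat in the comment is the only case where the two reads of x could disagree. The identity, checked over a few edge values:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t vals[] = {0u, 1u, 0x7FFFFFFFu, 0xFFFFFFFFu};
  for (uint32_t x : vals)
    assert((x << 1) == x + x);  // shl-by-1 and self-add agree, wraparound included
  return 0;
}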
// addition diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td index 867c0f8..2e1d523 100644 --- a/lib/Target/X86/X86InstrExtension.td +++ b/lib/Target/X86/X86InstrExtension.td @@ -38,22 +38,11 @@ let neverHasSideEffects = 1 in { // Sign/Zero extenders -// Use movsbl intead of movsbw; we don't care about the high 16 bits -// of the register here. This has a smaller encoding and avoids a -// partial-register update. Actual movsbw included for the disassembler. -def MOVSX16rr8W : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), - "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; -def MOVSX16rm8W : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), - "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; - -// FIXME: Use a pat pattern or define a syntax here. -let isCodeGenOnly=1 in { -def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8 :$src), - "", [(set GR16:$dst, (sext GR8:$src))]>, TB; -def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem :$src), - "", [(set GR16:$dst, (sextloadi16i8 addr:$src))]>, TB; -} -def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src), +def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), + "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), + "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src), "movs{bl|x}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (sext GR8:$src))]>, TB; def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src), @@ -66,20 +55,10 @@ def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), "movs{wl|x}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (sextloadi32i16 addr:$src))]>, TB; -// Use movzbl intead of movzbw; we don't care about the high 16 bits -// of the register here. This has a smaller encoding and avoids a -// partial-register update. Actual movzbw included for the disassembler. -def MOVZX16rr8W : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), - "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; -def MOVZX16rm8W : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), - "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; -// FIXME: Use a pat pattern or define a syntax here. 
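The EXTRACT_SUBREG-of-MOVZX32/MOVSX32 shape used throughout the patterns above works because extending a byte to 32 bits and keeping the low 16 bits yields the same 16-bit value as a direct 16-bit extend, while the 32-bit form has a smaller encoding and avoids a partial-register update. The equivalence, checked exhaustively for both extensions:

#include <cassert>
#include <cstdint>

int main() {
  for (int v = 0; v < 256; ++v) {
    uint8_t b = (uint8_t)v;
    // zext i8->i16 directly vs. zext i8->i32 then take the low 16 bits.
    assert((uint16_t)b == (uint16_t)(uint32_t)b);
    // sext i8->i16 directly vs. sext i8->i32 then take the low 16 bits.
    assert((uint16_t)(int16_t)(int8_t)b == (uint16_t)(int32_t)(int8_t)b);
  }
  return 0;
}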
-let isCodeGenOnly=1 in { -def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8 :$src), - "", [(set GR16:$dst, (zext GR8:$src))]>, TB; -def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem :$src), - "", [(set GR16:$dst, (zextloadi16i8 addr:$src))]>, TB; -} +def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), + "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), + "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src), "movz{bl|x}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (zext GR8:$src))]>, TB; diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 3cbfac1..7c9a9f7 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -38,8 +38,11 @@ def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp, def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>; def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>; +def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>; def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; +def X86cmpss : SDNode<"X86ISD::FSETCCss", SDTX86Cmpss>; +def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>; def X86pshufb : SDNode<"X86ISD::PSHUFB", SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 21df57c..b3237d5 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -232,7 +232,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) assert(!RegOp2MemOpTable2Addr.count(RegOp) && "Duplicated entries?"); RegOp2MemOpTable2Addr[RegOp] = std::make_pair(MemOp, 0U); - // If this is not a reversable operation (because there is a many->one) + // If this is not a reversible operation (because there is a many->one) // mapping, don't insert the reverse of the operation into MemOp2RegOpTable. if (OpTbl2Addr[i][1] & TB_NOT_REVERSABLE) continue; @@ -335,7 +335,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) assert(!RegOp2MemOpTable0.count(RegOp) && "Duplicated entries?"); RegOp2MemOpTable0[RegOp] = std::make_pair(MemOp, Align); - // If this is not a reversable operation (because there is a many->one) + // If this is not a reversible operation (because there is a many->one) // mapping, don't insert the reverse of the operation into MemOp2RegOpTable. if (OpTbl0[i][1] & TB_NOT_REVERSABLE) continue; @@ -460,7 +460,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) assert(!RegOp2MemOpTable1.count(RegOp) && "Duplicate entries"); RegOp2MemOpTable1[RegOp] = std::make_pair(MemOp, Align); - // If this is not a reversable operation (because there is a many->one) + // If this is not a reversible operation (because there is a many->one) // mapping, don't insert the reverse of the operation into MemOp2RegOpTable. if (OpTbl1[i][1] & TB_NOT_REVERSABLE) continue; @@ -682,7 +682,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) assert(!RegOp2MemOpTable2.count(RegOp) && "Duplicate entry!"); RegOp2MemOpTable2[RegOp] = std::make_pair(MemOp, Align); - // If this is not a reversable operation (because there is a many->one) + // If this is not a reversible operation (because there is a many->one) // mapping, don't insert the reverse of the operation into MemOp2RegOpTable. 
if (OpTbl2[i][1] & TB_NOT_REVERSABLE) continue; @@ -916,7 +916,6 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, case X86::MOVSDrm: case X86::MOVAPSrm: case X86::MOVUPSrm: - case X86::MOVUPSrm_Int: case X86::MOVAPDrm: case X86::MOVDQArm: case X86::MMX_MOVD64rm: @@ -1790,7 +1789,6 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, .addMBB(UnCondBrIter->getOperand(0).getMBB()); BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_4)) .addMBB(TargetBB); - MBB.addSuccessor(TargetBB); OldInst->eraseFromParent(); UnCondBrIter->eraseFromParent(); @@ -2016,62 +2014,48 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, bool isStackAligned, const TargetMachine &TM, bool load) { - switch (RC->getID()) { + switch (RC->getSize()) { default: - llvm_unreachable("Unknown regclass"); - case X86::GR64RegClassID: - case X86::GR64_ABCDRegClassID: - case X86::GR64_NOREXRegClassID: - case X86::GR64_NOREX_NOSPRegClassID: - case X86::GR64_NOSPRegClassID: - case X86::GR64_TCRegClassID: - case X86::GR64_TCW64RegClassID: - return load ? X86::MOV64rm : X86::MOV64mr; - case X86::GR32RegClassID: - case X86::GR32_ABCDRegClassID: - case X86::GR32_ADRegClassID: - case X86::GR32_NOREXRegClassID: - case X86::GR32_NOSPRegClassID: - case X86::GR32_TCRegClassID: - return load ? X86::MOV32rm : X86::MOV32mr; - case X86::GR16RegClassID: - case X86::GR16_ABCDRegClassID: - case X86::GR16_NOREXRegClassID: - return load ? X86::MOV16rm : X86::MOV16mr; - case X86::GR8RegClassID: - // Copying to or from a physical H register on x86-64 requires a NOREX - // move. Otherwise use a normal move. - if (isHReg(Reg) && - TM.getSubtarget<X86Subtarget>().is64Bit()) - return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX; - else - return load ? X86::MOV8rm : X86::MOV8mr; - case X86::GR8_ABCD_LRegClassID: - case X86::GR8_NOREXRegClassID: - return load ? X86::MOV8rm :X86::MOV8mr; - case X86::GR8_ABCD_HRegClassID: + llvm_unreachable("Unknown spill size"); + case 1: + assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass"); if (TM.getSubtarget<X86Subtarget>().is64Bit()) - return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX; - else - return load ? X86::MOV8rm : X86::MOV8mr; - case X86::RFP80RegClassID: + // Copying to or from a physical H register on x86-64 requires a NOREX + // move. Otherwise use a normal move. + if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC)) + return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX; + return load ? X86::MOV8rm : X86::MOV8mr; + case 2: + assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass"); + return load ? X86::MOV16rm : X86::MOV16mr; + case 4: + if (X86::GR32RegClass.hasSubClassEq(RC)) + return load ? X86::MOV32rm : X86::MOV32mr; + if (X86::FR32RegClass.hasSubClassEq(RC)) + return load ? X86::MOVSSrm : X86::MOVSSmr; + if (X86::RFP32RegClass.hasSubClassEq(RC)) + return load ? X86::LD_Fp32m : X86::ST_Fp32m; + llvm_unreachable("Unknown 4-byte regclass"); + case 8: + if (X86::GR64RegClass.hasSubClassEq(RC)) + return load ? X86::MOV64rm : X86::MOV64mr; + if (X86::FR64RegClass.hasSubClassEq(RC)) + return load ? X86::MOVSDrm : X86::MOVSDmr; + if (X86::VR64RegClass.hasSubClassEq(RC)) + return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr; + if (X86::RFP64RegClass.hasSubClassEq(RC)) + return load ? X86::LD_Fp64m : X86::ST_Fp64m; + llvm_unreachable("Unknown 8-byte regclass"); + case 10: + assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass"); return load ? 
X86::LD_Fp80m : X86::ST_FpP80m; - case X86::RFP64RegClassID: - return load ? X86::LD_Fp64m : X86::ST_Fp64m; - case X86::RFP32RegClassID: - return load ? X86::LD_Fp32m : X86::ST_Fp32m; - case X86::FR32RegClassID: - return load ? X86::MOVSSrm : X86::MOVSSmr; - case X86::FR64RegClassID: - return load ? X86::MOVSDrm : X86::MOVSDmr; - case X86::VR128RegClassID: + case 16: + assert(X86::VR128RegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass"); // If stack is realigned we can use aligned stores. if (isStackAligned) return load ? X86::MOVAPSrm : X86::MOVAPSmr; else return load ? X86::MOVUPSrm : X86::MOVUPSmr; - case X86::VR64RegClassID: - return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr; } } @@ -2241,6 +2225,12 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, bool isTwoAddr = NumOps > 1 && MI->getDesc().getOperandConstraint(1, TOI::TIED_TO) != -1; + // FIXME: AsmPrinter doesn't know how to handle + // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. + if (MI->getOpcode() == X86::ADD32ri && + MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) + return NULL; + MachineInstr *NewMI = NULL; // Folding a memory location into the two-address part of a two-address // instruction is different than folding it other places. It requires @@ -2429,7 +2419,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, Alignment = 4; break; default: - llvm_unreachable("Don't know how to fold this instruction!"); + return 0; } if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { unsigned NewOpc = 0; @@ -2535,6 +2525,12 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, case X86::TEST32rr: case X86::TEST64rr: return true; + case X86::ADD32ri: + // FIXME: AsmPrinter doesn't know how to handle + // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. + if (MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) + return false; + break; } } @@ -2845,11 +2841,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::FsMOVAPDrm: case X86::MOVAPSrm: case X86::MOVUPSrm: - case X86::MOVUPSrm_Int: case X86::MOVAPDrm: case X86::MOVDQArm: case X86::MOVDQUrm: - case X86::MOVDQUrm_Int: break; } switch (Opc2) { @@ -2869,11 +2863,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::FsMOVAPDrm: case X86::MOVAPSrm: case X86::MOVUPSrm: - case X86::MOVUPSrm_Int: case X86::MOVAPDrm: case X86::MOVDQArm: case X86::MOVDQUrm: - case X86::MOVDQUrm_Int: break; } diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 4625b4c..d895023 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -449,7 +449,6 @@ namespace X86II { SSEDomainShift = SegOvrShift + 2, OpcodeShift = SSEDomainShift + 2, - OpcodeMask = 0xFFULL << OpcodeShift, //===------------------------------------------------------------------===// /// VEX - The opcode prefix used by AVX instructions @@ -807,7 +806,7 @@ public: int64_t &Offset1, int64_t &Offset2) const; /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to - /// determine (in conjuction with areLoadsFromSameBasePtr) if two loads should + /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should /// be scheduled togther. On some targets if two loads are loading from /// addresses in the same cache line, it's better if they are scheduled /// together. 
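A toy model of the reworked spill-opcode selection above, which keys on the register class's spill size and only then distinguishes the few classes sharing a size, instead of enumerating every register-class ID (the enum and function here are illustrative stand-ins, not LLVM's interface):

#include <cassert>
#include <cstring>

enum Kind { GPR, XMM, FP80 };  // stand-in for the hasSubClassEq checks

static const char *loadOpcode(unsigned size, Kind k) {
  switch (size) {
  case 4:  return k == GPR ? "MOV32rm" : "MOVSSrm";
  case 8:  return k == GPR ? "MOV64rm" : "MOVSDrm";
  case 10: return "LD_Fp80m";
  default: assert(false && "unknown spill size"); return "";
  }
}

int main() {
  assert(std::strcmp(loadOpcode(4, GPR),   "MOV32rm")  == 0);
  assert(std::strcmp(loadOpcode(8, XMM),   "MOVSDrm")  == 0);
  assert(std::strcmp(loadOpcode(10, FP80), "LD_Fp80m") == 0);
  return 0;
}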
This function takes two integers that represent the load offsets diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index f832a7c..8cab808 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -23,6 +23,9 @@ def SDTIntShiftDOp: SDTypeProfile<1, 3, def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>; +def SDTX86Cmpsd : SDTypeProfile<1, 3, [SDTCisVT<0, f64>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; +def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; + def SDTX86Cmov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; @@ -459,7 +462,7 @@ def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">; include "X86InstrFormats.td" //===----------------------------------------------------------------------===// -// Pattern fragments... +// Pattern fragments. // // X86 specific condition code. These correspond to CondCode in @@ -481,21 +484,21 @@ def X86_COND_O : PatLeaf<(i8 13)>; def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE def X86_COND_S : PatLeaf<(i8 15)>; -def immSext8 : PatLeaf<(imm), [{ return immSext8(N); }]>; +let FastIselShouldIgnore = 1 in { // FastIsel should ignore all simm8 instrs. + def i16immSExt8 : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>; + def i32immSExt8 : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>; + def i64immSExt8 : ImmLeaf<i64, [{ return Imm == (int8_t)Imm; }]>; +} -def i16immSExt8 : PatLeaf<(i16 immSext8)>; -def i32immSExt8 : PatLeaf<(i32 immSext8)>; -def i64immSExt8 : PatLeaf<(i64 immSext8)>; -def i64immSExt32 : PatLeaf<(i64 imm), [{ return i64immSExt32(N); }]>; -def i64immZExt32 : PatLeaf<(i64 imm), [{ - // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit - // unsignedsign extended field. - return (uint64_t)N->getZExtValue() == (uint32_t)N->getZExtValue(); -}]>; +def i64immSExt32 : ImmLeaf<i64, [{ return Imm == (int32_t)Imm; }]>; + + +// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit +// unsigned field. +def i64immZExt32 : ImmLeaf<i64, [{ return (uint64_t)Imm == (uint32_t)Imm; }]>; -def i64immZExt32SExt8 : PatLeaf<(i64 imm), [{ - uint64_t v = N->getZExtValue(); - return v == (uint32_t)v && (int32_t)v == (int8_t)v; +def i64immZExt32SExt8 : ImmLeaf<i64, [{ + return (uint64_t)Imm == (uint32_t)Imm && (int32_t)Imm == (int8_t)Imm; }]>; // Helper fragments for loads. @@ -1437,7 +1440,7 @@ def : InstAlias<"idivq $src, %rax", (IDIV64m i64mem:$src)>; // Various unary fpstack operations default to operating on on ST1. // For example, "fxch" -> "fxch %st(1)" -def : InstAlias<"faddp", (ADD_FPrST0 ST1)>; +def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>; def : InstAlias<"fsubp", (SUBR_FPrST0 ST1)>; def : InstAlias<"fsubrp", (SUB_FPrST0 ST1)>; def : InstAlias<"fmulp", (MUL_FPrST0 ST1)>; @@ -1455,13 +1458,15 @@ def : InstAlias<"fucompi", (UCOM_FIPr ST1)>; // For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate // instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with // gas. 
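The new ImmLeaf predicates above test "fits in a sign-extended 8-bit immediate" (and the unsigned-32-bit variant) with a plain cast round-trip on the raw value instead of inspecting the SDNode; the same checks in C++ (two's-complement narrowing assumed, as on x86):

#include <cassert>
#include <cstdint>

static bool isSExt8(int32_t imm)  { return imm == (int8_t)imm; }
static bool isZExt32(int64_t imm) { return (uint64_t)imm == (uint32_t)imm; }

int main() {
  assert(isSExt8(127) && isSExt8(-128));    // the i8 range round-trips
  assert(!isSExt8(128) && !isSExt8(-129));  // just outside it does not
  assert(isZExt32(0xFFFFFFFFLL));           // fits an unsigned 32-bit field
  assert(!isZExt32(-1LL));                  // sign extension sets the top bits
  return 0;
}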
-multiclass FpUnaryAlias<string Mnemonic, Instruction Inst> { - def : InstAlias<!strconcat(Mnemonic, " $op, %st(0)"), (Inst RST:$op)>; - def : InstAlias<!strconcat(Mnemonic, " %st(0), %st(0)"), (Inst ST0)>; +multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> { + def : InstAlias<!strconcat(Mnemonic, " $op, %st(0)"), + (Inst RST:$op), EmitAlias>; + def : InstAlias<!strconcat(Mnemonic, " %st(0), %st(0)"), + (Inst ST0), EmitAlias>; } defm : FpUnaryAlias<"fadd", ADD_FST0r>; -defm : FpUnaryAlias<"faddp", ADD_FPrST0>; +defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>; defm : FpUnaryAlias<"fsub", SUB_FST0r>; defm : FpUnaryAlias<"fsubp", SUBR_FPrST0>; defm : FpUnaryAlias<"fsubr", SUBR_FST0r>; @@ -1472,8 +1477,8 @@ defm : FpUnaryAlias<"fdiv", DIV_FST0r>; defm : FpUnaryAlias<"fdivp", DIVR_FPrST0>; defm : FpUnaryAlias<"fdivr", DIVR_FST0r>; defm : FpUnaryAlias<"fdivrp", DIV_FPrST0>; -defm : FpUnaryAlias<"fcomi", COM_FIr>; -defm : FpUnaryAlias<"fucomi", UCOM_FIr>; +defm : FpUnaryAlias<"fcomi", COM_FIr, 0>; +defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>; defm : FpUnaryAlias<"fcompi", COM_FIPr>; defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; @@ -1481,8 +1486,9 @@ defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; // Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they // commute. We also allow fdiv[r]p/fsubrp even though they don't commute, // solely because gas supports it. -def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op)>; +def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op), 0>; def : InstAlias<"fmulp %st(0), $op", (MUL_FPrST0 RST:$op)>; +def : InstAlias<"fsubp %st(0), $op", (SUBR_FPrST0 RST:$op)>; def : InstAlias<"fsubrp %st(0), $op", (SUB_FPrST0 RST:$op)>; def : InstAlias<"fdivp %st(0), $op", (DIVR_FPrST0 RST:$op)>; def : InstAlias<"fdivrp %st(0), $op", (DIV_FPrST0 RST:$op)>; @@ -1534,29 +1540,31 @@ def : InstAlias<"mov $seg, $mem", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg)>; def : InstAlias<"movq $imm, $reg", (MOV64ri GR64:$reg, i64imm:$imm)>; // Match 'movq GR64, MMX' as an alias for movd. -def : InstAlias<"movq $src, $dst", (MMX_MOVD64to64rr VR64:$dst, GR64:$src)>; -def : InstAlias<"movq $src, $dst", (MMX_MOVD64from64rr GR64:$dst, VR64:$src)>; +def : InstAlias<"movq $src, $dst", + (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>; +def : InstAlias<"movq $src, $dst", + (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>; // movsd with no operands (as opposed to the SSE scalar move of a double) is an // alias for movsl. 
(as in rep; movsd) def : InstAlias<"movsd", (MOVSD)>; // movsx aliases -def : InstAlias<"movsx $src, $dst", (MOVSX16rr8W GR16:$dst, GR8:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX16rm8W GR16:$dst, i8mem:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX32rr8 GR32:$dst, GR8:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX32rr16 GR32:$dst, GR16:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX64rr8 GR64:$dst, GR8:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX64rr16 GR64:$dst, GR16:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX64rr32 GR64:$dst, GR32:$src)>; +def : InstAlias<"movsx $src, $dst", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>; // movzx aliases -def : InstAlias<"movzx $src, $dst", (MOVZX16rr8W GR16:$dst, GR8:$src)>; -def : InstAlias<"movzx $src, $dst", (MOVZX16rm8W GR16:$dst, i8mem:$src)>; -def : InstAlias<"movzx $src, $dst", (MOVZX32rr8 GR32:$dst, GR8:$src)>; -def : InstAlias<"movzx $src, $dst", (MOVZX32rr16 GR32:$dst, GR16:$src)>; -def : InstAlias<"movzx $src, $dst", (MOVZX64rr8_Q GR64:$dst, GR8:$src)>; -def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src)>; +def : InstAlias<"movzx $src, $dst", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>; // Note: No GR32->GR64 movzx form. // outb %dx -> outb %al, %dx diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index bb2165a..b2d9fca 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -285,7 +285,7 @@ let Constraints = "$src1 = $dst" in defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand, 1>; defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por, 1>; defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor, 1>; -defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn, 1>; +defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn>; // Shift Instructions defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 8f08e68..7774057 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -135,18 +135,16 @@ class sse12_move_rm<RegisterClass RC, X86MemOperand x86memop, // is used instead. Register-to-register movss/movsd is not modeled as an // INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable // in terms of a copy, and just mentioned, we don't use movss/movsd for copies. 
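The comment above on why register-to-register movss/movsd is not a plain copy follows from the instruction's merge semantics: it replaces only lane 0 of the destination and keeps the upper lanes, which a COPY cannot express. Seen directly with SSE intrinsics:

#include <cassert>
#include <xmmintrin.h>

int main() {
  __m128 a = _mm_set_ps(4.f, 3.f, 2.f, 1.f); // lanes 3..0 = 4,3,2,1
  __m128 b = _mm_set_ps(8.f, 7.f, 6.f, 5.f); // lane 0 = 5
  __m128 r = _mm_move_ss(a, b);              // movss: lane 0 from b, rest from a
  float out[4];
  _mm_storeu_ps(out, r);
  assert(out[0] == 5.f && out[1] == 2.f && out[2] == 3.f && out[3] == 4.f);
  return 0;
}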
-let isAsmParserOnly = 0 in { - def VMOVSSrr : sse12_move_rr<FR32, v4f32, - "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V; - def VMOVSDrr : sse12_move_rr<FR64, v2f64, - "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V; +def VMOVSSrr : sse12_move_rr<FR32, v4f32, + "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V; +def VMOVSDrr : sse12_move_rr<FR64, v2f64, + "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V; - let canFoldAsLoad = 1, isReMaterializable = 1 in { - def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX; +let canFoldAsLoad = 1, isReMaterializable = 1 in { + def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX; - let AddedComplexity = 20 in - def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX; - } + let AddedComplexity = 20 in + def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX; } let Constraints = "$src1 = $dst" in { @@ -218,14 +216,12 @@ def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), "movsd\t{$src, $dst|$dst, $src}", [(store FR64:$src, addr:$dst)]>; -let isAsmParserOnly = 0 in { def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src), "movss\t{$src, $dst|$dst, $src}", [(store FR32:$src, addr:$dst)]>, XS, VEX; def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), "movsd\t{$src, $dst|$dst, $src}", [(store FR64:$src, addr:$dst)]>, XD, VEX; -} // Extract and store. def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), @@ -251,7 +247,6 @@ let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in [(set RC:$dst, (ld_frag addr:$src))], d>; } -let isAsmParserOnly = 0 in { defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle>, VEX; defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, @@ -269,7 +264,6 @@ defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups", SSEPackedSingle>, VEX; defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd", SSEPackedDouble, 0>, OpSize, VEX; -} defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle>, TB; defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, @@ -279,7 +273,6 @@ defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd", SSEPackedDouble, 0>, TB, OpSize; -let isAsmParserOnly = 0 in { def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [(alignedstore (v4f32 VR128:$src), addr:$dst)]>, VEX; @@ -304,7 +297,6 @@ def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movupd\t{$src, $dst|$dst, $src}", [(store (v4f64 VR256:$src), addr:$dst)]>, VEX; -} def : Pat<(int_x86_avx_loadu_ps_256 addr:$src), (VMOVUPSYrm addr:$src)>; def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src), @@ -328,32 +320,14 @@ def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), [(store (v2f64 VR128:$src), addr:$dst)]>; // Intrinsic forms of MOVUPS/D load and store -let isAsmParserOnly = 0 in { - let canFoldAsLoad = 1, isReMaterializable = 1 in - def VMOVUPSrm_Int : VPSI<0x10, MRMSrcMem, (outs VR128:$dst), - (ins f128mem:$src), - "movups\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>, VEX; - def 
VMOVUPDrm_Int : VPDI<0x10, MRMSrcMem, (outs VR128:$dst), - (ins f128mem:$src), - "movupd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>, VEX; - def VMOVUPSmr_Int : VPSI<0x11, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movups\t{$src, $dst|$dst, $src}", - [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>, VEX; - def VMOVUPDmr_Int : VPDI<0x11, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movupd\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>, VEX; -} -let canFoldAsLoad = 1, isReMaterializable = 1 in -def MOVUPSrm_Int : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "movups\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>; -def MOVUPDrm_Int : PDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "movupd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>; +def VMOVUPSmr_Int : VPSI<0x11, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movups\t{$src, $dst|$dst, $src}", + [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>, VEX; +def VMOVUPDmr_Int : VPDI<0x11, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>, VEX; def MOVUPSmr_Int : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movups\t{$src, $dst|$dst, $src}", @@ -382,7 +356,7 @@ multiclass sse12_mov_hilo_packed<bits<8>opc, RegisterClass RC, SSEPackedDouble>, TB, OpSize; } -let isAsmParserOnly = 0, AddedComplexity = 20 in { +let AddedComplexity = 20 in { defm VMOVL : sse12_mov_hilo_packed<0x12, VR128, movlp, "movlp", "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, VEX_4V; defm VMOVH : sse12_mov_hilo_packed<0x16, VR128, movlhps, "movhp", @@ -395,7 +369,6 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in { "\t{$src2, $dst|$dst, $src2}">; } -let isAsmParserOnly = 0 in { def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), @@ -404,7 +377,6 @@ def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlpd\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), addr:$dst)]>, VEX; -} def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), @@ -416,7 +388,6 @@ def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), // v2f64 extract element 1 is always custom lowered to unpack high to low // and extract element 0 so the non-store version isn't too horrible. 
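The "extract element 1 via unpack high to low" lowering that the comment above describes, written out with SSE2 intrinsics:

#include <cassert>
#include <emmintrin.h>

int main() {
  __m128d v  = _mm_set_pd(7.0, 3.0);   // element 1 = 7.0, element 0 = 3.0
  __m128d hi = _mm_unpackhi_pd(v, v);  // move element 1 down to element 0
  double  d  = _mm_cvtsd_f64(hi);      // read element 0
  assert(d == 7.0);
  return 0;
}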
-let isAsmParserOnly = 0 in { def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract @@ -429,7 +400,6 @@ def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), (v2f64 (unpckh VR128:$src, (undef))), (iPTR 0))), addr:$dst)]>, VEX; -} def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract @@ -441,7 +411,7 @@ def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), (v2f64 (unpckh VR128:$src, (undef))), (iPTR 0))), addr:$dst)]>; -let isAsmParserOnly = 0, AddedComplexity = 20 in { +let AddedComplexity = 20 in { def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -516,7 +486,6 @@ multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>; } -let isAsmParserOnly = 0 in { defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX; defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, @@ -542,7 +511,6 @@ defm VCVTSI2SDL : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">, XD, VEX_4V; defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, XD, VEX_4V, VEX_W; -} defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, "cvttss2si\t{$src, $dst|$dst, $src}">, XS; @@ -591,27 +559,25 @@ multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC, [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))]>; } -let isAsmParserOnly = 0 in { - defm Int_VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, - f32mem, load, "cvtss2si">, XS, VEX; - defm Int_VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, - int_x86_sse_cvtss2si64, f32mem, load, "cvtss2si">, - XS, VEX, VEX_W; - defm Int_VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, - f128mem, load, "cvtsd2si">, XD, VEX; - defm Int_VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, - int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si">, - XD, VEX, VEX_W; - - // FIXME: The asm matcher has a hack to ignore instructions with _Int and Int_ - // Get rid of this hack or rename the intrinsics, there are several - // intructions that only match with the intrinsic form, why create duplicates - // to let them be recognized by the assembler? 
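For the scalar conversions above, cvtss2si/cvtsd2si round under the current MXCSR mode (round-to-nearest-even by default), while the cvttss2si/cvttsd2si "truncate" forms match C's float-to-int cast; hence the two distinct instruction families. A quick demonstration:

#include <cassert>
#include <xmmintrin.h>

int main() {
  __m128 v = _mm_set_ss(2.5f);
  assert(_mm_cvt_ss2si(v)  == 2);   // nearest-even: 2.5 -> 2
  assert(_mm_cvtt_ss2si(v) == 2);   // truncate:     2.5 -> 2
  __m128 w = _mm_set_ss(3.5f);
  assert(_mm_cvt_ss2si(w)  == 4);   // nearest-even: 3.5 -> 4
  assert(_mm_cvtt_ss2si(w) == 3);   // truncate:     3.5 -> 3
  return 0;
}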
- defm VCVTSD2SI_alt : sse12_cvt_s_np<0x2D, FR64, GR32, f64mem, - "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX; - defm VCVTSD2SI64 : sse12_cvt_s_np<0x2D, FR64, GR64, f64mem, - "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_W; -} +defm Int_VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, + f32mem, load, "cvtss2si">, XS, VEX; +defm Int_VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, + int_x86_sse_cvtss2si64, f32mem, load, "cvtss2si">, + XS, VEX, VEX_W; +defm Int_VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, + f128mem, load, "cvtsd2si">, XD, VEX; +defm Int_VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, + int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si">, + XD, VEX, VEX_W; + +// FIXME: The asm matcher has a hack to ignore instructions with _Int and Int_ +// Get rid of this hack or rename the intrinsics, there are several +// intructions that only match with the intrinsic form, why create duplicates +// to let them be recognized by the assembler? +defm VCVTSD2SI_alt : sse12_cvt_s_np<0x2D, FR64, GR32, f64mem, + "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX; +defm VCVTSD2SI64 : sse12_cvt_s_np<0x2D, FR64, GR64, f64mem, + "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_W; defm Int_CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, f32mem, load, "cvtss2si">, XS; defm Int_CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, @@ -622,18 +588,16 @@ defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si{q}">, XD, REX_W; -let isAsmParserOnly = 0 in { - defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss", 0>, XS, VEX_4V; - defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss", 0>, XS, VEX_4V, - VEX_W; - defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd", 0>, XD, VEX_4V; - defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd", 0>, XD, - VEX_4V, VEX_W; -} +defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss", 0>, XS, VEX_4V; +defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss", 0>, XS, VEX_4V, + VEX_W; +defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd", 0>, XD, VEX_4V; +defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd", 0>, XD, + VEX_4V, VEX_W; let Constraints = "$src1 = $dst" in { defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, @@ -653,7 +617,6 @@ let Constraints = "$src1 = $dst" in { /// SSE 1 Only // Aliases for intrinsics -let isAsmParserOnly = 0 in { defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, f32mem, load, "cvttss2si">, XS, VEX; defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, @@ -664,7 +627,6 @@ defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, int_x86_sse2_cvttsd2si64, f128mem, load, "cvttsd2si">, XD, VEX, VEX_W; -} defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, f32mem, load, "cvttss2si">, XS; defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, @@ -676,7 +638,7 @@ defm Int_CVTTSD2SI64 : 
sse12_cvt_sint<0x2C, VR128, GR64, int_x86_sse2_cvttsd2si64, f128mem, load, "cvttsd2si{q}">, XD, REX_W; -let isAsmParserOnly = 0, Pattern = []<dag> in { +let Pattern = []<dag> in { defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load, "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS, VEX; defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load, @@ -702,7 +664,6 @@ defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load /*dummy*/, /// SSE 2 Only // Convert scalar double to scalar single -let isAsmParserOnly = 0 in { def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src1, FR64:$src2), "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, @@ -711,7 +672,6 @@ def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins FR64:$src1, f64mem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, XD, Requires<[HasAVX, OptForSize]>, VEX_4V; -} def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>, Requires<[HasAVX]>; @@ -723,7 +683,6 @@ def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), [(set FR32:$dst, (fround (loadf64 addr:$src)))]>, XD, Requires<[HasSSE2, OptForSize]>; -let isAsmParserOnly = 0 in defm Int_VCVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128, int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss", 0>, XS, VEX_4V; @@ -732,7 +691,7 @@ defm Int_CVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128, int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss">, XS; // Convert scalar single to scalar double -let isAsmParserOnly = 0 in { // SSE2 instructions with XS prefix +// SSE2 instructions with XS prefix def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src1, FR32:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -741,7 +700,6 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins FR32:$src1, f32mem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, XS, VEX_4V, Requires<[HasAVX, OptForSize]>; -} def : Pat<(f64 (fextend FR32:$src)), (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>; @@ -754,7 +712,6 @@ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), [(set FR64:$dst, (extloadf32 addr:$src))]>, XS, Requires<[HasSSE2, OptForSize]>; -let isAsmParserOnly = 0 in { def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -767,7 +724,6 @@ def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem, [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, (load addr:$src2)))]>, XS, VEX_4V, Requires<[HasAVX]>; -} let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), @@ -788,7 +744,7 @@ def : Pat<(extloadf32 addr:$src), Requires<[HasSSE2, OptForSpeed]>; // Convert doubleword to packed single/double fp -let isAsmParserOnly = 0 in { // SSE2 instructions without OpSize prefix +// SSE2 instructions without OpSize prefix def Int_VCVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtdq2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>, @@ -798,7 +754,6 @@ def Int_VCVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), [(set VR128:$dst, (int_x86_sse2_cvtdq2ps (bitconvert (memopv2i64 addr:$src))))]>, TB, VEX, Requires<[HasAVX]>; -} def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtdq2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2ps 
VR128:$src))]>, @@ -810,7 +765,7 @@ def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), TB, Requires<[HasSSE2]>; // FIXME: why the non-intrinsic version is described as SSE3? -let isAsmParserOnly = 0 in { // SSE2 instructions with XS prefix +// SSE2 instructions with XS prefix def Int_VCVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>, @@ -820,7 +775,6 @@ def Int_VCVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), [(set VR128:$dst, (int_x86_sse2_cvtdq2pd (bitconvert (memopv2i64 addr:$src))))]>, XS, VEX, Requires<[HasAVX]>; -} def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>, @@ -833,7 +787,6 @@ def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), // Convert packed single/double fp to doubleword -let isAsmParserOnly = 0 in { def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), @@ -842,13 +795,11 @@ def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; -} def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", []>; def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", []>; -let isAsmParserOnly = 0 in { def Int_VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>, @@ -858,7 +809,6 @@ def Int_VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq (memop addr:$src)))]>, VEX; -} def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>; @@ -867,7 +817,7 @@ def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), [(set VR128:$dst, (int_x86_sse2_cvtps2dq (memop addr:$src)))]>; -let isAsmParserOnly = 0 in { // SSE2 packed instructions with XD prefix +// SSE2 packed instructions with XD prefix def Int_VCVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, @@ -877,7 +827,6 @@ def Int_VCVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), [(set VR128:$dst, (int_x86_sse2_cvtpd2dq (memop addr:$src)))]>, XD, VEX, Requires<[HasAVX]>; -} def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, @@ -890,7 +839,7 @@ def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), // Convert with truncation packed single/double fp to doubleword -let isAsmParserOnly = 0 in { // SSE2 packed instructions with XS prefix +// SSE2 packed instructions with XS prefix def VCVTTPS2DQrr : VSSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; 
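The packed forms make the same split: cvttps2dq (above) always truncates toward zero, matching a C cast lane by lane, while cvtps2dq honors the current rounding mode. For example:

#include <cassert>
#include <emmintrin.h>

int main() {
  __m128  v = _mm_set_ps(-1.5f, 2.7f, -0.9f, 1.5f); // lanes 3..0
  __m128i t = _mm_cvttps_epi32(v);                  // cvttps2dq: truncate
  int out[4];
  _mm_storeu_si128((__m128i *)out, t);
  assert(out[0] == 1 && out[1] == 0 && out[2] == 2 && out[3] == -1);
  assert((int)1.5f == 1 && (int)-1.5f == -1);       // same as a C cast
  return 0;
}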
def VCVTTPS2DQrm : VSSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), @@ -899,7 +848,6 @@ def VCVTTPS2DQYrr : VSSI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTTPS2DQYrm : VSSI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; -} def CVTTPS2DQrr : SSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -910,7 +858,6 @@ def CVTTPS2DQrm : SSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), (int_x86_sse2_cvttps2dq (memop addr:$src)))]>; -let isAsmParserOnly = 0 in { def Int_VCVTTPS2DQrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -921,9 +868,7 @@ def Int_VCVTTPS2DQrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), [(set VR128:$dst, (int_x86_sse2_cvttps2dq (memop addr:$src)))]>, XS, VEX, Requires<[HasAVX]>; -} -let isAsmParserOnly = 0 in { def Int_VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", @@ -934,7 +879,6 @@ def Int_VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq (memop addr:$src)))]>, VEX; -} def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>; @@ -943,7 +887,6 @@ def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), [(set VR128:$dst, (int_x86_sse2_cvttpd2dq (memop addr:$src)))]>; -let isAsmParserOnly = 0 in { // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. // Provide other assembly rr and rm forms to address this explicitly. 
@@ -963,10 +906,9 @@ def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvttpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvttpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L; -} // Convert packed single to packed double -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // SSE2 instructions without OpSize prefix def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX; @@ -982,7 +924,6 @@ def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB; -let isAsmParserOnly = 0 in { def Int_VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>, @@ -992,7 +933,6 @@ def Int_VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), [(set VR128:$dst, (int_x86_sse2_cvtps2pd (load addr:$src)))]>, VEX, Requires<[HasAVX]>; -} def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>, @@ -1004,7 +944,6 @@ def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), TB, Requires<[HasSSE2]>; // Convert packed double to packed single -let isAsmParserOnly = 0 in { // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. // Provide other assembly rr and rm forms to address this explicitly. @@ -1024,14 +963,12 @@ def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvtpd2psy\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvtpd2psy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L; -} def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", []>; def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", []>; -let isAsmParserOnly = 0 in { def Int_VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>; @@ -1040,7 +977,6 @@ def Int_VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), "cvtpd2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2ps (memop addr:$src)))]>; -} def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>; @@ -1109,7 +1045,7 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, asm_alt, []>; } -let neverHasSideEffects = 1, isAsmParserOnly = 0 in { +let neverHasSideEffects = 1 in { defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}", "cmpss\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}">, @@ -1120,13 +1056,37 @@ let neverHasSideEffects = 1, isAsmParserOnly = 0 in { XD, VEX_4V; } +let Constraints = "$src1 = $dst" in { +def CMPSSrr : SIi8<0xC2, MRMSrcReg, + (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, SSECC:$cc), + "cmp${cc}ss\t{$src2, $dst|$dst, $src2}", + [(set FR32:$dst, (X86cmpss (f32 FR32:$src1), FR32:$src2, 
imm:$cc))]>, XS; +def CMPSSrm : SIi8<0xC2, MRMSrcMem, + (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, SSECC:$cc), + "cmp${cc}ss\t{$src2, $dst|$dst, $src2}", + [(set FR32:$dst, (X86cmpss (f32 FR32:$src1), (loadf32 addr:$src2), imm:$cc))]>, XS; +def CMPSDrr : SIi8<0xC2, MRMSrcReg, + (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, SSECC:$cc), + "cmp${cc}sd\t{$src2, $dst|$dst, $src2}", + [(set FR64:$dst, (X86cmpsd (f64 FR64:$src1), FR64:$src2, imm:$cc))]>, XD; +def CMPSDrm : SIi8<0xC2, MRMSrcMem, + (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, SSECC:$cc), + "cmp${cc}sd\t{$src2, $dst|$dst, $src2}", + [(set FR64:$dst, (X86cmpsd (f64 FR64:$src1), (loadf64 addr:$src2), imm:$cc))]>, XD; +} let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in { - defm CMPSS : sse12_cmp_scalar<FR32, f32mem, - "cmp${cc}ss\t{$src, $dst|$dst, $src}", - "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}">, XS; - defm CMPSD : sse12_cmp_scalar<FR64, f64mem, - "cmp${cc}sd\t{$src, $dst|$dst, $src}", - "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}">, XD; +def CMPSSrr_alt : SIi8<0xC2, MRMSrcReg, + (outs FR32:$dst), (ins FR32:$src1, FR32:$src, i8imm:$src2), + "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XS; +def CMPSSrm_alt : SIi8<0xC2, MRMSrcMem, + (outs FR32:$dst), (ins FR32:$src1, f32mem:$src, i8imm:$src2), + "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XS; +def CMPSDrr_alt : SIi8<0xC2, MRMSrcReg, + (outs FR64:$dst), (ins FR64:$src1, FR64:$src, i8imm:$src2), + "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XD; +def CMPSDrm_alt : SIi8<0xC2, MRMSrcMem, + (outs FR64:$dst), (ins FR64:$src1, f64mem:$src, i8imm:$src2), + "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XD; } multiclass sse12_cmp_scalar_int<RegisterClass RC, X86MemOperand x86memop, @@ -1142,14 +1102,12 @@ multiclass sse12_cmp_scalar_int<RegisterClass RC, X86MemOperand x86memop, } // Aliases to match intrinsics which expect XMM operand(s). 
-let isAsmParserOnly = 0 in { - defm Int_VCMPSS : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss, - "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}">, - XS, VEX_4V; - defm Int_VCMPSD : sse12_cmp_scalar_int<VR128, f64mem, int_x86_sse2_cmp_sd, - "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}">, - XD, VEX_4V; -} +defm Int_VCMPSS : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss, + "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}">, + XS, VEX_4V; +defm Int_VCMPSD : sse12_cmp_scalar_int<VR128, f64mem, int_x86_sse2_cmp_sd, + "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}">, + XD, VEX_4V; let Constraints = "$src1 = $dst" in { defm Int_CMPSS : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss, "cmp${cc}ss\t{$src, $dst|$dst, $src}">, XS; @@ -1172,28 +1130,26 @@ multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, } let Defs = [EFLAGS] in { - let isAsmParserOnly = 0 in { - defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, - "ucomiss", SSEPackedSingle>, VEX; - defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, - "ucomisd", SSEPackedDouble>, OpSize, VEX; - let Pattern = []<dag> in { - defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, - "comiss", SSEPackedSingle>, VEX; - defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, - "comisd", SSEPackedDouble>, OpSize, VEX; - } - - defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, - load, "ucomiss", SSEPackedSingle>, VEX; - defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, - load, "ucomisd", SSEPackedDouble>, OpSize, VEX; - - defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, - load, "comiss", SSEPackedSingle>, VEX; - defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, - load, "comisd", SSEPackedDouble>, OpSize, VEX; + defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, + "ucomiss", SSEPackedSingle>, VEX; + defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, + "ucomisd", SSEPackedDouble>, OpSize, VEX; + let Pattern = []<dag> in { + defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, + "comiss", SSEPackedSingle>, VEX; + defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, + "comisd", SSEPackedDouble>, OpSize, VEX; } + + defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, + load, "ucomiss", SSEPackedSingle>, VEX; + defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, + load, "ucomisd", SSEPackedDouble>, OpSize, VEX; + + defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, + load, "comiss", SSEPackedSingle>, VEX; + defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, + load, "comisd", SSEPackedDouble>, OpSize, VEX; defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, "ucomiss", SSEPackedSingle>, TB; defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, @@ -1239,24 +1195,22 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, asm_alt, [], d>; } -let isAsmParserOnly = 0 in { - defm VCMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps, - "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}", - "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", - SSEPackedSingle>, VEX_4V; - defm VCMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd, - "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}", - "cmppd\t{$src2, $src, $src1, 
$dst|$dst, $src1, $src, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; - defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_ps_256, - "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}", - "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", - SSEPackedSingle>, VEX_4V; - defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_pd_256, - "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}", - "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; -} +defm VCMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps, + "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}", + "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", + SSEPackedSingle>, VEX_4V; +defm VCMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd, + "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}", + "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; +defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_ps_256, + "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}", + "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", + SSEPackedSingle>, VEX_4V; +defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_pd_256, + "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}", + "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; let Constraints = "$src1 = $dst" in { defm CMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps, "cmp${cc}ps\t{$src, $dst|$dst, $src}", @@ -1296,20 +1250,18 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, (vt (shufp:$src3 RC:$src1, RC:$src2)))], d>; } -let isAsmParserOnly = 0 in { - defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, - "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - memopv4f32, SSEPackedSingle>, TB, VEX_4V; - defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, - "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - memopv8f32, SSEPackedSingle>, TB, VEX_4V; - defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, - "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", - memopv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V; - defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, - "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", - memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V; -} +defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, + "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + memopv4f32, SSEPackedSingle>, TB, VEX_4V; +defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, + "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + memopv8f32, SSEPackedSingle>, TB, VEX_4V; +defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, + "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", + memopv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V; +defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, + "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", + memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V; let Constraints = "$src1 = $dst" in { defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, @@ -1342,33 +1294,31 @@ multiclass sse12_unpack_interleave<bits<8> opc, PatFrag OpNode, ValueType vt, } let AddedComplexity = 10 in { - let isAsmParserOnly = 0 in { - defm VUNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32, - VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, VEX_4V; - defm VUNPCKHPD: sse12_unpack_interleave<0x15, 
unpckh, v2f64, memopv2f64, - VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; - defm VUNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32, - VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, VEX_4V; - defm VUNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64, - VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; - - defm VUNPCKHPSY: sse12_unpack_interleave<0x15, unpckh, v8f32, memopv8f32, - VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, VEX_4V; - defm VUNPCKHPDY: sse12_unpack_interleave<0x15, unpckh, v4f64, memopv4f64, - VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; - defm VUNPCKLPSY: sse12_unpack_interleave<0x14, unpckl, v8f32, memopv8f32, - VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, VEX_4V; - defm VUNPCKLPDY: sse12_unpack_interleave<0x14, unpckl, v4f64, memopv4f64, - VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; - } + defm VUNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32, + VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, VEX_4V; + defm VUNPCKHPD: sse12_unpack_interleave<0x15, unpckh, v2f64, memopv2f64, + VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; + defm VUNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32, + VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, VEX_4V; + defm VUNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64, + VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; + + defm VUNPCKHPSY: sse12_unpack_interleave<0x15, unpckh, v8f32, memopv8f32, + VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, VEX_4V; + defm VUNPCKHPDY: sse12_unpack_interleave<0x15, unpckh, v4f64, memopv4f64, + VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; + defm VUNPCKLPSY: sse12_unpack_interleave<0x14, unpckl, v8f32, memopv8f32, + VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, VEX_4V; + defm VUNPCKLPDY: sse12_unpack_interleave<0x14, unpckl, v4f64, memopv4f64, + VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; let Constraints = "$src1 = $dst" in { defm UNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32, @@ -1401,35 +1351,46 @@ multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm, } // Mask creation +defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, + "movmskps", SSEPackedSingle>, VEX; +defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, + "movmskpd", SSEPackedDouble>, OpSize, + VEX; +defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256, + "movmskps", SSEPackedSingle>, VEX; +defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256, + "movmskpd", SSEPackedDouble>, OpSize, + VEX; defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps", SSEPackedSingle>, TB; defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd", SSEPackedDouble>, TB, OpSize; -let 
isAsmParserOnly = 0 in { - defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, - "movmskps", SSEPackedSingle>, VEX; - defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, - "movmskpd", SSEPackedDouble>, OpSize, - VEX; - defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256, - "movmskps", SSEPackedSingle>, VEX; - defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256, - "movmskpd", SSEPackedDouble>, OpSize, - VEX; +// X86fgetsign +def MOVMSKPDrr32_alt : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins FR64:$src), + "movmskpd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (X86fgetsign FR64:$src))], SSEPackedDouble>, TB, OpSize; +def MOVMSKPDrr64_alt : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins FR64:$src), + "movmskpd\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (X86fgetsign FR64:$src))], SSEPackedDouble>, TB, OpSize; +def MOVMSKPSrr32_alt : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins FR32:$src), + "movmskps\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (X86fgetsign FR32:$src))], SSEPackedSingle>, TB; +def MOVMSKPSrr64_alt : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins FR32:$src), + "movmskps\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (X86fgetsign FR32:$src))], SSEPackedSingle>, TB; - // Assembler Only - def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), - "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX; - def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), - "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize, - VEX; - def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), - "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX; - def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), - "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize, - VEX; -} +// Assembler Only +def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), + "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX; +def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), + "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize, + VEX; +def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), + "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX; +def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), + "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize, + VEX; //===----------------------------------------------------------------------===// // SSE 1 & 2 - Misc aliasing of packed SSE 1 & 2 instructions @@ -1484,13 +1445,11 @@ def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), /// multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr, SDNode OpNode> { - let isAsmParserOnly = 0 in { - defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, - FR32, f32, f128mem, memopfsf32, SSEPackedSingle, 0>, VEX_4V; + defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + FR32, f32, f128mem, memopfsf32, SSEPackedSingle, 0>, VEX_4V; - defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, - FR64, f64, f128mem, memopfsf64, SSEPackedDouble, 0>, OpSize, VEX_4V; - } + defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + FR64, f64, f128mem, memopfsf64, SSEPackedDouble, 0>, OpSize, VEX_4V; let Constraints = "$src1 = $dst" in { defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32, @@ 
-1516,7 +1475,7 @@ let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, SDNode OpNode, int HasPat = 0, list<list<dag>> Pattern = []> { - let isAsmParserOnly = 0, Pattern = []<dag> in { + let Pattern = []<dag> in { defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f128mem, !if(HasPat, Pattern[0], // rr @@ -1563,7 +1522,6 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, /// sse12_fp_packed_logical_y - AVX 256-bit SSE 1 & 2 logical ops forms /// -let isAsmParserOnly = 0 in { multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr> { defm PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f256mem, [], [], 0>, VEX_4V; @@ -1571,7 +1529,6 @@ multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr> { defm PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f256mem, [], [], 0>, OpSize, VEX_4V; } -} // AVX 256-bit packed logical ops forms defm VAND : sse12_fp_packed_logical_y<0x54, "and">; @@ -1669,38 +1626,36 @@ multiclass basic_sse12_fp_binop_p_y_int<bits<8> opc, string OpcodeStr> { } // Binary Arithmetic instructions -let isAsmParserOnly = 0 in { - defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, 0>, - basic_sse12_fp_binop_s_int<0x58, "add", 0>, - basic_sse12_fp_binop_p<0x58, "add", fadd, 0>, - basic_sse12_fp_binop_p_y<0x58, "add", fadd>, VEX_4V; - defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, 0>, - basic_sse12_fp_binop_s_int<0x59, "mul", 0>, - basic_sse12_fp_binop_p<0x59, "mul", fmul, 0>, - basic_sse12_fp_binop_p_y<0x59, "mul", fmul>, VEX_4V; +defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, 0>, + basic_sse12_fp_binop_s_int<0x58, "add", 0>, + basic_sse12_fp_binop_p<0x58, "add", fadd, 0>, + basic_sse12_fp_binop_p_y<0x58, "add", fadd>, VEX_4V; +defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, 0>, + basic_sse12_fp_binop_s_int<0x59, "mul", 0>, + basic_sse12_fp_binop_p<0x59, "mul", fmul, 0>, + basic_sse12_fp_binop_p_y<0x59, "mul", fmul>, VEX_4V; - let isCommutable = 0 in { - defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, 0>, - basic_sse12_fp_binop_s_int<0x5C, "sub", 0>, - basic_sse12_fp_binop_p<0x5C, "sub", fsub, 0>, - basic_sse12_fp_binop_p_y<0x5C, "sub", fsub>, VEX_4V; - defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, 0>, - basic_sse12_fp_binop_s_int<0x5E, "div", 0>, - basic_sse12_fp_binop_p<0x5E, "div", fdiv, 0>, - basic_sse12_fp_binop_p_y<0x5E, "div", fdiv>, VEX_4V; - defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, 0>, - basic_sse12_fp_binop_s_int<0x5F, "max", 0>, - basic_sse12_fp_binop_p<0x5F, "max", X86fmax, 0>, - basic_sse12_fp_binop_p_int<0x5F, "max", 0>, - basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax>, - basic_sse12_fp_binop_p_y_int<0x5F, "max">, VEX_4V; - defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, 0>, - basic_sse12_fp_binop_s_int<0x5D, "min", 0>, - basic_sse12_fp_binop_p<0x5D, "min", X86fmin, 0>, - basic_sse12_fp_binop_p_int<0x5D, "min", 0>, - basic_sse12_fp_binop_p_y_int<0x5D, "min">, - basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin>, VEX_4V; - } +let isCommutable = 0 in { + defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, 0>, + basic_sse12_fp_binop_s_int<0x5C, "sub", 0>, + basic_sse12_fp_binop_p<0x5C, "sub", fsub, 0>, + basic_sse12_fp_binop_p_y<0x5C, "sub", fsub>, VEX_4V; + defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, 0>, + basic_sse12_fp_binop_s_int<0x5E, "div", 0>, 
+ basic_sse12_fp_binop_p<0x5E, "div", fdiv, 0>, + basic_sse12_fp_binop_p_y<0x5E, "div", fdiv>, VEX_4V; + defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, 0>, + basic_sse12_fp_binop_s_int<0x5F, "max", 0>, + basic_sse12_fp_binop_p<0x5F, "max", X86fmax, 0>, + basic_sse12_fp_binop_p_int<0x5F, "max", 0>, + basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax>, + basic_sse12_fp_binop_p_y_int<0x5F, "max">, VEX_4V; + defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, 0>, + basic_sse12_fp_binop_s_int<0x5D, "min", 0>, + basic_sse12_fp_binop_p<0x5D, "min", X86fmin, 0>, + basic_sse12_fp_binop_p_int<0x5D, "min", 0>, + basic_sse12_fp_binop_p_y_int<0x5D, "min">, + basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin>, VEX_4V; } let Constraints = "$src1 = $dst" in { @@ -1901,7 +1856,7 @@ multiclass sse2_fp_unop_p_y_int<bits<8> opc, string OpcodeStr, [(set VR256:$dst, (V2F64Int (memopv4f64 addr:$src)))]>; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // Square root. defm VSQRT : sse1_fp_unop_s_avx<0x51, "vsqrt", fsqrt, int_x86_sse_sqrt_ss>, sse2_fp_unop_s_avx<0x51, "vsqrt", fsqrt, int_x86_sse2_sqrt_sd>, @@ -1957,67 +1912,54 @@ defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss>, // SSE 1 & 2 - Non-temporal stores //===----------------------------------------------------------------------===// -let isAsmParserOnly = 0 in { - def VMOVNTPSmr_Int : VPSI<0x2B, MRMDestMem, (outs), - (ins i128mem:$dst, VR128:$src), - "movntps\t{$src, $dst|$dst, $src}", - [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>, VEX; - def VMOVNTPDmr_Int : VPDI<0x2B, MRMDestMem, (outs), - (ins i128mem:$dst, VR128:$src), - "movntpd\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>, VEX; - - let ExeDomain = SSEPackedInt in - def VMOVNTDQmr_Int : VPDI<0xE7, MRMDestMem, (outs), +let AddedComplexity = 400 in { // Prefer non-temporal versions + def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>, VEX; - - let AddedComplexity = 400 in { // Prefer non-temporal versions - def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movntps\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f32 VR128:$src), - addr:$dst)]>, VEX; - def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movntpd\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v2f64 VR128:$src), - addr:$dst)]>, VEX; - def VMOVNTDQ_64mr : VPDI<0xE7, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v2f64 VR128:$src), - addr:$dst)]>, VEX; - let ExeDomain = SSEPackedInt in - def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), + addr:$dst)]>, VEX; + def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2f64 VR128:$src), + addr:$dst)]>, VEX; + def VMOVNTDQ_64mr : VPDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f32 VR128:$src), + [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>, VEX; - def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), - (ins f256mem:$dst, VR256:$src), - "movntps\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v8f32 VR256:$src), - addr:$dst)]>, VEX; - def VMOVNTPDYmr : 
VPDI<0x2B, MRMDestMem, (outs), - (ins f256mem:$dst, VR256:$src), - "movntpd\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f64 VR256:$src), - addr:$dst)]>, VEX; - def VMOVNTDQY_64mr : VPDI<0xE7, MRMDestMem, (outs), - (ins f256mem:$dst, VR256:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f64 VR256:$src), - addr:$dst)]>, VEX; - let ExeDomain = SSEPackedInt in - def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), + let ExeDomain = SSEPackedInt in + def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), + addr:$dst)]>, VEX; + + def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst), + (VMOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasAVX]>; + + def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v8f32 VR256:$src), + addr:$dst)]>, VEX; + def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f64 VR256:$src), + addr:$dst)]>, VEX; + def VMOVNTDQY_64mr : VPDI<0xE7, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v8f32 VR256:$src), + [(alignednontemporalstore (v4f64 VR256:$src), addr:$dst)]>, VEX; - } + let ExeDomain = SSEPackedInt in + def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v8f32 VR256:$src), + addr:$dst)]>, VEX; } def : Pat<(int_x86_avx_movnt_dq_256 addr:$dst, VR256:$src), @@ -2027,18 +1969,6 @@ def : Pat<(int_x86_avx_movnt_pd_256 addr:$dst, VR256:$src), def : Pat<(int_x86_avx_movnt_ps_256 addr:$dst, VR256:$src), (VMOVNTPSYmr addr:$dst, VR256:$src)>; -def MOVNTPSmr_Int : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), - "movntps\t{$src, $dst|$dst, $src}", - [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>; -def MOVNTPDmr_Int : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), - "movntpd\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>; - -let ExeDomain = SSEPackedInt in -def MOVNTDQmr_Int : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>; - let AddedComplexity = 400 in { // Prefer non-temporal versions def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntps\t{$src, $dst|$dst, $src}", @@ -2056,22 +1986,19 @@ def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntdq\t{$src, $dst|$dst, $src}", [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; +def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst), + (MOVNTDQmr addr:$dst, VR128:$src)>; + // There is no AVX form for instructions below this point def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "movnti\t{$src, $dst|$dst, $src}", [(nontemporalstore (i32 GR32:$src), addr:$dst)]>, TB, Requires<[HasSSE2]>; - def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "movnti\t{$src, $dst|$dst, $src}", [(nontemporalstore (i64 GR64:$src), addr:$dst)]>, TB, Requires<[HasSSE2]>; - } -def MOVNTImr_Int : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), - "movnti\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>, - TB, 
Requires<[HasSSE2]>; //===----------------------------------------------------------------------===// // SSE 1 & 2 - Misc Instructions (No AVX form) @@ -2079,13 +2006,13 @@ def MOVNTImr_Int : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), // Prefetch intrinsic. def PREFETCHT0 : PSI<0x18, MRM1m, (outs), (ins i8mem:$src), - "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3))]>; + "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>; def PREFETCHT1 : PSI<0x18, MRM2m, (outs), (ins i8mem:$src), - "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2))]>; + "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>; def PREFETCHT2 : PSI<0x18, MRM3m, (outs), (ins i8mem:$src), - "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1))]>; + "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>; def PREFETCHNTA : PSI<0x18, MRM0m, (outs), (ins i8mem:$src), - "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0))]>; + "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>; // Load, store, and memory fence def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>, @@ -2136,16 +2063,23 @@ def : Pat<(v16i8 immAllZerosV), (V_SET0PI)>; def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; +// FIXME: According to the intel manual, DEST[127:64] <- SRC1[127:64], while +// in the non-AVX version bits 127:64 aren't touched. Find a better way to +// represent this instead of always zeroing SRC1. One possible solution is +// to represent the instruction w/ something similar as the "$src1 = $dst" +// constraint but without the tied operands. +def : Pat<(extloadf32 addr:$src), + (VCVTSS2SDrm (f32 (EXTRACT_SUBREG (AVX_SET0PS), sub_ss)), addr:$src)>, + Requires<[HasAVX, OptForSpeed]>; + //===----------------------------------------------------------------------===// // SSE 1 & 2 - Load/Store XCSR register //===----------------------------------------------------------------------===// -let isAsmParserOnly = 0 in { - def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), - "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, VEX; - def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), - "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, VEX; -} +def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), + "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, VEX; +def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), + "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, VEX; def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src), "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>; @@ -2158,45 +2092,43 @@ def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), let ExeDomain = SSEPackedInt in { // SSE integer instructions -let isAsmParserOnly = 0 in { - let neverHasSideEffects = 1 in { - def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - } - def VMOVDQUrr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; - def VMOVDQUYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), - "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; - - let canFoldAsLoad = 1, mayLoad = 1 in { - def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; 
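Stepping back to the prefetch hunk above: the PREFETCHT0/T1/T2/NTA patterns now match a four-operand prefetch node — address, read/write flag, locality, and a new cache-type operand hard-wired to (i32 1), i.e. the data cache. A sketch of what a frontend would emit for the updated intrinsic, assuming the four-argument llvm.prefetch signature these patterns match and 2011-era include paths:

    #include "llvm/Intrinsics.h"
    #include "llvm/Module.h"
    #include "llvm/Support/IRBuilder.h"

    using namespace llvm;

    // Emits @llvm.prefetch(ptr, rw=0, locality=3, cachetype=1), which the
    // patterns above select to prefetcht0.
    Value *emitPrefetchT0(IRBuilder<> &B, Module *M, Value *Ptr) {
      Value *F = Intrinsic::getDeclaration(M, Intrinsic::prefetch);
      return B.CreateCall4(F, Ptr,
                           B.getInt32(0),  // rw: 0 = read
                           B.getInt32(3),  // locality: 3 = keep in all caches
                           B.getInt32(1)); // cache type: 1 = data (new operand)
    }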
- def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - let Predicates = [HasAVX] in { - def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; - def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), - "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; - } - } +let neverHasSideEffects = 1 in { +def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +} +def VMOVDQUrr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; +def VMOVDQUYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; - let mayStore = 1 in { - def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), - (ins i128mem:$dst, VR128:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), - (ins i256mem:$dst, VR256:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - let Predicates = [HasAVX] in { - def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), +let canFoldAsLoad = 1, mayLoad = 1 in { +def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +let Predicates = [HasAVX] in { + def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; - def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), + def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; - } - } +} +} + +let mayStore = 1 in { +def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i256mem:$dst, VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +let Predicates = [HasAVX] in { +def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; +def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; +} } let neverHasSideEffects = 1 in @@ -2228,23 +2160,11 @@ def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), } // Intrinsic forms of MOVDQU load and store -let isAsmParserOnly = 0 in { -let canFoldAsLoad = 1 in -def VMOVDQUrm_Int : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "vmovdqu\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>, - XS, VEX, Requires<[HasAVX]>; def VMOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "vmovdqu\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>, XS, VEX, Requires<[HasAVX]>; -} -let canFoldAsLoad = 1 in -def MOVDQUrm_Int : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "movdqu\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>, - XS, Requires<[HasSSE2]>; def MOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 
"movdqu\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>, @@ -2349,7 +2269,7 @@ multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode, // 128-bit Integer Arithmetic -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPADDB : PDI_binop_rm<0xFC, "vpaddb", add, v16i8, 1, 0 /*3addr*/>, VEX_4V; defm VPADDW : PDI_binop_rm<0xFD, "vpaddw", add, v8i16, 1, 0>, VEX_4V; defm VPADDD : PDI_binop_rm<0xFE, "vpaddd", add, v4i32, 1, 0>, VEX_4V; @@ -2439,7 +2359,7 @@ defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, 1>; // SSE2 - Packed Integer Logical Instructions //===---------------------------------------------------------------------===// -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "vpsllw", int_x86_sse2_psll_w, int_x86_sse2_pslli_w, 0>, VEX_4V; @@ -2586,7 +2506,7 @@ let Predicates = [HasSSE2] in { // SSE2 - Packed Integer Comparison Instructions //===---------------------------------------------------------------------===// -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPCMPEQB : PDI_binop_rm_int<0x74, "vpcmpeqb", int_x86_sse2_pcmpeq_b, 1, 0>, VEX_4V; defm VPCMPEQW : PDI_binop_rm_int<0x75, "vpcmpeqw", int_x86_sse2_pcmpeq_w, 1, @@ -2640,7 +2560,7 @@ def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))), // SSE2 - Packed Integer Pack Instructions //===---------------------------------------------------------------------===// -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPACKSSWB : PDI_binop_rm_int<0x63, "vpacksswb", int_x86_sse2_packsswb_128, 0, 0>, VEX_4V; defm VPACKSSDW : PDI_binop_rm_int<0x6B, "vpackssdw", int_x86_sse2_packssdw_128, @@ -2678,7 +2598,7 @@ def mi : Ii8<0x70, MRMSrcMem, } } // ExeDomain = SSEPackedInt -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { let AddedComplexity = 5 in defm VPSHUFD : sse2_pshuffle<"vpshufd", v4i32, pshufd, bc_v4i32>, OpSize, VEX; @@ -2726,7 +2646,7 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, addr:$src2))))]>; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, unpckl, bc_v16i8, 0>, VEX_4V; defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, unpckl, bc_v8i16, @@ -2836,7 +2756,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> { } // Extract -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in def VPEXTRWri : Ii8<0xC5, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2), "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -2849,7 +2769,7 @@ def PEXTRWri : PDIi8<0xC5, MRMSrcReg, imm:$src2))]>; // Insert -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPINSRW : sse2_pinsrw<0>, OpSize, VEX_4V; def VPINSRWrr64i : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, GR64:$src2, i32i8imm:$src3), @@ -2868,13 +2788,11 @@ let Constraints = "$src1 = $dst" in let ExeDomain = SSEPackedInt in { -let isAsmParserOnly = 0 in { def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), "pmovmskb\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>, VEX; def VPMOVMSKBr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX; -} def PMOVMSKBrr : 
PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), "pmovmskb\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>; @@ -2887,7 +2805,6 @@ def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), let ExeDomain = SSEPackedInt in { -let isAsmParserOnly = 0 in { let Uses = [EDI] in def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), @@ -2898,7 +2815,6 @@ def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>, VEX; -} let Uses = [EDI] in def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), @@ -2916,7 +2832,6 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), //===---------------------------------------------------------------------===// // Move Int Doubleword to Packed Double Int -let isAsmParserOnly = 0 in { def VMOVDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -2926,7 +2841,6 @@ def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), [(set VR128:$dst, (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>, VEX; -} def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -2945,7 +2859,6 @@ def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), // Move Int Doubleword to Single Scalar -let isAsmParserOnly = 0 in { def VMOVDI2SSrr : VPDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (bitconvert GR32:$src))]>, VEX; @@ -2954,7 +2867,6 @@ def VMOVDI2SSrm : VPDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>, VEX; -} def MOVDI2SSrr : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (bitconvert GR32:$src))]>; @@ -2964,7 +2876,6 @@ def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>; // Move Packed Doubleword Int to Packed Double Int -let isAsmParserOnly = 0 in { def VMOVPDI2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), @@ -2974,7 +2885,6 @@ def VMOVPDI2DImr : VPDI<0x7E, MRMDestMem, (outs), "movd\t{$src, $dst|$dst, $src}", [(store (i32 (vector_extract (v4i32 VR128:$src), (iPTR 0))), addr:$dst)]>, VEX; -} def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), @@ -3000,14 +2910,12 @@ def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>; // Move Scalar Single to Double Int -let isAsmParserOnly = 0 in { def VMOVSS2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bitconvert FR32:$src))]>, VEX; def VMOVSS2DImr : VPDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), "movd\t{$src, $dst|$dst, $src}", [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>, VEX; -} def MOVSS2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bitconvert FR32:$src))]>; @@ -3016,7 +2924,7 @@ def 
MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>; // movd / movq to XMM register zero-extends -let AddedComplexity = 15, isAsmParserOnly = 0 in { +let AddedComplexity = 15 in { def VMOVZDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86vzmovl @@ -3040,7 +2948,6 @@ def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), } let AddedComplexity = 20 in { -let isAsmParserOnly = 0 in def VMOVZDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -3066,7 +2973,6 @@ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), //===---------------------------------------------------------------------===// // Move Quadword Int to Packed Quadword Int -let isAsmParserOnly = 0 in def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -3079,7 +2985,6 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), Requires<[HasSSE2]>; // SSE2 instruction with XS Prefix // Move Packed Quadword Int to Quadword Int -let isAsmParserOnly = 0 in def VMOVPQI2QImr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(store (i64 (vector_extract (v2i64 VR128:$src), @@ -3093,7 +2998,6 @@ def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), (f64 (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>; // Store / copy lower 64-bits of a XMM register. -let isAsmParserOnly = 0 in def VMOVLQ128mr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX; @@ -3101,7 +3005,7 @@ def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>; -let AddedComplexity = 20, isAsmParserOnly = 0 in +let AddedComplexity = 20 in def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -3126,7 +3030,7 @@ def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>; // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in // IA32 document. movq xmm1, xmm2 does clear the high bits. 
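The zero-extending moves above (the X86vzmovl patterns) are exactly what the SSE2 scalar-insert intrinsics compile to, including the movq xmm,xmm case whose upper-half clearing the comment notes is misdocumented in the IA-32 manual. A small illustration in terms of the C intrinsics, compiled as C++:

    #include <emmintrin.h>

    // movd: GR32 -> low 32 bits of an XMM register, bits 127:32 zeroed.
    __m128i load_low32(int X) { return _mm_cvtsi32_si128(X); }

    // movq xmm, xmm: copies the low quadword and clears bits 127:64,
    // matching the (v2i64 (X86vzmovl ...)) pattern above.
    __m128i copy_low64(__m128i V) { return _mm_move_epi64(V); }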
-let isAsmParserOnly = 0, AddedComplexity = 15 in +let AddedComplexity = 15 in def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, @@ -3137,7 +3041,7 @@ def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, XS, Requires<[HasSSE2]>; -let AddedComplexity = 20, isAsmParserOnly = 0 in +let AddedComplexity = 20 in def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (X86vzmovl @@ -3155,7 +3059,6 @@ def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))), } // Instructions to match in the assembler -let isAsmParserOnly = 0 in { def VMOVQs64rr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W; def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), @@ -3163,13 +3066,12 @@ def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), // Recognize "movd" with GR64 destination, but encode as a "movq" def VMOVQd64rr_alt : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W; -} // Instructions for the disassembler // xr = XMM register // xm = mem64 -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS; def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), @@ -3211,7 +3113,7 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, //===---------------------------------------------------------------------===// // Convert Packed Double FP to Packed DW Integers -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. // Provide other assembly rr and rm forms to address this explicitly. 
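The comment above captures why the VCVTPD2DQ block needs extra spellings: with a register source the xmm/ymm operand fixes the width, but a bare memory operand is ambiguous between the 128- and 256-bit source forms. The two widths at the intrinsics level (operand names illustrative):

    #include <immintrin.h>

    // 128-bit source: cvtpd2dq xmm, xmm/m128 -> 2 x i32, upper lanes zeroed.
    __m128i pd2dq_128(__m128d V) { return _mm_cvtpd_epi32(V); }

    // 256-bit source: vcvtpd2dq xmm, ymm/m256 -> 4 x i32. With a memory
    // operand only the mnemonic can distinguish this from the form above,
    // which is why the .td provides explicit rr and rm alternatives.
    __m128i pd2dq_256(__m256d V) { return _mm256_cvtpd_epi32(V); }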
@@ -3239,7 +3141,7 @@ def CVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2dq\t{$src, $dst|$dst, $src}", []>; // Convert Packed DW Integers to Packed Double FP -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { def VCVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), @@ -3290,7 +3192,7 @@ def rm : S3SI<op, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // FIXME: Merge above classes when we have patterns for the ymm version defm VMOVSHDUP : sse3_replicate_sfp<0x16, movshdup, "vmovshdup">, VEX; defm VMOVSLDUP : sse3_replicate_sfp<0x12, movsldup, "vmovsldup">, VEX; @@ -3321,7 +3223,7 @@ def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), []>; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // FIXME: Merge above classes when we have patterns for the ymm version defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX; defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX; @@ -3329,7 +3231,7 @@ let isAsmParserOnly = 0, Predicates = [HasAVX] in { defm MOVDDUP : sse3_replicate_dfp<"movddup">; // Move Unaligned Integer -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vlddqu\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX; @@ -3393,7 +3295,7 @@ multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC, [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))]>; } -let isAsmParserOnly = 0, Predicates = [HasAVX], +let Predicates = [HasAVX], ExeDomain = SSEPackedDouble in { defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128, f128mem, 0>, TB, XD, VEX_4V; @@ -3446,7 +3348,7 @@ multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, [(set RC:$dst, (vt (IntId RC:$src1, (memop addr:$src2))))]>; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, int_x86_sse3_hadd_ps, 0>, VEX_4V; defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem, @@ -3498,7 +3400,7 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, (bitconvert (mem_frag128 addr:$src))))]>, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", memopv16i8, int_x86_ssse3_pabs_b_128>, VEX; defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw", memopv8i16, @@ -3540,7 +3442,7 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { let isCommutable = 0 in { defm VPHADDW : SS3I_binop_rm_int<0x01, "vphaddw", memopv8i16, int_x86_ssse3_phadd_w_128, 0>, VEX_4V; @@ -3632,7 +3534,7 @@ multiclass ssse3_palign<string asm, bit Is2Addr = 1> { []>, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PALIGN : ssse3_palign<"palignr">; @@ -3696,6 +3598,16 @@ let Predicates = [HasSSE2] in def : Pat<(fextend 
(loadf32 addr:$src)), (CVTSS2SDrm addr:$src)>; +// FIXME: According to the intel manual, DEST[127:64] <- SRC1[127:64], while +// in the non-AVX version bits 127:64 aren't touched. Find a better way to +// represent this instead of always zeroing SRC1. One possible solution is +// to represent the instruction w/ something similar as the "$src1 = $dst" +// constraint but without the tied operands. +let Predicates = [HasAVX] in + def : Pat<(fextend (loadf32 addr:$src)), + (VCVTSS2SDrm (f32 (EXTRACT_SUBREG (AVX_SET0PS), sub_ss)), + addr:$src)>; + // bit_convert let Predicates = [HasXMMInt] in { def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>; @@ -3987,7 +3899,7 @@ multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> { OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", int_x86_sse41_pmovsxbw>, VEX; defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd", int_x86_sse41_pmovsxwd>, @@ -4053,7 +3965,7 @@ multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> { OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd>, VEX; defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq>, @@ -4094,7 +4006,7 @@ multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> { OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq>, VEX; defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>, @@ -4136,7 +4048,7 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { // (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst) } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX; def VPEXTRBrr64 : SS4AIi8<0x14, MRMDestReg, (outs GR64:$dst), (ins VR128:$src1, i32i8imm:$src2), @@ -4158,7 +4070,7 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { // (store (i16 (trunc (X86pextrw (v16i8 VR128:$src1), imm:$src2))), addr:$dst) } -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX; defm PEXTRW : SS41I_extract16<0x15, "pextrw">; @@ -4180,7 +4092,7 @@ multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { addr:$dst)]>, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX; defm PEXTRD : SS41I_extract32<0x16, "pextrd">; @@ -4201,7 +4113,7 @@ multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { addr:$dst)]>, OpSize, REX_W; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W; defm PEXTRQ : SS41I_extract64<0x16, "pextrq">; @@ -4224,7 +4136,7 @@ multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> { addr:$dst)]>, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX; def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst), (ins VR128:$src1, i32i8imm:$src2), @@ -4264,7 +4176,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3))]>, 
OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PINSRB : SS41I_insert8<0x20, "pinsrb">; @@ -4290,7 +4202,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3)))]>, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PINSRD : SS41I_insert32<0x22, "pinsrd">; @@ -4316,7 +4228,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3)))]>, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W; let Constraints = "$src1 = $dst" in defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W; @@ -4349,7 +4261,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> { let Constraints = "$src1 = $dst" in defm INSERTPS : SS41I_insertf32<0x21, "insertps">; -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V; def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3), @@ -4519,7 +4431,7 @@ multiclass sse41_fp_binop_rm_avx_s<bits<8> opcss, bits<8> opcsd, } // FP round - roundss, roundps, roundsd, roundpd -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // Intrinsic form defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128, memopv4f32, memopv2f64, @@ -4554,7 +4466,7 @@ defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", // ptest instruction we'll lower to this in X86ISelLowering primarily from // the intel intrinsic that corresponds to this. 
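Per the comment above, X86ISelLowering lowers the SSE4.1 ptest intrinsic onto the X86ptest node that the VPTEST definitions following it match; the user-visible entry points are the _mm_testz/_mm_testc family. A minimal usage sketch:

    #include <smmintrin.h>

    // PTEST sets ZF when (A & B) == 0; _mm_testz_si128 returns that flag.
    bool allMaskedBitsZero(__m128i A, __m128i B) {
      return _mm_testz_si128(A, B) != 0;
    }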
-let Defs = [EFLAGS], isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Defs = [EFLAGS], Predicates = [HasAVX] in { def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), "vptest\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86ptest VR128:$src1, (v4f32 VR128:$src2)))]>, @@ -4597,7 +4509,7 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC, OpSize, VEX; } -let Defs = [EFLAGS], isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Defs = [EFLAGS], Predicates = [HasAVX] in { defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, memopv4f32, v4f32>; defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>; defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, memopv2f64, v2f64>; @@ -4646,7 +4558,7 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, (bitconvert (memopv8i16 addr:$src))))]>, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw", int_x86_sse41_phminposuw>, VEX; defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw", @@ -4672,7 +4584,7 @@ multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr, (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { let isCommutable = 0 in defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw, 0>, VEX_4V; @@ -4739,7 +4651,7 @@ multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32>; @@ -4771,7 +4683,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { let isCommutable = 0 in { defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, VR128, memopv16i8, i128mem, 0>, VEX_4V; @@ -4812,7 +4724,7 @@ let Constraints = "$src1 = $dst" in { } /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, PatFrag mem_frag, Intrinsic IntId> { @@ -4851,14 +4763,14 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, - "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"), + "\t{$src2, $dst|$dst, $src2}"), [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>, OpSize; def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), !strconcat(OpcodeStr, - "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"), + "\t{$src2, $dst|$dst, $src2}"), [(set VR128:$dst, (IntId VR128:$src1, (bitconvert (memopv16i8 addr:$src2)), XMM0))]>, OpSize; @@ -4872,7 +4784,7 @@ defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>; def : Pat<(X86pblendv VR128:$src1, VR128:$src2, XMM0), (PBLENDVBrr0 VR128:$src1, VR128:$src2)>; -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse41_movntdqa 
addr:$src))]>, @@ -4906,7 +4818,7 @@ multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr, (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPCMPGTQ : SS42I_binop_rm_int<0x37, "vpcmpgtq", int_x86_sse42_pcmpgtq, 0>, VEX_4V; let Constraints = "$src1 = $dst" in @@ -4938,8 +4850,7 @@ let Defs = [EFLAGS], usesCustomInserter = 1 in { defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>; } -let Defs = [XMM0, EFLAGS], isAsmParserOnly = 0, - Predicates = [HasAVX] in { +let Defs = [XMM0, EFLAGS], Predicates = [HasAVX] in { def VPCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2, i8imm:$src3), "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX; @@ -4974,7 +4885,7 @@ let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>; } -let isAsmParserOnly = 0, Predicates = [HasAVX], +let Predicates = [HasAVX], Defs = [XMM0, EFLAGS], Uses = [EAX, EDX] in { def VPCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src3, i8imm:$src5), @@ -5009,7 +4920,7 @@ let Defs = [ECX, EFLAGS] in { } } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPCMPISTRI : SS42AI_pcmpistri<int_x86_sse42_pcmpistri128, "vpcmpistri">, VEX; defm VPCMPISTRIA : SS42AI_pcmpistri<int_x86_sse42_pcmpistria128, "vpcmpistri">, @@ -5048,7 +4959,7 @@ let Defs = [ECX, EFLAGS], Uses = [EAX, EDX] in { } } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPCMPESTRI : SS42AI_pcmpestri<int_x86_sse42_pcmpestri128, "vpcmpestri">, VEX; defm VPCMPESTRIA : SS42AI_pcmpestri<int_x86_sse42_pcmpestria128, "vpcmpestri">, @@ -5080,66 +4991,66 @@ defm PCMPESTRIZ : SS42AI_pcmpestri<int_x86_sse42_pcmpestriz128>; // This set of instructions are only rm, the only difference is the size // of r and m. 
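The comment above introduces the CRC32 block that follows, where the defs are renamed to encode the accumulator width (CRC32m8 becomes CRC32r32m8, CRC64r64 becomes CRC32r64r64) in step with the int_x86_sse42_crc32_<width>_<size> intrinsic renaming. These defs back the SSE4.2 CRC32C intrinsics; a byte-wise sketch of typical use:

    #include <nmmintrin.h>
    #include <cstddef>

    // Each _mm_crc32_u8 call selects one of the crc32{b} defs below
    // (CRC32r32r8, or CRC32r32m8 when the byte load is folded).
    unsigned crc32c_bytes(unsigned Crc, const unsigned char *P, std::size_t N) {
      for (std::size_t I = 0; I != N; ++I)
        Crc = _mm_crc32_u8(Crc, P[I]);
      return Crc;
    }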
let Constraints = "$src1 = $dst" in { - def CRC32m8 : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst), + def CRC32r32m8 : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i8mem:$src2), "crc32{b} \t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, - (int_x86_sse42_crc32_8 GR32:$src1, + (int_x86_sse42_crc32_32_8 GR32:$src1, (load addr:$src2)))]>; - def CRC32r8 : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst), + def CRC32r32r8 : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR8:$src2), "crc32{b} \t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, - (int_x86_sse42_crc32_8 GR32:$src1, GR8:$src2))]>; - def CRC32m16 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst), + (int_x86_sse42_crc32_32_8 GR32:$src1, GR8:$src2))]>; + def CRC32r32m16 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i16mem:$src2), "crc32{w} \t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, - (int_x86_sse42_crc32_16 GR32:$src1, + (int_x86_sse42_crc32_32_16 GR32:$src1, (load addr:$src2)))]>, OpSize; - def CRC32r16 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst), + def CRC32r32r16 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR16:$src2), "crc32{w} \t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, - (int_x86_sse42_crc32_16 GR32:$src1, GR16:$src2))]>, + (int_x86_sse42_crc32_32_16 GR32:$src1, GR16:$src2))]>, OpSize; - def CRC32m32 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst), + def CRC32r32m32 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), "crc32{l} \t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, - (int_x86_sse42_crc32_32 GR32:$src1, + (int_x86_sse42_crc32_32_32 GR32:$src1, (load addr:$src2)))]>; - def CRC32r32 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst), + def CRC32r32r32 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "crc32{l} \t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, - (int_x86_sse42_crc32_32 GR32:$src1, GR32:$src2))]>; - def CRC64m8 : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst), + (int_x86_sse42_crc32_32_32 GR32:$src1, GR32:$src2))]>; + def CRC32r64m8 : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i8mem:$src2), "crc32{b} \t{$src2, $src1|$src1, $src2}", [(set GR64:$dst, - (int_x86_sse42_crc64_8 GR64:$src1, + (int_x86_sse42_crc32_64_8 GR64:$src1, (load addr:$src2)))]>, REX_W; - def CRC64r8 : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst), + def CRC32r64r8 : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR8:$src2), "crc32{b} \t{$src2, $src1|$src1, $src2}", [(set GR64:$dst, - (int_x86_sse42_crc64_8 GR64:$src1, GR8:$src2))]>, + (int_x86_sse42_crc32_64_8 GR64:$src1, GR8:$src2))]>, REX_W; - def CRC64m64 : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst), + def CRC32r64m64 : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), "crc32{q} \t{$src2, $src1|$src1, $src2}", [(set GR64:$dst, - (int_x86_sse42_crc64_64 GR64:$src1, + (int_x86_sse42_crc32_64_64 GR64:$src1, (load addr:$src2)))]>, REX_W; - def CRC64r64 : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst), + def CRC32r64r64 : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "crc32{q} \t{$src2, $src1|$src1, $src2}", [(set GR64:$dst, - (int_x86_sse42_crc64_64 GR64:$src1, GR64:$src2))]>, + (int_x86_sse42_crc32_64_64 GR64:$src1, GR64:$src2))]>, REX_W; } @@ -5167,7 +5078,7 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, } // Perform One Round of an AES Encryption/Decryption Flow -let isAsmParserOnly = 0, Predicates = [HasAVX, HasAES] in { +let Predicates = [HasAVX, HasAES] in { defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", int_x86_aesni_aesenc, 0>, 
VEX_4V; defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", @@ -5207,7 +5118,7 @@ def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))), (AESDECLASTrm VR128:$src1, addr:$src2)>; // Perform the AES InvMixColumn Transformation -let isAsmParserOnly = 0, Predicates = [HasAVX, HasAES] in { +let Predicates = [HasAVX, HasAES] in { def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1), "vaesimc\t{$src1, $dst|$dst, $src1}", @@ -5235,7 +5146,7 @@ def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), OpSize; // AES Round Key Generation Assist -let isAsmParserOnly = 0, Predicates = [HasAVX, HasAES] in { +let Predicates = [HasAVX, HasAES] in { def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2), "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -5271,7 +5182,6 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), // Only the AVX version of CLMUL instructions are described here. // Carry-less Multiplication instructions -let isAsmParserOnly = 0 in { def VPCLMULQDQrr : CLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3), "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", @@ -5297,13 +5207,10 @@ defm VPCLMULHQLQDQ : avx_vpclmul<"vpclmulhqlqdq">; defm VPCLMULLQHQDQ : avx_vpclmul<"vpclmullqhqdq">; defm VPCLMULLQLQDQ : avx_vpclmul<"vpclmullqlqdq">; -} // isAsmParserOnly - //===----------------------------------------------------------------------===// // AVX Instructions //===----------------------------------------------------------------------===// -let isAsmParserOnly = 0 in { // Load from memory and broadcast to all elements of the destination operand class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC, @@ -5437,8 +5344,6 @@ def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", [(int_x86_avx_vzeroupper)]>, VEX, Requires<[HasAVX]>; -} // isAsmParserOnly - def : Pat<(int_x86_avx_vinsertf128_pd_256 VR256:$src1, VR128:$src2, imm:$src3), (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>; def : Pat<(int_x86_avx_vinsertf128_ps_256 VR256:$src1, VR128:$src2, imm:$src3), diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index 2710425..f73cff3 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -34,9 +34,16 @@ let Uses = [EFLAGS] in def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>; def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", [(int_x86_int (i8 3))]>; + +// The long form of "int $3" turns into int3 as a size optimization. +// FIXME: This doesn't work because InstAlias can't match immediate constants. 
+//def : InstAlias<"int\t$3", (INT3)>; + + def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap", [(int_x86_int imm:$trap)]>; + def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", []>, TB; def SYSRETL : I<0x07, RawFrm, (outs), (ins), "sysretl", []>, TB; def SYSRETQ :RI<0x07, RawFrm, (outs), (ins), "sysretq", []>, TB, diff --git a/lib/Target/X86/X86MCAsmInfo.cpp b/lib/Target/X86/X86MCAsmInfo.cpp index 6686214..2e1ec63 100644 --- a/lib/Target/X86/X86MCAsmInfo.cpp +++ b/lib/Target/X86/X86MCAsmInfo.cpp @@ -15,7 +15,9 @@ #include "X86TargetMachine.h" #include "llvm/ADT/Triple.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ELF.h" using namespace llvm; @@ -69,7 +71,22 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &Triple) { DwarfUsesInlineInfoSection = true; // Exceptions handling - ExceptionsType = ExceptionHandling::DwarfTable; + ExceptionsType = ExceptionHandling::DwarfCFI; +} + +const MCExpr * +X86_64MCAsmInfoDarwin::getExprForPersonalitySymbol(const MCSymbol *Sym, + unsigned Encoding, + MCStreamer &Streamer) const { + MCContext &Context = Streamer.getContext(); + const MCExpr *Res = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Context); + const MCExpr *Four = MCConstantExpr::Create(4, Context); + return MCBinaryExpr::CreateAdd(Res, Four, Context); +} + +X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple) + : X86MCAsmInfoDarwin(Triple) { } X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { @@ -89,7 +106,7 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { SupportsDebugInformation = true; // Exceptions handling - ExceptionsType = ExceptionHandling::DwarfTable; + ExceptionsType = ExceptionHandling::DwarfCFI; // OpenBSD has buggy support for .quad in 32-bit mode, just split into two // .words. diff --git a/lib/Target/X86/X86MCAsmInfo.h b/lib/Target/X86/X86MCAsmInfo.h index 5815225..2cd4c8e 100644 --- a/lib/Target/X86/X86MCAsmInfo.h +++ b/lib/Target/X86/X86MCAsmInfo.h @@ -25,6 +25,14 @@ namespace llvm { explicit X86MCAsmInfoDarwin(const Triple &Triple); }; + struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin { + explicit X86_64MCAsmInfoDarwin(const Triple &Triple); + virtual const MCExpr * + getExprForPersonalitySymbol(const MCSymbol *Sym, + unsigned Encoding, + MCStreamer &Streamer) const; + }; + struct X86ELFMCAsmInfo : public MCAsmInfo { explicit X86ELFMCAsmInfo(const Triple &Triple); virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const; diff --git a/lib/Target/X86/X86MCCodeEmitter.cpp b/lib/Target/X86/X86MCCodeEmitter.cpp index a2bd638..55aceba 100644 --- a/lib/Target/X86/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/X86MCCodeEmitter.cpp @@ -514,7 +514,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, } // To only check operands before the memory address ones, start - // the search from the begining + // the search from the beginning if (IsDestMem) CurOp = 0; @@ -1015,7 +1015,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, } else { unsigned FixupKind; // FIXME: Is there a better way to know that we need a signed relocation? 
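All of the opcodes below take an imm32 operand that the processor sign-extends to 64 bits, so the 4-byte fixup must relocate as a signed value. A minimal sketch of how the growing opcode list could be factored into a helper; hasSignExtendedImm32 is a hypothetical name, not existing LLVM API:

    // Sketch (assumption, not part of the patch): centralize the check for
    // 64-bit instructions whose imm32 operand is sign-extended by the CPU.
    static bool hasSignExtendedImm32(unsigned Opcode) {
      switch (Opcode) {
      case X86::ADD64ri32:
      case X86::MOV64ri32:
      case X86::MOV64mi32:
      case X86::PUSH64i32:
        return true;   // imm32 becomes a signed 64-bit value at runtime.
      default:
        return false;  // Other fixups may use the unsigned 4-byte kind.
      }
    }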
- if (MI.getOpcode() == X86::MOV64ri32 || + if (MI.getOpcode() == X86::ADD64ri32 || + MI.getOpcode() == X86::MOV64ri32 || MI.getOpcode() == X86::MOV64mi32 || MI.getOpcode() == X86::PUSH64i32) FixupKind = X86::reloc_signed_4byte; diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index cbe6db2..793156f 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -355,10 +355,6 @@ ReSimplify: assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 && "LEA has segment specified!"); break; - case X86::MOVZX16rr8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rr8); break; - case X86::MOVZX16rm8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm8); break; - case X86::MOVSX16rr8: LowerSubReg32_Op0(OutMI, X86::MOVSX32rr8); break; - case X86::MOVSX16rm8: LowerSubReg32_Op0(OutMI, X86::MOVSX32rm8); break; case X86::MOVZX64rr32: LowerSubReg32_Op0(OutMI, X86::MOV32rr); break; case X86::MOVZX64rm32: LowerSubReg32_Op0(OutMI, X86::MOV32rm); break; case X86::MOV64ri64i32: LowerSubReg32_Op0(OutMI, X86::MOV32ri); break; diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 1f464f4..1ad6203 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -73,29 +73,61 @@ X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm, } } -/// getDwarfRegNum - This function maps LLVM register identifiers to the DWARF -/// specific numbering, used in debug info and exception tables. -int X86RegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const { - const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); - unsigned Flavour = DWARFFlavour::X86_64; - +static unsigned getFlavour(const X86Subtarget *Subtarget, bool isEH) { if (!Subtarget->is64Bit()) { if (Subtarget->isTargetDarwin()) { if (isEH) - Flavour = DWARFFlavour::X86_32_DarwinEH; + return DWARFFlavour::X86_32_DarwinEH; else - Flavour = DWARFFlavour::X86_32_Generic; + return DWARFFlavour::X86_32_Generic; } else if (Subtarget->isTargetCygMing()) { // Unsupported by now, just quick fallback - Flavour = DWARFFlavour::X86_32_Generic; + return DWARFFlavour::X86_32_Generic; } else { - Flavour = DWARFFlavour::X86_32_Generic; + return DWARFFlavour::X86_32_Generic; } } + return DWARFFlavour::X86_64; +} + +/// getDwarfRegNum - This function maps LLVM register identifiers to the DWARF +/// specific numbering, used in debug info and exception tables. +int X86RegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const { + const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); + unsigned Flavour = getFlavour(Subtarget, isEH); return X86GenRegisterInfo::getDwarfRegNumFull(RegNo, Flavour); } +/// getLLVMRegNum - This function maps DWARF register numbers to LLVM register. 
+int X86RegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const { + const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); + unsigned Flavour = getFlavour(Subtarget, isEH); + + return X86GenRegisterInfo::getLLVMRegNumFull(DwarfRegNo, Flavour); +} + +int +X86RegisterInfo::getSEHRegNum(unsigned i) const { + int reg = getX86RegNum(i); + switch (i) { + case X86::R8: case X86::R8D: case X86::R8W: case X86::R8B: + case X86::R9: case X86::R9D: case X86::R9W: case X86::R9B: + case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B: + case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B: + case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B: + case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B: + case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B: + case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B: + case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11: + case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15: + case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11: + case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15: + reg += 8; + } + return reg; +} + /// getX86RegNum - This function maps LLVM register identifiers to their X86 /// specific numbering, which is used in various places encoding instructions. unsigned X86RegisterInfo::getX86RegNum(unsigned RegNo) { @@ -229,19 +261,13 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, } break; case X86::sub_8bit_hi: - if (B == &X86::GR8_ABCD_HRegClass) { - if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass || - A == &X86::GR64_NOREXRegClass || - A == &X86::GR64_NOSPRegClass || - A == &X86::GR64_NOREX_NOSPRegClass) - return &X86::GR64_ABCDRegClass; - else if (A == &X86::GR32RegClass || A == &X86::GR32_ABCDRegClass || - A == &X86::GR32_NOREXRegClass || A == &X86::GR32_NOSPRegClass) - return &X86::GR32_ABCDRegClass; - else if (A == &X86::GR16RegClass || A == &X86::GR16_ABCDRegClass || - A == &X86::GR16_NOREXRegClass) - return &X86::GR16_ABCDRegClass; - } + if (B->hasSubClassEq(&X86::GR8_ABCD_HRegClass)) + switch (A->getSize()) { + case 2: return getCommonSubClass(A, &X86::GR16_ABCDRegClass); + case 4: return getCommonSubClass(A, &X86::GR32_ABCDRegClass); + case 8: return getCommonSubClass(A, &X86::GR64_ABCDRegClass); + default: return 0; + } break; case X86::sub_16bit: if (B == &X86::GR16RegClass) { @@ -285,9 +311,16 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, A == &X86::GR64_NOREX_NOSPRegClass) return &X86::GR64_ABCDRegClass; } else if (B == &X86::GR32_NOREXRegClass) { + if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass) + return &X86::GR64_NOREXRegClass; + else if (A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass) + return &X86::GR64_NOREX_NOSPRegClass; + else if (A == &X86::GR64_ABCDRegClass) + return &X86::GR64_ABCDRegClass; + } else if (B == &X86::GR32_NOREX_NOSPRegClass) { if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass || A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass) - return &X86::GR64_NOREXRegClass; + return &X86::GR64_NOREX_NOSPRegClass; else if (A == &X86::GR64_ABCDRegClass) return &X86::GR64_ABCDRegClass; } @@ -308,6 +341,33 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, return 0; } +const TargetRegisterClass* +X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{ + const TargetRegisterClass *Super = RC; + 
TargetRegisterClass::sc_iterator I = RC->superclasses_begin(); + do { + switch (Super->getID()) { + case X86::GR8RegClassID: + case X86::GR16RegClassID: + case X86::GR32RegClassID: + case X86::GR64RegClassID: + case X86::FR32RegClassID: + case X86::FR64RegClassID: + case X86::RFP32RegClassID: + case X86::RFP64RegClassID: + case X86::RFP80RegClassID: + case X86::VR128RegClassID: + case X86::VR256RegClassID: + // Don't return a super-class that would shrink the spill size. + // That can happen with the vector and float classes. + if (Super->getSize() == RC->getSize()) + return Super; + } + Super = *I++; + } while (Super); + return RC; +} + const TargetRegisterClass * X86RegisterInfo::getPointerRegClass(unsigned Kind) const { switch (Kind) { @@ -446,6 +506,34 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(X86::ST5); Reserved.set(X86::ST6); Reserved.set(X86::ST7); + + // Mark the segment registers as reserved. + Reserved.set(X86::CS); + Reserved.set(X86::SS); + Reserved.set(X86::DS); + Reserved.set(X86::ES); + Reserved.set(X86::FS); + Reserved.set(X86::GS); + + // Reserve the registers that only exist in 64-bit mode. + if (!Is64Bit) { + for (unsigned n = 0; n != 8; ++n) { + const unsigned GPR64[] = { + X86::R8, X86::R9, X86::R10, X86::R11, + X86::R12, X86::R13, X86::R14, X86::R15 + }; + for (const unsigned *AI = getOverlaps(GPR64[n]); unsigned Reg = *AI; + ++AI) + Reserved.set(Reg); + + // XMM8, XMM9, ... + assert(X86::XMM15 == X86::XMM8+7); + for (const unsigned *AI = getOverlaps(X86::XMM8 + n); unsigned Reg = *AI; + ++AI) + Reserved.set(Reg); + } + } + return Reserved; } @@ -470,7 +558,7 @@ bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { // FIXME: It's more complicated than this... if (0 && requiresRealignment && MFI->hasVarSizedObjects()) report_fatal_error( - "Stack realignment in presense of dynamic allocas is not supported"); + "Stack realignment in presence of dynamic allocas is not supported"); // If we've requested that we force align the stack do so now. if (ForceStackAlign) diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index cccddfa..dd3d3dc 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -80,6 +80,10 @@ public: /// getDwarfRegNum - allows modification of X86GenRegisterInfo::getDwarfRegNum /// (created by TableGen) for target dependencies. int getDwarfRegNum(unsigned RegNum, bool isEH) const; + int getLLVMRegNum(unsigned RegNum, bool isEH) const; + + // FIXME: This should be tablegen'd like getDwarfRegNum is + int getSEHRegNum(unsigned i) const; /// Code Generation virtual methods... /// @@ -91,6 +95,9 @@ public: getMatchingSuperRegClass(const TargetRegisterClass *A, const TargetRegisterClass *B, unsigned Idx) const; + const TargetRegisterClass* + getLargestLegalSuperClass(const TargetRegisterClass *RC) const; + /// getPointerRegClass - Returns a TargetRegisterClass used for pointer /// values. 
const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const; diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index 612fac2..590b38b 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -41,80 +41,83 @@ let Namespace = "X86" in { // 8-bit registers // Low registers - def AL : Register<"al">, DwarfRegNum<[0, 0, 0]>; - def DL : Register<"dl">, DwarfRegNum<[1, 2, 2]>; - def CL : Register<"cl">, DwarfRegNum<[2, 1, 1]>; - def BL : Register<"bl">, DwarfRegNum<[3, 3, 3]>; - - // X86-64 only - def SIL : Register<"sil">, DwarfRegNum<[4, 6, 6]>; - def DIL : Register<"dil">, DwarfRegNum<[5, 7, 7]>; - def BPL : Register<"bpl">, DwarfRegNum<[6, 4, 5]>; - def SPL : Register<"spl">, DwarfRegNum<[7, 5, 4]>; - def R8B : Register<"r8b">, DwarfRegNum<[8, -2, -2]>; - def R9B : Register<"r9b">, DwarfRegNum<[9, -2, -2]>; - def R10B : Register<"r10b">, DwarfRegNum<[10, -2, -2]>; - def R11B : Register<"r11b">, DwarfRegNum<[11, -2, -2]>; - def R12B : Register<"r12b">, DwarfRegNum<[12, -2, -2]>; - def R13B : Register<"r13b">, DwarfRegNum<[13, -2, -2]>; - def R14B : Register<"r14b">, DwarfRegNum<[14, -2, -2]>; - def R15B : Register<"r15b">, DwarfRegNum<[15, -2, -2]>; + def AL : Register<"al">; + def DL : Register<"dl">; + def CL : Register<"cl">; + def BL : Register<"bl">; + + // X86-64 only, requires REX. + let CostPerUse = 1 in { + def SIL : Register<"sil">; + def DIL : Register<"dil">; + def BPL : Register<"bpl">; + def SPL : Register<"spl">; + def R8B : Register<"r8b">; + def R9B : Register<"r9b">; + def R10B : Register<"r10b">; + def R11B : Register<"r11b">; + def R12B : Register<"r12b">; + def R13B : Register<"r13b">; + def R14B : Register<"r14b">; + def R15B : Register<"r15b">; + } // High registers. On x86-64, these cannot be used in any instruction // with a REX prefix. 
- def AH : Register<"ah">, DwarfRegNum<[0, 0, 0]>; - def DH : Register<"dh">, DwarfRegNum<[1, 2, 2]>; - def CH : Register<"ch">, DwarfRegNum<[2, 1, 1]>; - def BH : Register<"bh">, DwarfRegNum<[3, 3, 3]>; + def AH : Register<"ah">; + def DH : Register<"dh">; + def CH : Register<"ch">; + def BH : Register<"bh">; // 16-bit registers let SubRegIndices = [sub_8bit, sub_8bit_hi] in { - def AX : RegisterWithSubRegs<"ax", [AL,AH]>, DwarfRegNum<[0, 0, 0]>; - def DX : RegisterWithSubRegs<"dx", [DL,DH]>, DwarfRegNum<[1, 2, 2]>; - def CX : RegisterWithSubRegs<"cx", [CL,CH]>, DwarfRegNum<[2, 1, 1]>; - def BX : RegisterWithSubRegs<"bx", [BL,BH]>, DwarfRegNum<[3, 3, 3]>; + def AX : RegisterWithSubRegs<"ax", [AL,AH]>; + def DX : RegisterWithSubRegs<"dx", [DL,DH]>; + def CX : RegisterWithSubRegs<"cx", [CL,CH]>; + def BX : RegisterWithSubRegs<"bx", [BL,BH]>; } let SubRegIndices = [sub_8bit] in { - def SI : RegisterWithSubRegs<"si", [SIL]>, DwarfRegNum<[4, 6, 6]>; - def DI : RegisterWithSubRegs<"di", [DIL]>, DwarfRegNum<[5, 7, 7]>; - def BP : RegisterWithSubRegs<"bp", [BPL]>, DwarfRegNum<[6, 4, 5]>; - def SP : RegisterWithSubRegs<"sp", [SPL]>, DwarfRegNum<[7, 5, 4]>; + def SI : RegisterWithSubRegs<"si", [SIL]>; + def DI : RegisterWithSubRegs<"di", [DIL]>; + def BP : RegisterWithSubRegs<"bp", [BPL]>; + def SP : RegisterWithSubRegs<"sp", [SPL]>; } - def IP : Register<"ip">, DwarfRegNum<[16]>; - - // X86-64 only - let SubRegIndices = [sub_8bit] in { - def R8W : RegisterWithSubRegs<"r8w", [R8B]>, DwarfRegNum<[8, -2, -2]>; - def R9W : RegisterWithSubRegs<"r9w", [R9B]>, DwarfRegNum<[9, -2, -2]>; - def R10W : RegisterWithSubRegs<"r10w", [R10B]>, DwarfRegNum<[10, -2, -2]>; - def R11W : RegisterWithSubRegs<"r11w", [R11B]>, DwarfRegNum<[11, -2, -2]>; - def R12W : RegisterWithSubRegs<"r12w", [R12B]>, DwarfRegNum<[12, -2, -2]>; - def R13W : RegisterWithSubRegs<"r13w", [R13B]>, DwarfRegNum<[13, -2, -2]>; - def R14W : RegisterWithSubRegs<"r14w", [R14B]>, DwarfRegNum<[14, -2, -2]>; - def R15W : RegisterWithSubRegs<"r15w", [R15B]>, DwarfRegNum<[15, -2, -2]>; + def IP : Register<"ip">; + + // X86-64 only, requires REX. 
+ let SubRegIndices = [sub_8bit], CostPerUse = 1 in { + def R8W : RegisterWithSubRegs<"r8w", [R8B]>; + def R9W : RegisterWithSubRegs<"r9w", [R9B]>; + def R10W : RegisterWithSubRegs<"r10w", [R10B]>; + def R11W : RegisterWithSubRegs<"r11w", [R11B]>; + def R12W : RegisterWithSubRegs<"r12w", [R12B]>; + def R13W : RegisterWithSubRegs<"r13w", [R13B]>; + def R14W : RegisterWithSubRegs<"r14w", [R14B]>; + def R15W : RegisterWithSubRegs<"r15w", [R15B]>; } // 32-bit registers let SubRegIndices = [sub_16bit] in { - def EAX : RegisterWithSubRegs<"eax", [AX]>, DwarfRegNum<[0, 0, 0]>; - def EDX : RegisterWithSubRegs<"edx", [DX]>, DwarfRegNum<[1, 2, 2]>; - def ECX : RegisterWithSubRegs<"ecx", [CX]>, DwarfRegNum<[2, 1, 1]>; - def EBX : RegisterWithSubRegs<"ebx", [BX]>, DwarfRegNum<[3, 3, 3]>; - def ESI : RegisterWithSubRegs<"esi", [SI]>, DwarfRegNum<[4, 6, 6]>; - def EDI : RegisterWithSubRegs<"edi", [DI]>, DwarfRegNum<[5, 7, 7]>; - def EBP : RegisterWithSubRegs<"ebp", [BP]>, DwarfRegNum<[6, 4, 5]>; - def ESP : RegisterWithSubRegs<"esp", [SP]>, DwarfRegNum<[7, 5, 4]>; - def EIP : RegisterWithSubRegs<"eip", [IP]>, DwarfRegNum<[16, 8, 8]>; - - // X86-64 only - def R8D : RegisterWithSubRegs<"r8d", [R8W]>, DwarfRegNum<[8, -2, -2]>; - def R9D : RegisterWithSubRegs<"r9d", [R9W]>, DwarfRegNum<[9, -2, -2]>; - def R10D : RegisterWithSubRegs<"r10d", [R10W]>, DwarfRegNum<[10, -2, -2]>; - def R11D : RegisterWithSubRegs<"r11d", [R11W]>, DwarfRegNum<[11, -2, -2]>; - def R12D : RegisterWithSubRegs<"r12d", [R12W]>, DwarfRegNum<[12, -2, -2]>; - def R13D : RegisterWithSubRegs<"r13d", [R13W]>, DwarfRegNum<[13, -2, -2]>; - def R14D : RegisterWithSubRegs<"r14d", [R14W]>, DwarfRegNum<[14, -2, -2]>; - def R15D : RegisterWithSubRegs<"r15d", [R15W]>, DwarfRegNum<[15, -2, -2]>; - } + def EAX : RegisterWithSubRegs<"eax", [AX]>, DwarfRegNum<[-2, 0, 0]>; + def EDX : RegisterWithSubRegs<"edx", [DX]>, DwarfRegNum<[-2, 2, 2]>; + def ECX : RegisterWithSubRegs<"ecx", [CX]>, DwarfRegNum<[-2, 1, 1]>; + def EBX : RegisterWithSubRegs<"ebx", [BX]>, DwarfRegNum<[-2, 3, 3]>; + def ESI : RegisterWithSubRegs<"esi", [SI]>, DwarfRegNum<[-2, 6, 6]>; + def EDI : RegisterWithSubRegs<"edi", [DI]>, DwarfRegNum<[-2, 7, 7]>; + def EBP : RegisterWithSubRegs<"ebp", [BP]>, DwarfRegNum<[-2, 4, 5]>; + def ESP : RegisterWithSubRegs<"esp", [SP]>, DwarfRegNum<[-2, 5, 4]>; + def EIP : RegisterWithSubRegs<"eip", [IP]>, DwarfRegNum<[-2, 8, 8]>; + + // X86-64 only, requires REX + let CostPerUse = 1 in { + def R8D : RegisterWithSubRegs<"r8d", [R8W]>; + def R9D : RegisterWithSubRegs<"r9d", [R9W]>; + def R10D : RegisterWithSubRegs<"r10d", [R10W]>; + def R11D : RegisterWithSubRegs<"r11d", [R11W]>; + def R12D : RegisterWithSubRegs<"r12d", [R12W]>; + def R13D : RegisterWithSubRegs<"r13d", [R13W]>; + def R14D : RegisterWithSubRegs<"r14d", [R14W]>; + def R15D : RegisterWithSubRegs<"r15d", [R15W]>; + }} // 64-bit registers, X86-64 only let SubRegIndices = [sub_32bit] in { @@ -127,6 +130,8 @@ let Namespace = "X86" in { def RBP : RegisterWithSubRegs<"rbp", [EBP]>, DwarfRegNum<[6, -2, -2]>; def RSP : RegisterWithSubRegs<"rsp", [ESP]>, DwarfRegNum<[7, -2, -2]>; + // These also require REX. 
+  let CostPerUse = 1 in {
   def R8 : RegisterWithSubRegs<"r8", [R8D]>, DwarfRegNum<[8, -2, -2]>;
   def R9 : RegisterWithSubRegs<"r9", [R9D]>, DwarfRegNum<[9, -2, -2]>;
   def R10 : RegisterWithSubRegs<"r10", [R10D]>, DwarfRegNum<[10, -2, -2]>;
@@ -136,7 +141,7 @@ let Namespace = "X86" in {
   def R14 : RegisterWithSubRegs<"r14", [R14D]>, DwarfRegNum<[14, -2, -2]>;
   def R15 : RegisterWithSubRegs<"r15", [R15D]>, DwarfRegNum<[15, -2, -2]>;
   def RIP : RegisterWithSubRegs<"rip", [EIP]>, DwarfRegNum<[16, -2, -2]>;
-  }
+  }}

   // MMX Registers. These are actually aliased to ST0 .. ST7
   def MM0 : Register<"mm0">, DwarfRegNum<[41, 29, 29]>;
@@ -170,6 +175,7 @@ let Namespace = "X86" in {
   def XMM7: Register<"xmm7">, DwarfRegNum<[24, 28, 28]>;

   // X86-64 only
+  let CostPerUse = 1 in {
   def XMM8: Register<"xmm8">, DwarfRegNum<[25, -2, -2]>;
   def XMM9: Register<"xmm9">, DwarfRegNum<[26, -2, -2]>;
   def XMM10: Register<"xmm10">, DwarfRegNum<[27, -2, -2]>;
@@ -178,26 +184,26 @@ let Namespace = "X86" in {
   def XMM13: Register<"xmm13">, DwarfRegNum<[30, -2, -2]>;
   def XMM14: Register<"xmm14">, DwarfRegNum<[31, -2, -2]>;
   def XMM15: Register<"xmm15">, DwarfRegNum<[32, -2, -2]>;
-  }
+  }}

   // YMM Registers, used by AVX instructions
   let SubRegIndices = [sub_xmm] in {
-  def YMM0: RegisterWithSubRegs<"ymm0", [XMM0]>, DwarfRegNum<[17, 21, 21]>;
-  def YMM1: RegisterWithSubRegs<"ymm1", [XMM1]>, DwarfRegNum<[18, 22, 22]>;
-  def YMM2: RegisterWithSubRegs<"ymm2", [XMM2]>, DwarfRegNum<[19, 23, 23]>;
-  def YMM3: RegisterWithSubRegs<"ymm3", [XMM3]>, DwarfRegNum<[20, 24, 24]>;
-  def YMM4: RegisterWithSubRegs<"ymm4", [XMM4]>, DwarfRegNum<[21, 25, 25]>;
-  def YMM5: RegisterWithSubRegs<"ymm5", [XMM5]>, DwarfRegNum<[22, 26, 26]>;
-  def YMM6: RegisterWithSubRegs<"ymm6", [XMM6]>, DwarfRegNum<[23, 27, 27]>;
-  def YMM7: RegisterWithSubRegs<"ymm7", [XMM7]>, DwarfRegNum<[24, 28, 28]>;
-  def YMM8: RegisterWithSubRegs<"ymm8", [XMM8]>, DwarfRegNum<[25, -2, -2]>;
-  def YMM9: RegisterWithSubRegs<"ymm9", [XMM9]>, DwarfRegNum<[26, -2, -2]>;
-  def YMM10: RegisterWithSubRegs<"ymm10", [XMM10]>, DwarfRegNum<[27, -2, -2]>;
-  def YMM11: RegisterWithSubRegs<"ymm11", [XMM11]>, DwarfRegNum<[28, -2, -2]>;
-  def YMM12: RegisterWithSubRegs<"ymm12", [XMM12]>, DwarfRegNum<[29, -2, -2]>;
-  def YMM13: RegisterWithSubRegs<"ymm13", [XMM13]>, DwarfRegNum<[30, -2, -2]>;
-  def YMM14: RegisterWithSubRegs<"ymm14", [XMM14]>, DwarfRegNum<[31, -2, -2]>;
-  def YMM15: RegisterWithSubRegs<"ymm15", [XMM15]>, DwarfRegNum<[32, -2, -2]>;
+  def YMM0: RegisterWithSubRegs<"ymm0", [XMM0]>, DwarfRegAlias<XMM0>;
+  def YMM1: RegisterWithSubRegs<"ymm1", [XMM1]>, DwarfRegAlias<XMM1>;
+  def YMM2: RegisterWithSubRegs<"ymm2", [XMM2]>, DwarfRegAlias<XMM2>;
+  def YMM3: RegisterWithSubRegs<"ymm3", [XMM3]>, DwarfRegAlias<XMM3>;
+  def YMM4: RegisterWithSubRegs<"ymm4", [XMM4]>, DwarfRegAlias<XMM4>;
+  def YMM5: RegisterWithSubRegs<"ymm5", [XMM5]>, DwarfRegAlias<XMM5>;
+  def YMM6: RegisterWithSubRegs<"ymm6", [XMM6]>, DwarfRegAlias<XMM6>;
+  def YMM7: RegisterWithSubRegs<"ymm7", [XMM7]>, DwarfRegAlias<XMM7>;
+  def YMM8: RegisterWithSubRegs<"ymm8", [XMM8]>, DwarfRegAlias<XMM8>;
+  def YMM9: RegisterWithSubRegs<"ymm9", [XMM9]>, DwarfRegAlias<XMM9>;
+  def YMM10: RegisterWithSubRegs<"ymm10", [XMM10]>, DwarfRegAlias<XMM10>;
+  def YMM11: RegisterWithSubRegs<"ymm11", [XMM11]>, DwarfRegAlias<XMM11>;
+  def YMM12: RegisterWithSubRegs<"ymm12", [XMM12]>, DwarfRegAlias<XMM12>;
+  def YMM13: RegisterWithSubRegs<"ymm13", [XMM13]>, DwarfRegAlias<XMM13>;
+  def YMM14: RegisterWithSubRegs<"ymm14", [XMM14]>, DwarfRegAlias<XMM14>;
+  def YMM15: RegisterWithSubRegs<"ymm15", [XMM15]>, DwarfRegAlias<XMM15>;
   }

   // Floating point stack registers
@@ -273,8 +279,8 @@ let Namespace = "X86" in {
 // require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d"
 // cannot be encoded.
 def GR8 : RegisterClass<"X86", [i8], 8,
-                        [AL, CL, DL, AH, CH, DH, BL, BH, SIL, DIL, BPL, SPL,
-                         R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B]> {
+                        (add AL, CL, DL, AH, CH, DH, BL, BH, SIL, DIL, BPL, SPL,
+                         R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B)> {
   let MethodProtos = [{
     iterator allocation_order_begin(const MachineFunction &MF) const;
     iterator allocation_order_end(const MachineFunction &MF) const;
@@ -317,152 +323,38 @@ def GR8 : RegisterClass<"X86", [i8], 8,
 }

 def GR16 : RegisterClass<"X86", [i16], 16,
-                         [AX, CX, DX, SI, DI, BX, BP, SP,
-                          R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W]> {
+                         (add AX, CX, DX, SI, DI, BX, BP, SP,
+                          R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)> {
   let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi)];
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned X86_GR16_AO_64[] = {
-      X86::AX, X86::CX, X86::DX, X86::SI, X86::DI,
-      X86::R8W, X86::R9W, X86::R10W, X86::R11W,
-      X86::BX, X86::R14W, X86::R15W, X86::R12W, X86::R13W, X86::BP
-    };
-
-    GR16Class::iterator
-    GR16Class::allocation_order_begin(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      if (Subtarget.is64Bit())
-        return X86_GR16_AO_64;
-      else
-        return begin();
-    }
-
-    GR16Class::iterator
-    GR16Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
-      if (Subtarget.is64Bit()) {
-        // Does the function dedicate RBP to being a frame ptr?
-        if (TFI->hasFP(MF) || MFI->getReserveFP())
-          // If so, don't allocate SP or BP.
-          return array_endof(X86_GR16_AO_64) - 1;
-        else
-          // If not, just don't allocate SP.
-          return array_endof(X86_GR16_AO_64);
-      } else {
-        // Does the function dedicate EBP to being a frame ptr?
-        if (TFI->hasFP(MF) || MFI->getReserveFP())
-          // If so, don't allocate SP or BP.
-          return begin() + 6;
-        else
-          // If not, just don't allocate SP.
- return begin() + 7; - } - } - }]; } def GR32 : RegisterClass<"X86", [i32], 32, - [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP, - R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D]> { + (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP, + R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)> { let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)]; - let MethodProtos = [{ - iterator allocation_order_begin(const MachineFunction &MF) const; - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - static const unsigned X86_GR32_AO_64[] = { - X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, - X86::R8D, X86::R9D, X86::R10D, X86::R11D, - X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP - }; - - GR32Class::iterator - GR32Class::allocation_order_begin(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - if (Subtarget.is64Bit()) - return X86_GR32_AO_64; - else - return begin(); - } - - GR32Class::iterator - GR32Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - if (Subtarget.is64Bit()) { - // Does the function dedicate RBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate ESP or EBP. - return array_endof(X86_GR32_AO_64) - 1; - else - // If not, just don't allocate ESP. - return array_endof(X86_GR32_AO_64); - } else { - // Does the function dedicate EBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate ESP or EBP. - return begin() + 6; - else - // If not, just don't allocate ESP. - return begin() + 7; - } - } - }]; } // GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since // RIP isn't really a register and it can't be used anywhere except in an // address, but it doesn't cause trouble. def GR64 : RegisterClass<"X86", [i64], 64, - [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, - RBX, R14, R15, R12, R13, RBP, RSP, RIP]> { + (add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, + RBX, R14, R15, R12, R13, RBP, RSP, RIP)> { let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit), (GR32 sub_32bit)]; - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GR64Class::iterator - GR64Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - if (!Subtarget.is64Bit()) - return begin(); // None of these are allocatable in 32-bit. - // Does the function dedicate RBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - return end()-3; // If so, don't allocate RIP, RSP or RBP - else - return end()-2; // If not, just don't allocate RIP or RSP - } - }]; } // Segment registers for use by MOV instructions (and others) that have a // segment register as one operand. Always contain a 16-bit segment // descriptor. -def SEGMENT_REG : RegisterClass<"X86", [i16], 16, [CS, DS, SS, ES, FS, GS]>; +def SEGMENT_REG : RegisterClass<"X86", [i16], 16, (add CS, DS, SS, ES, FS, GS)>; // Debug registers. 
-def DEBUG_REG : RegisterClass<"X86", [i32], 32, - [DR0, DR1, DR2, DR3, DR4, DR5, DR6, DR7]>; +def DEBUG_REG : RegisterClass<"X86", [i32], 32, (sequence "DR%u", 0, 7)>; // Control registers. -def CONTROL_REG : RegisterClass<"X86", [i64], 64, - [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7, CR8, - CR9, CR10, CR11, CR12, CR13, CR14, CR15]>; +def CONTROL_REG : RegisterClass<"X86", [i64], 64, (sequence "CR%u", 0, 15)>; // GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD, GR64_ABCD - Subclasses of // GR8, GR16, GR32, and GR64 which contain just the "a" "b", "c", and "d" @@ -470,38 +362,38 @@ def CONTROL_REG : RegisterClass<"X86", [i64], 64, // that support 8-bit subreg operations. On x86-64, GR16_ABCD, GR32_ABCD, // and GR64_ABCD are classes for registers that support 8-bit h-register // operations. -def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, [AL, CL, DL, BL]>; -def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, [AH, CH, DH, BH]>; -def GR16_ABCD : RegisterClass<"X86", [i16], 16, [AX, CX, DX, BX]> { +def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, BL)>; +def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, (add AH, CH, DH, BH)>; +def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)> { let SubRegClasses = [(GR8_ABCD_L sub_8bit), (GR8_ABCD_H sub_8bit_hi)]; } -def GR32_ABCD : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX, EBX]> { +def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)> { let SubRegClasses = [(GR8_ABCD_L sub_8bit), (GR8_ABCD_H sub_8bit_hi), (GR16_ABCD sub_16bit)]; } -def GR64_ABCD : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX, RBX]> { +def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)> { let SubRegClasses = [(GR8_ABCD_L sub_8bit), (GR8_ABCD_H sub_8bit_hi), (GR16_ABCD sub_16bit), (GR32_ABCD sub_32bit)]; } -def GR32_TC : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX]> { +def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)> { let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)]; } -def GR64_TC : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX, RSI, RDI, - R8, R9, R11]> { +def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, + R8, R9, R11, RIP)> { let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit), (GR32_TC sub_32bit)]; } -def GR64_TCW64 : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX, - R8, R9, R11]>; +def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, + R8, R9, R11)>; // GR8_NOREX - GR8 registers which do not require a REX prefix. def GR8_NOREX : RegisterClass<"X86", [i8], 8, - [AL, CL, DL, AH, CH, DH, BL, BH]> { + (add AL, CL, DL, AH, CH, DH, BL, BH)> { let MethodProtos = [{ iterator allocation_order_begin(const MachineFunction &MF) const; iterator allocation_order_end(const MachineFunction &MF) const; @@ -535,232 +427,62 @@ def GR8_NOREX : RegisterClass<"X86", [i8], 8, } // GR16_NOREX - GR16 registers which do not require a REX prefix. 
def GR16_NOREX : RegisterClass<"X86", [i16], 16, - [AX, CX, DX, SI, DI, BX, BP, SP]> { + (add AX, CX, DX, SI, DI, BX, BP, SP)> { let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi)]; - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GR16_NOREXClass::iterator - GR16_NOREXClass::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - // Does the function dedicate RBP / EBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate SP or BP. - return end() - 2; - else - // If not, just don't allocate SP. - return end() - 1; - } - }]; } // GR32_NOREX - GR32 registers which do not require a REX prefix. def GR32_NOREX : RegisterClass<"X86", [i32], 32, - [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP]> { + (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)> { let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), (GR16_NOREX sub_16bit)]; - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GR32_NOREXClass::iterator - GR32_NOREXClass::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - // Does the function dedicate RBP / EBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate ESP or EBP. - return end() - 2; - else - // If not, just don't allocate ESP. - return end() - 1; - } - }]; } // GR64_NOREX - GR64 registers which do not require a REX prefix. def GR64_NOREX : RegisterClass<"X86", [i64], 64, - [RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP]> { + (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)> { let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), (GR16_NOREX sub_16bit), (GR32_NOREX sub_32bit)]; - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GR64_NOREXClass::iterator - GR64_NOREXClass::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - // Does the function dedicate RBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate RIP, RSP or RBP. - return end() - 3; - else - // If not, just don't allocate RIP or RSP. - return end() - 2; - } - }]; } // GR32_NOSP - GR32 registers except ESP. 
-def GR32_NOSP : RegisterClass<"X86", [i32], 32, - [EAX, ECX, EDX, ESI, EDI, EBX, EBP, - R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D]> { +def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)> { let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)]; - let MethodProtos = [{ - iterator allocation_order_begin(const MachineFunction &MF) const; - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - static const unsigned X86_GR32_NOSP_AO_64[] = { - X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, - X86::R8D, X86::R9D, X86::R10D, X86::R11D, - X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP - }; - - GR32_NOSPClass::iterator - GR32_NOSPClass::allocation_order_begin(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - if (Subtarget.is64Bit()) - return X86_GR32_NOSP_AO_64; - else - return begin(); - } - - GR32_NOSPClass::iterator - GR32_NOSPClass::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - if (Subtarget.is64Bit()) { - // Does the function dedicate RBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate EBP. - return array_endof(X86_GR32_NOSP_AO_64) - 1; - else - // If not, any reg in this class is ok. - return array_endof(X86_GR32_NOSP_AO_64); - } else { - // Does the function dedicate EBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate EBP. - return begin() + 6; - else - // If not, any reg in this class is ok. - return begin() + 7; - } - } - }]; } // GR64_NOSP - GR64 registers except RSP (and RIP). -def GR64_NOSP : RegisterClass<"X86", [i64], 64, - [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, - RBX, R14, R15, R12, R13, RBP]> { +def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)> { let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit), (GR32_NOSP sub_32bit)]; - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GR64_NOSPClass::iterator - GR64_NOSPClass::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - if (!Subtarget.is64Bit()) - return begin(); // None of these are allocatable in 32-bit. - // Does the function dedicate RBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - return end()-1; // If so, don't allocate RBP - else - return end(); // If not, any reg in this class is ok. - } - }]; +} + +// GR32_NOREX_NOSP - GR32 registers which do not require a REX prefix except +// ESP. +def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32, + (and GR32_NOREX, GR32_NOSP)> { + let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), + (GR16_NOREX sub_16bit)]; } // GR64_NOREX_NOSP - GR64_NOREX registers except RSP. 
def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64, - [RAX, RCX, RDX, RSI, RDI, RBX, RBP]> { + (and GR64_NOREX, GR64_NOSP)> { let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), (GR16_NOREX sub_16bit), - (GR32_NOREX sub_32bit)]; - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GR64_NOREX_NOSPClass::iterator - GR64_NOREX_NOSPClass::allocation_order_end(const MachineFunction &MF) const - { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - // Does the function dedicate RBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate RBP. - return end() - 1; - else - // If not, any reg in this class is ok. - return end(); - } - }]; + (GR32_NOREX_NOSP sub_32bit)]; } // A class to support the 'A' assembler constraint: EAX then EDX. -def GR32_AD : RegisterClass<"X86", [i32], 32, [EAX, EDX]> { +def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)> { let SubRegClasses = [(GR8_ABCD_L sub_8bit), (GR8_ABCD_H sub_8bit_hi), (GR16_ABCD sub_16bit)]; } // Scalar SSE2 floating point registers. -def FR32 : RegisterClass<"X86", [f32], 32, - [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, - XMM8, XMM9, XMM10, XMM11, - XMM12, XMM13, XMM14, XMM15]> { - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - FR32Class::iterator - FR32Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - if (!Subtarget.is64Bit()) - return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode. - else - return end(); - } - }]; -} +def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>; -def FR64 : RegisterClass<"X86", [f64], 64, - [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, - XMM8, XMM9, XMM10, XMM11, - XMM12, XMM13, XMM14, XMM15]> { - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - FR64Class::iterator - FR64Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - if (!Subtarget.is64Bit()) - return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode. - else - return end(); - } - }]; -} +def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>; // FIXME: This sets up the floating point register files as though they are f64 @@ -769,85 +491,31 @@ def FR64 : RegisterClass<"X86", [f64], 64, // faster on common hardware. In reality, this should be controlled by a // command line option or something. 
-def RFP32 : RegisterClass<"X86",[f32], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>; -def RFP64 : RegisterClass<"X86",[f64], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>; -def RFP80 : RegisterClass<"X86",[f80], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>; +def RFP32 : RegisterClass<"X86",[f32], 32, (sequence "FP%u", 0, 6)>; +def RFP64 : RegisterClass<"X86",[f64], 32, (add RFP32)>; +def RFP80 : RegisterClass<"X86",[f80], 32, (add RFP32)>; // Floating point stack registers (these are not allocatable by the // register allocator - the floating point stackifier is responsible // for transforming FPn allocations to STn registers) -def RST : RegisterClass<"X86", [f80, f64, f32], 32, - [ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7]> { - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - RSTClass::iterator - RSTClass::allocation_order_end(const MachineFunction &MF) const { - return begin(); - } - }]; +def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> { + let isAllocatable = 0; } // Generic vector registers: VR64 and VR128. -def VR64: RegisterClass<"X86", [x86mmx], 64, - [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7]>; -def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128, - [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, - XMM8, XMM9, XMM10, XMM11, - XMM12, XMM13, XMM14, XMM15]> { +def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>; +def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + 128, (add FR32)> { let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd)]; - - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - VR128Class::iterator - VR128Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - if (!Subtarget.is64Bit()) - return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode. - else - return end(); - } - }]; } def VR256 : RegisterClass<"X86", [v32i8, v8i32, v4i64, v8f32, v4f64], 256, - [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, - YMM8, YMM9, YMM10, YMM11, - YMM12, YMM13, YMM14, YMM15]> { + (sequence "YMM%u", 0, 15)> { let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd), (VR128 sub_xmm)]; - - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - VR256Class::iterator - VR256Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - if (!Subtarget.is64Bit()) - return end()-8; // Only YMM0 to YMM7 are available in 32-bit mode. - else - return end(); - } - }]; } // Status flags registers. -def CCR : RegisterClass<"X86", [i32], 32, [EFLAGS]> { +def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> { let CopyCost = -1; // Don't allow copying of status registers. - - // EFLAGS is not allocatable. 
- let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - CCRClass::iterator - CCRClass::allocation_order_end(const MachineFunction &MF) const { - return allocation_order_begin(MF); - } - }]; + let isAllocatable = 0; } diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 42e8193..02754f9 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -178,7 +178,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, bool isVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { - // This requires the copy size to be a constant, preferrably + // This requires the copy size to be a constant, preferably // within a subtarget-specific limit. ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); if (!ConstantSize) diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 1ee7312..481e821 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -144,7 +144,8 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { /// passed as the second argument. Otherwise it returns null. const char *X86Subtarget::getBZeroEntry() const { // Darwin 10 has a __bzero entry point for this purpose. - if (getDarwinVers() >= 10) + if (getTargetTriple().isMacOSX() && + !getTargetTriple().isMacOSXVersionLT(10, 6)) return "__bzero"; return 0; @@ -264,6 +265,7 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { HasCLMUL = IsIntel && ((ECX >> 1) & 0x1); HasFMA3 = IsIntel && ((ECX >> 12) & 0x1); + HasPOPCNT = IsIntel && ((ECX >> 23) & 0x1); HasAES = IsIntel && ((ECX >> 25) & 0x1); if (IsIntel || IsAMD) { diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 0a62a02..286a798 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -165,9 +165,15 @@ public: bool isUnalignedMemAccessFast() const { return IsUAMemFast; } bool hasVectorUAMem() const { return HasVectorUAMem; } - bool isTargetDarwin() const { return TargetTriple.getOS() == Triple::Darwin; } - bool isTargetFreeBSD() const { return TargetTriple.getOS() == Triple::FreeBSD; } - bool isTargetSolaris() const { return TargetTriple.getOS() == Triple::Solaris; } + const Triple &getTargetTriple() const { return TargetTriple; } + + bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } + bool isTargetFreeBSD() const { + return TargetTriple.getOS() == Triple::FreeBSD; + } + bool isTargetSolaris() const { + return TargetTriple.getOS() == Triple::Solaris; + } // ELF is a reasonably sane default and the only other X86 targets we // support are Darwin and Windows. Just use "not those". @@ -215,13 +221,6 @@ public: return PICStyle == PICStyles::StubDynamicNoPIC || PICStyle == PICStyles::StubPIC; } - /// getDarwinVers - Return the darwin version number, 8 = Tiger, 9 = Leopard, - /// 10 = Snow Leopard, etc. - unsigned getDarwinVers() const { - if (isTargetDarwin()) return TargetTriple.getDarwinMajorNumber(); - return 0; - } - /// ClassifyGlobalReference - Classify a global variable reference for the /// current subtarget according to how we should reference it in a non-pcrel /// context. 
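With getDarwinVers() removed above, Darwin version tests are expressed directly on llvm::Triple, as the __bzero hunk in X86Subtarget.cpp shows. A minimal sketch of the replacement idiom; hasDarwinBZero is an illustrative name, not part of the patch:

    #include "llvm/ADT/Triple.h"

    // Sketch: Darwin 10 corresponds to Mac OS X 10.6, so the old
    // "getDarwinVers() >= 10" test becomes an explicit 10.6 check.
    static bool hasDarwinBZero(const llvm::Triple &T) {
      return T.isMacOSX() && !T.isMacOSXVersionLT(10, 6);
    }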
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 8fb9470..7483329 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -26,19 +26,18 @@ using namespace llvm; static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) { Triple TheTriple(TT); - switch (TheTriple.getOS()) { - case Triple::Darwin: - return new X86MCAsmInfoDarwin(TheTriple); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: - if (TheTriple.getEnvironment() == Triple::MachO) - return new X86MCAsmInfoDarwin(TheTriple); + + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) { + if (TheTriple.getArch() == Triple::x86_64) + return new X86_64MCAsmInfoDarwin(TheTriple); else - return new X86MCAsmInfoCOFF(TheTriple); - default: - return new X86ELFMCAsmInfo(TheTriple); + return new X86MCAsmInfoDarwin(TheTriple); } + + if (TheTriple.isOSWindows()) + return new X86MCAsmInfoCOFF(TheTriple); + + return new X86ELFMCAsmInfo(TheTriple); } static MCStreamer *createMCStreamer(const Target &T, const std::string &TT, @@ -48,19 +47,14 @@ static MCStreamer *createMCStreamer(const Target &T, const std::string &TT, bool RelaxAll, bool NoExecStack) { Triple TheTriple(TT); - switch (TheTriple.getOS()) { - case Triple::Darwin: + + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) return createMachOStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: - if (TheTriple.getEnvironment() == Triple::MachO) - return createMachOStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll); - else - return createWinCOFFStreamer(Ctx, TAB, *_Emitter, _OS, RelaxAll); - default: - return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll, NoExecStack); - } + + if (TheTriple.isOSWindows()) + return createWinCOFFStreamer(Ctx, TAB, *_Emitter, _OS, RelaxAll); + + return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll, NoExecStack); } extern "C" void LLVMInitializeX86Target() { diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index c15dfbb..1231798 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -38,6 +38,12 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, getExprForDwarfGlobalReference(GV, Mang, MMI, Encoding, Streamer); } +MCSymbol *X8664_MachoTargetObjectFile:: +getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang, + MachineModuleInfo *MMI) const { + return Mang->getSymbol(GV); +} + unsigned X8632_ELFTargetObjectFile::getPersonalityEncoding() const { if (TM.getRelocationModel() == Reloc::PIC_) return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4; @@ -52,7 +58,7 @@ unsigned X8632_ELFTargetObjectFile::getLSDAEncoding() const { return DW_EH_PE_absptr; } -unsigned X8632_ELFTargetObjectFile::getFDEEncoding() const { +unsigned X8632_ELFTargetObjectFile::getFDEEncoding(bool FDE) const { if (TM.getRelocationModel() == Reloc::PIC_) return DW_EH_PE_pcrel | DW_EH_PE_sdata4; else @@ -91,17 +97,14 @@ unsigned X8664_ELFTargetObjectFile::getLSDAEncoding() const { return DW_EH_PE_absptr; } -unsigned X8664_ELFTargetObjectFile::getFDEEncoding() const { - CodeModel::Model Model = TM.getCodeModel(); - if (TM.getRelocationModel() == Reloc::PIC_) - return DW_EH_PE_pcrel | (Model == CodeModel::Small || - Model == CodeModel::Medium ? 
- DW_EH_PE_sdata4 : DW_EH_PE_sdata8); +unsigned X8664_ELFTargetObjectFile::getFDEEncoding(bool CFI) const { + if (CFI) + return DW_EH_PE_pcrel | DW_EH_PE_sdata4; - if (Model == CodeModel::Small || Model == CodeModel::Medium) - return DW_EH_PE_udata4; + if (TM.getRelocationModel() == Reloc::PIC_) + return DW_EH_PE_pcrel | DW_EH_PE_sdata4; - return DW_EH_PE_absptr; + return DW_EH_PE_udata4; } unsigned X8664_ELFTargetObjectFile::getTTypeEncoding() const { diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index f2fd49c..e21b5bf 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -25,6 +25,12 @@ namespace llvm { getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, MachineModuleInfo *MMI, unsigned Encoding, MCStreamer &Streamer) const; + + // getCFIPersonalitySymbol - The symbol that gets passed to + // .cfi_personality. + virtual MCSymbol * + getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang, + MachineModuleInfo *MMI) const; }; class X8632_ELFTargetObjectFile : public TargetLoweringObjectFileELF { @@ -34,7 +40,7 @@ namespace llvm { :TM(tm) { } virtual unsigned getPersonalityEncoding() const; virtual unsigned getLSDAEncoding() const; - virtual unsigned getFDEEncoding() const; + virtual unsigned getFDEEncoding(bool CFI) const; virtual unsigned getTTypeEncoding() const; }; @@ -45,7 +51,7 @@ namespace llvm { :TM(tm) { } virtual unsigned getPersonalityEncoding() const; virtual unsigned getLSDAEncoding() const; - virtual unsigned getFDEEncoding() const; + virtual unsigned getFDEEncoding(bool CFI) const; virtual unsigned getTTypeEncoding() const; }; diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp index 8ce93fd..a8dd847 100644 --- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp +++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp @@ -30,8 +30,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include <queue> -#include <set> using namespace llvm; /// XCoreDAGToDAGISel - XCore specific code to select XCore machine @@ -207,6 +205,16 @@ SDNode *XCoreDAGToDAGISel::Select(SDNode *N) { return CurDAG->getMachineNode(XCore::LMUL_l6r, dl, MVT::i32, MVT::i32, Ops, 4); } + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + switch (IntNo) { + case Intrinsic::xcore_crc8: + SDValue Ops[] = { N->getOperand(1), N->getOperand(2), N->getOperand(3) }; + return CurDAG->getMachineNode(XCore::CRC8_l4r, dl, MVT::i32, MVT::i32, + Ops, 3); + } + break; + } case ISD::BRIND: if (SDNode *ResNode = SelectBRIND(N)) return ResNode; diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp index 4817787..8cabbbf 100644 --- a/lib/Target/XCore/XCoreISelLowering.cpp +++ b/lib/Target/XCore/XCoreISelLowering.cpp @@ -37,8 +37,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/ADT/VectorExtras.h" -#include <queue> -#include <set> using namespace llvm; const char *XCoreTargetLowering:: @@ -158,6 +156,8 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM) // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::ADD); + + setMinFunctionAlignment(1); } SDValue XCoreTargetLowering:: @@ -203,12 +203,6 @@ void XCoreTargetLowering::ReplaceNodeResults(SDNode *N, } } -/// getFunctionAlignment - Return the Log2 alignment of 
this function. -unsigned XCoreTargetLowering:: -getFunctionAlignment(const Function *) const { - return 1; -} - //===----------------------------------------------------------------------===// // Misc Lower Operation implementation //===----------------------------------------------------------------------===// @@ -250,9 +244,6 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); SDValue GA = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), MVT::i32); - // If it's a debug information descriptor, don't mess with it. - if (DAG.isVerifiedDebugInfoDesc(Op)) - return GA; return getGlobalAddressWrapper(GA, GV, DAG); } @@ -906,8 +897,8 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); // The ABI dictates there should be one stack slot available to the callee // on function entry (for saving lr). @@ -967,7 +958,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, // Build a sequence of copy-to-reg nodes chained together with token // chain and flag operands which copy the outgoing args into registers. - // The InFlag in necessary since all emited instructions must be + // The InFlag is necessary since all emitted instructions must be // stuck together. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { @@ -1029,8 +1020,8 @@ XCoreTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_XCore); @@ -1089,8 +1080,8 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, CC_XCore); @@ -1194,12 +1185,12 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, //===----------------------------------------------------------------------===// bool XCoreTargetLowering:: -CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, +CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - RVLocs, Context); + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_XCore); } @@ -1215,10 +1206,10 @@ XCoreTargetLowering::LowerReturn(SDValue Chain, SmallVector<CCValAssign, 16> RVLocs; // CCState - Info about the registers and stack slot. - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); - // Analize return values.
+ // Analyze return values. CCInfo.AnalyzeReturn(Outs, RetCC_XCore); // If this is the first return lowered for this function, add diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h index bb3f2cc..a8d67d4 100644 --- a/lib/Target/XCore/XCoreISelLowering.h +++ b/lib/Target/XCore/XCoreISelLowering.h @@ -103,9 +103,6 @@ namespace llvm { virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty) const; - /// getFunctionAlignment - Return the Log2 alignment of this function. - virtual unsigned getFunctionAlignment(const Function *F) const; - private: const XCoreTargetMachine &TM; const XCoreSubtarget &Subtarget; @@ -194,7 +191,8 @@ namespace llvm { DebugLoc dl, SelectionDAG &DAG) const; virtual bool - CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, + CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &ArgsFlags, LLVMContext &Context) const; }; diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td index 789546e..55c7527 100644 --- a/lib/Target/XCore/XCoreInstrInfo.td +++ b/lib/Target/XCore/XCoreInstrInfo.td @@ -472,7 +472,16 @@ def REMU_l3r : FL3R<"remu", urem>; } def XOR_l3r : FL3R<"xor", xor>; defm ASHR : FL3R_L2RBITP<"ashr", sra>; -// TODO crc32, crc8, inpw, outpw + +let Constraints = "$src1 = $dst" in +def CRC_l3r : _FL3R<(outs GRRegs:$dst), + (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3), + "crc32 $dst, $src2, $src3", + [(set GRRegs:$dst, + (int_xcore_crc32 GRRegs:$src1, GRRegs:$src2, + GRRegs:$src3))]>; + +// TODO inpw, outpw let mayStore=1 in { def ST16_l3r : _FL3R<(outs), (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset), "st16 $val, $addr[$offset]", @@ -498,6 +507,12 @@ def MACCS_l4r : _L4R<(outs GRRegs:$dst1, GRRegs:$dst2), []>; } +let Constraints = "$src1 = $dst1" in +def CRC8_l4r : _L4R<(outs GRRegs:$dst1, GRRegs:$dst2), + (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3), + "crc8 $dst1, $dst2, $src2, $src3", + []>; + // Five operand long def LADD_l5r : _L5R<(outs GRRegs:$dst1, GRRegs:$dst2), diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp index 0287a51..46c9e57 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.cpp +++ b/lib/Target/XCore/XCoreRegisterInfo.cpp @@ -68,8 +68,8 @@ unsigned XCoreRegisterInfo::getNumArgRegs(const MachineFunction *MF) } bool XCoreRegisterInfo::needsFrameMoves(const MachineFunction &MF) { - return MF.getMMI().hasDebugInfo() || !MF.getFunction()->doesNotThrow() || - UnwindTablesMandatory; + return MF.getMMI().hasDebugInfo() || + MF.getFunction()->needsUnwindTableEntry(); } const unsigned* XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) @@ -315,6 +315,10 @@ int XCoreRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const { return XCoreGenRegisterInfo::getDwarfRegNumFull(RegNum, 0); } +int XCoreRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const { + return XCoreGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0); +} + unsigned XCoreRegisterInfo::getFrameRegister(const MachineFunction &MF) const { const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h index 770483b..7a9bc9f 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.h +++ b/lib/Target/XCore/XCoreRegisterInfo.h @@ -75,6 +75,7 @@ public: //! 
Get DWARF debugging register number int getDwarfRegNum(unsigned RegNum, bool isEH) const; + int getLLVMRegNum(unsigned RegNum, bool isEH) const; }; } // end namespace llvm diff --git a/lib/Target/XCore/XCoreRegisterInfo.td b/lib/Target/XCore/XCoreRegisterInfo.td index 765f717..c354230 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.td +++ b/lib/Target/XCore/XCoreRegisterInfo.td @@ -44,48 +44,13 @@ def LR : Ri<15, "lr">, DwarfRegNum<[15]>; // def GRRegs : RegisterClass<"XCore", [i32], 32, // Return values and arguments - [R0, R1, R2, R3, + (add R0, R1, R2, R3, // Not preserved across procedure calls R11, // Callee save - R4, R5, R6, R7, R8, R9, R10]> { - let MethodProtos = [{ - iterator allocation_order_begin(const MachineFunction &MF) const; - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GRRegsClass::iterator - GRRegsClass::allocation_order_begin(const MachineFunction &MF) const { - return begin(); - } - GRRegsClass::iterator - GRRegsClass::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - if (TFI->hasFP(MF)) - return end()-1; // don't allocate R10 - else - return end(); - } - }]; -} + R4, R5, R6, R7, R8, R9, R10)>; -def RRegs : RegisterClass<"XCore", [i32], 32, - // Reserved - [CP, DP, SP, LR]> { - let MethodProtos = [{ - iterator allocation_order_begin(const MachineFunction &MF) const; - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - RRegsClass::iterator - RRegsClass::allocation_order_begin(const MachineFunction &MF) const { - return begin(); - } - RRegsClass::iterator - RRegsClass::allocation_order_end(const MachineFunction &MF) const { - // No allocatable registers - return begin(); - } - }]; +// Reserved +def RRegs : RegisterClass<"XCore", [i32], 32, (add CP, DP, SP, LR)> { + let isAllocatable = 0; } diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp index 0c650cf..54a7f67 100644 --- a/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -771,8 +771,8 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, // function empty. NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList()); - // Loop over the argument list, transfering uses of the old arguments over to - // the new arguments, also transfering over the names as well. + // Loop over the argument list, transferring uses of the old arguments over to + // the new arguments, also transferring over the names as well. // for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(), I2 = NF->arg_begin(); I != E; ++I) { diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt index efdeec5..179b150 100644 --- a/lib/Transforms/IPO/CMakeLists.txt +++ b/lib/Transforms/IPO/CMakeLists.txt @@ -20,5 +20,4 @@ add_llvm_library(LLVMipo PruneEH.cpp StripDeadPrototypes.cpp StripSymbols.cpp - StructRetPromotion.cpp ) diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index 4d1f7ab..d4eaf0c 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -49,7 +49,7 @@ namespace { /// Struct that represents (part of) either a return value or a function /// argument. Used so that arguments and return values can be used - /// interchangably. + /// interchangeably. 
struct RetOrArg { RetOrArg(const Function *F, unsigned Idx, bool IsArg) : F(F), Idx(Idx), IsArg(IsArg) {} @@ -273,8 +273,8 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { // function empty. NF->getBasicBlockList().splice(NF->begin(), Fn.getBasicBlockList()); - // Loop over the argument list, transfering uses of the old arguments over to - // the new arguments, also transfering over the names as well. While we're at + // Loop over the argument list, transferring uses of the old arguments over to + // the new arguments, also transferring over the names as well. While we're at // it, remove the dead arguments from the DeadArguments list. // for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(), @@ -379,7 +379,7 @@ DAE::Liveness DAE::SurveyUse(Value::const_use_iterator U, // The value is returned from a function. It's only live when the // function's return value is live. We use RetValNum here, for the case // that U is really a use of an insertvalue instruction that uses the - // orginal Use. + // original Use. RetOrArg Use = CreateRet(RI->getParent()->getParent(), RetValNum); // We might be live, depending on the liveness of Use. return MarkIfNotLive(Use, MaybeLiveUses); @@ -894,8 +894,8 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { // function empty. NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList()); - // Loop over the argument list, transfering uses of the old arguments over to - // the new arguments, also transfering over the names as well. + // Loop over the argument list, transferring uses of the old arguments over to + // the new arguments, also transferring over the names as well. i = 0; for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(), I2 = NF->arg_begin(); I != E; ++I, ++i) diff --git a/lib/Transforms/IPO/DeadTypeElimination.cpp b/lib/Transforms/IPO/DeadTypeElimination.cpp index a509931..d3d4963 100644 --- a/lib/Transforms/IPO/DeadTypeElimination.cpp +++ b/lib/Transforms/IPO/DeadTypeElimination.cpp @@ -83,7 +83,8 @@ bool DTE::runOnModule(Module &M) { bool Changed = false; TypeSymbolTable &ST = M.getTypeSymbolTable(); - std::set<const Type *> UsedTypes = getAnalysis<FindUsedTypes>().getTypes(); + const SetVector<const Type*> &T = getAnalysis<FindUsedTypes>().getTypes(); + std::set<const Type*> UsedTypes(T.begin(), T.end()); // Check the symbol table for superfluous type entries... // diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp index 9d432de..d9911bf 100644 --- a/lib/Transforms/IPO/ExtractGV.cpp +++ b/lib/Transforms/IPO/ExtractGV.cpp @@ -51,20 +51,32 @@ namespace { // Visit the GlobalVariables. for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; ++I) { + if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration()) { + I->setInitializer(0); + } else { + if (I->hasAvailableExternallyLinkage()) + continue; + if (I->getName() == "llvm.global_ctors") + continue; + } + if (I->hasLocalLinkage()) I->setVisibility(GlobalValue::HiddenVisibility); I->setLinkage(GlobalValue::ExternalLinkage); - if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration()) - I->setInitializer(0); } // Visit the Functions. 
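The globals loop above and the functions loop below apply one shared rule: in "delete" mode the named values are gutted, in "extract" mode everything unlisted is, and declarations are never touched. A minimal standalone sketch of that rule (illustrative names, not LLVM's):

#include <cassert>

// Mirrors deleteStuff == (bool)Named.count(I) && !I->isDeclaration().
static bool dropDefinition(bool DeleteStuff, bool IsNamed, bool IsDecl) {
  return DeleteStuff == IsNamed && !IsDecl;
}

int main() {
  assert( dropDefinition(true,  true,  false)); // delete mode: named is gutted
  assert(!dropDefinition(true,  false, false)); // delete mode: others kept
  assert( dropDefinition(false, false, false)); // extract mode: unlisted gutted
  assert(!dropDefinition(false, true,  false)); // extract mode: named kept
  assert(!dropDefinition(true,  true,  true));  // declarations stay declarations
}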
for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration()) { + I->deleteBody(); + } else { + if (I->hasAvailableExternallyLinkage()) + continue; + } + if (I->hasLocalLinkage()) I->setVisibility(GlobalValue::HiddenVisibility); I->setLinkage(GlobalValue::ExternalLinkage); - if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration()) - I->deleteBody(); } return true; diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 5b6ed2c..cdf7b76 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -21,6 +21,7 @@ #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/Module.h" +#include "llvm/Operator.h" #include "llvm/Pass.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/MemoryBuiltins.h" @@ -240,15 +241,15 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS, GS.HasPHIUser = true; } else if (isa<CmpInst>(I)) { GS.isCompared = true; - } else if (isa<MemTransferInst>(I)) { - const MemTransferInst *MTI = cast<MemTransferInst>(I); + } else if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) { + if (MTI->isVolatile()) return true; if (MTI->getArgOperand(0) == V) GS.StoredType = GlobalStatus::isStored; if (MTI->getArgOperand(1) == V) GS.isLoaded = true; - } else if (isa<MemSetInst>(I)) { - assert(cast<MemSetInst>(I)->getArgOperand(0) == V && - "Memset only takes one pointer!"); + } else if (const MemSetInst *MSI = dyn_cast<MemSetInst>(I)) { + assert(MSI->getArgOperand(0) == V && "Memset only takes one pointer!"); + if (MSI->isVolatile()) return true; GS.StoredType = GlobalStatus::isStored; } else { return true; // Any other non-load instruction might take address! @@ -798,7 +799,8 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV) { // If we get here we could have other crazy uses that are transitively // loaded. assert((isa<PHINode>(GlobalUser) || isa<SelectInst>(GlobalUser) || - isa<ConstantExpr>(GlobalUser)) && "Only expect load and stores!"); + isa<ConstantExpr>(GlobalUser) || isa<CmpInst>(GlobalUser)) && + "Only expect load and stores!"); } } @@ -1588,8 +1590,7 @@ static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, GV->getInitializer()->isNullValue()) { if (Constant *SOVC = dyn_cast<Constant>(StoredOnceVal)) { if (GV->getInitializer()->getType() != SOVC->getType()) - SOVC = - ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType()); + SOVC = ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType()); // Optimize away any trapping uses of the loaded value. if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC)) @@ -1953,12 +1954,15 @@ GlobalVariable *GlobalOpt::FindGlobalCtors(Module &M) { // Verify that the initializer is simple enough for us to handle. We are // only allowed to optimize the initializer if it is unique. 
if (!GV->hasUniqueInitializer()) return 0; - + + if (isa<ConstantAggregateZero>(GV->getInitializer())) + return GV; ConstantArray *CA = cast<ConstantArray>(GV->getInitializer()); - + for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i) { + if (isa<ConstantAggregateZero>(*i)) + continue; ConstantStruct *CS = cast<ConstantStruct>(*i); - if (isa<ConstantPointerNull>(CS->getOperand(1))) continue; @@ -1978,6 +1982,8 @@ GlobalVariable *GlobalOpt::FindGlobalCtors(Module &M) { /// ParseGlobalCtors - Given a llvm.global_ctors list that we can understand, /// return a list of the functions and null terminator as a vector. static std::vector<Function*> ParseGlobalCtors(GlobalVariable *GV) { + if (GV->getInitializer()->isNullValue()) + return std::vector<Function*>(); ConstantArray *CA = cast<ConstantArray>(GV->getInitializer()); std::vector<Function*> Result; Result.reserve(CA->getNumOperands()); @@ -2008,7 +2014,7 @@ static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL, const PointerType *PFTy = PointerType::getUnqual(FTy); CSVals[1] = Constant::getNullValue(PFTy); CSVals[0] = ConstantInt::get(Type::getInt32Ty(GCL->getContext()), - 2147483647); + 0x7fffffff); } CAList.push_back(ConstantStruct::get(GCL->getContext(), CSVals, false)); } @@ -2432,6 +2438,20 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal, // Cannot handle inline asm. if (isa<InlineAsm>(CI->getCalledValue())) return false; + if (MemSetInst *MSI = dyn_cast<MemSetInst>(CI)) { + if (MSI->isVolatile()) return false; + Constant *Ptr = getVal(Values, MSI->getDest()); + Constant *Val = getVal(Values, MSI->getValue()); + Constant *DestVal = ComputeLoadResult(getVal(Values, Ptr), + MutatedMemory); + if (Val->isNullValue() && DestVal && DestVal->isNullValue()) { + // This memset is a no-op. + ++CurInst; + continue; + } + return false; + } + // Resolve function pointers. 
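Backing up to the memset handling added in EvaluateFunction above: a memset may be skipped only when it writes zero over memory the evaluator already knows to be zero. A toy model of that test (hedged sketch; the map stands in for MutatedMemory):

#include <cstdio>
#include <map>

// True when the simulated memset is a no-op: it stores 0 and the known
// contents of the destination are already 0.
static bool memsetIsNoOp(const std::map<int, int> &KnownMem, int Dest, int Val) {
  std::map<int, int>::const_iterator It = KnownMem.find(Dest);
  return Val == 0 && It != KnownMem.end() && It->second == 0;
}

int main() {
  std::map<int, int> KnownMem;
  KnownMem[0x10] = 0;                                      // already zeroed
  KnownMem[0x20] = 7;                                      // live data
  std::printf("%d %d\n", memsetIsNoOp(KnownMem, 0x10, 0),  // 1: droppable
                         memsetIsNoOp(KnownMem, 0x20, 0)); // 0: must keep
}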
Function *Callee = dyn_cast<Function>(getVal(Values, CI->getCalledValue())); diff --git a/lib/Transforms/IPO/IPO.cpp b/lib/Transforms/IPO/IPO.cpp index fbe90ce..21dcb51 100644 --- a/lib/Transforms/IPO/IPO.cpp +++ b/lib/Transforms/IPO/IPO.cpp @@ -45,7 +45,6 @@ void llvm::initializeIPO(PassRegistry &Registry) { initializeStripDebugDeclarePass(Registry); initializeStripDeadDebugInfoPass(Registry); initializeStripNonDebugSymbolsPass(Registry); - initializeSRETPromotionPass(Registry); } void LLVMInitializeIPO(LLVMPassRegistryRef R) { diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index 37eafd7..57f3e77 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -29,7 +29,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" -#include <set> using namespace llvm; STATISTIC(NumInlined, "Number of functions inlined"); diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp index a38d2c2..f741443 100644 --- a/lib/Transforms/IPO/MergeFunctions.cpp +++ b/lib/Transforms/IPO/MergeFunctions.cpp @@ -55,6 +55,7 @@ #include "llvm/Instructions.h" #include "llvm/LLVMContext.h" #include "llvm/Module.h" +#include "llvm/Operator.h" #include "llvm/Pass.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp index d91c2c4..2f3baeb 100644 --- a/lib/Transforms/IPO/PruneEH.cpp +++ b/lib/Transforms/IPO/PruneEH.cpp @@ -27,7 +27,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Support/CFG.h" -#include <set> #include <algorithm> using namespace llvm; @@ -181,6 +180,7 @@ bool PruneEH::SimplifyFunction(Function *F) { Call->takeName(II); Call->setCallingConv(II->getCallingConv()); Call->setAttributes(II->getAttributes()); + Call->setDebugLoc(II->getDebugLoc()); // Anything that used the value produced by the invoke instruction // now uses the value produced by the call instruction. Note that we @@ -239,7 +239,7 @@ void PruneEH::DeleteBasicBlock(BasicBlock *BB) { for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; ) { --I; if (CallInst *CI = dyn_cast<CallInst>(I)) { - if (!isa<DbgInfoIntrinsic>(I)) + if (!isa<IntrinsicInst>(I)) CGN->removeCallEdgeFor(CI); } else if (InvokeInst *II = dyn_cast<InvokeInst>(I)) CGN->removeCallEdgeFor(II); diff --git a/lib/Transforms/IPO/StructRetPromotion.cpp b/lib/Transforms/IPO/StructRetPromotion.cpp deleted file mode 100644 index 584deac..0000000 --- a/lib/Transforms/IPO/StructRetPromotion.cpp +++ /dev/null @@ -1,357 +0,0 @@ -//===-- StructRetPromotion.cpp - Promote sret arguments -------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass finds functions that return a struct (using a pointer to the struct -// as the first argument of the function, marked with the 'sret' attribute) and -// replaces them with a new function that simply returns each of the elements of -// that struct (using multiple return values). -// -// This pass works under a number of conditions: -// 1. The returned struct must not contain other structs -// 2. The returned struct must only be used to load values from -// 3. 
The placeholder struct passed in is the result of an alloca -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "sretpromotion" -#include "llvm/Transforms/IPO.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/LLVMContext.h" -#include "llvm/Module.h" -#include "llvm/CallGraphSCCPass.h" -#include "llvm/Instructions.h" -#include "llvm/Analysis/CallGraph.h" -#include "llvm/Support/CallSite.h" -#include "llvm/Support/CFG.h" -#include "llvm/Support/Debug.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -STATISTIC(NumRejectedSRETUses , "Number of sret rejected due to unexpected uses"); -STATISTIC(NumSRET , "Number of sret promoted"); -namespace { - /// SRETPromotion - This pass removes sret parameter and updates - /// function to use multiple return value. - /// - struct SRETPromotion : public CallGraphSCCPass { - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - CallGraphSCCPass::getAnalysisUsage(AU); - } - - virtual bool runOnSCC(CallGraphSCC &SCC); - static char ID; // Pass identification, replacement for typeid - SRETPromotion() : CallGraphSCCPass(ID) { - initializeSRETPromotionPass(*PassRegistry::getPassRegistry()); - } - - private: - CallGraphNode *PromoteReturn(CallGraphNode *CGN); - bool isSafeToUpdateAllCallers(Function *F); - Function *cloneFunctionBody(Function *F, const StructType *STy); - CallGraphNode *updateCallSites(Function *F, Function *NF); - }; -} - -char SRETPromotion::ID = 0; -INITIALIZE_PASS_BEGIN(SRETPromotion, "sretpromotion", - "Promote sret arguments to multiple ret values", false, false) -INITIALIZE_AG_DEPENDENCY(CallGraph) -INITIALIZE_PASS_END(SRETPromotion, "sretpromotion", - "Promote sret arguments to multiple ret values", false, false) - -Pass *llvm::createStructRetPromotionPass() { - return new SRETPromotion(); -} - -bool SRETPromotion::runOnSCC(CallGraphSCC &SCC) { - bool Changed = false; - - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) - if (CallGraphNode *NewNode = PromoteReturn(*I)) { - SCC.ReplaceNode(*I, NewNode); - Changed = true; - } - - return Changed; -} - -/// PromoteReturn - This method promotes function that uses StructRet paramater -/// into a function that uses multiple return values. -CallGraphNode *SRETPromotion::PromoteReturn(CallGraphNode *CGN) { - Function *F = CGN->getFunction(); - - if (!F || F->isDeclaration() || !F->hasLocalLinkage()) - return 0; - - // Make sure that function returns struct. - if (F->arg_size() == 0 || !F->hasStructRetAttr() || F->doesNotReturn()) - return 0; - - DEBUG(dbgs() << "SretPromotion: Looking at sret function " - << F->getName() << "\n"); - - assert(F->getReturnType()->isVoidTy() && "Invalid function return type"); - Function::arg_iterator AI = F->arg_begin(); - const llvm::PointerType *FArgType = dyn_cast<PointerType>(AI->getType()); - assert(FArgType && "Invalid sret parameter type"); - const llvm::StructType *STy = - dyn_cast<StructType>(FArgType->getElementType()); - assert(STy && "Invalid sret parameter element type"); - - // Check if it is ok to perform this promotion. 
- if (isSafeToUpdateAllCallers(F) == false) { - DEBUG(dbgs() << "SretPromotion: Not all callers can be updated\n"); - ++NumRejectedSRETUses; - return 0; - } - - DEBUG(dbgs() << "SretPromotion: sret argument will be promoted\n"); - ++NumSRET; - // [1] Replace use of sret parameter - AllocaInst *TheAlloca = new AllocaInst(STy, NULL, "mrv", - F->getEntryBlock().begin()); - Value *NFirstArg = F->arg_begin(); - NFirstArg->replaceAllUsesWith(TheAlloca); - - // [2] Find and replace ret instructions - for (Function::iterator FI = F->begin(), FE = F->end(); FI != FE; ++FI) - for(BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ) { - Instruction *I = BI; - ++BI; - if (isa<ReturnInst>(I)) { - Value *NV = new LoadInst(TheAlloca, "mrv.ld", I); - ReturnInst *NR = ReturnInst::Create(F->getContext(), NV, I); - I->replaceAllUsesWith(NR); - I->eraseFromParent(); - } - } - - // [3] Create the new function body and insert it into the module. - Function *NF = cloneFunctionBody(F, STy); - - // [4] Update all call sites to use new function - CallGraphNode *NF_CFN = updateCallSites(F, NF); - - CallGraph &CG = getAnalysis<CallGraph>(); - NF_CFN->stealCalledFunctionsFrom(CG[F]); - - delete CG.removeFunctionFromModule(F); - return NF_CFN; -} - -// Check if it is ok to perform this promotion. -bool SRETPromotion::isSafeToUpdateAllCallers(Function *F) { - - if (F->use_empty()) - // No users. OK to modify signature. - return true; - - for (Value::use_iterator FnUseI = F->use_begin(), FnUseE = F->use_end(); - FnUseI != FnUseE; ++FnUseI) { - // The function is passed in as an argument to (possibly) another function, - // we can't change it! - CallSite CS(*FnUseI); - Instruction *Call = CS.getInstruction(); - // The function is used by something else than a call or invoke instruction, - // we can't change it! - if (!Call || !CS.isCallee(FnUseI)) - return false; - CallSite::arg_iterator AI = CS.arg_begin(); - Value *FirstArg = *AI; - - if (!isa<AllocaInst>(FirstArg)) - return false; - - // Check FirstArg's users. - for (Value::use_iterator ArgI = FirstArg->use_begin(), - ArgE = FirstArg->use_end(); ArgI != ArgE; ++ArgI) { - User *U = *ArgI; - // If FirstArg user is a CallInst that does not correspond to current - // call site then this function F is not suitable for sret promotion. - if (CallInst *CI = dyn_cast<CallInst>(U)) { - if (CI != Call) - return false; - } - // If FirstArg user is a GEP whose all users are not LoadInst then - // this function F is not suitable for sret promotion. - else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) { - // TODO : Use dom info and insert PHINodes to collect get results - // from multiple call sites for this GEP. - if (GEP->getParent() != Call->getParent()) - return false; - for (Value::use_iterator GEPI = GEP->use_begin(), GEPE = GEP->use_end(); - GEPI != GEPE; ++GEPI) - if (!isa<LoadInst>(*GEPI)) - return false; - } - // Any other FirstArg users make this function unsuitable for sret - // promotion. - else - return false; - } - } - - return true; -} - -/// cloneFunctionBody - Create a new function based on F and -/// insert it into module. Remove first argument. Use STy as -/// the return type for new function. -Function *SRETPromotion::cloneFunctionBody(Function *F, - const StructType *STy) { - - const FunctionType *FTy = F->getFunctionType(); - std::vector<const Type*> Params; - - // Attributes - Keep track of the parameter attributes for the arguments. 
- SmallVector<AttributeWithIndex, 8> AttributesVec; - const AttrListPtr &PAL = F->getAttributes(); - - // Add any return attributes. - if (Attributes attrs = PAL.getRetAttributes()) - AttributesVec.push_back(AttributeWithIndex::get(0, attrs)); - - // Skip first argument. - Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); - ++I; - // 0th parameter attribute is reserved for return type. - // 1th parameter attribute is for first 1st sret argument. - unsigned ParamIndex = 2; - while (I != E) { - Params.push_back(I->getType()); - if (Attributes Attrs = PAL.getParamAttributes(ParamIndex)) - AttributesVec.push_back(AttributeWithIndex::get(ParamIndex - 1, Attrs)); - ++I; - ++ParamIndex; - } - - // Add any fn attributes. - if (Attributes attrs = PAL.getFnAttributes()) - AttributesVec.push_back(AttributeWithIndex::get(~0, attrs)); - - - FunctionType *NFTy = FunctionType::get(STy, Params, FTy->isVarArg()); - Function *NF = Function::Create(NFTy, F->getLinkage()); - NF->takeName(F); - NF->copyAttributesFrom(F); - NF->setAttributes(AttrListPtr::get(AttributesVec.begin(), AttributesVec.end())); - F->getParent()->getFunctionList().insert(F, NF); - NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList()); - - // Replace arguments - I = F->arg_begin(); - E = F->arg_end(); - Function::arg_iterator NI = NF->arg_begin(); - ++I; - while (I != E) { - I->replaceAllUsesWith(NI); - NI->takeName(I); - ++I; - ++NI; - } - - return NF; -} - -/// updateCallSites - Update all sites that call F to use NF. -CallGraphNode *SRETPromotion::updateCallSites(Function *F, Function *NF) { - CallGraph &CG = getAnalysis<CallGraph>(); - SmallVector<Value*, 16> Args; - - // Attributes - Keep track of the parameter attributes for the arguments. - SmallVector<AttributeWithIndex, 8> ArgAttrsVec; - - // Get a new callgraph node for NF. - CallGraphNode *NF_CGN = CG.getOrInsertFunction(NF); - - while (!F->use_empty()) { - CallSite CS(*F->use_begin()); - Instruction *Call = CS.getInstruction(); - - const AttrListPtr &PAL = F->getAttributes(); - // Add any return attributes. - if (Attributes attrs = PAL.getRetAttributes()) - ArgAttrsVec.push_back(AttributeWithIndex::get(0, attrs)); - - // Copy arguments, however skip first one. - CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end(); - Value *FirstCArg = *AI; - ++AI; - // 0th parameter attribute is reserved for return type. - // 1th parameter attribute is for first 1st sret argument. - unsigned ParamIndex = 2; - while (AI != AE) { - Args.push_back(*AI); - if (Attributes Attrs = PAL.getParamAttributes(ParamIndex)) - ArgAttrsVec.push_back(AttributeWithIndex::get(ParamIndex - 1, Attrs)); - ++ParamIndex; - ++AI; - } - - // Add any function attributes. - if (Attributes attrs = PAL.getFnAttributes()) - ArgAttrsVec.push_back(AttributeWithIndex::get(~0, attrs)); - - AttrListPtr NewPAL = AttrListPtr::get(ArgAttrsVec.begin(), ArgAttrsVec.end()); - - // Build new call instruction. 
- Instruction *New; - if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { - New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), - Args.begin(), Args.end(), "", Call); - cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv()); - cast<InvokeInst>(New)->setAttributes(NewPAL); - } else { - New = CallInst::Create(NF, Args.begin(), Args.end(), "", Call); - cast<CallInst>(New)->setCallingConv(CS.getCallingConv()); - cast<CallInst>(New)->setAttributes(NewPAL); - if (cast<CallInst>(Call)->isTailCall()) - cast<CallInst>(New)->setTailCall(); - } - Args.clear(); - ArgAttrsVec.clear(); - New->takeName(Call); - - // Update the callgraph to know that the callsite has been transformed. - CallGraphNode *CalleeNode = CG[Call->getParent()->getParent()]; - CalleeNode->removeCallEdgeFor(Call); - CalleeNode->addCalledFunction(New, NF_CGN); - - // Update all users of sret parameter to extract value using extractvalue. - for (Value::use_iterator UI = FirstCArg->use_begin(), - UE = FirstCArg->use_end(); UI != UE; ) { - User *U2 = *UI++; - CallInst *C2 = dyn_cast<CallInst>(U2); - if (C2 && (C2 == Call)) - continue; - - GetElementPtrInst *UGEP = cast<GetElementPtrInst>(U2); - ConstantInt *Idx = cast<ConstantInt>(UGEP->getOperand(2)); - Value *GR = ExtractValueInst::Create(New, Idx->getZExtValue(), - "evi", UGEP); - while(!UGEP->use_empty()) { - // isSafeToUpdateAllCallers has checked that all GEP uses are - // LoadInsts - LoadInst *L = cast<LoadInst>(*UGEP->use_begin()); - L->replaceAllUsesWith(GR); - L->eraseFromParent(); - } - UGEP->eraseFromParent(); - continue; - } - Call->eraseFromParent(); - } - - return NF_CGN; -} - diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h index 625f546..8257d6b 100644 --- a/lib/Transforms/InstCombine/InstCombine.h +++ b/lib/Transforms/InstCombine/InstCombine.h @@ -11,6 +11,7 @@ #define INSTCOMBINE_INSTCOMBINE_H #include "InstCombineWorklist.h" +#include "llvm/Operator.h" #include "llvm/Pass.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Support/IRBuilder.h" @@ -69,7 +70,6 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner : public FunctionPass, public InstVisitor<InstCombiner, Instruction*> { TargetData *TD; - bool MustPreserveLCSSA; bool MadeIRChange; public: /// Worklist - All of the instructions that need to be simplified. @@ -233,7 +233,15 @@ public: Worklist.Add(New); return New; } - + + // InsertNewInstWith - same as InsertNewInstBefore, but also sets the + // debug loc. + // + Instruction *InsertNewInstWith(Instruction *New, Instruction &Old) { + New->setDebugLoc(Old.getDebugLoc()); + return InsertNewInstBefore(New, Old); + } + // ReplaceInstUsesWith - This method is to be used when an instruction is // found to be dead, replacable with another preexisting expression. Here // we add all uses of I to the worklist, replace all uses of I with the new diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 1cb18e1..a08446e 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -331,7 +331,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, /// InsertRangeTest - Emit a computation of: (V >= Lo && V < Hi) if Inside is -/// true, otherwise (V < Lo || V >= Hi). In pratice, we emit the more efficient +/// true, otherwise (V < Lo || V >= Hi). In practice, we emit the more efficient /// (V-Lo) <u Hi-Lo. This method expects that Lo <= Hi. 
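A quick numeric check of that identity, assuming unsigned wraparound (a hedged sketch, separate from the InstCombine code):

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  const uint32_t Lo = 10, Hi = 20;
  for (uint32_t V : {0u, 9u, 10u, 15u, 19u, 20u, 4000000000u}) {
    bool Naive  = V >= Lo && V < Hi;
    bool Folded = V - Lo < Hi - Lo; // wraparound pushes out-of-range V high
    assert(Naive == Folded);
  }
}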
isSigned indicates /// whether to treat the V, Lo and HI as signed or not. IB is the location to /// insert new instructions. @@ -769,6 +769,42 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { return Builder->CreateICmp(LHSCC, NewOr, LHSCst); } } + + // (trunc x) == C1 & (and x, CA) == C2 -> (and x, CA|CMAX) == C1|C2 + // where CMAX is the all ones value for the truncated type, + // iff the lower bits of C2 and CA are zero. + if (LHSCC == RHSCC && ICmpInst::isEquality(LHSCC) && + LHS->hasOneUse() && RHS->hasOneUse()) { + Value *V; + ConstantInt *AndCst, *SmallCst = 0, *BigCst = 0; + + // (trunc x) == C1 & (and x, CA) == C2 + if (match(Val2, m_Trunc(m_Value(V))) && + match(Val, m_And(m_Specific(V), m_ConstantInt(AndCst)))) { + SmallCst = RHSCst; + BigCst = LHSCst; + } + // (and x, CA) == C2 & (trunc x) == C1 + else if (match(Val, m_Trunc(m_Value(V))) && + match(Val2, m_And(m_Specific(V), m_ConstantInt(AndCst)))) { + SmallCst = LHSCst; + BigCst = RHSCst; + } + + if (SmallCst && BigCst) { + unsigned BigBitSize = BigCst->getType()->getBitWidth(); + unsigned SmallBitSize = SmallCst->getType()->getBitWidth(); + + // Check that the low bits are zero. + APInt Low = APInt::getLowBitsSet(BigBitSize, SmallBitSize); + if ((Low & AndCst->getValue()) == 0 && (Low & BigCst->getValue()) == 0) { + Value *NewAnd = Builder->CreateAnd(V, Low | AndCst->getValue()); + APInt N = SmallCst->getValue().zext(BigBitSize) | BigCst->getValue(); + Value *NewVal = ConstantInt::get(AndCst->getType()->getContext(), N); + return Builder->CreateICmp(LHSCC, NewAnd, NewVal); + } + } + } // From here on, we only handle: // (icmp1 A, C1) & (icmp2 A, C2) --> something simpler. @@ -2003,7 +2039,14 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { } } } - + + // or(sext(A), B) -> A ? -1 : B where A is an i1 + // or(A, sext(B)) -> B ? -1 : A where B is an i1 + if (match(Op0, m_SExt(m_Value(A))) && A->getType()->isIntegerTy(1)) + return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op1); + if (match(Op1, m_SExt(m_Value(A))) && A->getType()->isIntegerTy(1)) + return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op0); + // Note: If we've gotten to the point of visiting the outer OR, then the // inner one couldn't be simplified. If it was a constant, then it won't // be simplified by a later pass either, so we try swapping the inner/outer diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 875e9ca..ef67701 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -111,10 +111,10 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { Value *Src = Builder->CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy); Value *Dest = Builder->CreateBitCast(MI->getArgOperand(0), NewDstPtrTy); - Instruction *L = new LoadInst(Src, "tmp", MI->isVolatile(), SrcAlign); - InsertNewInstBefore(L, *MI); - InsertNewInstBefore(new StoreInst(L, Dest, MI->isVolatile(), DstAlign), - *MI); + LoadInst *L = Builder->CreateLoad(Src, MI->isVolatile()); + L->setAlignment(SrcAlign); + StoreInst *S = Builder->CreateStore(L, Dest, MI->isVolatile()); + S->setAlignment(DstAlign); // Set the size of the copy to 0, it will be deleted on the next iteration. MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType())); @@ -154,8 +154,9 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { // Extract the fill value and store. 
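Stepping back to the FoldAndOfICmps change above: a concrete instance of the (trunc x) == C1 & (and x, CA) == C2 merge, with made-up constants that satisfy its low-bits-zero precondition (hedged sketch):

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  const uint8_t  C1   = 0x5A;        // compared against trunc(x) to i8
  const uint32_t CA   = 0xFFFF0000u; // and-mask; low 8 bits are zero
  const uint32_t C2   = 0x12340000u; // and-result; low 8 bits are zero
  const uint32_t CMAX = 0xFFu;       // all-ones value of the truncated type
  for (uint32_t X : {0x1234005Au, 0x1234FF5Au, 0x1234005Bu, 0x5678005Au, 0u}) {
    bool Original = (uint8_t)X == C1 && (X & CA) == C2;
    bool Merged   = (X & (CA | CMAX)) == (C2 | C1);
    assert(Original == Merged);
  }
}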
uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL; - InsertNewInstBefore(new StoreInst(ConstantInt::get(ITy, Fill), - Dest, false, Alignment), *MI); + StoreInst *S = Builder->CreateStore(ConstantInt::get(ITy, Fill), Dest, + MI->isVolatile()); + S->setAlignment(Alignment); // Set the size of the copy to 0, it will be deleted on the next iteration. MI->setLength(Constant::getNullValue(LenC->getType())); @@ -405,20 +406,21 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (LHSKnownNegative && RHSKnownNegative) { // The sign bit is set in both cases: this MUST overflow. // Create a simple add instruction, and insert it into the struct. - Instruction *Add = BinaryOperator::CreateAdd(LHS, RHS, "", &CI); - Worklist.Add(Add); + Value *Add = Builder->CreateAdd(LHS, RHS); + Add->takeName(&CI); Constant *V[] = { - UndefValue::get(LHS->getType()),ConstantInt::getTrue(II->getContext()) + UndefValue::get(LHS->getType()), + ConstantInt::getTrue(II->getContext()) }; Constant *Struct = ConstantStruct::get(II->getContext(), V, 2, false); return InsertValueInst::Create(Struct, Add, 0); } - + if (LHSKnownPositive && RHSKnownPositive) { // The sign bit is clear in both cases: this CANNOT overflow. // Create a simple add instruction, and insert it into the struct. - Instruction *Add = BinaryOperator::CreateNUWAdd(LHS, RHS, "", &CI); - Worklist.Add(Add); + Value *Add = Builder->CreateNUWAdd(LHS, RHS); + Add->takeName(&CI); Constant *V[] = { UndefValue::get(LHS->getType()), ConstantInt::getFalse(II->getContext()) @@ -537,11 +539,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: - case Intrinsic::x86_sse_loadu_ps: - case Intrinsic::x86_sse2_loadu_pd: - case Intrinsic::x86_sse2_loadu_dq: - // Turn PPC lvx -> load if the pointer is known aligned. - // Turn X86 loadups -> load if the pointer is known aligned. + // Turn PPC lvx -> load if the pointer is known aligned. if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, TD) >= 16) { Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), PointerType::getUnqual(II->getType())); @@ -592,6 +590,28 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + + case Intrinsic::x86_sse41_pmovsxbw: + case Intrinsic::x86_sse41_pmovsxwd: + case Intrinsic::x86_sse41_pmovsxdq: + case Intrinsic::x86_sse41_pmovzxbw: + case Intrinsic::x86_sse41_pmovzxwd: + case Intrinsic::x86_sse41_pmovzxdq: { + // pmov{s|z}x ignores the upper half of their input vectors. + unsigned VWidth = + cast<VectorType>(II->getArgOperand(0)->getType())->getNumElements(); + unsigned LowHalfElts = VWidth / 2; + APInt InputDemandedElts(APInt::getBitsSet(VWidth, 0, LowHalfElts)); + APInt UndefElts(VWidth, 0); + if (Value *TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), + InputDemandedElts, + UndefElts)) { + II->setArgOperand(0, TmpV); + return II; + } + break; + } + case Intrinsic::ppc_altivec_vperm: // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. if (ConstantVector *Mask = dyn_cast<ConstantVector>(II->getArgOperand(2))) { @@ -817,7 +837,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { // If OldCall dues not return void then replaceAllUsesWith undef. // This allows ValueHandlers and custom metadata to adjust itself. 
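The SimplifyMemSet hunk above splats the fill byte across a wide integer by multiplying with 0x0101010101010101; a quick sanity check of that trick:

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  for (uint64_t Byte : {0x00ull, 0x01ull, 0xABull, 0xFFull}) {
    uint64_t Fill = Byte * 0x0101010101010101ULL; // copies the byte to all lanes
    for (int I = 0; I < 8; ++I)
      assert(((Fill >> (8 * I)) & 0xFF) == Byte);
  }
}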
if (!OldCall->getType()->isVoidTy()) - OldCall->replaceAllUsesWith(UndefValue::get(OldCall->getType())); + ReplaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType())); if (isa<CallInst>(OldCall)) return EraseInstFromFunction(*OldCall); @@ -839,8 +859,8 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { // If CS does not return void then replaceAllUsesWith undef. // This allows ValueHandlers and custom metadata to adjust itself. if (!CS.getInstruction()->getType()->isVoidTy()) - CS.getInstruction()-> - replaceAllUsesWith(UndefValue::get(CS.getInstruction()->getType())); + ReplaceInstUsesWith(*CS.getInstruction(), + UndefValue::get(CS.getInstruction()->getType())); if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) { // Don't break the CFG, insert a dummy cond branch. @@ -1088,15 +1108,15 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { Instruction *NC; if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { - NC = InvokeInst::Create(Callee, II->getNormalDest(), II->getUnwindDest(), - Args.begin(), Args.end(), - Caller->getName(), Caller); + NC = Builder->CreateInvoke(Callee, II->getNormalDest(), + II->getUnwindDest(), Args.begin(), Args.end()); + NC->takeName(II); cast<InvokeInst>(NC)->setCallingConv(II->getCallingConv()); cast<InvokeInst>(NC)->setAttributes(NewCallerPAL); } else { - NC = CallInst::Create(Callee, Args.begin(), Args.end(), - Caller->getName(), Caller); CallInst *CI = cast<CallInst>(Caller); + NC = Builder->CreateCall(Callee, Args.begin(), Args.end()); + NC->takeName(CI); if (CI->isTailCall()) cast<CallInst>(NC)->setTailCall(); cast<CallInst>(NC)->setCallingConv(CI->getCallingConv()); @@ -1110,6 +1130,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { Instruction::CastOps opcode = CastInst::getCastOpcode(NC, false, OldRetTy, false); NV = NC = CastInst::Create(opcode, NC, OldRetTy, "tmp"); + NC->setDebugLoc(Caller->getDebugLoc()); // If this is an invoke instruction, we should insert it after the first // non-phi, instruction in the normal successor block. @@ -1127,8 +1148,8 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { } if (!Caller->use_empty()) - Caller->replaceAllUsesWith(NV); - + ReplaceInstUsesWith(*Caller, NV); + EraseInstFromFunction(*Caller); return true; } @@ -1193,7 +1214,7 @@ Instruction *InstCombiner::transformCallThroughTrampoline(CallSite CS) { // Add the chain argument and attributes. 
Value *NestVal = Tramp->getArgOperand(2); if (NestVal->getType() != NestTy) - NestVal = new BitCastInst(NestVal, NestTy, "nest", Caller); + NestVal = Builder->CreateBitCast(NestVal, NestTy, "nest"); NewArgs.push_back(NestVal); NewAttrs.push_back(AttributeWithIndex::get(NestIdx, NestAttr)); } @@ -1259,24 +1280,19 @@ Instruction *InstCombiner::transformCallThroughTrampoline(CallSite CS) { if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { NewCaller = InvokeInst::Create(NewCallee, II->getNormalDest(), II->getUnwindDest(), - NewArgs.begin(), NewArgs.end(), - Caller->getName(), Caller); + NewArgs.begin(), NewArgs.end()); cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv()); cast<InvokeInst>(NewCaller)->setAttributes(NewPAL); } else { - NewCaller = CallInst::Create(NewCallee, NewArgs.begin(), NewArgs.end(), - Caller->getName(), Caller); + NewCaller = CallInst::Create(NewCallee, NewArgs.begin(), NewArgs.end()); if (cast<CallInst>(Caller)->isTailCall()) cast<CallInst>(NewCaller)->setTailCall(); cast<CallInst>(NewCaller)-> setCallingConv(cast<CallInst>(Caller)->getCallingConv()); cast<CallInst>(NewCaller)->setAttributes(NewPAL); } - if (!Caller->getType()->isVoidTy()) - Caller->replaceAllUsesWith(NewCaller); - Caller->eraseFromParent(); - Worklist.Remove(Caller); - return 0; + + return NewCaller; } } diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 6f70de8..601d9b4 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -71,6 +71,11 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, // This requires TargetData to get the alloca alignment and size information. if (!TD) return 0; + // Insist that the amount-to-allocate not overflow. + OverflowingBinaryOperator *OBI = + dyn_cast<OverflowingBinaryOperator>(AI.getOperand(0)); + if (OBI && !(OBI->hasNoSignedWrap() || OBI->hasNoUnsignedWrap())) return 0; + const PointerType *PTy = cast<PointerType>(CI.getType()); BuilderTy AllocaBuilder(*Builder); @@ -133,7 +138,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, // New is the allocation instruction, pointer typed. AI is the original // allocation instruction, also pointer typed. Thus, cast to use is BitCast. Value *NewCast = AllocaBuilder.CreateBitCast(New, AI.getType(), "tmpcast"); - AI.replaceAllUsesWith(NewCast); + ReplaceInstUsesWith(AI, NewCast); } return ReplaceInstUsesWith(CI, New); } @@ -211,7 +216,7 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, const Type *Ty, } Res->takeName(I); - return InsertNewInstBefore(Res, *I); + return InsertNewInstWith(Res, *I); } @@ -1228,7 +1233,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { // Remove the old Call. With -fmath-errno, it won't get marked readnone. - Call->replaceAllUsesWith(UndefValue::get(Call->getType())); + ReplaceInstUsesWith(*Call, UndefValue::get(Call->getType())); EraseInstFromFunction(*Call); return ret; } @@ -1684,8 +1689,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { // If we found a path from the src to dest, create the getelementptr now. 
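The bitcast-to-getelementptr rewrite this comment introduces rests on a simple address identity: the first scalar element of a nested aggregate lives at the aggregate's own address. In C++ terms (a hedged analogue, not the transform itself):

#include <cassert>

int main() {
  int Arr[4][8] = {};
  // bitcast [4 x [8 x i32]]* to i32*  ~  getelementptr ..., i32 0, i32 0, i32 0
  assert((void *)Arr == (void *)&Arr[0][0]);
}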
if (SrcElTy == DstElTy) { SmallVector<Value*, 8> Idxs(NumZeros+1, ZeroUInt); - return GetElementPtrInst::CreateInBounds(Src, Idxs.begin(), Idxs.end(),"", - ((Instruction*)NULL)); + return GetElementPtrInst::CreateInBounds(Src, Idxs.begin(), Idxs.end()); } } diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 8afd2f8..42db444 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -469,8 +469,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, /// /// If we can't emit an optimized form for this expression, this returns null. /// -static Value *EvaluateGEPOffsetExpression(User *GEP, Instruction &I, - InstCombiner &IC) { +static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) { TargetData &TD = *IC.getTargetData(); gep_type_iterator GTI = gep_type_begin(GEP); @@ -533,10 +532,10 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, Instruction &I, // Cast to intptrty in case a truncation occurs. If an extension is needed, // we don't need to bother extending: the extension won't affect where the // computation crosses zero. - if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth) - VariableIdx = new TruncInst(VariableIdx, - TD.getIntPtrType(VariableIdx->getContext()), - VariableIdx->getName(), &I); + if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth) { + const Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext()); + VariableIdx = IC.Builder->CreateTrunc(VariableIdx, IntPtrTy); + } return VariableIdx; } @@ -558,11 +557,10 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, Instruction &I, // Okay, we can do this evaluation. Start by converting the index to intptr. const Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext()); if (VariableIdx->getType() != IntPtrTy) - VariableIdx = CastInst::CreateIntegerCast(VariableIdx, IntPtrTy, - true /*SExt*/, - VariableIdx->getName(), &I); + VariableIdx = IC.Builder->CreateIntCast(VariableIdx, IntPtrTy, + true /*Signed*/); Constant *OffsetVal = ConstantInt::get(IntPtrTy, NewOffs); - return BinaryOperator::CreateAdd(VariableIdx, OffsetVal, "offset", &I); + return IC.Builder->CreateAdd(VariableIdx, OffsetVal, "offset"); } /// FoldGEPICmp - Fold comparisons between a GEP instruction and something @@ -580,7 +578,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, // This transformation (ignoring the base and scales) is valid because we // know pointers can't overflow since the gep is inbounds. See if we can // output an optimized form. - Value *Offset = EvaluateGEPOffsetExpression(GEPLHS, I, *this); + Value *Offset = EvaluateGEPOffsetExpression(GEPLHS, *this); // If not, synthesize the offset the hard way. if (Offset == 0) @@ -634,6 +632,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, if (AllZeros) return FoldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I); + bool GEPsInBounds = GEPLHS->isInBounds() && GEPRHS->isInBounds(); if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands()) { // If the GEPs only differ by one index, compare it. unsigned NumDifferences = 0; // Keep track of # differences. 
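The single-differing-index comparison that the next hunk guards with GEPsInBounds is justified by address order and index order agreeing inside one object; an illustrative check:

#include <cassert>
#include <initializer_list>

int main() {
  static long Buf[64];
  for (long I : {0L, 3L, 63L})
    for (long J : {0L, 3L, 63L})
      // In-bounds elements of a single array: pointer order == index order.
      assert((&Buf[I] < &Buf[J]) == (I < J));
}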
@@ -656,7 +655,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, ConstantInt::get(Type::getInt1Ty(I.getContext()), ICmpInst::isTrueWhenEqual(Cond))); - else if (NumDifferences == 1) { + else if (NumDifferences == 1 && GEPsInBounds) { Value *LHSV = GEPLHS->getOperand(DiffOperand); Value *RHSV = GEPRHS->getOperand(DiffOperand); // Make sure we do a signed comparison here. @@ -667,6 +666,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, // Only lower this if the icmp is the only user of the GEP or if we expect // the result to fold to a constant! if (TD && + GEPsInBounds && (isa<ConstantExpr>(GEPLHS) || GEPLHS->hasOneUse()) && (isa<ConstantExpr>(GEPRHS) || GEPRHS->hasOneUse())) { // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2) ---> (OFFSET1 cmp OFFSET2) @@ -699,7 +699,7 @@ Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI, return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(X->getContext())); // From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0, - // so the values can never be equal. Similiarly for all other "or equals" + // so the values can never be equal. Similarly for all other "or equals" // operators. // (X+1) <u X --> X >u (MAXUINT-1) --> X == 255 @@ -919,11 +919,11 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, if (ICI.isSigned() != (Shr->getOpcode() == Instruction::AShr)) return 0; - // Otherwise, all lshr and all exact ashr's are equivalent to a udiv/sdiv by - // a power of 2. Since we already have logic to simplify these, transform - // to div and then simplify the resultant comparison. + // Otherwise, all lshr and most exact ashr's are equivalent to a udiv/sdiv + // by a power of 2. Since we already have logic to simplify these, + // transform to div and then simplify the resultant comparison. if (Shr->getOpcode() == Instruction::AShr && - !Shr->isExact()) + (!Shr->isExact() || ShAmtVal == TypeBits - 1)) return 0; // Revisit the shift (to delete it). @@ -1087,22 +1087,33 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, // have its sign bit set or if it is an equality comparison. // Extending a relational comparison when we're checking the sign // bit would not work. - if (Cast->hasOneUse() && - (ICI.isEquality() || - (AndCST->getValue().isNonNegative() && RHSV.isNonNegative()))) { - uint32_t BitWidth = - cast<IntegerType>(Cast->getOperand(0)->getType())->getBitWidth(); - APInt NewCST = AndCST->getValue().zext(BitWidth); - APInt NewCI = RHSV.zext(BitWidth); - Value *NewAnd = + if (ICI.isEquality() || + (AndCST->getValue().isNonNegative() && RHSV.isNonNegative())) { + Value *NewAnd = Builder->CreateAnd(Cast->getOperand(0), - ConstantInt::get(ICI.getContext(), NewCST), - LHSI->getName()); + ConstantExpr::getZExt(AndCST, Cast->getSrcTy())); + NewAnd->takeName(LHSI); return new ICmpInst(ICI.getPredicate(), NewAnd, - ConstantInt::get(ICI.getContext(), NewCI)); + ConstantExpr::getZExt(RHS, Cast->getSrcTy())); } } - + + // If the LHS is an AND of a zext, and we have an equality compare, we can + // shrink the and/compare to the smaller type, eliminating the cast. + if (ZExtInst *Cast = dyn_cast<ZExtInst>(LHSI->getOperand(0))) { + const IntegerType *Ty = cast<IntegerType>(Cast->getSrcTy()); + // Make sure we don't compare the upper bits, SimplifyDemandedBits + // should fold the icmp to true/false in that case. 
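The icmp shrink being added here compares in the narrow source type; that is only sound when the compare constant fits in the source width, which is exactly what the guard on the next line enforces. A check with sample constants (hedged sketch):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t CA = 0x0000F0F0u; // and-mask applied to zext(x) from i8
  const uint32_t C  = 0x00000050u; // compare constant; fits in 8 bits
  for (uint32_t X8 = 0; X8 < 256; ++X8) {
    bool Wide   = ((X8 & 0xFFu) & CA) == C;         // original, after zext
    bool Narrow = (uint8_t)(X8 & CA) == (uint8_t)C; // shrunk to i8
    assert(Wide == Narrow);
  }
}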
+ if (ICI.isEquality() && RHSV.getActiveBits() <= Ty->getBitWidth()) { + Value *NewAnd = + Builder->CreateAnd(Cast->getOperand(0), + ConstantExpr::getTrunc(AndCST, Ty)); + NewAnd->takeName(LHSI); + return new ICmpInst(ICI.getPredicate(), NewAnd, + ConstantExpr::getTrunc(RHS, Ty)); + } + } + // If this is: (X >> C1) & C2 != C3 (where any shift and any compare // could exist), turn it into (X & (C2 << C1)) != (C3 << C1). This // happens a LOT in code produced by the C front-end, for bitfield @@ -1384,9 +1395,9 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, if (Value *NegVal = dyn_castNegVal(BOp1)) return new ICmpInst(ICI.getPredicate(), BOp0, NegVal); - else if (Value *NegVal = dyn_castNegVal(BOp0)) + if (Value *NegVal = dyn_castNegVal(BOp0)) return new ICmpInst(ICI.getPredicate(), NegVal, BOp1); - else if (BO->hasOneUse()) { + if (BO->hasOneUse()) { Value *Neg = Builder->CreateNeg(BOp1); Neg->takeName(BO); return new ICmpInst(ICI.getPredicate(), BOp0, Neg); @@ -1396,18 +1407,27 @@ case Instruction::Xor: // For the xor case, we can xor two constants together, eliminating // the explicit xor. - if (Constant *BOC = dyn_cast<Constant>(BO->getOperand(1))) - return new ICmpInst(ICI.getPredicate(), BO->getOperand(0), + if (Constant *BOC = dyn_cast<Constant>(BO->getOperand(1))) { + return new ICmpInst(ICI.getPredicate(), BO->getOperand(0), ConstantExpr::getXor(RHS, BOC)); - - // FALLTHROUGH + } else if (RHSV == 0) { + // Replace ((xor A, B) != 0) with (A != B) + return new ICmpInst(ICI.getPredicate(), BO->getOperand(0), + BO->getOperand(1)); + } + break; case Instruction::Sub: - // Replace (([sub|xor] A, B) != 0) with (A != B) - if (RHSV == 0) + // Replace ((sub A, B) != C) with (B != A-C) if A & C are constants. + if (ConstantInt *BOp0C = dyn_cast<ConstantInt>(BO->getOperand(0))) { + if (BO->hasOneUse()) + return new ICmpInst(ICI.getPredicate(), BO->getOperand(1), + ConstantExpr::getSub(BOp0C, RHS)); + } else if (RHSV == 0) { + // Replace ((sub A, B) != 0) with (A != B) return new ICmpInst(ICI.getPredicate(), BO->getOperand(0), BO->getOperand(1)); + } break; - case Instruction::Or: // If bits are being or'd in that are not present in the constant we // are comparing against, then the comparison could never succeed! @@ -2400,7 +2420,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // fall-through case Instruction::SDiv: case Instruction::AShr: - if (!BO0->isExact() && !BO1->isExact()) + if (!BO0->isExact() || !BO1->isExact()) break; return new ICmpInst(I.getPredicate(), BO0->getOperand(0), BO1->getOperand(0)); @@ -2483,9 +2503,8 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { } // (X&Z) == (Y&Z) -> (X^Y) & Z == 0 - if (Op0->hasOneUse() && Op1->hasOneUse() && - match(Op0, m_And(m_Value(A), m_Value(B))) && - match(Op1, m_And(m_Value(C), m_Value(D)))) { + if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B)))) && + match(Op1, m_OneUse(m_And(m_Value(C), m_Value(D))))) { Value *X = 0, *Y = 0, *Z = 0; if (A == C) { @@ -2506,6 +2525,32 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { return &I; } } + + // Transform "icmp eq (trunc (lshr(X, cst1))), cst" to + // "icmp (and X, mask), cst" + uint64_t ShAmt = 0; + ConstantInt *Cst1; + if (Op0->hasOneUse() && + match(Op0, m_Trunc(m_OneUse(m_LShr(m_Value(A), + m_ConstantInt(ShAmt))))) && + match(Op1, m_ConstantInt(Cst1)) && + // Only do this when A has multiple uses.
This is most important to do + // when it exposes other optimizations. + !A->hasOneUse()) { + unsigned ASize =cast<IntegerType>(A->getType())->getPrimitiveSizeInBits(); + + if (ShAmt < ASize) { + APInt MaskV = + APInt::getLowBitsSet(ASize, Op0->getType()->getPrimitiveSizeInBits()); + MaskV <<= ShAmt; + + APInt CmpV = Cst1->getValue().zext(ASize); + CmpV <<= ShAmt; + + Value *Mask = Builder->CreateAnd(A, Builder->getInt(MaskV)); + return new ICmpInst(I.getPredicate(), Mask, Builder->getInt(CmpV)); + } + } } { diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 432adc9..f499290 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -57,12 +57,14 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { Value *Idx[2]; Idx[0] = NullIdx; Idx[1] = NullIdx; - Value *V = GetElementPtrInst::CreateInBounds(New, Idx, Idx + 2, - New->getName()+".sub", It); + Instruction *GEP = + GetElementPtrInst::CreateInBounds(New, Idx, Idx + 2, + New->getName()+".sub"); + InsertNewInstBefore(GEP, *It); // Now make everything use the getelementptr instead of the original // allocation. - return ReplaceInstUsesWith(AI, V); + return ReplaceInstUsesWith(AI, GEP); } else if (isa<UndefValue>(AI.getArraySize())) { return ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType())); } @@ -600,10 +602,12 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { // Advance to a place where it is safe to insert the new store and // insert it. BBI = DestBB->getFirstNonPHI(); - InsertNewInstBefore(new StoreInst(MergedVal, SI.getOperand(1), - OtherStore->isVolatile(), - SI.getAlignment()), *BBI); - + StoreInst *NewSI = new StoreInst(MergedVal, SI.getOperand(1), + OtherStore->isVolatile(), + SI.getAlignment()); + InsertNewInstBefore(NewSI, *BBI); + NewSI->setDebugLoc(OtherStore->getDebugLoc()); + // Nuke the old stores. EraseInstFromFunction(SI); EraseInstFromFunction(*OtherStore); diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 6651387..2d29403 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -19,6 +19,60 @@ using namespace llvm; using namespace PatternMatch; + +/// simplifyValueKnownNonZero - The specific integer value is used in a context +/// where it is known to be non-zero. If this allows us to simplify the +/// computation, do so and return the new operand, otherwise return null. +static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) { + // If V has multiple uses, then we would have to do more analysis to determine + // if this is safe. For example, the use could be in dynamically unreached + // code. + if (!V->hasOneUse()) return 0; + + bool MadeChange = false; + + // ((1 << A) >>u B) --> (1 << (A-B)) + // Because V cannot be zero, we know that B is less than A. + Value *A = 0, *B = 0, *PowerOf2 = 0; + if (match(V, m_LShr(m_OneUse(m_Shl(m_Value(PowerOf2), m_Value(A))), + m_Value(B))) && + // The "1" can be any value known to be a power of 2. + isPowerOfTwo(PowerOf2, IC.getTargetData())) { + A = IC.Builder->CreateSub(A, B, "tmp"); + return IC.Builder->CreateShl(PowerOf2, A); + } + + // (PowerOfTwo >>u B) --> isExact since shifting out the result would make it + // inexact. Similarly for <<. 
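+  // Editor's illustration (not part of the patch): if %v = lshr i32 16, %b
+  // is known non-zero, the single set bit of 16 cannot have been shifted
+  // out (that would leave zero); e.g. 16 >>u 2 = 4 exactly, while 16 >>u 5
+  // would give 0 and is excluded by the non-zero precondition, so the lshr
+  // can be marked exact (and a shl marked nuw by the same argument).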
+ if (BinaryOperator *I = dyn_cast<BinaryOperator>(V)) + if (I->isLogicalShift() && + isPowerOfTwo(I->getOperand(0), IC.getTargetData())) { + // We know that this is an exact/nuw shift and that the input is a + // non-zero context as well. + if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC)) { + I->setOperand(0, V2); + MadeChange = true; + } + + if (I->getOpcode() == Instruction::LShr && !I->isExact()) { + I->setIsExact(); + MadeChange = true; + } + + if (I->getOpcode() == Instruction::Shl && !I->hasNoUnsignedWrap()) { + I->setHasNoUnsignedWrap(); + MadeChange = true; + } + } + + // TODO: Lots more we could do here: + // If V is a phi node, we can call this on each of its operands. + // "select cond, X, 0" can simplify to "X". + + return MadeChange ? V : 0; +} + + /// MultiplyOverflows - True if the multiply can not be expressed in an int /// this size. static bool MultiplyOverflows(ConstantInt *C1, ConstantInt *C2, bool sign) { @@ -81,6 +135,29 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { return BinaryOperator::CreateAdd(Add, Builder->CreateMul(C1, CI)); } } + + // (Y - X) * (-(2**n)) -> (X - Y) * (2**n), for positive nonzero n + // (Y + const) * (-(2**n)) -> (-constY) * (2**n), for positive nonzero n + // The "* (2**n)" thus becomes a potential shifting opportunity. + { + const APInt & Val = CI->getValue(); + const APInt &PosVal = Val.abs(); + if (Val.isNegative() && PosVal.isPowerOf2()) { + Value *X = 0, *Y = 0; + if (Op0->hasOneUse()) { + ConstantInt *C1; + Value *Sub = 0; + if (match(Op0, m_Sub(m_Value(Y), m_Value(X)))) + Sub = Builder->CreateSub(X, Y, "suba"); + else if (match(Op0, m_Add(m_Value(Y), m_ConstantInt(C1)))) + Sub = Builder->CreateSub(Builder->CreateNeg(C1), Y, "subc"); + if (Sub) + return + BinaryOperator::CreateMul(Sub, + ConstantInt::get(Y->getType(), PosVal)); + } + } + } } // Simplify mul instructions with a constant RHS. @@ -293,6 +370,12 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) { Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + // The RHS is known non-zero. + if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this)) { + I.setOperand(1, V); + return &I; + } + // Handle cases involving: [su]div X, (select Cond, Y, Z) // This does not apply for fdiv. if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I)) @@ -320,6 +403,10 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) { } } + // See if we can fold away this div instruction. + if (SimplifyDemandedInstructionBits(I)) + return &I; + // (X - (X rem Y)) / Y -> X / Y; usually originates as ((X / Y) * Y) / Y Value *X = 0, *Z = 0; if (match(Op0, m_Sub(m_Value(X), m_Value(Z)))) { // (X - Z) / Y; Y = Op1 @@ -332,6 +419,19 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) { return 0; } +/// dyn_castZExtVal - Checks if V is a zext or constant that can +/// be truncated to Ty without losing bits. 
+static Value *dyn_castZExtVal(Value *V, const Type *Ty) { + if (ZExtInst *Z = dyn_cast<ZExtInst>(V)) { + if (Z->getSrcTy() == Ty) + return Z->getOperand(0); + } else if (ConstantInt *C = dyn_cast<ConstantInt>(V)) { + if (C->getValue().getActiveBits() <= cast<IntegerType>(Ty)->getBitWidth()) + return ConstantExpr::getTrunc(C, Ty); + } + return 0; +} + Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); @@ -390,6 +490,14 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { return SelectInst::Create(Cond, TSI, FSI); } } + + // (zext A) udiv (zext B) --> zext (A udiv B) + if (ZExtInst *ZOp0 = dyn_cast<ZExtInst>(Op0)) + if (Value *ZOp1 = dyn_castZExtVal(Op1, ZOp0->getSrcTy())) + return new ZExtInst(Builder->CreateUDiv(ZOp0->getOperand(0), ZOp1, "div", + I.isExact()), + I.getType()); + return 0; } @@ -467,28 +575,6 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { return 0; } -/// This function implements the transforms on rem instructions that work -/// regardless of the kind of rem instruction it is (urem, srem, or frem). It -/// is used by the visitors to those instructions. -/// @brief Transforms common to all three rem instructions -Instruction *InstCombiner::commonRemTransforms(BinaryOperator &I) { - Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - - if (isa<UndefValue>(Op0)) { // undef % X -> 0 - if (I.getType()->isFPOrFPVectorTy()) - return ReplaceInstUsesWith(I, Op0); // X % undef -> undef (could be SNaN) - return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType())); - } - if (isa<UndefValue>(Op1)) - return ReplaceInstUsesWith(I, Op1); // X % undef -> undef - - // Handle cases involving: rem X, (select Cond, Y, Z) - if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I)) - return &I; - - return 0; -} - /// This function implements the transforms common to both integer remainder /// instructions (urem and srem). It is called by the visitors to those integer /// remainder instructions. @@ -496,26 +582,17 @@ Instruction *InstCombiner::commonRemTransforms(BinaryOperator &I) { Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - if (Instruction *common = commonRemTransforms(I)) - return common; - - // X % X == 0 - if (Op0 == Op1) - return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType())); - - // 0 % X == 0 for integer, we don't need to preserve faults! - if (Constant *LHS = dyn_cast<Constant>(Op0)) - if (LHS->isNullValue()) - return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType())); + // The RHS is known non-zero. + if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this)) { + I.setOperand(1, V); + return &I; + } - if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) { - // X % 0 == undef, we don't need to preserve faults! 
- if (RHS->equalsInt(0)) - return ReplaceInstUsesWith(I, UndefValue::get(I.getType())); - - if (RHS->equalsInt(1)) // X % 1 == 0 - return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType())); + // Handle cases involving: rem X, (select Cond, Y, Z) + if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I)) + return &I; + if (isa<ConstantInt>(Op1)) { if (Instruction *Op0I = dyn_cast<Instruction>(Op0)) { if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) { if (Instruction *R = FoldOpIntoSelect(I, SI)) @@ -537,6 +614,9 @@ Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) { Instruction *InstCombiner::visitURem(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = SimplifyURemInst(Op0, Op1, TD)) + return ReplaceInstUsesWith(I, V); + if (Instruction *common = commonIRemTransforms(I)) return common; @@ -564,13 +644,22 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) { return SelectInst::Create(Cond, TrueAnd, FalseAnd); } } - + + // (zext A) urem (zext B) --> zext (A urem B) + if (ZExtInst *ZOp0 = dyn_cast<ZExtInst>(Op0)) + if (Value *ZOp1 = dyn_castZExtVal(Op1, ZOp0->getSrcTy())) + return new ZExtInst(Builder->CreateURem(ZOp0->getOperand(0), ZOp1), + I.getType()); + return 0; } Instruction *InstCombiner::visitSRem(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = SimplifySRemInst(Op0, Op1, TD)) + return ReplaceInstUsesWith(I, V); + // Handle the integer rem common cases if (Instruction *Common = commonIRemTransforms(I)) return Common; @@ -629,6 +718,14 @@ Instruction *InstCombiner::visitSRem(BinaryOperator &I) { } Instruction *InstCombiner::visitFRem(BinaryOperator &I) { - return commonRemTransforms(I); -} + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = SimplifyFRemInst(Op0, Op1, TD)) + return ReplaceInstUsesWith(I, V); + + // Handle cases involving: rem X, (select Cond, Y, Z) + if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I)) + return &I; + + return 0; +} diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp index c5f31fb..3777340 100644 --- a/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -110,16 +110,20 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) { } } - if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst)) - return CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(), - LHSVal, RHSVal); - + if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst)) { + CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(), + LHSVal, RHSVal); + NewCI->setDebugLoc(FirstInst->getDebugLoc()); + return NewCI; + } + BinaryOperator *BinOp = cast<BinaryOperator>(FirstInst); BinaryOperator *NewBinOp = BinaryOperator::Create(BinOp->getOpcode(), LHSVal, RHSVal); if (isNUW) NewBinOp->setHasNoUnsignedWrap(); if (isNSW) NewBinOp->setHasNoSignedWrap(); if (isExact) NewBinOp->setIsExact(); + NewBinOp->setDebugLoc(FirstInst->getDebugLoc()); return NewBinOp; } @@ -228,6 +232,7 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { GetElementPtrInst::Create(Base, FixedOperands.begin()+1, FixedOperands.end()); if (AllInBounds) NewGEP->setIsInBounds(); + NewGEP->setDebugLoc(FirstInst->getDebugLoc()); return NewGEP; } @@ -237,7 +242,7 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { /// obvious the value of the load is not changed from the point of the load to /// the end of the block it is in. 
/// -/// Finally, it is safe, but not profitable, to sink a load targetting a +/// Finally, it is safe, but not profitable, to sink a load targeting a /// non-address-taken alloca. Doing so will cause us to not promote the alloca /// to a register. static bool isSafeAndProfitableToSinkLoad(LoadInst *L) { @@ -369,7 +374,9 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) cast<LoadInst>(PN.getIncomingValue(i))->setVolatile(false); - return new LoadInst(PhiVal, "", isVolatile, LoadAlignment); + LoadInst *NewLI = new LoadInst(PhiVal, "", isVolatile, LoadAlignment); + NewLI->setDebugLoc(FirstLI->getDebugLoc()); + return NewLI; } @@ -469,20 +476,27 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { } // Insert and return the new operation. - if (CastInst *FirstCI = dyn_cast<CastInst>(FirstInst)) - return CastInst::Create(FirstCI->getOpcode(), PhiVal, PN.getType()); + if (CastInst *FirstCI = dyn_cast<CastInst>(FirstInst)) { + CastInst *NewCI = CastInst::Create(FirstCI->getOpcode(), PhiVal, + PN.getType()); + NewCI->setDebugLoc(FirstInst->getDebugLoc()); + return NewCI; + } if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst)) { BinOp = BinaryOperator::Create(BinOp->getOpcode(), PhiVal, ConstantOp); if (isNUW) BinOp->setHasNoUnsignedWrap(); if (isNSW) BinOp->setHasNoSignedWrap(); if (isExact) BinOp->setIsExact(); + BinOp->setDebugLoc(FirstInst->getDebugLoc()); return BinOp; } CmpInst *CIOp = cast<CmpInst>(FirstInst); - return CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(), - PhiVal, ConstantOp); + CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(), + PhiVal, ConstantOp); + NewCI->setDebugLoc(FirstInst->getDebugLoc()); + return NewCI; } /// DeadPHICycle - Return true if this PHI node is only used by a PHI node cycle @@ -774,9 +788,6 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { // PHINode simplification // Instruction *InstCombiner::visitPHINode(PHINode &PN) { - // If LCSSA is around, don't mess with Phi nodes - if (MustPreserveLCSSA) return 0; - if (Value *V = SimplifyInstruction(&PN, TD)) return ReplaceInstUsesWith(PN, V); @@ -824,18 +835,18 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) { // quick check to see if the PHI node only contains a single non-phi value, if // so, scan to see if the phi cycle is actually equal to that value. { - unsigned InValNo = 0, NumOperandVals = PN.getNumIncomingValues(); + unsigned InValNo = 0, NumIncomingVals = PN.getNumIncomingValues(); // Scan for the first non-phi operand. - while (InValNo != NumOperandVals && + while (InValNo != NumIncomingVals && isa<PHINode>(PN.getIncomingValue(InValNo))) ++InValNo; - if (InValNo != NumOperandVals) { - Value *NonPhiInVal = PN.getOperand(InValNo); + if (InValNo != NumIncomingVals) { + Value *NonPhiInVal = PN.getIncomingValue(InValNo); // Scan the rest of the operands to see if there are any conflicts, if so // there is no need to recursively scan other phis. - for (++InValNo; InValNo != NumOperandVals; ++InValNo) { + for (++InValNo; InValNo != NumIncomingVals; ++InValNo) { Value *OpVal = PN.getIncomingValue(InValNo); if (OpVal != NonPhiInVal && !isa<PHINode>(OpVal)) break; @@ -844,7 +855,7 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) { // If we scanned over all operands, then we have one unique value plus // phi values. Scan PHI nodes to see if they all merge in each other or // the value. 
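+    // Editor's example (hypothetical IR): %a = phi [%v, ...], [%b, ...] and
+    // %b = phi [%a, ...], [%v, ...] form a cycle whose only non-phi input
+    // is %v, so both phis are equal to %v.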
- if (InValNo == NumOperandVals) { + if (InValNo == NumIncomingVals) { SmallPtrSet<PHINode*, 16> ValueEqualPHIs; if (PHIsEqualValue(&PN, NonPhiInVal, ValueEqualPHIs)) return ReplaceInstUsesWith(PN, NonPhiInVal); diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index 61a433a..aeb3c3e 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -133,9 +133,8 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI, } // Fold this by inserting a select from the input values. - SelectInst *NewSI = SelectInst::Create(SI.getCondition(), TI->getOperand(0), - FI->getOperand(0), SI.getName()+".v"); - InsertNewInstBefore(NewSI, SI); + Value *NewSI = Builder->CreateSelect(SI.getCondition(), TI->getOperand(0), + FI->getOperand(0), SI.getName()+".v"); return CastInst::Create(Instruction::CastOps(TI->getOpcode()), NewSI, TI->getType()); } @@ -174,9 +173,8 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI, } // If we reach here, they do have operations in common. - SelectInst *NewSI = SelectInst::Create(SI.getCondition(), OtherOpT, - OtherOpF, SI.getName()+".v"); - InsertNewInstBefore(NewSI, SI); + Value *NewSI = Builder->CreateSelect(SI.getCondition(), OtherOpT, + OtherOpF, SI.getName()+".v"); if (BinaryOperator *BO = dyn_cast<BinaryOperator>(TI)) { if (MatchIsOpZero) @@ -224,8 +222,7 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal, // Avoid creating select between 2 constants unless it's selecting // between 0, 1 and -1. if (!isa<Constant>(OOp) || isSelect01(C, cast<Constant>(OOp))) { - Instruction *NewSel = SelectInst::Create(SI.getCondition(), OOp, C); - InsertNewInstBefore(NewSel, SI); + Value *NewSel = Builder->CreateSelect(SI.getCondition(), OOp, C); NewSel->takeName(TVI); BinaryOperator *TVI_BO = cast<BinaryOperator>(TVI); BinaryOperator *BO = BinaryOperator::Create(TVI_BO->getOpcode(), @@ -260,8 +257,7 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal, // Avoid creating select between 2 constants unless it's selecting // between 0, 1 and -1. if (!isa<Constant>(OOp) || isSelect01(C, cast<Constant>(OOp))) { - Instruction *NewSel = SelectInst::Create(SI.getCondition(), C, OOp); - InsertNewInstBefore(NewSel, SI); + Value *NewSel = Builder->CreateSelect(SI.getCondition(), C, OOp); NewSel->takeName(FVI); BinaryOperator *FVI_BO = cast<BinaryOperator>(FVI); BinaryOperator *BO = BinaryOperator::Create(FVI_BO->getOpcode(), @@ -282,6 +278,59 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal, return 0; } +/// SimplifyWithOpReplaced - See if V simplifies when its operand Op is +/// replaced with RepOp. +static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, + const TargetData *TD) { + // Trivial replacement. + if (V == Op) + return RepOp; + + Instruction *I = dyn_cast<Instruction>(V); + if (!I) + return 0; + + // If this is a binary operator, try to simplify it with the replaced op. + if (BinaryOperator *B = dyn_cast<BinaryOperator>(I)) { + if (B->getOperand(0) == Op) + return SimplifyBinOp(B->getOpcode(), RepOp, B->getOperand(1), TD); + if (B->getOperand(1) == Op) + return SimplifyBinOp(B->getOpcode(), B->getOperand(0), RepOp, TD); + } + + // Same for CmpInsts. 
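+  // Editor's sketch: for V = "icmp ult %x, %y" with Op = %x and RepOp = %y,
+  // this asks SimplifyCmpInst about "icmp ult %y, %y", which folds to false.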
+ if (CmpInst *C = dyn_cast<CmpInst>(I)) { + if (C->getOperand(0) == Op) + return SimplifyCmpInst(C->getPredicate(), RepOp, C->getOperand(1), TD); + if (C->getOperand(1) == Op) + return SimplifyCmpInst(C->getPredicate(), C->getOperand(0), RepOp, TD); + } + + // TODO: We could hand off more cases to instsimplify here. + + // If all operands are constant after substituting Op for RepOp then we can + // constant fold the instruction. + if (Constant *CRepOp = dyn_cast<Constant>(RepOp)) { + // Build a list of all constant operands. + SmallVector<Constant*, 8> ConstOps; + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { + if (I->getOperand(i) == Op) + ConstOps.push_back(CRepOp); + else if (Constant *COp = dyn_cast<Constant>(I->getOperand(i))) + ConstOps.push_back(COp); + else + break; + } + + // All operands were constants, fold it. + if (ConstOps.size() == I->getNumOperands()) + return ConstantFoldInstOperands(I->getOpcode(), I->getType(), + ConstOps.data(), ConstOps.size(), TD); + } + + return 0; +} + /// visitSelectInstWithICmp - Visit a SelectInst that has an /// ICmpInst as its first operand. /// @@ -420,25 +469,21 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, } } - if (CmpLHS == TrueVal && CmpRHS == FalseVal) { - // Transform (X == Y) ? X : Y -> Y - if (Pred == ICmpInst::ICMP_EQ) + // If we have an equality comparison then we know the value in one of the + // arms of the select. See if substituting this value into the arm and + // simplifying the result yields the same value as the other arm. + if (Pred == ICmpInst::ICMP_EQ) { + if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, TD) == TrueVal || + SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, TD) == TrueVal) return ReplaceInstUsesWith(SI, FalseVal); - // Transform (X != Y) ? X : Y -> X - if (Pred == ICmpInst::ICMP_NE) + } else if (Pred == ICmpInst::ICMP_NE) { + if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, TD) == FalseVal || + SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, TD) == FalseVal) return ReplaceInstUsesWith(SI, TrueVal); - /// NOTE: if we wanted to, this is where to detect integer MIN/MAX - - } else if (CmpLHS == FalseVal && CmpRHS == TrueVal) { - // Transform (X == Y) ? Y : X -> X - if (Pred == ICmpInst::ICMP_EQ) - return ReplaceInstUsesWith(SI, FalseVal); - // Transform (X != Y) ? Y : X -> Y - if (Pred == ICmpInst::ICMP_NE) - return ReplaceInstUsesWith(SI, TrueVal); - /// NOTE: if we wanted to, this is where to detect integer MIN/MAX } + // NOTE: if we wanted to, this is where to detect integer MIN/MAX + if (isa<Constant>(CmpRHS)) { if (CmpLHS == TrueVal && Pred == ICmpInst::ICMP_EQ) { // Transform (X == C) ? X : Y -> (X == C) ? 
C : Y @@ -604,9 +649,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { return BinaryOperator::CreateOr(CondVal, FalseVal); } // Change: A = select B, false, C --> A = and !B, C - Value *NotCond = - InsertNewInstBefore(BinaryOperator::CreateNot(CondVal, - "not."+CondVal->getName()), SI); + Value *NotCond = Builder->CreateNot(CondVal, "not."+CondVal->getName()); return BinaryOperator::CreateAnd(NotCond, FalseVal); } else if (ConstantInt *C = dyn_cast<ConstantInt>(FalseVal)) { if (C->getZExtValue() == false) { @@ -614,9 +657,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { return BinaryOperator::CreateAnd(CondVal, TrueVal); } // Change: A = select B, C, true --> A = or !B, C - Value *NotCond = - InsertNewInstBefore(BinaryOperator::CreateNot(CondVal, - "not."+CondVal->getName()), SI); + Value *NotCond = Builder->CreateNot(CondVal, "not."+CondVal->getName()); return BinaryOperator::CreateOr(NotCond, TrueVal); } @@ -755,27 +796,20 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // So at this point we know we have (Y -> OtherAddOp): // select C, (add X, Y), (sub X, Z) Value *NegVal; // Compute -Z - if (Constant *C = dyn_cast<Constant>(SubOp->getOperand(1))) { - NegVal = ConstantExpr::getNeg(C); - } else if (SI.getType()->isFloatingPointTy()) { - NegVal = InsertNewInstBefore( - BinaryOperator::CreateFNeg(SubOp->getOperand(1), - "tmp"), SI); + if (SI.getType()->isFloatingPointTy()) { + NegVal = Builder->CreateFNeg(SubOp->getOperand(1)); } else { - NegVal = InsertNewInstBefore( - BinaryOperator::CreateNeg(SubOp->getOperand(1), - "tmp"), SI); + NegVal = Builder->CreateNeg(SubOp->getOperand(1)); } Value *NewTrueOp = OtherAddOp; Value *NewFalseOp = NegVal; if (AddOp != TI) std::swap(NewTrueOp, NewFalseOp); - Instruction *NewSel = - SelectInst::Create(CondVal, NewTrueOp, - NewFalseOp, SI.getName() + ".p"); + Value *NewSel = + Builder->CreateSelect(CondVal, NewTrueOp, + NewFalseOp, SI.getName() + ".p"); - NewSel = InsertNewInstBefore(NewSel, SI); if (SI.getType()->isFloatingPointTy()) return BinaryOperator::CreateFAdd(SubOp->getOperand(0), NewSel); else diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index a7f8005..811f949 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -644,7 +644,14 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) { return &I; } } - + + // (C1 << A) << C2 -> (C1 << C2) << A + Constant *C1, *C2; + Value *A; + if (match(I.getOperand(0), m_OneUse(m_Shl(m_Constant(C1), m_Value(A)))) && + match(I.getOperand(1), m_Constant(C2))) + return BinaryOperator::CreateShl(ConstantExpr::getShl(C1, C2), A); + return 0; } diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 6e727ce..8fea8eb 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -313,7 +313,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, Instruction *Or = BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1), I->getName()); - return InsertNewInstBefore(Or, *I); + return InsertNewInstWith(Or, *I); } // If all of the demanded bits on one side are known, and all of the set @@ -327,7 +327,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, ~RHSKnownOne & DemandedMask); Instruction *And = 
BinaryOperator::CreateAnd(I->getOperand(0), AndC, "tmp"); - return InsertNewInstBefore(And, *I); + return InsertNewInstWith(And, *I); } } @@ -353,13 +353,13 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, ConstantInt::get(I->getType(), NewMask & AndRHS->getValue()); Instruction *NewAnd = BinaryOperator::CreateAnd(I->getOperand(0), AndC, "tmp"); - InsertNewInstBefore(NewAnd, *I); + InsertNewInstWith(NewAnd, *I); Constant *XorC = ConstantInt::get(I->getType(), NewMask & XorRHS->getValue()); Instruction *NewXor = BinaryOperator::CreateXor(NewAnd, XorC, "tmp"); - return InsertNewInstBefore(NewXor, *I); + return InsertNewInstWith(NewXor, *I); } // Output known-0 bits are known if clear or set in both the LHS & RHS. @@ -472,7 +472,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (KnownZero[SrcBitWidth-1] || (NewBits & ~DemandedMask) == NewBits) { // Convert to ZExt cast CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName()); - return InsertNewInstBefore(NewCast, *I); + return InsertNewInstWith(NewCast, *I); } else if (KnownOne[SrcBitWidth-1]) { // Input sign bit known set KnownOne |= NewBits; } @@ -515,7 +515,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, Instruction *Or = BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1), I->getName()); - return InsertNewInstBefore(Or, *I); + return InsertNewInstWith(Or, *I); } // We can say something about the output known-zero and known-one bits, @@ -632,7 +632,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // Perform the logical shift right. Instruction *NewVal = BinaryOperator::CreateLShr( I->getOperand(0), I->getOperand(1), I->getName()); - return InsertNewInstBefore(NewVal, *I); + return InsertNewInstWith(NewVal, *I); } // If the sign bit is the only bit demanded by this ashr, then there is no @@ -676,7 +676,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // Perform the logical shift right. Instruction *NewVal = BinaryOperator::CreateLShr( I->getOperand(0), SA, I->getName()); - return InsertNewInstBefore(NewVal, *I); + return InsertNewInstWith(NewVal, *I); } else if ((KnownOne & SignBit) != 0) { // New bits are known one. KnownOne |= HighBits; } @@ -774,12 +774,16 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, NewVal = BinaryOperator::CreateShl(II->getArgOperand(0), ConstantInt::get(I->getType(), ResultBit-InputBit)); NewVal->takeName(I); - return InsertNewInstBefore(NewVal, *I); + return InsertNewInstWith(NewVal, *I); } // TODO: Could compute known zero/one bits based on the input. break; } + case Intrinsic::x86_sse42_crc32_64_8: + case Intrinsic::x86_sse42_crc32_64_64: + KnownZero = APInt::getHighBitsSet(64, 32); + return 0; } } ComputeMaskedBits(V, DemandedMask, KnownZero, KnownOne, Depth); @@ -867,7 +871,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, if (Depth == 10) return 0; - // If multiple users are using the root value, procede with + // If multiple users are using the root value, proceed with // simplification conservatively assuming that all elements // are needed. if (!V->hasOneUse()) { @@ -1108,21 +1112,21 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, Value *LHS = II->getArgOperand(0); Value *RHS = II->getArgOperand(1); // Extract the element as scalars. 
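+      // Editor's note (illustration): when only lane 0 of, say,
+      // sub_ss(<a0,a1,a2,a3>, <b0,b1,b2,b3>) is demanded, the intrinsic
+      // reduces to the scalar a0 - b0 reinserted at lane 0, built below.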
- LHS = InsertNewInstBefore(ExtractElementInst::Create(LHS, + LHS = InsertNewInstWith(ExtractElementInst::Create(LHS, ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U)), *II); - RHS = InsertNewInstBefore(ExtractElementInst::Create(RHS, + RHS = InsertNewInstWith(ExtractElementInst::Create(RHS, ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U)), *II); switch (II->getIntrinsicID()) { default: llvm_unreachable("Case stmts out of sync!"); case Intrinsic::x86_sse_sub_ss: case Intrinsic::x86_sse2_sub_sd: - TmpV = InsertNewInstBefore(BinaryOperator::CreateFSub(LHS, RHS, + TmpV = InsertNewInstWith(BinaryOperator::CreateFSub(LHS, RHS, II->getName()), *II); break; case Intrinsic::x86_sse_mul_ss: case Intrinsic::x86_sse2_mul_sd: - TmpV = InsertNewInstBefore(BinaryOperator::CreateFMul(LHS, RHS, + TmpV = InsertNewInstWith(BinaryOperator::CreateFMul(LHS, RHS, II->getName()), *II); break; } @@ -1132,7 +1136,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, UndefValue::get(II->getType()), TmpV, ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U, false), II->getName()); - InsertNewInstBefore(New, *II); + InsertNewInstWith(New, *II); return New; } } diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 4be671f..92c10f5 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -76,7 +76,6 @@ INITIALIZE_PASS(InstCombiner, "instcombine", "Combine redundant instructions", false, false) void InstCombiner::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addPreservedID(LCSSAID); AU.setPreservesCFG(); } @@ -241,9 +240,9 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { Constant *C2 = cast<Constant>(Op1->getOperand(1)); Constant *Folded = ConstantExpr::get(Opcode, C1, C2); - Instruction *New = BinaryOperator::Create(Opcode, A, B, Op1->getName(), - &I); - Worklist.Add(New); + Instruction *New = BinaryOperator::Create(Opcode, A, B); + InsertNewInstWith(New, I); + New->takeName(Op1); I.setOperand(0, New); I.setOperand(1, Folded); // Conservatively clear the optional flags, since they may not be @@ -600,7 +599,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { } // Okay, we can do the transformation: create the new PHI node. - PHINode *NewPN = PHINode::Create(I.getType(), PN->getNumIncomingValues(), ""); + PHINode *NewPN = PHINode::Create(I.getType(), PN->getNumIncomingValues()); InsertNewInstBefore(NewPN, *PN); NewPN->takeName(PN); @@ -1089,8 +1088,8 @@ Instruction *InstCombiner::visitFree(CallInst &FI) { // free undef -> unreachable. if (isa<UndefValue>(Op)) { // Insert a new store to null because we cannot modify the CFG here. - new StoreInst(ConstantInt::getTrue(FI.getContext()), - UndefValue::get(Type::getInt1PtrTy(FI.getContext())), &FI); + Builder->CreateStore(ConstantInt::getTrue(FI.getContext()), + UndefValue::get(Type::getInt1PtrTy(FI.getContext()))); return EraseInstFromFunction(FI); } @@ -1262,7 +1261,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { case Intrinsic::sadd_with_overflow: if (*EV.idx_begin() == 0) { // Normal result. 
Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); - II->replaceAllUsesWith(UndefValue::get(II->getType())); + ReplaceInstUsesWith(*II, UndefValue::get(II->getType())); EraseInstFromFunction(*II); return BinaryOperator::CreateAdd(LHS, RHS); } @@ -1279,7 +1278,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { case Intrinsic::ssub_with_overflow: if (*EV.idx_begin() == 0) { // Normal result. Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); - II->replaceAllUsesWith(UndefValue::get(II->getType())); + ReplaceInstUsesWith(*II, UndefValue::get(II->getType())); EraseInstFromFunction(*II); return BinaryOperator::CreateSub(LHS, RHS); } @@ -1288,7 +1287,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { case Intrinsic::smul_with_overflow: if (*EV.idx_begin() == 0) { // Normal result. Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); - II->replaceAllUsesWith(UndefValue::get(II->getType())); + ReplaceInstUsesWith(*II, UndefValue::get(II->getType())); EraseInstFromFunction(*II); return BinaryOperator::CreateMul(LHS, RHS); } @@ -1386,8 +1385,8 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, Worklist.push_back(BB); SmallVector<Instruction*, 128> InstrsForInstCombineWorklist; - SmallPtrSet<ConstantExpr*, 64> FoldedConstants; - + DenseMap<ConstantExpr*, Constant*> FoldedConstants; + do { BB = Worklist.pop_back_val(); @@ -1422,14 +1421,15 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, i != e; ++i) { ConstantExpr *CE = dyn_cast<ConstantExpr>(i); if (CE == 0) continue; - - // If we already folded this constant, don't try again. - if (!FoldedConstants.insert(CE)) - continue; - - Constant *NewC = ConstantFoldConstantExpression(CE, TD); - if (NewC && NewC != CE) { - *i = NewC; + + Constant*& FoldRes = FoldedConstants[CE]; + if (!FoldRes) + FoldRes = ConstantFoldConstantExpression(CE, TD); + if (!FoldRes) + FoldRes = CE; + + if (FoldRes != CE) { + *i = FoldRes; MadeIRChange = true; } } @@ -1576,6 +1576,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { // Now that we have an instruction, try combining it to simplify it. Builder->SetInsertPoint(I->getParent(), I); + Builder->SetCurrentDebugLocation(I->getDebugLoc()); #ifndef NDEBUG std::string OrigI; @@ -1590,7 +1591,8 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { DEBUG(errs() << "IC: Old = " << *I << '\n' << " New = " << *Result << '\n'); - Result->setDebugLoc(I->getDebugLoc()); + if (!I->getDebugLoc().isUnknown()) + Result->setDebugLoc(I->getDebugLoc()); // Everything uses the new instruction now. 
I->replaceAllUsesWith(Result);
@@ -1637,7 +1639,6 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
 bool InstCombiner::runOnFunction(Function &F) {
-  MustPreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
   TD = getAnalysisIfAvailable<TargetData>();
diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt
index 0ac1cb0..5700ac8 100644
--- a/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_llvm_library(LLVMInstrumentation
   EdgeProfiling.cpp
+  GCOVProfiling.cpp
   Instrumentation.cpp
   OptimalEdgeProfiling.cpp
   PathProfiling.cpp
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
new file mode 100644
index 0000000..b902213
--- /dev/null
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -0,0 +1,673 @@
+//===- GCOVProfiling.cpp - Insert edge counters for gcov profiling --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements GCOV-style profiling. When this pass is run it emits
+// "gcno" files next to the existing source, and instruments the code that
+// runs to record the edges between blocks that run, emitting a complementary
+// "gcda" file on exit.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "insert-gcov-profiling"
+
+#include "ProfilingUtils.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Instructions.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugLoc.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Support/PathV2.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/UniqueVector.h"
+#include <string>
+#include <utility>
+using namespace llvm;
+
+namespace {
+  class GCOVProfiler : public ModulePass {
+  public:
+    static char ID;
+    GCOVProfiler()
+        : ModulePass(ID), EmitNotes(true), EmitData(true), Use402Format(false) {
+      initializeGCOVProfilerPass(*PassRegistry::getPassRegistry());
+    }
+    GCOVProfiler(bool EmitNotes, bool EmitData, bool use402Format = false)
+        : ModulePass(ID), EmitNotes(EmitNotes), EmitData(EmitData),
+          Use402Format(use402Format) {
+      assert((EmitNotes || EmitData) && "GCOVProfiler asked to do nothing?");
+      initializeGCOVProfilerPass(*PassRegistry::getPassRegistry());
+    }
+    virtual const char *getPassName() const {
+      return "GCOV Profiler";
+    }
+
+  private:
+    bool runOnModule(Module &M);
+
+    // Create the GCNO files for the Module based on DebugInfo.
+    void emitGCNO(DebugInfoFinder &DIF);
+
+    // Modify the program to track transitions along edges and call into the
+    // profiling runtime to emit .gcda files when run.
+    bool emitProfileArcs(DebugInfoFinder &DIF);
+
+    // Get pointers to the functions in the runtime library.
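+    // Editor's sketch of the C-side runtime API these bind to, read off the
+    // FunctionTypes constructed below (the runtime library itself is not
+    // part of this patch):
+    //   void llvm_gcda_start_file(const char *filename);
+    //   void llvm_gcda_increment_indirect_counter(uint32_t *predecessor,
+    //                                             uint64_t **state_table_row);
+    //   void llvm_gcda_emit_function(uint32_t ident, const char *function_name);
+    //   void llvm_gcda_emit_arcs(uint32_t num_counters, uint64_t *counters);
+    //   void llvm_gcda_end_file(void);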
+ Constant *getStartFileFunc(); + Constant *getIncrementIndirectCounterFunc(); + Constant *getEmitFunctionFunc(); + Constant *getEmitArcsFunc(); + Constant *getEndFileFunc(); + + // Create or retrieve an i32 state value that is used to represent the + // pred block number for certain non-trivial edges. + GlobalVariable *getEdgeStateValue(); + + // Produce a table of pointers to counters, by predecessor and successor + // block number. + GlobalVariable *buildEdgeLookupTable(Function *F, + GlobalVariable *Counter, + const UniqueVector<BasicBlock *> &Preds, + const UniqueVector<BasicBlock *> &Succs); + + // Add the function to write out all our counters to the global destructor + // list. + void insertCounterWriteout(DebugInfoFinder &, + SmallVector<std::pair<GlobalVariable *, + MDNode *>, 8> &); + + std::string mangleName(DICompileUnit CU, std::string NewStem); + + bool EmitNotes; + bool EmitData; + bool Use402Format; + + Module *M; + LLVMContext *Ctx; + }; +} + +char GCOVProfiler::ID = 0; +INITIALIZE_PASS(GCOVProfiler, "insert-gcov-profiling", + "Insert instrumentation for GCOV profiling", false, false) + +ModulePass *llvm::createGCOVProfilerPass(bool EmitNotes, bool EmitData, + bool Use402Format) { + return new GCOVProfiler(EmitNotes, EmitData, Use402Format); +} + +static DISubprogram findSubprogram(DIScope Scope) { + while (!Scope.isSubprogram()) { + assert(Scope.isLexicalBlock() && + "Debug location not lexical block or subprogram"); + Scope = DILexicalBlock(Scope).getContext(); + } + return DISubprogram(Scope); +} + +namespace { + class GCOVRecord { + protected: + static const char *LinesTag; + static const char *FunctionTag; + static const char *BlockTag; + static const char *EdgeTag; + + GCOVRecord() {} + + void writeBytes(const char *Bytes, int Size) { + os->write(Bytes, Size); + } + + void write(uint32_t i) { + writeBytes(reinterpret_cast<char*>(&i), 4); + } + + // Returns the length measured in 4-byte blocks that will be used to + // represent this string in a GCOV file + unsigned lengthOfGCOVString(StringRef s) { + // A GCOV string is a length, followed by a NUL, then between 0 and 3 NULs + // padding out to the next 4-byte word. The length is measured in 4-byte + // words including padding, not bytes of actual string. + return (s.size() / 4) + 1; + } + + void writeGCOVString(StringRef s) { + uint32_t Len = lengthOfGCOVString(s); + write(Len); + writeBytes(s.data(), s.size()); + + // Write 1 to 4 bytes of NUL padding. + assert((unsigned)(4 - (s.size() % 4)) > 0); + assert((unsigned)(4 - (s.size() % 4)) <= 4); + writeBytes("\0\0\0\0", 4 - (s.size() % 4)); + } + + raw_ostream *os; + }; + const char *GCOVRecord::LinesTag = "\0\0\x45\x01"; + const char *GCOVRecord::FunctionTag = "\0\0\0\1"; + const char *GCOVRecord::BlockTag = "\0\0\x41\x01"; + const char *GCOVRecord::EdgeTag = "\0\0\x43\x01"; + + class GCOVFunction; + class GCOVBlock; + + // Constructed only by requesting it from a GCOVBlock, this object stores a + // list of line numbers and a single filename, representing lines that belong + // to the block. + class GCOVLines : public GCOVRecord { + public: + void addLine(uint32_t Line) { + Lines.push_back(Line); + } + + uint32_t length() { + return lengthOfGCOVString(Filename) + 2 + Lines.size(); + } + + private: + friend class GCOVBlock; + + GCOVLines(std::string Filename, raw_ostream *os) + : Filename(Filename) { + this->os = os; + } + + std::string Filename; + SmallVector<uint32_t, 32> Lines; + }; + + // Represent a basic block in GCOV. 
Each block has a unique number in the + // function, number of lines belonging to each block, and a set of edges to + // other blocks. + class GCOVBlock : public GCOVRecord { + public: + GCOVLines &getFile(std::string Filename) { + GCOVLines *&Lines = LinesByFile[Filename]; + if (!Lines) { + Lines = new GCOVLines(Filename, os); + } + return *Lines; + } + + void addEdge(GCOVBlock &Successor) { + OutEdges.push_back(&Successor); + } + + void writeOut() { + uint32_t Len = 3; + for (StringMap<GCOVLines *>::iterator I = LinesByFile.begin(), + E = LinesByFile.end(); I != E; ++I) { + Len += I->second->length(); + } + + writeBytes(LinesTag, 4); + write(Len); + write(Number); + for (StringMap<GCOVLines *>::iterator I = LinesByFile.begin(), + E = LinesByFile.end(); I != E; ++I) { + write(0); + writeGCOVString(I->second->Filename); + for (int i = 0, e = I->second->Lines.size(); i != e; ++i) { + write(I->second->Lines[i]); + } + } + write(0); + write(0); + } + + ~GCOVBlock() { + DeleteContainerSeconds(LinesByFile); + } + + private: + friend class GCOVFunction; + + GCOVBlock(uint32_t Number, raw_ostream *os) + : Number(Number) { + this->os = os; + } + + uint32_t Number; + StringMap<GCOVLines *> LinesByFile; + SmallVector<GCOVBlock *, 4> OutEdges; + }; + + // A function has a unique identifier, a checksum (we leave as zero) and a + // set of blocks and a map of edges between blocks. This is the only GCOV + // object users can construct, the blocks and lines will be rooted here. + class GCOVFunction : public GCOVRecord { + public: + GCOVFunction(DISubprogram SP, raw_ostream *os, bool Use402Format) { + this->os = os; + + Function *F = SP.getFunction(); + uint32_t i = 0; + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + Blocks[BB] = new GCOVBlock(i++, os); + } + ReturnBlock = new GCOVBlock(i++, os); + + writeBytes(FunctionTag, 4); + uint32_t BlockLen = 1 + 1 + 1 + lengthOfGCOVString(SP.getName()) + + 1 + lengthOfGCOVString(SP.getFilename()) + 1; + if (!Use402Format) + ++BlockLen; // For second checksum. + write(BlockLen); + uint32_t Ident = reinterpret_cast<intptr_t>((MDNode*)SP); + write(Ident); + write(0); // checksum #1 + if (!Use402Format) + write(0); // checksum #2 + writeGCOVString(SP.getName()); + writeGCOVString(SP.getFilename()); + write(SP.getLineNumber()); + } + + ~GCOVFunction() { + DeleteContainerSeconds(Blocks); + delete ReturnBlock; + } + + GCOVBlock &getBlock(BasicBlock *BB) { + return *Blocks[BB]; + } + + GCOVBlock &getReturnBlock() { + return *ReturnBlock; + } + + void writeOut() { + // Emit count of blocks. + writeBytes(BlockTag, 4); + write(Blocks.size() + 1); + for (int i = 0, e = Blocks.size() + 1; i != e; ++i) { + write(0); // No flags on our blocks. + } + + // Emit edges between blocks. + for (DenseMap<BasicBlock *, GCOVBlock *>::iterator I = Blocks.begin(), + E = Blocks.end(); I != E; ++I) { + GCOVBlock &Block = *I->second; + if (Block.OutEdges.empty()) continue; + + writeBytes(EdgeTag, 4); + write(Block.OutEdges.size() * 2 + 1); + write(Block.Number); + for (int i = 0, e = Block.OutEdges.size(); i != e; ++i) { + write(Block.OutEdges[i]->Number); + write(0); // no flags + } + } + + // Emit lines for each block. 
+ for (DenseMap<BasicBlock *, GCOVBlock *>::iterator I = Blocks.begin(), + E = Blocks.end(); I != E; ++I) { + I->second->writeOut(); + } + } + + private: + DenseMap<BasicBlock *, GCOVBlock *> Blocks; + GCOVBlock *ReturnBlock; + }; +} + +std::string GCOVProfiler::mangleName(DICompileUnit CU, std::string NewStem) { + if (NamedMDNode *GCov = M->getNamedMetadata("llvm.gcov")) { + for (int i = 0, e = GCov->getNumOperands(); i != e; ++i) { + MDNode *N = GCov->getOperand(i); + if (N->getNumOperands() != 2) continue; + MDString *GCovFile = dyn_cast<MDString>(N->getOperand(0)); + MDNode *CompileUnit = dyn_cast<MDNode>(N->getOperand(1)); + if (!GCovFile || !CompileUnit) continue; + if (CompileUnit == CU) { + SmallString<128> Filename = GCovFile->getString(); + sys::path::replace_extension(Filename, NewStem); + return Filename.str(); + } + } + } + + SmallString<128> Filename = CU.getFilename(); + sys::path::replace_extension(Filename, NewStem); + return sys::path::filename(Filename.str()); +} + +bool GCOVProfiler::runOnModule(Module &M) { + this->M = &M; + Ctx = &M.getContext(); + + DebugInfoFinder DIF; + DIF.processModule(M); + + if (EmitNotes) emitGCNO(DIF); + if (EmitData) return emitProfileArcs(DIF); + return false; +} + +void GCOVProfiler::emitGCNO(DebugInfoFinder &DIF) { + DenseMap<const MDNode *, raw_fd_ostream *> GcnoFiles; + for (DebugInfoFinder::iterator I = DIF.compile_unit_begin(), + E = DIF.compile_unit_end(); I != E; ++I) { + // Each compile unit gets its own .gcno file. This means that whether we run + // this pass over the original .o's as they're produced, or run it after + // LTO, we'll generate the same .gcno files. + + DICompileUnit CU(*I); + raw_fd_ostream *&out = GcnoFiles[CU]; + std::string ErrorInfo; + out = new raw_fd_ostream(mangleName(CU, "gcno").c_str(), ErrorInfo, + raw_fd_ostream::F_Binary); + if (!Use402Format) + out->write("oncg*404MVLL", 12); + else + out->write("oncg*402MVLL", 12); + } + + for (DebugInfoFinder::iterator SPI = DIF.subprogram_begin(), + SPE = DIF.subprogram_end(); SPI != SPE; ++SPI) { + DISubprogram SP(*SPI); + raw_fd_ostream *&os = GcnoFiles[SP.getCompileUnit()]; + + Function *F = SP.getFunction(); + if (!F) continue; + GCOVFunction Func(SP, os, Use402Format); + + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + GCOVBlock &Block = Func.getBlock(BB); + TerminatorInst *TI = BB->getTerminator(); + if (int successors = TI->getNumSuccessors()) { + for (int i = 0; i != successors; ++i) { + Block.addEdge(Func.getBlock(TI->getSuccessor(i))); + } + } else if (isa<ReturnInst>(TI)) { + Block.addEdge(Func.getReturnBlock()); + } + + uint32_t Line = 0; + for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); I != IE; ++I) { + const DebugLoc &Loc = I->getDebugLoc(); + if (Loc.isUnknown()) continue; + if (Line == Loc.getLine()) continue; + Line = Loc.getLine(); + if (SP != findSubprogram(DIScope(Loc.getScope(*Ctx)))) continue; + + GCOVLines &Lines = Block.getFile(SP.getFilename()); + Lines.addLine(Loc.getLine()); + } + } + Func.writeOut(); + } + + for (DenseMap<const MDNode *, raw_fd_ostream *>::iterator + I = GcnoFiles.begin(), E = GcnoFiles.end(); I != E; ++I) { + raw_fd_ostream *&out = I->second; + out->write("\0\0\0\0\0\0\0\0", 8); // EOF + out->close(); + delete out; + } +} + +bool GCOVProfiler::emitProfileArcs(DebugInfoFinder &DIF) { + if (DIF.subprogram_begin() == DIF.subprogram_end()) + return false; + + SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> CountersBySP; + for (DebugInfoFinder::iterator SPI = 
DIF.subprogram_begin(), + SPE = DIF.subprogram_end(); SPI != SPE; ++SPI) { + DISubprogram SP(*SPI); + Function *F = SP.getFunction(); + if (!F) continue; + + unsigned Edges = 0; + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + TerminatorInst *TI = BB->getTerminator(); + if (isa<ReturnInst>(TI)) + ++Edges; + else + Edges += TI->getNumSuccessors(); + } + + const ArrayType *CounterTy = + ArrayType::get(Type::getInt64Ty(*Ctx), Edges); + GlobalVariable *Counters = + new GlobalVariable(*M, CounterTy, false, + GlobalValue::InternalLinkage, + Constant::getNullValue(CounterTy), + "__llvm_gcov_ctr", 0, false, 0); + CountersBySP.push_back(std::make_pair(Counters, (MDNode*)SP)); + + UniqueVector<BasicBlock *> ComplexEdgePreds; + UniqueVector<BasicBlock *> ComplexEdgeSuccs; + + unsigned Edge = 0; + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + TerminatorInst *TI = BB->getTerminator(); + int Successors = isa<ReturnInst>(TI) ? 1 : TI->getNumSuccessors(); + if (Successors) { + IRBuilder<> Builder(TI); + + if (Successors == 1) { + Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0, + Edge); + Value *Count = Builder.CreateLoad(Counter); + Count = Builder.CreateAdd(Count, + ConstantInt::get(Type::getInt64Ty(*Ctx),1)); + Builder.CreateStore(Count, Counter); + } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { + Value *Sel = Builder.CreateSelect( + BI->getCondition(), + ConstantInt::get(Type::getInt64Ty(*Ctx), Edge), + ConstantInt::get(Type::getInt64Ty(*Ctx), Edge + 1)); + SmallVector<Value *, 2> Idx; + Idx.push_back(Constant::getNullValue(Type::getInt64Ty(*Ctx))); + Idx.push_back(Sel); + Value *Counter = Builder.CreateInBoundsGEP(Counters, + Idx.begin(), Idx.end()); + Value *Count = Builder.CreateLoad(Counter); + Count = Builder.CreateAdd(Count, + ConstantInt::get(Type::getInt64Ty(*Ctx),1)); + Builder.CreateStore(Count, Counter); + } else { + ComplexEdgePreds.insert(BB); + for (int i = 0; i != Successors; ++i) + ComplexEdgeSuccs.insert(TI->getSuccessor(i)); + } + Edge += Successors; + } + } + + if (!ComplexEdgePreds.empty()) { + GlobalVariable *EdgeTable = + buildEdgeLookupTable(F, Counters, + ComplexEdgePreds, ComplexEdgeSuccs); + GlobalVariable *EdgeState = getEdgeStateValue(); + + const Type *Int32Ty = Type::getInt32Ty(*Ctx); + for (int i = 0, e = ComplexEdgePreds.size(); i != e; ++i) { + IRBuilder<> Builder(ComplexEdgePreds[i+1]->getTerminator()); + Builder.CreateStore(ConstantInt::get(Int32Ty, i), EdgeState); + } + for (int i = 0, e = ComplexEdgeSuccs.size(); i != e; ++i) { + // call runtime to perform increment + IRBuilder<> Builder(ComplexEdgeSuccs[i+1]->getFirstNonPHI()); + Value *CounterPtrArray = + Builder.CreateConstInBoundsGEP2_64(EdgeTable, 0, + i * ComplexEdgePreds.size()); + Builder.CreateCall2(getIncrementIndirectCounterFunc(), + EdgeState, CounterPtrArray); + // clear the predecessor number + Builder.CreateStore(ConstantInt::get(Int32Ty, 0xffffffff), EdgeState); + } + } + } + + insertCounterWriteout(DIF, CountersBySP); + + return true; +} + +// All edges with successors that aren't branches are "complex", because it +// requires complex logic to pick which counter to update. +GlobalVariable *GCOVProfiler::buildEdgeLookupTable( + Function *F, + GlobalVariable *Counters, + const UniqueVector<BasicBlock *> &Preds, + const UniqueVector<BasicBlock *> &Succs) { + // TODO: support invoke, threads. 
We rely on the fact that nothing can modify + // the whole-Module pred edge# between the time we set it and the time we next + // read it. Threads and invoke make this untrue. + + // emit [(succs * preds) x i64*], logically [succ x [pred x i64*]]. + const Type *Int64PtrTy = Type::getInt64PtrTy(*Ctx); + const ArrayType *EdgeTableTy = ArrayType::get( + Int64PtrTy, Succs.size() * Preds.size()); + + Constant **EdgeTable = new Constant*[Succs.size() * Preds.size()]; + Constant *NullValue = Constant::getNullValue(Int64PtrTy); + for (int i = 0, ie = Succs.size() * Preds.size(); i != ie; ++i) + EdgeTable[i] = NullValue; + + unsigned Edge = 0; + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + TerminatorInst *TI = BB->getTerminator(); + int Successors = isa<ReturnInst>(TI) ? 1 : TI->getNumSuccessors(); + if (Successors > 1 && !isa<BranchInst>(TI) && !isa<ReturnInst>(TI)) { + for (int i = 0; i != Successors; ++i) { + BasicBlock *Succ = TI->getSuccessor(i); + IRBuilder<> builder(Succ); + Value *Counter = builder.CreateConstInBoundsGEP2_64(Counters, 0, + Edge + i); + EdgeTable[((Succs.idFor(Succ)-1) * Preds.size()) + + (Preds.idFor(BB)-1)] = cast<Constant>(Counter); + } + } + Edge += Successors; + } + + GlobalVariable *EdgeTableGV = + new GlobalVariable( + *M, EdgeTableTy, true, GlobalValue::InternalLinkage, + ConstantArray::get(EdgeTableTy, + &EdgeTable[0], Succs.size() * Preds.size()), + "__llvm_gcda_edge_table"); + EdgeTableGV->setUnnamedAddr(true); + return EdgeTableGV; +} + +Constant *GCOVProfiler::getStartFileFunc() { + const Type *Args[] = { Type::getInt8PtrTy(*Ctx) }; + const FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), + Args, false); + return M->getOrInsertFunction("llvm_gcda_start_file", FTy); +} + +Constant *GCOVProfiler::getIncrementIndirectCounterFunc() { + const Type *Args[] = { + Type::getInt32PtrTy(*Ctx), // uint32_t *predecessor + Type::getInt64PtrTy(*Ctx)->getPointerTo(), // uint64_t **state_table_row + }; + const FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), + Args, false); + return M->getOrInsertFunction("llvm_gcda_increment_indirect_counter", FTy); +} + +Constant *GCOVProfiler::getEmitFunctionFunc() { + const Type *Args[2] = { + Type::getInt32Ty(*Ctx), // uint32_t ident + Type::getInt8PtrTy(*Ctx), // const char *function_name + }; + const FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), + Args, false); + return M->getOrInsertFunction("llvm_gcda_emit_function", FTy); +} + +Constant *GCOVProfiler::getEmitArcsFunc() { + const Type *Args[] = { + Type::getInt32Ty(*Ctx), // uint32_t num_counters + Type::getInt64PtrTy(*Ctx), // uint64_t *counters + }; + const FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), + Args, false); + return M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy); +} + +Constant *GCOVProfiler::getEndFileFunc() { + const FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + return M->getOrInsertFunction("llvm_gcda_end_file", FTy); +} + +GlobalVariable *GCOVProfiler::getEdgeStateValue() { + GlobalVariable *GV = M->getGlobalVariable("__llvm_gcov_global_state_pred"); + if (!GV) { + GV = new GlobalVariable(*M, Type::getInt32Ty(*Ctx), false, + GlobalValue::InternalLinkage, + ConstantInt::get(Type::getInt32Ty(*Ctx), + 0xffffffff), + "__llvm_gcov_global_state_pred"); + GV->setUnnamedAddr(true); + } + return GV; +} + +void GCOVProfiler::insertCounterWriteout( + DebugInfoFinder &DIF, + SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> &CountersBySP) { + const FunctionType 
*WriteoutFTy = + FunctionType::get(Type::getVoidTy(*Ctx), false); + Function *WriteoutF = Function::Create(WriteoutFTy, + GlobalValue::InternalLinkage, + "__llvm_gcov_writeout", M); + WriteoutF->setUnnamedAddr(true); + BasicBlock *BB = BasicBlock::Create(*Ctx, "", WriteoutF); + IRBuilder<> Builder(BB); + + Constant *StartFile = getStartFileFunc(); + Constant *EmitFunction = getEmitFunctionFunc(); + Constant *EmitArcs = getEmitArcsFunc(); + Constant *EndFile = getEndFileFunc(); + + for (DebugInfoFinder::iterator CUI = DIF.compile_unit_begin(), + CUE = DIF.compile_unit_end(); CUI != CUE; ++CUI) { + DICompileUnit compile_unit(*CUI); + std::string FilenameGcda = mangleName(compile_unit, "gcda"); + Builder.CreateCall(StartFile, + Builder.CreateGlobalStringPtr(FilenameGcda)); + for (SmallVector<std::pair<GlobalVariable *, MDNode *>, 8>::iterator + I = CountersBySP.begin(), E = CountersBySP.end(); + I != E; ++I) { + DISubprogram SP(I->second); + intptr_t ident = reinterpret_cast<intptr_t>(I->second); + Builder.CreateCall2(EmitFunction, + ConstantInt::get(Type::getInt32Ty(*Ctx), ident), + Builder.CreateGlobalStringPtr(SP.getName())); + + GlobalVariable *GV = I->first; + unsigned Arcs = + cast<ArrayType>(GV->getType()->getElementType())->getNumElements(); + Builder.CreateCall2(EmitArcs, + ConstantInt::get(Type::getInt32Ty(*Ctx), Arcs), + Builder.CreateConstGEP2_64(GV, 0, 0)); + } + Builder.CreateCall(EndFile); + } + Builder.CreateRetVoid(); + + InsertProfilingShutdownCall(WriteoutF, M); +} diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp index 96ed4fa..71adc1e 100644 --- a/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -23,6 +23,7 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) { initializeEdgeProfilerPass(Registry); initializeOptimalEdgeProfilerPass(Registry); initializePathProfilerPass(Registry); + initializeGCOVProfilerPass(Registry); } /// LLVMInitializeInstrumentation - C binding for diff --git a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp index ae2f2e2..e09f882 100644 --- a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp +++ b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "insert-optimal-edge-profiling" #include "ProfilingUtils.h" +#include "llvm/Constants.h" #include "llvm/Module.h" #include "llvm/Pass.h" #include "llvm/Analysis/Passes.h" @@ -26,7 +27,6 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Statistic.h" #include "MaximumSpanningTree.h" -#include <set> using namespace llvm; STATISTIC(NumEdgesInserted, "The # of edges inserted."); diff --git a/lib/Transforms/Instrumentation/PathProfiling.cpp b/lib/Transforms/Instrumentation/PathProfiling.cpp index 830251c..182a43d 100644 --- a/lib/Transforms/Instrumentation/PathProfiling.cpp +++ b/lib/Transforms/Instrumentation/PathProfiling.cpp @@ -63,7 +63,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Instrumentation.h" -#include <map> #include <vector> #define HASH_THRESHHOLD 100000 @@ -259,7 +258,7 @@ private: }; // --------------------------------------------------------------------------- -// PathProfiler is a module pass which intruments path profiling instructions +// PathProfiler is a module pass which instruments 
path profiling instructions // --------------------------------------------------------------------------- class PathProfiler : public ModulePass { private: @@ -389,6 +388,9 @@ namespace llvm { // BallLarusEdge << operator overloading raw_ostream& operator<<(raw_ostream& os, + const BLInstrumentationEdge& edge) + LLVM_ATTRIBUTE_USED; + raw_ostream& operator<<(raw_ostream& os, const BLInstrumentationEdge& edge) { os << "[" << edge.getSource()->getName() << " -> " << edge.getTarget()->getName() << "] init: " @@ -1349,8 +1351,6 @@ bool PathProfiler::runOnModule(Module &M) { return false; } - BasicBlock::iterator insertPoint = Main->getEntryBlock().getFirstNonPHI(); - llvmIncrementHashFunction = M.getOrInsertFunction( "llvm_increment_path_count", Type::getVoidTy(*Context), // return type diff --git a/lib/Transforms/Instrumentation/ProfilingUtils.cpp b/lib/Transforms/Instrumentation/ProfilingUtils.cpp index b57bbf6..7435bc3 100644 --- a/lib/Transforms/Instrumentation/ProfilingUtils.cpp +++ b/lib/Transforms/Instrumentation/ProfilingUtils.cpp @@ -110,7 +110,7 @@ void llvm::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum, GlobalValue *CounterArray, bool beginning) { // Insert the increment after any alloca or PHI instructions... BasicBlock::iterator InsertPos = beginning ? BB->getFirstNonPHI() : - BB->getTerminator(); + BB->getTerminator(); while (isa<AllocaInst>(InsertPos)) ++InsertPos; @@ -121,8 +121,7 @@ void llvm::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum, Indices[0] = Constant::getNullValue(Type::getInt32Ty(Context)); Indices[1] = ConstantInt::get(Type::getInt32Ty(Context), CounterNum); Constant *ElementPtr = - ConstantExpr::getGetElementPtr(CounterArray, &Indices[0], - Indices.size()); + ConstantExpr::getGetElementPtr(CounterArray, &Indices[0], Indices.size()); // Load, increment and store the value back. Value *OldVal = new LoadInst(ElementPtr, "OldFuncCounter", InsertPos); @@ -131,3 +130,41 @@ void llvm::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum, "NewFuncCounter", InsertPos); new StoreInst(NewVal, ElementPtr, InsertPos); } + +void llvm::InsertProfilingShutdownCall(Function *Callee, Module *Mod) { + // llvm.global_dtors is an array of type { i32, void ()* }. Prepare those + // types. + const Type *GlobalDtorElems[2] = { + Type::getInt32Ty(Mod->getContext()), + FunctionType::get(Type::getVoidTy(Mod->getContext()), false)->getPointerTo() + }; + const StructType *GlobalDtorElemTy = + StructType::get(Mod->getContext(), GlobalDtorElems, false); + + // Construct the new element we'll be adding. + Constant *Elem[2] = { + ConstantInt::get(Type::getInt32Ty(Mod->getContext()), 65535), + ConstantExpr::getBitCast(Callee, GlobalDtorElems[1]) + }; + + // If llvm.global_dtors exists, make a copy of the things in its list and + // delete it, to replace it with one that has a larger array type. + std::vector<Constant *> dtors; + if (GlobalVariable *GlobalDtors = Mod->getNamedGlobal("llvm.global_dtors")) { + if (ConstantArray *InitList = + dyn_cast<ConstantArray>(GlobalDtors->getInitializer())) { + for (unsigned i = 0, e = InitList->getType()->getNumElements(); + i != e; ++i) + dtors.push_back(cast<Constant>(InitList->getOperand(i))); + } + GlobalDtors->eraseFromParent(); + } + + // Build up llvm.global_dtors with our new item in it. 
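// Two short reference sketches for the machinery below. First, the registration record this function appends: after a single call, the module carries IR of this shape (the destructor here is the writeout function the GCOV pass above registers; 65535 is the default priority):
//
//   @llvm.global_dtors = appending global [1 x { i32, void ()* }]
//     [{ i32, void ()* } { i32 65535, void ()* @__llvm_gcov_writeout }]
//
// Second, a plausible sketch of the C-side llvm_gcda_* entry points those instrumentation calls bind to. The signatures mirror the getOrInsertFunction declarations earlier in this patch, but the increment body is an illustrative assumption about the runtime, not the actual glue code.

#include <stdint.h>

void llvm_gcda_start_file(const char *filename);   /* opens <name>.gcda */
void llvm_gcda_emit_function(uint32_t ident, const char *function_name);
void llvm_gcda_emit_arcs(uint32_t num_counters, uint64_t *counters);
void llvm_gcda_end_file(void);

void llvm_gcda_increment_indirect_counter(uint32_t *predecessor,
                                          uint64_t **state_table_row) {
  uint32_t pred = *predecessor;
  if (pred == 0xffffffffu) return;        /* state was never written */
  if (uint64_t *counter = state_table_row[pred])
    ++*counter;                           /* null slot: edge not instrumented */
  *predecessor = 0xffffffffu;             /* reset for the next indirect edge */
}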
+ GlobalVariable *GlobalDtors = new GlobalVariable( + *Mod, ArrayType::get(GlobalDtorElemTy, 1), false, + GlobalValue::AppendingLinkage, NULL, "llvm.global_dtors"); + dtors.push_back(ConstantStruct::get(Mod->getContext(), Elem, 2, false)); + GlobalDtors->setInitializer(ConstantArray::get( + cast<ArrayType>(GlobalDtors->getType()->getElementType()), dtors)); +} diff --git a/lib/Transforms/Instrumentation/ProfilingUtils.h b/lib/Transforms/Instrumentation/ProfilingUtils.h index a76e357..09b2217 100644 --- a/lib/Transforms/Instrumentation/ProfilingUtils.h +++ b/lib/Transforms/Instrumentation/ProfilingUtils.h @@ -18,9 +18,10 @@ #define PROFILINGUTILS_H namespace llvm { + class BasicBlock; class Function; class GlobalValue; - class BasicBlock; + class Module; class PointerType; void InsertProfilingInitCall(Function *MainFn, const char *FnName, @@ -29,6 +30,7 @@ namespace llvm { void IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum, GlobalValue *CounterArray, bool beginning = true); + void InsertProfilingShutdownCall(Function *Callee, Module *Mod); } #endif diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index fcf914f..c223da6 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -20,6 +20,7 @@ add_llvm_library(LLVMScalarOpts LoopUnswitch.cpp LowerAtomic.cpp MemCpyOptimizer.cpp + ObjCARC.cpp Reassociate.cpp Reg2Mem.cpp SCCP.cpp diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index 2f7ccea..0af14ed 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -147,7 +147,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) { if (!DisableBranchOpts) { MadeChange = false; for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) - MadeChange |= ConstantFoldTerminator(BB); + MadeChange |= ConstantFoldTerminator(BB, true); if (MadeChange) ModifiedDT = true; @@ -371,9 +371,11 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){ // If these values will be promoted, find out what they will be promoted // to. This helps us consider truncates on PPC as noop copies when they // are. - if (TLI.getTypeAction(SrcVT) == TargetLowering::Promote) + if (TLI.getTypeAction(CI->getContext(), SrcVT) == + TargetLowering::TypePromoteInteger) SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT); - if (TLI.getTypeAction(DstVT) == TargetLowering::Promote) + if (TLI.getTypeAction(CI->getContext(), DstVT) == + TargetLowering::TypePromoteInteger) DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT); // If, after promotion, these are the same types, this is a noop copy. @@ -548,7 +550,23 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { // From here on out we're working with named functions. if (CI->getCalledFunction() == 0) return false; - + + // If llvm.dbg.value is far away from the value, then ISel may not be able + // to handle it properly. ISel will drop llvm.dbg.value if it cannot + // find a node corresponding to the value.
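// In sketch form, the hoisting below turns (hypothetical IR, 2.9-era debug
// intrinsic syntax):
//
//   %x = add i32 %a, %b
//   ; ... many unrelated instructions, or even a different block ...
//   call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !10)
//
// into
//
//   %x = add i32 %a, %b
//   call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !10)
//
// so that ISel still has a live node for %x when it reaches the intrinsic.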
+ if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(CI)) + if (Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue())) + if (!VI->isTerminator() && + (DVI->getParent() != VI->getParent() || DT->dominates(DVI, VI))) { + DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI); + DVI->removeFromParent(); + if (isa<PHINode>(VI)) + DVI->insertBefore(VI->getParent()->getFirstNonPHI()); + else + DVI->insertAfter(VI); + return true; + } + // We'll need TargetData from here on out. const TargetData *TD = TLI ? TLI->getTargetData() : 0; if (!TD) return false; @@ -889,11 +907,26 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, MemoryInst->replaceUsesOfWith(Repl, SunkAddr); + // If we have no uses, recursively delete the value and all dead instructions + // using it. if (Repl->use_empty()) { + // This can cause recursive deletion, which can invalidate our iterator. + // Use a WeakVH to hold onto it in case this happens. + WeakVH IterHandle(CurInstIterator); + BasicBlock *BB = CurInstIterator->getParent(); + RecursivelyDeleteTriviallyDeadInstructions(Repl); - // This address is now available for reassignment, so erase the table entry; - // we don't want to match some completely different instruction. - SunkAddrs[Addr] = 0; + + if (IterHandle != CurInstIterator) { + // If the iterator instruction was recursively deleted, start over at the + // start of the block. + CurInstIterator = BB->begin(); + SunkAddrs.clear(); + } else { + // This address is now available for reassignment, so erase the table + // entry; we don't want to match some completely different instruction. + SunkAddrs[Addr] = 0; + } } ++NumMemoryInsts; return true; diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index be12973..e275268 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -13,6 +13,7 @@ #define DEBUG_TYPE "correlated-value-propagation" #include "llvm/Transforms/Scalar.h" +#include "llvm/Constants.h" #include "llvm/Function.h" #include "llvm/Instructions.h" #include "llvm/Pass.h" diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp index dbb68f3..8dbcc23 100644 --- a/lib/Transforms/Scalar/DCE.cpp +++ b/lib/Transforms/Scalar/DCE.cpp @@ -23,7 +23,6 @@ #include "llvm/Pass.h" #include "llvm/Support/InstIterator.h" #include "llvm/ADT/Statistic.h" -#include <set> using namespace llvm; STATISTIC(DIEEliminated, "Number of insts removed by DIE pass"); diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index 9f36a38..cb9b5be 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -437,12 +437,9 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { MemDepResult InstDep = MD->getDependency(Inst); - // Ignore non-local store liveness. + // Ignore any store where we can't find a local dependence. // FIXME: cross-block DSE would be fun. :) - if (InstDep.isNonLocal() || - // Ignore self dependence, which happens in the entry block of the - // function. 
- InstDep.getInst() == Inst) + if (InstDep.isNonLocal() || InstDep.isUnknown()) continue; // If we're storing the same value back to a pointer that we just @@ -478,14 +475,14 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { if (Loc.Ptr == 0) continue; - while (!InstDep.isNonLocal()) { + while (!InstDep.isNonLocal() && !InstDep.isUnknown()) { // Get the memory clobbered by the instruction we depend on. MemDep will // skip any instructions that 'Loc' clearly doesn't interact with. If we // end up depending on a may- or must-aliased load, then we can't optimize // away the store and we bail out. However, if we depend on something // that overwrites the memory location we *can* potentially optimize it. // - // Find out what memory location the dependant instruction stores. + // Find out what memory location the dependent instruction stores. Instruction *DepWrite = InstDep.getInst(); AliasAnalysis::Location DepLoc = getLocForWrite(DepWrite, *AA); // If we didn't get a useful location, or if it isn't a size, bail out. @@ -542,24 +539,26 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { /// HandleFree - Handle frees of entire structures whose dependency is a store /// to a field of that structure. bool DSE::HandleFree(CallInst *F) { + bool MadeChange = false; + MemDepResult Dep = MD->getDependency(F); - do { - if (Dep.isNonLocal()) return false; - + + while (!Dep.isNonLocal() && !Dep.isUnknown()) { Instruction *Dependency = Dep.getInst(); if (!hasMemoryWrite(Dependency) || !isRemovable(Dependency)) - return false; + return MadeChange; Value *DepPointer = GetUnderlyingObject(getStoredPointerOperand(Dependency)); // Check for aliasing. if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) - return false; + return MadeChange; // DCE instructions only used to calculate that store DeleteDeadInstruction(Dependency, *MD); ++NumFastStores; + MadeChange = true; // Inst's old Dependency is now deleted. Compute the next dependency, // which may also be dead, as in @@ -567,9 +566,9 @@ bool DSE::HandleFree(CallInst *F) { // s[1] = 0; // This has just been deleted.
// free(s); Dep = MD->getDependency(F); - } while (!Dep.isNonLocal()); + }; - return true; + return MadeChange; } /// handleEndBlock - Remove dead stores to stack-allocated locations in the diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 45fd665..0e1e6f3 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -63,50 +63,48 @@ static cl::opt<bool> EnableLoadPRE("enable-load-pre", cl::init(true)); namespace { struct Expression { uint32_t opcode; - const Type* type; + const Type *type; SmallVector<uint32_t, 4> varargs; - Expression() { } - Expression(uint32_t o) : opcode(o) { } + Expression(uint32_t o = ~2U) : opcode(o) { } bool operator==(const Expression &other) const { if (opcode != other.opcode) return false; - else if (opcode == ~0U || opcode == ~1U) + if (opcode == ~0U || opcode == ~1U) return true; - else if (type != other.type) + if (type != other.type) return false; - else if (varargs != other.varargs) + if (varargs != other.varargs) return false; return true; } }; class ValueTable { - private: - DenseMap<Value*, uint32_t> valueNumbering; - DenseMap<Expression, uint32_t> expressionNumbering; - AliasAnalysis* AA; - MemoryDependenceAnalysis* MD; - DominatorTree* DT; - - uint32_t nextValueNumber; - - Expression create_expression(Instruction* I); - uint32_t lookup_or_add_call(CallInst* C); - public: - ValueTable() : nextValueNumber(1) { } - uint32_t lookup_or_add(Value *V); - uint32_t lookup(Value *V) const; - void add(Value *V, uint32_t num); - void clear(); - void erase(Value *v); - void setAliasAnalysis(AliasAnalysis* A) { AA = A; } - AliasAnalysis *getAliasAnalysis() const { return AA; } - void setMemDep(MemoryDependenceAnalysis* M) { MD = M; } - void setDomTree(DominatorTree* D) { DT = D; } - uint32_t getNextUnusedValueNumber() { return nextValueNumber; } - void verifyRemoved(const Value *) const; + DenseMap<Value*, uint32_t> valueNumbering; + DenseMap<Expression, uint32_t> expressionNumbering; + AliasAnalysis *AA; + MemoryDependenceAnalysis *MD; + DominatorTree *DT; + + uint32_t nextValueNumber; + + Expression create_expression(Instruction* I); + uint32_t lookup_or_add_call(CallInst* C); + public: + ValueTable() : nextValueNumber(1) { } + uint32_t lookup_or_add(Value *V); + uint32_t lookup(Value *V) const; + void add(Value *V, uint32_t num); + void clear(); + void erase(Value *v); + void setAliasAnalysis(AliasAnalysis* A) { AA = A; } + AliasAnalysis *getAliasAnalysis() const { return AA; } + void setMemDep(MemoryDependenceAnalysis* M) { MD = M; } + void setDomTree(DominatorTree* D) { DT = D; } + uint32_t getNextUnusedValueNumber() { return nextValueNumber; } + void verifyRemoved(const Value *) const; }; } @@ -229,21 +227,19 @@ uint32_t ValueTable::lookup_or_add_call(CallInst* C) { // Non-local case. const MemoryDependenceAnalysis::NonLocalDepInfo &deps = MD->getNonLocalCallDependency(CallSite(C)); - // FIXME: call/call dependencies for readonly calls should return def, not - // clobber! Move the checking logic to MemDep! + // FIXME: Move the checking logic to MemDep! CallInst* cdep = 0; // Check to see if we have a single dominating call instruction that is // identical to C. for (unsigned i = 0, e = deps.size(); i != e; ++i) { const NonLocalDepEntry *I = &deps[i]; - // Ignore non-local dependencies. if (I->getResult().isNonLocal()) continue; - // We don't handle non-depedencies. If we already have a call, reject + // We don't handle non-definitions. If we already have a call, reject // instruction dependencies. 
- if (I->getResult().isClobber() || cdep != 0) { + if (!I->getResult().isDef() || cdep != 0) { cdep = 0; break; } @@ -364,14 +360,14 @@ uint32_t ValueTable::lookup(Value *V) const { return VI->second; } -/// clear - Remove all entries from the ValueTable +/// clear - Remove all entries from the ValueTable. void ValueTable::clear() { valueNumbering.clear(); expressionNumbering.clear(); nextValueNumber = 1; } -/// erase - Remove a value from the value numbering +/// erase - Remove a value from the value numbering. void ValueTable::erase(Value *V) { valueNumbering.erase(V); } @@ -392,20 +388,11 @@ void ValueTable::verifyRemoved(const Value *V) const { namespace { class GVN : public FunctionPass { - bool runOnFunction(Function &F); - public: - static char ID; // Pass identification, replacement for typeid - explicit GVN(bool noloads = false) - : FunctionPass(ID), NoLoads(noloads), MD(0) { - initializeGVNPass(*PassRegistry::getPassRegistry()); - } - - private: bool NoLoads; MemoryDependenceAnalysis *MD; DominatorTree *DT; - const TargetData* TD; - + const TargetData *TD; + ValueTable VN; /// LeaderTable - A mapping from value numbers to lists of Value*'s that @@ -418,17 +405,39 @@ namespace { DenseMap<uint32_t, LeaderTableEntry> LeaderTable; BumpPtrAllocator TableAllocator; + SmallVector<Instruction*, 8> InstrsToErase; + public: + static char ID; // Pass identification, replacement for typeid + explicit GVN(bool noloads = false) + : FunctionPass(ID), NoLoads(noloads), MD(0) { + initializeGVNPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F); + + /// markInstructionForDeletion - This removes the specified instruction from + /// our various maps and marks it for deletion. + void markInstructionForDeletion(Instruction *I) { + VN.erase(I); + InstrsToErase.push_back(I); + } + + const TargetData *getTargetData() const { return TD; } + DominatorTree &getDominatorTree() const { return *DT; } + AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); } + MemoryDependenceAnalysis &getMemDep() const { return *MD; } + private: /// addToLeaderTable - Push a new Value to the LeaderTable onto the list for /// its value number. 
void addToLeaderTable(uint32_t N, Value *V, BasicBlock *BB) { - LeaderTableEntry& Curr = LeaderTable[N]; + LeaderTableEntry &Curr = LeaderTable[N]; if (!Curr.Val) { Curr.Val = V; Curr.BB = BB; return; } - LeaderTableEntry* Node = TableAllocator.Allocate<LeaderTableEntry>(); + LeaderTableEntry *Node = TableAllocator.Allocate<LeaderTableEntry>(); Node->Val = V; Node->BB = BB; Node->Next = Curr.Next; @@ -474,19 +483,17 @@ namespace { AU.addPreserved<DominatorTree>(); AU.addPreserved<AliasAnalysis>(); } + // Helper functions // FIXME: eliminate or document these better - bool processLoad(LoadInst* L, - SmallVectorImpl<Instruction*> &toErase); - bool processInstruction(Instruction *I, - SmallVectorImpl<Instruction*> &toErase); - bool processNonLocalLoad(LoadInst* L, - SmallVectorImpl<Instruction*> &toErase); + bool processLoad(LoadInst *L); + bool processInstruction(Instruction *I); + bool processNonLocalLoad(LoadInst *L); bool processBlock(BasicBlock *BB); - void dump(DenseMap<uint32_t, Value*>& d); + void dump(DenseMap<uint32_t, Value*> &d); bool iterateOnFunction(Function &F); - bool performPRE(Function& F); + bool performPRE(Function &F); Value *findLeader(BasicBlock *BB, uint32_t num); void cleanupGlobalSets(); void verifyRemoved(const Instruction *I) const; @@ -629,17 +636,17 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, TD)) return 0; + // If this is already the right type, just return it. const Type *StoredValTy = StoredVal->getType(); uint64_t StoreSize = TD.getTypeStoreSizeInBits(StoredValTy); - uint64_t LoadSize = TD.getTypeSizeInBits(LoadedTy); + uint64_t LoadSize = TD.getTypeStoreSizeInBits(LoadedTy); // If the store and reload are the same size, we can always reuse it. if (StoreSize == LoadSize) { - if (StoredValTy->isPointerTy() && LoadedTy->isPointerTy()) { - // Pointer to Pointer -> use bitcast. + // Pointer to Pointer -> use bitcast. + if (StoredValTy->isPointerTy() && LoadedTy->isPointerTy()) return new BitCastInst(StoredVal, LoadedTy, "", InsertPt); - } // Convert source pointers to integers, which can be bitcast. if (StoredValTy->isPointerTy()) { @@ -796,6 +803,36 @@ static int AnalyzeLoadFromClobberingStore(const Type *LoadTy, Value *LoadPtr, StorePtr, StoreSize, TD); } +/// AnalyzeLoadFromClobberingLoad - This function is called when we have a +/// memdep query of a load that ends up being clobbered by another load. See if +/// the other load can feed into the second load. +static int AnalyzeLoadFromClobberingLoad(const Type *LoadTy, Value *LoadPtr, + LoadInst *DepLI, const TargetData &TD){ + // Cannot handle reading from store of first-class aggregate yet. + if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy()) + return -1; + + Value *DepPtr = DepLI->getPointerOperand(); + uint64_t DepSize = TD.getTypeSizeInBits(DepLI->getType()); + int R = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, TD); + if (R != -1) return R; + + // If we have a load/load clobber and DepLI can be widened to cover this load, + // then we should widen it!
+ int64_t LoadOffs = 0; + const Value *LoadBase = + GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, TD); + unsigned LoadSize = TD.getTypeStoreSize(LoadTy); + + unsigned Size = MemoryDependenceAnalysis:: + getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI, TD); + if (Size == 0) return -1; + + return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, TD); +} + + + static int AnalyzeLoadFromClobberingMemInst(const Type *LoadTy, Value *LoadPtr, MemIntrinsic *MI, const TargetData &TD) { @@ -843,9 +880,9 @@ static int AnalyzeLoadFromClobberingMemInst(const Type *LoadTy, Value *LoadPtr, /// GetStoreValueForLoad - This function is called when we have a /// memdep query of a load that ends up being a clobbering store. This means -/// that the store *may* provide bits used by the load but we can't be sure -/// because the pointers don't mustalias. Check this case to see if there is -/// anything more we can do before we give up. +/// that the store provides bits used by the load but the pointers don't +/// mustalias. Check this case to see if there is anything more we can do +/// before we give up. static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, const Type *LoadTy, Instruction *InsertPt, const TargetData &TD){ @@ -881,6 +918,69 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, TD); } +/// GetLoadValueForLoad - This function is called when we have a +/// memdep query of a load that ends up being a clobbering load. This means +/// that the load *may* provide bits used by the load but we can't be sure +/// because the pointers don't mustalias. Check this case to see if there is +/// anything more we can do before we give up. +static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, + const Type *LoadTy, Instruction *InsertPt, + GVN &gvn) { + const TargetData &TD = *gvn.getTargetData(); + // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to + // widen SrcVal out to a larger load. + unsigned SrcValSize = TD.getTypeStoreSize(SrcVal->getType()); + unsigned LoadSize = TD.getTypeStoreSize(LoadTy); + if (Offset+LoadSize > SrcValSize) { + assert(!SrcVal->isVolatile() && "Cannot widen volatile load!"); + assert(isa<IntegerType>(SrcVal->getType())&&"Can't widen non-integer load"); + // If we have a load/load clobber and DepLI can be widened to cover this + // load, then we should widen it to the next power of 2 size big enough! + unsigned NewLoadSize = Offset+LoadSize; + if (!isPowerOf2_32(NewLoadSize)) + NewLoadSize = NextPowerOf2(NewLoadSize); + + Value *PtrVal = SrcVal->getPointerOperand(); + + // Insert the new load after the old load. This ensures that subsequent + // memdep queries will find the new load. We can't easily remove the old + // load completely because it is already in the value numbering table.
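// Restating the size computation above as a standalone helper (hypothetical
// name; isPowerOf2_32 and NextPowerOf2 live in llvm/Support/MathExtras.h):

#include "llvm/Support/MathExtras.h"

static unsigned getWidenedLoadSize(unsigned Offset, unsigned LoadSize) {
  unsigned NewSize = Offset + LoadSize;      // bytes the wide load must cover
  if (!llvm::isPowerOf2_32(NewSize))
    NewSize = (unsigned)llvm::NextPowerOf2(NewSize);  // e.g. 3 -> 4, 5 -> 8
  return NewSize;
}

// So covering a 1-byte load at offset 2 of an existing 2-byte load widens the
// old load to 4 bytes; on big-endian targets the code below must then shift
// the original value's bits back down before truncating.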
+ IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal)); + const Type *DestPTy = + IntegerType::get(LoadTy->getContext(), NewLoadSize*8); + DestPTy = PointerType::get(DestPTy, + cast<PointerType>(PtrVal->getType())->getAddressSpace()); + Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc()); + PtrVal = Builder.CreateBitCast(PtrVal, DestPTy); + LoadInst *NewLoad = Builder.CreateLoad(PtrVal); + NewLoad->takeName(SrcVal); + NewLoad->setAlignment(SrcVal->getAlignment()); + + DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n"); + DEBUG(dbgs() << "TO: " << *NewLoad << "\n"); + + // Replace uses of the original load with the wider load. On a big endian + // system, we need to shift down to get the relevant bits. + Value *RV = NewLoad; + if (TD.isBigEndian()) + RV = Builder.CreateLShr(RV, + NewLoadSize*8-SrcVal->getType()->getPrimitiveSizeInBits()); + RV = Builder.CreateTrunc(RV, SrcVal->getType()); + SrcVal->replaceAllUsesWith(RV); + + // We would like to use gvn.markInstructionForDeletion here, but we can't + // because the load is already memoized into the leader map table that GVN + // tracks. It is potentially possible to remove the load from the table, + // but then all of the operations based on it would need to be + // rehashed. Just leave the dead load around. + gvn.getMemDep().removeInstruction(SrcVal); + SrcVal = NewLoad; + } + + return GetStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, TD); +} + + /// GetMemInstValueForLoad - This function is called when we have a /// memdep query of a load that ends up being a clobbering mem intrinsic. static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, @@ -943,11 +1043,12 @@ struct AvailableValueInBlock { BasicBlock *BB; enum ValType { SimpleVal, // A simple offsetted value that is accessed. + LoadVal, // A value produced by a load. MemIntrin // A memory intrinsic which is loaded from. }; /// V - The value that is live out of the block. - PointerIntPair<Value *, 1, ValType> Val; + PointerIntPair<Value *, 2, ValType> Val; /// Offset - The byte offset in Val that is interesting for the load query. unsigned Offset; @@ -972,37 +1073,69 @@ struct AvailableValueInBlock { return Res; } + static AvailableValueInBlock getLoad(BasicBlock *BB, LoadInst *LI, + unsigned Offset = 0) { + AvailableValueInBlock Res; + Res.BB = BB; + Res.Val.setPointer(LI); + Res.Val.setInt(LoadVal); + Res.Offset = Offset; + return Res; + } + bool isSimpleValue() const { return Val.getInt() == SimpleVal; } + bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; } + bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; } + Value *getSimpleValue() const { assert(isSimpleValue() && "Wrong accessor"); return Val.getPointer(); } + LoadInst *getCoercedLoadValue() const { + assert(isCoercedLoadValue() && "Wrong accessor"); + return cast<LoadInst>(Val.getPointer()); + } + MemIntrinsic *getMemIntrinValue() const { - assert(!isSimpleValue() && "Wrong accessor"); + assert(isMemIntrinValue() && "Wrong accessor"); return cast<MemIntrinsic>(Val.getPointer()); } /// MaterializeAdjustedValue - Emit code into this block to adjust the value /// defined here to the specified type. This handles various coercion cases.
- Value *MaterializeAdjustedValue(const Type *LoadTy, - const TargetData *TD) const { + Value *MaterializeAdjustedValue(const Type *LoadTy, GVN &gvn) const { Value *Res; if (isSimpleValue()) { Res = getSimpleValue(); if (Res->getType() != LoadTy) { + const TargetData *TD = gvn.getTargetData(); assert(TD && "Need target data to handle type mismatch case"); Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), *TD); - DEBUG(errs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " + DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " << *getSimpleValue() << '\n' << *Res << '\n' << "\n\n\n"); } + } else if (isCoercedLoadValue()) { + LoadInst *Load = getCoercedLoadValue(); + if (Load->getType() == LoadTy && Offset == 0) { + Res = Load; + } else { + Res = GetLoadValueForLoad(Load, Offset, LoadTy, BB->getTerminator(), + gvn); + + DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " " + << *getCoercedLoadValue() << '\n' + << *Res << '\n' << "\n\n\n"); + } } else { + const TargetData *TD = gvn.getTargetData(); + assert(TD && "Need target data to handle type mismatch case"); Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy, BB->getTerminator(), *TD); - DEBUG(errs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset + DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset << " " << *getMemIntrinValue() << '\n' << *Res << '\n' << "\n\n\n"); } @@ -1010,21 +1143,20 @@ struct AvailableValueInBlock { } }; -} +} // end anonymous namespace /// ConstructSSAForLoadSet - Given a set of loads specified by ValuesPerBlock, /// construct SSA form, allowing us to eliminate LI. This returns the value /// that should be used at LI's definition site. static Value *ConstructSSAForLoadSet(LoadInst *LI, SmallVectorImpl<AvailableValueInBlock> &ValuesPerBlock, - const TargetData *TD, - const DominatorTree &DT, - AliasAnalysis *AA) { + GVN &gvn) { // Check for the fully redundant, dominating load case. In this case, we can // just use the dominating value directly. if (ValuesPerBlock.size() == 1 && - DT.properlyDominates(ValuesPerBlock[0].BB, LI->getParent())) - return ValuesPerBlock[0].MaterializeAdjustedValue(LI->getType(), TD); + gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB, + LI->getParent())) + return ValuesPerBlock[0].MaterializeAdjustedValue(LI->getType(), gvn); // Otherwise, we have to construct SSA form. SmallVector<PHINode*, 8> NewPHIs; @@ -1040,14 +1172,16 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, if (SSAUpdate.HasValueForBlock(BB)) continue; - SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LoadTy, TD)); + SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LoadTy, gvn)); } // Perform PHI construction. Value *V = SSAUpdate.GetValueInMiddleOfBlock(LI->getParent()); // If new PHI nodes were created, notify alias analysis. - if (V->getType()->isPointerTy()) + if (V->getType()->isPointerTy()) { + AliasAnalysis *AA = gvn.getAliasAnalysis(); + for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) AA->copyValue(LI, NewPHIs[i]); @@ -1059,6 +1193,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, for (unsigned ii = 0, ee = P->getNumIncomingValues(); ii != ee; ++ii) AA->addEscapingUse(P->getOperandUse(2*ii)); } + } return V; } @@ -1071,8 +1206,7 @@ static bool isLifetimeStart(const Instruction *Inst) { /// processNonLocalLoad - Attempt to eliminate a load whose dependencies are /// non-local by performing PHI construction. 
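/// For example (a sketch with hypothetical names): if one predecessor block
/// stores %v to %p and another predecessor already contains a load of %p,
/// then a fully redundant
///   %x = load i32* %p
/// below the merge point can be rewritten as
///   %x = phi i32 [ %v, %storepred ], [ %otherload, %loadpred ]
/// once an available value has been found or PRE'd into every predecessor.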
-bool GVN::processNonLocalLoad(LoadInst *LI, - SmallVectorImpl<Instruction*> &toErase) { +bool GVN::processNonLocalLoad(LoadInst *LI) { // Find the non-local dependencies of the load. SmallVector<NonLocalDepResult, 64> Deps; AliasAnalysis::Location Loc = VN.getAliasAnalysis()->getLocation(LI); @@ -1088,11 +1222,11 @@ bool GVN::processNonLocalLoad(LoadInst *LI, // If we had a phi translation failure, we'll have a single entry which is a // clobber in the current block. Reject this early. - if (Deps.size() == 1 && Deps[0].getResult().isClobber()) { + if (Deps.size() == 1 && Deps[0].getResult().isUnknown()) { DEBUG( dbgs() << "GVN: non-local load "; WriteAsOperand(dbgs(), LI); - dbgs() << " is clobbered by " << *Deps[0].getResult().getInst() << '\n'; + dbgs() << " has unknown dependencies\n"; ); return false; } @@ -1108,6 +1242,11 @@ bool GVN::processNonLocalLoad(LoadInst *LI, BasicBlock *DepBB = Deps[i].getBB(); MemDepResult DepInfo = Deps[i].getResult(); + if (DepInfo.isUnknown()) { + UnavailableBlocks.push_back(DepBB); + continue; + } + if (DepInfo.isClobber()) { // The address being loaded in this non-local block may not be the same as // the pointer operand of the load if PHI translation occurs. Make sure @@ -1129,6 +1268,26 @@ bool GVN::processNonLocalLoad(LoadInst *LI, } } } + + // Check to see if we have something like this: + // load i32* P + // load i8* (P+1) + // if we have this, replace the latter with an extraction from the former. + if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInfo.getInst())) { + // If this is a clobber and L is the first instruction in its block, then + // we have the first instruction in the entry block. + if (DepLI != LI && Address && TD) { + int Offset = AnalyzeLoadFromClobberingLoad(LI->getType(), + LI->getPointerOperand(), + DepLI, *TD); + + if (Offset != -1) { + ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB,DepLI, + Offset)); + continue; + } + } + } // If the clobbering value is a memset/memcpy/memmove, see if we can // forward a value on from it. @@ -1148,6 +1307,8 @@ bool GVN::processNonLocalLoad(LoadInst *LI, continue; } + assert(DepInfo.isDef() && "Expecting def here"); + Instruction *DepInst = DepInfo.getInst(); // Loading the allocation -> undef. @@ -1187,7 +1348,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI, continue; } } - ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, LD)); + ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB, LD)); continue; } @@ -1206,16 +1367,14 @@ bool GVN::processNonLocalLoad(LoadInst *LI, DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n'); // Perform PHI construction.
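// The clobbering-load case handled above rewrites, on a little-endian target
// (a sketch in 2.9-era IR syntax, names hypothetical):
//
//   %wide = load i32* %P                     ; the earlier, clobbering load
//   %narrow = load i8* %Q                    ; %Q is %P plus one byte
//
// into an extraction from the wide value, roughly:
//
//   %sh = lshr i32 %wide, 8                  ; Offset * 8 bits
//   %narrow = trunc i32 %sh to i8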
- Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, TD, *DT, - VN.getAliasAnalysis()); + Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this); LI->replaceAllUsesWith(V); if (isa<PHINode>(V)) V->takeName(LI); if (V->getType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); - VN.erase(LI); - toErase.push_back(LI); + markInstructionForDeletion(LI); ++NumPRELoad; return true; } /// processLoad - Attempt to eliminate a load, first by eliminating it /// locally, and then attempting non-local elimination if that fails. -bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) { +bool GVN::processLoad(LoadInst *L) { if (!MD) return false; if (L->isVolatile()) return false; + if (L->use_empty()) { + markInstructionForDeletion(L); + return true; + } + // ... to a pointer that has been loaded from before... MemDepResult Dep = MD->getDependency(L); - // If the value isn't available, don't do anything! - if (Dep.isClobber()) { + // If we have a clobber and target data is around, see if this is a clobber + // that we can fix up through code synthesis. + if (Dep.isClobber() && TD) { // Check to see if we have something like this: // store i32 123, i32* %P // %A = bitcast i32* %P to i8* @@ -1467,26 +1633,40 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) { // completely covers this load. This sort of thing can happen in bitfield // access code. Value *AvailVal = 0; - if (StoreInst *DepSI = dyn_cast<StoreInst>(Dep.getInst())) - if (TD) { - int Offset = AnalyzeLoadFromClobberingStore(L->getType(), - L->getPointerOperand(), - DepSI, *TD); - if (Offset != -1) - AvailVal = GetStoreValueForLoad(DepSI->getValueOperand(), Offset, - L->getType(), L, *TD); - } + if (StoreInst *DepSI = dyn_cast<StoreInst>(Dep.getInst())) { + int Offset = AnalyzeLoadFromClobberingStore(L->getType(), + L->getPointerOperand(), + DepSI, *TD); + if (Offset != -1) + AvailVal = GetStoreValueForLoad(DepSI->getValueOperand(), Offset, + L->getType(), L, *TD); + } + + // Check to see if we have something like this: + // load i32* P + // load i8* (P+1) + // if we have this, replace the latter with an extraction from the former. + if (LoadInst *DepLI = dyn_cast<LoadInst>(Dep.getInst())) { + // If this is a clobber and L is the first instruction in its block, then + // we have the first instruction in the entry block. + if (DepLI == L) + return false; + + int Offset = AnalyzeLoadFromClobberingLoad(L->getType(), + L->getPointerOperand(), + DepLI, *TD); + if (Offset != -1) + AvailVal = GetLoadValueForLoad(DepLI, Offset, L->getType(), L, *this); + } // If the clobbering value is a memset/memcpy/memmove, see if we can forward // a value on from it.
if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(Dep.getInst())) { - if (TD) { - int Offset = AnalyzeLoadFromClobberingMemInst(L->getType(), - L->getPointerOperand(), - DepMI, *TD); - if (Offset != -1) - AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L,*TD); - } + int Offset = AnalyzeLoadFromClobberingMemInst(L->getType(), + L->getPointerOperand(), + DepMI, *TD); + if (Offset != -1) + AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, *TD); } if (AvailVal) { @@ -1497,14 +1677,16 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) { L->replaceAllUsesWith(AvailVal); if (AvailVal->getType()->isPointerTy()) MD->invalidateCachedPointerInfo(AvailVal); - VN.erase(L); - toErase.push_back(L); + markInstructionForDeletion(L); ++NumGVNLoad; return true; } - + } + + // If the value isn't available, don't do anything! + if (Dep.isClobber()) { DEBUG( - // fast print dep, using operator<< on instruction would be too slow + // fast print dep, using operator<< on instruction is too slow. dbgs() << "GVN: load "; WriteAsOperand(dbgs(), L); Instruction *I = Dep.getInst(); @@ -1513,9 +1695,21 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) { return false; } + if (Dep.isUnknown()) { + DEBUG( + // fast print dep, using operator<< on instruction is too slow. + dbgs() << "GVN: load "; + WriteAsOperand(dbgs(), L); + dbgs() << " has unknown dependence\n"; + ); + return false; + } + // If it is defined in another block, try harder. if (Dep.isNonLocal()) - return processNonLocalLoad(L, toErase); + return processNonLocalLoad(L); + + assert(Dep.isDef() && "Expecting def here"); Instruction *DepInst = Dep.getInst(); if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) { @@ -1542,8 +1736,7 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) { L->replaceAllUsesWith(StoredVal); if (StoredVal->getType()->isPointerTy()) MD->invalidateCachedPointerInfo(StoredVal); - VN.erase(L); - toErase.push_back(L); + markInstructionForDeletion(L); ++NumGVNLoad; return true; } @@ -1556,7 +1749,8 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) { // (depending on its type). if (DepLI->getType() != L->getType()) { if (TD) { - AvailableVal = CoerceAvailableValueToLoadType(DepLI, L->getType(), L,*TD); + AvailableVal = CoerceAvailableValueToLoadType(DepLI, L->getType(), + L, *TD); if (AvailableVal == 0) return false; @@ -1571,8 +1765,7 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) { L->replaceAllUsesWith(AvailableVal); if (DepLI->getType()->isPointerTy()) MD->invalidateCachedPointerInfo(DepLI); - VN.erase(L); - toErase.push_back(L); + markInstructionForDeletion(L); ++NumGVNLoad; return true; } @@ -1582,19 +1775,17 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) { // intervening stores, for example. if (isa<AllocaInst>(DepInst) || isMalloc(DepInst)) { L->replaceAllUsesWith(UndefValue::get(L->getType())); - VN.erase(L); - toErase.push_back(L); + markInstructionForDeletion(L); ++NumGVNLoad; return true; } // If this load occurs either right after a lifetime begin, // then the loaded value is undefined. 
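// For instance (sketch): when the load's dependency is the intrinsic itself,
//
//   call void @llvm.lifetime.start(i64 4, i8* %p)
//   %v = load i32* %pc                       ; %pc must-aliases %p
//
// %v can be folded to undef, since nothing has stored into the object yet.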
- if (IntrinsicInst* II = dyn_cast<IntrinsicInst>(DepInst)) { + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(DepInst)) { if (II->getIntrinsicID() == Intrinsic::lifetime_start) { L->replaceAllUsesWith(UndefValue::get(L->getType())); - VN.erase(L); - toErase.push_back(L); + markInstructionForDeletion(L); ++NumGVNLoad; return true; } @@ -1634,8 +1825,7 @@ Value *GVN::findLeader(BasicBlock *BB, uint32_t num) { /// processInstruction - When calculating availability, handle an instruction /// by inserting it into the appropriate sets -bool GVN::processInstruction(Instruction *I, - SmallVectorImpl<Instruction*> &toErase) { +bool GVN::processInstruction(Instruction *I) { // Ignore dbg info intrinsics. if (isa<DbgInfoIntrinsic>(I)) return false; @@ -1648,20 +1838,17 @@ bool GVN::processInstruction(Instruction *I, I->replaceAllUsesWith(V); if (MD && V->getType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); - VN.erase(I); - toErase.push_back(I); + markInstructionForDeletion(I); return true; } if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - bool Changed = processLoad(LI, toErase); - - if (!Changed) { - unsigned Num = VN.lookup_or_add(LI); - addToLeaderTable(Num, LI, LI->getParent()); - } + if (processLoad(LI)) + return true; - return Changed; + unsigned Num = VN.lookup_or_add(LI); + addToLeaderTable(Num, LI, LI->getParent()); + return false; } // For conditions branches, we can perform simple conditional propagation on @@ -1720,11 +1907,10 @@ bool GVN::processInstruction(Instruction *I, } // Remove it! - VN.erase(I); I->replaceAllUsesWith(repl); if (MD && repl->getType()->isPointerTy()) MD->invalidateCachedPointerInfo(repl); - toErase.push_back(I); + markInstructionForDeletion(I); return true; } @@ -1781,35 +1967,36 @@ bool GVN::runOnFunction(Function& F) { bool GVN::processBlock(BasicBlock *BB) { - // FIXME: Kill off toErase by doing erasing eagerly in a helper function (and - // incrementing BI before processing an instruction). - SmallVector<Instruction*, 8> toErase; + // FIXME: Kill off InstrsToErase by doing erasing eagerly in a helper function + // (and incrementing BI before processing an instruction). + assert(InstrsToErase.empty() && + "We expect InstrsToErase to be empty across iterations"); bool ChangedFunction = false; for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - ChangedFunction |= processInstruction(BI, toErase); - if (toErase.empty()) { + ChangedFunction |= processInstruction(BI); + if (InstrsToErase.empty()) { ++BI; continue; } // If we need some instructions deleted, do it now. - NumGVNInstr += toErase.size(); + NumGVNInstr += InstrsToErase.size(); // Avoid iterator invalidation. 
bool AtStart = BI == BB->begin(); if (!AtStart) --BI; - for (SmallVector<Instruction*, 4>::iterator I = toErase.begin(), - E = toErase.end(); I != E; ++I) { + for (SmallVector<Instruction*, 4>::iterator I = InstrsToErase.begin(), + E = InstrsToErase.end(); I != E; ++I) { DEBUG(dbgs() << "GVN removed: " << **I << '\n'); if (MD) MD->removeInstruction(*I); (*I)->eraseFromParent(); DEBUG(verifyRemoved(*I)); } - toErase.clear(); + InstrsToErase.clear(); if (AtStart) BI = BB->begin(); @@ -1936,6 +2123,7 @@ bool GVN::performPRE(Function &F) { PREInstr->insertBefore(PREPred->getTerminator()); PREInstr->setName(CurInst->getName() + ".pre"); + PREInstr->setDebugLoc(CurInst->getDebugLoc()); predMap[PREPred] = PREInstr; VN.add(PREInstr, ValNo); ++NumGVNPRE; @@ -1955,7 +2143,7 @@ bool GVN::performPRE(Function &F) { VN.add(Phi, ValNo); addToLeaderTable(ValNo, Phi, CurrentBlock); - + Phi->setDebugLoc(CurInst->getDebugLoc()); CurInst->replaceAllUsesWith(Phi); if (Phi->getType()->isPointerTy()) { // Because we have added a PHI-use of the pointer value, it has now diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index eebcc69..04ee7c8 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -52,20 +52,30 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Support/CFG.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Target/TargetData.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" using namespace llvm; STATISTIC(NumRemoved , "Number of aux indvars removed"); +STATISTIC(NumWidened , "Number of indvars widened"); STATISTIC(NumInserted, "Number of canonical indvars added"); STATISTIC(NumReplaced, "Number of exit values replaced"); STATISTIC(NumLFTR , "Number of loop exit tests replaced"); +STATISTIC(NumElimExt , "Number of IV sign/zero extends eliminated"); +STATISTIC(NumElimRem , "Number of IV remainder operations eliminated"); +STATISTIC(NumElimCmp , "Number of IV comparisons eliminated"); + +// DisableIVRewrite mode currently affects IVUsers, so is defined in libAnalysis +// and referenced here. 
+namespace llvm { + extern bool DisableIVRewrite; +} namespace { class IndVarSimplify : public LoopPass { @@ -73,12 +83,13 @@ namespace { LoopInfo *LI; ScalarEvolution *SE; DominatorTree *DT; + TargetData *TD; SmallVector<WeakVH, 16> DeadInsts; bool Changed; public: static char ID; // Pass identification, replacement for typeid - IndVarSimplify() : LoopPass(ID) { + IndVarSimplify() : LoopPass(ID), IU(0), LI(0), SE(0), DT(0), TD(0) { initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); } @@ -101,15 +112,18 @@ namespace { private: bool isValidRewrite(Value *FromVal, Value *ToVal); - void EliminateIVComparisons(); - void EliminateIVRemainders(); + void SimplifyIVUsers(SCEVExpander &Rewriter); + void EliminateIVComparison(ICmpInst *ICmp, Value *IVOperand); + void EliminateIVRemainder(BinaryOperator *Rem, + Value *IVOperand, + bool IsSigned, + PHINode *IVPhi); void RewriteNonIntegerIVs(Loop *L); ICmpInst *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, - PHINode *IndVar, - BasicBlock *ExitingBlock, - BranchInst *BI, - SCEVExpander &Rewriter); + PHINode *IndVar, + SCEVExpander &Rewriter); + void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); void RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter); @@ -122,7 +136,7 @@ namespace { char IndVarSimplify::ID = 0; INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars", - "Canonicalize Induction Variables", false, false) + "Induction Variable Simplification", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTree) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) @@ -130,7 +144,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(IVUsers) INITIALIZE_PASS_END(IndVarSimplify, "indvars", - "Canonicalize Induction Variables", false, false) + "Induction Variable Simplification", false, false) Pass *llvm::createIndVarSimplifyPass() { return new IndVarSimplify(); @@ -183,17 +197,23 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) { return true; } -/// LinearFunctionTestReplace - This method rewrites the exit condition of the -/// loop to be a canonical != comparison against the incremented loop induction -/// variable. This pass is able to rewrite the exit tests of any loop where the -/// SCEV analysis can determine a loop-invariant trip count of the loop, which -/// is actually a much broader range than just linear tests. -ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L, - const SCEV *BackedgeTakenCount, - PHINode *IndVar, - BasicBlock *ExitingBlock, - BranchInst *BI, - SCEVExpander &Rewriter) { +/// canExpandBackedgeTakenCount - Return true if this loop's backedge taken +/// count expression can be safely and cheaply expanded into an instruction +/// sequence that can be used by LinearFunctionTestReplace. +static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE) { + const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); + if (isa<SCEVCouldNotCompute>(BackedgeTakenCount) || + BackedgeTakenCount->isZero()) + return false; + + if (!L->getExitingBlock()) + return false; + + // Can't rewrite non-branch yet. + BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator()); + if (!BI) + return false; + // Special case: If the backedge-taken count is a UDiv, it's very likely a // UDiv that ScalarEvolution produced in order to compute a precise // expression, rather than a UDiv from the user's code. 
If we can't find a @@ -201,23 +221,68 @@ ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L, // rewriting the loop. if (isa<SCEVUDivExpr>(BackedgeTakenCount)) { ICmpInst *OrigCond = dyn_cast<ICmpInst>(BI->getCondition()); - if (!OrigCond) return 0; + if (!OrigCond) return false; const SCEV *R = SE->getSCEV(OrigCond->getOperand(1)); R = SE->getMinusSCEV(R, SE->getConstant(R->getType(), 1)); if (R != BackedgeTakenCount) { const SCEV *L = SE->getSCEV(OrigCond->getOperand(0)); L = SE->getMinusSCEV(L, SE->getConstant(L->getType(), 1)); if (L != BackedgeTakenCount) - return 0; + return false; } } + return true; +} + +/// getBackedgeIVType - Get the widest type used by the loop test after peeking +/// through Truncs. +/// +/// TODO: Unnecessary once LinearFunctionTestReplace is removed. +static const Type *getBackedgeIVType(Loop *L) { + if (!L->getExitingBlock()) + return 0; + + // Can't rewrite non-branch yet. + BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator()); + if (!BI) + return 0; + + ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition()); + if (!Cond) + return 0; + + const Type *Ty = 0; + for(User::op_iterator OI = Cond->op_begin(), OE = Cond->op_end(); + OI != OE; ++OI) { + assert((!Ty || Ty == (*OI)->getType()) && "bad icmp operand types"); + TruncInst *Trunc = dyn_cast<TruncInst>(*OI); + if (!Trunc) + continue; + + return Trunc->getSrcTy(); + } + return Ty; +} + +/// LinearFunctionTestReplace - This method rewrites the exit condition of the +/// loop to be a canonical != comparison against the incremented loop induction +/// variable. This pass is able to rewrite the exit tests of any loop where the +/// SCEV analysis can determine a loop-invariant trip count of the loop, which +/// is actually a much broader range than just linear tests. +ICmpInst *IndVarSimplify:: +LinearFunctionTestReplace(Loop *L, + const SCEV *BackedgeTakenCount, + PHINode *IndVar, + SCEVExpander &Rewriter) { + assert(canExpandBackedgeTakenCount(L, SE) && "precondition"); + BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator()); // If the exiting block is not the same as the backedge block, we must compare // against the preincremented value, otherwise we prefer to compare against // the post-incremented value. Value *CmpIndVar; const SCEV *RHS = BackedgeTakenCount; - if (ExitingBlock == L->getLoopLatch()) { + if (L->getExitingBlock() == L->getLoopLatch()) { // Add one to the "backedge-taken" count to get the trip count. // If this addition may overflow, we have to be more pessimistic and // cast the induction variable before doing the add. @@ -240,7 +305,7 @@ ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L, // The BackedgeTaken expression contains the number of times that the // backedge branches to the loop header. This is one less than the // number of times the loop executes, so use the incremented indvar. - CmpIndVar = IndVar->getIncomingValueForBlock(ExitingBlock); + CmpIndVar = IndVar->getIncomingValueForBlock(L->getExitingBlock()); } else { // We have to use the preincremented value... RHS = SE->getTruncateOrZeroExtend(BackedgeTakenCount, @@ -275,7 +340,7 @@ ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L, // update the branch to use the new comparison; in the common case this // will make old comparison dead. 
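// The net effect, sketched for a loop whose latch is also the exiting block
// (so the post-incremented IV is compared against the trip count %n):
//
//   %indvar.next = add i64 %indvar, 1
//   %exitcond = icmp ne i64 %indvar.next, %n
//   br i1 %exitcond, label %loop, label %exit
//
// regardless of what shape the user's original bound test had.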
BI->setCondition(Cond); - RecursivelyDeleteTriviallyDeadInstructions(OrigCond); + DeadInsts.push_back(OrigCond); ++NumLFTR; Changed = true; @@ -418,96 +483,519 @@ void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { SE->forgetLoop(L); } -void IndVarSimplify::EliminateIVComparisons() { - // Look for ICmp users. - for (IVUsers::iterator I = IU->begin(), E = IU->end(); I != E; ++I) { - IVStrideUse &UI = *I; - ICmpInst *ICmp = dyn_cast<ICmpInst>(UI.getUser()); - if (!ICmp) continue; - - bool Swapped = UI.getOperandValToReplace() == ICmp->getOperand(1); - ICmpInst::Predicate Pred = ICmp->getPredicate(); - if (Swapped) Pred = ICmpInst::getSwappedPredicate(Pred); - - // Get the SCEVs for the ICmp operands. - const SCEV *S = IU->getReplacementExpr(UI); - const SCEV *X = SE->getSCEV(ICmp->getOperand(!Swapped)); - - // Simplify unnecessary loops away. - const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent()); - S = SE->getSCEVAtScope(S, ICmpLoop); - X = SE->getSCEVAtScope(X, ICmpLoop); - - // If the condition is always true or always false, replace it with - // a constant value. - if (SE->isKnownPredicate(Pred, S, X)) - ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext())); - else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) - ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext())); - else - continue; +namespace { + // Collect information about induction variables that are used by sign/zero + // extend operations. This information is recorded by CollectExtend and + // provides the input to WidenIV. + struct WideIVInfo { + const Type *WidestNativeType; // Widest integer type created [sz]ext + bool IsSigned; // Was an sext user seen before a zext? + + WideIVInfo() : WidestNativeType(0), IsSigned(false) {} + }; + typedef std::map<PHINode *, WideIVInfo> WideIVMap; +} + +/// CollectExtend - Update information about the induction variable that is +/// extended by this sign or zero extend operation. This is used to determine +/// the final width of the IV before actually widening it. +static void CollectExtend(CastInst *Cast, PHINode *Phi, bool IsSigned, + WideIVMap &IVMap, ScalarEvolution *SE, + const TargetData *TD) { + const Type *Ty = Cast->getType(); + uint64_t Width = SE->getTypeSizeInBits(Ty); + if (TD && !TD->isLegalInteger(Width)) + return; - DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); - DeadInsts.push_back(ICmp); + WideIVInfo &IVInfo = IVMap[Phi]; + if (!IVInfo.WidestNativeType) { + IVInfo.WidestNativeType = SE->getEffectiveSCEVType(Ty); + IVInfo.IsSigned = IsSigned; + return; } + + // We extend the IV to satisfy the sign of its first user, arbitrarily. + if (IVInfo.IsSigned != IsSigned) + return; + + if (Width > SE->getTypeSizeInBits(IVInfo.WidestNativeType)) + IVInfo.WidestNativeType = SE->getEffectiveSCEVType(Ty); } -void IndVarSimplify::EliminateIVRemainders() { - // Look for SRem and URem users. - for (IVUsers::iterator I = IU->begin(), E = IU->end(); I != E; ++I) { - IVStrideUse &UI = *I; - BinaryOperator *Rem = dyn_cast<BinaryOperator>(UI.getUser()); - if (!Rem) continue; +namespace { +/// WidenIV - The goal of this transform is to remove sign and zero extends +/// without creating any new induction variables. To do this, it creates a new +/// phi of the wider type and redirects all users, either removing extends or +/// inserting truncs whenever we stop propagating the type. 
+/// +class WidenIV { + PHINode *OrigPhi; + const Type *WideType; + bool IsSigned; + + IVUsers *IU; + LoopInfo *LI; + Loop *L; + ScalarEvolution *SE; + DominatorTree *DT; + SmallVectorImpl<WeakVH> &DeadInsts; + + PHINode *WidePhi; + Instruction *WideInc; + const SCEV *WideIncExpr; + + SmallPtrSet<Instruction*,16> Processed; + +public: + WidenIV(PHINode *PN, const WideIVInfo &IVInfo, IVUsers *IUsers, + LoopInfo *LInfo, ScalarEvolution *SEv, DominatorTree *DTree, + SmallVectorImpl<WeakVH> &DI) : + OrigPhi(PN), + WideType(IVInfo.WidestNativeType), + IsSigned(IVInfo.IsSigned), + IU(IUsers), + LI(LInfo), + L(LI->getLoopFor(OrigPhi->getParent())), + SE(SEv), + DT(DTree), + DeadInsts(DI), + WidePhi(0), + WideInc(0), + WideIncExpr(0) { + assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV"); + } - bool isSigned = Rem->getOpcode() == Instruction::SRem; - if (!isSigned && Rem->getOpcode() != Instruction::URem) - continue; + bool CreateWideIV(SCEVExpander &Rewriter); - // We're only interested in the case where we know something about - // the numerator. - if (UI.getOperandValToReplace() != Rem->getOperand(0)) - continue; +protected: + Instruction *CloneIVUser(Instruction *NarrowUse, + Instruction *NarrowDef, + Instruction *WideDef); - // Get the SCEVs for the ICmp operands. - const SCEV *S = SE->getSCEV(Rem->getOperand(0)); - const SCEV *X = SE->getSCEV(Rem->getOperand(1)); - - // Simplify unnecessary loops away. - const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent()); - S = SE->getSCEVAtScope(S, ICmpLoop); - X = SE->getSCEVAtScope(X, ICmpLoop); - - // i % n --> i if i is in [0,n). - if ((!isSigned || SE->isKnownNonNegative(S)) && - SE->isKnownPredicate(isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, - S, X)) - Rem->replaceAllUsesWith(Rem->getOperand(0)); - else { - // (i+1) % n --> (i+1)==n?0:(i+1) if i is in [0,n). - const SCEV *LessOne = - SE->getMinusSCEV(S, SE->getConstant(S->getType(), 1)); - if ((!isSigned || SE->isKnownNonNegative(LessOne)) && - SE->isKnownPredicate(isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, - LessOne, X)) { - ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ, - Rem->getOperand(0), Rem->getOperand(1), - "tmp"); - SelectInst *Sel = - SelectInst::Create(ICmp, - ConstantInt::get(Rem->getType(), 0), - Rem->getOperand(0), "tmp", Rem); - Rem->replaceAllUsesWith(Sel); - } else + const SCEVAddRecExpr *GetWideRecurrence(Instruction *NarrowUse); + + Instruction *WidenIVUse(Instruction *NarrowUse, + Instruction *NarrowDef, + Instruction *WideDef); +}; +} // anonymous namespace + +/// SimplifyIVUsers - Iteratively perform simplification on IVUsers within this +/// loop. IVUsers is treated as a worklist. Each successive simplification may +/// push more users which may themselves be candidates for simplification. +/// +void IndVarSimplify::SimplifyIVUsers(SCEVExpander &Rewriter) { + WideIVMap IVMap; + + // Each round of simplification involves a round of eliminating operations + // followed by a round of widening IVs. A single IVUsers worklist is used + // across all rounds. The inner loop advances the user. If widening exposes + // more uses, then another pass through the outer loop is triggered. 
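// The payoff, in sketch form: a narrow IV that is only ever used through a
// sign extension, e.g.
//
//   %iv = phi i32 [ 0, %preheader ], [ %iv.next, %loop ]
//   %iv.ext = sext i32 %iv to i64
//
// is rebuilt as a single i64 recurrence; the sext dies, and any remaining
// genuinely 32-bit users receive a trunc of the wide IV instead.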
+ for (IVUsers::iterator I = IU->begin(), E = IU->end(); I != E;) { + for (; I != E; ++I) { + Instruction *UseInst = I->getUser(); + Value *IVOperand = I->getOperandValToReplace(); + + if (DisableIVRewrite) { + if (CastInst *Cast = dyn_cast<CastInst>(UseInst)) { + bool IsSigned = Cast->getOpcode() == Instruction::SExt; + if (IsSigned || Cast->getOpcode() == Instruction::ZExt) { + CollectExtend(Cast, I->getPhi(), IsSigned, IVMap, SE, TD); + continue; + } + } + } + if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) { + EliminateIVComparison(ICmp, IVOperand); continue; + } + if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) { + bool IsSigned = Rem->getOpcode() == Instruction::SRem; + if (IsSigned || Rem->getOpcode() == Instruction::URem) { + EliminateIVRemainder(Rem, IVOperand, IsSigned, I->getPhi()); + continue; + } + } + } + for (WideIVMap::const_iterator I = IVMap.begin(), E = IVMap.end(); + I != E; ++I) { + WidenIV Widener(I->first, I->second, IU, LI, SE, DT, DeadInsts); + if (Widener.CreateWideIV(Rewriter)) + Changed = true; } + } } - // Inform IVUsers about the new users. - if (Instruction *I = dyn_cast<Instruction>(Rem->getOperand(0))) - IU->AddUsersIfInteresting(I); +static Value *getExtend(Value *NarrowOper, const Type *WideType, + bool IsSigned, IRBuilder<> &Builder) { + return IsSigned ? Builder.CreateSExt(NarrowOper, WideType) : + Builder.CreateZExt(NarrowOper, WideType); +} - DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n'); - DeadInsts.push_back(Rem); +/// CloneIVUser - Instantiate a wide operation to replace a narrow +/// operation. This only needs to handle operations that can evaluate to +/// SCEVAddRec. It can safely return 0 for any operation we decide not to clone. +Instruction *WidenIV::CloneIVUser(Instruction *NarrowUse, + Instruction *NarrowDef, + Instruction *WideDef) { + unsigned Opcode = NarrowUse->getOpcode(); + switch (Opcode) { + default: + return 0; + case Instruction::Add: + case Instruction::Mul: + case Instruction::UDiv: + case Instruction::Sub: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + DEBUG(dbgs() << "Cloning IVUser: " << *NarrowUse << "\n"); + + IRBuilder<> Builder(NarrowUse); + + // Replace NarrowDef operands with WideDef. Otherwise, we don't know + // anything about the narrow operand yet so must insert a [sz]ext. It is + // probably loop invariant and will be folded or hoisted. If it actually + // comes from a widened IV, it should be removed during a future call to + // WidenIVUse. + Value *LHS = (NarrowUse->getOperand(0) == NarrowDef) ? WideDef : + getExtend(NarrowUse->getOperand(0), WideType, IsSigned, Builder); + Value *RHS = (NarrowUse->getOperand(1) == NarrowDef) ? WideDef : + getExtend(NarrowUse->getOperand(1), WideType, IsSigned, Builder); + + BinaryOperator *NarrowBO = cast<BinaryOperator>(NarrowUse); + BinaryOperator *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), + LHS, RHS, + NarrowBO->getName()); + Builder.Insert(WideBO); + if (NarrowBO->hasNoUnsignedWrap()) WideBO->setHasNoUnsignedWrap(); + if (NarrowBO->hasNoSignedWrap()) WideBO->setHasNoSignedWrap(); + + return WideBO; } + llvm_unreachable(0); +} + +// GetWideRecurrence - Is this instruction potentially interesting from IVUsers' +// perspective after widening its type? In other words, can the extend be +// safely hoisted out of the loop with SCEV reducing the value to a recurrence +// on the same loop?
If so, return the sign or zero extended +// recurrence. Otherwise return NULL. +const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { + if (!SE->isSCEVable(NarrowUse->getType())) + return 0; + + const SCEV *NarrowExpr = SE->getSCEV(NarrowUse); + const SCEV *WideExpr = IsSigned ? + SE->getSignExtendExpr(NarrowExpr, WideType) : + SE->getZeroExtendExpr(NarrowExpr, WideType); + const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(WideExpr); + if (!AddRec || AddRec->getLoop() != L) + return 0; + + return AddRec; +} + +/// HoistStep - Attempt to hoist an IV increment above a potential use. +/// +/// To successfully hoist, two criteria must be met: +/// - IncV operands dominate InsertPos and +/// - InsertPos dominates IncV +/// +/// Meeting the second condition means that we don't need to check all of IncV's +/// existing uses (it's moving up in the domtree). +/// +/// This does not yet recursively hoist the operands, although that would +/// not be difficult. +static bool HoistStep(Instruction *IncV, Instruction *InsertPos, + const DominatorTree *DT) +{ + if (DT->dominates(IncV, InsertPos)) + return true; + + if (!DT->dominates(InsertPos->getParent(), IncV->getParent())) + return false; + + if (IncV->mayHaveSideEffects()) + return false; + + // Attempt to hoist IncV + for (User::op_iterator OI = IncV->op_begin(), OE = IncV->op_end(); + OI != OE; ++OI) { + Instruction *OInst = dyn_cast<Instruction>(OI); + if (OInst && !DT->dominates(OInst, InsertPos)) + return false; + } + IncV->moveBefore(InsertPos); + return true; +} + +/// WidenIVUse - Determine whether an individual user of the narrow IV can be +/// widened. If so, return the wide clone of the user. +Instruction *WidenIV::WidenIVUse(Instruction *NarrowUse, + Instruction *NarrowDef, + Instruction *WideDef) { + // To be consistent with IVUsers, stop traversing the def-use chain at + // inner-loop phis or post-loop phis. + if (isa<PHINode>(NarrowUse) && LI->getLoopFor(NarrowUse->getParent()) != L) + return 0; + + // Handle data flow merges and bizarre phi cycles. + if (!Processed.insert(NarrowUse)) + return 0; + + // Our raison d'etre! Eliminate sign and zero extension. + if (IsSigned ? isa<SExtInst>(NarrowUse) : isa<ZExtInst>(NarrowUse)) { + Value *NewDef = WideDef; + if (NarrowUse->getType() != WideType) { + unsigned CastWidth = SE->getTypeSizeInBits(NarrowUse->getType()); + unsigned IVWidth = SE->getTypeSizeInBits(WideType); + if (CastWidth < IVWidth) { + // The cast isn't as wide as the IV, so insert a Trunc. + IRBuilder<> Builder(NarrowUse); + NewDef = Builder.CreateTrunc(WideDef, NarrowUse->getType()); + } + else { + // A wider extend was hidden behind a narrower one. This may induce + // another round of IV widening in which the intermediate IV becomes + // dead. It should be very rare. + DEBUG(dbgs() << "INDVARS: New IV " << *WidePhi + << " not wide enough to subsume " << *NarrowUse << "\n"); + NarrowUse->replaceUsesOfWith(NarrowDef, WideDef); + NewDef = NarrowUse; + } + } + if (NewDef != NarrowUse) { + DEBUG(dbgs() << "INDVARS: eliminating " << *NarrowUse + << " replaced by " << *WideDef << "\n"); + ++NumElimExt; + NarrowUse->replaceAllUsesWith(NewDef); + DeadInsts.push_back(NarrowUse); + } + // Now that the extend is gone, expose its uses to IVUsers for potential + // further simplification within SimplifyIVUsers. + IU->AddUsersIfInteresting(WideDef, WidePhi); + + // No further widening is needed. The deceased [sz]ext had done it for us.
+ return 0; + } + const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(NarrowUse); + if (!WideAddRec) { + // This user does not evaluate to a recurrence after widening, so don't + // follow it. Instead insert a Trunc to kill off the original use, + // eventually isolating the original narrow IV so it can be removed. + IRBuilder<> Builder(NarrowUse); + Value *Trunc = Builder.CreateTrunc(WideDef, NarrowDef->getType()); + NarrowUse->replaceUsesOfWith(NarrowDef, Trunc); + return 0; + } + // Reuse the IV increment that SCEVExpander created as long as it dominates + // NarrowUse. + Instruction *WideUse = 0; + if (WideAddRec == WideIncExpr && HoistStep(WideInc, NarrowUse, DT)) { + WideUse = WideInc; + } + else { + WideUse = CloneIVUser(NarrowUse, NarrowDef, WideDef); + if (!WideUse) + return 0; + } + // GetWideRecurrence ensured that the narrow expression could be extended + // outside the loop without overflow. This suggests that the wide use + // evaluates to the same expression as the extended narrow use, but doesn't + // absolutely guarantee it. Hence the following failsafe check. In rare cases + // where it fails, we simply throw away the newly created wide use. + if (WideAddRec != SE->getSCEV(WideUse)) { + DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse + << ": " << *SE->getSCEV(WideUse) << " != " << *WideAddRec << "\n"); + DeadInsts.push_back(WideUse); + return 0; + } + + // Returning WideUse pushes it on the worklist. + return WideUse; +} + +/// CreateWideIV - Process a single induction variable. First use the +/// SCEVExpander to create a wide induction variable that evaluates to the same +/// recurrence as the original narrow IV. Then use a worklist to forward +/// traverse the narrow IV's def-use chain. After WidenIVUse has processed all +/// interesting IV users, the narrow IV will be isolated for removal by +/// DeleteDeadPHIs. +/// +/// It would be simpler to delete uses as they are processed, but we must avoid +/// invalidating SCEV expressions. +/// +bool WidenIV::CreateWideIV(SCEVExpander &Rewriter) { + // Is this phi an induction variable? + const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(OrigPhi)); + if (!AddRec) + return false; + + // Widen the induction variable expression. + const SCEV *WideIVExpr = IsSigned ? + SE->getSignExtendExpr(AddRec, WideType) : + SE->getZeroExtendExpr(AddRec, WideType); + + assert(SE->getEffectiveSCEVType(WideIVExpr->getType()) == WideType && + "Expect the new IV expression to preserve its type"); + + // Can the IV be extended outside the loop without overflow? + AddRec = dyn_cast<SCEVAddRecExpr>(WideIVExpr); + if (!AddRec || AddRec->getLoop() != L) + return false; + + // An AddRec must have loop-invariant operands. Since this AddRec is + // materialized by a loop header phi, the expression cannot have any post-loop + // operands, so they must dominate the loop header. + assert(SE->properlyDominates(AddRec->getStart(), L->getHeader()) && + SE->properlyDominates(AddRec->getStepRecurrence(*SE), L->getHeader()) + && "Loop header phi recurrence inputs do not dominate the loop"); + + // The rewriter provides a value for the desired IV expression. This may + // either find an existing phi or materialize a new one. Either way, we + // expect a well-formed cyclic phi-with-increments, i.e. any operand not part + // of the phi-SCC dominates the loop entry.
+ Instruction *InsertPt = L->getHeader()->begin(); + WidePhi = cast<PHINode>(Rewriter.expandCodeFor(AddRec, WideType, InsertPt)); + + // Remembering the WideIV increment generated by SCEVExpander allows + // WidenIVUse to reuse it when widening the narrow IV's increment. We don't + // employ a general reuse mechanism because the call above is the only call to + // SCEVExpander. Henceforth, we produce 1-to-1 narrow to wide uses. + if (BasicBlock *LatchBlock = L->getLoopLatch()) { + WideInc = + cast<Instruction>(WidePhi->getIncomingValueForBlock(LatchBlock)); + WideIncExpr = SE->getSCEV(WideInc); + } + + DEBUG(dbgs() << "Wide IV: " << *WidePhi << "\n"); + ++NumWidened; + + // Traverse the def-use chain using a worklist starting at the original IV. + assert(Processed.empty() && "expect initial state"); + + // Each worklist entry has a Narrow def-use link and Wide def. + SmallVector<std::pair<Use *, Instruction *>, 8> NarrowIVUsers; + for (Value::use_iterator UI = OrigPhi->use_begin(), + UE = OrigPhi->use_end(); UI != UE; ++UI) { + NarrowIVUsers.push_back(std::make_pair(&UI.getUse(), WidePhi)); + } + while (!NarrowIVUsers.empty()) { + Use *NarrowDefUse; + Instruction *WideDef; + tie(NarrowDefUse, WideDef) = NarrowIVUsers.pop_back_val(); + + // Process a def-use edge. This may replace the use, so don't hold a + // use_iterator across it. + Instruction *NarrowDef = cast<Instruction>(NarrowDefUse->get()); + Instruction *NarrowUse = cast<Instruction>(NarrowDefUse->getUser()); + Instruction *WideUse = WidenIVUse(NarrowUse, NarrowDef, WideDef); + + // Follow all def-use edges from the previous narrow use. + if (WideUse) { + for (Value::use_iterator UI = NarrowUse->use_begin(), + UE = NarrowUse->use_end(); UI != UE; ++UI) { + NarrowIVUsers.push_back(std::make_pair(&UI.getUse(), WideUse)); + } + } + // WidenIVUse may have removed the def-use edge. + if (NarrowDef->use_empty()) + DeadInsts.push_back(NarrowDef); + } + return true; +} + +void IndVarSimplify::EliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) { + unsigned IVOperIdx = 0; + ICmpInst::Predicate Pred = ICmp->getPredicate(); + if (IVOperand != ICmp->getOperand(0)) { + // Swapped + assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand"); + IVOperIdx = 1; + Pred = ICmpInst::getSwappedPredicate(Pred); + } + + // Get the SCEVs for the ICmp operands. + const SCEV *S = SE->getSCEV(ICmp->getOperand(IVOperIdx)); + const SCEV *X = SE->getSCEV(ICmp->getOperand(1 - IVOperIdx)); + + // Simplify unnecessary loops away. + const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent()); + S = SE->getSCEVAtScope(S, ICmpLoop); + X = SE->getSCEVAtScope(X, ICmpLoop); + + // If the condition is always true or always false, replace it with + // a constant value. + if (SE->isKnownPredicate(Pred, S, X)) + ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext())); + else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) + ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext())); + else + return; + + DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); + ++NumElimCmp; + Changed = true; + DeadInsts.push_back(ICmp); +} + +void IndVarSimplify::EliminateIVRemainder(BinaryOperator *Rem, + Value *IVOperand, + bool IsSigned, + PHINode *IVPhi) { + // We're only interested in the case where we know something about + // the numerator. + if (IVOperand != Rem->getOperand(0)) + return; + + // Get the SCEVs for the Rem operands.
+ const SCEV *S = SE->getSCEV(Rem->getOperand(0)); + const SCEV *X = SE->getSCEV(Rem->getOperand(1)); + + // Simplify unnecessary loops away. + const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent()); + S = SE->getSCEVAtScope(S, ICmpLoop); + X = SE->getSCEVAtScope(X, ICmpLoop); + + // i % n --> i if i is in [0,n). + if ((!IsSigned || SE->isKnownNonNegative(S)) && + SE->isKnownPredicate(IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, + S, X)) + Rem->replaceAllUsesWith(Rem->getOperand(0)); + else { + // (i+1) % n --> (i+1)==n?0:(i+1) if i is in [0,n). + const SCEV *LessOne = + SE->getMinusSCEV(S, SE->getConstant(S->getType(), 1)); + if (IsSigned && !SE->isKnownNonNegative(LessOne)) + return; + + if (!SE->isKnownPredicate(IsSigned ? + ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, + LessOne, X)) + return; + + ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ, + Rem->getOperand(0), Rem->getOperand(1), + "tmp"); + SelectInst *Sel = + SelectInst::Create(ICmp, + ConstantInt::get(Rem->getType(), 0), + Rem->getOperand(0), "tmp", Rem); + Rem->replaceAllUsesWith(Sel); + } + + // Inform IVUsers about the new users. + if (Instruction *I = dyn_cast<Instruction>(Rem->getOperand(0))) + IU->AddUsersIfInteresting(I, IVPhi); + + DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n'); + ++NumElimRem; + Changed = true; + DeadInsts.push_back(Rem); } bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { @@ -526,6 +1014,8 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { LI = &getAnalysis<LoopInfo>(); SE = &getAnalysis<ScalarEvolution>(); DT = &getAnalysis<DominatorTree>(); + TD = getAnalysisIfAvailable<TargetData>(); + DeadInsts.clear(); Changed = false; @@ -533,11 +1023,12 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // transform them to use integer recurrences. RewriteNonIntegerIVs(L); - BasicBlock *ExitingBlock = L->getExitingBlock(); // may be null const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); // Create a rewriter object which we'll use to transform the code with. SCEVExpander Rewriter(*SE); + if (DisableIVRewrite) + Rewriter.disableCanonicalMode(); // Check to see if this loop has a computable loop-invariant execution count. // If so, this means that we can compute the final value of any expressions @@ -548,33 +1039,42 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount)) RewriteLoopExitValues(L, Rewriter); - // Simplify ICmp IV users. - EliminateIVComparisons(); - - // Simplify SRem and URem IV users. - EliminateIVRemainders(); + // Eliminate redundant IV users. + SimplifyIVUsers(Rewriter); // Compute the type of the largest recurrence expression, and decide whether // a canonical induction variable should be inserted. const Type *LargestType = 0; bool NeedCannIV = false; - if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { - LargestType = BackedgeTakenCount->getType(); - LargestType = SE->getEffectiveSCEVType(LargestType); + bool ExpandBECount = canExpandBackedgeTakenCount(L, SE); + if (ExpandBECount) { // If we have a known trip count and a single exit block, we'll be // rewriting the loop exit test condition below, which requires a // canonical induction variable. 
- if (ExitingBlock) - NeedCannIV = true; - } - for (IVUsers::const_iterator I = IU->begin(), E = IU->end(); I != E; ++I) { - const Type *Ty = - SE->getEffectiveSCEVType(I->getOperandValToReplace()->getType()); + NeedCannIV = true; + const Type *Ty = BackedgeTakenCount->getType(); + if (DisableIVRewrite) { + // In this mode, SimplifyIVUsers may have already widened the IV used by + // the backedge test and inserted a Trunc on the compare's operand. Get + // the wider type to avoid creating a redundant narrow IV only used by the + // loop test. + LargestType = getBackedgeIVType(L); + } if (!LargestType || SE->getTypeSizeInBits(Ty) > + SE->getTypeSizeInBits(LargestType)) + LargestType = SE->getEffectiveSCEVType(Ty); + } + if (!DisableIVRewrite) { + for (IVUsers::const_iterator I = IU->begin(), E = IU->end(); I != E; ++I) { + NeedCannIV = true; + const Type *Ty = + SE->getEffectiveSCEVType(I->getOperandValToReplace()->getType()); + if (!LargestType || + SE->getTypeSizeInBits(Ty) > SE->getTypeSizeInBits(LargestType)) - LargestType = Ty; - NeedCannIV = true; + LargestType = Ty; + } } // Now that we know the largest of the induction variable expressions @@ -614,19 +1114,17 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // If we have a trip count expression, rewrite the loop's exit condition // using it. We can currently only handle loops with a single exit. ICmpInst *NewICmp = 0; - if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && - !BackedgeTakenCount->isZero() && - ExitingBlock) { + if (ExpandBECount) { + assert(canExpandBackedgeTakenCount(L, SE) && + "canonical IV disrupted BackedgeTaken expansion"); assert(NeedCannIV && "LinearFunctionTestReplace requires a canonical induction variable"); - // Can't rewrite non-branch yet. - if (BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator())) - NewICmp = LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar, - ExitingBlock, BI, Rewriter); + NewICmp = LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar, + Rewriter); } - // Rewrite IV-derived expressions. - RewriteIVExpressions(L, Rewriter); + if (!DisableIVRewrite) + RewriteIVExpressions(L, Rewriter); // Clear the rewriter cache, because values that are in the rewriter's cache // can be deleted in the loop below, causing the AssertingVH in the cache to @@ -649,7 +1147,8 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // For completeness, inform IVUsers of the IV use in the newly-created // loop exit test instruction. if (NewICmp) - IU->AddUsersIfInteresting(cast<Instruction>(NewICmp->getOperand(0))); + IU->AddUsersIfInteresting(cast<Instruction>(NewICmp->getOperand(0)), + IndVar); // Clean up dead instructions. Changed |= DeleteDeadPHIs(L->getHeader()); @@ -1080,5 +1579,5 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { } // Add a new IVUsers entry for the newly-created integer PHI. 
- IU->AddUsersIfInteresting(NewPHI); + IU->AddUsersIfInteresting(NewPHI, NewPHI); } diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 8f90dfe..cf18ff0 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -16,6 +16,7 @@ #include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" #include "llvm/Pass.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/Loads.h" @@ -170,9 +171,9 @@ bool JumpThreading::runOnFunction(Function &F) { Changed = true; continue; } - + BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()); - + // Can't thread an unconditional jump, but if the block is "almost // empty", we can replace uses of it with uses of the successor and make // this dead. @@ -608,7 +609,7 @@ static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) { static bool hasAddressTakenAndUsed(BasicBlock *BB) { if (!BB->hasAddressTaken()) return false; - + // If the block has its address taken, it may be a tree of dead constants // hanging off of it. These shouldn't keep the block alive. BlockAddress *BA = BlockAddress::get(BB); @@ -668,6 +669,17 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { return false; // Must be an invoke. } + // Run constant folding to see if we can reduce the condition to a simple + // constant. + if (Instruction *I = dyn_cast<Instruction>(Condition)) { + Value *SimpleVal = ConstantFoldInstruction(I, TD); + if (SimpleVal) { + I->replaceAllUsesWith(SimpleVal); + I->eraseFromParent(); + Condition = SimpleVal; + } + } + // If the terminator is branching on an undef, we can pick any of the // successors to branch to. Let GetBestDestForJumpOnUndef decide. if (isa<UndefValue>(Condition)) { @@ -694,7 +706,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { DEBUG(dbgs() << " In block '" << BB->getName() << "' folding terminator: " << *BB->getTerminator() << '\n'); ++NumFolds; - ConstantFoldTerminator(BB); + ConstantFoldTerminator(BB, true); return true; } @@ -917,9 +929,10 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { if (UnavailablePred) { assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 && "Can't handle critical edge here!"); - Value *NewVal = new LoadInst(LoadedPtr, LI->getName()+".pr", false, + LoadInst *NewVal = new LoadInst(LoadedPtr, LI->getName()+".pr", false, LI->getAlignment(), UnavailablePred->getTerminator()); + NewVal->setDebugLoc(LI->getDebugLoc()); AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal)); } @@ -932,6 +945,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { PHINode *PN = PHINode::Create(LI->getType(), std::distance(PB, PE), "", LoadBB->begin()); PN->takeName(LI); + PN->setDebugLoc(LI->getDebugLoc()); // Insert new entries into the PHI for each predecessor. A single block may // have multiple entries here. @@ -1363,7 +1377,8 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, // We didn't copy the terminator from BB over to NewBB, because there is now // an unconditional jump to SuccBB. Insert the unconditional jump. - BranchInst::Create(SuccBB, NewBB); + BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB); + NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc()); // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the // PHI nodes for NewBB now.
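To make the new ConstantFoldInstruction step in ProcessBlock concrete, here is a hedged illustration (a sketch, not code from this patch) of a condition that now folds outright, letting ConstantFoldTerminator turn the conditional branch into an unconditional one:

// After earlier threading or simplification, both operands of the compare
// may already be constants; once the compare folds, the terminator folds
// and the dead arm can be removed.
int fold_example(void) {
  const int x = 3;
  if (x * 2 == 6)   // condition folds to 'true' at compile time
    return 1;
  return 0;         // unreachable once the terminator is folded
}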
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 93de9cf..13bd022 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -372,7 +372,11 @@ bool LICM::canSinkOrHoistInst(Instruction &I) { return !pointerInvalidatedByLoop(LI->getOperand(0), Size, LI->getMetadata(LLVMContext::MD_tbaa)); } else if (CallInst *CI = dyn_cast<CallInst>(&I)) { - // Handle obvious cases efficiently. + // Don't sink or hoist dbg info; it's legal, but not useful. + if (isa<DbgInfoIntrinsic>(I)) + return false; + + // Handle simple cases by querying alias analysis. AliasAnalysis::ModRefBehavior Behavior = AA->getModRefBehavior(CI); if (Behavior == AliasAnalysis::DoesNotAccessMemory) return true; @@ -445,8 +449,7 @@ void LICM::sink(Instruction &I) { // enough that we handle it as a special (more efficient) case. It is more // efficient to handle because there are no PHI nodes that need to be placed. if (ExitBlocks.size() == 1) { - if (!isa<DbgInfoIntrinsic>(I) && - !DT->dominates(I.getParent(), ExitBlocks[0])) { + if (!DT->dominates(I.getParent(), ExitBlocks[0])) { // Instruction is not used, just delete it. CurAST->deleteValue(&I); // If I has users in unreachable blocks, eliminate. @@ -602,13 +605,15 @@ namespace { SmallPtrSet<Value*, 4> &PointerMustAliases; SmallVectorImpl<BasicBlock*> &LoopExitBlocks; AliasSetTracker &AST; + DebugLoc DL; public: LoopPromoter(Value *SP, const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, SmallPtrSet<Value*, 4> &PMA, - SmallVectorImpl<BasicBlock*> &LEB, AliasSetTracker &ast) - : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA), - LoopExitBlocks(LEB), AST(ast) {} + SmallVectorImpl<BasicBlock*> &LEB, AliasSetTracker &ast, + DebugLoc dl) + : LoadAndStorePromoter(Insts, S, 0, 0), SomePtr(SP), + PointerMustAliases(PMA), LoopExitBlocks(LEB), AST(ast), DL(dl) {} virtual bool isInstInList(Instruction *I, const SmallVectorImpl<Instruction*> &) const { @@ -629,7 +634,8 @@ namespace { BasicBlock *ExitBlock = LoopExitBlocks[i]; Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock); Instruction *InsertPos = ExitBlock->getFirstNonPHI(); - new StoreInst(LiveInValue, SomePtr, InsertPos); + StoreInst *NewSI = new StoreInst(LiveInValue, SomePtr, InsertPos); + NewSI->setDebugLoc(DL); } } @@ -727,6 +733,12 @@ void LICM::PromoteAliasSet(AliasSet &AS) { Changed = true; ++NumPromoted; + // Grab a debug location for the inserted loads/stores; given that the + // inserted loads/stores have little relation to the original loads/stores, + // this code just arbitrarily picks a location from one, since any debug + // location is better than none. + DebugLoc DL = LoopUses[0]->getDebugLoc(); + SmallVector<BasicBlock*, 8> ExitBlocks; CurLoop->getUniqueExitBlocks(ExitBlocks); @@ -734,13 +746,14 @@ void LICM::PromoteAliasSet(AliasSet &AS) { SmallVector<PHINode*, 16> NewPHIs; SSAUpdater SSA(&NewPHIs); LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks, - *CurAST); + *CurAST, DL); // Set up the preheader to have a definition of the value. It is the live-out // value from the preheader that uses in the loop will use. 
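// (Editorial sketch, hedged and illustrative only -- assuming the pointer
//  is loop-invariant and nothing else in the loop may access it, the
//  promotion set up here corresponds to rewriting
//      for (i = 0; i < n; ++i) *p += i;      // load+store on every trip
//  as
//      t = *p;                               // the PreheaderLoad below
//      for (i = 0; i < n; ++i) t += i;       // register arithmetic
//      *p = t;                               // LoopPromoter's exit-block store
//  with SSAUpdater wiring 't' through the CFG.)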
LoadInst *PreheaderLoad = new LoadInst(SomePtr, SomePtr->getName()+".promoted", Preheader->getTerminator()); + PreheaderLoad->setDebugLoc(DL); SSA.AddAvailableValue(Preheader, PreheaderLoad); // Rewrite all the loads in the loop and remember all the definitions from diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 1366231..dbf6eec 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -128,11 +128,11 @@ INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); } -/// DeleteDeadInstruction - Delete this instruction. Before we do, go through +/// deleteDeadInstruction - Delete this instruction. Before we do, go through /// and zero out all the operands of this instruction. If any of them become /// dead, delete them and the computation tree that feeds them. /// -static void DeleteDeadInstruction(Instruction *I, ScalarEvolution &SE) { +static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE) { SmallVector<Instruction*, 32> NowDeadInsts; NowDeadInsts.push_back(I); @@ -162,6 +162,14 @@ static void DeleteDeadInstruction(Instruction *I, ScalarEvolution &SE) { } while (!NowDeadInsts.empty()); } +/// deleteIfDeadInstruction - If the specified value is a dead instruction, +/// delete it and any recursively used instructions. +static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE) { + if (Instruction *I = dyn_cast<Instruction>(V)) + if (isInstructionTriviallyDead(I)) + deleteDeadInstruction(I, SE); +} + bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { CurLoop = L; @@ -454,31 +462,35 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, return false; } - - // Okay, we have a strided store "p[i]" of a splattable value. We can turn - // this into a memset in the loop preheader now if we want. However, this - // would be unsafe to do if there is anything else in the loop that may read - // or write to the aliased location. Check for an alias. - if (mayLoopAccessLocation(DestPtr, AliasAnalysis::ModRef, - CurLoop, BECount, - StoreSize, getAnalysis<AliasAnalysis>(), TheStore)) - return false; - - // Okay, everything looks good, insert the memset. - BasicBlock *Preheader = CurLoop->getLoopPreheader(); - - IRBuilder<> Builder(Preheader->getTerminator()); - // The trip count of the loop and the base pointer of the addrec SCEV is // guaranteed to be loop invariant, which means that it should dominate the - // header. Just insert code for it in the preheader. + // header. This allows us to insert code for it in the preheader. + BasicBlock *Preheader = CurLoop->getLoopPreheader(); + IRBuilder<> Builder(Preheader->getTerminator()); SCEVExpander Expander(*SE); - + + // Okay, we have a strided store "p[i]" of a splattable value. We can turn + // this into a memset in the loop preheader now if we want. However, this + // would be unsafe to do if there is anything else in the loop that may read + // or write to the aliased location. Check for any overlap by generating the + // base pointer and checking the region. 
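// (Editorial sketch: the strided-store idiom at the source level --
//      for (i = 0; i != n; ++i) p[i] = 0;    // splattable value, unit stride
//  becomes a preheader call equivalent to memset(p, 0, n). That is legal
//  only if nothing else in the loop may read or write [p, p+n), which is
//  what the mayLoopAccessLocation check below verifies against the newly
//  expanded base pointer.)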
unsigned AddrSpace = cast<PointerType>(DestPtr->getType())->getAddressSpace(); Value *BasePtr = Expander.expandCodeFor(Ev->getStart(), Builder.getInt8PtrTy(AddrSpace), Preheader->getTerminator()); + + if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef, + CurLoop, BECount, + StoreSize, getAnalysis<AliasAnalysis>(), TheStore)){ + Expander.clear(); + // If we generated new code for the base pointer, clean up. + deleteIfDeadInstruction(BasePtr, *SE); + return false; + } + + // Okay, everything looks good, insert the memset. + // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. const Type *IntPtr = TD->getIntPtrType(DestPtr->getContext()); @@ -521,7 +533,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // Okay, the memset has been formed. Zap the original store and anything that // feeds into it. - DeleteDeadInstruction(TheStore, *SE); + deleteDeadInstruction(TheStore, *SE); ++NumMemSet; return true; } @@ -539,41 +551,51 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, LoadInst *LI = cast<LoadInst>(SI->getValueOperand()); + // The trip count of the loop and the base pointer of the addrec SCEV is + // guaranteed to be loop invariant, which means that it should dominate the + // header. This allows us to insert code for it in the preheader. + BasicBlock *Preheader = CurLoop->getLoopPreheader(); + IRBuilder<> Builder(Preheader->getTerminator()); + SCEVExpander Expander(*SE); + // Okay, we have a strided store "p[i]" of a loaded value. We can turn // this into a memcpy in the loop preheader now if we want. However, this // would be unsafe to do if there is anything else in the loop that may read - // or write to the stored location (including the load feeding the stores). - // Check for an alias. - if (mayLoopAccessLocation(SI->getPointerOperand(), AliasAnalysis::ModRef, + // or write the memory region we're storing to. This includes the load that + // feeds the stores. Check for an alias by generating the base address and + // checking everything. + Value *StoreBasePtr = + Expander.expandCodeFor(StoreEv->getStart(), + Builder.getInt8PtrTy(SI->getPointerAddressSpace()), + Preheader->getTerminator()); + + if (mayLoopAccessLocation(StoreBasePtr, AliasAnalysis::ModRef, CurLoop, BECount, StoreSize, - getAnalysis<AliasAnalysis>(), SI)) + getAnalysis<AliasAnalysis>(), SI)) { + Expander.clear(); + // If we generated new code for the base pointer, clean up. + deleteIfDeadInstruction(StoreBasePtr, *SE); return false; + } // For a memcpy, we have to make sure that the input array is not being // mutated by the loop. - if (mayLoopAccessLocation(LI->getPointerOperand(), AliasAnalysis::Mod, - CurLoop, BECount, StoreSize, - getAnalysis<AliasAnalysis>(), SI)) - return false; - - // Okay, everything looks good, insert the memcpy. - BasicBlock *Preheader = CurLoop->getLoopPreheader(); - - IRBuilder<> Builder(Preheader->getTerminator()); - - // The trip count of the loop and the base pointer of the addrec SCEV is - // guaranteed to be loop invariant, which means that it should dominate the - // header. Just insert code for it in the preheader. 
- SCEVExpander Expander(*SE); - Value *LoadBasePtr = Expander.expandCodeFor(LoadEv->getStart(), Builder.getInt8PtrTy(LI->getPointerAddressSpace()), Preheader->getTerminator()); - Value *StoreBasePtr = - Expander.expandCodeFor(StoreEv->getStart(), - Builder.getInt8PtrTy(SI->getPointerAddressSpace()), - Preheader->getTerminator()); + + if (mayLoopAccessLocation(LoadBasePtr, AliasAnalysis::Mod, CurLoop, BECount, + StoreSize, getAnalysis<AliasAnalysis>(), SI)) { + Expander.clear(); + // If we generated new code for the base pointer, clean up. + deleteIfDeadInstruction(LoadBasePtr, *SE); + deleteIfDeadInstruction(StoreBasePtr, *SE); + return false; + } + + // Okay, everything is safe, we can transform this! + // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. @@ -589,18 +611,19 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); - Value *NewCall = + CallInst *NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, std::min(SI->getAlignment(), LI->getAlignment())); + NewCall->setDebugLoc(SI->getDebugLoc()); DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n" << " from load ptr=" << *LoadEv << " at: " << *LI << "\n" << " from store ptr=" << *StoreEv << " at: " << *SI << "\n"); - (void)NewCall; + // Okay, the memcpy has been formed. Zap the original store and anything that // feeds into it. - DeleteDeadInstruction(SI, *SE); + deleteDeadInstruction(SI, *SE); ++NumMemCpy; return true; } diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index 95e1578..47dced3 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -184,7 +184,11 @@ bool LoopRotate::rotateLoop(Loop *L) { // Now, this loop is suitable for rotation. BasicBlock *OrigPreheader = L->getLoopPreheader(); BasicBlock *OrigLatch = L->getLoopLatch(); - assert(OrigPreheader && OrigLatch && "Loop not in canonical form?"); + + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (OrigPreheader == 0 || OrigLatch == 0) + return false; // Anything ScalarEvolution may know about this loop or the PHI nodes // in its header will soon be invalidated. @@ -322,7 +326,8 @@ bool LoopRotate::rotateLoop(Loop *L) { // We can fold the conditional branch in the preheader, this makes things // simpler. The first step is to remove the extra edge to the Exit block. Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/); - BranchInst::Create(NewHeader, PHBI); + BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI); + NewBI->setDebugLoc(PHBI->getDebugLoc()); PHBI->eraseFromParent(); // With our CFG finalized, update DomTree if it is available. diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 87e78fa..73ebd61 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -209,7 +209,12 @@ struct Formula { /// when AM.Scale is not zero. const SCEV *ScaledReg; - Formula() : ScaledReg(0) {} + /// UnfoldedOffset - An additional constant offset which is added near the + /// use. This requires a temporary register, but the offset itself can + /// live in an add immediate field rather than a register.
+ int64_t UnfoldedOffset; + + Formula() : ScaledReg(0), UnfoldedOffset(0) {} void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); @@ -379,6 +384,10 @@ void Formula::print(raw_ostream &OS) const { OS << "<unknown>"; OS << ')'; } + if (UnfoldedOffset != 0) { + if (!First) OS << " + "; else First = false; + OS << "imm(" << UnfoldedOffset << ')'; + } } void Formula::dump() const { @@ -572,9 +581,6 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) { switch (II->getIntrinsicID()) { default: break; case Intrinsic::prefetch: - case Intrinsic::x86_sse2_loadu_dq: - case Intrinsic::x86_sse2_loadu_pd: - case Intrinsic::x86_sse_loadu_ps: case Intrinsic::x86_sse_storeu_ps: case Intrinsic::x86_sse2_storeu_pd: case Intrinsic::x86_sse2_storeu_dq: @@ -774,8 +780,10 @@ void Cost::RateFormula(const Formula &F, RatePrimaryRegister(BaseReg, Regs, L, SE, DT); } - if (F.BaseRegs.size() > 1) - NumBaseAdds += F.BaseRegs.size() - 1; + // Determine how many (unfolded) adds we'll need inside the loop. + size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0); + if (NumBaseParts > 1) + NumBaseAdds += NumBaseParts - 1; // Tally up the non-zero immediates. for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(), @@ -789,7 +797,7 @@ void Cost::RateFormula(const Formula &F, } } -/// Loose - Set this cost to a loosing value. +/// Loose - Set this cost to a losing value. void Cost::Loose() { NumRegs = ~0u; AddRecCost = ~0u; @@ -1796,7 +1804,8 @@ LSRInstance::OptimizeLoopTermCond() { ExitingBlock->getInstList().insert(TermBr, Cond); // Clone the IVUse, as the old use still exists! - CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace()); + CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace(), + CondUse->getPhi()); TermBr->replaceUsesOfWith(OldCond, Cond); } } @@ -1827,7 +1836,7 @@ LSRInstance::OptimizeLoopTermCond() { } } -/// reconcileNewOffset - Determine if the given use can accomodate a fixup +/// reconcileNewOffset - Determine if the given use can accommodate a fixup /// at the given offset and other details. If so, update the use and /// return true. bool @@ -1948,7 +1957,8 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF, if (F.BaseRegs == OrigF.BaseRegs && F.ScaledReg == OrigF.ScaledReg && F.AM.BaseGV == OrigF.AM.BaseGV && - F.AM.Scale == OrigF.AM.Scale) { + F.AM.Scale == OrigF.AM.Scale && + F.UnfoldedOffset == OrigF.UnfoldedOffset) { if (F.AM.BaseOffs == 0) return &LU; // This is the formula where all the registers and symbols matched; @@ -2064,6 +2074,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { // x == y --> x - y == 0 const SCEV *N = SE.getSCEV(NV); if (SE.isLoopInvariant(N, L)) { + // S is normalized, so normalize N before folding it into S + // to keep the result normalized. + N = TransformForPostIncUse(Normalize, N, CI, 0, + LF.PostIncLoops, SE, DT); Kind = LSRUse::ICmpZero; S = SE.getMinusSCEV(N, S); } @@ -2316,8 +2330,29 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, if (InnerSum->isZero()) continue; Formula F = Base; - F.BaseRegs[i] = InnerSum; - F.BaseRegs.push_back(*J); + + // Add the remaining pieces of the add back into the new formula. 
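// (Editorial aside, hedged: given a reassociated formula such as
//  reg(A) + reg(B) + 42 on a target whose add instruction accepts a 42
//  immediate, the constant can ride along as the formula's UnfoldedOffset
//  -- emitted as an add-immediate near the use -- instead of occupying a
//  base register of its own. The checks below fold such constants in.)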
+ const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum); + if (TLI && InnerSumSC && + SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 && + TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + InnerSumSC->getValue()->getZExtValue())) { + F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + + InnerSumSC->getValue()->getZExtValue(); + F.BaseRegs.erase(F.BaseRegs.begin() + i); + } else + F.BaseRegs[i] = InnerSum; + + // Add J as its own register, or an unfolded immediate. + const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J); + if (TLI && SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && + TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + SC->getValue()->getZExtValue())) + F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + + SC->getValue()->getZExtValue(); + else + F.BaseRegs.push_back(*J); + if (InsertFormula(LU, LUIdx, F)) // If that formula hadn't been seen before, recurse to find more like // it. @@ -2485,6 +2520,15 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, continue; } + // Check that multiplying with the unfolded offset doesn't overflow. + if (F.UnfoldedOffset != 0) { + if (F.UnfoldedOffset == INT64_MIN && Factor == -1) + continue; + F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor; + if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset) + continue; + } + // If we make it here and it's legal, add it. (void)InsertFormula(LU, LUIdx, F); next:; @@ -2667,7 +2711,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // other orig regs. ImmMapTy::const_iterator OtherImms[] = { Imms.begin(), prior(Imms.end()), - Imms.upper_bound((Imms.begin()->first + prior(Imms.end())->first) / 2) + Imms.lower_bound((Imms.begin()->first + prior(Imms.end())->first) / 2) }; for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) { ImmMapTy::const_iterator M = OtherImms[i]; @@ -2741,8 +2785,13 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { Formula NewF = F; NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm; if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, TLI)) - continue; + LU.Kind, LU.AccessTy, TLI)) { + if (!TLI || + !TLI->isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm)) + continue; + NewF = F; + NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm; + } NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg); // If the new formula has a constant in a register, and adding the @@ -3491,6 +3540,14 @@ Value *LSRInstance::Expand(const LSRFixup &LF, } } + // Expand the unfolded offset portion. + int64_t UnfoldedOffset = F.UnfoldedOffset; + if (UnfoldedOffset != 0) { + // Just add the immediate values. + Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, + UnfoldedOffset))); + } + // Emit instructions summing all the operands. const SCEV *FullS = Ops.empty() ? SE.getConstant(IntTy, 0) : diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index 80b263a..fef6bc3 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -43,7 +43,13 @@ namespace { class LoopUnroll : public LoopPass { public: static char ID; // Pass ID, replacement for typeid - LoopUnroll() : LoopPass(ID) { + LoopUnroll(int T = -1, int C = -1, int P = -1) : LoopPass(ID) { + CurrentThreshold = (T == -1) ? UnrollThreshold : unsigned(T); + CurrentCount = (C == -1) ? UnrollCount : unsigned(C); + CurrentAllowPartial = (P == -1) ? 
UnrollAllowPartial : (bool)P; + + UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0); + initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); } @@ -56,7 +62,10 @@ namespace { // explicit -unroll-threshold). static const unsigned OptSizeUnrollThreshold = 50; + unsigned CurrentCount; unsigned CurrentThreshold; + bool CurrentAllowPartial; + bool UserThreshold; // CurrentThreshold is user-specified. bool runOnLoop(Loop *L, LPPassManager &LPM); @@ -87,7 +96,9 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) -Pass *llvm::createLoopUnrollPass() { return new LoopUnroll(); } +Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) { + return new LoopUnroll(Threshold, Count, AllowPartial); +} /// ApproximateLoopSize - Approximate the size of the loop. static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls) { @@ -119,14 +130,14 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // from UnrollThreshold, it is overridden to a smaller value if the current // function is marked as optimize-for-size, and the unroll threshold was // not user specified. - CurrentThreshold = UnrollThreshold; - if (Header->getParent()->hasFnAttr(Attribute::OptimizeForSize) && - UnrollThreshold.getNumOccurrences() == 0) - CurrentThreshold = OptSizeUnrollThreshold; + unsigned Threshold = CurrentThreshold; + if (!UserThreshold && + Header->getParent()->hasFnAttr(Attribute::OptimizeForSize)) + Threshold = OptSizeUnrollThreshold; // Find trip count unsigned TripCount = L->getSmallConstantTripCount(); - unsigned Count = UnrollCount; + unsigned Count = CurrentCount; // Automatically select an unroll count. if (Count == 0) { @@ -140,7 +151,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } // Enforce the threshold. - if (CurrentThreshold != NoThreshold) { + if (Threshold != NoThreshold) { unsigned NumInlineCandidates; unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates); DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); @@ -149,16 +160,16 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { return false; } uint64_t Size = (uint64_t)LoopSize*Count; - if (TripCount != 1 && Size > CurrentThreshold) { + if (TripCount != 1 && Size > Threshold) { DEBUG(dbgs() << " Too large to fully unroll with count: " << Count - << " because size: " << Size << ">" << CurrentThreshold << "\n"); - if (!UnrollAllowPartial) { + << " because size: " << Size << ">" << Threshold << "\n"); + if (!CurrentAllowPartial) { DEBUG(dbgs() << " will not try to unroll partially because " << "-unroll-allow-partial not given\n"); return false; } // Reduce unroll count so that it evenly divides TripCount for partial unrolling - Count = CurrentThreshold / LoopSize; + Count = Threshold / LoopSize; while (Count != 0 && TripCount%Count != 0) { Count--; } diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index b4e3d31..e05f29c 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -258,6 +258,7 @@ bool LoopUnswitch::processCurrentLoop() { if (LoopCond && SI->getNumCases() > 1) { // Find a value to unswitch on: // FIXME: this should choose the most expensive case! + // FIXME: scan for a case with a non-critical edge? Constant *UnswitchVal = SI->getCaseValue(1); // Do not process same value again and again.
if (!UnswitchedVals.insert(UnswitchVal)) @@ -560,6 +561,8 @@ void LoopUnswitch::SplitExitEdges(Loop *L, BasicBlock *ExitBlock = ExitBlocks[i]; SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBlock), pred_end(ExitBlock)); + // Although SplitBlockPredecessors doesn't preserve loop-simplify in + // general, if we call it on all predecessors of all exits then it does. SplitBlockPredecessors(ExitBlock, Preds.data(), Preds.size(), ".us-lcssa", this); } @@ -786,8 +789,13 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, // If this is the edge to the header block for a loop, remove the loop and // promote all subloops. if (Loop *BBLoop = LI->getLoopFor(BB)) { - if (BBLoop->getLoopLatch() == BB) + if (BBLoop->getLoopLatch() == BB) { RemoveLoopFromHierarchy(BBLoop); + if (currentLoop == BBLoop) { + currentLoop = 0; + redoLoop = false; + } + } } // Remove the block from the loop info, which removes it from any loops it @@ -859,7 +867,6 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, // FOLD boolean conditions (X|LIC), (X&LIC). Fold conditional branches, // selects, switches. - std::vector<User*> Users(LIC->use_begin(), LIC->use_end()); std::vector<Instruction*> Worklist; LLVMContext &Context = Val->getContext(); @@ -875,13 +882,14 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()), !cast<ConstantInt>(Val)->getZExtValue()); - for (unsigned i = 0, e = Users.size(); i != e; ++i) - if (Instruction *U = cast<Instruction>(Users[i])) { - if (!L->contains(U)) - continue; - U->replaceUsesOfWith(LIC, Replacement); - Worklist.push_back(U); - } + for (Value::use_iterator UI = LIC->use_begin(), E = LIC->use_end(); + UI != E; ++UI) { + Instruction *U = dyn_cast<Instruction>(*UI); + if (!U || !L->contains(U)) + continue; + U->replaceUsesOfWith(LIC, Replacement); + Worklist.push_back(U); + } SimplifyCode(Worklist, L); return; } @@ -889,9 +897,10 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, // Otherwise, we don't know the precise value of LIC, but we do know that it // is certainly NOT "Val". As such, simplify any uses in the loop that we // can. This case occurs when we unswitch switch statements. - for (unsigned i = 0, e = Users.size(); i != e; ++i) { - Instruction *U = cast<Instruction>(Users[i]); - if (!L->contains(U)) + for (Value::use_iterator UI = LIC->use_begin(), E = LIC->use_end(); + UI != E; ++UI) { + Instruction *U = dyn_cast<Instruction>(*UI); + if (!U || !L->contains(U)) continue; Worklist.push_back(U); @@ -909,13 +918,22 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, // Found a dead case value. Don't remove PHI nodes in the // successor if they become single-entry, those PHI nodes may // be in the Users list. - + + BasicBlock *Switch = SI->getParent(); + BasicBlock *SISucc = SI->getSuccessor(DeadCase); + BasicBlock *Latch = L->getLoopLatch(); + if (!SI->findCaseDest(SISucc)) continue; // Edge is critical. + // If the DeadCase successor dominates the loop latch, then the + // transformation isn't safe since it will delete the sole predecessor edge + // to the latch. + if (Latch && DT->dominates(SISucc, Latch)) + continue; + // FIXME: This is a hack. We need to keep the successor around // and hooked up so as to preserve the loop structure, because // trying to update it is complicated. So instead we preserve the // loop structure and put the block on a dead code path. 
- BasicBlock *Switch = SI->getParent(); - SplitEdge(Switch, SI->getSuccessor(DeadCase), this); + SplitEdge(Switch, SISucc, this); // Compute the successors instead of relying on the return value // of SplitEdge, since it may have split the switch successor // after PHI nodes. diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index bde0e53..bd4c2d6 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -23,11 +23,13 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/IRBuilder.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" #include <list> using namespace llvm; @@ -299,12 +301,15 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr, namespace { class MemCpyOpt : public FunctionPass { MemoryDependenceAnalysis *MD; + TargetLibraryInfo *TLI; const TargetData *TD; public: static char ID; // Pass identification, replacement for typeid MemCpyOpt() : FunctionPass(ID) { initializeMemCpyOptPass(*PassRegistry::getPassRegistry()); MD = 0; + TLI = 0; + TD = 0; } bool runOnFunction(Function &F); @@ -316,6 +321,7 @@ namespace { AU.addRequired<DominatorTree>(); AU.addRequired<MemoryDependenceAnalysis>(); AU.addRequired<AliasAnalysis>(); + AU.addRequired<TargetLibraryInfo>(); AU.addPreserved<AliasAnalysis>(); AU.addPreserved<MemoryDependenceAnalysis>(); } @@ -346,6 +352,7 @@ INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTree) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization", false, false) @@ -453,7 +460,10 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i) dbgs() << *Range.TheStores[i] << '\n'; dbgs() << "With: " << *AMemSet << '\n'); - + + if (!Range.TheStores.empty()) + AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc()); + // Zap all the stores. for (SmallVector<Instruction*, 16>::const_iterator SI = Range.TheStores.begin(), @@ -477,12 +487,27 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { // happen to be using a load-store pair to implement it, rather than // a memcpy. if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) { - if (!LI->isVolatile() && LI->hasOneUse()) { - MemDepResult dep = MD->getDependency(LI); + if (!LI->isVolatile() && LI->hasOneUse() && + LI->getParent() == SI->getParent()) { + MemDepResult ldep = MD->getDependency(LI); CallInst *C = 0; - if (dep.isClobber() && !isa<MemCpyInst>(dep.getInst())) - C = dyn_cast<CallInst>(dep.getInst()); - + if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst())) + C = dyn_cast<CallInst>(ldep.getInst()); + + if (C) { + // Check that nothing touches the dest of the "copy" between + // the call and the store. 
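// (Editorial sketch, hedged -- 'fill' is a hypothetical helper, not from
//  this patch. The scan below rules out patterns like:
//      fill(tmp);      // the call whose output would be redirected to dst
//      *dst = 42;      // intervening access to dst between call and store
//      *dst = *tmp;    // the load/store pair acting as the "copy"
//  Redirecting fill() to write *dst directly would clobber the intervening
//  store, so any ModRef of the store's location in between must block the
//  call-slot optimization.)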
+ AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + AliasAnalysis::Location StoreLoc = AA.getLocation(SI); + for (BasicBlock::iterator I = --BasicBlock::iterator(SI), + E = C; I != E; --I) { + if (AA.getModRefInfo(&*I, StoreLoc) != AliasAnalysis::NoModRef) { + C = 0; + break; + } + } + } + if (C) { bool changed = performCallSlotOptzn(LI, SI->getPointerOperand()->stripPointerCasts(), @@ -688,7 +713,7 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, if (M->getSource() == MDep->getSource()) return false; - // Second, the length of the memcpy's must be the same, or the preceeding one + // Second, the length of the memcpy's must be the same, or the preceding one // must be larger than the following one. ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength()); ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength()); @@ -804,6 +829,9 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { bool MemCpyOpt::processMemMove(MemMoveInst *M) { AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + if (!TLI->has(LibFunc::memmove)) + return false; + // See if the pointers alias. if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(M))) return false; @@ -854,12 +882,16 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { if (C1 == 0 || C1->getValue().getZExtValue() < ByValSize) return false; - // Get the alignment of the byval. If it is greater than the memcpy, then we - // can't do the substitution. If the call doesn't specify the alignment, then - // it is some target specific value that we can't know. + // Get the alignment of the byval. If the call doesn't specify the alignment, + // then it is some target specific value that we can't know. unsigned ByValAlign = CS.getParamAlignment(ArgNo+1); - if (ByValAlign == 0 || MDep->getAlignment() < ByValAlign) - return false; + if (ByValAlign == 0) return false; + + // If it is greater than the memcpy, then we check to see if we can force the + // source of the memcpy to the alignment we need. If we fail, we bail out. + if (MDep->getAlignment() < ByValAlign && + getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, TD) < ByValAlign) + return false; // Verify that the copied-from memory doesn't change in between the memcpy and // the byval call. @@ -935,6 +967,14 @@ bool MemCpyOpt::runOnFunction(Function &F) { bool MadeChange = false; MD = &getAnalysis<MemoryDependenceAnalysis>(); TD = getAnalysisIfAvailable<TargetData>(); + TLI = &getAnalysis<TargetLibraryInfo>(); + + // If we don't have at least memset and memcpy, there is little point in doing + // anything here. These are required by a freestanding implementation, so if + // even they are disabled, there is no point in trying hard. + if (!TLI->has(LibFunc::memset) || !TLI->has(LibFunc::memcpy)) + return false; + while (1) { if (!iterateOnFunction(F)) break; diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp new file mode 100644 index 0000000..6cd35e5 --- /dev/null +++ b/lib/Transforms/Scalar/ObjCARC.cpp @@ -0,0 +1,3537 @@ +//===- ObjCARC.cpp - ObjC ARC Optimization --------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines ObjC ARC optimizations.
ARC stands for +// Automatic Reference Counting and is a system for managing reference counts +// for objects in Objective-C. +// +// The optimizations performed include elimination of redundant, partially +// redundant, and inconsequential reference count operations, elimination of +// redundant weak pointer operations, pattern-matching and replacement of +// low-level operations into higher-level operations, and numerous minor +// simplifications. +// +// This file also defines a simple ARC-aware AliasAnalysis. +// +// WARNING: This file knows about certain library functions. It recognizes them +// by name, and hardwires knowledge of their semantics. +// +// WARNING: This file knows about how certain Objective-C library functions are +// used. Naive LLVM IR transformations which would otherwise be +// behavior-preserving may break these assumptions. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "objc-arc" +#include "llvm/Function.h" +#include "llvm/Intrinsics.h" +#include "llvm/GlobalVariable.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +using namespace llvm; + +// A handy option to enable/disable all optimizations in this file. +static cl::opt<bool> EnableARCOpts("enable-objc-arc-opts", cl::init(true)); + +//===----------------------------------------------------------------------===// +// Misc. Utilities +//===----------------------------------------------------------------------===// + +namespace { + /// MapVector - An associative container with fast insertion-order + /// (deterministic) iteration over its elements. Plus the special + /// blot operation. + template<class KeyT, class ValueT> + class MapVector { + /// Map - Map keys to indices in Vector. + typedef DenseMap<KeyT, size_t> MapTy; + MapTy Map; + + /// Vector - Keys and values. + typedef std::vector<std::pair<KeyT, ValueT> > VectorTy; + VectorTy Vector; + + public: + typedef typename VectorTy::iterator iterator; + typedef typename VectorTy::const_iterator const_iterator; + iterator begin() { return Vector.begin(); } + iterator end() { return Vector.end(); } + const_iterator begin() const { return Vector.begin(); } + const_iterator end() const { return Vector.end(); } + +#ifdef XDEBUG + ~MapVector() { + assert(Vector.size() >= Map.size()); // May differ due to blotting.
+ for (typename MapTy::const_iterator I = Map.begin(), E = Map.end(); + I != E; ++I) { + assert(I->second < Vector.size()); + assert(Vector[I->second].first == I->first); + } + for (typename VectorTy::const_iterator I = Vector.begin(), + E = Vector.end(); I != E; ++I) + assert(!I->first || + (Map.count(I->first) && + Map[I->first] == size_t(I - Vector.begin()))); + } +#endif + + ValueT &operator[](KeyT Arg) { + std::pair<typename MapTy::iterator, bool> Pair = + Map.insert(std::make_pair(Arg, size_t(0))); + if (Pair.second) { + Pair.first->second = Vector.size(); + Vector.push_back(std::make_pair(Arg, ValueT())); + return Vector.back().second; + } + return Vector[Pair.first->second].second; + } + + std::pair<iterator, bool> + insert(const std::pair<KeyT, ValueT> &InsertPair) { + std::pair<typename MapTy::iterator, bool> Pair = + Map.insert(std::make_pair(InsertPair.first, size_t(0))); + if (Pair.second) { + Pair.first->second = Vector.size(); + Vector.push_back(InsertPair); + return std::make_pair(llvm::prior(Vector.end()), true); + } + return std::make_pair(Vector.begin() + Pair.first->second, false); + } + + const_iterator find(KeyT Key) const { + typename MapTy::const_iterator It = Map.find(Key); + if (It == Map.end()) return Vector.end(); + return Vector.begin() + It->second; + } + + /// blot - This is similar to erase, but instead of removing the element + /// from the vector, it just zeros out the key in the vector. This leaves + /// iterators intact, but clients must be prepared for zeroed-out keys when + /// iterating. + void blot(KeyT Key) { + typename MapTy::iterator It = Map.find(Key); + if (It == Map.end()) return; + Vector[It->second].first = KeyT(); + Map.erase(It); + } + + void clear() { + Map.clear(); + Vector.clear(); + } + }; +} + +//===----------------------------------------------------------------------===// +// ARC Utilities. +//===----------------------------------------------------------------------===// + +namespace { + /// InstructionClass - A simple classification for instructions. + enum InstructionClass { + IC_Retain, ///< objc_retain + IC_RetainRV, ///< objc_retainAutoreleasedReturnValue + IC_RetainBlock, ///< objc_retainBlock + IC_Release, ///< objc_release + IC_Autorelease, ///< objc_autorelease + IC_AutoreleaseRV, ///< objc_autoreleaseReturnValue + IC_AutoreleasepoolPush, ///< objc_autoreleasePoolPush + IC_AutoreleasepoolPop, ///< objc_autoreleasePoolPop + IC_NoopCast, ///< objc_retainedObject, etc. + IC_FusedRetainAutorelease, ///< objc_retainAutorelease + IC_FusedRetainAutoreleaseRV, ///< objc_retainAutoreleaseReturnValue + IC_LoadWeakRetained, ///< objc_loadWeakRetained (primitive) + IC_StoreWeak, ///< objc_storeWeak (primitive) + IC_InitWeak, ///< objc_initWeak (derived) + IC_LoadWeak, ///< objc_loadWeak (derived) + IC_MoveWeak, ///< objc_moveWeak (derived) + IC_CopyWeak, ///< objc_copyWeak (derived) + IC_DestroyWeak, ///< objc_destroyWeak (derived) + IC_CallOrUser, ///< could call objc_release and/or "use" pointers + IC_Call, ///< could call objc_release + IC_User, ///< could "use" a pointer + IC_None ///< anything else + }; +} + +/// IsPotentialUse - Test whether the given value is possible a +/// reference-counted pointer. +static bool IsPotentialUse(const Value *Op) { + // Pointers to static or stack storage are not reference-counted pointers. + if (isa<Constant>(Op) || isa<AllocaInst>(Op)) + return false; + // Special arguments are not reference-counted. 
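// Editor's note (illustration only; not part of this commit): typical
// client-side use of MapVector::blot() above. Blotting zeroes out the key
// instead of erasing the element, so iterators and recorded vector indices
// stay valid; the price is that iteration must skip null keys, which is why
// KeyT is expected to have an "impossible" default value such as a null
// pointer. MV and DeadKey below are hypothetical names:
//
//   MapVector<const Value *, unsigned> MV;
//   MV.blot(DeadKey);                        // O(1); leaves a null key behind
//   for (MapVector<const Value *, unsigned>::iterator I = MV.begin(),
//        E = MV.end(); I != E; ++I) {
//     if (!I->first) continue;               // skip blotted entries
//     // ... live entries, in deterministic insertion order ...
//   }
// [end editor's note]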
+ if (const Argument *Arg = dyn_cast<Argument>(Op)) + if (Arg->hasByValAttr() || + Arg->hasNestAttr() || + Arg->hasStructRetAttr()) + return false; + // Only consider values with pointer types, and not function pointers. + const PointerType *Ty = dyn_cast<PointerType>(Op->getType()); + if (!Ty || isa<FunctionType>(Ty->getElementType())) + return false; + // Conservatively assume anything else is a potential use. + return true; +} + +/// GetCallSiteClass - Helper for GetInstructionClass. Determines what kind +/// of construct CS is. +static InstructionClass GetCallSiteClass(ImmutableCallSite CS) { + for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); + I != E; ++I) + if (IsPotentialUse(*I)) + return CS.onlyReadsMemory() ? IC_User : IC_CallOrUser; + + return CS.onlyReadsMemory() ? IC_None : IC_Call; +} + +/// GetFunctionClass - Determine if F is one of the special known Functions. +/// If it isn't, return IC_CallOrUser. +static InstructionClass GetFunctionClass(const Function *F) { + Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); + + // No arguments. + if (AI == AE) + return StringSwitch<InstructionClass>(F->getName()) + .Case("objc_autoreleasePoolPush", IC_AutoreleasepoolPush) + .Default(IC_CallOrUser); + + // One argument. + const Argument *A0 = AI++; + if (AI == AE) + // Argument is a pointer. + if (const PointerType *PTy = dyn_cast<PointerType>(A0->getType())) { + const Type *ETy = PTy->getElementType(); + // Argument is i8*. + if (ETy->isIntegerTy(8)) + return StringSwitch<InstructionClass>(F->getName()) + .Case("objc_retain", IC_Retain) + .Case("objc_retainAutoreleasedReturnValue", IC_RetainRV) + .Case("objc_retainBlock", IC_RetainBlock) + .Case("objc_release", IC_Release) + .Case("objc_autorelease", IC_Autorelease) + .Case("objc_autoreleaseReturnValue", IC_AutoreleaseRV) + .Case("objc_autoreleasePoolPop", IC_AutoreleasepoolPop) + .Case("objc_retainedObject", IC_NoopCast) + .Case("objc_unretainedObject", IC_NoopCast) + .Case("objc_unretainedPointer", IC_NoopCast) + .Case("objc_retain_autorelease", IC_FusedRetainAutorelease) + .Case("objc_retainAutorelease", IC_FusedRetainAutorelease) + .Case("objc_retainAutoreleaseReturnValue",IC_FusedRetainAutoreleaseRV) + .Default(IC_CallOrUser); + + // Argument is i8** + if (const PointerType *Pte = dyn_cast<PointerType>(ETy)) + if (Pte->getElementType()->isIntegerTy(8)) + return StringSwitch<InstructionClass>(F->getName()) + .Case("objc_loadWeakRetained", IC_LoadWeakRetained) + .Case("objc_loadWeak", IC_LoadWeak) + .Case("objc_destroyWeak", IC_DestroyWeak) + .Default(IC_CallOrUser); + } + + // Two arguments, first is i8**. + const Argument *A1 = AI++; + if (AI == AE) + if (const PointerType *PTy = dyn_cast<PointerType>(A0->getType())) + if (const PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType())) + if (Pte->getElementType()->isIntegerTy(8)) + if (const PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) { + const Type *ETy1 = PTy1->getElementType(); + // Second argument is i8* + if (ETy1->isIntegerTy(8)) + return StringSwitch<InstructionClass>(F->getName()) + .Case("objc_storeWeak", IC_StoreWeak) + .Case("objc_initWeak", IC_InitWeak) + .Default(IC_CallOrUser); + // Second argument is i8**. + if (const PointerType *Pte1 = dyn_cast<PointerType>(ETy1)) + if (Pte1->getElementType()->isIntegerTy(8)) + return StringSwitch<InstructionClass>(F->getName()) + .Case("objc_moveWeak", IC_MoveWeak) + .Case("objc_copyWeak", IC_CopyWeak) + .Default(IC_CallOrUser); + } + + // Anything else. 
+ return IC_CallOrUser; +} + +/// GetInstructionClass - Determine what kind of construct V is. +static InstructionClass GetInstructionClass(const Value *V) { + if (const Instruction *I = dyn_cast<Instruction>(V)) { + // Any instruction other than bitcast and gep with a pointer operand have a + // use of an objc pointer. Bitcasts, GEPs, Selects, PHIs transfer a pointer + // to a subsequent use, rather than using it themselves, in this sense. + // As a short cut, several other opcodes are known to have no pointer + // operands of interest. And ret is never followed by a release, so it's + // not interesting to examine. + switch (I->getOpcode()) { + case Instruction::Call: { + const CallInst *CI = cast<CallInst>(I); + // Check for calls to special functions. + if (const Function *F = CI->getCalledFunction()) { + InstructionClass Class = GetFunctionClass(F); + if (Class != IC_CallOrUser) + return Class; + + // None of the intrinsic functions do objc_release. For intrinsics, the + // only question is whether or not they may be users. + switch (F->getIntrinsicID()) { + case 0: break; + case Intrinsic::bswap: case Intrinsic::ctpop: + case Intrinsic::ctlz: case Intrinsic::cttz: + case Intrinsic::returnaddress: case Intrinsic::frameaddress: + case Intrinsic::stacksave: case Intrinsic::stackrestore: + case Intrinsic::vastart: case Intrinsic::vacopy: case Intrinsic::vaend: + // Don't let dbg info affect our results. + case Intrinsic::dbg_declare: case Intrinsic::dbg_value: + // Short cut: Some intrinsics obviously don't use ObjC pointers. + return IC_None; + default: + for (Function::const_arg_iterator AI = F->arg_begin(), + AE = F->arg_end(); AI != AE; ++AI) + if (IsPotentialUse(AI)) + return IC_User; + return IC_None; + } + } + return GetCallSiteClass(CI); + } + case Instruction::Invoke: + return GetCallSiteClass(cast<InvokeInst>(I)); + case Instruction::BitCast: + case Instruction::GetElementPtr: + case Instruction::Select: case Instruction::PHI: + case Instruction::Ret: case Instruction::Br: + case Instruction::Switch: case Instruction::IndirectBr: + case Instruction::Alloca: case Instruction::VAArg: + case Instruction::Add: case Instruction::FAdd: + case Instruction::Sub: case Instruction::FSub: + case Instruction::Mul: case Instruction::FMul: + case Instruction::SDiv: case Instruction::UDiv: case Instruction::FDiv: + case Instruction::SRem: case Instruction::URem: case Instruction::FRem: + case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: + case Instruction::And: case Instruction::Or: case Instruction::Xor: + case Instruction::SExt: case Instruction::ZExt: case Instruction::Trunc: + case Instruction::IntToPtr: case Instruction::FCmp: + case Instruction::FPTrunc: case Instruction::FPExt: + case Instruction::FPToUI: case Instruction::FPToSI: + case Instruction::UIToFP: case Instruction::SIToFP: + case Instruction::InsertElement: case Instruction::ExtractElement: + case Instruction::ShuffleVector: + case Instruction::ExtractValue: + break; + case Instruction::ICmp: + // Comparing a pointer with null, or any other constant, isn't an + // interesting use, because we don't care what the pointer points to, or + // about the values of any other dynamic reference-counted pointers. + if (IsPotentialUse(I->getOperand(1))) + return IC_User; + break; + default: + // For anything else, check all the operands. 
+      for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end();
+           OI != OE; ++OI)
+        if (IsPotentialUse(*OI))
+          return IC_User;
+    }
+  }
+
+  // Otherwise, it's totally inert for ARC purposes.
+  return IC_None;
+}
+
+/// GetBasicInstructionClass - Determine what kind of construct V is. This is
+/// similar to GetInstructionClass except that it only detects objc runtime
+/// calls. This allows it to be faster.
+static InstructionClass GetBasicInstructionClass(const Value *V) {
+  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
+    if (const Function *F = CI->getCalledFunction())
+      return GetFunctionClass(F);
+    // Otherwise, be conservative.
+    return IC_CallOrUser;
+  }
+
+  // Otherwise, be conservative.
+  return IC_User;
+}
+
+/// IsRetain - Test if the given class is objc_retain or
+/// equivalent.
+static bool IsRetain(InstructionClass Class) {
+  return Class == IC_Retain ||
+         Class == IC_RetainRV;
+}
+
+/// IsAutorelease - Test if the given class is objc_autorelease or
+/// equivalent.
+static bool IsAutorelease(InstructionClass Class) {
+  return Class == IC_Autorelease ||
+         Class == IC_AutoreleaseRV;
+}
+
+/// IsForwarding - Test if the given class represents instructions which return
+/// their argument verbatim.
+static bool IsForwarding(InstructionClass Class) {
+  // objc_retainBlock technically doesn't always return its argument
+  // verbatim, but it doesn't matter for our purposes here.
+  return Class == IC_Retain ||
+         Class == IC_RetainRV ||
+         Class == IC_Autorelease ||
+         Class == IC_AutoreleaseRV ||
+         Class == IC_RetainBlock ||
+         Class == IC_NoopCast;
+}
+
+/// IsNoopOnNull - Test if the given class represents instructions which do
+/// nothing if passed a null pointer.
+static bool IsNoopOnNull(InstructionClass Class) {
+  return Class == IC_Retain ||
+         Class == IC_RetainRV ||
+         Class == IC_Release ||
+         Class == IC_Autorelease ||
+         Class == IC_AutoreleaseRV ||
+         Class == IC_RetainBlock;
+}
+
+/// IsAlwaysTail - Test if the given class represents instructions which are
+/// always safe to mark with the "tail" keyword.
+static bool IsAlwaysTail(InstructionClass Class) {
+  // IC_RetainBlock may be given a stack argument.
+  return Class == IC_Retain ||
+         Class == IC_RetainRV ||
+         Class == IC_Autorelease ||
+         Class == IC_AutoreleaseRV;
+}
+
+/// IsNoThrow - Test if the given class represents instructions which are always
+/// safe to mark with the nounwind attribute.
+static bool IsNoThrow(InstructionClass Class) {
+  return Class == IC_Retain ||
+         Class == IC_RetainRV ||
+         Class == IC_RetainBlock ||
+         Class == IC_Release ||
+         Class == IC_Autorelease ||
+         Class == IC_AutoreleaseRV ||
+         Class == IC_AutoreleasepoolPush ||
+         Class == IC_AutoreleasepoolPop;
+}
+
+/// EraseInstruction - Erase the given instruction. ObjC calls return their
+/// argument verbatim, so if it's such a call and the return value has users,
+/// replace them with the argument value.
+static void EraseInstruction(Instruction *CI) {
+  Value *OldArg = cast<CallInst>(CI)->getArgOperand(0);
+
+  bool Unused = CI->use_empty();
+
+  if (!Unused) {
+    // Replace the return value with the argument.
+ assert(IsForwarding(GetBasicInstructionClass(CI)) && + "Can't delete non-forwarding instruction with users!"); + CI->replaceAllUsesWith(OldArg); + } + + CI->eraseFromParent(); + + if (Unused) + RecursivelyDeleteTriviallyDeadInstructions(OldArg); +} + +/// GetUnderlyingObjCPtr - This is a wrapper around getUnderlyingObject which +/// also knows how to look through objc_retain and objc_autorelease calls, which +/// we know to return their argument verbatim. +static const Value *GetUnderlyingObjCPtr(const Value *V) { + for (;;) { + V = GetUnderlyingObject(V); + if (!IsForwarding(GetBasicInstructionClass(V))) + break; + V = cast<CallInst>(V)->getArgOperand(0); + } + + return V; +} + +/// StripPointerCastsAndObjCCalls - This is a wrapper around +/// Value::stripPointerCasts which also knows how to look through objc_retain +/// and objc_autorelease calls, which we know to return their argument verbatim. +static const Value *StripPointerCastsAndObjCCalls(const Value *V) { + for (;;) { + V = V->stripPointerCasts(); + if (!IsForwarding(GetBasicInstructionClass(V))) + break; + V = cast<CallInst>(V)->getArgOperand(0); + } + return V; +} + +/// StripPointerCastsAndObjCCalls - This is a wrapper around +/// Value::stripPointerCasts which also knows how to look through objc_retain +/// and objc_autorelease calls, which we know to return their argument verbatim. +static Value *StripPointerCastsAndObjCCalls(Value *V) { + for (;;) { + V = V->stripPointerCasts(); + if (!IsForwarding(GetBasicInstructionClass(V))) + break; + V = cast<CallInst>(V)->getArgOperand(0); + } + return V; +} + +/// GetObjCArg - Assuming the given instruction is one of the special calls such +/// as objc_retain or objc_release, return the argument value, stripped of no-op +/// casts and forwarding calls. +static Value *GetObjCArg(Value *Inst) { + return StripPointerCastsAndObjCCalls(cast<CallInst>(Inst)->getArgOperand(0)); +} + +/// IsObjCIdentifiedObject - This is similar to AliasAnalysis' +/// isObjCIdentifiedObject, except that it uses special knowledge of +/// ObjC conventions... +static bool IsObjCIdentifiedObject(const Value *V) { + // Assume that call results and arguments have their own "provenance". + // Constants (including GlobalVariables) and Allocas are never + // reference-counted. + if (isa<CallInst>(V) || isa<InvokeInst>(V) || + isa<Argument>(V) || isa<Constant>(V) || + isa<AllocaInst>(V)) + return true; + + if (const LoadInst *LI = dyn_cast<LoadInst>(V)) { + const Value *Pointer = + StripPointerCastsAndObjCCalls(LI->getPointerOperand()); + if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Pointer)) { + StringRef Name = GV->getName(); + // These special variables are known to hold values which are not + // reference-counted pointers. + if (Name.startswith("\01L_OBJC_SELECTOR_REFERENCES_") || + Name.startswith("\01L_OBJC_CLASSLIST_REFERENCES_") || + Name.startswith("\01L_OBJC_CLASSLIST_SUP_REFS_$_") || + Name.startswith("\01L_OBJC_METH_VAR_NAME_") || + Name.startswith("\01l_objc_msgSend_fixup_")) + return true; + } + } + + return false; +} + +/// FindSingleUseIdentifiedObject - This is similar to +/// StripPointerCastsAndObjCCalls but it stops as soon as it finds a value +/// with multiple uses. 
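// Editor's illustration (not part of this commit): what the strip helpers
// above do. Given
//
//   %0 = bitcast %objc_object* %obj to i8*
//   %1 = call i8* @objc_retain(i8* %0)
//   %2 = call i8* @objc_autorelease(i8* %1)
//
// StripPointerCastsAndObjCCalls(%2) walks back through the autorelease, the
// retain, and the bitcast to return %obj, since every step forwards its
// operand verbatim (IsForwarding). GetObjCArg applies the same walk to a
// runtime call's first argument.
// [end editor's note]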
+static const Value *FindSingleUseIdentifiedObject(const Value *Arg) { + if (Arg->hasOneUse()) { + if (const BitCastInst *BC = dyn_cast<BitCastInst>(Arg)) + return FindSingleUseIdentifiedObject(BC->getOperand(0)); + if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Arg)) + if (GEP->hasAllZeroIndices()) + return FindSingleUseIdentifiedObject(GEP->getPointerOperand()); + if (IsForwarding(GetBasicInstructionClass(Arg))) + return FindSingleUseIdentifiedObject( + cast<CallInst>(Arg)->getArgOperand(0)); + if (!IsObjCIdentifiedObject(Arg)) + return 0; + return Arg; + } + + // If we found an identifiable object but it has multiple uses, but they + // are trivial uses, we can still consider this to be a single-use + // value. + if (IsObjCIdentifiedObject(Arg)) { + for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end(); + UI != UE; ++UI) { + const User *U = *UI; + if (!U->use_empty() || StripPointerCastsAndObjCCalls(U) != Arg) + return 0; + } + + return Arg; + } + + return 0; +} + +//===----------------------------------------------------------------------===// +// ARC AliasAnalysis. +//===----------------------------------------------------------------------===// + +#include "llvm/Pass.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/Passes.h" + +namespace { + /// ObjCARCAliasAnalysis - This is a simple alias analysis + /// implementation that uses knowledge of ARC constructs to answer queries. + /// + /// TODO: This class could be generalized to know about other ObjC-specific + /// tricks. Such as knowing that ivars in the non-fragile ABI are non-aliasing + /// even though their offsets are dynamic. + class ObjCARCAliasAnalysis : public ImmutablePass, + public AliasAnalysis { + public: + static char ID; // Class identification, replacement for typeinfo + ObjCARCAliasAnalysis() : ImmutablePass(ID) { + initializeObjCARCAliasAnalysisPass(*PassRegistry::getPassRegistry()); + } + + private: + virtual void initializePass() { + InitializeAliasAnalysis(this); + } + + /// getAdjustedAnalysisPointer - This method is used when a pass implements + /// an analysis interface through multiple inheritance. If needed, it + /// should override this to adjust the this pointer as needed for the + /// specified pass info. + virtual void *getAdjustedAnalysisPointer(const void *PI) { + if (PI == &AliasAnalysis::ID) + return (AliasAnalysis*)this; + return this; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual AliasResult alias(const Location &LocA, const Location &LocB); + virtual bool pointsToConstantMemory(const Location &Loc, bool OrLocal); + virtual ModRefBehavior getModRefBehavior(ImmutableCallSite CS); + virtual ModRefBehavior getModRefBehavior(const Function *F); + virtual ModRefResult getModRefInfo(ImmutableCallSite CS, + const Location &Loc); + virtual ModRefResult getModRefInfo(ImmutableCallSite CS1, + ImmutableCallSite CS2); + }; +} // End of anonymous namespace + +// Register this pass... 
+char ObjCARCAliasAnalysis::ID = 0; +INITIALIZE_AG_PASS(ObjCARCAliasAnalysis, AliasAnalysis, "objc-arc-aa", + "ObjC-ARC-Based Alias Analysis", false, true, false) + +ImmutablePass *llvm::createObjCARCAliasAnalysisPass() { + return new ObjCARCAliasAnalysis(); +} + +void +ObjCARCAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AliasAnalysis::getAnalysisUsage(AU); +} + +AliasAnalysis::AliasResult +ObjCARCAliasAnalysis::alias(const Location &LocA, const Location &LocB) { + if (!EnableARCOpts) + return AliasAnalysis::alias(LocA, LocB); + + // First, strip off no-ops, including ObjC-specific no-ops, and try making a + // precise alias query. + const Value *SA = StripPointerCastsAndObjCCalls(LocA.Ptr); + const Value *SB = StripPointerCastsAndObjCCalls(LocB.Ptr); + AliasResult Result = + AliasAnalysis::alias(Location(SA, LocA.Size, LocA.TBAATag), + Location(SB, LocB.Size, LocB.TBAATag)); + if (Result != MayAlias) + return Result; + + // If that failed, climb to the underlying object, including climbing through + // ObjC-specific no-ops, and try making an imprecise alias query. + const Value *UA = GetUnderlyingObjCPtr(SA); + const Value *UB = GetUnderlyingObjCPtr(SB); + if (UA != SA || UB != SB) { + Result = AliasAnalysis::alias(Location(UA), Location(UB)); + // We can't use MustAlias or PartialAlias results here because + // GetUnderlyingObjCPtr may return an offsetted pointer value. + if (Result == NoAlias) + return NoAlias; + } + + // If that failed, fail. We don't need to chain here, since that's covered + // by the earlier precise query. + return MayAlias; +} + +bool +ObjCARCAliasAnalysis::pointsToConstantMemory(const Location &Loc, + bool OrLocal) { + if (!EnableARCOpts) + return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); + + // First, strip off no-ops, including ObjC-specific no-ops, and try making + // a precise alias query. + const Value *S = StripPointerCastsAndObjCCalls(Loc.Ptr); + if (AliasAnalysis::pointsToConstantMemory(Location(S, Loc.Size, Loc.TBAATag), + OrLocal)) + return true; + + // If that failed, climb to the underlying object, including climbing through + // ObjC-specific no-ops, and try making an imprecise alias query. + const Value *U = GetUnderlyingObjCPtr(S); + if (U != S) + return AliasAnalysis::pointsToConstantMemory(Location(U), OrLocal); + + // If that failed, fail. We don't need to chain here, since that's covered + // by the earlier precise query. + return false; +} + +AliasAnalysis::ModRefBehavior +ObjCARCAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) { + // We have nothing to do. Just chain to the next AliasAnalysis. 
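// Editor's note (illustration only; not part of this commit): why the
// two-phase query in alias() above helps. For
//
//   %y = call i8* @objc_retain(i8* %x)
//
// a query alias(%y, %x) chained directly to BasicAA is MayAlias at best,
// since %y is an opaque call result. Stripping forwarding calls collapses
// both locations to %x, letting the precise re-query answer MustAlias. The
// second, size-less query on GetUnderlyingObjCPtr results is trusted only
// when it returns NoAlias, because the underlying object may lie at a
// different offset than the original pointers.
// [end editor's note]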
+ return AliasAnalysis::getModRefBehavior(CS); +} + +AliasAnalysis::ModRefBehavior +ObjCARCAliasAnalysis::getModRefBehavior(const Function *F) { + if (!EnableARCOpts) + return AliasAnalysis::getModRefBehavior(F); + + switch (GetFunctionClass(F)) { + case IC_NoopCast: + return DoesNotAccessMemory; + default: + break; + } + + return AliasAnalysis::getModRefBehavior(F); +} + +AliasAnalysis::ModRefResult +ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) { + if (!EnableARCOpts) + return AliasAnalysis::getModRefInfo(CS, Loc); + + switch (GetBasicInstructionClass(CS.getInstruction())) { + case IC_Retain: + case IC_RetainRV: + case IC_RetainBlock: + case IC_Autorelease: + case IC_AutoreleaseRV: + case IC_NoopCast: + case IC_AutoreleasepoolPush: + case IC_FusedRetainAutorelease: + case IC_FusedRetainAutoreleaseRV: + // These functions don't access any memory visible to the compiler. + return NoModRef; + default: + break; + } + + return AliasAnalysis::getModRefInfo(CS, Loc); +} + +AliasAnalysis::ModRefResult +ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS1, + ImmutableCallSite CS2) { + // TODO: Theoretically we could check for dependencies between objc_* calls + // and OnlyAccessesArgumentPointees calls or other well-behaved calls. + return AliasAnalysis::getModRefInfo(CS1, CS2); +} + +//===----------------------------------------------------------------------===// +// ARC expansion. +//===----------------------------------------------------------------------===// + +#include "llvm/Support/InstIterator.h" +#include "llvm/Transforms/Scalar.h" + +namespace { + /// ObjCARCExpand - Early ARC transformations. + class ObjCARCExpand : public FunctionPass { + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool runOnFunction(Function &F); + + public: + static char ID; + ObjCARCExpand() : FunctionPass(ID) { + initializeObjCARCExpandPass(*PassRegistry::getPassRegistry()); + } + }; +} + +char ObjCARCExpand::ID = 0; +INITIALIZE_PASS(ObjCARCExpand, + "objc-arc-expand", "ObjC ARC expansion", false, false) + +Pass *llvm::createObjCARCExpandPass() { + return new ObjCARCExpand(); +} + +void ObjCARCExpand::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); +} + +bool ObjCARCExpand::runOnFunction(Function &F) { + if (!EnableARCOpts) + return false; + + bool Changed = false; + + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) { + Instruction *Inst = &*I; + + switch (GetBasicInstructionClass(Inst)) { + case IC_Retain: + case IC_RetainRV: + case IC_Autorelease: + case IC_AutoreleaseRV: + case IC_FusedRetainAutorelease: + case IC_FusedRetainAutoreleaseRV: + // These calls return their argument verbatim, as a low-level + // optimization. However, this makes high-level optimizations + // harder. Undo any uses of this optimization that the front-end + // emitted here. We'll redo them in a later pass. + Changed = true; + Inst->replaceAllUsesWith(cast<CallInst>(Inst)->getArgOperand(0)); + break; + default: + break; + } + } + + return Changed; +} + +//===----------------------------------------------------------------------===// +// ARC optimization. +//===----------------------------------------------------------------------===// + +// TODO: On code like this: +// +// objc_retain(%x) +// stuff_that_cannot_release() +// objc_autorelease(%x) +// stuff_that_cannot_release() +// objc_retain(%x) +// stuff_that_cannot_release() +// objc_autorelease(%x) +// +// The second retain and autorelease can be deleted. 
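// Editor's illustration (not part of this commit): the effect of the
// ObjCARCExpand pass above. A front-end may use a retain's return value
// directly:
//
//   %1 = call i8* @objc_retain(i8* %0)
//   call void @use(i8* %1)              ; @use is a placeholder consumer
//
// The expand pass rewrites uses of %1 to %0 (these calls forward their
// argument verbatim), so later optimizations see the value flow directly:
//
//   %1 = call i8* @objc_retain(i8* %0)
//   call void @use(i8* %0)
// [end editor's note]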
+
+// TODO: It should be possible to delete
+// objc_autoreleasePoolPush and objc_autoreleasePoolPop
+// pairs if nothing is actually autoreleased between them. Also, autorelease
+// calls followed by objc_autoreleasePoolPop calls (perhaps in ObjC++ code
+// after inlining) can be turned into plain release calls.
+
+// TODO: Critical-edge splitting. If the optimal insertion point is
+// a critical edge, the current algorithm has to fail, because it doesn't
+// know how to split edges. It should be possible to make the optimizer
+// think in terms of edges, rather than blocks, and then split critical
+// edges on demand.
+
+// TODO: OptimizeSequences could be generalized to be Interprocedural.
+
+// TODO: Recognize that a bunch of other objc runtime calls have
+// non-escaping arguments and non-releasing arguments, and may be
+// non-autoreleasing.
+
+// TODO: Sink autorelease calls as far as possible. Unfortunately we
+// usually can't sink them past other calls, which would be the main
+// case where it would be useful.
+
+#include "llvm/GlobalAlias.h"
+#include "llvm/Module.h"
+#include "llvm/Constants.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Statistic.h"
+
+STATISTIC(NumNoops,        "Number of no-op objc calls eliminated");
+STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated");
+STATISTIC(NumAutoreleases, "Number of autoreleases converted to releases");
+STATISTIC(NumRets,         "Number of return value forwarding "
+                           "retain+autoreleases eliminated");
+STATISTIC(NumRRs,          "Number of retain+release paths eliminated");
+STATISTIC(NumPeeps,        "Number of calls peephole-optimized");
+
+namespace {
+  /// ProvenanceAnalysis - This is similar to BasicAliasAnalysis, and it
+  /// uses many of the same techniques, except it uses special ObjC-specific
+  /// reasoning about pointer relationships.
+  class ProvenanceAnalysis {
+    AliasAnalysis *AA;
+
+    typedef std::pair<const Value *, const Value *> ValuePairTy;
+    typedef DenseMap<ValuePairTy, bool> CachedResultsTy;
+    CachedResultsTy CachedResults;
+
+    bool relatedCheck(const Value *A, const Value *B);
+    bool relatedSelect(const SelectInst *A, const Value *B);
+    bool relatedPHI(const PHINode *A, const Value *B);
+
+    // Do not implement.
+    void operator=(const ProvenanceAnalysis &);
+    ProvenanceAnalysis(const ProvenanceAnalysis &);
+
+  public:
+    ProvenanceAnalysis() {}
+
+    void setAA(AliasAnalysis *aa) { AA = aa; }
+
+    AliasAnalysis *getAA() const { return AA; }
+
+    bool related(const Value *A, const Value *B);
+
+    void clear() {
+      CachedResults.clear();
+    }
+  };
+}
+
+bool ProvenanceAnalysis::relatedSelect(const SelectInst *A, const Value *B) {
+  // If the values are Selects with the same condition, we can do a more precise
+  // check: just check for relations between the values on corresponding arms.
+  if (const SelectInst *SB = dyn_cast<SelectInst>(B))
+    if (A->getCondition() == SB->getCondition()) {
+      if (related(A->getTrueValue(), SB->getTrueValue()))
+        return true;
+      if (related(A->getFalseValue(), SB->getFalseValue()))
+        return true;
+      return false;
+    }
+
+  // Check both arms of the Select node individually.
+  if (related(A->getTrueValue(), B))
+    return true;
+  if (related(A->getFalseValue(), B))
+    return true;
+
+  // The arms both checked out.
+ return false; +} + +bool ProvenanceAnalysis::relatedPHI(const PHINode *A, const Value *B) { + // If the values are PHIs in the same block, we can do a more precise as well + // as efficient check: just check for relations between the values on + // corresponding edges. + if (const PHINode *PNB = dyn_cast<PHINode>(B)) + if (PNB->getParent() == A->getParent()) { + for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i) + if (related(A->getIncomingValue(i), + PNB->getIncomingValueForBlock(A->getIncomingBlock(i)))) + return true; + return false; + } + + // Check each unique source of the PHI node against B. + SmallPtrSet<const Value *, 4> UniqueSrc; + for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i) { + const Value *PV1 = A->getIncomingValue(i); + if (UniqueSrc.insert(PV1) && related(PV1, B)) + return true; + } + + // All of the arms checked out. + return false; +} + +/// isStoredObjCPointer - Test if the value of P, or any value covered by its +/// provenance, is ever stored within the function (not counting callees). +static bool isStoredObjCPointer(const Value *P) { + SmallPtrSet<const Value *, 8> Visited; + SmallVector<const Value *, 8> Worklist; + Worklist.push_back(P); + Visited.insert(P); + do { + P = Worklist.pop_back_val(); + for (Value::const_use_iterator UI = P->use_begin(), UE = P->use_end(); + UI != UE; ++UI) { + const User *Ur = *UI; + if (isa<StoreInst>(Ur)) { + if (UI.getOperandNo() == 0) + // The pointer is stored. + return true; + // The pointed is stored through. + continue; + } + if (isa<CallInst>(Ur)) + // The pointer is passed as an argument, ignore this. + continue; + if (isa<PtrToIntInst>(P)) + // Assume the worst. + return true; + if (Visited.insert(Ur)) + Worklist.push_back(Ur); + } + } while (!Worklist.empty()); + + // Everything checked out. + return false; +} + +bool ProvenanceAnalysis::relatedCheck(const Value *A, const Value *B) { + // Skip past provenance pass-throughs. + A = GetUnderlyingObjCPtr(A); + B = GetUnderlyingObjCPtr(B); + + // Quick check. + if (A == B) + return true; + + // Ask regular AliasAnalysis, for a first approximation. + switch (AA->alias(A, B)) { + case AliasAnalysis::NoAlias: + return false; + case AliasAnalysis::MustAlias: + case AliasAnalysis::PartialAlias: + return true; + case AliasAnalysis::MayAlias: + break; + } + + bool AIsIdentified = IsObjCIdentifiedObject(A); + bool BIsIdentified = IsObjCIdentifiedObject(B); + + // An ObjC-Identified object can't alias a load if it is never locally stored. + if (AIsIdentified) { + if (BIsIdentified) { + // If both pointers have provenance, they can be directly compared. + if (A != B) + return false; + } else { + if (isa<LoadInst>(B)) + return isStoredObjCPointer(A); + } + } else { + if (BIsIdentified && isa<LoadInst>(A)) + return isStoredObjCPointer(B); + } + + // Special handling for PHI and Select. + if (const PHINode *PN = dyn_cast<PHINode>(A)) + return relatedPHI(PN, B); + if (const PHINode *PN = dyn_cast<PHINode>(B)) + return relatedPHI(PN, A); + if (const SelectInst *S = dyn_cast<SelectInst>(A)) + return relatedSelect(S, B); + if (const SelectInst *S = dyn_cast<SelectInst>(B)) + return relatedSelect(S, A); + + // Conservative. + return true; +} + +bool ProvenanceAnalysis::related(const Value *A, const Value *B) { + // Begin by inserting a conservative value into the map. If the insertion + // fails, we have the answer already. If it succeeds, leave it there until we + // compute the real answer to guard against recursive queries. 
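// Editor's sketch (illustration only; not part of this commit): the guard
// described in the comment above, reduced to its essence. Key and
// computeAnswer are hypothetical stand-ins. Seeding the cache with the
// conservative answer ("related") before recursing makes cyclic queries,
// e.g. through mutually-referencing PHIs, terminate with the safe result:
//
//   bool memoizedRelated(DenseMap<Key, bool> &Cache, Key K) {
//     std::pair<DenseMap<Key, bool>::iterator, bool> P =
//       Cache.insert(std::make_pair(K, true));   // conservative seed
//     if (!P.second)
//       return P.first->second;   // a final answer, or an in-flight query
//     bool Result = computeAnswer(K);            // may recurse back here
//     Cache[K] = Result;                         // replace seed with answer
//     return Result;
//   }
// [end editor's note]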
+  if (A > B) std::swap(A, B);
+  std::pair<CachedResultsTy::iterator, bool> Pair =
+    CachedResults.insert(std::make_pair(ValuePairTy(A, B), true));
+  if (!Pair.second)
+    return Pair.first->second;
+
+  bool Result = relatedCheck(A, B);
+  CachedResults[ValuePairTy(A, B)] = Result;
+  return Result;
+}
+
+namespace {
+  // Sequence - A sequence of states that a pointer may go through in which an
+  // objc_retain and objc_release are actually needed.
+  enum Sequence {
+    S_None,
+    S_Retain,         ///< objc_retain(x)
+    S_CanRelease,     ///< foo(x) -- x could possibly see a ref count decrement
+    S_Use,            ///< any use of x
+    S_Stop,           ///< like S_Release, but code motion is stopped
+    S_Release,        ///< objc_release(x)
+    S_MovableRelease  ///< objc_release(x), !clang.imprecise_release
+  };
+}
+
+static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) {
+  // The easy cases.
+  if (A == B)
+    return A;
+  if (A == S_None || B == S_None)
+    return S_None;
+
+  // Note that we can't merge S_CanRelease and S_Use.
+  if (A > B) std::swap(A, B);
+  if (TopDown) {
+    // Choose the side which is further along in the sequence.
+    if (A == S_Retain && (B == S_CanRelease || B == S_Use))
+      return B;
+  } else {
+    // Choose the side which is further along in the sequence.
+    if ((A == S_Use || A == S_CanRelease) &&
+        (B == S_Release || B == S_Stop || B == S_MovableRelease))
+      return A;
+    // If both sides are releases, choose the more conservative one.
+    if (A == S_Stop && (B == S_Release || B == S_MovableRelease))
+      return A;
+    if (A == S_Release && B == S_MovableRelease)
+      return A;
+  }
+
+  return S_None;
+}
+
+namespace {
+  /// RRInfo - Unidirectional information about either a
+  /// retain-decrement-use-release sequence or release-use-decrement-retain
+  /// reverse sequence.
+  struct RRInfo {
+    /// KnownIncremented - After an objc_retain, the reference count of the
+    /// referenced object is known to be positive. Similarly, before an
+    /// objc_release, the reference count of the referenced object is known to
+    /// be positive. If there are retain-release pairs in code regions where the
+    /// retain count is known to be positive, they can be eliminated, regardless
+    /// of any side effects between them.
+    bool KnownIncremented;
+
+    /// IsRetainBlock - True if the Calls are objc_retainBlock calls (as
+    /// opposed to objc_retain calls).
+    bool IsRetainBlock;
+
+    /// IsTailCallRelease - True if the objc_release calls are all marked
+    /// with the "tail" keyword.
+    bool IsTailCallRelease;
+
+    /// ReleaseMetadata - If the Calls are objc_release calls and they all have
+    /// a clang.imprecise_release tag, this is the metadata tag.
+    MDNode *ReleaseMetadata;
+
+    /// Calls - For a top-down sequence, the set of objc_retains or
+    /// objc_retainBlocks. For bottom-up, the set of objc_releases.
+    SmallPtrSet<Instruction *, 2> Calls;
+
+    /// ReverseInsertPts - The set of optimal insert positions for
+    /// moving calls in the opposite sequence.
+    SmallPtrSet<Instruction *, 2> ReverseInsertPts;
+
+    RRInfo() :
+      KnownIncremented(false), IsRetainBlock(false), IsTailCallRelease(false),
+      ReleaseMetadata(0) {}
+
+    void clear();
+  };
+}
+
+void RRInfo::clear() {
+  KnownIncremented = false;
+  IsRetainBlock = false;
+  IsTailCallRelease = false;
+  ReleaseMetadata = 0;
+  Calls.clear();
+  ReverseInsertPts.clear();
+}
+
+namespace {
+  /// PtrState - This class summarizes several per-pointer runtime properties
+  /// which are propagated through the flow graph.
+  class PtrState {
+    /// RefCount - The known minimum number of reference count increments.
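// Editor's worked example (illustration only; not part of this commit): how
// MergeSeqs above resolves disagreements at CFG joins:
//
//   bottom-up:  MergeSeqs(S_Use, S_Release, false)            == S_Use
//               MergeSeqs(S_Release, S_MovableRelease, false) == S_Release
//               MergeSeqs(S_CanRelease, S_Use, false)         == S_None
//   top-down:   MergeSeqs(S_Retain, S_CanRelease, true)       == S_CanRelease
//
// Any combination without an explicit rule collapses to S_None, abandoning
// the in-progress sequence for that pointer.
// [end editor's note]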
+ unsigned RefCount; + + /// Seq - The current position in the sequence. + Sequence Seq; + + public: + /// RRI - Unidirectional information about the current sequence. + /// TODO: Encapsulate this better. + RRInfo RRI; + + PtrState() : RefCount(0), Seq(S_None) {} + + void IncrementRefCount() { + if (RefCount != UINT_MAX) ++RefCount; + } + + void DecrementRefCount() { + if (RefCount != 0) --RefCount; + } + + void ClearRefCount() { + RefCount = 0; + } + + bool IsKnownIncremented() const { + return RefCount > 0; + } + + void SetSeq(Sequence NewSeq) { + Seq = NewSeq; + } + + void SetSeqToRelease(MDNode *M) { + if (Seq == S_None || Seq == S_Use) { + Seq = M ? S_MovableRelease : S_Release; + RRI.ReleaseMetadata = M; + } else if (Seq != S_MovableRelease || RRI.ReleaseMetadata != M) { + Seq = S_Release; + RRI.ReleaseMetadata = 0; + } + } + + Sequence GetSeq() const { + return Seq; + } + + void ClearSequenceProgress() { + Seq = S_None; + RRI.clear(); + } + + void Merge(const PtrState &Other, bool TopDown); + }; +} + +void +PtrState::Merge(const PtrState &Other, bool TopDown) { + Seq = MergeSeqs(Seq, Other.Seq, TopDown); + RefCount = std::min(RefCount, Other.RefCount); + + // We can't merge a plain objc_retain with an objc_retainBlock. + if (RRI.IsRetainBlock != Other.RRI.IsRetainBlock) + Seq = S_None; + + if (Seq == S_None) { + RRI.clear(); + } else { + // Conservatively merge the ReleaseMetadata information. + if (RRI.ReleaseMetadata != Other.RRI.ReleaseMetadata) + RRI.ReleaseMetadata = 0; + + RRI.KnownIncremented = RRI.KnownIncremented && Other.RRI.KnownIncremented; + RRI.IsTailCallRelease = RRI.IsTailCallRelease && Other.RRI.IsTailCallRelease; + RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end()); + RRI.ReverseInsertPts.insert(Other.RRI.ReverseInsertPts.begin(), + Other.RRI.ReverseInsertPts.end()); + } +} + +namespace { + /// BBState - Per-BasicBlock state. + class BBState { + /// TopDownPathCount - The number of unique control paths from the entry + /// which can reach this block. + unsigned TopDownPathCount; + + /// BottomUpPathCount - The number of unique control paths to exits + /// from this block. + unsigned BottomUpPathCount; + + /// MapTy - A type for PerPtrTopDown and PerPtrBottomUp. + typedef MapVector<const Value *, PtrState> MapTy; + + /// PerPtrTopDown - The top-down traversal uses this to record information + /// known about a pointer at the bottom of each block. + MapTy PerPtrTopDown; + + /// PerPtrBottomUp - The bottom-up traversal uses this to record information + /// known about a pointer at the top of each block. + MapTy PerPtrBottomUp; + + public: + BBState() : TopDownPathCount(0), BottomUpPathCount(0) {} + + typedef MapTy::iterator ptr_iterator; + typedef MapTy::const_iterator ptr_const_iterator; + + ptr_iterator top_down_ptr_begin() { return PerPtrTopDown.begin(); } + ptr_iterator top_down_ptr_end() { return PerPtrTopDown.end(); } + ptr_const_iterator top_down_ptr_begin() const { + return PerPtrTopDown.begin(); + } + ptr_const_iterator top_down_ptr_end() const { + return PerPtrTopDown.end(); + } + + ptr_iterator bottom_up_ptr_begin() { return PerPtrBottomUp.begin(); } + ptr_iterator bottom_up_ptr_end() { return PerPtrBottomUp.end(); } + ptr_const_iterator bottom_up_ptr_begin() const { + return PerPtrBottomUp.begin(); + } + ptr_const_iterator bottom_up_ptr_end() const { + return PerPtrBottomUp.end(); + } + + /// SetAsEntry - Mark this block as being an entry block, which has one + /// path from the entry by definition. 
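// Editor's sketch (illustration only; not part of this commit): how
// PtrState::Merge above degrades release information at a join. ImpreciseMD
// stands for a clang.imprecise_release MDNode seen on only one path:
//
//   PtrState X, Y;
//   X.SetSeqToRelease(ImpreciseMD);  // X: S_MovableRelease, metadata kept
//   Y.SetSeqToRelease(0);            // Y: S_Release, no metadata
//   X.Merge(Y, /*TopDown=*/false);   // MergeSeqs keeps the conservative
//                                    // S_Release, and the differing
//                                    // ReleaseMetadata is cleared to null.
// [end editor's note]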
+    void SetAsEntry() { TopDownPathCount = 1; }
+
+    /// SetAsExit - Mark this block as being an exit block, which has one
+    /// path to an exit by definition.
+    void SetAsExit() { BottomUpPathCount = 1; }
+
+    PtrState &getPtrTopDownState(const Value *Arg) {
+      return PerPtrTopDown[Arg];
+    }
+
+    PtrState &getPtrBottomUpState(const Value *Arg) {
+      return PerPtrBottomUp[Arg];
+    }
+
+    void clearBottomUpPointers() {
+      PerPtrBottomUp.clear();
+    }
+
+    void clearTopDownPointers() {
+      PerPtrTopDown.clear();
+    }
+
+    void InitFromPred(const BBState &Other);
+    void InitFromSucc(const BBState &Other);
+    void MergePred(const BBState &Other);
+    void MergeSucc(const BBState &Other);
+
+    /// GetAllPathCount - Return the number of possible unique paths from an
+    /// entry to an exit which pass through this block. This is only valid
+    /// after both the top-down and bottom-up traversals are complete.
+    unsigned GetAllPathCount() const {
+      return TopDownPathCount * BottomUpPathCount;
+    }
+  };
+}
+
+void BBState::InitFromPred(const BBState &Other) {
+  PerPtrTopDown = Other.PerPtrTopDown;
+  TopDownPathCount = Other.TopDownPathCount;
+}
+
+void BBState::InitFromSucc(const BBState &Other) {
+  PerPtrBottomUp = Other.PerPtrBottomUp;
+  BottomUpPathCount = Other.BottomUpPathCount;
+}
+
+/// MergePred - The top-down traversal uses this to merge information about
+/// predecessors to form the initial state for a new block.
+void BBState::MergePred(const BBState &Other) {
+  // Other.TopDownPathCount can be 0, in which case it is either dead or a
+  // loop backedge. Loop backedges are special.
+  TopDownPathCount += Other.TopDownPathCount;
+
+  // For each entry in the other set, if our set has an entry with the same key,
+  // merge the entries. Otherwise, copy the entry and merge it with an empty
+  // entry.
+  for (ptr_const_iterator MI = Other.top_down_ptr_begin(),
+       ME = Other.top_down_ptr_end(); MI != ME; ++MI) {
+    std::pair<ptr_iterator, bool> Pair = PerPtrTopDown.insert(*MI);
+    Pair.first->second.Merge(Pair.second ? PtrState() : MI->second,
+                             /*TopDown=*/true);
+  }
+
+  // For each entry in our set, if the other set doesn't have an entry with the
+  // same key, force it to merge with an empty entry.
+  for (ptr_iterator MI = top_down_ptr_begin(),
+       ME = top_down_ptr_end(); MI != ME; ++MI)
+    if (Other.PerPtrTopDown.find(MI->first) == Other.PerPtrTopDown.end())
+      MI->second.Merge(PtrState(), /*TopDown=*/true);
+}
+
+/// MergeSucc - The bottom-up traversal uses this to merge information about
+/// successors to form the initial state for a new block.
+void BBState::MergeSucc(const BBState &Other) {
+  // Other.BottomUpPathCount can be 0, in which case it is either dead or a
+  // loop backedge. Loop backedges are special.
+  BottomUpPathCount += Other.BottomUpPathCount;
+
+  // For each entry in the other set, if our set has an entry with the
+  // same key, merge the entries. Otherwise, copy the entry and merge
+  // it with an empty entry.
+  for (ptr_const_iterator MI = Other.bottom_up_ptr_begin(),
+       ME = Other.bottom_up_ptr_end(); MI != ME; ++MI) {
+    std::pair<ptr_iterator, bool> Pair = PerPtrBottomUp.insert(*MI);
+    Pair.first->second.Merge(Pair.second ? PtrState() : MI->second,
+                             /*TopDown=*/false);
+  }
+
+  // For each entry in our set, if the other set doesn't have an entry
+  // with the same key, force it to merge with an empty entry.
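// Editor's worked example (illustration only; not part of this commit):
// path counts on a diamond CFG (Entry -> A, Entry -> B, A -> Exit,
// B -> Exit):
//
//   BBState Entry, A, B, Exit;
//   Entry.SetAsEntry();      // TopDownPathCount(Entry) = 1
//   Exit.SetAsExit();        // BottomUpPathCount(Exit) = 1
//   A.InitFromPred(Entry);   // TopDownPathCount(A) = 1
//   B.InitFromPred(Entry);   // TopDownPathCount(B) = 1
//   Exit.InitFromPred(A);
//   Exit.MergePred(B);       // TopDownPathCount(Exit) = 1 + 1 = 2
//
// After the symmetric bottom-up pass, Exit.GetAllPathCount() is 2 * 1 = 2:
// both Entry->A->Exit and Entry->B->Exit pass through Exit, while
// A.GetAllPathCount() is 1 * 1 = 1.
// [end editor's note]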
+  for (ptr_iterator MI = bottom_up_ptr_begin(),
+       ME = bottom_up_ptr_end(); MI != ME; ++MI)
+    if (Other.PerPtrBottomUp.find(MI->first) == Other.PerPtrBottomUp.end())
+      MI->second.Merge(PtrState(), /*TopDown=*/false);
+}
+
+namespace {
+  /// ObjCARCOpt - The main ARC optimization pass.
+  class ObjCARCOpt : public FunctionPass {
+    bool Changed;
+    ProvenanceAnalysis PA;
+
+    /// RetainFunc, ReleaseFunc, etc. - Declarations for objc_retain,
+    /// objc_retainBlock, and objc_release.
+    Function *RetainFunc, *RetainBlockFunc, *RetainRVFunc, *ReleaseFunc,
+             *AutoreleaseFunc;
+
+    /// RetainRVCallee, etc. - Declarations for ObjC runtime
+    /// functions, for use in creating calls to them. These are initialized
+    /// lazily to avoid cluttering up the Module with unused declarations.
+    Constant *RetainRVCallee, *AutoreleaseRVCallee, *ReleaseCallee,
+             *RetainCallee, *AutoreleaseCallee;
+
+    /// UsedInThisFunction - Flags which determine whether each of the
+    /// interesting runtime functions is in fact used in the current function.
+    unsigned UsedInThisFunction;
+
+    /// ImpreciseReleaseMDKind - The Metadata Kind for clang.imprecise_release
+    /// metadata.
+    unsigned ImpreciseReleaseMDKind;
+
+    Constant *getRetainRVCallee(Module *M);
+    Constant *getAutoreleaseRVCallee(Module *M);
+    Constant *getReleaseCallee(Module *M);
+    Constant *getRetainCallee(Module *M);
+    Constant *getAutoreleaseCallee(Module *M);
+
+    void OptimizeRetainCall(Function &F, Instruction *Retain);
+    bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV);
+    void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV);
+    void OptimizeIndividualCalls(Function &F);
+
+    void CheckForCFGHazards(const BasicBlock *BB,
+                            DenseMap<const BasicBlock *, BBState> &BBStates,
+                            BBState &MyStates) const;
+    bool VisitBottomUp(BasicBlock *BB,
+                       DenseMap<const BasicBlock *, BBState> &BBStates,
+                       MapVector<Value *, RRInfo> &Retains);
+    bool VisitTopDown(BasicBlock *BB,
+                      DenseMap<const BasicBlock *, BBState> &BBStates,
+                      DenseMap<Value *, RRInfo> &Releases);
+    bool Visit(Function &F,
+               DenseMap<const BasicBlock *, BBState> &BBStates,
+               MapVector<Value *, RRInfo> &Retains,
+               DenseMap<Value *, RRInfo> &Releases);
+
+    void MoveCalls(Value *Arg, RRInfo &RetainsToMove, RRInfo &ReleasesToMove,
+                   MapVector<Value *, RRInfo> &Retains,
+                   DenseMap<Value *, RRInfo> &Releases,
+                   SmallVectorImpl<Instruction *> &DeadInsts);
+
+    bool PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates,
+                              MapVector<Value *, RRInfo> &Retains,
+                              DenseMap<Value *, RRInfo> &Releases);
+
+    void OptimizeWeakCalls(Function &F);
+
+    bool OptimizeSequences(Function &F);
+
+    void OptimizeReturns(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+    virtual bool doInitialization(Module &M);
+    virtual bool runOnFunction(Function &F);
+    virtual void releaseMemory();
+
+  public:
+    static char ID;
+    ObjCARCOpt() : FunctionPass(ID) {
+      initializeObjCARCOptPass(*PassRegistry::getPassRegistry());
+    }
+  };
+}
+
+char ObjCARCOpt::ID = 0;
+INITIALIZE_PASS_BEGIN(ObjCARCOpt,
+                      "objc-arc", "ObjC ARC optimization", false, false)
+INITIALIZE_PASS_DEPENDENCY(ObjCARCAliasAnalysis)
+INITIALIZE_PASS_END(ObjCARCOpt,
+                    "objc-arc", "ObjC ARC optimization", false, false)
+
+Pass *llvm::createObjCARCOptPass() {
+  return new ObjCARCOpt();
+}
+
+void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<ObjCARCAliasAnalysis>();
+  AU.addRequired<AliasAnalysis>();
+  // ARC optimization doesn't currently split critical edges.
+ AU.setPreservesCFG(); +} + +Constant *ObjCARCOpt::getRetainRVCallee(Module *M) { + if (!RetainRVCallee) { + LLVMContext &C = M->getContext(); + const Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + std::vector<const Type *> Params; + Params.push_back(I8X); + const FunctionType *FTy = + FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttrListPtr Attributes; + Attributes.addAttr(~0u, Attribute::NoUnwind); + RetainRVCallee = + M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy, + Attributes); + } + return RetainRVCallee; +} + +Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) { + if (!AutoreleaseRVCallee) { + LLVMContext &C = M->getContext(); + const Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + std::vector<const Type *> Params; + Params.push_back(I8X); + const FunctionType *FTy = + FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttrListPtr Attributes; + Attributes.addAttr(~0u, Attribute::NoUnwind); + AutoreleaseRVCallee = + M->getOrInsertFunction("objc_autoreleaseReturnValue", FTy, + Attributes); + } + return AutoreleaseRVCallee; +} + +Constant *ObjCARCOpt::getReleaseCallee(Module *M) { + if (!ReleaseCallee) { + LLVMContext &C = M->getContext(); + std::vector<const Type *> Params; + Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C))); + AttrListPtr Attributes; + Attributes.addAttr(~0u, Attribute::NoUnwind); + ReleaseCallee = + M->getOrInsertFunction( + "objc_release", + FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false), + Attributes); + } + return ReleaseCallee; +} + +Constant *ObjCARCOpt::getRetainCallee(Module *M) { + if (!RetainCallee) { + LLVMContext &C = M->getContext(); + std::vector<const Type *> Params; + Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C))); + AttrListPtr Attributes; + Attributes.addAttr(~0u, Attribute::NoUnwind); + RetainCallee = + M->getOrInsertFunction( + "objc_retain", + FunctionType::get(Params[0], Params, /*isVarArg=*/false), + Attributes); + } + return RetainCallee; +} + +Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) { + if (!AutoreleaseCallee) { + LLVMContext &C = M->getContext(); + std::vector<const Type *> Params; + Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C))); + AttrListPtr Attributes; + Attributes.addAttr(~0u, Attribute::NoUnwind); + AutoreleaseCallee = + M->getOrInsertFunction( + "objc_autorelease", + FunctionType::get(Params[0], Params, /*isVarArg=*/false), + Attributes); + } + return AutoreleaseCallee; +} + +/// CanAlterRefCount - Test whether the given instruction can result in a +/// reference count modification (positive or negative) for the pointer's +/// object. +static bool +CanAlterRefCount(const Instruction *Inst, const Value *Ptr, + ProvenanceAnalysis &PA, InstructionClass Class) { + switch (Class) { + case IC_Autorelease: + case IC_AutoreleaseRV: + case IC_User: + // These operations never directly modify a reference count. + return false; + default: break; + } + + ImmutableCallSite CS = static_cast<const Value *>(Inst); + assert(CS && "Only calls can alter reference counts!"); + + // See if AliasAnalysis can help us with the call. 
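// Editor's note (illustration only; not part of this commit): the callee
// getters above materialize declarations only on first use, so a function
// with no ARC rewrites adds nothing to its Module. The first call to
// getRetainCallee(M), for example, inserts the equivalent of
//
//   declare i8* @objc_retain(i8*) nounwind
//
// into M; the addAttr(~0u, ...) index applies the attribute to the function
// itself rather than to any parameter.
// [end editor's note]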
+ AliasAnalysis::ModRefBehavior MRB = PA.getAA()->getModRefBehavior(CS); + if (AliasAnalysis::onlyReadsMemory(MRB)) + return false; + if (AliasAnalysis::onlyAccessesArgPointees(MRB)) { + for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); + I != E; ++I) { + const Value *Op = *I; + if (IsPotentialUse(Op) && PA.related(Ptr, Op)) + return true; + } + return false; + } + + // Assume the worst. + return true; +} + +/// CanUse - Test whether the given instruction can "use" the given pointer's +/// object in a way that requires the reference count to be positive. +static bool +CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA, + InstructionClass Class) { + // IC_Call operations (as opposed to IC_CallOrUser) never "use" objc pointers. + if (Class == IC_Call) + return false; + + // Consider various instructions which may have pointer arguments which are + // not "uses". + if (const ICmpInst *ICI = dyn_cast<ICmpInst>(Inst)) { + // Comparing a pointer with null, or any other constant, isn't really a use, + // because we don't care what the pointer points to, or about the values + // of any other dynamic reference-counted pointers. + if (!IsPotentialUse(ICI->getOperand(1))) + return false; + } else if (ImmutableCallSite CS = static_cast<const Value *>(Inst)) { + // For calls, just check the arguments (and not the callee operand). + for (ImmutableCallSite::arg_iterator OI = CS.arg_begin(), + OE = CS.arg_end(); OI != OE; ++OI) { + const Value *Op = *OI; + if (IsPotentialUse(Op) && PA.related(Ptr, Op)) + return true; + } + return false; + } else if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + // Special-case stores, because we don't care about the stored value, just + // the store address. + const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand()); + // If we can't tell what the underlying object was, assume there is a + // dependence. + return IsPotentialUse(Op) && PA.related(Op, Ptr); + } + + // Check each operand for a match. + for (User::const_op_iterator OI = Inst->op_begin(), OE = Inst->op_end(); + OI != OE; ++OI) { + const Value *Op = *OI; + if (IsPotentialUse(Op) && PA.related(Ptr, Op)) + return true; + } + return false; +} + +/// CanInterruptRV - Test whether the given instruction can autorelease +/// any pointer or cause an autoreleasepool pop. +static bool +CanInterruptRV(InstructionClass Class) { + switch (Class) { + case IC_AutoreleasepoolPop: + case IC_CallOrUser: + case IC_Call: + case IC_Autorelease: + case IC_AutoreleaseRV: + case IC_FusedRetainAutorelease: + case IC_FusedRetainAutoreleaseRV: + return true; + default: + return false; + } +} + +namespace { + /// DependenceKind - There are several kinds of dependence-like concepts in + /// use here. + enum DependenceKind { + NeedsPositiveRetainCount, + CanChangeRetainCount, + RetainAutoreleaseDep, ///< Blocks objc_retainAutorelease. + RetainAutoreleaseRVDep, ///< Blocks objc_retainAutoreleaseReturnValue. + RetainRVDep ///< Blocks objc_retainAutoreleasedReturnValue. + }; +} + +/// Depends - Test if there can be dependencies on Inst through Arg. This +/// function only tests dependencies relevant for removing pairs of calls. +static bool +Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg, + ProvenanceAnalysis &PA) { + // If we've reached the definition of Arg, stop. 
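// Editor's illustration (not part of this commit): what CanUse above does
// and does not treat as a "use" of Ptr = %x:
//
//   %c = icmp eq i8* %x, null    ; not a use: operand 1 is a constant
//   %c = icmp eq i8* %x, %y      ; checked: %y may be reference-counted
//   store i8* %v, i8** %slot     ; a use only if %slot's underlying object
//                                ; is related to %x; the stored value is
//                                ; deliberately ignored
//   call void @g(i8* %x)         ; checked against the arguments only, and
//                                ; never a use when the class is IC_Call
//
// (@g is a placeholder for an arbitrary external function.)
// [end editor's note]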
+ if (Inst == Arg) + return true; + + switch (Flavor) { + case NeedsPositiveRetainCount: { + InstructionClass Class = GetInstructionClass(Inst); + switch (Class) { + case IC_AutoreleasepoolPop: + case IC_AutoreleasepoolPush: + case IC_None: + return false; + default: + return CanUse(Inst, Arg, PA, Class); + } + } + + case CanChangeRetainCount: { + InstructionClass Class = GetInstructionClass(Inst); + switch (Class) { + case IC_AutoreleasepoolPop: + // Conservatively assume this can decrement any count. + return true; + case IC_AutoreleasepoolPush: + case IC_None: + return false; + default: + return CanAlterRefCount(Inst, Arg, PA, Class); + } + } + + case RetainAutoreleaseDep: + switch (GetBasicInstructionClass(Inst)) { + case IC_AutoreleasepoolPop: + // Don't merge an objc_autorelease with an objc_retain inside a different + // autoreleasepool scope. + return true; + case IC_Retain: + case IC_RetainRV: + // Check for a retain of the same pointer for merging. + return GetObjCArg(Inst) == Arg; + default: + // Nothing else matters for objc_retainAutorelease formation. + return false; + } + break; + + case RetainAutoreleaseRVDep: { + InstructionClass Class = GetBasicInstructionClass(Inst); + switch (Class) { + case IC_Retain: + case IC_RetainRV: + // Check for a retain of the same pointer for merging. + return GetObjCArg(Inst) == Arg; + default: + // Anything that can autorelease interrupts + // retainAutoreleaseReturnValue formation. + return CanInterruptRV(Class); + } + break; + } + + case RetainRVDep: + return CanInterruptRV(GetBasicInstructionClass(Inst)); + } + + llvm_unreachable("Invalid dependence flavor"); + return true; +} + +/// FindDependencies - Walk up the CFG from StartPos (which is in StartBB) and +/// find local and non-local dependencies on Arg. +/// TODO: Cache results? +static void +FindDependencies(DependenceKind Flavor, + const Value *Arg, + BasicBlock *StartBB, Instruction *StartInst, + SmallPtrSet<Instruction *, 4> &DependingInstructions, + SmallPtrSet<const BasicBlock *, 4> &Visited, + ProvenanceAnalysis &PA) { + BasicBlock::iterator StartPos = StartInst; + + SmallVector<std::pair<BasicBlock *, BasicBlock::iterator>, 4> Worklist; + Worklist.push_back(std::make_pair(StartBB, StartPos)); + do { + std::pair<BasicBlock *, BasicBlock::iterator> Pair = + Worklist.pop_back_val(); + BasicBlock *LocalStartBB = Pair.first; + BasicBlock::iterator LocalStartPos = Pair.second; + BasicBlock::iterator StartBBBegin = LocalStartBB->begin(); + for (;;) { + if (LocalStartPos == StartBBBegin) { + pred_iterator PI(LocalStartBB), PE(LocalStartBB, false); + if (PI == PE) + // If we've reached the function entry, produce a null dependence. + DependingInstructions.insert(0); + else + // Add the predecessors to the worklist. + do { + BasicBlock *PredBB = *PI; + if (Visited.insert(PredBB)) + Worklist.push_back(std::make_pair(PredBB, PredBB->end())); + } while (++PI != PE); + break; + } + + Instruction *Inst = --LocalStartPos; + if (Depends(Flavor, Inst, Arg, PA)) { + DependingInstructions.insert(Inst); + break; + } + } + } while (!Worklist.empty()); + + // Determine whether the original StartBB post-dominates all of the blocks we + // visited. If not, insert a sentinal indicating that most optimizations are + // not safe. 
+ for (SmallPtrSet<const BasicBlock *, 4>::const_iterator I = Visited.begin(), + E = Visited.end(); I != E; ++I) { + const BasicBlock *BB = *I; + if (BB == StartBB) + continue; + const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); + for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) { + const BasicBlock *Succ = *SI; + if (Succ != StartBB && !Visited.count(Succ)) { + DependingInstructions.insert(reinterpret_cast<Instruction *>(-1)); + return; + } + } + } +} + +static bool isNullOrUndef(const Value *V) { + return isa<ConstantPointerNull>(V) || isa<UndefValue>(V); +} + +static bool isNoopInstruction(const Instruction *I) { + return isa<BitCastInst>(I) || + (isa<GetElementPtrInst>(I) && + cast<GetElementPtrInst>(I)->hasAllZeroIndices()); +} + +/// OptimizeRetainCall - Turn objc_retain into +/// objc_retainAutoreleasedReturnValue if the operand is a return value. +void +ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) { + CallSite CS(GetObjCArg(Retain)); + Instruction *Call = CS.getInstruction(); + if (!Call) return; + if (Call->getParent() != Retain->getParent()) return; + + // Check that the call is next to the retain. + BasicBlock::iterator I = Call; + ++I; + while (isNoopInstruction(I)) ++I; + if (&*I != Retain) + return; + + // Turn it to an objc_retainAutoreleasedReturnValue.. + Changed = true; + ++NumPeeps; + cast<CallInst>(Retain)->setCalledFunction(getRetainRVCallee(F.getParent())); +} + +/// OptimizeRetainRVCall - Turn objc_retainAutoreleasedReturnValue into +/// objc_retain if the operand is not a return value. Or, if it can be +/// paired with an objc_autoreleaseReturnValue, delete the pair and +/// return true. +bool +ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { + // Check for the argument being from an immediately preceding call. + Value *Arg = GetObjCArg(RetainRV); + CallSite CS(Arg); + if (Instruction *Call = CS.getInstruction()) + if (Call->getParent() == RetainRV->getParent()) { + BasicBlock::iterator I = Call; + ++I; + while (isNoopInstruction(I)) ++I; + if (&*I == RetainRV) + return false; + } + + // Check for being preceded by an objc_autoreleaseReturnValue on the same + // pointer. In this case, we can delete the pair. + BasicBlock::iterator I = RetainRV, Begin = RetainRV->getParent()->begin(); + if (I != Begin) { + do --I; while (I != Begin && isNoopInstruction(I)); + if (GetBasicInstructionClass(I) == IC_AutoreleaseRV && + GetObjCArg(I) == Arg) { + Changed = true; + ++NumPeeps; + EraseInstruction(I); + EraseInstruction(RetainRV); + return true; + } + } + + // Turn it to a plain objc_retain. + Changed = true; + ++NumPeeps; + cast<CallInst>(RetainRV)->setCalledFunction(getRetainCallee(F.getParent())); + return false; +} + +/// OptimizeAutoreleaseRVCall - Turn objc_autoreleaseReturnValue into +/// objc_autorelease if the result is not used as a return value. +void +ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV) { + // Check for a return of the pointer value. + const Value *Ptr = GetObjCArg(AutoreleaseRV); + for (Value::const_use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end(); + UI != UE; ++UI) { + const User *I = *UI; + if (isa<ReturnInst>(I) || GetBasicInstructionClass(I) == IC_RetainRV) + return; + } + + Changed = true; + ++NumPeeps; + cast<CallInst>(AutoreleaseRV)-> + setCalledFunction(getAutoreleaseCallee(F.getParent())); +} + +/// OptimizeIndividualCalls - Visit each call, one at a time, and make +/// simplifications without doing any additional analysis. 
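// Editor's illustration (not part of this commit): the pair deleted by
// OptimizeRetainRVCall above. After inlining, a callee's autoreleaseRV can
// end up immediately before the caller's retainRV on the same pointer:
//
//   %0 = call i8* @objc_autoreleaseReturnValue(i8* %p)
//   %1 = call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
//
// The two transfers cancel and both calls are erased. A retainRV whose
// operand is not the immediately preceding call's result is instead
// downgraded to a plain objc_retain.
// [end editor's note]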
+void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
+  // Reset all the flags in preparation for recomputing them.
+  UsedInThisFunction = 0;
+
+  // Visit all objc_* calls in F.
+  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+    Instruction *Inst = &*I++;
+    InstructionClass Class = GetBasicInstructionClass(Inst);
+
+    switch (Class) {
+    default: break;
+
+    // Delete no-op casts. These function calls have special semantics, but
+    // the semantics are entirely implemented via lowering in the front-end,
+    // so by the time they reach the optimizer, they are just no-op calls
+    // which return their argument.
+    //
+    // There are gray areas here, as the ability to cast reference-counted
+    // pointers to raw void* and back allows code to break ARC assumptions;
+    // however, these are currently considered to be unimportant.
+    case IC_NoopCast:
+      Changed = true;
+      ++NumNoops;
+      EraseInstruction(Inst);
+      continue;
+
+    // If the pointer-to-weak-pointer is null, it's undefined behavior.
+    case IC_StoreWeak:
+    case IC_LoadWeak:
+    case IC_LoadWeakRetained:
+    case IC_InitWeak:
+    case IC_DestroyWeak: {
+      CallInst *CI = cast<CallInst>(Inst);
+      if (isNullOrUndef(CI->getArgOperand(0))) {
+        const Type *Ty = CI->getArgOperand(0)->getType();
+        new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
+                      Constant::getNullValue(Ty),
+                      CI);
+        CI->replaceAllUsesWith(UndefValue::get(CI->getType()));
+        CI->eraseFromParent();
+        continue;
+      }
+      break;
+    }
+    case IC_CopyWeak:
+    case IC_MoveWeak: {
+      CallInst *CI = cast<CallInst>(Inst);
+      if (isNullOrUndef(CI->getArgOperand(0)) ||
+          isNullOrUndef(CI->getArgOperand(1))) {
+        const Type *Ty = CI->getArgOperand(0)->getType();
+        new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
+                      Constant::getNullValue(Ty),
+                      CI);
+        CI->replaceAllUsesWith(UndefValue::get(CI->getType()));
+        CI->eraseFromParent();
+        continue;
+      }
+      break;
+    }
+    case IC_Retain:
+      OptimizeRetainCall(F, Inst);
+      break;
+    case IC_RetainRV:
+      if (OptimizeRetainRVCall(F, Inst))
+        continue;
+      break;
+    case IC_AutoreleaseRV:
+      OptimizeAutoreleaseRVCall(F, Inst);
+      break;
+    }
+
+    // objc_autorelease(x) -> objc_release(x) if x is otherwise unused.
+    if (IsAutorelease(Class) && Inst->use_empty()) {
+      CallInst *Call = cast<CallInst>(Inst);
+      const Value *Arg = Call->getArgOperand(0);
+      Arg = FindSingleUseIdentifiedObject(Arg);
+      if (Arg) {
+        Changed = true;
+        ++NumAutoreleases;
+
+        // Create the declaration lazily.
+        LLVMContext &C = Inst->getContext();
+        CallInst *NewCall =
+          CallInst::Create(getReleaseCallee(F.getParent()),
+                           Call->getArgOperand(0), "", Call);
+        NewCall->setMetadata(ImpreciseReleaseMDKind,
+                             MDNode::get(C, ArrayRef<Value *>()));
+        EraseInstruction(Call);
+        Inst = NewCall;
+        Class = IC_Release;
+      }
+    }
+
+    // For functions which can never be passed stack arguments, add
+    // a tail keyword.
+    if (IsAlwaysTail(Class)) {
+      Changed = true;
+      cast<CallInst>(Inst)->setTailCall();
+    }
+
+    // Set nounwind as needed.
+    if (IsNoThrow(Class)) {
+      Changed = true;
+      cast<CallInst>(Inst)->setDoesNotThrow();
+    }
+
+    if (!IsNoopOnNull(Class)) {
+      UsedInThisFunction |= 1 << Class;
+      continue;
+    }
+
+    const Value *Arg = GetObjCArg(Inst);
+
+    // ARC calls with null are no-ops. Delete them.
+    if (isNullOrUndef(Arg)) {
+      Changed = true;
+      ++NumNoops;
+      EraseInstruction(Inst);
+      continue;
+    }
+
+    // Keep track of which of retain, release, autorelease, and retain_block
+    // are actually present in this function.
+ UsedInThisFunction |= 1 << Class; + + // If Arg is a PHI, and one or more incoming values to the + // PHI are null, and the call is control-equivalent to the PHI, and there + // are no relevant side effects between the PHI and the call, the call + // could be pushed up to just those paths with non-null incoming values. + // For now, don't bother splitting critical edges for this. + SmallVector<std::pair<Instruction *, const Value *>, 4> Worklist; + Worklist.push_back(std::make_pair(Inst, Arg)); + do { + std::pair<Instruction *, const Value *> Pair = Worklist.pop_back_val(); + Inst = Pair.first; + Arg = Pair.second; + + const PHINode *PN = dyn_cast<PHINode>(Arg); + if (!PN) continue; + + // Determine if the PHI has any null operands, or any incoming + // critical edges. + bool HasNull = false; + bool HasCriticalEdges = false; + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = + StripPointerCastsAndObjCCalls(PN->getIncomingValue(i)); + if (isNullOrUndef(Incoming)) + HasNull = true; + else if (cast<TerminatorInst>(PN->getIncomingBlock(i)->back()) + .getNumSuccessors() != 1) { + HasCriticalEdges = true; + break; + } + } + // If we have null operands and no critical edges, optimize. + if (!HasCriticalEdges && HasNull) { + SmallPtrSet<Instruction *, 4> DependingInstructions; + SmallPtrSet<const BasicBlock *, 4> Visited; + + // Check that there is nothing that cares about the reference + // count between the call and the phi. + FindDependencies(NeedsPositiveRetainCount, Arg, + Inst->getParent(), Inst, + DependingInstructions, Visited, PA); + if (DependingInstructions.size() == 1 && + *DependingInstructions.begin() == PN) { + Changed = true; + ++NumPartialNoops; + // Clone the call into each predecessor that has a non-null value. + CallInst *CInst = cast<CallInst>(Inst); + const Type *ParamTy = CInst->getArgOperand(0)->getType(); + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = + StripPointerCastsAndObjCCalls(PN->getIncomingValue(i)); + if (!isNullOrUndef(Incoming)) { + CallInst *Clone = cast<CallInst>(CInst->clone()); + Value *Op = PN->getIncomingValue(i); + Instruction *InsertPos = &PN->getIncomingBlock(i)->back(); + if (Op->getType() != ParamTy) + Op = new BitCastInst(Op, ParamTy, "", InsertPos); + Clone->setArgOperand(0, Op); + Clone->insertBefore(InsertPos); + Worklist.push_back(std::make_pair(Clone, Incoming)); + } + } + // Erase the original call. + EraseInstruction(CInst); + continue; + } + } + } while (!Worklist.empty()); + } +} + +/// CheckForCFGHazards - Check for critical edges, loop boundaries, irreducible +/// control flow, or other CFG structures where moving code across the edge +/// would result in it being executed more. +void +ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, + DenseMap<const BasicBlock *, BBState> &BBStates, + BBState &MyStates) const { + // If any top-down local-use or possible-dec has a succ which is earlier in + // the sequence, forget it. 
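+  // For example (illustrative): on a loop backedge, a pointer that is in
+  // S_Use state here may flow into a successor whose bottom-up state is
+  // earlier in the retain->release sequence; moving calls across such an
+  // edge could execute them more often, so the sequence progress is cleared.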
+  for (BBState::ptr_const_iterator I = MyStates.top_down_ptr_begin(),
+       E = MyStates.top_down_ptr_end(); I != E; ++I)
+    switch (I->second.GetSeq()) {
+    default: break;
+    case S_Use: {
+      const Value *Arg = I->first;
+      const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
+      bool SomeSuccHasSame = false;
+      bool AllSuccsHaveSame = true;
+      for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI)
+        switch (BBStates[*SI].getPtrBottomUpState(Arg).GetSeq()) {
+        case S_None:
+        case S_CanRelease:
+          MyStates.getPtrTopDownState(Arg).ClearSequenceProgress();
+          SomeSuccHasSame = false;
+          break;
+        case S_Use:
+          SomeSuccHasSame = true;
+          break;
+        case S_Stop:
+        case S_Release:
+        case S_MovableRelease:
+          AllSuccsHaveSame = false;
+          break;
+        case S_Retain:
+          llvm_unreachable("bottom-up pointer in retain state!");
+        }
+      // If the state at the other end of any of the successor edges
+      // matches the current state, require all edges to match. This
+      // guards against loops in the middle of a sequence.
+      if (SomeSuccHasSame && !AllSuccsHaveSame)
+        MyStates.getPtrTopDownState(Arg).ClearSequenceProgress();
+      break;
+    }
+    case S_CanRelease: {
+      const Value *Arg = I->first;
+      const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
+      bool SomeSuccHasSame = false;
+      bool AllSuccsHaveSame = true;
+      for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI)
+        switch (BBStates[*SI].getPtrBottomUpState(Arg).GetSeq()) {
+        case S_None:
+          MyStates.getPtrTopDownState(Arg).ClearSequenceProgress();
+          SomeSuccHasSame = false;
+          break;
+        case S_CanRelease:
+          SomeSuccHasSame = true;
+          break;
+        case S_Stop:
+        case S_Release:
+        case S_MovableRelease:
+        case S_Use:
+          AllSuccsHaveSame = false;
+          break;
+        case S_Retain:
+          llvm_unreachable("bottom-up pointer in retain state!");
+        }
+      // If the state at the other end of any of the successor edges
+      // matches the current state, require all edges to match. This
+      // guards against loops in the middle of a sequence.
+      if (SomeSuccHasSame && !AllSuccsHaveSame)
+        MyStates.getPtrTopDownState(Arg).ClearSequenceProgress();
+    }
+    }
+}
+
+bool
+ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
+                          DenseMap<const BasicBlock *, BBState> &BBStates,
+                          MapVector<Value *, RRInfo> &Retains) {
+  bool NestingDetected = false;
+  BBState &MyStates = BBStates[BB];
+
+  // Merge the states from each successor to compute the initial state
+  // for the current block.
+  const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
+  succ_const_iterator SI(TI), SE(TI, false);
+  if (SI == SE)
+    MyStates.SetAsExit();
+  else
+    do {
+      const BasicBlock *Succ = *SI++;
+      if (Succ == BB)
+        continue;
+      DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
+      if (I == BBStates.end())
+        continue;
+      MyStates.InitFromSucc(I->second);
+      while (SI != SE) {
+        Succ = *SI++;
+        if (Succ != BB) {
+          I = BBStates.find(Succ);
+          if (I != BBStates.end())
+            MyStates.MergeSucc(I->second);
+        }
+      }
+      break;
+    } while (SI != SE);
+
+  // Visit all the instructions, bottom-up.
+  for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) {
+    Instruction *Inst = llvm::prior(I);
+    InstructionClass Class = GetInstructionClass(Inst);
+    const Value *Arg = 0;
+
+    switch (Class) {
+    case IC_Release: {
+      Arg = GetObjCArg(Inst);
+
+      PtrState &S = MyStates.getPtrBottomUpState(Arg);
+
+      // If we see two releases in a row on the same pointer, make a note,
+      // and we'll circle back to revisit it after we've hopefully
+      // eliminated the second release, which may allow us to eliminate
+      // the first release too.
+ // Theoretically we could implement removal of nested retain+release + // pairs by making PtrState hold a stack of states, but this is + // simple and avoids adding overhead for the non-nested case. + if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease) + NestingDetected = true; + + S.SetSeqToRelease(Inst->getMetadata(ImpreciseReleaseMDKind)); + S.RRI.clear(); + S.RRI.KnownIncremented = S.IsKnownIncremented(); + S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); + S.RRI.Calls.insert(Inst); + + S.IncrementRefCount(); + break; + } + case IC_RetainBlock: + case IC_Retain: + case IC_RetainRV: { + Arg = GetObjCArg(Inst); + + PtrState &S = MyStates.getPtrBottomUpState(Arg); + S.DecrementRefCount(); + + switch (S.GetSeq()) { + case S_Stop: + case S_Release: + case S_MovableRelease: + case S_Use: + S.RRI.ReverseInsertPts.clear(); + // FALL THROUGH + case S_CanRelease: + // Don't do retain+release tracking for IC_RetainRV, because it's + // better to let it remain as the first instruction after a call. + if (Class != IC_RetainRV) { + S.RRI.IsRetainBlock = Class == IC_RetainBlock; + Retains[Inst] = S.RRI; + } + S.ClearSequenceProgress(); + break; + case S_None: + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } + break; + } + case IC_AutoreleasepoolPop: + // Conservatively, clear MyStates for all known pointers. + MyStates.clearBottomUpPointers(); + continue; + case IC_AutoreleasepoolPush: + case IC_None: + // These are irrelevant. + continue; + default: + break; + } + + // Consider any other possible effects of this instruction on each + // pointer being tracked. + for (BBState::ptr_iterator MI = MyStates.bottom_up_ptr_begin(), + ME = MyStates.bottom_up_ptr_end(); MI != ME; ++MI) { + const Value *Ptr = MI->first; + if (Ptr == Arg) + continue; // Handled above. + PtrState &S = MI->second; + Sequence Seq = S.GetSeq(); + + // Check for possible retains and releases. + if (CanAlterRefCount(Inst, Ptr, PA, Class)) { + // Check for a retain (we're going bottom-up here). + S.DecrementRefCount(); + + // Check for a release. + if (!IsRetain(Class) && Class != IC_RetainBlock) + switch (Seq) { + case S_Use: + S.SetSeq(S_CanRelease); + continue; + case S_CanRelease: + case S_Release: + case S_MovableRelease: + case S_Stop: + case S_None: + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } + } + + // Check for possible direct uses. + switch (Seq) { + case S_Release: + case S_MovableRelease: + if (CanUse(Inst, Ptr, PA, Class)) { + S.RRI.ReverseInsertPts.clear(); + S.RRI.ReverseInsertPts.insert(Inst); + S.SetSeq(S_Use); + } else if (Seq == S_Release && + (Class == IC_User || Class == IC_CallOrUser)) { + // Non-movable releases depend on any possible objc pointer use. + S.SetSeq(S_Stop); + S.RRI.ReverseInsertPts.clear(); + S.RRI.ReverseInsertPts.insert(Inst); + } + break; + case S_Stop: + if (CanUse(Inst, Ptr, PA, Class)) + S.SetSeq(S_Use); + break; + case S_CanRelease: + case S_Use: + case S_None: + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } + } + } + + return NestingDetected; +} + +bool +ObjCARCOpt::VisitTopDown(BasicBlock *BB, + DenseMap<const BasicBlock *, BBState> &BBStates, + DenseMap<Value *, RRInfo> &Releases) { + bool NestingDetected = false; + BBState &MyStates = BBStates[BB]; + + // Merge the states from each predecessor to compute the initial state + // for the current block. 
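+  // The first predecessor with a computed state seeds the state via
+  // InitFromPred; any remaining ones are merged in with MergePred.
+  // Self-loop edges and not-yet-visited predecessors are skipped.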
+  const_pred_iterator PI(BB), PE(BB, false);
+  if (PI == PE)
+    MyStates.SetAsEntry();
+  else
+    do {
+      const BasicBlock *Pred = *PI++;
+      if (Pred == BB)
+        continue;
+      DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred);
+      if (I == BBStates.end())
+        continue;
+      MyStates.InitFromPred(I->second);
+      while (PI != PE) {
+        Pred = *PI++;
+        if (Pred != BB) {
+          I = BBStates.find(Pred);
+          if (I != BBStates.end())
+            MyStates.MergePred(I->second);
+        }
+      }
+      break;
+    } while (PI != PE);
+
+  // Visit all the instructions, top-down.
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+    Instruction *Inst = I;
+    InstructionClass Class = GetInstructionClass(Inst);
+    const Value *Arg = 0;
+
+    switch (Class) {
+    case IC_RetainBlock:
+    case IC_Retain:
+    case IC_RetainRV: {
+      Arg = GetObjCArg(Inst);
+
+      PtrState &S = MyStates.getPtrTopDownState(Arg);
+
+      // Don't do retain+release tracking for IC_RetainRV, because it's
+      // better to let it remain as the first instruction after a call.
+      if (Class != IC_RetainRV) {
+        // If we see two retains in a row on the same pointer, make a note,
+        // and we'll circle back to revisit it after we've hopefully
+        // eliminated the second retain, which may allow us to eliminate
+        // the first retain too.
+        // Theoretically we could implement removal of nested retain+release
+        // pairs by making PtrState hold a stack of states, but this is
+        // simple and avoids adding overhead for the non-nested case.
+        if (S.GetSeq() == S_Retain)
+          NestingDetected = true;
+
+        S.SetSeq(S_Retain);
+        S.RRI.clear();
+        S.RRI.IsRetainBlock = Class == IC_RetainBlock;
+        S.RRI.KnownIncremented = S.IsKnownIncremented();
+        S.RRI.Calls.insert(Inst);
+      }
+
+      S.IncrementRefCount();
+      break;
+    }
+    case IC_Release: {
+      Arg = GetObjCArg(Inst);
+
+      PtrState &S = MyStates.getPtrTopDownState(Arg);
+      S.DecrementRefCount();
+
+      switch (S.GetSeq()) {
+      case S_Retain:
+      case S_CanRelease:
+        S.RRI.ReverseInsertPts.clear();
+        // FALL THROUGH
+      case S_Use:
+        S.RRI.ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
+        S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
+        Releases[Inst] = S.RRI;
+        S.ClearSequenceProgress();
+        break;
+      case S_None:
+        break;
+      case S_Stop:
+      case S_Release:
+      case S_MovableRelease:
+        llvm_unreachable("top-down pointer in release state!");
+      }
+      break;
+    }
+    case IC_AutoreleasepoolPop:
+      // Conservatively, clear MyStates for all known pointers.
+      MyStates.clearTopDownPointers();
+      continue;
+    case IC_AutoreleasepoolPush:
+    case IC_None:
+      // These are irrelevant.
+      continue;
+    default:
+      break;
+    }
+
+    // Consider any other possible effects of this instruction on each
+    // pointer being tracked.
+    for (BBState::ptr_iterator MI = MyStates.top_down_ptr_begin(),
+         ME = MyStates.top_down_ptr_end(); MI != ME; ++MI) {
+      const Value *Ptr = MI->first;
+      if (Ptr == Arg)
+        continue; // Handled above.
+      PtrState &S = MI->second;
+      Sequence Seq = S.GetSeq();
+
+      // Check for possible releases.
+      if (!IsRetain(Class) && Class != IC_RetainBlock &&
+          CanAlterRefCount(Inst, Ptr, PA, Class)) {
+        // Check for a release.
+        S.DecrementRefCount();
+
+        // Check for a sequence transition.
+        switch (Seq) {
+        case S_Retain:
+          S.SetSeq(S_CanRelease);
+          S.RRI.ReverseInsertPts.clear();
+          S.RRI.ReverseInsertPts.insert(Inst);
+
+          // One call can't cause a transition from S_Retain to S_CanRelease
+          // and S_CanRelease to S_Use. If we've made the first transition,
+          // we're done.
+ continue; + case S_Use: + case S_CanRelease: + case S_None: + break; + case S_Stop: + case S_Release: + case S_MovableRelease: + llvm_unreachable("top-down pointer in release state!"); + } + } + + // Check for possible direct uses. + switch (Seq) { + case S_CanRelease: + if (CanUse(Inst, Ptr, PA, Class)) + S.SetSeq(S_Use); + break; + case S_Use: + case S_Retain: + case S_None: + break; + case S_Stop: + case S_Release: + case S_MovableRelease: + llvm_unreachable("top-down pointer in release state!"); + } + } + } + + CheckForCFGHazards(BB, BBStates, MyStates); + return NestingDetected; +} + +// Visit - Visit the function both top-down and bottom-up. +bool +ObjCARCOpt::Visit(Function &F, + DenseMap<const BasicBlock *, BBState> &BBStates, + MapVector<Value *, RRInfo> &Retains, + DenseMap<Value *, RRInfo> &Releases) { + // Use postorder for bottom-up, and reverse-postorder for top-down, because we + // magically know that loops will be well behaved, i.e. they won't repeatedly + // call retain on a single pointer without doing a release. + bool BottomUpNestingDetected = false; + SmallVector<BasicBlock *, 8> PostOrder; + for (po_iterator<Function *> I = po_begin(&F), E = po_end(&F); I != E; ++I) { + BasicBlock *BB = *I; + PostOrder.push_back(BB); + + BottomUpNestingDetected |= VisitBottomUp(BB, BBStates, Retains); + } + + // Iterate through the post-order in reverse order, achieving a + // reverse-postorder traversal. We don't use the ReversePostOrderTraversal + // class here because it works by computing its own full postorder iteration, + // recording the sequence, and playing it back in reverse. Since we're already + // doing a full iteration above, we can just record the sequence manually and + // avoid the cost of having ReversePostOrderTraversal compute it. + bool TopDownNestingDetected = false; + for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator + RI = PostOrder.rbegin(), RE = PostOrder.rend(); RI != RE; ++RI) + TopDownNestingDetected |= VisitTopDown(*RI, BBStates, Releases); + + return TopDownNestingDetected && BottomUpNestingDetected; +} + +/// MoveCalls - Move the calls in RetainsToMove and ReleasesToMove. +void ObjCARCOpt::MoveCalls(Value *Arg, + RRInfo &RetainsToMove, + RRInfo &ReleasesToMove, + MapVector<Value *, RRInfo> &Retains, + DenseMap<Value *, RRInfo> &Releases, + SmallVectorImpl<Instruction *> &DeadInsts) { + const Type *ArgTy = Arg->getType(); + const Type *ParamTy = + (RetainRVFunc ? RetainRVFunc : + RetainFunc ? RetainFunc : + RetainBlockFunc)->arg_begin()->getType(); + + // Insert the new retain and release calls. + for (SmallPtrSet<Instruction *, 2>::const_iterator + PI = ReleasesToMove.ReverseInsertPts.begin(), + PE = ReleasesToMove.ReverseInsertPts.end(); PI != PE; ++PI) { + Instruction *InsertPt = *PI; + Value *MyArg = ArgTy == ParamTy ? Arg : + new BitCastInst(Arg, ParamTy, "", InsertPt); + CallInst *Call = + CallInst::Create(RetainsToMove.IsRetainBlock ? + RetainBlockFunc : RetainFunc, + MyArg, "", InsertPt); + Call->setDoesNotThrow(); + if (!RetainsToMove.IsRetainBlock) + Call->setTailCall(); + } + for (SmallPtrSet<Instruction *, 2>::const_iterator + PI = RetainsToMove.ReverseInsertPts.begin(), + PE = RetainsToMove.ReverseInsertPts.end(); PI != PE; ++PI) { + Instruction *LastUse = *PI; + Instruction *InsertPts[] = { 0, 0, 0 }; + if (InvokeInst *II = dyn_cast<InvokeInst>(LastUse)) { + // We can't insert code immediately after an invoke instruction, so + // insert code at the beginning of both successor blocks instead. 
+ // The invoke's return value isn't available in the unwind block, + // but our releases will never depend on it, because they must be + // paired with retains from before the invoke. + InsertPts[0] = II->getNormalDest()->getFirstNonPHI(); + InsertPts[1] = II->getUnwindDest()->getFirstNonPHI(); + } else { + // Insert code immediately after the last use. + InsertPts[0] = llvm::next(BasicBlock::iterator(LastUse)); + } + + for (Instruction **I = InsertPts; *I; ++I) { + Instruction *InsertPt = *I; + Value *MyArg = ArgTy == ParamTy ? Arg : + new BitCastInst(Arg, ParamTy, "", InsertPt); + CallInst *Call = CallInst::Create(ReleaseFunc, MyArg, "", InsertPt); + // Attach a clang.imprecise_release metadata tag, if appropriate. + if (MDNode *M = ReleasesToMove.ReleaseMetadata) + Call->setMetadata(ImpreciseReleaseMDKind, M); + Call->setDoesNotThrow(); + if (ReleasesToMove.IsTailCallRelease) + Call->setTailCall(); + } + } + + // Delete the original retain and release calls. + for (SmallPtrSet<Instruction *, 2>::const_iterator + AI = RetainsToMove.Calls.begin(), + AE = RetainsToMove.Calls.end(); AI != AE; ++AI) { + Instruction *OrigRetain = *AI; + Retains.blot(OrigRetain); + DeadInsts.push_back(OrigRetain); + } + for (SmallPtrSet<Instruction *, 2>::const_iterator + AI = ReleasesToMove.Calls.begin(), + AE = ReleasesToMove.Calls.end(); AI != AE; ++AI) { + Instruction *OrigRelease = *AI; + Releases.erase(OrigRelease); + DeadInsts.push_back(OrigRelease); + } +} + +bool +ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> + &BBStates, + MapVector<Value *, RRInfo> &Retains, + DenseMap<Value *, RRInfo> &Releases) { + bool AnyPairsCompletelyEliminated = false; + RRInfo RetainsToMove; + RRInfo ReleasesToMove; + SmallVector<Instruction *, 4> NewRetains; + SmallVector<Instruction *, 4> NewReleases; + SmallVector<Instruction *, 8> DeadInsts; + + for (MapVector<Value *, RRInfo>::const_iterator I = Retains.begin(), + E = Retains.end(); I != E; ) { + Value *V = (I++)->first; + if (!V) continue; // blotted + + Instruction *Retain = cast<Instruction>(V); + Value *Arg = GetObjCArg(Retain); + + // If the object being released is in static or stack storage, we know it's + // not being managed by ObjC reference counting, so we can delete pairs + // regardless of what possible decrements or uses lie between them. + bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg); + + // If a pair happens in a region where it is known that the reference count + // is already incremented, we can similarly ignore possible decrements. + bool KnownIncrementedTD = true, KnownIncrementedBU = true; + + // Connect the dots between the top-down-collected RetainsToMove and + // bottom-up-collected ReleasesToMove to form sets of related calls. + // This is an iterative process so that we connect multiple releases + // to multiple retains if needed. 
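+    // For example (illustrative): with a retain before a branch and a
+    // release in each arm of a diamond, the retain reaches both releases,
+    // and each release may in turn reach further retains, so the two sets
+    // are grown alternately until they close.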
+    unsigned OldDelta = 0;
+    unsigned NewDelta = 0;
+    unsigned OldCount = 0;
+    unsigned NewCount = 0;
+    bool FirstRelease = true;
+    bool FirstRetain = true;
+    NewRetains.push_back(Retain);
+    for (;;) {
+      for (SmallVectorImpl<Instruction *>::const_iterator
+           NI = NewRetains.begin(), NE = NewRetains.end(); NI != NE; ++NI) {
+        Instruction *NewRetain = *NI;
+        MapVector<Value *, RRInfo>::const_iterator It = Retains.find(NewRetain);
+        assert(It != Retains.end());
+        const RRInfo &NewRetainRRI = It->second;
+        KnownIncrementedTD &= NewRetainRRI.KnownIncremented;
+        for (SmallPtrSet<Instruction *, 2>::const_iterator
+             LI = NewRetainRRI.Calls.begin(),
+             LE = NewRetainRRI.Calls.end(); LI != LE; ++LI) {
+          Instruction *NewRetainRelease = *LI;
+          DenseMap<Value *, RRInfo>::const_iterator Jt =
+            Releases.find(NewRetainRelease);
+          if (Jt == Releases.end())
+            goto next_retain;
+          const RRInfo &NewRetainReleaseRRI = Jt->second;
+          assert(NewRetainReleaseRRI.Calls.count(NewRetain));
+          if (ReleasesToMove.Calls.insert(NewRetainRelease)) {
+            OldDelta -=
+              BBStates[NewRetainRelease->getParent()].GetAllPathCount();
+
+            // Merge the ReleaseMetadata and IsTailCallRelease values.
+            if (FirstRelease) {
+              ReleasesToMove.ReleaseMetadata =
+                NewRetainReleaseRRI.ReleaseMetadata;
+              ReleasesToMove.IsTailCallRelease =
+                NewRetainReleaseRRI.IsTailCallRelease;
+              FirstRelease = false;
+            } else {
+              if (ReleasesToMove.ReleaseMetadata !=
+                  NewRetainReleaseRRI.ReleaseMetadata)
+                ReleasesToMove.ReleaseMetadata = 0;
+              if (ReleasesToMove.IsTailCallRelease !=
+                  NewRetainReleaseRRI.IsTailCallRelease)
+                ReleasesToMove.IsTailCallRelease = false;
+            }
+
+            // Collect the optimal insertion points.
+            if (!KnownSafe)
+              for (SmallPtrSet<Instruction *, 2>::const_iterator
+                   RI = NewRetainReleaseRRI.ReverseInsertPts.begin(),
+                   RE = NewRetainReleaseRRI.ReverseInsertPts.end();
+                   RI != RE; ++RI) {
+                Instruction *RIP = *RI;
+                if (ReleasesToMove.ReverseInsertPts.insert(RIP))
+                  NewDelta -= BBStates[RIP->getParent()].GetAllPathCount();
+              }
+            NewReleases.push_back(NewRetainRelease);
+          }
+        }
+      }
+      NewRetains.clear();
+      if (NewReleases.empty()) break;
+
+      // Back the other way.
+      for (SmallVectorImpl<Instruction *>::const_iterator
+           NI = NewReleases.begin(), NE = NewReleases.end(); NI != NE; ++NI) {
+        Instruction *NewRelease = *NI;
+        DenseMap<Value *, RRInfo>::const_iterator It =
+          Releases.find(NewRelease);
+        assert(It != Releases.end());
+        const RRInfo &NewReleaseRRI = It->second;
+        KnownIncrementedBU &= NewReleaseRRI.KnownIncremented;
+        for (SmallPtrSet<Instruction *, 2>::const_iterator
+             LI = NewReleaseRRI.Calls.begin(),
+             LE = NewReleaseRRI.Calls.end(); LI != LE; ++LI) {
+          Instruction *NewReleaseRetain = *LI;
+          MapVector<Value *, RRInfo>::const_iterator Jt =
+            Retains.find(NewReleaseRetain);
+          if (Jt == Retains.end())
+            goto next_retain;
+          const RRInfo &NewReleaseRetainRRI = Jt->second;
+          assert(NewReleaseRetainRRI.Calls.count(NewRelease));
+          if (RetainsToMove.Calls.insert(NewReleaseRetain)) {
+            unsigned PathCount =
+              BBStates[NewReleaseRetain->getParent()].GetAllPathCount();
+            OldDelta += PathCount;
+            OldCount += PathCount;
+
+            // Merge the IsRetainBlock values.
+            if (FirstRetain) {
+              RetainsToMove.IsRetainBlock = NewReleaseRetainRRI.IsRetainBlock;
+              FirstRetain = false;
+            } else if (RetainsToMove.IsRetainBlock !=
+                       NewReleaseRetainRRI.IsRetainBlock)
+              // It's not possible to merge the sequences if one uses
+              // objc_retain and the other uses objc_retainBlock.
+              goto next_retain;
+
+            // Collect the optimal insertion points.
+ if (!KnownSafe) + for (SmallPtrSet<Instruction *, 2>::const_iterator + RI = NewReleaseRetainRRI.ReverseInsertPts.begin(), + RE = NewReleaseRetainRRI.ReverseInsertPts.end(); + RI != RE; ++RI) { + Instruction *RIP = *RI; + if (RetainsToMove.ReverseInsertPts.insert(RIP)) { + PathCount = BBStates[RIP->getParent()].GetAllPathCount(); + NewDelta += PathCount; + NewCount += PathCount; + } + } + NewRetains.push_back(NewReleaseRetain); + } + } + } + NewReleases.clear(); + if (NewRetains.empty()) break; + } + + // If the pointer is known incremented, we can safely delete the pair + // regardless of what's between them. + if (KnownIncrementedTD || KnownIncrementedBU) { + RetainsToMove.ReverseInsertPts.clear(); + ReleasesToMove.ReverseInsertPts.clear(); + NewCount = 0; + } + + // Determine whether the original call points are balanced in the retain and + // release calls through the program. If not, conservatively don't touch + // them. + // TODO: It's theoretically possible to do code motion in this case, as + // long as the existing imbalances are maintained. + if (OldDelta != 0) + goto next_retain; + + // Determine whether the new insertion points we computed preserve the + // balance of retain and release calls through the program. + // TODO: If the fully aggressive solution isn't valid, try to find a + // less aggressive solution which is. + if (NewDelta != 0) + goto next_retain; + + // Ok, everything checks out and we're all set. Let's move some code! + Changed = true; + AnyPairsCompletelyEliminated = NewCount == 0; + NumRRs += OldCount - NewCount; + MoveCalls(Arg, RetainsToMove, ReleasesToMove, Retains, Releases, DeadInsts); + + next_retain: + NewReleases.clear(); + NewRetains.clear(); + RetainsToMove.clear(); + ReleasesToMove.clear(); + } + + // Now that we're done moving everything, we can delete the newly dead + // instructions, as we no longer need them as insert points. + while (!DeadInsts.empty()) + EraseInstruction(DeadInsts.pop_back_val()); + + return AnyPairsCompletelyEliminated; +} + +/// OptimizeWeakCalls - Weak pointer optimizations. +void ObjCARCOpt::OptimizeWeakCalls(Function &F) { + // First, do memdep-style RLE and S2L optimizations. We can't use memdep + // itself because it uses AliasAnalysis and we need to do provenance + // queries instead. + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { + Instruction *Inst = &*I++; + InstructionClass Class = GetBasicInstructionClass(Inst); + if (Class != IC_LoadWeak && Class != IC_LoadWeakRetained) + continue; + + // Delete objc_loadWeak calls with no users. + if (Class == IC_LoadWeak && Inst->use_empty()) { + Inst->eraseFromParent(); + continue; + } + + // TODO: For now, just look for an earlier available version of this value + // within the same block. Theoretically, we could do memdep-style non-local + // analysis too, but that would want caching. A better approach would be to + // use the technique that EarlyCSE uses. + inst_iterator Current = llvm::prior(I); + BasicBlock *CurrentBB = Current.getBasicBlockIterator(); + for (BasicBlock::iterator B = CurrentBB->begin(), + J = Current.getInstructionIterator(); + J != B; --J) { + Instruction *EarlierInst = &*llvm::prior(J); + InstructionClass EarlierClass = GetInstructionClass(EarlierInst); + switch (EarlierClass) { + case IC_LoadWeak: + case IC_LoadWeakRetained: { + // If this is loading from the same pointer, replace this load's value + // with that one. 
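+        // For example (illustrative IR):
+        //   %1 = call i8* @objc_loadWeak(i8** %p)
+        //   %2 = call i8* @objc_loadWeak(i8** %p)
+        // When the two pointer operands must-alias, %2 is redundant and its
+        // uses can be replaced with %1.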
+        CallInst *Call = cast<CallInst>(Inst);
+        CallInst *EarlierCall = cast<CallInst>(EarlierInst);
+        Value *Arg = Call->getArgOperand(0);
+        Value *EarlierArg = EarlierCall->getArgOperand(0);
+        switch (PA.getAA()->alias(Arg, EarlierArg)) {
+        case AliasAnalysis::MustAlias:
+          Changed = true;
+          // If the load has a builtin retain, insert a plain retain for it.
+          if (Class == IC_LoadWeakRetained) {
+            CallInst *CI =
+              CallInst::Create(getRetainCallee(F.getParent()), EarlierCall,
+                               "", Call);
+            CI->setTailCall();
+          }
+          // Zap the fully redundant load.
+          Call->replaceAllUsesWith(EarlierCall);
+          Call->eraseFromParent();
+          goto clobbered;
+        case AliasAnalysis::MayAlias:
+        case AliasAnalysis::PartialAlias:
+          goto clobbered;
+        case AliasAnalysis::NoAlias:
+          break;
+        }
+        break;
+      }
+      case IC_StoreWeak:
+      case IC_InitWeak: {
+        // If this is storing to the same pointer and has the same size etc.,
+        // replace this load's value with the stored value.
+        CallInst *Call = cast<CallInst>(Inst);
+        CallInst *EarlierCall = cast<CallInst>(EarlierInst);
+        Value *Arg = Call->getArgOperand(0);
+        Value *EarlierArg = EarlierCall->getArgOperand(0);
+        switch (PA.getAA()->alias(Arg, EarlierArg)) {
+        case AliasAnalysis::MustAlias:
+          Changed = true;
+          // If the load has a builtin retain, insert a plain retain for it.
+          if (Class == IC_LoadWeakRetained) {
+            CallInst *CI =
+              CallInst::Create(getRetainCallee(F.getParent()), EarlierCall,
+                               "", Call);
+            CI->setTailCall();
+          }
+          // Zap the fully redundant load.
+          Call->replaceAllUsesWith(EarlierCall->getArgOperand(1));
+          Call->eraseFromParent();
+          goto clobbered;
+        case AliasAnalysis::MayAlias:
+        case AliasAnalysis::PartialAlias:
+          goto clobbered;
+        case AliasAnalysis::NoAlias:
+          break;
+        }
+        break;
+      }
+      case IC_MoveWeak:
+      case IC_CopyWeak:
+        // TODO: Grab the copied value.
+        goto clobbered;
+      case IC_AutoreleasepoolPush:
+      case IC_None:
+      case IC_User:
+        // Weak pointers are only modified through the weak entry points
+        // (and arbitrary calls, which could call the weak entry points).
+        break;
+      default:
+        // Anything else could modify the weak pointer.
+        goto clobbered;
+      }
+    }
+  clobbered:;
+  }
+
+  // Then, for each destroyWeak with an alloca operand, check to see if
+  // the alloca and all its users can be zapped.
+  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+    Instruction *Inst = &*I++;
+    InstructionClass Class = GetBasicInstructionClass(Inst);
+    if (Class != IC_DestroyWeak)
+      continue;
+
+    CallInst *Call = cast<CallInst>(Inst);
+    Value *Arg = Call->getArgOperand(0);
+    if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) {
+      for (Value::use_iterator UI = Alloca->use_begin(),
+           UE = Alloca->use_end(); UI != UE; ++UI) {
+        Instruction *UserInst = cast<Instruction>(*UI);
+        switch (GetBasicInstructionClass(UserInst)) {
+        case IC_InitWeak:
+        case IC_StoreWeak:
+        case IC_DestroyWeak:
+          continue;
+        default:
+          goto done;
+        }
+      }
+      Changed = true;
+      for (Value::use_iterator UI = Alloca->use_begin(),
+           UE = Alloca->use_end(); UI != UE; ) {
+        CallInst *UserInst = cast<CallInst>(*UI++);
+        if (!UserInst->use_empty())
+          UserInst->replaceAllUsesWith(UserInst->getOperand(1));
+        UserInst->eraseFromParent();
+      }
+      Alloca->eraseFromParent();
+    done:;
+    }
+  }
+}
+
+/// OptimizeSequences - Identify program paths which execute sequences of
+/// retains and releases which can be eliminated.
+bool ObjCARCOpt::OptimizeSequences(Function &F) {
+  /// Releases, Retains - These are used to store the results of the main flow
+  /// analysis. These use Value* as the key instead of Instruction* so that the
+  /// map stays valid when we get around to rewriting code and calls get
+  /// replaced by arguments.
+  DenseMap<Value *, RRInfo> Releases;
+  MapVector<Value *, RRInfo> Retains;
+
+  /// BBStates - This is used during the traversal of the function to track the
+  /// states for each identified object at each block.
+  DenseMap<const BasicBlock *, BBState> BBStates;
+
+  // Analyze the CFG of the function, and all instructions.
+  bool NestingDetected = Visit(F, BBStates, Retains, Releases);
+
+  // Transform.
+  return PerformCodePlacement(BBStates, Retains, Releases) && NestingDetected;
+}
+
+/// OptimizeReturns - Look for this pattern:
+///
+///    %call = call i8* @something(...)
+///    %2 = call i8* @objc_retain(i8* %call)
+///    %3 = call i8* @objc_autorelease(i8* %2)
+///    ret i8* %3
+///
+/// And delete the retain and autorelease.
+///
+/// Otherwise if it's just this:
+///
+///    %3 = call i8* @objc_autorelease(i8* %2)
+///    ret i8* %3
+///
+/// convert the autorelease to autoreleaseRV.
+void ObjCARCOpt::OptimizeReturns(Function &F) {
+  if (!F.getReturnType()->isPointerTy())
+    return;
+
+  SmallPtrSet<Instruction *, 4> DependingInstructions;
+  SmallPtrSet<const BasicBlock *, 4> Visited;
+  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
+    BasicBlock *BB = FI;
+    ReturnInst *Ret = dyn_cast<ReturnInst>(&BB->back());
+    if (!Ret) continue;
+
+    const Value *Arg = StripPointerCastsAndObjCCalls(Ret->getOperand(0));
+    FindDependencies(NeedsPositiveRetainCount, Arg,
+                     BB, Ret, DependingInstructions, Visited, PA);
+    if (DependingInstructions.size() != 1)
+      goto next_block;
+
+    {
+      CallInst *Autorelease =
+        dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
+      if (!Autorelease)
+        goto next_block;
+      InstructionClass AutoreleaseClass =
+        GetBasicInstructionClass(Autorelease);
+      if (!IsAutorelease(AutoreleaseClass))
+        goto next_block;
+      if (GetObjCArg(Autorelease) != Arg)
+        goto next_block;
+
+      DependingInstructions.clear();
+      Visited.clear();
+
+      // Check that there is nothing that can affect the reference
+      // count between the autorelease and the retain.
+      FindDependencies(CanChangeRetainCount, Arg,
+                       BB, Autorelease, DependingInstructions, Visited, PA);
+      if (DependingInstructions.size() != 1)
+        goto next_block;
+
+      {
+        CallInst *Retain =
+          dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
+
+        // Check that we found a retain with the same argument.
+        if (!Retain ||
+            !IsRetain(GetBasicInstructionClass(Retain)) ||
+            GetObjCArg(Retain) != Arg)
+          goto next_block;
+
+        DependingInstructions.clear();
+        Visited.clear();
+
+        // Convert the autorelease to an autoreleaseRV, since it's
+        // returning the value.
+        if (AutoreleaseClass == IC_Autorelease) {
+          Autorelease->setCalledFunction(getAutoreleaseRVCallee(F.getParent()));
+          AutoreleaseClass = IC_AutoreleaseRV;
+        }
+
+        // Check that there is nothing that can affect the reference
+        // count between the retain and the call.
+        FindDependencies(CanChangeRetainCount, Arg, BB, Retain,
+                         DependingInstructions, Visited, PA);
+        if (DependingInstructions.size() != 1)
+          goto next_block;
+
+        {
+          CallInst *Call =
+            dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
+
+          // Check that the pointer is the return value of the call.
+          if (!Call || Arg != Call)
+            goto next_block;
+
+          // Check that the call is a regular call.
+          InstructionClass Class = GetBasicInstructionClass(Call);
+          if (Class != IC_CallOrUser && Class != IC_Call)
+            goto next_block;
+
+          // If so, we can zap the retain and autorelease.
+          Changed = true;
+          ++NumRets;
+          EraseInstruction(Retain);
+          EraseInstruction(Autorelease);
+        }
+      }
+    }
+
+  next_block:
+    DependingInstructions.clear();
+    Visited.clear();
+  }
+}
+
+bool ObjCARCOpt::doInitialization(Module &M) {
+  if (!EnableARCOpts)
+    return false;
+
+  // Identify the imprecise release metadata kind.
+  ImpreciseReleaseMDKind =
+    M.getContext().getMDKindID("clang.imprecise_release");
+
+  // Identify the declarations for objc_retain and friends.
+  RetainFunc = M.getFunction("objc_retain");
+  RetainBlockFunc = M.getFunction("objc_retainBlock");
+  RetainRVFunc = M.getFunction("objc_retainAutoreleasedReturnValue");
+  ReleaseFunc = M.getFunction("objc_release");
+  AutoreleaseFunc = M.getFunction("objc_autorelease");
+
+  // Intuitively, objc_retain and others are nocapture; however, in practice
+  // they are not, because they return their argument value. And objc_release
+  // calls finalizers.
+
+  // These are initialized lazily.
+  RetainRVCallee = 0;
+  AutoreleaseRVCallee = 0;
+  ReleaseCallee = 0;
+  RetainCallee = 0;
+  AutoreleaseCallee = 0;
+
+  return false;
+}
+
+bool ObjCARCOpt::runOnFunction(Function &F) {
+  if (!EnableARCOpts)
+    return false;
+
+  Changed = false;
+
+  PA.setAA(&getAnalysis<AliasAnalysis>());
+
+  // This pass performs several distinct transformations. As a compile-time aid
+  // when compiling code that isn't ObjC, skip these if the relevant ObjC
+  // library functions aren't declared.
+
+  // Preliminary optimizations. This also computes UsedInThisFunction.
+  OptimizeIndividualCalls(F);
+
+  // Optimizations for weak pointers.
+  if (UsedInThisFunction & ((1 << IC_LoadWeak) |
+                            (1 << IC_LoadWeakRetained) |
+                            (1 << IC_StoreWeak) |
+                            (1 << IC_InitWeak) |
+                            (1 << IC_CopyWeak) |
+                            (1 << IC_MoveWeak) |
+                            (1 << IC_DestroyWeak)))
+    OptimizeWeakCalls(F);
+
+  // Optimizations for retain+release pairs.
+  if (UsedInThisFunction & ((1 << IC_Retain) |
+                            (1 << IC_RetainRV) |
+                            (1 << IC_RetainBlock)))
+    if (UsedInThisFunction & (1 << IC_Release))
+      // Run OptimizeSequences until it either stops making changes or
+      // no retain+release pair nesting is detected.
+      while (OptimizeSequences(F)) {}
+
+  // Optimizations if objc_autorelease is used.
+  if (UsedInThisFunction &
+      ((1 << IC_Autorelease) | (1 << IC_AutoreleaseRV)))
+    OptimizeReturns(F);
+
+  return Changed;
+}
+
+void ObjCARCOpt::releaseMemory() {
+  PA.clear();
+}
+
+//===----------------------------------------------------------------------===//
+// ARC contraction.
+//===----------------------------------------------------------------------===//
+
+// TODO: ObjCARCContract could insert PHI nodes when uses aren't
+// dominated by single calls.
+
+#include "llvm/Operator.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Analysis/Dominators.h"
+
+STATISTIC(NumStoreStrongs, "Number of objc_storeStrong calls formed");
+
+namespace {
+  /// ObjCARCContract - Late ARC optimizations. These change the IR in a way
+  /// that makes it difficult to be analyzed by ObjCARCOpt, so it's run late.
+  class ObjCARCContract : public FunctionPass {
+    bool Changed;
+    AliasAnalysis *AA;
+    DominatorTree *DT;
+    ProvenanceAnalysis PA;
+
+    /// StoreStrongCallee, etc. - Declarations for ObjC runtime
+    /// functions, for use in creating calls to them. These are initialized
+    /// lazily to avoid cluttering up the Module with unused declarations.
+ Constant *StoreStrongCallee, + *RetainAutoreleaseCallee, *RetainAutoreleaseRVCallee; + + /// RetainRVMarker - The inline asm string to insert between calls and + /// RetainRV calls to make the optimization work on targets which need it. + const MDString *RetainRVMarker; + + Constant *getStoreStrongCallee(Module *M); + Constant *getRetainAutoreleaseCallee(Module *M); + Constant *getRetainAutoreleaseRVCallee(Module *M); + + bool ContractAutorelease(Function &F, Instruction *Autorelease, + InstructionClass Class, + SmallPtrSet<Instruction *, 4> + &DependingInstructions, + SmallPtrSet<const BasicBlock *, 4> + &Visited); + + void ContractRelease(Instruction *Release, + inst_iterator &Iter); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool doInitialization(Module &M); + virtual bool runOnFunction(Function &F); + + public: + static char ID; + ObjCARCContract() : FunctionPass(ID) { + initializeObjCARCContractPass(*PassRegistry::getPassRegistry()); + } + }; +} + +char ObjCARCContract::ID = 0; +INITIALIZE_PASS_BEGIN(ObjCARCContract, + "objc-arc-contract", "ObjC ARC contraction", false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_END(ObjCARCContract, + "objc-arc-contract", "ObjC ARC contraction", false, false) + +Pass *llvm::createObjCARCContractPass() { + return new ObjCARCContract(); +} + +void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<AliasAnalysis>(); + AU.addRequired<DominatorTree>(); + AU.setPreservesCFG(); +} + +Constant *ObjCARCContract::getStoreStrongCallee(Module *M) { + if (!StoreStrongCallee) { + LLVMContext &C = M->getContext(); + const Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + const Type *I8XX = PointerType::getUnqual(I8X); + std::vector<const Type *> Params; + Params.push_back(I8XX); + Params.push_back(I8X); + + AttrListPtr Attributes; + Attributes.addAttr(~0u, Attribute::NoUnwind); + Attributes.addAttr(1, Attribute::NoCapture); + + StoreStrongCallee = + M->getOrInsertFunction( + "objc_storeStrong", + FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false), + Attributes); + } + return StoreStrongCallee; +} + +Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) { + if (!RetainAutoreleaseCallee) { + LLVMContext &C = M->getContext(); + const Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + std::vector<const Type *> Params; + Params.push_back(I8X); + const FunctionType *FTy = + FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttrListPtr Attributes; + Attributes.addAttr(~0u, Attribute::NoUnwind); + RetainAutoreleaseCallee = + M->getOrInsertFunction("objc_retainAutorelease", FTy, Attributes); + } + return RetainAutoreleaseCallee; +} + +Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) { + if (!RetainAutoreleaseRVCallee) { + LLVMContext &C = M->getContext(); + const Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + std::vector<const Type *> Params; + Params.push_back(I8X); + const FunctionType *FTy = + FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttrListPtr Attributes; + Attributes.addAttr(~0u, Attribute::NoUnwind); + RetainAutoreleaseRVCallee = + M->getOrInsertFunction("objc_retainAutoreleaseReturnValue", FTy, + Attributes); + } + return RetainAutoreleaseRVCallee; +} + +/// ContractAutorelease - Merge an autorelease with a retain into a fused +/// call. 
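+///
+/// For example (illustrative IR):
+///
+///    %0 = call i8* @objc_retain(i8* %p)
+///    %1 = call i8* @objc_autorelease(i8* %0)
+/// becomes
+///    %0 = call i8* @objc_retainAutorelease(i8* %p)
+/// with the uses of %1 rewritten to use %0.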
+bool +ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, + InstructionClass Class, + SmallPtrSet<Instruction *, 4> + &DependingInstructions, + SmallPtrSet<const BasicBlock *, 4> + &Visited) { + const Value *Arg = GetObjCArg(Autorelease); + + // Check that there are no instructions between the retain and the autorelease + // (such as an autorelease_pop) which may change the count. + CallInst *Retain = 0; + if (Class == IC_AutoreleaseRV) + FindDependencies(RetainAutoreleaseRVDep, Arg, + Autorelease->getParent(), Autorelease, + DependingInstructions, Visited, PA); + else + FindDependencies(RetainAutoreleaseDep, Arg, + Autorelease->getParent(), Autorelease, + DependingInstructions, Visited, PA); + + Visited.clear(); + if (DependingInstructions.size() != 1) { + DependingInstructions.clear(); + return false; + } + + Retain = dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); + DependingInstructions.clear(); + + if (!Retain || + GetBasicInstructionClass(Retain) != IC_Retain || + GetObjCArg(Retain) != Arg) + return false; + + Changed = true; + ++NumPeeps; + + if (Class == IC_AutoreleaseRV) + Retain->setCalledFunction(getRetainAutoreleaseRVCallee(F.getParent())); + else + Retain->setCalledFunction(getRetainAutoreleaseCallee(F.getParent())); + + EraseInstruction(Autorelease); + return true; +} + +/// ContractRelease - Attempt to merge an objc_release with a store, load, and +/// objc_retain to form an objc_storeStrong. This can be a little tricky because +/// the instructions don't always appear in order, and there may be unrelated +/// intervening instructions. +void ObjCARCContract::ContractRelease(Instruction *Release, + inst_iterator &Iter) { + LoadInst *Load = dyn_cast<LoadInst>(GetObjCArg(Release)); + if (!Load || Load->isVolatile()) return; + + // For now, require everything to be in one basic block. + BasicBlock *BB = Release->getParent(); + if (Load->getParent() != BB) return; + + // Walk down to find the store. + BasicBlock::iterator I = Load, End = BB->end(); + ++I; + AliasAnalysis::Location Loc = AA->getLocation(Load); + while (I != End && + (&*I == Release || + IsRetain(GetBasicInstructionClass(I)) || + !(AA->getModRefInfo(I, Loc) & AliasAnalysis::Mod))) + ++I; + StoreInst *Store = dyn_cast<StoreInst>(I); + if (!Store || Store->isVolatile()) return; + if (Store->getPointerOperand() != Loc.Ptr) return; + + Value *New = StripPointerCastsAndObjCCalls(Store->getValueOperand()); + + // Walk up to find the retain. 
+ I = Store; + BasicBlock::iterator Begin = BB->begin(); + while (I != Begin && GetBasicInstructionClass(I) != IC_Retain) + --I; + Instruction *Retain = I; + if (GetBasicInstructionClass(Retain) != IC_Retain) return; + if (GetObjCArg(Retain) != New) return; + + Changed = true; + ++NumStoreStrongs; + + LLVMContext &C = Release->getContext(); + const Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + const Type *I8XX = PointerType::getUnqual(I8X); + + Value *Args[] = { Load->getPointerOperand(), New }; + if (Args[0]->getType() != I8XX) + Args[0] = new BitCastInst(Args[0], I8XX, "", Store); + if (Args[1]->getType() != I8X) + Args[1] = new BitCastInst(Args[1], I8X, "", Store); + CallInst *StoreStrong = + CallInst::Create(getStoreStrongCallee(BB->getParent()->getParent()), + Args, array_endof(Args), "", Store); + StoreStrong->setDoesNotThrow(); + StoreStrong->setDebugLoc(Store->getDebugLoc()); + + if (&*Iter == Store) ++Iter; + Store->eraseFromParent(); + Release->eraseFromParent(); + EraseInstruction(Retain); + if (Load->use_empty()) + Load->eraseFromParent(); +} + +bool ObjCARCContract::doInitialization(Module &M) { + // These are initialized lazily. + StoreStrongCallee = 0; + RetainAutoreleaseCallee = 0; + RetainAutoreleaseRVCallee = 0; + + // Initialize RetainRVMarker. + RetainRVMarker = 0; + if (NamedMDNode *NMD = + M.getNamedMetadata("clang.arc.retainAutoreleasedReturnValueMarker")) + if (NMD->getNumOperands() == 1) { + const MDNode *N = NMD->getOperand(0); + if (N->getNumOperands() == 1) + if (const MDString *S = dyn_cast<MDString>(N->getOperand(0))) + RetainRVMarker = S; + } + + return false; +} + +bool ObjCARCContract::runOnFunction(Function &F) { + if (!EnableARCOpts) + return false; + + Changed = false; + AA = &getAnalysis<AliasAnalysis>(); + DT = &getAnalysis<DominatorTree>(); + + PA.setAA(&getAnalysis<AliasAnalysis>()); + + // For ObjC library calls which return their argument, replace uses of the + // argument with uses of the call return value, if it dominates the use. This + // reduces register pressure. + SmallPtrSet<Instruction *, 4> DependingInstructions; + SmallPtrSet<const BasicBlock *, 4> Visited; + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { + Instruction *Inst = &*I++; + + // Only these library routines return their argument. In particular, + // objc_retainBlock does not necessarily return its argument. + InstructionClass Class = GetBasicInstructionClass(Inst); + switch (Class) { + case IC_Retain: + case IC_FusedRetainAutorelease: + case IC_FusedRetainAutoreleaseRV: + break; + case IC_Autorelease: + case IC_AutoreleaseRV: + if (ContractAutorelease(F, Inst, Class, DependingInstructions, Visited)) + continue; + break; + case IC_RetainRV: { + // If we're compiling for a target which needs a special inline-asm + // marker to do the retainAutoreleasedReturnValue optimization, + // insert it now. 
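+      // The marker string is taken from module-level metadata (see
+      // doInitialization); a front-end might emit, e.g. (illustrative):
+      //   !clang.arc.retainAutoreleasedReturnValueMarker = !{!0}
+      //   !0 = metadata !{metadata !"mov\09r7, r7\09\09@ marker for RetainRV"}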
+ if (!RetainRVMarker) + break; + BasicBlock::iterator BBI = Inst; + --BBI; + while (isNoopInstruction(BBI)) --BBI; + if (&*BBI == GetObjCArg(Inst)) { + InlineAsm *IA = + InlineAsm::get(FunctionType::get(Type::getVoidTy(Inst->getContext()), + /*isVarArg=*/false), + RetainRVMarker->getString(), + /*Constraints=*/"", /*hasSideEffects=*/true); + CallInst::Create(IA, "", Inst); + } + break; + } + case IC_InitWeak: { + // objc_initWeak(p, null) => *p = null + CallInst *CI = cast<CallInst>(Inst); + if (isNullOrUndef(CI->getArgOperand(1))) { + Value *Null = + ConstantPointerNull::get(cast<PointerType>(CI->getType())); + Changed = true; + new StoreInst(Null, CI->getArgOperand(0), CI); + CI->replaceAllUsesWith(Null); + CI->eraseFromParent(); + } + continue; + } + case IC_Release: + ContractRelease(Inst, I); + continue; + default: + continue; + } + + // Don't use GetObjCArg because we don't want to look through bitcasts + // and such; to do the replacement, the argument must have type i8*. + const Value *Arg = cast<CallInst>(Inst)->getArgOperand(0); + for (;;) { + // If we're compiling bugpointed code, don't get in trouble. + if (!isa<Instruction>(Arg) && !isa<Argument>(Arg)) + break; + // Look through the uses of the pointer. + for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end(); + UI != UE; ) { + Use &U = UI.getUse(); + unsigned OperandNo = UI.getOperandNo(); + ++UI; // Increment UI now, because we may unlink its element. + if (Instruction *UserInst = dyn_cast<Instruction>(U.getUser())) + if (Inst != UserInst && DT->dominates(Inst, UserInst)) { + Changed = true; + Instruction *Replacement = Inst; + const Type *UseTy = U.get()->getType(); + if (PHINode *PHI = dyn_cast<PHINode>(UserInst)) { + // For PHI nodes, insert the bitcast in the predecessor block. + unsigned ValNo = + PHINode::getIncomingValueNumForOperand(OperandNo); + BasicBlock *BB = + PHI->getIncomingBlock(ValNo); + if (Replacement->getType() != UseTy) + Replacement = new BitCastInst(Replacement, UseTy, "", + &BB->back()); + for (unsigned i = 0, e = PHI->getNumIncomingValues(); + i != e; ++i) + if (PHI->getIncomingBlock(i) == BB) { + // Keep the UI iterator valid. + if (&PHI->getOperandUse( + PHINode::getOperandNumForIncomingValue(i)) == + &UI.getUse()) + ++UI; + PHI->setIncomingValue(i, Replacement); + } + } else { + if (Replacement->getType() != UseTy) + Replacement = new BitCastInst(Replacement, UseTy, "", UserInst); + U.set(Replacement); + } + } + } + + // If Arg is a no-op casted pointer, strip one level of casts and + // iterate. 
+ if (const BitCastInst *BI = dyn_cast<BitCastInst>(Arg)) + Arg = BI->getOperand(0); + else if (isa<GEPOperator>(Arg) && + cast<GEPOperator>(Arg)->hasAllZeroIndices()) + Arg = cast<GEPOperator>(Arg)->getPointerOperand(); + else if (isa<GlobalAlias>(Arg) && + !cast<GlobalAlias>(Arg)->mayBeOverridden()) + Arg = cast<GlobalAlias>(Arg)->getAliasee(); + else + break; + } + } + + return Changed; +} diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index accabb0..c1dfe15 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -75,6 +75,7 @@ namespace { class Reassociate : public FunctionPass { DenseMap<BasicBlock*, unsigned> RankMap; DenseMap<AssertingVH<>, unsigned> ValueRankMap; + SmallVector<WeakVH, 8> RedoInsts; SmallVector<WeakVH, 8> DeadInsts; bool MadeChange; public: @@ -100,7 +101,7 @@ namespace { void LinearizeExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); void LinearizeExpr(BinaryOperator *I); Value *RemoveFactorFromExpression(Value *V, Value *Factor); - void ReassociateBB(BasicBlock *BB); + void ReassociateInst(BasicBlock::iterator &BBI); void RemoveDeadBinaryOp(Value *V); }; @@ -216,6 +217,7 @@ static Instruction *LowerNegateToMultiply(Instruction *Neg, ValueRankMap.erase(Neg); Res->takeName(Neg); Neg->replaceAllUsesWith(Res); + Res->setDebugLoc(Neg->getDebugLoc()); Neg->eraseFromParent(); return Res; } @@ -505,6 +507,7 @@ static Instruction *BreakUpSubtract(Instruction *Sub, // Everyone now refers to the add instruction. ValueRankMap.erase(Sub); Sub->replaceAllUsesWith(New); + New->setDebugLoc(Sub->getDebugLoc()); Sub->eraseFromParent(); DEBUG(dbgs() << "Negated: " << *New << '\n'); @@ -530,6 +533,7 @@ static Instruction *ConvertShiftToMul(Instruction *Shl, ValueRankMap.erase(Shl); Mul->takeName(Shl); Shl->replaceAllUsesWith(Mul); + Mul->setDebugLoc(Shl->getDebugLoc()); Shl->eraseFromParent(); return Mul; } @@ -734,7 +738,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I, // Now that we have inserted a multiply, optimize it. This allows us to // handle cases that require multiple factoring steps, such as this: // (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6 - Mul = ReassociateExpression(cast<BinaryOperator>(Mul)); + RedoInsts.push_back(Mul); // If every add operand was a duplicate, return the multiply. if (Ops.empty()) @@ -962,71 +966,69 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I, } -/// ReassociateBB - Inspect all of the instructions in this basic block, -/// reassociating them as we go. -void Reassociate::ReassociateBB(BasicBlock *BB) { - for (BasicBlock::iterator BBI = BB->begin(); BBI != BB->end(); ) { - Instruction *BI = BBI++; - if (BI->getOpcode() == Instruction::Shl && - isa<ConstantInt>(BI->getOperand(1))) - if (Instruction *NI = ConvertShiftToMul(BI, ValueRankMap)) { - MadeChange = true; - BI = NI; - } +/// ReassociateInst - Inspect and reassociate the instruction at the +/// given position, post-incrementing the position. +void Reassociate::ReassociateInst(BasicBlock::iterator &BBI) { + Instruction *BI = BBI++; + if (BI->getOpcode() == Instruction::Shl && + isa<ConstantInt>(BI->getOperand(1))) + if (Instruction *NI = ConvertShiftToMul(BI, ValueRankMap)) { + MadeChange = true; + BI = NI; + } - // Reject cases where it is pointless to do this. - if (!isa<BinaryOperator>(BI) || BI->getType()->isFloatingPointTy() || - BI->getType()->isVectorTy()) - continue; // Floating point ops are not associative. - - // Do not reassociate boolean (i1) expressions. 
We want to preserve the - // original order of evaluation for short-circuited comparisons that - // SimplifyCFG has folded to AND/OR expressions. If the expression - // is not further optimized, it is likely to be transformed back to a - // short-circuited form for code gen, and the source order may have been - // optimized for the most likely conditions. - if (BI->getType()->isIntegerTy(1)) - continue; + // Reject cases where it is pointless to do this. + if (!isa<BinaryOperator>(BI) || BI->getType()->isFloatingPointTy() || + BI->getType()->isVectorTy()) + return; // Floating point ops are not associative. + + // Do not reassociate boolean (i1) expressions. We want to preserve the + // original order of evaluation for short-circuited comparisons that + // SimplifyCFG has folded to AND/OR expressions. If the expression + // is not further optimized, it is likely to be transformed back to a + // short-circuited form for code gen, and the source order may have been + // optimized for the most likely conditions. + if (BI->getType()->isIntegerTy(1)) + return; - // If this is a subtract instruction which is not already in negate form, - // see if we can convert it to X+-Y. - if (BI->getOpcode() == Instruction::Sub) { - if (ShouldBreakUpSubtract(BI)) { - BI = BreakUpSubtract(BI, ValueRankMap); - // Reset the BBI iterator in case BreakUpSubtract changed the - // instruction it points to. - BBI = BI; - ++BBI; + // If this is a subtract instruction which is not already in negate form, + // see if we can convert it to X+-Y. + if (BI->getOpcode() == Instruction::Sub) { + if (ShouldBreakUpSubtract(BI)) { + BI = BreakUpSubtract(BI, ValueRankMap); + // Reset the BBI iterator in case BreakUpSubtract changed the + // instruction it points to. + BBI = BI; + ++BBI; + MadeChange = true; + } else if (BinaryOperator::isNeg(BI)) { + // Otherwise, this is a negation. See if the operand is a multiply tree + // and if this is not an inner node of a multiply tree. + if (isReassociableOp(BI->getOperand(1), Instruction::Mul) && + (!BI->hasOneUse() || + !isReassociableOp(BI->use_back(), Instruction::Mul))) { + BI = LowerNegateToMultiply(BI, ValueRankMap); MadeChange = true; - } else if (BinaryOperator::isNeg(BI)) { - // Otherwise, this is a negation. See if the operand is a multiply tree - // and if this is not an inner node of a multiply tree. - if (isReassociableOp(BI->getOperand(1), Instruction::Mul) && - (!BI->hasOneUse() || - !isReassociableOp(BI->use_back(), Instruction::Mul))) { - BI = LowerNegateToMultiply(BI, ValueRankMap); - MadeChange = true; - } } } + } - // If this instruction is a commutative binary operator, process it. - if (!BI->isAssociative()) continue; - BinaryOperator *I = cast<BinaryOperator>(BI); + // If this instruction is a commutative binary operator, process it. + if (!BI->isAssociative()) return; + BinaryOperator *I = cast<BinaryOperator>(BI); - // If this is an interior node of a reassociable tree, ignore it until we - // get to the root of the tree, to avoid N^2 analysis. - if (I->hasOneUse() && isReassociableOp(I->use_back(), I->getOpcode())) - continue; + // If this is an interior node of a reassociable tree, ignore it until we + // get to the root of the tree, to avoid N^2 analysis. + if (I->hasOneUse() && isReassociableOp(I->use_back(), I->getOpcode())) + return; - // If this is an add tree that is used by a sub instruction, ignore it - // until we process the subtract. 
-    if (I->hasOneUse() && I->getOpcode() == Instruction::Add &&
-        cast<Instruction>(I->use_back())->getOpcode() == Instruction::Sub)
-      continue;
+  if (I->hasOneUse() && I->getOpcode() == Instruction::Add &&
+      cast<Instruction>(I->use_back())->getOpcode() == Instruction::Sub)
+    return;
 
-    ReassociateExpression(I);
-  }
+  ReassociateExpression(I);
 }
 
 Value *Reassociate::ReassociateExpression(BinaryOperator *I) {
@@ -1053,6 +1055,8 @@ Value *Reassociate::ReassociateExpression(BinaryOperator *I) {
     // eliminate it.
     DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n');
     I->replaceAllUsesWith(V);
+    if (Instruction *VI = dyn_cast<Instruction>(V))
+      VI->setDebugLoc(I->getDebugLoc());
     RemoveDeadBinaryOp(I);
     ++NumAnnihil;
     return V;
@@ -1076,6 +1080,8 @@ Value *Reassociate::ReassociateExpression(BinaryOperator *I) {
     // This expression tree simplified to something that isn't a tree,
     // eliminate it.
     I->replaceAllUsesWith(Ops[0].Op);
+    if (Instruction *OI = dyn_cast<Instruction>(Ops[0].Op))
+      OI->setDebugLoc(I->getDebugLoc());
     RemoveDeadBinaryOp(I);
     return Ops[0].Op;
   }
@@ -1093,7 +1099,16 @@ bool Reassociate::runOnFunction(Function &F) {
   MadeChange = false;
   for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
-    ReassociateBB(FI);
+    for (BasicBlock::iterator BBI = FI->begin(); BBI != FI->end(); )
+      ReassociateInst(BBI);
+
+  // Now that we're done, revisit any instructions which are likely to
+  // have secondary reassociation opportunities.
+  while (!RedoInsts.empty())
+    if (Value *V = RedoInsts.pop_back_val()) {
+      BasicBlock::iterator BBI = cast<Instruction>(V);
+      ReassociateInst(BBI);
+    }
 
   // Now that we're done, delete any instructions which are no longer used.
   while (!DeadInsts.empty())
diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp
index 459bb06..47afc77 100644
--- a/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -9,7 +9,7 @@
 //
 // This file demotes all registers to memory references.  It is intented to be
 // the inverse of PromoteMemoryToRegister.  By converting to loads, the only
-// values live accross basic blocks are allocas and loads before phi nodes.
+// values live across basic blocks are allocas and loads before phi nodes.
 // It is intended that this should make CFG hacking much easier.
 // To make later hacking easier, the entry block is split into two, such that
 // all introduced allocas and nothing else are in the entry block.
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index c82e929..083412e 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -655,7 +655,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
     // Just mark all destinations executable!
     // TODO: This could be improved if the operand is a [cast of a] BlockAddress.
-    if (isa<IndirectBrInst>(&TI))
+    if (isa<IndirectBrInst>(TI))
       return true;
 
 #ifndef NDEBUG
@@ -1989,7 +1989,7 @@ bool IPSCCP::runOnModule(Module &M) {
       ReturnsToZap[i]->setOperand(0, UndefValue::get(F->getReturnType()));
   }
 
-  // If we infered constant or undef values for globals variables, we can delete
+  // If we inferred constant or undef values for global variables, we can delete
  // the global and any stores that remain to it.
const DenseMap<GlobalVariable*, LatticeVal> &TG = Solver.getTrackedGlobals(); for (DenseMap<GlobalVariable*, LatticeVal>::const_iterator I = TG.begin(), diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 5428a5a..158d653 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -17,6 +17,7 @@ #include "llvm-c/Initialization.h" #include "llvm/InitializePasses.h" #include "llvm/PassManager.h" +#include "llvm/Analysis/Passes.h" #include "llvm/Analysis/Verifier.h" #include "llvm/Target/TargetData.h" #include "llvm/Transforms/Scalar.h" @@ -48,6 +49,10 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLoopIdiomRecognizePass(Registry); initializeLowerAtomicPass(Registry); initializeMemCpyOptPass(Registry); + initializeObjCARCAliasAnalysisPass(Registry); + initializeObjCARCExpandPass(Registry); + initializeObjCARCContractPass(Registry); + initializeObjCARCOptPass(Registry); initializeReassociatePass(Registry); initializeRegToMemPass(Registry); initializeSCCPPass(Registry); @@ -173,3 +178,11 @@ void LLVMAddCorrelatedValuePropagationPass(LLVMPassManagerRef PM) { void LLVMAddEarlyCSEPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createEarlyCSEPass()); } + +void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createTypeBasedAliasAnalysisPass()); +} + +void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createBasicAliasAnalysisPass()); +} diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index 42246ff..beef127 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -30,6 +30,7 @@ #include "llvm/LLVMContext.h" #include "llvm/Module.h" #include "llvm/Pass.h" +#include "llvm/Analysis/DIBuilder.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/ValueTracking.h" @@ -227,16 +228,30 @@ class ConvertToScalarInfo { /// which means that mem2reg can't promote it. bool IsNotTrivial; + /// ScalarKind - Tracks the kind of alloca being considered for promotion, + /// computed based on the uses of the alloca rather than the LLVM type system. + enum { + Unknown, + + // Accesses via GEPs that are consistent with element access of a vector + // type. This will not be converted into a vector unless there is a later + // access using an actual vector type. + ImplicitVector, + + // Accesses via vector operations and GEPs that are consistent with the + // layout of a vector type. + Vector, + + // An integer bag-of-bits with bitwise operations for insertion and + // extraction. Any combination of types can be converted into this kind + // of scalar. + Integer + } ScalarKind; + /// VectorTy - This tracks the type that we should promote the vector to if /// it is possible to turn it into a vector. This starts out null, and if it /// isn't possible to turn into a vector type, it gets set to VoidTy. - const Type *VectorTy; - - /// HadAVector - True if there is at least one vector access to the alloca. - /// We don't want to turn random arrays into vectors and use vector element - /// insert/extract, but if there are element accesses to something that is - /// also declared as a vector, we do want to promote to a vector. - bool HadAVector; + const VectorType *VectorTy; /// HadNonMemTransferAccess - True if there is at least one access to the /// alloca that is not a MemTransferInst. 
We don't want to turn structs into @@ -245,14 +260,14 @@ class ConvertToScalarInfo { public: explicit ConvertToScalarInfo(unsigned Size, const TargetData &td) - : AllocaSize(Size), TD(td), IsNotTrivial(false), VectorTy(0), - HadAVector(false), HadNonMemTransferAccess(false) { } + : AllocaSize(Size), TD(td), IsNotTrivial(false), ScalarKind(Unknown), + VectorTy(0), HadNonMemTransferAccess(false) { } AllocaInst *TryConvert(AllocaInst *AI); private: bool CanConvertToScalar(Value *V, uint64_t Offset); - void MergeInType(const Type *In, uint64_t Offset, bool IsLoadOrStore); + void MergeInTypeForLoadOrStore(const Type *In, uint64_t Offset); bool MergeInVectorType(const VectorType *VInTy, uint64_t Offset); void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset); @@ -273,6 +288,11 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { if (!CanConvertToScalar(AI, 0) || !IsNotTrivial) return 0; + // If an alloca has only memset / memcpy uses, it may still have an Unknown + // ScalarKind. Treat it as an Integer below. + if (ScalarKind == Unknown) + ScalarKind = Integer; + // If we were able to find a vector type that can handle this with // insert/extract elements, and if there was at least one use that had // a vector type, promote this to a vector. We don't want to promote @@ -280,14 +300,15 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { // we just get a lot of insert/extracts. If at least one vector is // involved, then we probably really do have a union of vector/array. const Type *NewTy; - if (VectorTy && VectorTy->isVectorTy() && HadAVector) { + if (ScalarKind == Vector) { + assert(VectorTy && "Missing type for vector scalar."); DEBUG(dbgs() << "CONVERT TO VECTOR: " << *AI << "\n TYPE = " << *VectorTy << '\n'); NewTy = VectorTy; // Use the vector type. } else { unsigned BitWidth = AllocaSize * 8; - if (!HadAVector && !HadNonMemTransferAccess && - !TD.fitsInLegalInteger(BitWidth)) + if ((ScalarKind == ImplicitVector || ScalarKind == Integer) && + !HadNonMemTransferAccess && !TD.fitsInLegalInteger(BitWidth)) return 0; DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n"); @@ -299,8 +320,9 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { return NewAI; } -/// MergeInType - Add the 'In' type to the accumulated vector type (VectorTy) -/// so far at the offset specified by Offset (which is specified in bytes). +/// MergeInTypeForLoadOrStore - Add the 'In' type to the accumulated vector type +/// (VectorTy) so far at the offset specified by Offset (which is specified in +/// bytes). /// /// There are three cases we handle here: /// 1) A union of vector types of the same size and potentially its elements. @@ -315,11 +337,11 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { /// large) integer type with extract and insert operations where the loads /// and stores would mutate the memory. We mark this by setting VectorTy /// to VoidTy. -void ConvertToScalarInfo::MergeInType(const Type *In, uint64_t Offset, - bool IsLoadOrStore) { +void ConvertToScalarInfo::MergeInTypeForLoadOrStore(const Type *In, + uint64_t Offset) { // If we already decided to turn this into a blob of integer memory, there is // nothing to be done. - if (VectorTy && VectorTy->isVoidTy()) + if (ScalarKind == Integer) return; // If this could be contributing to a vector, analyze it. 
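Aside: the hunk below decides whether a scalar access at a byte offset is still consistent with viewing the whole alloca as a vector. A minimal standalone sketch of that divisibility arithmetic (plain C++, no LLVM APIs; the helper name is illustrative, not part of the patch):

#include <cstdint>
#include <cstdio>

// Returns the implied element count of <N x elt> for the alloca, or 0 when
// the access is incompatible (or is a full-width access, which is handled
// by a plain bitcast instead).
static uint64_t impliedVectorElements(uint64_t AllocaSize, // bytes
                                      uint64_t EltSize,    // bytes
                                      uint64_t Offset) {   // bytes
  if (EltSize == 0 || EltSize == AllocaSize) return 0;
  if (Offset % EltSize != 0 || AllocaSize % EltSize != 0) return 0;
  return AllocaSize / EltSize;
}

int main() {
  // A 4-byte float at offset 8 of a 16-byte alloca implies <4 x float>;
  // the same access at offset 6 implies nothing.
  std::printf("%llu %llu\n",
              (unsigned long long)impliedVectorElements(16, 4, 8),
              (unsigned long long)impliedVectorElements(16, 4, 6));
  return 0;
}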
@@ -335,33 +357,40 @@ void ConvertToScalarInfo::MergeInType(const Type *In, uint64_t Offset, // Full width accesses can be ignored, because they can always be turned // into bitcasts. unsigned EltSize = In->getPrimitiveSizeInBits()/8; - if (IsLoadOrStore && EltSize == AllocaSize) + if (EltSize == AllocaSize) return; + // If we're accessing something that could be an element of a vector, see // if the implied vector agrees with what we already have and if Offset is // compatible with it. if (Offset % EltSize == 0 && AllocaSize % EltSize == 0 && - (VectorTy == 0 || - cast<VectorType>(VectorTy)->getElementType() - ->getPrimitiveSizeInBits()/8 == EltSize)) { - if (VectorTy == 0) + (!VectorTy || Offset * 8 < VectorTy->getPrimitiveSizeInBits())) { + if (!VectorTy) { + ScalarKind = ImplicitVector; VectorTy = VectorType::get(In, AllocaSize/EltSize); - return; + return; + } + + unsigned CurrentEltSize = VectorTy->getElementType() + ->getPrimitiveSizeInBits()/8; + if (EltSize == CurrentEltSize) + return; + + if (In->isIntegerTy() && isPowerOf2_32(AllocaSize / EltSize)) + return; } } // Otherwise, we have a case that we can't handle with an optimized vector // form. We can still turn this into a large integer. - VectorTy = Type::getVoidTy(In->getContext()); + ScalarKind = Integer; + VectorTy = 0; } -/// MergeInVectorType - Handles the vector case of MergeInType, returning true -/// if the type was successfully merged and false otherwise. +/// MergeInVectorType - Handles the vector case of MergeInTypeForLoadOrStore, +/// returning true if the type was successfully merged and false otherwise. bool ConvertToScalarInfo::MergeInVectorType(const VectorType *VInTy, uint64_t Offset) { - // Remember if we saw a vector type. - HadAVector = true; - // TODO: Support nonzero offsets? if (Offset != 0) return false; @@ -373,19 +402,22 @@ bool ConvertToScalarInfo::MergeInVectorType(const VectorType *VInTy, // If this the first vector we see, remember the type so that we know the // element size. if (!VectorTy) { + ScalarKind = Vector; VectorTy = VInTy; return true; } - unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth(); + unsigned BitWidth = VectorTy->getBitWidth(); unsigned InBitWidth = VInTy->getBitWidth(); // Vectors of the same size can be converted using a simple bitcast. - if (InBitWidth == BitWidth && AllocaSize == (InBitWidth / 8)) + if (InBitWidth == BitWidth && AllocaSize == (InBitWidth / 8)) { + ScalarKind = Vector; return true; + } - const Type *ElementTy = cast<VectorType>(VectorTy)->getElementType(); - const Type *InElementTy = cast<VectorType>(VInTy)->getElementType(); + const Type *ElementTy = VectorTy->getElementType(); + const Type *InElementTy = VInTy->getElementType(); // Do not allow mixed integer and floating-point accesses from vectors of // different sizes. @@ -420,6 +452,7 @@ bool ConvertToScalarInfo::MergeInVectorType(const VectorType *VInTy, } // Pick the largest of the two vector types. 
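// [Aside, not part of the patch: for example, an alloca seen as both
//  <2 x float> and <4 x float> keeps the wider <4 x float> as VectorTy here;
//  the narrower access is later lowered as a partial-vector extract/insert.]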
+ ScalarKind = Vector; if (InBitWidth > BitWidth) VectorTy = VInTy; @@ -447,7 +480,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { if (LI->getType()->isX86_MMXTy()) return false; HadNonMemTransferAccess = true; - MergeInType(LI->getType(), Offset, true); + MergeInTypeForLoadOrStore(LI->getType(), Offset); continue; } @@ -458,7 +491,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { if (SI->getOperand(0)->getType()->isX86_MMXTy()) return false; HadNonMemTransferAccess = true; - MergeInType(SI->getOperand(0)->getType(), Offset, true); + MergeInTypeForLoadOrStore(SI->getOperand(0)->getType(), Offset); continue; } @@ -657,23 +690,60 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, } /// getScaledElementType - Gets a scaled element type for a partial vector -/// access of an alloca. The input type must be an integer or float, and -/// the resulting type must be an integer, float or double. -static const Type *getScaledElementType(const Type *OldTy, +/// access of an alloca. The input types must be integer or floating-point +/// scalar or vector types, and the resulting type is an integer, float or +/// double. +static const Type *getScaledElementType(const Type *Ty1, const Type *Ty2, unsigned NewBitWidth) { - assert((OldTy->isIntegerTy() || OldTy->isFloatTy()) && "Partial vector " - "accesses must be scaled from integer or float elements."); - - LLVMContext &Context = OldTy->getContext(); + bool IsFP1 = Ty1->isFloatingPointTy() || + (Ty1->isVectorTy() && + cast<VectorType>(Ty1)->getElementType()->isFloatingPointTy()); + bool IsFP2 = Ty2->isFloatingPointTy() || + (Ty2->isVectorTy() && + cast<VectorType>(Ty2)->getElementType()->isFloatingPointTy()); + + LLVMContext &Context = Ty1->getContext(); + + // Prefer floating-point types over integer types, as integer types may have + // been created by earlier scalar replacement. + if (IsFP1 || IsFP2) { + if (NewBitWidth == 32) + return Type::getFloatTy(Context); + if (NewBitWidth == 64) + return Type::getDoubleTy(Context); + } - if (OldTy->isIntegerTy()) - return Type::getIntNTy(Context, NewBitWidth); - if (NewBitWidth == 32) - return Type::getFloatTy(Context); - if (NewBitWidth == 64) - return Type::getDoubleTy(Context); + return Type::getIntNTy(Context, NewBitWidth); +} - llvm_unreachable("Invalid type for a partial vector access of an alloca!"); +/// CreateShuffleVectorCast - Creates a shuffle vector to convert one vector +/// to another vector of the same element type which has the same allocation +/// size but different primitive sizes (e.g. <3 x i32> and <4 x i32>). 
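/// [Aside, not part of the patch: for <3 x i32> -> <4 x i32> the mask built
///  below is <i32 0, i32 1, i32 2, i32 undef>, i.e.
///    %tmpV = shufflevector <3 x i32> %v, <3 x i32> undef,
///                          <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
///  and for <4 x i32> -> <3 x i32> it is simply <i32 0, i32 1, i32 2>.]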
+static Value *CreateShuffleVectorCast(Value *FromVal, const Type *ToType,
+                                      IRBuilder<> &Builder) {
+  const Type *FromType = FromVal->getType();
+  const VectorType *FromVTy = cast<VectorType>(FromType);
+  const VectorType *ToVTy = cast<VectorType>(ToType);
+  assert((ToVTy->getElementType() == FromVTy->getElementType()) &&
+         "Vectors must have the same element type");
+  Value *UnV = UndefValue::get(FromType);
+  unsigned numEltsFrom = FromVTy->getNumElements();
+  unsigned numEltsTo = ToVTy->getNumElements();
+
+  SmallVector<Constant*, 3> Args;
+  const Type* Int32Ty = Builder.getInt32Ty();
+  unsigned minNumElts = std::min(numEltsFrom, numEltsTo);
+  unsigned i;
+  for (i=0; i != minNumElts; ++i)
+    Args.push_back(ConstantInt::get(Int32Ty, i));
+
+  if (i < numEltsTo) {
+    Constant* UnC = UndefValue::get(Int32Ty);
+    for (; i != numEltsTo; ++i)
+      Args.push_back(UnC);
+  }
+  Constant *Mask = ConstantVector::get(Args);
+  return Builder.CreateShuffleVector(FromVal, UnV, Mask, "tmpV");
 }
 
 /// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer
@@ -690,34 +760,45 @@ Value *ConvertToScalarInfo::
 ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType,
                            uint64_t Offset, IRBuilder<> &Builder) {
   // If the load is of the whole new alloca, no conversion is needed.
-  if (FromVal->getType() == ToType && Offset == 0)
+  const Type *FromType = FromVal->getType();
+  if (FromType == ToType && Offset == 0)
     return FromVal;
 
   // If the result alloca is a vector type, this is either an element
   // access or a bitcast to another vector type of the same size.
-  if (const VectorType *VTy = dyn_cast<VectorType>(FromVal->getType())) {
+  if (const VectorType *VTy = dyn_cast<VectorType>(FromType)) {
+    unsigned FromTypeSize = TD.getTypeAllocSize(FromType);
     unsigned ToTypeSize = TD.getTypeAllocSize(ToType);
-    if (ToTypeSize == AllocaSize)
-      return Builder.CreateBitCast(FromVal, ToType, "tmp");
-
-    if (ToType->isVectorTy()) {
-      assert(isPowerOf2_64(AllocaSize / ToTypeSize) &&
-             "Partial vector access of an alloca must have a power-of-2 size "
-             "ratio.");
-      assert(Offset == 0 && "Can't extract a value of a smaller vector type "
-             "from a nonzero offset.");
-
-      const Type *ToElementTy = cast<VectorType>(ToType)->getElementType();
-      const Type *CastElementTy = getScaledElementType(ToElementTy,
+    if (FromTypeSize == ToTypeSize) {
+      // If the two types have the same primitive size, use a bit cast.
+      // Otherwise, they are two vectors with the same element type that
+      // have the same allocation size but different numbers of elements,
+      // so use a shuffle vector.
+      if (FromType->getPrimitiveSizeInBits() ==
+          ToType->getPrimitiveSizeInBits())
+        return Builder.CreateBitCast(FromVal, ToType, "tmp");
+      else
+        return CreateShuffleVectorCast(FromVal, ToType, Builder);
+    }
+
+    if (isPowerOf2_64(FromTypeSize / ToTypeSize)) {
+      assert(!(ToType->isVectorTy() && Offset != 0) && "Can't extract a value "
+             "of a smaller vector type at a nonzero offset.");
+
+      const Type *CastElementTy = getScaledElementType(FromType, ToType,
                                                        ToTypeSize * 8);
-      unsigned NumCastVectorElements = AllocaSize / ToTypeSize;
+      unsigned NumCastVectorElements = FromTypeSize / ToTypeSize;
 
       LLVMContext &Context = FromVal->getContext();
       const Type *CastTy = VectorType::get(CastElementTy,
                                            NumCastVectorElements);
       Value *Cast = Builder.CreateBitCast(FromVal, CastTy, "tmp");
+
+      unsigned EltSize = TD.getTypeAllocSizeInBits(CastElementTy);
+      unsigned Elt = Offset/EltSize;
+      assert(EltSize*Elt == Offset && "Invalid modulus in validity checking");
       Value *Extract = Builder.CreateExtractElement(Cast, ConstantInt::get(
-                                        Type::getInt32Ty(Context), 0), "tmp");
+                                        Type::getInt32Ty(Context), Elt), "tmp");
       return Builder.CreateBitCast(Extract, ToType, "tmp");
     }
 
@@ -837,16 +918,24 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
 
   // Changing the whole vector with memset or with an access of a different
   // vector type?
-  if (ValSize == VecSize)
-    return Builder.CreateBitCast(SV, AllocaType, "tmp");
+  if (ValSize == VecSize) {
+    // If the two types have the same primitive size, use a bit cast.
+    // Otherwise, they are two vectors with the same element type that
+    // have the same allocation size but different numbers of elements,
+    // so use a shuffle vector.
+    if (VTy->getPrimitiveSizeInBits() ==
+        SV->getType()->getPrimitiveSizeInBits())
+      return Builder.CreateBitCast(SV, AllocaType, "tmp");
+    else
+      return CreateShuffleVectorCast(SV, VTy, Builder);
+  }
 
-  if (SV->getType()->isVectorTy() && isPowerOf2_64(VecSize / ValSize)) {
-    assert(Offset == 0 && "Can't insert a value of a smaller vector type at "
-           "a nonzero offset.");
+  if (isPowerOf2_64(VecSize / ValSize)) {
+    assert(!(SV->getType()->isVectorTy() && Offset != 0) && "Can't insert a "
+           "value of a smaller vector type at a nonzero offset.");
 
-    const Type *ToElementTy =
-      cast<VectorType>(SV->getType())->getElementType();
-    const Type *CastElementTy = getScaledElementType(ToElementTy, ValSize);
+    const Type *CastElementTy = getScaledElementType(VTy, SV->getType(),
+                                                     ValSize);
     unsigned NumCastVectorElements = VecSize / ValSize;
 
     LLVMContext &Context = SV->getContext();
@@ -855,24 +944,23 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
 
     Value *OldCast = Builder.CreateBitCast(Old, OldCastTy, "tmp");
     Value *SVCast = Builder.CreateBitCast(SV, CastElementTy, "tmp");
+
+    unsigned EltSize = TD.getTypeAllocSizeInBits(CastElementTy);
+    unsigned Elt = Offset/EltSize;
+    assert(EltSize*Elt == Offset && "Invalid modulus in validity checking");
     Value *Insert = Builder.CreateInsertElement(OldCast, SVCast, ConstantInt::get(
-                                    Type::getInt32Ty(Context), 0), "tmp");
+                                    Type::getInt32Ty(Context), Elt), "tmp");
     return Builder.CreateBitCast(Insert, AllocaType, "tmp");
   }
 
-  uint64_t EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType());
-
   // Must be an element insertion.
+ assert(SV->getType() == VTy->getElementType()); + uint64_t EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType()); unsigned Elt = Offset/EltSize; - - if (SV->getType() != VTy->getElementType()) - SV = Builder.CreateBitCast(SV, VTy->getElementType(), "tmp"); - - SV = Builder.CreateInsertElement(Old, SV, + return Builder.CreateInsertElement(Old, SV, ConstantInt::get(Type::getInt32Ty(SV->getContext()), Elt), "tmp"); - return SV; } // If SV is a first-class aggregate value, insert each value recursively. @@ -990,8 +1078,9 @@ namespace { class AllocaPromoter : public LoadAndStorePromoter { AllocaInst *AI; public: - AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S) - : LoadAndStorePromoter(Insts, S), AI(0) {} + AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, + DbgDeclareInst *DD, DIBuilder *&DB) + : LoadAndStorePromoter(Insts, S, DD, DB), AI(0) {} void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) { // Remember which alloca we're promoting (for isInstInList). @@ -1268,7 +1357,6 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { return true; } - bool SROA::performPromotion(Function &F) { std::vector<AllocaInst*> Allocas; DominatorTree *DT = 0; @@ -1279,6 +1367,7 @@ bool SROA::performPromotion(Function &F) { bool Changed = false; SmallVector<Instruction*, 64> Insts; + DIBuilder *DIB = 0; while (1) { Allocas.clear(); @@ -1302,8 +1391,11 @@ bool SROA::performPromotion(Function &F) { for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E; ++UI) Insts.push_back(cast<Instruction>(*UI)); - - AllocaPromoter(Insts, SSA).run(AI, Insts); + + DbgDeclareInst *DDI = FindAllocaDbgDeclare(AI); + if (DDI && !DIB) + DIB = new DIBuilder(*AI->getParent()->getParent()->getParent()); + AllocaPromoter(Insts, SSA, DDI, DIB).run(AI, Insts); Insts.clear(); } } @@ -1311,6 +1403,10 @@ bool SROA::performPromotion(Function &F) { Changed = true; } + // FIXME: Is there a better way to handle the lazy initialization of DIB + // so that there doesn't need to be an explicit delete? 
+ delete DIB; + return Changed; } @@ -1770,9 +1866,10 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, // %insert = insertvalue { i32, i32 } %insert.0, i32 %load.1, 1 // (Also works for arrays instead of structs) Value *Insert = UndefValue::get(LIType); + IRBuilder<> Builder(LI); for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { - Value *Load = new LoadInst(NewElts[i], "load", LI); - Insert = InsertValueInst::Create(Insert, Load, i, "insert", LI); + Value *Load = Builder.CreateLoad(NewElts[i], "load"); + Insert = Builder.CreateInsertValue(Insert, Load, i, "insert"); } LI->replaceAllUsesWith(Insert); DeadInsts.push_back(LI); @@ -1797,9 +1894,10 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, // %val.1 = extractvalue { i32, i32 } %val, 1 // store i32 %val.1, i32* %alloc.1 // (Also works for arrays instead of structs) + IRBuilder<> Builder(SI); for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { - Value *Extract = ExtractValueInst::Create(Val, i, Val->getName(), SI); - new StoreInst(Extract, NewElts[i], SI); + Value *Extract = Builder.CreateExtractValue(Val, i, Val->getName()); + Builder.CreateStore(Extract, NewElts[i]); } DeadInsts.push_back(SI); } else if (SIType->isIntegerTy() && @@ -2420,19 +2518,22 @@ static bool isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, } if (CallSite CS = U) { - // If this is a readonly/readnone call site, then we know it is just a - // load and we can ignore it. - if (CS.onlyReadsMemory()) - continue; - // If this is the function being called then we treat it like a load and // ignore it. if (CS.isCallee(UI)) continue; + // If this is a readonly/readnone call site, then we know it is just a + // load (but one that potentially returns the value itself), so we can + // ignore it if we know that the value isn't captured. + unsigned ArgNo = CS.getArgumentNo(UI); + if (CS.onlyReadsMemory() && + (CS.getInstruction()->use_empty() || + CS.paramHasAttr(ArgNo+1, Attribute::NoCapture))) + continue; + // If this is being passed as a byval argument, the caller is making a // copy, so it is only a read of the alloca. - unsigned ArgNo = CS.getArgumentNo(UI); if (CS.paramHasAttr(ArgNo+1, Attribute::ByVal)) continue; } diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 0bcec6b..7e9cc80 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -73,7 +73,8 @@ static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) { if (UseLLVMTrap) { Function *TrapFn = Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap); - CallInst::Create(TrapFn, "", I); + CallInst *CallTrap = CallInst::Create(TrapFn, "", I); + CallTrap->setDebugLoc(I->getDebugLoc()); } new UnreachableInst(I->getContext(), I); @@ -95,6 +96,7 @@ static void ChangeToCall(InvokeInst *II) { NewCall->takeName(II); NewCall->setCallingConv(II->getCallingConv()); NewCall->setAttributes(II->getAttributes()); + NewCall->setDebugLoc(II->getDebugLoc()); II->replaceAllUsesWith(NewCall); // Follow the call by a branch to the normal destination. 
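Aside: the setDebugLoc calls added above follow a pattern this commit applies throughout. A hedged sketch of that pattern against the LLVM API of this era (the helper name is illustrative, not something the patch defines):

#include "llvm/Instruction.h"

// When a transform replaces Old with an equivalent New instruction, New
// should inherit Old's source location so debug info survives the rewrite.
static void replacePreservingDebugLoc(llvm::Instruction *Old,
                                      llvm::Instruction *New) {
  New->setDebugLoc(Old->getDebugLoc());
  Old->replaceAllUsesWith(New);
  Old->eraseFromParent();
}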
@@ -162,7 +164,7 @@ static bool MarkAliveBlocks(BasicBlock *BB,
         Changed = true;
       }
 
-      Changed |= ConstantFoldTerminator(BB);
+      Changed |= ConstantFoldTerminator(BB, true);
       for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
         Worklist.push_back(*SI);
     } while (!Worklist.empty());
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 5768ccb..e21eb9d 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -36,7 +36,7 @@
 //     evaluated each time through the tail recursion.  Safely keeping allocas
 //     in the entry block requires analysis to proves that the tail-called
 //     function does not read or write the stack object.
-// 2. Tail recursion is only performed if the call immediately preceeds the
+// 2. Tail recursion is only performed if the call immediately precedes the
 //    return instruction.  It's possible that there could be a jump between
 //    the call and the return.
 // 3. There can be intervening operations between the call and the return that
@@ -59,6 +59,7 @@
 #include "llvm/Function.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
 #include "llvm/Pass.h"
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/InlineCost.h"
@@ -209,10 +210,10 @@ bool TailCallElim::runOnFunction(Function &F) {
     }
   }
 
-  // Finally, if this function contains no non-escaping allocas, mark all calls
-  // in the function as eligible for tail calls (there is no stack memory for
-  // them to access).
-  if (!FunctionContainsEscapingAllocas)
+  // Finally, if this function contains no escaping allocas and does not
+  // call setjmp, mark all calls in the function as eligible for tail calls
+  // (there is no stack memory for them to access).
+  if (!FunctionContainsEscapingAllocas && !F.callsFunctionThatReturnsTwice())
    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
        if (CallInst *CI = dyn_cast<CallInst>(I)) {
@@ -433,7 +434,7 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
     if (CanMoveAboveCall(BBI, CI)) continue;
 
     // If we can't move the instruction above the call, it might be because it
-    // is an associative and commutative operation that could be tranformed
+    // is an associative and commutative operation that could be transformed
     // using accumulator recursion elimination.  Check to see if this is the
     // case, and if so, remember the initial accumulator value for later.
     if ((AccumulatorRecursionEliminationInitVal =
@@ -573,7 +574,9 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
 
   // Now that all of the PHI nodes are in place, remove the call and
   // ret instructions, replacing them with an unconditional branch.
-  BranchInst::Create(OldEntry, Ret);
+  BranchInst *NewBI = BranchInst::Create(OldEntry, Ret);
+  NewBI->setDebugLoc(CI->getDebugLoc());
+
  BB->getInstList().erase(Ret);  // Remove return.
  BB->getInstList().erase(CI);   // Remove call.
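// [Aside, not part of the patch: the overall transformation this function
//  performs, sketched in plain C++ for a self-recursive accumulator case:
//    int fact(int n, int acc) {           int fact(int n, int acc) {
//      if (n <= 1) return acc;       ==>    while (n > 1) { acc *= n; --n; }
//      return fact(n - 1, acc * n);         return acc;
//    }                                    }
//  At the IR level, the recursive call plus its ret become the branch created
//  above, with phis in the split-off old entry block carrying n-1 and acc*n.]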
++NumEliminated; diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp index f8c3326..92464e8 100644 --- a/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -538,3 +538,13 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB, UncondBranch->eraseFromParent(); return cast<ReturnInst>(NewRet); } + +/// GetFirstDebugLocInBasicBlock - Return first valid DebugLoc entry in a +/// given basic block. +DebugLoc llvm::GetFirstDebugLocInBasicBlock(const BasicBlock *BB) { + if (const Instruction *I = BB->getFirstNonPHI()) + return I->getDebugLoc(); + // Scanning entire block may be too expensive, if the first instruction + // does not have valid location info. + return DebugLoc(); +} diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp index 14a3c95..d6206a3 100644 --- a/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -56,7 +56,7 @@ char BreakCriticalEdges::ID = 0; INITIALIZE_PASS(BreakCriticalEdges, "break-crit-edges", "Break critical edges in CFG", false, false) -// Publically exposed interface to pass... +// Publicly exposed interface to pass... char &llvm::BreakCriticalEdgesID = BreakCriticalEdges::ID; FunctionPass *llvm::createBreakCriticalEdgesPass() { return new BreakCriticalEdges(); @@ -180,7 +180,8 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, BasicBlock *NewBB = BasicBlock::Create(TI->getContext(), TIBB->getName() + "." + DestBB->getName() + "_crit_edge"); // Create our unconditional branch. - BranchInst::Create(DestBB, NewBB); + BranchInst *NewBI = BranchInst::Create(DestBB, NewBB); + NewBI->setDebugLoc(TI->getDebugLoc()); // Branch to the new block, breaking the edge. TI->setSuccessor(SuccNum, NewBB); diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp index 4a90751..14bb17f 100644 --- a/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/lib/Transforms/Utils/BuildLibCalls.cpp @@ -362,12 +362,8 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) { Function *Callee = CI->getCalledFunction(); StringRef Name = Callee->getName(); const FunctionType *FT = Callee->getFunctionType(); - BasicBlock *BB = CI->getParent(); LLVMContext &Context = CI->getParent()->getContext(); - IRBuilder<> B(Context); - - // Set the builder to the instruction after the call. - B.SetInsertPoint(BB, CI); + IRBuilder<> B(CI); if (Name == "__memcpy_chk") { // Check if this has the right signature. diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt index 5b76bb2..2df534f 100644 --- a/lib/Transforms/Utils/CMakeLists.txt +++ b/lib/Transforms/Utils/CMakeLists.txt @@ -5,7 +5,6 @@ add_llvm_library(LLVMTransformUtils BreakCriticalEdges.cpp BuildLibCalls.cpp CloneFunction.cpp - CloneLoop.cpp CloneModule.cpp CodeExtractor.cpp DemoteRegToStack.cpp diff --git a/lib/Transforms/Utils/CloneLoop.cpp b/lib/Transforms/Utils/CloneLoop.cpp deleted file mode 100644 index 87dd141..0000000 --- a/lib/Transforms/Utils/CloneLoop.cpp +++ /dev/null @@ -1,128 +0,0 @@ -//===- CloneLoop.cpp - Clone loop nest ------------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// This file implements the CloneLoop interface which makes a copy of a loop. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/BasicBlock.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/Dominators.h" - - -using namespace llvm; - -/// CloneDominatorInfo - Clone a basic block's dominator tree. It is expected -/// that the basic block is already cloned. -static void CloneDominatorInfo(BasicBlock *BB, - ValueToValueMapTy &VMap, - DominatorTree *DT) { - - assert (DT && "DominatorTree is not available"); - ValueToValueMapTy::iterator BI = VMap.find(BB); - assert (BI != VMap.end() && "BasicBlock clone is missing"); - BasicBlock *NewBB = cast<BasicBlock>(BI->second); - - // NewBB already got dominator info. - if (DT->getNode(NewBB)) - return; - - assert (DT->getNode(BB) && "BasicBlock does not have dominator info"); - // Entry block is not expected here. Infinite loops are not to cloned. - assert (DT->getNode(BB)->getIDom() && "BasicBlock does not have immediate dominator"); - BasicBlock *BBDom = DT->getNode(BB)->getIDom()->getBlock(); - - // NewBB's dominator is either BB's dominator or BB's dominator's clone. - BasicBlock *NewBBDom = BBDom; - ValueToValueMapTy::iterator BBDomI = VMap.find(BBDom); - if (BBDomI != VMap.end()) { - NewBBDom = cast<BasicBlock>(BBDomI->second); - if (!DT->getNode(NewBBDom)) - CloneDominatorInfo(BBDom, VMap, DT); - } - DT->addNewBlock(NewBB, NewBBDom); -} - -/// CloneLoop - Clone Loop. Clone dominator info. Populate VMap -/// using old blocks to new blocks mapping. -Loop *llvm::CloneLoop(Loop *OrigL, LPPassManager *LPM, LoopInfo *LI, - ValueToValueMapTy &VMap, Pass *P) { - - DominatorTree *DT = NULL; - if (P) - DT = P->getAnalysisIfAvailable<DominatorTree>(); - - SmallVector<BasicBlock *, 16> NewBlocks; - - // Populate loop nest. - SmallVector<Loop *, 8> LoopNest; - LoopNest.push_back(OrigL); - - - Loop *NewParentLoop = NULL; - do { - Loop *L = LoopNest.pop_back_val(); - Loop *NewLoop = new Loop(); - - if (!NewParentLoop) - NewParentLoop = NewLoop; - - LPM->insertLoop(NewLoop, L->getParentLoop()); - - // Clone Basic Blocks. - for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); - I != E; ++I) { - BasicBlock *BB = *I; - BasicBlock *NewBB = CloneBasicBlock(BB, VMap, ".clone"); - VMap[BB] = NewBB; - if (P) - LPM->cloneBasicBlockSimpleAnalysis(BB, NewBB, L); - NewLoop->addBasicBlockToLoop(NewBB, LI->getBase()); - NewBlocks.push_back(NewBB); - } - - // Clone dominator info. - if (DT) - for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); - I != E; ++I) { - BasicBlock *BB = *I; - CloneDominatorInfo(BB, VMap, DT); - } - - // Process sub loops - for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) - LoopNest.push_back(*I); - } while (!LoopNest.empty()); - - // Remap instructions to reference operands from VMap. 
-  for(SmallVector<BasicBlock *, 16>::iterator NBItr = NewBlocks.begin(),
-      NBE = NewBlocks.end();  NBItr != NBE; ++NBItr) {
-    BasicBlock *NB = *NBItr;
-    for(BasicBlock::iterator BI = NB->begin(), BE = NB->end();
-        BI != BE; ++BI) {
-      Instruction *Insn = BI;
-      for (unsigned index = 0, num_ops = Insn->getNumOperands();
-           index != num_ops; ++index) {
-        Value *Op = Insn->getOperand(index);
-        ValueToValueMapTy::iterator OpItr = VMap.find(Op);
-        if (OpItr != VMap.end())
-          Insn->setOperand(index, OpItr->second);
-      }
-    }
-  }
-
-  BasicBlock *Latch = OrigL->getLoopLatch();
-  Function *F = Latch->getParent();
-  F->getBasicBlockList().insert(OrigL->getHeader(),
-                                NewBlocks.begin(), NewBlocks.end());
-
-
-  return NewParentLoop;
-}
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index 46601b4..8c133ea 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -157,7 +157,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) {
       TI->replaceUsesOfWith(OldPred, NewBB);
   }
 
-  // Okay, everthing within the region is now branching to the right block, we
+  // Okay, everything within the region is now branching to the right block, we
   // just have to update the PHI nodes now, inserting PHI nodes into NewBB.
   for (AfterPHIs = OldPred->begin(); isa<PHINode>(AfterPHIs); ++AfterPHIs) {
     PHINode *PN = cast<PHINode>(AfterPHIs);
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index 2cb1d3b..946e62f 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -10,6 +10,13 @@
 // This file implements inlining of a function into a call site, resolving
 // parameters and the return value as appropriate.
 //
+// The code in this file for handling inlines through invoke
+// instructions preserves semantics only under some assumptions about
+// the behavior of unwinders which correspond to gcc-style libUnwind
+// exception personality functions.  Eventually the IR will be
+// improved to make this unnecessary, but until then, this code is
+// marked [LIBUNWIND].
+//
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/Cloning.h"
@@ -28,6 +35,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/CallSite.h"
+#include "llvm/Support/IRBuilder.h"
 using namespace llvm;
 
 bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI) {
@@ -37,6 +45,372 @@ bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI) {
   return InlineFunction(CallSite(II), IFI);
 }
 
+/// [LIBUNWIND] Look for an llvm.eh.exception call in the given block.
+static EHExceptionInst *findExceptionInBlock(BasicBlock *bb) {
+  for (BasicBlock::iterator i = bb->begin(), e = bb->end(); i != e; i++) {
+    EHExceptionInst *exn = dyn_cast<EHExceptionInst>(i);
+    if (exn) return exn;
+  }
+
+  return 0;
+}
+
+/// [LIBUNWIND] Look for the 'best' llvm.eh.selector instruction for
+/// the given llvm.eh.exception call.
+static EHSelectorInst *findSelectorForException(EHExceptionInst *exn) {
+  BasicBlock *exnBlock = exn->getParent();
+
+  EHSelectorInst *outOfBlockSelector = 0;
+  for (Instruction::use_iterator
+         ui = exn->use_begin(), ue = exn->use_end(); ui != ue; ++ui) {
+    EHSelectorInst *sel = dyn_cast<EHSelectorInst>(*ui);
+    if (!sel) continue;
+
+    // Immediately accept an eh.selector in the same block as the
+    // exception call.
+ if (sel->getParent() == exnBlock) return sel; + + // Otherwise, use the first selector we see. + if (!outOfBlockSelector) outOfBlockSelector = sel; + } + + return outOfBlockSelector; +} + +/// [LIBUNWIND] Find the (possibly absent) call to @llvm.eh.selector +/// in the given landing pad. In principle, llvm.eh.exception is +/// required to be in the landing pad; in practice, SplitCriticalEdge +/// can break that invariant, and then inlining can break it further. +/// There's a real need for a reliable solution here, but until that +/// happens, we have some fragile workarounds here. +static EHSelectorInst *findSelectorForLandingPad(BasicBlock *lpad) { + // Look for an exception call in the actual landing pad. + EHExceptionInst *exn = findExceptionInBlock(lpad); + if (exn) return findSelectorForException(exn); + + // Okay, if that failed, look for one in an obvious successor. If + // we find one, we'll fix the IR by moving things back to the + // landing pad. + + bool dominates = true; // does the lpad dominate the exn call + BasicBlock *nonDominated = 0; // if not, the first non-dominated block + BasicBlock *lastDominated = 0; // and the block which branched to it + + BasicBlock *exnBlock = lpad; + + // We need to protect against lpads that lead into infinite loops. + SmallPtrSet<BasicBlock*,4> visited; + visited.insert(exnBlock); + + do { + // We're not going to apply this hack to anything more complicated + // than a series of unconditional branches, so if the block + // doesn't terminate in an unconditional branch, just fail. More + // complicated cases can arise when, say, sinking a call into a + // split unwind edge and then inlining it; but that can do almost + // *anything* to the CFG, including leaving the selector + // completely unreachable. The only way to fix that properly is + // to (1) prohibit transforms which move the exception or selector + // values away from the landing pad, e.g. by producing them with + // instructions that are pinned to an edge like a phi, or + // producing them with not-really-instructions, and (2) making + // transforms which split edges deal with that. + BranchInst *branch = dyn_cast<BranchInst>(&exnBlock->back()); + if (!branch || branch->isConditional()) return 0; + + BasicBlock *successor = branch->getSuccessor(0); + + // Fail if we found an infinite loop. + if (!visited.insert(successor)) return 0; + + // If the successor isn't dominated by exnBlock: + if (!successor->getSinglePredecessor()) { + // We don't want to have to deal with threading the exception + // through multiple levels of phi, so give up if we've already + // followed a non-dominating edge. + if (!dominates) return 0; + + // Otherwise, remember this as a non-dominating edge. + dominates = false; + nonDominated = successor; + lastDominated = exnBlock; + } + + exnBlock = successor; + + // Can we stop here? + exn = findExceptionInBlock(exnBlock); + } while (!exn); + + // Look for a selector call for the exception we found. + EHSelectorInst *selector = findSelectorForException(exn); + if (!selector) return 0; + + // The easy case is when the landing pad still dominates the + // exception call, in which case we can just move both calls back to + // the landing pad. + if (dominates) { + selector->moveBefore(lpad->getFirstNonPHI()); + exn->moveBefore(selector); + return selector; + } + + // Otherwise, we have to split at the first non-dominating block. 
+ // The CFG looks basically like this: + // lpad: + // phis_0 + // insnsAndBranches_1 + // br label %nonDominated + // nonDominated: + // phis_2 + // insns_3 + // %exn = call i8* @llvm.eh.exception() + // insnsAndBranches_4 + // %selector = call @llvm.eh.selector(i8* %exn, ... + // We need to turn this into: + // lpad: + // phis_0 + // %exn0 = call i8* @llvm.eh.exception() + // %selector0 = call @llvm.eh.selector(i8* %exn0, ... + // insnsAndBranches_1 + // br label %split // from lastDominated + // nonDominated: + // phis_2 (without edge from lastDominated) + // %exn1 = call i8* @llvm.eh.exception() + // %selector1 = call i8* @llvm.eh.selector(i8* %exn1, ... + // br label %split + // split: + // phis_2 (edge from lastDominated, edge from split) + // %exn = phi ... + // %selector = phi ... + // insns_3 + // insnsAndBranches_4 + + assert(nonDominated); + assert(lastDominated); + + // First, make clones of the intrinsics to go in lpad. + EHExceptionInst *lpadExn = cast<EHExceptionInst>(exn->clone()); + EHSelectorInst *lpadSelector = cast<EHSelectorInst>(selector->clone()); + lpadSelector->setArgOperand(0, lpadExn); + lpadSelector->insertBefore(lpad->getFirstNonPHI()); + lpadExn->insertBefore(lpadSelector); + + // Split the non-dominated block. + BasicBlock *split = + nonDominated->splitBasicBlock(nonDominated->getFirstNonPHI(), + nonDominated->getName() + ".lpad-fix"); + + // Redirect the last dominated branch there. + cast<BranchInst>(lastDominated->back()).setSuccessor(0, split); + + // Move the existing intrinsics to the end of the old block. + selector->moveBefore(&nonDominated->back()); + exn->moveBefore(selector); + + Instruction *splitIP = &split->front(); + + // For all the phis in nonDominated, make a new phi in split to join + // that phi with the edge from lastDominated. + for (BasicBlock::iterator + i = nonDominated->begin(), e = nonDominated->end(); i != e; ++i) { + PHINode *phi = dyn_cast<PHINode>(i); + if (!phi) break; + + PHINode *splitPhi = PHINode::Create(phi->getType(), 2, phi->getName(), + splitIP); + phi->replaceAllUsesWith(splitPhi); + splitPhi->addIncoming(phi, nonDominated); + splitPhi->addIncoming(phi->removeIncomingValue(lastDominated), + lastDominated); + } + + // Make new phis for the exception and selector. + PHINode *exnPhi = PHINode::Create(exn->getType(), 2, "", splitIP); + exn->replaceAllUsesWith(exnPhi); + selector->setArgOperand(0, exn); // except for this use + exnPhi->addIncoming(exn, nonDominated); + exnPhi->addIncoming(lpadExn, lastDominated); + + PHINode *selectorPhi = PHINode::Create(selector->getType(), 2, "", splitIP); + selector->replaceAllUsesWith(selectorPhi); + selectorPhi->addIncoming(selector, nonDominated); + selectorPhi->addIncoming(lpadSelector, lastDominated); + + return lpadSelector; +} + +namespace { + /// A class for recording information about inlining through an invoke. + class InvokeInliningInfo { + BasicBlock *OuterUnwindDest; + EHSelectorInst *OuterSelector; + BasicBlock *InnerUnwindDest; + PHINode *InnerExceptionPHI; + PHINode *InnerSelectorPHI; + SmallVector<Value*, 8> UnwindDestPHIValues; + + public: + InvokeInliningInfo(InvokeInst *II) : + OuterUnwindDest(II->getUnwindDest()), OuterSelector(0), + InnerUnwindDest(0), InnerExceptionPHI(0), InnerSelectorPHI(0) { + + // If there are PHI nodes in the unwind destination block, we + // need to keep track of which values came into them from the + // invoke before removing the edge from this block. 
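// [Aside, not part of the patch: if the landing pad contains, say,
//    %p = phi i32 [ %v, %invokeBB ], ...
//  the constructor below records %v; each inlined block BB that later gains
//  an unwind edge to the landing pad then re-adds it as [ %v, %BB ] via
//  addIncomingPHIValuesFor.]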
+ llvm::BasicBlock *invokeBB = II->getParent(); + for (BasicBlock::iterator I = OuterUnwindDest->begin(); + isa<PHINode>(I); ++I) { + // Save the value to use for this edge. + PHINode *phi = cast<PHINode>(I); + UnwindDestPHIValues.push_back(phi->getIncomingValueForBlock(invokeBB)); + } + } + + /// The outer unwind destination is the target of unwind edges + /// introduced for calls within the inlined function. + BasicBlock *getOuterUnwindDest() const { + return OuterUnwindDest; + } + + EHSelectorInst *getOuterSelector() { + if (!OuterSelector) + OuterSelector = findSelectorForLandingPad(OuterUnwindDest); + return OuterSelector; + } + + BasicBlock *getInnerUnwindDest(); + + bool forwardEHResume(CallInst *call, BasicBlock *src); + + /// Add incoming-PHI values to the unwind destination block for + /// the given basic block, using the values for the original + /// invoke's source block. + void addIncomingPHIValuesFor(BasicBlock *BB) const { + addIncomingPHIValuesForInto(BB, OuterUnwindDest); + } + + void addIncomingPHIValuesForInto(BasicBlock *src, BasicBlock *dest) const { + BasicBlock::iterator I = dest->begin(); + for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) { + PHINode *phi = cast<PHINode>(I); + phi->addIncoming(UnwindDestPHIValues[i], src); + } + } + }; +} + +/// Get or create a target for the branch out of rewritten calls to +/// llvm.eh.resume. +BasicBlock *InvokeInliningInfo::getInnerUnwindDest() { + if (InnerUnwindDest) return InnerUnwindDest; + + // Find and hoist the llvm.eh.exception and llvm.eh.selector calls + // in the outer landing pad to immediately following the phis. + EHSelectorInst *selector = getOuterSelector(); + if (!selector) return 0; + + // The call to llvm.eh.exception *must* be in the landing pad. + Instruction *exn = cast<Instruction>(selector->getArgOperand(0)); + assert(exn->getParent() == OuterUnwindDest); + + // TODO: recognize when we've already done this, so that we don't + // get a linear number of these when inlining calls into lots of + // invokes with the same landing pad. + + // Do the hoisting. + Instruction *splitPoint = exn->getParent()->getFirstNonPHI(); + assert(splitPoint != selector && "selector-on-exception dominance broken!"); + if (splitPoint == exn) { + selector->removeFromParent(); + selector->insertAfter(exn); + splitPoint = selector->getNextNode(); + } else { + exn->moveBefore(splitPoint); + selector->moveBefore(splitPoint); + } + + // Split the landing pad. + InnerUnwindDest = OuterUnwindDest->splitBasicBlock(splitPoint, + OuterUnwindDest->getName() + ".body"); + + // The number of incoming edges we expect to the inner landing pad. + const unsigned phiCapacity = 2; + + // Create corresponding new phis for all the phis in the outer landing pad. + BasicBlock::iterator insertPoint = InnerUnwindDest->begin(); + BasicBlock::iterator I = OuterUnwindDest->begin(); + for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) { + PHINode *outerPhi = cast<PHINode>(I); + PHINode *innerPhi = PHINode::Create(outerPhi->getType(), phiCapacity, + outerPhi->getName() + ".lpad-body", + insertPoint); + outerPhi->replaceAllUsesWith(innerPhi); + innerPhi->addIncoming(outerPhi, OuterUnwindDest); + } + + // Create a phi for the exception value... 
+ InnerExceptionPHI = PHINode::Create(exn->getType(), phiCapacity, + "exn.lpad-body", insertPoint); + exn->replaceAllUsesWith(InnerExceptionPHI); + selector->setArgOperand(0, exn); // restore this use + InnerExceptionPHI->addIncoming(exn, OuterUnwindDest); + + // ...and the selector. + InnerSelectorPHI = PHINode::Create(selector->getType(), phiCapacity, + "selector.lpad-body", insertPoint); + selector->replaceAllUsesWith(InnerSelectorPHI); + InnerSelectorPHI->addIncoming(selector, OuterUnwindDest); + + // All done. + return InnerUnwindDest; +} + +/// [LIBUNWIND] Try to forward the given call, which logically occurs +/// at the end of the given block, as a branch to the inner unwind +/// block. Returns true if the call was forwarded. +bool InvokeInliningInfo::forwardEHResume(CallInst *call, BasicBlock *src) { + // First, check whether this is a call to the intrinsic. + Function *fn = dyn_cast<Function>(call->getCalledValue()); + if (!fn || fn->getName() != "llvm.eh.resume") + return false; + + // At this point, we need to return true on all paths, because + // otherwise we'll construct an invoke of the intrinsic, which is + // not well-formed. + + // Try to find or make an inner unwind dest, which will fail if we + // can't find a selector call for the outer unwind dest. + BasicBlock *dest = getInnerUnwindDest(); + bool hasSelector = (dest != 0); + + // If we failed, just use the outer unwind dest, dropping the + // exception and selector on the floor. + if (!hasSelector) + dest = OuterUnwindDest; + + // Make a branch. + BranchInst::Create(dest, src); + + // Update the phis in the destination. They were inserted in an + // order which makes this work. + addIncomingPHIValuesForInto(src, dest); + + if (hasSelector) { + InnerExceptionPHI->addIncoming(call->getArgOperand(0), src); + InnerSelectorPHI->addIncoming(call->getArgOperand(1), src); + } + + return true; +} + +/// [LIBUNWIND] Check whether this selector is "only cleanups": +/// call i32 @llvm.eh.selector(blah, blah, i32 0) +static bool isCleanupOnlySelector(EHSelectorInst *selector) { + if (selector->getNumArgOperands() != 3) return false; + ConstantInt *val = dyn_cast<ConstantInt>(selector->getArgOperand(2)); + return (val && val->isZero()); +} /// HandleCallsInBlockInlinedThroughInvoke - When we inline a basic block into /// an invoke, we have to turn all of the calls that can throw into @@ -44,9 +418,9 @@ bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI) { /// it rewrites them to be invokes that jump to InvokeDest and fills in the PHI /// nodes in that block with the values specified in InvokeDestPHIValues. /// -static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, - BasicBlock *InvokeDest, - const SmallVectorImpl<Value*> &InvokeDestPHIValues) { +/// Returns true to indicate that the next block should be skipped. +static bool HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, + InvokeInliningInfo &Invoke) { for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) { Instruction *I = BBI++; @@ -54,6 +428,37 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, // instructions require no special handling. CallInst *CI = dyn_cast<CallInst>(I); if (CI == 0) continue; + + // LIBUNWIND: merge selector instructions. 
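// [Aside, not part of the patch: merging an inner
//    %isel = call i32 @llvm.eh.selector(i8* %exn, i8* %pers, i8* @tiA)
//  with an outer
//    %osel = call i32 @llvm.eh.selector(i8* %exn0, i8* %pers, i8* @tiB)
//  appends the outer clauses (argument operands 2 onward) to the inner call,
//  yielding @llvm.eh.selector(i8* %exn, i8* %pers, i8* @tiA, i8* @tiB).]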
+ if (EHSelectorInst *Inner = dyn_cast<EHSelectorInst>(CI)) { + EHSelectorInst *Outer = Invoke.getOuterSelector(); + if (!Outer) continue; + + bool innerIsOnlyCleanup = isCleanupOnlySelector(Inner); + bool outerIsOnlyCleanup = isCleanupOnlySelector(Outer); + + // If both selectors contain only cleanups, we don't need to do + // anything. TODO: this is really just a very specific instance + // of a much more general optimization. + if (innerIsOnlyCleanup && outerIsOnlyCleanup) continue; + + // Otherwise, we just append the outer selector to the inner selector. + SmallVector<Value*, 16> NewSelector; + for (unsigned i = 0, e = Inner->getNumArgOperands(); i != e; ++i) + NewSelector.push_back(Inner->getArgOperand(i)); + for (unsigned i = 2, e = Outer->getNumArgOperands(); i != e; ++i) + NewSelector.push_back(Outer->getArgOperand(i)); + + CallInst *NewInner = + IRBuilder<>(Inner).CreateCall(Inner->getCalledValue(), + NewSelector.begin(), + NewSelector.end()); + // No need to copy attributes, calling convention, etc. + NewInner->takeName(Inner); + Inner->replaceAllUsesWith(NewInner); + Inner->eraseFromParent(); + continue; + } // If this call cannot unwind, don't convert it to an invoke. if (CI->doesNotThrow()) @@ -62,37 +467,45 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, // Convert this function call into an invoke instruction. // First, split the basic block. BasicBlock *Split = BB->splitBasicBlock(CI, CI->getName()+".noexc"); - - // Next, create the new invoke instruction, inserting it at the end - // of the old basic block. + + // Delete the unconditional branch inserted by splitBasicBlock + BB->getInstList().pop_back(); + + // LIBUNWIND: If this is a call to @llvm.eh.resume, just branch + // directly to the new landing pad. + if (Invoke.forwardEHResume(CI, BB)) { + // TODO: 'Split' is now unreachable; clean it up. + + // We want to leave the original call intact so that the call + // graph and other structures won't get misled. We also have to + // avoid processing the next block, or we'll iterate here forever. + return true; + } + + // Otherwise, create the new invoke instruction. ImmutableCallSite CS(CI); SmallVector<Value*, 8> InvokeArgs(CS.arg_begin(), CS.arg_end()); InvokeInst *II = - InvokeInst::Create(CI->getCalledValue(), Split, InvokeDest, + InvokeInst::Create(CI->getCalledValue(), Split, + Invoke.getOuterUnwindDest(), InvokeArgs.begin(), InvokeArgs.end(), - CI->getName(), BB->getTerminator()); + CI->getName(), BB); II->setCallingConv(CI->getCallingConv()); II->setAttributes(CI->getAttributes()); // Make sure that anything using the call now uses the invoke! This also // updates the CallGraph if present, because it uses a WeakVH. CI->replaceAllUsesWith(II); - - // Delete the unconditional branch inserted by splitBasicBlock - BB->getInstList().pop_back(); + Split->getInstList().pop_front(); // Delete the original call - + // Update any PHI nodes in the exceptional block to indicate that // there is now a new entry in them. - unsigned i = 0; - for (BasicBlock::iterator I = InvokeDest->begin(); - isa<PHINode>(I); ++I, ++i) - cast<PHINode>(I)->addIncoming(InvokeDestPHIValues[i], BB); - - // This basic block is now complete, the caller will continue scanning the - // next one. 
- return; + Invoke.addIncomingPHIValuesFor(BB); + return false; } + + return false; } @@ -106,17 +519,6 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, ClonedCodeInfo &InlinedCodeInfo) { BasicBlock *InvokeDest = II->getUnwindDest(); - SmallVector<Value*, 8> InvokeDestPHIValues; - - // If there are PHI nodes in the unwind destination block, we need to - // keep track of which values came into them from this invoke, then remove - // the entry for this block. - BasicBlock *InvokeBlock = II->getParent(); - for (BasicBlock::iterator I = InvokeDest->begin(); isa<PHINode>(I); ++I) { - PHINode *PN = cast<PHINode>(I); - // Save the value to use for this edge. - InvokeDestPHIValues.push_back(PN->getIncomingValueForBlock(InvokeBlock)); - } Function *Caller = FirstNewBlock->getParent(); @@ -132,11 +534,17 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, InvokeDest->removePredecessor(II->getParent()); return; } + + InvokeInliningInfo Invoke(II); for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E; ++BB){ if (InlinedCodeInfo.ContainsCalls) - HandleCallsInBlockInlinedThroughInvoke(BB, InvokeDest, - InvokeDestPHIValues); + if (HandleCallsInBlockInlinedThroughInvoke(BB, Invoke)) { + // Honor a request to skip the next block. We don't need to + // consider UnwindInsts in this case either. + ++BB; + continue; + } if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) { // An UnwindInst requires special handling when it gets inlined into an @@ -150,12 +558,7 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, // Update any PHI nodes in the exceptional block to indicate that // there is now a new entry in them. - unsigned i = 0; - for (BasicBlock::iterator I = InvokeDest->begin(); - isa<PHINode>(I); ++I, ++i) { - PHINode *PN = cast<PHINode>(I); - PN->addIncoming(InvokeDestPHIValues[i], BB); - } + Invoke.addIncomingPHIValuesFor(BB); } } @@ -299,28 +702,55 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall, ConstantInt::get(Type::getInt32Ty(Context), 1), ConstantInt::getFalse(Context) // isVolatile }; - CallInst *TheMemCpy = - CallInst::Create(MemCpyFn, CallArgs, CallArgs+5, "", TheCall); - - // If we have a call graph, update it. - if (CallGraph *CG = IFI.CG) { - CallGraphNode *MemCpyCGN = CG->getOrInsertFunction(MemCpyFn); - CallGraphNode *CallerNode = (*CG)[Caller]; - CallerNode->addCalledFunction(TheMemCpy, MemCpyCGN); - } + IRBuilder<>(TheCall).CreateCall(MemCpyFn, CallArgs, CallArgs+5); // Uses of the argument in the function should use our new alloca // instead. return NewAlloca; } +// isUsedByLifetimeMarker - Check whether this Value is used by a lifetime +// intrinsic. +static bool isUsedByLifetimeMarker(Value *V) { + for (Value::use_iterator UI = V->use_begin(), UE = V->use_end(); UI != UE; + ++UI) { + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(*UI)) { + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + return true; + } + } + } + return false; +} + +// hasLifetimeMarkers - Check whether the given alloca already has +// lifetime.start or lifetime.end intrinsics. +static bool hasLifetimeMarkers(AllocaInst *AI) { + const Type *Int8PtrTy = Type::getInt8PtrTy(AI->getType()->getContext()); + if (AI->getType() == Int8PtrTy) + return isUsedByLifetimeMarker(AI); + + // Do a scan to find all the casts to i8*. 
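// [Aside, not part of the patch: the lifetime intrinsics take an i8*
//  operand, so an alloca is usually marked through a cast, e.g.
//    %p = bitcast i32* %a to i8*
//    call void @llvm.lifetime.start(i64 4, i8* %p)
//  which is why the scan below chases pointer casts back to the alloca.]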
+ for (Value::use_iterator I = AI->use_begin(), E = AI->use_end(); I != E; + ++I) { + if (I->getType() != Int8PtrTy) continue; + if (I->stripPointerCasts() != AI) continue; + if (isUsedByLifetimeMarker(*I)) + return true; + } + return false; +} + // InlineFunction - This function inlines the called function into the basic // block of the caller. This returns false if it is not possible to inline this // call. The program is still in a well defined state if this occurs though. // // Note that this only does one level of inlining. For example, if the // instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now -// exists in the instruction stream. Similiarly this will inline a recursive +// exists in the instruction stream. Similarly this will inline a recursive // function by one level. // bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) { @@ -460,6 +890,26 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) { } } + // Leave lifetime markers for the static alloca's, scoping them to the + // function we just inlined. + if (!IFI.StaticAllocas.empty()) { + IRBuilder<> builder(FirstNewBlock->begin()); + for (unsigned ai = 0, ae = IFI.StaticAllocas.size(); ai != ae; ++ai) { + AllocaInst *AI = IFI.StaticAllocas[ai]; + + // If the alloca is already scoped to something smaller than the whole + // function then there's no need to add redundant, less accurate markers. + if (hasLifetimeMarkers(AI)) + continue; + + builder.CreateLifetimeStart(AI); + for (unsigned ri = 0, re = Returns.size(); ri != re; ++ri) { + IRBuilder<> builder(Returns[ri]); + builder.CreateLifetimeEnd(AI); + } + } + } + // If the inlined code contained dynamic alloca instructions, wrap the inlined // code with llvm.stacksave/llvm.stackrestore intrinsics. if (InlinedFunctionInfo.ContainsDynamicAllocas) { @@ -468,25 +918,14 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) { Function *StackSave = Intrinsic::getDeclaration(M, Intrinsic::stacksave); Function *StackRestore=Intrinsic::getDeclaration(M,Intrinsic::stackrestore); - // If we are preserving the callgraph, add edges to the stacksave/restore - // functions for the calls we insert. - CallGraphNode *StackSaveCGN = 0, *StackRestoreCGN = 0, *CallerNode = 0; - if (CallGraph *CG = IFI.CG) { - StackSaveCGN = CG->getOrInsertFunction(StackSave); - StackRestoreCGN = CG->getOrInsertFunction(StackRestore); - CallerNode = (*CG)[Caller]; - } - // Insert the llvm.stacksave. - CallInst *SavedPtr = CallInst::Create(StackSave, "savedstack", - FirstNewBlock->begin()); - if (IFI.CG) CallerNode->addCalledFunction(SavedPtr, StackSaveCGN); + CallInst *SavedPtr = IRBuilder<>(FirstNewBlock, FirstNewBlock->begin()) + .CreateCall(StackSave, "savedstack"); // Insert a call to llvm.stackrestore before any return instructions in the // inlined function. for (unsigned i = 0, e = Returns.size(); i != e; ++i) { - CallInst *CI = CallInst::Create(StackRestore, SavedPtr, "", Returns[i]); - if (IFI.CG) CallerNode->addCalledFunction(CI, StackRestoreCGN); + IRBuilder<>(Returns[i]).CreateCall(StackRestore, SavedPtr); } // Count the number of StackRestore calls we insert. 
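The lifetime-marker hunk above reduces to a small, self-contained pattern: bracket each static alloca that survives inlining with llvm.lifetime.start at the top of the first inlined block and llvm.lifetime.end in front of every cloned return. A minimal sketch of that pattern, using the same in-tree IRBuilder helpers the patch calls (CreateLifetimeStart/CreateLifetimeEnd) and era include paths; the helper name addInlinedLifetimeMarkers is illustrative, not something defined in the patch:

    #include "llvm/BasicBlock.h"
    #include "llvm/Instructions.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Support/IRBuilder.h"
    using namespace llvm;

    // Illustrative helper: scope each inlined static alloca to the inlined
    // region, mirroring what InlineFunction now does for IFI.StaticAllocas.
    static void addInlinedLifetimeMarkers(
        const SmallVectorImpl<AllocaInst*> &StaticAllocas,
        BasicBlock *FirstNewBlock,
        const SmallVectorImpl<ReturnInst*> &Returns) {
      IRBuilder<> Entry(FirstNewBlock->begin());
      for (unsigned i = 0, e = StaticAllocas.size(); i != e; ++i) {
        AllocaInst *AI = StaticAllocas[i];
        Entry.CreateLifetimeStart(AI);       // the alloca becomes live here...
        for (unsigned r = 0, re = Returns.size(); r != re; ++r)
          IRBuilder<>(Returns[r]).CreateLifetimeEnd(AI);  // ...and dead here
      }
    }

The hasLifetimeMarkers guard in the patch keeps this from stacking a second, coarser pair of markers onto an alloca the front-end already scoped.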
@@ -498,8 +937,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) {
for (Function::iterator BB = FirstNewBlock, E = Caller->end();
BB != E; ++BB)
if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) {
- CallInst *CI = CallInst::Create(StackRestore, SavedPtr, "", UI);
- if (IFI.CG) CallerNode->addCalledFunction(CI, StackRestoreCGN);
+ IRBuilder<>(UI).CreateCall(StackRestore, SavedPtr);
++NumStackRestores;
}
}
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index d7e2336..19c3c72 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -20,6 +20,8 @@
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/IntrinsicInst.h"
+#include "llvm/Metadata.h"
+#include "llvm/Operator.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/DebugInfo.h"
@@ -33,6 +35,7 @@
#include "llvm/Support/CFG.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"
@@ -42,12 +45,16 @@ using namespace llvm;
// Local constant propagation.
//
-// ConstantFoldTerminator - If a terminator instruction is predicated on a
-// constant value, convert it into an unconditional branch to the constant
-// destination.
-//
-bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
+/// ConstantFoldTerminator - If a terminator instruction is predicated on a
+/// constant value, convert it into an unconditional branch to the constant
+/// destination. This is a nontrivial operation because the successors of this
+/// basic block must have their PHI nodes updated.
+/// Also calls RecursivelyDeleteTriviallyDeadInstructions() on any branch/switch
+/// conditions and indirectbr addresses this might make dead if
+/// DeleteDeadConditions is true.
+bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) {
TerminatorInst *T = BB->getTerminator();
+ IRBuilder<> Builder(T);
// Branch - See if we are conditional jumping on constant
if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
@@ -67,11 +74,10 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
// Let the basic block know that we are letting go of it. Based on this,
// it will adjust its PHI nodes.
- assert(BI->getParent() && "Terminator not inserted in block!");
- OldDest->removePredecessor(BI->getParent());
+ OldDest->removePredecessor(BB);
// Replace the conditional branch with an unconditional one.
- BranchInst::Create(Destination, BI);
+ Builder.CreateBr(Destination);
BI->eraseFromParent();
return true;
}
@@ -86,8 +92,11 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
Dest1->removePredecessor(BI->getParent());
// Replace the conditional branch with an unconditional one.
- BranchInst::Create(Dest1, BI);
+ Builder.CreateBr(Dest1);
+ Value *Cond = BI->getCondition();
BI->eraseFromParent();
+ if (DeleteDeadConditions)
+ RecursivelyDeleteTriviallyDeadInstructions(Cond);
return true;
}
return false;
@@ -136,7 +145,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
// now.
if (TheOnlyDest) {
// Insert the new branch.
- BranchInst::Create(TheOnlyDest, SI);
+ Builder.CreateBr(TheOnlyDest);
BasicBlock *BB = SI->getParent();
// Remove entries from PHI nodes which we no longer branch to...
@@ -150,17 +159,21 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
}
// Delete the old switch.
- BB->getInstList().erase(SI); + Value *Cond = SI->getCondition(); + SI->eraseFromParent(); + if (DeleteDeadConditions) + RecursivelyDeleteTriviallyDeadInstructions(Cond); return true; } if (SI->getNumSuccessors() == 2) { // Otherwise, we can fold this switch into a conditional branch // instruction if it has only one non-default destination. - Value *Cond = new ICmpInst(SI, ICmpInst::ICMP_EQ, SI->getCondition(), - SI->getSuccessorValue(1), "cond"); + Value *Cond = Builder.CreateICmpEQ(SI->getCondition(), + SI->getSuccessorValue(1), "cond"); + // Insert the new branch. - BranchInst::Create(SI->getSuccessor(1), SI->getSuccessor(0), Cond, SI); + Builder.CreateCondBr(Cond, SI->getSuccessor(1), SI->getSuccessor(0)); // Delete the old switch. SI->eraseFromParent(); @@ -175,7 +188,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB) { dyn_cast<BlockAddress>(IBI->getAddress()->stripPointerCasts())) { BasicBlock *TheOnlyDest = BA->getBasicBlock(); // Insert the new branch. - BranchInst::Create(TheOnlyDest, IBI); + Builder.CreateBr(TheOnlyDest); for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) { if (IBI->getDestination(i) == TheOnlyDest) @@ -183,7 +196,10 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB) { else IBI->getDestination(i)->removePredecessor(IBI->getParent()); } + Value *Address = IBI->getAddress(); IBI->eraseFromParent(); + if (DeleteDeadConditions) + RecursivelyDeleteTriviallyDeadInstructions(Address); // If we didn't find our destination in the IBI successor list, then we // have undefined behavior. Replace the unconditional branch with an @@ -690,39 +706,15 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) { /// static unsigned enforceKnownAlignment(Value *V, unsigned Align, unsigned PrefAlign) { + V = V->stripPointerCasts(); - User *U = dyn_cast<User>(V); - if (!U) return Align; - - switch (Operator::getOpcode(U)) { - default: break; - case Instruction::BitCast: - return enforceKnownAlignment(U->getOperand(0), Align, PrefAlign); - case Instruction::GetElementPtr: { - // If all indexes are zero, it is just the alignment of the base pointer. - bool AllZeroOperands = true; - for (User::op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e; ++i) - if (!isa<Constant>(*i) || - !cast<Constant>(*i)->isNullValue()) { - AllZeroOperands = false; - break; - } - - if (AllZeroOperands) { - // Treat this like a bitcast. - return enforceKnownAlignment(U->getOperand(0), Align, PrefAlign); - } - return Align; - } - case Instruction::Alloca: { - AllocaInst *AI = cast<AllocaInst>(V); + if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) { // If there is a requested alignment and if this is an alloca, round up. if (AI->getAlignment() >= PrefAlign) return AI->getAlignment(); AI->setAlignment(PrefAlign); return PrefAlign; } - } if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) { // If there is a large requested alignment and we can, bump up the alignment @@ -785,10 +777,19 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, if (!DIVar.Verify()) return false; - Instruction *DbgVal = - Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, - DIVar, SI); - + Instruction *DbgVal = NULL; + // If an argument is zero extended then use argument directly. The ZExt + // may be zapped by an optimization pass in future. 
+ Argument *ExtendedArg = NULL; + if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0))) + ExtendedArg = dyn_cast<Argument>(ZExt->getOperand(0)); + if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) + ExtendedArg = dyn_cast<Argument>(SExt->getOperand(0)); + if (ExtendedArg) + DbgVal = Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, SI); + else + DbgVal = Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, SI); + // Propagate any debug metadata from the store onto the dbg.value. DebugLoc SIDL = SI->getDebugLoc(); if (!SIDL.isUnknown()) @@ -838,14 +839,30 @@ bool llvm::LowerDbgDeclare(Function &F) { E = Dbgs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress())) { + bool RemoveDDI = true; for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E; ++UI) if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) ConvertDebugDeclareToDebugValue(DDI, SI, DIB); else if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) ConvertDebugDeclareToDebugValue(DDI, LI, DIB); + else + RemoveDDI = false; + if (RemoveDDI) + DDI->eraseFromParent(); } - DDI->eraseFromParent(); } return true; } + +/// FindAllocaDbgDeclare - Finds the llvm.dbg.declare intrinsic describing the +/// alloca 'V', if any. +DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) { + if (MDNode *DebugNode = MDNode::getIfExists(V->getContext(), V)) + for (Value::use_iterator UI = DebugNode->use_begin(), + E = DebugNode->use_end(); UI != E; ++UI) + if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI)) + return DDI; + + return 0; +} diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp index 9fe5929..f02ffd2 100644 --- a/lib/Transforms/Utils/LoopSimplify.cpp +++ b/lib/Transforms/Utils/LoopSimplify.cpp @@ -115,7 +115,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", "Canonicalize natural loops", true, false) -// Publically exposed interface to pass... +// Publicly exposed interface to pass... char &llvm::LoopSimplifyID = LoopSimplify::ID; Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); } diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp index 914a439..ed733d3 100644 --- a/lib/Transforms/Utils/LowerSwitch.cpp +++ b/lib/Transforms/Utils/LowerSwitch.cpp @@ -84,7 +84,7 @@ char LowerSwitch::ID = 0; INITIALIZE_PASS(LowerSwitch, "lowerswitch", "Lower SwitchInst's to branches", false, false) -// Publically exposed interface to pass... +// Publicly exposed interface to pass... char &llvm::LowerSwitchID = LowerSwitch::ID; // createLowerSwitchPass - Interface to this file... FunctionPass *llvm::createLowerSwitchPass() { diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index c96bbad..a1736b9 100644 --- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -46,7 +46,6 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/Support/CFG.h" #include <algorithm> -#include <map> #include <queue> using namespace llvm; @@ -101,18 +100,6 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) { return true; } -/// FindAllocaDbgDeclare - Finds the llvm.dbg.declare intrinsic describing the -/// alloca 'V', if any. 
-static DbgDeclareInst *FindAllocaDbgDeclare(Value *V) { - if (MDNode *DebugNode = MDNode::getIfExists(V->getContext(), &V, 1)) - for (Value::use_iterator UI = DebugNode->use_begin(), - E = DebugNode->use_end(); UI != E; ++UI) - if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI)) - return DDI; - - return 0; -} - namespace { struct AllocaInfo; diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp index 4f83b7e..b336194 100644 --- a/lib/Transforms/Utils/SSAUpdater.cpp +++ b/lib/Transforms/Utils/SSAUpdater.cpp @@ -12,16 +12,22 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "ssaupdater" +#include "llvm/Constants.h" #include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/Analysis/DIBuilder.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Support/AlignOf.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Transforms/Utils/SSAUpdaterImpl.h" + using namespace llvm; typedef DenseMap<BasicBlock*, Value*> AvailableValsTy; @@ -184,6 +190,9 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { return V; } + // Set DebugLoc. + InsertedPHI->setDebugLoc(GetFirstDebugLocInBasicBlock(BB)); + // If the client wants to know about all new instructions, tell it. if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI); @@ -349,7 +358,8 @@ Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) { LoadAndStorePromoter:: LoadAndStorePromoter(const SmallVectorImpl<Instruction*> &Insts, - SSAUpdater &S, StringRef BaseName) : SSA(S) { + SSAUpdater &S, DbgDeclareInst *DD, DIBuilder *DB, + StringRef BaseName) : SSA(S), DDI(DD), DIB(DB) { if (Insts.empty()) return; Value *SomeVal; @@ -396,9 +406,11 @@ run(const SmallVectorImpl<Instruction*> &Insts) const { // single user in it, we can rewrite it trivially. if (BlockUses.size() == 1) { // If it is a store, it is a trivial def of the value in the block. - if (StoreInst *SI = dyn_cast<StoreInst>(User)) + if (StoreInst *SI = dyn_cast<StoreInst>(User)) { + if (DDI) + ConvertDebugDeclareToDebugValue(DDI, SI, *DIB); SSA.AddAvailableValue(BB, SI->getOperand(0)); - else + } else // Otherwise it is a load, queue it to rewrite as a live-in load. LiveInLoads.push_back(cast<LoadInst>(User)); BlockUses.clear(); @@ -447,12 +459,15 @@ run(const SmallVectorImpl<Instruction*> &Insts) const { continue; } - if (StoreInst *S = dyn_cast<StoreInst>(II)) { + if (StoreInst *SI = dyn_cast<StoreInst>(II)) { // If this is a store to an unrelated pointer, ignore it. - if (!isInstInList(S, Insts)) continue; - + if (!isInstInList(SI, Insts)) continue; + + if (DDI) + ConvertDebugDeclareToDebugValue(DDI, SI, *DIB); + // Remember that this is the active value in the block. 
- StoredValue = S->getOperand(0); + StoredValue = SI->getOperand(0); } } @@ -507,4 +522,7 @@ run(const SmallVectorImpl<Instruction*> &Insts) const { instructionDeleted(User); User->eraseFromParent(); } + + if (DDI) + DDI->eraseFromParent(); } diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index cfc897c..6df846c 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -20,6 +20,7 @@ #include "llvm/DerivedTypes.h" #include "llvm/GlobalVariable.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Target/TargetData.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/ADT/DenseMap.h" @@ -31,12 +32,18 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/ConstantRange.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/IRBuilder.h" +#include "llvm/Support/NoFolder.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> #include <set> #include <map> using namespace llvm; +static cl::opt<unsigned> +PHINodeFoldingThreshold("phi-node-folding-threshold", cl::Hidden, cl::init(1), + cl::desc("Control the amount of phi node folding to perform (default = 1)")); + static cl::opt<bool> DupRet("simplifycfg-dup-ret", cl::Hidden, cl::init(false), cl::desc("Duplicate return instructions into unconditional branches")); @@ -51,16 +58,18 @@ class SimplifyCFGOpt { BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI, std::vector<std::pair<ConstantInt*, BasicBlock*> > &Cases); bool SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, - BasicBlock *Pred); - bool FoldValueComparisonIntoPredecessors(TerminatorInst *TI); + BasicBlock *Pred, + IRBuilder<> &Builder); + bool FoldValueComparisonIntoPredecessors(TerminatorInst *TI, + IRBuilder<> &Builder); - bool SimplifyReturn(ReturnInst *RI); - bool SimplifyUnwind(UnwindInst *UI); + bool SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder); + bool SimplifyUnwind(UnwindInst *UI, IRBuilder<> &Builder); bool SimplifyUnreachable(UnreachableInst *UI); - bool SimplifySwitch(SwitchInst *SI); + bool SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder); bool SimplifyIndirectBr(IndirectBrInst *IBI); - bool SimplifyUncondBranch(BranchInst *BI); - bool SimplifyCondBranch(BranchInst *BI); + bool SimplifyUncondBranch(BranchInst *BI, IRBuilder <> &Builder); + bool SimplifyCondBranch(BranchInst *BI, IRBuilder <>&Builder); public: explicit SimplifyCFGOpt(const TargetData *td) : TD(td) {} @@ -201,11 +210,20 @@ static Value *GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue, /// which works well enough for us. /// /// If AggressiveInsts is non-null, and if V does not dominate BB, we check to -/// see if V (which must be an instruction) is cheap to compute and is -/// non-trapping. If both are true, the instruction is inserted into the set -/// and true is returned. +/// see if V (which must be an instruction) and its recursive operands +/// that do not dominate BB have a combined cost lower than CostRemaining and +/// are non-trapping. If both are true, the instruction is inserted into the +/// set and true is returned. +/// +/// The cost for most non-trapping instructions is defined as 1 except for +/// Select whose cost is 2. +/// +/// After this function returns, CostRemaining is decreased by the cost of +/// V plus its non-dominating operands. If that cost is greater than +/// CostRemaining, false is returned and CostRemaining is undefined. 
static bool DominatesMergePoint(Value *V, BasicBlock *BB, - SmallPtrSet<Instruction*, 4> *AggressiveInsts) { + SmallPtrSet<Instruction*, 4> *AggressiveInsts, + unsigned &CostRemaining) { Instruction *I = dyn_cast<Instruction>(V); if (!I) { // Non-instructions all dominate instructions, but not all constantexprs @@ -232,12 +250,17 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, // instructions in the 'if region'. if (AggressiveInsts == 0) return false; + // If we have seen this instruction before, don't count it again. + if (AggressiveInsts->count(I)) return true; + // Okay, it looks like the instruction IS in the "condition". Check to // see if it's a cheap instruction to unconditionally compute, and if it // only uses stuff defined outside of the condition. If so, hoist it out. if (!I->isSafeToSpeculativelyExecute()) return false; + unsigned Cost = 0; + switch (I->getOpcode()) { default: return false; // Cannot hoist this out safely. case Instruction::Load: @@ -246,11 +269,13 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, // predecessor. if (PBB->getFirstNonPHIOrDbg() != I) return false; + Cost = 1; break; case Instruction::GetElementPtr: // GEPs are cheap if all indices are constant. if (!cast<GetElementPtrInst>(I)->hasAllConstantIndices()) return false; + Cost = 1; break; case Instruction::Add: case Instruction::Sub: @@ -261,13 +286,26 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, case Instruction::LShr: case Instruction::AShr: case Instruction::ICmp: + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + Cost = 1; break; // These are all cheap and non-trapping instructions. + + case Instruction::Select: + Cost = 2; + break; } - // Okay, we can only really hoist these out if their operands are not - // defined in the conditional region. + if (Cost > CostRemaining) + return false; + + CostRemaining -= Cost; + + // Okay, we can only really hoist these out if their operands do + // not take us over the cost threshold. for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) - if (!DominatesMergePoint(*i, BB, 0)) + if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining)) return false; // Okay, it's safe to do this! Remember this instruction. AggressiveInsts->insert(I); @@ -508,7 +546,8 @@ ValuesOverlap(std::vector<std::pair<ConstantInt*, BasicBlock*> > &C1, /// form of jump threading. bool SimplifyCFGOpt:: SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, - BasicBlock *Pred) { + BasicBlock *Pred, + IRBuilder<> &Builder) { Value *PredVal = isValueEqualityComparison(Pred->getTerminator()); if (!PredVal) return false; // Not a value comparison in predecessor. @@ -541,7 +580,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, // uncond br. assert(ThisCases.size() == 1 && "Branch can only have one case!"); // Insert the new branch. - Instruction *NI = BranchInst::Create(ThisDef, TI); + Instruction *NI = Builder.CreateBr(ThisDef); (void) NI; // Remove PHI node entries for the dead edge. @@ -606,7 +645,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, CheckEdge = 0; // Insert the new branch. - Instruction *NI = BranchInst::Create(TheRealDest, TI); + Instruction *NI = Builder.CreateBr(TheRealDest); (void) NI; DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator() @@ -641,7 +680,8 @@ static int ConstantIntSortPredicate(const void *P1, const void *P2) { /// equality comparison instruction (either a switch or a branch on "X == c"). 
/// See if any of the predecessors of the terminator block are value comparisons /// on the same value. If so, and if safe to do so, fold them together. -bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI) { +bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, + IRBuilder<> &Builder) { BasicBlock *BB = TI->getParent(); Value *CV = isValueEqualityComparison(TI); // CondVal assert(CV && "Not a comparison?"); @@ -734,16 +774,18 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI) { for (unsigned i = 0, e = NewSuccessors.size(); i != e; ++i) AddPredecessorToBlock(NewSuccessors[i], Pred, BB); + Builder.SetInsertPoint(PTI); // Convert pointer to int before we switch. if (CV->getType()->isPointerTy()) { assert(TD && "Cannot switch on pointer without TargetData"); - CV = new PtrToIntInst(CV, TD->getIntPtrType(CV->getContext()), - "magicptr", PTI); + CV = Builder.CreatePtrToInt(CV, TD->getIntPtrType(CV->getContext()), + "magicptr"); } // Now that the successors are updated, create the new Switch instruction. - SwitchInst *NewSI = SwitchInst::Create(CV, PredDefault, - PredCases.size(), PTI); + SwitchInst *NewSI = Builder.CreateSwitch(CV, PredDefault, + PredCases.size()); + NewSI->setDebugLoc(PTI->getDebugLoc()); for (unsigned i = 0, e = PredCases.size(); i != e; ++i) NewSI->addCase(PredCases[i].first, PredCases[i].second); @@ -867,6 +909,7 @@ HoistTerminator: NT->takeName(I1); } + IRBuilder<true, NoFolder> Builder(NT); // Hoisting one of the terminators from our successor is a great thing. // Unfortunately, the successors of the if/else blocks may have PHI nodes in // them. If they do, all PHI entries for BB1/BB2 must agree for all PHI @@ -883,9 +926,11 @@ HoistTerminator: // These values do not agree. Insert a select instruction before NT // that determines the right value. SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)]; - if (SI == 0) - SI = SelectInst::Create(BI->getCondition(), BB1V, BB2V, - BB1V->getName()+"."+BB2V->getName(), NT); + if (SI == 0) + SI = cast<SelectInst> + (Builder.CreateSelect(BI->getCondition(), BB1V, BB2V, + BB1V->getName()+"."+BB2V->getName())); + // Make the PHI node use the select for all incoming values for BB1/BB2 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) if (PN->getIncomingBlock(i) == BB1 || PN->getIncomingBlock(i) == BB2) @@ -1043,13 +1088,16 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *BB1) { // Create a select whose true value is the speculatively executed value and // false value is the previously determined FalseV. + IRBuilder<true, NoFolder> Builder(BI); SelectInst *SI; if (Invert) - SI = SelectInst::Create(BrCond, FalseV, HInst, - FalseV->getName() + "." + HInst->getName(), BI); + SI = cast<SelectInst> + (Builder.CreateSelect(BrCond, FalseV, HInst, + FalseV->getName() + "." + HInst->getName())); else - SI = SelectInst::Create(BrCond, HInst, FalseV, - HInst->getName() + "." + FalseV->getName(), BI); + SI = cast<SelectInst> + (Builder.CreateSelect(BrCond, HInst, FalseV, + HInst->getName() + "." + FalseV->getName())); // Make the PHI node use the select for all incoming values for "then" and // "if" blocks. @@ -1123,6 +1171,8 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const TargetData *TD) { BasicBlock *RealDest = BI->getSuccessor(!CB->getZExtValue()); if (RealDest == BB) continue; // Skip self loops. + // Skip if the predecessor's terminator is an indirect branch. 
+ if (isa<IndirectBrInst>(PredBB->getTerminator())) continue; // The dest block might have PHI nodes, other predecessors and other // difficult cases. Instead of being smart about this, just insert a new @@ -1178,7 +1228,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const TargetData *TD) { BB->removePredecessor(PredBB); PredBBTI->setSuccessor(i, EdgeBB); } - + // Recurse, simplifying any other constants. return FoldCondBranchOnPHI(BI, TD) | true; } @@ -1217,6 +1267,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) { // instructions. While we are at it, keep track of the instructions // that need to be moved to the dominating block. SmallPtrSet<Instruction*, 4> AggressiveInsts; + unsigned MaxCostVal0 = PHINodeFoldingThreshold, + MaxCostVal1 = PHINodeFoldingThreshold; for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) { PHINode *PN = cast<PHINode>(II++); @@ -1226,8 +1278,10 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) { continue; } - if (!DominatesMergePoint(PN->getIncomingValue(0), BB, &AggressiveInsts) || - !DominatesMergePoint(PN->getIncomingValue(1), BB, &AggressiveInsts)) + if (!DominatesMergePoint(PN->getIncomingValue(0), BB, &AggressiveInsts, + MaxCostVal0) || + !DominatesMergePoint(PN->getIncomingValue(1), BB, &AggressiveInsts, + MaxCostVal1)) return false; } @@ -1283,6 +1337,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) { // If we can still promote the PHI nodes after this gauntlet of tests, // do all of the PHI's now. Instruction *InsertPt = DomBlock->getTerminator(); + IRBuilder<true, NoFolder> Builder(InsertPt); // Move all 'aggressive' instructions, which are defined in the // conditional parts of the if's up to the dominating block. @@ -1300,7 +1355,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) { Value *TrueVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse); Value *FalseVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfTrue); - Value *NV = SelectInst::Create(IfCond, TrueVal, FalseVal, "", InsertPt); + SelectInst *NV = + cast<SelectInst>(Builder.CreateSelect(IfCond, TrueVal, FalseVal, "")); PN->replaceAllUsesWith(NV); NV->takeName(PN); PN->eraseFromParent(); @@ -1310,7 +1366,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) { // has been flattened. Change DomBlock to jump directly to our new block to // avoid other simplifycfg's kicking in on the diamond. TerminatorInst *OldTI = DomBlock->getTerminator(); - BranchInst::Create(BB, OldTI); + Builder.SetInsertPoint(OldTI); + Builder.CreateBr(BB); OldTI->eraseFromParent(); return true; } @@ -1318,7 +1375,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) { /// SimplifyCondBranchToTwoReturns - If we found a conditional branch that goes /// to two returning blocks, try to merge them together into one return, /// introducing a select if the return values disagree. -static bool SimplifyCondBranchToTwoReturns(BranchInst *BI) { +static bool SimplifyCondBranchToTwoReturns(BranchInst *BI, + IRBuilder<> &Builder) { assert(BI->isConditional() && "Must be a conditional branch"); BasicBlock *TrueSucc = BI->getSuccessor(0); BasicBlock *FalseSucc = BI->getSuccessor(1); @@ -1333,13 +1391,14 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI) { if (!FalseSucc->getFirstNonPHIOrDbg()->isTerminator()) return false; + Builder.SetInsertPoint(BI); // Okay, we found a branch that is going to two return nodes. 
If
// there is no return value for this function, just change the
// branch into a return.
if (FalseRet->getNumOperands() == 0) {
TrueSucc->removePredecessor(BI->getParent());
FalseSucc->removePredecessor(BI->getParent());
- ReturnInst::Create(BI->getContext(), 0, BI);
+ Builder.CreateRetVoid();
EraseTerminatorInstAndDCECond(BI);
return true;
}
@@ -1382,14 +1441,14 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI) {
} else if (isa<UndefValue>(TrueValue)) {
TrueValue = FalseValue;
} else {
- TrueValue = SelectInst::Create(BrCond, TrueValue,
- FalseValue, "retval", BI);
+ TrueValue = Builder.CreateSelect(BrCond, TrueValue,
+ FalseValue, "retval");
}
}
- Value *RI = !TrueValue ?
- ReturnInst::Create(BI->getContext(), BI) :
- ReturnInst::Create(BI->getContext(), TrueValue, BI);
+ Value *RI = !TrueValue ?
+ Builder.CreateRetVoid() : Builder.CreateRet(TrueValue);
+
(void) RI;
DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:"
@@ -1401,27 +1460,24 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI) {
return true;
}
-/// FoldBranchToCommonDest - If this basic block is ONLY a setcc and a branch,
-/// and if a predecessor branches to us and one of our successors, fold the
-/// setcc into the predecessor and use logical operations to pick the right
-/// destination.
+/// FoldBranchToCommonDest - If this basic block is simple enough, and if a
+/// predecessor branches to us and one of our successors, fold the block into
+/// the predecessor and use logical operations to pick the right destination.
bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
BasicBlock *BB = BI->getParent();
+
Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
if (Cond == 0 || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) ||
Cond->getParent() != BB || !Cond->hasOneUse())
return false;
- SmallVector<DbgInfoIntrinsic *, 8> DbgValues;
// Only allow this if the condition is a simple instruction that can be
// executed unconditionally. It must be in the same block as the branch, and
// must be at the front of the block.
BasicBlock::iterator FrontIt = BB->front();
+
// Ignore dbg intrinsics.
- while (DbgInfoIntrinsic *DBI = dyn_cast<DbgInfoIntrinsic>(FrontIt)) {
- DbgValues.push_back(DBI);
- ++FrontIt;
- }
+ while (isa<DbgInfoIntrinsic>(FrontIt)) ++FrontIt;
// Allow a single instruction to be hoisted in addition to the compare
// that feeds the branch. We later ensure that any values that _it_ uses
@@ -1433,12 +1489,9 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
FrontIt->isSafeToSpeculativelyExecute()) {
BonusInst = &*FrontIt;
++FrontIt;
- }
-
- // Ignore dbg intrinsics.
- while (DbgInfoIntrinsic *DBI = dyn_cast<DbgInfoIntrinsic>(FrontIt)) {
- DbgValues.push_back(DBI);
- ++FrontIt;
+
+ // Ignore dbg intrinsics.
+ while (isa<DbgInfoIntrinsic>(FrontIt)) ++FrontIt;
}
// Only a single bonus inst is allowed.
@@ -1447,15 +1500,12 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
// Make sure the instruction after the condition is the cond branch.
BasicBlock::iterator CondIt = Cond; ++CondIt;
+
// Ignore dbg intrinsics.
- while(DbgInfoIntrinsic *DBI = dyn_cast<DbgInfoIntrinsic>(CondIt)) {
- DbgValues.push_back(DBI);
- ++CondIt;
- }
- if (&*CondIt != BI) {
- assert (!isa<DbgInfoIntrinsic>(CondIt) && "Hey do not forget debug info!");
+ while (isa<DbgInfoIntrinsic>(CondIt)) ++CondIt;
+
+ if (&*CondIt != BI)
return false;
- }
// Cond is known to be a compare or binary operator. Check to make sure that
// neither operand is a potentially-trapping constant expression.
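The net effect of SimplifyCondBranchToTwoReturns is easy to state in isolation: a conditional branch whose two targets both immediately return gets collapsed into a select feeding a single return. A stripped-down sketch of just that rewrite, assuming the easy case (one return operand on each side, no PHI translation, no undef special-casing, all of which the real code above handles); mergeTwoReturns is an illustrative name:

    #include "llvm/Instructions.h"
    #include "llvm/Support/IRBuilder.h"
    using namespace llvm;

    // Before: br i1 %c, label %T, label %F   ; T: ret i32 %a   F: ret i32 %b
    // After:  %retval = select i1 %c, i32 %a, i32 %b ; ret i32 %retval
    static void mergeTwoReturns(BranchInst *BI, Value *TrueVal, Value *FalseVal) {
      IRBuilder<> Builder(BI);
      Value *RV = Builder.CreateSelect(BI->getCondition(), TrueVal, FalseVal,
                                       "retval");
      Builder.CreateRet(RV);
      // The old successors lose this predecessor before the branch goes away.
      BI->getSuccessor(0)->removePredecessor(BI->getParent());
      BI->getSuccessor(1)->removePredecessor(BI->getParent());
      BI->eraseFromParent();
    }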
@@ -1466,7 +1516,6 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { if (CE->canTrap()) return false; - // Finally, don't infinitely unroll conditional loops. BasicBlock *TrueDest = BI->getSuccessor(0); BasicBlock *FalseDest = BI->getSuccessor(1); @@ -1480,10 +1529,24 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // Check that we have two conditional branches. If there is a PHI node in // the common successor, verify that the same value flows in from both // blocks. - if (PBI == 0 || PBI->isUnconditional() || - !SafeToMergeTerminators(BI, PBI)) + if (PBI == 0 || PBI->isUnconditional() || !SafeToMergeTerminators(BI, PBI)) continue; + // Determine if the two branches share a common destination. + Instruction::BinaryOps Opc; + bool InvertPredCond = false; + + if (PBI->getSuccessor(0) == TrueDest) + Opc = Instruction::Or; + else if (PBI->getSuccessor(1) == FalseDest) + Opc = Instruction::And; + else if (PBI->getSuccessor(0) == FalseDest) + Opc = Instruction::And, InvertPredCond = true; + else if (PBI->getSuccessor(1) == TrueDest) + Opc = Instruction::Or, InvertPredCond = true; + else + continue; + // Ensure that any values used in the bonus instruction are also used // by the terminator of the predecessor. This means that those values // must already have been resolved, so we won't be inhibiting the @@ -1521,23 +1584,10 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { if (!UsedValues.empty()) return false; } - - Instruction::BinaryOps Opc; - bool InvertPredCond = false; - - if (PBI->getSuccessor(0) == TrueDest) - Opc = Instruction::Or; - else if (PBI->getSuccessor(1) == FalseDest) - Opc = Instruction::And; - else if (PBI->getSuccessor(0) == FalseDest) - Opc = Instruction::And, InvertPredCond = true; - else if (PBI->getSuccessor(1) == TrueDest) - Opc = Instruction::Or, InvertPredCond = true; - else - continue; DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB); - + IRBuilder<> Builder(PBI); + // If we need to invert the condition in the pred block to match, do so now. if (InvertPredCond) { Value *NewCond = PBI->getCondition(); @@ -1546,8 +1596,8 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { CmpInst *CI = cast<CmpInst>(NewCond); CI->setPredicate(CI->getInversePredicate()); } else { - NewCond = BinaryOperator::CreateNot(NewCond, - PBI->getCondition()->getName()+".not", PBI); + NewCond = Builder.CreateNot(NewCond, + PBI->getCondition()->getName()+".not"); } PBI->setCondition(NewCond); @@ -1574,8 +1624,9 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { New->takeName(Cond); Cond->setName(New->getName()+".old"); - Value *NewCond = BinaryOperator::Create(Opc, PBI->getCondition(), - New, "or.cond", PBI); + Instruction *NewCond = + cast<Instruction>(Builder.CreateBinOp(Opc, PBI->getCondition(), + New, "or.cond")); PBI->setCondition(NewCond); if (PBI->getSuccessor(0) == BB) { AddPredecessorToBlock(TrueDest, PredBlock, BB); @@ -1586,13 +1637,11 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { PBI->setSuccessor(1, FalseDest); } - // Move dbg value intrinsics in PredBlock. - for (SmallVector<DbgInfoIntrinsic *, 8>::iterator DBI = DbgValues.begin(), - DBE = DbgValues.end(); DBI != DBE; ++DBI) { - DbgInfoIntrinsic *DB = *DBI; - DB->removeFromParent(); - DB->insertBefore(PBI); - } + // Copy any debug value intrinsics into the end of PredBlock. 
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) + if (isa<DbgInfoIntrinsic>(*I)) + I->clone()->insertBefore(PBI); + return true; } return false; @@ -1720,23 +1769,22 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { } DEBUG(dbgs() << *PBI->getParent()->getParent()); - + // BI may have other predecessors. Because of this, we leave // it alone, but modify PBI. // Make sure we get to CommonDest on True&True directions. Value *PBICond = PBI->getCondition(); + IRBuilder<true, NoFolder> Builder(PBI); if (PBIOp) - PBICond = BinaryOperator::CreateNot(PBICond, - PBICond->getName()+".not", - PBI); + PBICond = Builder.CreateNot(PBICond, PBICond->getName()+".not"); + Value *BICond = BI->getCondition(); if (BIOp) - BICond = BinaryOperator::CreateNot(BICond, - BICond->getName()+".not", - PBI); + BICond = Builder.CreateNot(BICond, BICond->getName()+".not"); + // Merge the conditions. - Value *Cond = BinaryOperator::CreateOr(PBICond, BICond, "brmerge", PBI); + Value *Cond = Builder.CreateOr(PBICond, BICond, "brmerge"); // Modify PBI to branch on the new condition to the new dests. PBI->setCondition(Cond); @@ -1759,8 +1807,8 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { Value *PBIV = PN->getIncomingValue(PBBIdx); if (BIV != PBIV) { // Insert a select in PBI to pick the right value. - Value *NV = SelectInst::Create(PBICond, PBIV, BIV, - PBIV->getName()+".mux", PBI); + Value *NV = cast<SelectInst> + (Builder.CreateSelect(PBICond, PBIV, BIV, PBIV->getName()+".mux")); PN->setIncomingValue(PBBIdx, NV); } } @@ -1799,16 +1847,19 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond, Succ->removePredecessor(OldTerm->getParent()); } + IRBuilder<> Builder(OldTerm); + Builder.SetCurrentDebugLocation(OldTerm->getDebugLoc()); + // Insert an appropriate new terminator. if ((KeepEdge1 == 0) && (KeepEdge2 == 0)) { if (TrueBB == FalseBB) // We were only looking for one successor, and it was present. // Create an unconditional branch to it. - BranchInst::Create(TrueBB, OldTerm); + Builder.CreateBr(TrueBB); else // We found both of the successors we were looking for. // Create a conditional branch sharing the condition of the select. - BranchInst::Create(TrueBB, FalseBB, Cond, OldTerm); + Builder.CreateCondBr(Cond, TrueBB, FalseBB); } else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) { // Neither of the selected blocks were successors, so this // terminator must be unreachable. @@ -1819,10 +1870,10 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond, // the edge to the one that wasn't must be unreachable. if (KeepEdge1 == 0) // Only TrueBB was found. - BranchInst::Create(TrueBB, OldTerm); + Builder.CreateBr(TrueBB); else // Only FalseBB was found. - BranchInst::Create(FalseBB, OldTerm); + Builder.CreateBr(FalseBB); } EraseTerminatorInstAndDCECond(OldTerm); @@ -1887,8 +1938,10 @@ static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) { /// We prefer to split the edge to 'end' so that there is a true/false entry to /// the PHI, merging the third icmp into the switch. static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, - const TargetData *TD) { + const TargetData *TD, + IRBuilder<> &Builder) { BasicBlock *BB = ICI->getParent(); + // If the block has any PHIs in it or the icmp has multiple uses, it is too // complex. 
if (isa<PHINode>(BB->begin()) || !ICI->hasOneUse()) return false; @@ -1966,7 +2019,9 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, SI->addCase(Cst, NewBB); // NewBB branches to the phi block, add the uncond branch and the phi entry. - BranchInst::Create(SuccBlock, NewBB); + Builder.SetInsertPoint(NewBB); + Builder.SetCurrentDebugLocation(SI->getDebugLoc()); + Builder.CreateBr(SuccBlock); PHIUse->addIncoming(NewCst, NewBB); return true; } @@ -1974,7 +2029,8 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, /// SimplifyBranchOnICmpChain - The specified branch is a conditional branch. /// Check to see if it is branching on an or/and chain of icmp instructions, and /// fold it into a switch instruction if so. -static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD) { +static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD, + IRBuilder<> &Builder) { Instruction *Cond = dyn_cast<Instruction>(BI->getCondition()); if (Cond == 0) return false; @@ -2030,11 +2086,12 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD) { BasicBlock *NewBB = BB->splitBasicBlock(BI, "switch.early.test"); // Remove the uncond branch added to the old block. TerminatorInst *OldTI = BB->getTerminator(); - + Builder.SetInsertPoint(OldTI); + if (TrueWhenEqual) - BranchInst::Create(EdgeBB, NewBB, ExtraCase, OldTI); + Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB); else - BranchInst::Create(NewBB, EdgeBB, ExtraCase, OldTI); + Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB); OldTI->eraseFromParent(); @@ -2046,18 +2103,19 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD) { << "\nEXTRABB = " << *BB); BB = NewBB; } - + + Builder.SetInsertPoint(BI); // Convert pointer to int before we switch. if (CompVal->getType()->isPointerTy()) { assert(TD && "Cannot switch on pointer without TargetData"); - CompVal = new PtrToIntInst(CompVal, - TD->getIntPtrType(CompVal->getContext()), - "magicptr", BI); + CompVal = Builder.CreatePtrToInt(CompVal, + TD->getIntPtrType(CompVal->getContext()), + "magicptr"); } // Create the new switch instruction now. - SwitchInst *New = SwitchInst::Create(CompVal, DefaultBB, Values.size(), BI); - + SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size()); + // Add all of the 'cases' to the switch instruction. for (unsigned i = 0, e = Values.size(); i != e; ++i) New->addCase(Values[i], EdgeBB); @@ -2080,7 +2138,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD) { return true; } -bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI) { +bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) { BasicBlock *BB = RI->getParent(); if (!BB->getFirstNonPHIOrDbg()->isTerminator()) return false; @@ -2124,13 +2182,13 @@ bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI) { // Check to see if the non-BB successor is also a return block. if (isa<ReturnInst>(BI->getSuccessor(0)->getTerminator()) && isa<ReturnInst>(BI->getSuccessor(1)->getTerminator()) && - SimplifyCondBranchToTwoReturns(BI)) + SimplifyCondBranchToTwoReturns(BI, Builder)) return true; } return false; } -bool SimplifyCFGOpt::SimplifyUnwind(UnwindInst *UI) { +bool SimplifyCFGOpt::SimplifyUnwind(UnwindInst *UI, IRBuilder<> &Builder) { // Check to see if the first instruction in this block is just an unwind. // If so, replace any invoke instructions which use this as an exception // destination with call instructions. 
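SimplifyBranchOnICmpChain's end state is worth seeing concretely: once the equality comparisons and their constants have been collected, the branch on the or/and chain becomes a single switch with one case per constant. A compressed sketch of the emission step, assuming Values has already been gathered and deduplicated as in the code above; buildSwitchFromICmpChain is an illustrative name, and the extra-case split and PHI plumbing the real code performs are omitted:

    #include "llvm/Constants.h"
    #include "llvm/Instructions.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Support/IRBuilder.h"
    using namespace llvm;

    // br i1 (or (icmp eq %X, 0), (icmp eq %X, 1)), label %T, label %F
    //   ==>  switch i32 %X, label %F [ i32 0, label %T   i32 1, label %T ]
    static void buildSwitchFromICmpChain(BranchInst *BI, Value *CompVal,
                                         const SmallVectorImpl<ConstantInt*> &Values,
                                         BasicBlock *EdgeBB, BasicBlock *DefaultBB) {
      IRBuilder<> Builder(BI);
      SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size());
      for (unsigned i = 0, e = Values.size(); i != e; ++i)
        New->addCase(Values[i], EdgeBB);   // every matched constant -> EdgeBB
      // The real code also DCEs the now-dead or/icmp chain when it drops BI.
      BI->eraseFromParent();
    }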
@@ -2145,14 +2203,16 @@ bool SimplifyCFGOpt::SimplifyUnwind(UnwindInst *UI) {
if (II && II->getUnwindDest() == BB) {
// Insert a new branch instruction before the invoke, because this
// is now a fall through.
- BranchInst *BI = BranchInst::Create(II->getNormalDest(), II);
+ Builder.SetInsertPoint(II);
+ BranchInst *BI = Builder.CreateBr(II->getNormalDest());
Pred->getInstList().remove(II); // Take out of symbol table
// Insert the call now.
SmallVector<Value*,8> Args(II->op_begin(), II->op_end()-3);
- CallInst *CI = CallInst::Create(II->getCalledValue(),
- Args.begin(), Args.end(),
- II->getName(), BI);
+ Builder.SetInsertPoint(BI);
+ CallInst *CI = Builder.CreateCall(II->getCalledValue(),
+ Args.begin(), Args.end(),
+ II->getName());
CI->setCallingConv(II->getCallingConv());
CI->setAttributes(II->getAttributes());
// If the invoke produced a value, the Call now does instead.
@@ -2211,7 +2271,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
SmallVector<BasicBlock*, 8> Preds(pred_begin(BB), pred_end(BB));
for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
TerminatorInst *TI = Preds[i]->getTerminator();
-
+ IRBuilder<> Builder(TI);
if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
if (BI->isUnconditional()) {
if (BI->getSuccessor(0) == BB) {
@@ -2221,10 +2281,10 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
}
} else {
if (BI->getSuccessor(0) == BB) {
- BranchInst::Create(BI->getSuccessor(1), BI);
+ Builder.CreateBr(BI->getSuccessor(1));
EraseTerminatorInstAndDCECond(BI);
} else if (BI->getSuccessor(1) == BB) {
- BranchInst::Create(BI->getSuccessor(0), BI);
+ Builder.CreateBr(BI->getSuccessor(0));
EraseTerminatorInstAndDCECond(BI);
Changed = true;
}
@@ -2288,14 +2348,15 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
if (II->getUnwindDest() == BB) {
// Convert the invoke to a call instruction. This would be a good
// place to note that the call does not throw though.
- BranchInst *BI = BranchInst::Create(II->getNormalDest(), II);
+ BranchInst *BI = Builder.CreateBr(II->getNormalDest());
II->removeFromParent(); // Take out of symbol table
// Insert the call now...
SmallVector<Value*, 8> Args(II->op_begin(), II->op_end()-3);
- CallInst *CI = CallInst::Create(II->getCalledValue(),
- Args.begin(), Args.end(),
- II->getName(), BI);
+ Builder.SetInsertPoint(BI);
+ CallInst *CI = Builder.CreateCall(II->getCalledValue(),
+ Args.begin(), Args.end(),
+ II->getName());
CI->setCallingConv(II->getCallingConv());
CI->setAttributes(II->getAttributes());
// If the invoke produced a value, the call does now instead.
@@ -2319,7 +2380,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
/// TurnSwitchRangeIntoICmp - Turns a switch that contains only an
/// integer range comparison into a sub, an icmp and a branch.
-static bool TurnSwitchRangeIntoICmp(SwitchInst *SI) {
+static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
assert(SI->getNumCases() > 2 && "Degenerate switch?");
// Make sure all cases point to the same destination and gather the values.
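Both hunks above perform the same invoke-to-call demotion, so it is worth isolating: when an invoke's unwind edge is useless (it lands on a bare unwind, or on unreachable), the invoke degrades to an ordinary call followed by a fall-through branch to the normal destination. A minimal sketch, mirroring the IRBuilder calls the patch itself uses; demoteInvokeToCall is an illustrative name and PHI cleanup in the unwind block is left out:

    #include "llvm/Instructions.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Support/IRBuilder.h"
    using namespace llvm;

    static void demoteInvokeToCall(InvokeInst *II) {
      IRBuilder<> Builder(II);
      // Fall through to the old normal destination.
      BranchInst *BI = Builder.CreateBr(II->getNormalDest());
      // The invoke's last three operands are the callee and two destinations,
      // so everything before them is the argument list.
      SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3);
      Builder.SetInsertPoint(BI);
      CallInst *CI = Builder.CreateCall(II->getCalledValue(),
                                        Args.begin(), Args.end(), II->getName());
      CI->setCallingConv(II->getCallingConv());
      CI->setAttributes(II->getAttributes());
      II->replaceAllUsesWith(CI);            // the call now produces the value
      II->getUnwindDest()->removePredecessor(II->getParent());
      II->eraseFromParent();
    }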
@@ -2344,9 +2405,9 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI) { Value *Sub = SI->getCondition(); if (!Offset->isNullValue()) - Sub = BinaryOperator::CreateAdd(Sub, Offset, Sub->getName()+".off", SI); - Value *Cmp = new ICmpInst(SI, ICmpInst::ICMP_ULT, Sub, NumCases, "switch"); - BranchInst::Create(SI->getSuccessor(1), SI->getDefaultDest(), Cmp, SI); + Sub = Builder.CreateAdd(Sub, Offset, Sub->getName()+".off"); + Value *Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch"); + Builder.CreateCondBr(Cmp, SI->getSuccessor(1), SI->getDefaultDest()); // Prune obsolete incoming values off the successor's PHI nodes. for (BasicBlock::iterator BBI = SI->getSuccessor(1)->begin(); @@ -2359,7 +2420,37 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI) { return true; } -bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI) { +/// EliminateDeadSwitchCases - Compute masked bits for the condition of a switch +/// and use it to remove dead cases. +static bool EliminateDeadSwitchCases(SwitchInst *SI) { + Value *Cond = SI->getCondition(); + unsigned Bits = cast<IntegerType>(Cond->getType())->getBitWidth(); + APInt KnownZero(Bits, 0), KnownOne(Bits, 0); + ComputeMaskedBits(Cond, APInt::getAllOnesValue(Bits), KnownZero, KnownOne); + + // Gather dead cases. + SmallVector<ConstantInt*, 8> DeadCases; + for (unsigned I = 1, E = SI->getNumCases(); I != E; ++I) { + if ((SI->getCaseValue(I)->getValue() & KnownZero) != 0 || + (SI->getCaseValue(I)->getValue() & KnownOne) != KnownOne) { + DeadCases.push_back(SI->getCaseValue(I)); + DEBUG(dbgs() << "SimplifyCFG: switch case '" + << SI->getCaseValue(I)->getValue() << "' is dead.\n"); + } + } + + // Remove dead cases from the switch. + for (unsigned I = 0, E = DeadCases.size(); I != E; ++I) { + unsigned Case = SI->findCaseValue(DeadCases[I]); + // Prune unused values from PHI nodes. + SI->getSuccessor(Case)->removePredecessor(SI->getParent()); + SI->removeCase(Case); + } + + return !DeadCases.empty(); +} + +bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { // If this switch is too complex to want to look at, ignore it. if (!isValueEqualityComparison(SI)) return false; @@ -2369,7 +2460,7 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI) { // If we only have one predecessor, and if it is a branch on this value, // see if that predecessor totally determines the outcome of this switch. if (BasicBlock *OnlyPred = BB->getSinglePredecessor()) - if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred)) + if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder)) return SimplifyCFG(BB) | true; Value *Cond = SI->getCondition(); @@ -2384,13 +2475,17 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI) { while (isa<DbgInfoIntrinsic>(BBI)) ++BBI; if (SI == &*BBI) - if (FoldValueComparisonIntoPredecessors(SI)) + if (FoldValueComparisonIntoPredecessors(SI, Builder)) return SimplifyCFG(BB) | true; // Try to transform the switch into an icmp and a branch. - if (TurnSwitchRangeIntoICmp(SI)) + if (TurnSwitchRangeIntoICmp(SI, Builder)) return SimplifyCFG(BB) | true; - + + // Remove unreachable cases. 
+ if (EliminateDeadSwitchCases(SI)) + return SimplifyCFG(BB) | true; + return false; } @@ -2431,7 +2526,7 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) { return Changed; } -bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI) { +bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ BasicBlock *BB = BI->getParent(); // If the Terminator is the only non-phi instruction, simplify the block. @@ -2446,7 +2541,8 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI) { if (ICI->isEquality() && isa<ConstantInt>(ICI->getOperand(1))) { for (++I; isa<DbgInfoIntrinsic>(I); ++I) ; - if (I->isTerminator() && TryToSimplifyUncondBranchWithICmpInIt(ICI, TD)) + if (I->isTerminator() + && TryToSimplifyUncondBranchWithICmpInIt(ICI, TD, Builder)) return true; } @@ -2454,7 +2550,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI) { } -bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI) { +bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { BasicBlock *BB = BI->getParent(); // Conditional branch @@ -2463,7 +2559,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI) { // see if that predecessor totally determines the outcome of this // switch. if (BasicBlock *OnlyPred = BB->getSinglePredecessor()) - if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred)) + if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder)) return SimplifyCFG(BB) | true; // This block must be empty, except for the setcond inst, if it exists. @@ -2473,20 +2569,20 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI) { while (isa<DbgInfoIntrinsic>(I)) ++I; if (&*I == BI) { - if (FoldValueComparisonIntoPredecessors(BI)) + if (FoldValueComparisonIntoPredecessors(BI, Builder)) return SimplifyCFG(BB) | true; } else if (&*I == cast<Instruction>(BI->getCondition())){ ++I; // Ignore dbg intrinsics. while (isa<DbgInfoIntrinsic>(I)) ++I; - if (&*I == BI && FoldValueComparisonIntoPredecessors(BI)) + if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder)) return SimplifyCFG(BB) | true; } } // Try to turn "br (X == 0 | X == 1), T, F" into a switch instruction. - if (SimplifyBranchOnICmpChain(BI, TD)) + if (SimplifyBranchOnICmpChain(BI, TD, Builder)) return true; // We have a conditional branch to two blocks that are only reachable @@ -2557,7 +2653,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { // Check to see if we can constant propagate this terminator instruction // away... - Changed |= ConstantFoldTerminator(BB); + Changed |= ConstantFoldTerminator(BB, true); // Check for and eliminate duplicate PHI nodes in this block. Changed |= EliminateDuplicatePHINodes(BB); @@ -2569,27 +2665,30 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { if (MergeBlockIntoPredecessor(BB)) return true; + IRBuilder<> Builder(BB); + // If there is a trivial two-entry PHI node in this basic block, and we can // eliminate it, do so now. 
if (PHINode *PN = dyn_cast<PHINode>(BB->begin())) if (PN->getNumIncomingValues() == 2) Changed |= FoldTwoEntryPHINode(PN, TD); + Builder.SetInsertPoint(BB->getTerminator()); if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { if (BI->isUnconditional()) { - if (SimplifyUncondBranch(BI)) return true; + if (SimplifyUncondBranch(BI, Builder)) return true; } else { - if (SimplifyCondBranch(BI)) return true; + if (SimplifyCondBranch(BI, Builder)) return true; } } else if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) { - if (SimplifyReturn(RI)) return true; + if (SimplifyReturn(RI, Builder)) return true; } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) { - if (SimplifySwitch(SI)) return true; + if (SimplifySwitch(SI, Builder)) return true; } else if (UnreachableInst *UI = dyn_cast<UnreachableInst>(BB->getTerminator())) { if (SimplifyUnreachable(UI)) return true; } else if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) { - if (SimplifyUnwind(UI)) return true; + if (SimplifyUnwind(UI, Builder)) return true; } else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(BB->getTerminator())) { if (SimplifyIndirectBr(IBI)) return true; diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index f5481d3..a73bf04 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -39,7 +39,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, return VM[V] = const_cast<Value*>(V); // Create a dummy node in case we have a metadata cycle. - MDNode *Dummy = MDNode::getTemporary(V->getContext(), 0, 0); + MDNode *Dummy = MDNode::getTemporary(V->getContext(), ArrayRef<Value*>()); VM[V] = Dummy; // Check all operands to see if any need to be remapped. @@ -54,7 +54,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, Value *Op = MD->getOperand(i); Elts.push_back(Op ? MapValue(Op, VM, Flags) : 0); } - MDNode *NewMD = MDNode::get(V->getContext(), Elts.data(), Elts.size()); + MDNode *NewMD = MDNode::get(V->getContext(), Elts); Dummy->replaceAllUsesWith(NewMD); VM[V] = NewMD; MDNode::deleteTemporary(Dummy); diff --git a/lib/VMCore/AsmWriter.cpp b/lib/VMCore/AsmWriter.cpp index ffd367a..cfcffeb 100644 --- a/lib/VMCore/AsmWriter.cpp +++ b/lib/VMCore/AsmWriter.cpp @@ -32,6 +32,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/CFG.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/ErrorHandling.h" @@ -39,9 +40,13 @@ #include "llvm/Support/FormattedStream.h" #include <algorithm> #include <cctype> -#include <map> using namespace llvm; +static cl::opt<bool> +EnableDebugInfoComment("enable-debug-info-comment", cl::Hidden, + cl::desc("Enable debug info comments")); + + // Make virtual table appear in this compilation unit. AssemblyAnnotationWriter::~AssemblyAnnotationWriter() {} @@ -89,7 +94,7 @@ enum PrefixType { /// prefixed with % (if the string only contains simple characters) or is /// surrounded with ""'s (if it has special chars in it). Print it out. 
static void PrintLLVMName(raw_ostream &OS, StringRef Name, PrefixType Prefix) { - assert(Name.data() && "Cannot get empty name!"); + assert(!Name.empty() && "Cannot get empty name!"); switch (Prefix) { default: llvm_unreachable("Bad prefix!"); case NoPrefix: break; @@ -1075,7 +1080,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, } if (CE->hasIndices()) { - const SmallVector<unsigned, 4> &Indices = CE->getIndices(); + ArrayRef<unsigned> Indices = CE->getIndices(); for (unsigned i = 0, e = Indices.size(); i != e; ++i) Out << ", " << Indices[i]; } @@ -1396,7 +1401,25 @@ void AssemblyWriter::printModule(const Module *M) { } void AssemblyWriter::printNamedMDNode(const NamedMDNode *NMD) { - Out << "!" << NMD->getName() << " = !{"; + Out << '!'; + StringRef Name = NMD->getName(); + if (Name.empty()) { + Out << "<empty name> "; + } else { + if (isalpha(Name[0]) || Name[0] == '-' || Name[0] == '$' || + Name[0] == '.' || Name[0] == '_') + Out << Name[0]; + else + Out << '\\' << hexdigit(Name[0] >> 4) << hexdigit(Name[0] & 0x0F); + for (unsigned i = 1, e = Name.size(); i != e; ++i) { + unsigned char C = Name[i]; + if (isalnum(C) || C == '-' || C == '$' || C == '.' || C == '_') + Out << C; + else + Out << '\\' << hexdigit(C >> 4) << hexdigit(C & 0x0F); + } + } + Out << " = !{"; for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) { if (i) Out << ", "; int Slot = Machine.getMetadataSlot(NMD->getOperand(i)); @@ -1730,6 +1753,18 @@ void AssemblyWriter::printBasicBlock(const BasicBlock *BB) { if (AnnotationWriter) AnnotationWriter->emitBasicBlockEndAnnot(BB, Out); } +/// printDebugLoc - Print DebugLoc. +static void printDebugLoc(const DebugLoc &DL, formatted_raw_ostream &OS) { + OS << DL.getLine() << ":" << DL.getCol(); + if (MDNode *N = DL.getInlinedAt(getGlobalContext())) { + DebugLoc IDL = DebugLoc::getFromDILocation(N); + if (!IDL.isUnknown()) { + OS << "@"; + printDebugLoc(IDL,OS); + } + } +} + /// printInfoComment - Print a little comment after the instruction indicating /// which slot it occupies. 
/// @@ -1737,6 +1772,43 @@ void AssemblyWriter::printInfoComment(const Value &V) { if (AnnotationWriter) { AnnotationWriter->printInfoComment(V, Out); return; + } else if (EnableDebugInfoComment) { + bool Padded = false; + if (const Instruction *I = dyn_cast<Instruction>(&V)) { + const DebugLoc &DL = I->getDebugLoc(); + if (!DL.isUnknown()) { + if (!Padded) { + Out.PadToColumn(50); + Padded = true; + Out << ";"; + } + Out << " [debug line = "; + printDebugLoc(DL,Out); + Out << "]"; + } + if (const DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(I)) { + const MDNode *Var = DDI->getVariable(); + if (!Padded) { + Out.PadToColumn(50); + Padded = true; + Out << ";"; + } + if (Var && Var->getNumOperands() >= 2) + if (MDString *MDS = dyn_cast_or_null<MDString>(Var->getOperand(2))) + Out << " [debug variable = " << MDS->getString() << "]"; + } + else if (const DbgValueInst *DVI = dyn_cast<DbgValueInst>(I)) { + const MDNode *Var = DVI->getVariable(); + if (!Padded) { + Out.PadToColumn(50); + Padded = true; + Out << ";"; + } + if (Var && Var->getNumOperands() >= 2) + if (MDString *MDS = dyn_cast_or_null<MDString>(Var->getOperand(2))) + Out << " [debug variable = " << MDS->getString() << "]"; + } + } } } diff --git a/lib/VMCore/Attributes.cpp b/lib/VMCore/Attributes.cpp index 92152a3..bf6efa1 100644 --- a/lib/VMCore/Attributes.cpp +++ b/lib/VMCore/Attributes.cpp @@ -36,6 +36,8 @@ std::string Attribute::getAsString(Attributes Attrs) { Result += "noreturn "; if (Attrs & Attribute::NoUnwind) Result += "nounwind "; + if (Attrs & Attribute::UWTable) + Result += "uwtable "; if (Attrs & Attribute::InReg) Result += "inreg "; if (Attrs & Attribute::NoAlias) @@ -72,6 +74,8 @@ std::string Attribute::getAsString(Attributes Attrs) { Result += "naked "; if (Attrs & Attribute::Hotpatch) Result += "hotpatch "; + if (Attrs & Attribute::NonLazyBind) + Result += "nonlazybind "; if (Attrs & Attribute::StackAlignment) { Result += "alignstack("; Result += utostr(Attribute::getStackAlignmentFromAttrs(Attrs)); diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp index 9e551bb..9d4543d 100644 --- a/lib/VMCore/AutoUpgrade.cpp +++ b/lib/VMCore/AutoUpgrade.cpp @@ -284,8 +284,58 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { break; } + // This upgrades the llvm.prefetch intrinsic to accept one more parameter, + // which is an instruction / data cache identifier. The old version only + // implicitly accepted the data version. + if (Name.compare(5,8,"prefetch",8) == 0) { + // Don't do anything if it has the correct number of arguments already + if (FTy->getNumParams() == 4) + break; + + assert(FTy->getNumParams() == 3 && "old prefetch takes 3 args!"); + // We first need to change the name of the old (bad) intrinsic, because + // its type is incorrect, but we cannot overload that name. We + // arbitrarily unique it here allowing us to construct a correctly named + // and typed function below. 
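For reference, the upgraded llvm.prefetch (its construction continues just below) takes (ptr, rw, locality, cachetype), where cachetype 1 selects the data cache, the behavior the old three-argument form implied. A hedged sketch of emitting the new form directly; the helper name is hypothetical and Ptr is assumed to already be an i8*:

#include "llvm/Constants.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/Module.h"
using namespace llvm;

CallInst *emitDataPrefetch(Module &M, Value *Ptr, Instruction *InsertPt) {
  LLVMContext &C = M.getContext();
  const Type *I32Ty = Type::getInt32Ty(C);
  Value *Ops[4] = { Ptr,                          // address, must be i8*
                    ConstantInt::get(I32Ty, 0),   // rw: 0 = read
                    ConstantInt::get(I32Ty, 3),   // locality: keep in cache
                    ConstantInt::get(I32Ty, 1) }; // cache type: 1 = data
  Function *Fn = Intrinsic::getDeclaration(&M, Intrinsic::prefetch);
  return CallInst::Create(Fn, Ops, Ops + 4, "", InsertPt);
}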
+ F->setName(""); + NewFn = cast<Function>(M->getOrInsertFunction(Name, + FTy->getReturnType(), + FTy->getParamType(0), + FTy->getParamType(1), + FTy->getParamType(2), + FTy->getParamType(2), + (Type*)0)); + return true; + } + break; - case 'x': + case 'x': + // This fixes the poorly named crc32 intrinsics + if (Name.compare(5, 13, "x86.sse42.crc", 13) == 0) { + const char* NewFnName = NULL; + if (Name.compare(18, 2, "32", 2) == 0) { + if (Name.compare(20, 2, ".8") == 0 && Name.length() == 22) { + NewFnName = "llvm.x86.sse42.crc32.32.8"; + } else if (Name.compare(20, 3, ".16") == 0 && Name.length() == 23) { + NewFnName = "llvm.x86.sse42.crc32.32.16"; + } else if (Name.compare(20, 3, ".32") == 0 && Name.length() == 23) { + NewFnName = "llvm.x86.sse42.crc32.32.32"; + } + } + else if (Name.compare(18, 2, "64", 2) == 0) { + if (Name.compare(20, 2, ".8") == 0 && Name.length() == 22) { + NewFnName = "llvm.x86.sse42.crc32.64.8"; + } else if (Name.compare(20, 3, ".64") == 0 && Name.length() == 23) { + NewFnName = "llvm.x86.sse42.crc32.64.64"; + } + } + if (NewFnName) { + F->setName(NewFnName); + NewFn = F; + return true; + } + } + // This fixes all MMX shift intrinsic instructions to take a // x86_mmx instead of a v1i64, v2i32, v4i16, or v8i8. if (Name.compare(5, 8, "x86.mmx.", 8) == 0) { @@ -527,6 +577,19 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { // or 0. NewFn = 0; return true; + } else if (Name.compare(5, 16, "x86.sse.loadu.ps", 16) == 0 || + Name.compare(5, 17, "x86.sse2.loadu.dq", 17) == 0 || + Name.compare(5, 17, "x86.sse2.loadu.pd", 17) == 0) { + // Calls to these instructions are transformed into unaligned loads. + NewFn = 0; + return true; + } else if (Name.compare(5, 16, "x86.sse.movnt.ps", 16) == 0 || + Name.compare(5, 17, "x86.sse2.movnt.dq", 17) == 0 || + Name.compare(5, 17, "x86.sse2.movnt.pd", 17) == 0 || + Name.compare(5, 17, "x86.sse2.movnt.i", 16) == 0) { + // Calls to these instructions are transformed into nontemporal stores. + NewFn = 0; + return true; } else if (Name.compare(5, 17, "x86.ssse3.pshuf.w", 17) == 0) { // This is an SSE/MMX instruction. const Type *X86_MMXTy = VectorType::getX86_MMXTy(FTy->getContext()); @@ -946,7 +1009,54 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { // Remove upgraded instruction. CI->eraseFromParent(); - + + } else if (F->getName() == "llvm.x86.sse.loadu.ps" || + F->getName() == "llvm.x86.sse2.loadu.dq" || + F->getName() == "llvm.x86.sse2.loadu.pd") { + // Convert to a native, unaligned load. + const Type *VecTy = CI->getType(); + const Type *IntTy = IntegerType::get(C, 128); + IRBuilder<> Builder(C); + Builder.SetInsertPoint(CI->getParent(), CI); + + Value *BC = Builder.CreateBitCast(CI->getArgOperand(0), + PointerType::getUnqual(IntTy), + "cast"); + LoadInst *LI = Builder.CreateLoad(BC, CI->getName()); + LI->setAlignment(1); // Unaligned load. + BC = Builder.CreateBitCast(LI, VecTy, "new.cast"); + + // Fix up all the uses with our new load. + if (!CI->use_empty()) + CI->replaceAllUsesWith(BC); + + // Remove intrinsic. 
+ CI->eraseFromParent(); + } else if (F->getName() == "llvm.x86.sse.movnt.ps" || + F->getName() == "llvm.x86.sse2.movnt.dq" || + F->getName() == "llvm.x86.sse2.movnt.pd" || + F->getName() == "llvm.x86.sse2.movnt.i") { + IRBuilder<> Builder(C); + Builder.SetInsertPoint(CI->getParent(), CI); + + Module *M = F->getParent(); + SmallVector<Value *, 1> Elts; + Elts.push_back(ConstantInt::get(Type::getInt32Ty(C), 1)); + MDNode *Node = MDNode::get(C, Elts); + + Value *Arg0 = CI->getArgOperand(0); + Value *Arg1 = CI->getArgOperand(1); + + // Convert the type of the pointer to a pointer to the stored type. + Value *BC = Builder.CreateBitCast(Arg0, + PointerType::getUnqual(Arg1->getType()), + "cast"); + StoreInst *SI = Builder.CreateStore(Arg1, BC); + SI->setMetadata(M->getMDKindID("nontemporal"), Node); + SI->setAlignment(16); + + // Remove intrinsic. + CI->eraseFromParent(); } else { llvm_unreachable("Unknown function for CallInst upgrade."); } @@ -1258,6 +1368,29 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { CI->eraseFromParent(); break; } + case Intrinsic::prefetch: { + IRBuilder<> Builder(C); + Builder.SetInsertPoint(CI->getParent(), CI); + const llvm::Type *I32Ty = llvm::Type::getInt32Ty(CI->getContext()); + + // Add the extra "data cache" argument + Value *Operands[4] = { CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), + llvm::ConstantInt::get(I32Ty, 1) }; + CallInst *NewCI = CallInst::Create(NewFn, Operands, Operands+4, + CI->getName(), CI); + NewCI->setTailCall(CI->isTailCall()); + NewCI->setCallingConv(CI->getCallingConv()); + // Handle any uses of the old CallInst. + if (!CI->use_empty()) + // Replace all uses of the old call with the new cast which has the + // correct type. + CI->replaceAllUsesWith(NewCI); + + // Clean up the old call now that it has been completely upgraded. + CI->eraseFromParent(); + break; + } } } diff --git a/lib/VMCore/ConstantFold.cpp b/lib/VMCore/ConstantFold.cpp index 573efb7..9985ada 100644 --- a/lib/VMCore/ConstantFold.cpp +++ b/lib/VMCore/ConstantFold.cpp @@ -24,6 +24,7 @@ #include "llvm/Function.h" #include "llvm/GlobalAlias.h" #include "llvm/GlobalVariable.h" +#include "llvm/Operator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" @@ -1735,7 +1736,7 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2, // with a single zero index, it must be nonzero. assert(CE1->getNumOperands() == 2 && !CE1->getOperand(1)->isNullValue() && - "Suprising getelementptr!"); + "Surprising getelementptr!"); return isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; } else { // If they are different globals, we don't know what the value is, diff --git a/lib/VMCore/Constants.cpp b/lib/VMCore/Constants.cpp index 7a4dcf9..15d7793 100644 --- a/lib/VMCore/Constants.cpp +++ b/lib/VMCore/Constants.cpp @@ -32,7 +32,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include <algorithm> -#include <map> #include <cstdarg> using namespace llvm; @@ -771,7 +770,7 @@ bool ConstantExpr::hasIndices() const { getOpcode() == Instruction::InsertValue; } -const SmallVector<unsigned, 4> &ConstantExpr::getIndices() const { +ArrayRef<unsigned> ConstantExpr::getIndices() const { if (const ExtractValueConstantExpr *EVCE = dyn_cast<ExtractValueConstantExpr>(this)) return EVCE->Indices; @@ -855,10 +854,10 @@ ConstantExpr::getWithOperandReplaced(unsigned OpNo, Constant *Op) const { /// operands replaced with the specified values. 
The specified operands must /// match count and type with the existing ones. Constant *ConstantExpr:: -getWithOperands(Constant *const *Ops, unsigned NumOps) const { - assert(NumOps == getNumOperands() && "Operand count mismatch!"); +getWithOperands(ArrayRef<Constant*> Ops) const { + assert(Ops.size() == getNumOperands() && "Operand count mismatch!"); bool AnyChange = false; - for (unsigned i = 0; i != NumOps; ++i) { + for (unsigned i = 0; i != Ops.size(); ++i) { assert(Ops[i]->getType() == getOperand(i)->getType() && "Operand type mismatch!"); AnyChange |= Ops[i] != getOperand(i); @@ -890,8 +889,8 @@ getWithOperands(Constant *const *Ops, unsigned NumOps) const { return ConstantExpr::getShuffleVector(Ops[0], Ops[1], Ops[2]); case Instruction::GetElementPtr: return cast<GEPOperator>(this)->isInBounds() ? - ConstantExpr::getInBoundsGetElementPtr(Ops[0], &Ops[1], NumOps-1) : - ConstantExpr::getGetElementPtr(Ops[0], &Ops[1], NumOps-1); + ConstantExpr::getInBoundsGetElementPtr(Ops[0], &Ops[1], Ops.size()-1) : + ConstantExpr::getGetElementPtr(Ops[0], &Ops[1], Ops.size()-1); case Instruction::ICmp: case Instruction::FCmp: return ConstantExpr::getCompare(getPredicate(), Ops[0], Ops[1]); @@ -2151,7 +2150,7 @@ void ConstantExpr::replaceUsesOfWithOnConstant(Value *From, Value *ToV, Constant *Agg = getOperand(0); if (Agg == From) Agg = To; - const SmallVector<unsigned, 4> &Indices = getIndices(); + ArrayRef<unsigned> Indices = getIndices(); Replacement = ConstantExpr::getExtractValue(Agg, &Indices[0], Indices.size()); } else if (getOpcode() == Instruction::InsertValue) { @@ -2160,7 +2159,7 @@ void ConstantExpr::replaceUsesOfWithOnConstant(Value *From, Value *ToV, if (Agg == From) Agg = To; if (Val == From) Val = To; - const SmallVector<unsigned, 4> &Indices = getIndices(); + ArrayRef<unsigned> Indices = getIndices(); Replacement = ConstantExpr::getInsertValue(Agg, Val, &Indices[0], Indices.size()); } else if (isCast()) { diff --git a/lib/VMCore/ConstantsContext.h b/lib/VMCore/ConstantsContext.h index ffc673f..1395754 100644 --- a/lib/VMCore/ConstantsContext.h +++ b/lib/VMCore/ConstantsContext.h @@ -301,20 +301,18 @@ struct OperandTraits<CompareConstantExpr> : DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CompareConstantExpr, Value) struct ExprMapKeyType { - typedef SmallVector<unsigned, 4> IndexList; - ExprMapKeyType(unsigned opc, - const std::vector<Constant*> &ops, + ArrayRef<Constant*> ops, unsigned short flags = 0, unsigned short optionalflags = 0, - const IndexList &inds = IndexList()) + ArrayRef<unsigned> inds = ArrayRef<unsigned>()) : opcode(opc), subclassoptionaldata(optionalflags), subclassdata(flags), - operands(ops), indices(inds) {} + operands(ops.begin(), ops.end()), indices(inds.begin(), inds.end()) {} uint8_t opcode; uint8_t subclassoptionaldata; uint16_t subclassdata; std::vector<Constant*> operands; - IndexList indices; + SmallVector<unsigned, 4> indices; bool operator==(const ExprMapKeyType& that) const { return this->opcode == that.opcode && this->subclassdata == that.subclassdata && @@ -465,7 +463,7 @@ struct ConstantKeyData<ConstantExpr> { CE->isCompare() ? CE->getPredicate() : 0, CE->getRawSubclassOptionalData(), CE->hasIndices() ? 
- CE->getIndices() : SmallVector<unsigned, 4>()); + CE->getIndices() : ArrayRef<unsigned>()); } }; diff --git a/lib/VMCore/Core.cpp b/lib/VMCore/Core.cpp index 986b403..92f9440 100644 --- a/lib/VMCore/Core.cpp +++ b/lib/VMCore/Core.cpp @@ -335,7 +335,7 @@ unsigned LLVMCountStructElementTypes(LLVMTypeRef StructTy) { void LLVMGetStructElementTypes(LLVMTypeRef StructTy, LLVMTypeRef *Dest) { StructType *Ty = unwrap<StructType>(StructTy); - for (FunctionType::param_iterator I = Ty->element_begin(), + for (StructType::element_iterator I = Ty->element_begin(), E = Ty->element_end(); I != E; ++I) *Dest++ = wrap(*I); } @@ -543,7 +543,8 @@ LLVMValueRef LLVMMDString(const char *Str, unsigned SLen) { LLVMValueRef LLVMMDNodeInContext(LLVMContextRef C, LLVMValueRef *Vals, unsigned Count) { - return wrap(MDNode::get(*unwrap(C), unwrap<Value>(Vals, Count), Count)); + return wrap(MDNode::get(*unwrap(C), + ArrayRef<Value*>(unwrap<Value>(Vals, Count), Count))); } LLVMValueRef LLVMMDNode(LLVMValueRef *Vals, unsigned Count) { diff --git a/lib/VMCore/DebugInfoProbe.cpp b/lib/VMCore/DebugInfoProbe.cpp index 57d17a6..d1275ff 100644 --- a/lib/VMCore/DebugInfoProbe.cpp +++ b/lib/VMCore/DebugInfoProbe.cpp @@ -51,44 +51,28 @@ namespace llvm { unsigned NumDbgLineLost, NumDbgValueLost; std::string PassName; Function *TheFn; - std::set<unsigned> LineNos; std::set<MDNode *> DbgVariables; + std::set<Instruction *> MissingDebugLoc; }; } //===----------------------------------------------------------------------===// // DebugInfoProbeImpl -static void collect(Function &F, std::set<unsigned> &Lines) { - for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) - for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); - BI != BE; ++BI) { - const DebugLoc &DL = BI->getDebugLoc(); - unsigned LineNo = 0; - if (!DL.isUnknown()) { - if (MDNode *N = DL.getInlinedAt(F.getContext())) - LineNo = DebugLoc::getFromDILocation(N).getLine(); - else - LineNo = DL.getLine(); - - Lines.insert(LineNo); - } - } -} - /// initialize - Collect information before running an optimization pass. void DebugInfoProbeImpl::initialize(StringRef PName, Function &F) { if (!EnableDebugInfoProbe) return; PassName = PName; - LineNos.clear(); DbgVariables.clear(); + MissingDebugLoc.clear(); TheFn = &F; - collect(F, LineNos); for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ++BI) { + if (!isa<PHINode>(BI) && BI->getDebugLoc().isUnknown()) + MissingDebugLoc.insert(BI); if (!isa<DbgInfoIntrinsic>(BI)) continue; Value *Addr = NULL; MDNode *Node = NULL; @@ -127,23 +111,19 @@ void DebugInfoProbeImpl::report() { /// must be used after initialization. 
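LLVMMDNodeInContext above now forwards into the ArrayRef overload, but the C signature is unchanged, so existing bindings keep compiling. A small usage sketch with arbitrary values:

#include "llvm-c/Core.h"

int main() {
  LLVMContextRef C = LLVMContextCreate();
  LLVMValueRef Vals[2];
  Vals[0] = LLVMMDStringInContext(C, "answer", 6);
  Vals[1] = LLVMConstInt(LLVMInt32TypeInContext(C), 42, 0);
  // Count-and-pointer at the C boundary, ArrayRef on the C++ side.
  LLVMValueRef Node = LLVMMDNodeInContext(C, Vals, 2);
  (void)Node;
  LLVMContextDispose(C);
  return 0;
}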
void DebugInfoProbeImpl::finalize(Function &F) { if (!EnableDebugInfoProbe) return; - std::set<unsigned> LineNos2; - collect(F, LineNos2); assert (TheFn == &F && "Invalid function to measure!"); - for (std::set<unsigned>::iterator I = LineNos.begin(), - E = LineNos.end(); I != E; ++I) { - unsigned LineNo = *I; - if (LineNos2.count(LineNo) == 0) { - DEBUG(dbgs() << "DebugInfoProbe: Losing dbg info intrinsic at line " << LineNo << "\n"); - ++NumDbgLineLost; - } - } - std::set<MDNode *>DbgVariables2; for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ++BI) { + if (!isa<PHINode>(BI) && BI->getDebugLoc().isUnknown() && + MissingDebugLoc.count(BI) == 0) { + ++NumDbgLineLost; + DEBUG(dbgs() << "DebugInfoProbe (" << PassName << "): --- "); + DEBUG(BI->print(dbgs())); + DEBUG(dbgs() << "\n"); + } if (!isa<DbgInfoIntrinsic>(BI)) continue; Value *Addr = NULL; MDNode *Node = NULL; @@ -160,9 +140,17 @@ void DebugInfoProbeImpl::finalize(Function &F) { for (std::set<MDNode *>::iterator I = DbgVariables.begin(), E = DbgVariables.end(); I != E; ++I) { - if (DbgVariables2.count(*I) == 0) { - DEBUG(dbgs() << "DebugInfoProbe: Losing dbg info for variable: "); - DEBUG((*I)->print(dbgs())); + if (DbgVariables2.count(*I) == 0 && (*I)->getNumOperands() >= 2) { + DEBUG(dbgs() + << "DebugInfoProbe(" + << PassName + << "): Losing dbg info for variable: "; + if (MDString *MDS = dyn_cast_or_null<MDString>( + (*I)->getOperand(2))) + dbgs() << MDS->getString(); + else + dbgs() << "..."; + dbgs() << "\n"); ++NumDbgValueLost; } } diff --git a/lib/VMCore/DebugLoc.cpp b/lib/VMCore/DebugLoc.cpp index 3569162..520333c 100644 --- a/lib/VMCore/DebugLoc.cpp +++ b/lib/VMCore/DebugLoc.cpp @@ -109,7 +109,7 @@ MDNode *DebugLoc::getAsMDNode(const LLVMContext &Ctx) const { ConstantInt::get(Int32, getLine()), ConstantInt::get(Int32, getCol()), Scope, IA }; - return MDNode::get(Ctx2, &Elts[0], 4); + return MDNode::get(Ctx2, Elts); } /// getFromDILocation - Translate the DILocation quad into a DebugLoc. diff --git a/lib/VMCore/Function.cpp b/lib/VMCore/Function.cpp index 00d1d78..0ae0bdb 100644 --- a/lib/VMCore/Function.cpp +++ b/lib/VMCore/Function.cpp @@ -24,6 +24,7 @@ #include "llvm/Support/Threading.h" #include "SymbolTableListTraitsImpl.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" using namespace llvm; @@ -78,6 +79,12 @@ bool Argument::hasByValAttr() const { return getParent()->paramHasAttr(getArgNo()+1, Attribute::ByVal); } +unsigned Argument::getParamAlignment() const { + assert(getType()->isPointerTy() && "Only pointers have alignments"); + return getParent()->getParamAlignment(getArgNo()+1); + +} + /// hasNestAttr - Return true if this argument has the nest attribute on /// it in its containing function. 
bool Argument::hasNestAttr() const { @@ -328,7 +335,7 @@ unsigned Function::getIntrinsicID() const { std::string Intrinsic::getName(ID id, const Type **Tys, unsigned numTys) { assert(id < num_intrinsics && "Invalid intrinsic ID!"); - const char * const Table[] = { + static const char * const Table[] = { "not_intrinsic", #define GET_INTRINSIC_NAME_TABLE #include "llvm/Intrinsics.gen" @@ -363,7 +370,7 @@ const FunctionType *Intrinsic::getType(LLVMContext &Context, } bool Intrinsic::isOverloaded(ID id) { - const bool OTable[] = { + static const bool OTable[] = { false, #define GET_INTRINSIC_OVERLOAD_TABLE #include "llvm/Intrinsics.gen" @@ -406,4 +413,36 @@ bool Function::hasAddressTaken(const User* *PutOffender) const { return false; } +/// callsFunctionThatReturnsTwice - Return true if the function has a call to +/// setjmp or other function that gcc recognizes as "returning twice". +/// +/// FIXME: Remove after <rdar://problem/8031714> is fixed. +/// FIXME: Is the above FIXME valid? +bool Function::callsFunctionThatReturnsTwice() const { + const Module *M = this->getParent(); + static const char *ReturnsTwiceFns[] = { + "_setjmp", + "setjmp", + "sigsetjmp", + "setjmp_syscall", + "savectx", + "qsetjmp", + "vfork", + "getcontext" + }; + + for (unsigned I = 0; I < array_lengthof(ReturnsTwiceFns); ++I) + if (const Function *Callee = M->getFunction(ReturnsTwiceFns[I])) { + if (!Callee->use_empty()) + for (Value::const_use_iterator + I = Callee->use_begin(), E = Callee->use_end(); + I != E; ++I) + if (const CallInst *CI = dyn_cast<CallInst>(*I)) + if (CI->getParent()->getParent() == this) + return true; + } + + return false; +} + // vim: sw=2 ai diff --git a/lib/VMCore/IRBuilder.cpp b/lib/VMCore/IRBuilder.cpp index 1658d79..f2d469a 100644 --- a/lib/VMCore/IRBuilder.cpp +++ b/lib/VMCore/IRBuilder.cpp @@ -23,7 +23,7 @@ using namespace llvm; /// has array of i8 type filled in with the nul terminated string value /// specified. If Name is specified, it is the name of the global variable /// created. 
-Value *IRBuilderBase::CreateGlobalString(const char *Str, const Twine &Name) { +Value *IRBuilderBase::CreateGlobalString(StringRef Str, const Twine &Name) { Constant *StrConstant = ConstantArray::get(Context, Str, true); Module &M = *BB->getParent()->getParent(); GlobalVariable *GV = new GlobalVariable(M, StrConstant->getType(), @@ -60,7 +60,6 @@ static CallInst *createCallHelper(Value *Callee, Value *const* Ops, return CI; } - CallInst *IRBuilderBase:: CreateMemSet(Value *Ptr, Value *Val, Value *Size, unsigned Align, bool isVolatile, MDNode *TBAATag) { @@ -118,3 +117,33 @@ CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align, return CI; } + +CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) { + assert(isa<PointerType>(Ptr->getType()) && + "lifetime.start only applies to pointers."); + Ptr = getCastedInt8PtrValue(Ptr); + if (!Size) + Size = getInt64(-1); + else + assert(Size->getType() == getInt64Ty() && + "lifetime.start requires the size to be an i64"); + Value *Ops[] = { Size, Ptr }; + Module *M = BB->getParent()->getParent(); + Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_start); + return createCallHelper(TheFn, Ops, 2, this); +} + +CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr, ConstantInt *Size) { + assert(isa<PointerType>(Ptr->getType()) && + "lifetime.end only applies to pointers."); + Ptr = getCastedInt8PtrValue(Ptr); + if (!Size) + Size = getInt64(-1); + else + assert(Size->getType() == getInt64Ty() && + "lifetime.end requires the size to be an i64"); + Value *Ops[] = { Size, Ptr }; + Module *M = BB->getParent()->getParent(); + Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_end); + return createCallHelper(TheFn, Ops, 2, this); +} diff --git a/lib/VMCore/InlineAsm.cpp b/lib/VMCore/InlineAsm.cpp index e4f99f0..bd3667d 100644 --- a/lib/VMCore/InlineAsm.cpp +++ b/lib/VMCore/InlineAsm.cpp @@ -181,6 +181,11 @@ bool InlineAsm::ConstraintInfo::Parse(StringRef Str, multipleAlternativeIndex++; pCodes = &multipleAlternatives[multipleAlternativeIndex].Codes; ++I; + } else if (*I == '^') { + // Multi-letter constraint + // FIXME: For now assuming these are 2-character constraints. + pCodes->push_back(std::string(I+1, I+3)); + I += 3; } else { // Single letter constraint. pCodes->push_back(std::string(I, I+1)); diff --git a/lib/VMCore/Instructions.cpp b/lib/VMCore/Instructions.cpp index 33dfcc0..8f4eabe 100644 --- a/lib/VMCore/Instructions.cpp +++ b/lib/VMCore/Instructions.cpp @@ -137,7 +137,8 @@ Value *PHINode::removeIncomingValue(unsigned Idx, bool DeletePHIIfEmpty) { /// void PHINode::growOperands() { unsigned e = getNumOperands(); - unsigned NumOps = e*3/2; + // Multiply by 1.5 and round down so the result is still even. + unsigned NumOps = e + e / 4 * 2; if (NumOps < 4) NumOps = 4; // 4 op PHI nodes are VERY common. 
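The CreateLifetimeStart/CreateLifetimeEnd helpers added above default the size to i64 -1, meaning the whole object. A sketch of bracketing an alloca's live range; the function name and the 16-byte size are illustrative, and B is assumed to be positioned inside a function:

#include "llvm/Support/IRBuilder.h"
using namespace llvm;

void markScratchLifetime(LLVMContext &C, IRBuilder<> &B) {
  // 16-byte scratch buffer; the i64 passed below must match its size.
  AllocaInst *Buf = B.CreateAlloca(Type::getInt8Ty(C), B.getInt32(16), "buf");
  B.CreateLifetimeStart(Buf, B.getInt64(16)); // object becomes live here
  // ... code that uses Buf ...
  B.CreateLifetimeEnd(Buf, B.getInt64(16));   // object is dead past here
}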
ReservedSpace = NumOps; @@ -2075,6 +2076,7 @@ unsigned CastInst::isEliminableCastPair( CastInst *CastInst::Create(Instruction::CastOps op, Value *S, const Type *Ty, const Twine &Name, Instruction *InsertBefore) { + assert(castIsValid(op, S, Ty) && "Invalid cast!"); // Construct and return the appropriate CastInst subclass switch (op) { case Trunc: return new TruncInst (S, Ty, Name, InsertBefore); @@ -2097,6 +2099,7 @@ CastInst *CastInst::Create(Instruction::CastOps op, Value *S, const Type *Ty, CastInst *CastInst::Create(Instruction::CastOps op, Value *S, const Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd) { + assert(castIsValid(op, S, Ty) && "Invalid cast!"); // Construct and return the appropriate CastInst subclass switch (op) { case Trunc: return new TruncInst (S, Ty, Name, InsertAtEnd); @@ -2253,60 +2256,56 @@ bool CastInst::isCastable(const Type *SrcTy, const Type *DestTy) { if (SrcTy == DestTy) return true; + if (const VectorType *SrcVecTy = dyn_cast<VectorType>(SrcTy)) + if (const VectorType *DestVecTy = dyn_cast<VectorType>(DestTy)) + if (SrcVecTy->getNumElements() == DestVecTy->getNumElements()) { + // An element by element cast. Valid if casting the elements is valid. + SrcTy = SrcVecTy->getElementType(); + DestTy = DestVecTy->getElementType(); + } + // Get the bit sizes, we'll need these - unsigned SrcBits = SrcTy->getScalarSizeInBits(); // 0 for ptr - unsigned DestBits = DestTy->getScalarSizeInBits(); // 0 for ptr + unsigned SrcBits = SrcTy->getPrimitiveSizeInBits(); // 0 for ptr + unsigned DestBits = DestTy->getPrimitiveSizeInBits(); // 0 for ptr // Run through the possibilities ... - if (DestTy->isIntegerTy()) { // Casting to integral - if (SrcTy->isIntegerTy()) { // Casting from integral + if (DestTy->isIntegerTy()) { // Casting to integral + if (SrcTy->isIntegerTy()) { // Casting from integral return true; - } else if (SrcTy->isFloatingPointTy()) { // Casting from floating pt + } else if (SrcTy->isFloatingPointTy()) { // Casting from floating pt return true; - } else if (const VectorType *PTy = dyn_cast<VectorType>(SrcTy)) { - // Casting from vector - return DestBits == PTy->getBitWidth(); + } else if (SrcTy->isVectorTy()) { // Casting from vector + return DestBits == SrcBits; } else { // Casting from something else return SrcTy->isPointerTy(); } - } else if (DestTy->isFloatingPointTy()) { // Casting to floating pt - if (SrcTy->isIntegerTy()) { // Casting from integral + } else if (DestTy->isFloatingPointTy()) { // Casting to floating pt + if (SrcTy->isIntegerTy()) { // Casting from integral return true; - } else if (SrcTy->isFloatingPointTy()) { // Casting from floating pt + } else if (SrcTy->isFloatingPointTy()) { // Casting from floating pt return true; - } else if (const VectorType *PTy = dyn_cast<VectorType>(SrcTy)) { - // Casting from vector - return DestBits == PTy->getBitWidth(); + } else if (SrcTy->isVectorTy()) { // Casting from vector + return DestBits == SrcBits; } else { // Casting from something else return false; } - } else if (const VectorType *DestPTy = dyn_cast<VectorType>(DestTy)) { - // Casting to vector - if (const VectorType *SrcPTy = dyn_cast<VectorType>(SrcTy)) { - // Casting from vector - return DestPTy->getBitWidth() == SrcPTy->getBitWidth(); - } else if (DestPTy->getBitWidth() == SrcBits) { - return true; // float/int -> vector - } else if (SrcTy->isX86_MMXTy()) { - return DestPTy->getBitWidth() == 64; // MMX to 64-bit vector - } else { - return false; - } + } else if (DestTy->isVectorTy()) { // Casting to vector + return DestBits == 
SrcBits; } else if (DestTy->isPointerTy()) { // Casting to pointer - if (SrcTy->isPointerTy()) { // Casting from pointer + if (SrcTy->isPointerTy()) { // Casting from pointer return true; - } else if (SrcTy->isIntegerTy()) { // Casting from integral + } else if (SrcTy->isIntegerTy()) { // Casting from integral return true; - } else { // Casting from something else + } else { // Casting from something else return false; } } else if (DestTy->isX86_MMXTy()) { - if (const VectorType *SrcPTy = dyn_cast<VectorType>(SrcTy)) { - return SrcPTy->getBitWidth() == 64; // 64-bit vector to MMX + if (SrcTy->isVectorTy()) { + return DestBits == SrcBits; // 64-bit vector to MMX } else { return false; } - } else { // Casting to something else + } else { // Casting to something else return false; } } @@ -2321,14 +2320,27 @@ bool CastInst::isCastable(const Type *SrcTy, const Type *DestTy) { Instruction::CastOps CastInst::getCastOpcode( const Value *Src, bool SrcIsSigned, const Type *DestTy, bool DestIsSigned) { - // Get the bit sizes, we'll need these const Type *SrcTy = Src->getType(); - unsigned SrcBits = SrcTy->getScalarSizeInBits(); // 0 for ptr - unsigned DestBits = DestTy->getScalarSizeInBits(); // 0 for ptr assert(SrcTy->isFirstClassType() && DestTy->isFirstClassType() && "Only first class types are castable!"); + if (SrcTy == DestTy) + return BitCast; + + if (const VectorType *SrcVecTy = dyn_cast<VectorType>(SrcTy)) + if (const VectorType *DestVecTy = dyn_cast<VectorType>(DestTy)) + if (SrcVecTy->getNumElements() == DestVecTy->getNumElements()) { + // An element by element cast. Find the appropriate opcode based on the + // element types. + SrcTy = SrcVecTy->getElementType(); + DestTy = DestVecTy->getElementType(); + } + + // Get the bit sizes, we'll need these + unsigned SrcBits = SrcTy->getPrimitiveSizeInBits(); // 0 for ptr + unsigned DestBits = DestTy->getPrimitiveSizeInBits(); // 0 for ptr + // Run through the possibilities ... 
if (DestTy->isIntegerTy()) { // Casting to integral if (SrcTy->isIntegerTy()) { // Casting from integral @@ -2347,10 +2359,9 @@ CastInst::getCastOpcode( return FPToSI; // FP -> sint else return FPToUI; // FP -> uint - } else if (const VectorType *PTy = dyn_cast<VectorType>(SrcTy)) { - assert(DestBits == PTy->getBitWidth() && - "Casting vector to integer of different width"); - PTy = NULL; + } else if (SrcTy->isVectorTy()) { + assert(DestBits == SrcBits && + "Casting vector to integer of different width"); return BitCast; // Same size, no-op cast } else { assert(SrcTy->isPointerTy() && @@ -2371,29 +2382,17 @@ CastInst::getCastOpcode( } else { return BitCast; // same size, no-op cast } - } else if (const VectorType *PTy = dyn_cast<VectorType>(SrcTy)) { - assert(DestBits == PTy->getBitWidth() && + } else if (SrcTy->isVectorTy()) { + assert(DestBits == SrcBits && "Casting vector to floating point of different width"); - PTy = NULL; return BitCast; // same size, no-op cast } else { llvm_unreachable("Casting pointer or non-first class to float"); } - } else if (const VectorType *DestPTy = dyn_cast<VectorType>(DestTy)) { - if (const VectorType *SrcPTy = dyn_cast<VectorType>(SrcTy)) { - assert(DestPTy->getBitWidth() == SrcPTy->getBitWidth() && - "Casting vector to vector of different widths"); - SrcPTy = NULL; - return BitCast; // vector -> vector - } else if (DestPTy->getBitWidth() == SrcBits) { - return BitCast; // float/int -> vector - } else if (SrcTy->isX86_MMXTy()) { - assert(DestPTy->getBitWidth()==64 && - "Casting X86_MMX to vector of wrong width"); - return BitCast; // MMX to 64-bit vector - } else { - assert(!"Illegal cast to vector (wrong type or size)"); - } + } else if (DestTy->isVectorTy()) { + assert(DestBits == SrcBits && + "Illegal cast to vector (wrong type or size)"); + return BitCast; } else if (DestTy->isPointerTy()) { if (SrcTy->isPointerTy()) { return BitCast; // ptr -> ptr @@ -2403,9 +2402,8 @@ CastInst::getCastOpcode( assert(!"Casting pointer to other than pointer or int"); } } else if (DestTy->isX86_MMXTy()) { - if (isa<VectorType>(SrcTy)) { - assert(cast<VectorType>(SrcTy)->getBitWidth() == 64 && - "Casting vector of wrong width to X86_MMX"); + if (SrcTy->isVectorTy()) { + assert(DestBits == SrcBits && "Casting vector of wrong width to X86_MMX"); return BitCast; // 64-bit vector to MMX } else { assert(!"Illegal cast to X86_MMX"); @@ -2441,46 +2439,40 @@ CastInst::castIsValid(Instruction::CastOps op, Value *S, const Type *DstTy) { unsigned SrcBitSize = SrcTy->getScalarSizeInBits(); unsigned DstBitSize = DstTy->getScalarSizeInBits(); + // If these are vector types, get the lengths of the vectors (using zero for + // scalar types means that checking that vector lengths match also checks that + // scalars are not being converted to vectors or vectors to scalars). + unsigned SrcLength = SrcTy->isVectorTy() ? + cast<VectorType>(SrcTy)->getNumElements() : 0; + unsigned DstLength = DstTy->isVectorTy() ? 
+ cast<VectorType>(DstTy)->getNumElements() : 0; + // Switch on the opcode provided switch (op) { default: return false; // This is an input error case Instruction::Trunc: - return SrcTy->isIntOrIntVectorTy() && - DstTy->isIntOrIntVectorTy()&& SrcBitSize > DstBitSize; + return SrcTy->isIntOrIntVectorTy() && DstTy->isIntOrIntVectorTy() && + SrcLength == DstLength && SrcBitSize > DstBitSize; case Instruction::ZExt: - return SrcTy->isIntOrIntVectorTy() && - DstTy->isIntOrIntVectorTy()&& SrcBitSize < DstBitSize; + return SrcTy->isIntOrIntVectorTy() && DstTy->isIntOrIntVectorTy() && + SrcLength == DstLength && SrcBitSize < DstBitSize; case Instruction::SExt: - return SrcTy->isIntOrIntVectorTy() && - DstTy->isIntOrIntVectorTy()&& SrcBitSize < DstBitSize; + return SrcTy->isIntOrIntVectorTy() && DstTy->isIntOrIntVectorTy() && + SrcLength == DstLength && SrcBitSize < DstBitSize; case Instruction::FPTrunc: - return SrcTy->isFPOrFPVectorTy() && - DstTy->isFPOrFPVectorTy() && - SrcBitSize > DstBitSize; + return SrcTy->isFPOrFPVectorTy() && DstTy->isFPOrFPVectorTy() && + SrcLength == DstLength && SrcBitSize > DstBitSize; case Instruction::FPExt: - return SrcTy->isFPOrFPVectorTy() && - DstTy->isFPOrFPVectorTy() && - SrcBitSize < DstBitSize; + return SrcTy->isFPOrFPVectorTy() && DstTy->isFPOrFPVectorTy() && + SrcLength == DstLength && SrcBitSize < DstBitSize; case Instruction::UIToFP: case Instruction::SIToFP: - if (const VectorType *SVTy = dyn_cast<VectorType>(SrcTy)) { - if (const VectorType *DVTy = dyn_cast<VectorType>(DstTy)) { - return SVTy->getElementType()->isIntOrIntVectorTy() && - DVTy->getElementType()->isFPOrFPVectorTy() && - SVTy->getNumElements() == DVTy->getNumElements(); - } - } - return SrcTy->isIntOrIntVectorTy() && DstTy->isFPOrFPVectorTy(); + return SrcTy->isIntOrIntVectorTy() && DstTy->isFPOrFPVectorTy() && + SrcLength == DstLength; case Instruction::FPToUI: case Instruction::FPToSI: - if (const VectorType *SVTy = dyn_cast<VectorType>(SrcTy)) { - if (const VectorType *DVTy = dyn_cast<VectorType>(DstTy)) { - return SVTy->getElementType()->isFPOrFPVectorTy() && - DVTy->getElementType()->isIntOrIntVectorTy() && - SVTy->getNumElements() == DVTy->getNumElements(); - } - } - return SrcTy->isFPOrFPVectorTy() && DstTy->isIntOrIntVectorTy(); + return SrcTy->isFPOrFPVectorTy() && DstTy->isIntOrIntVectorTy() && + SrcLength == DstLength; case Instruction::PtrToInt: return SrcTy->isPointerTy() && DstTy->isIntegerTy(); case Instruction::IntToPtr: diff --git a/lib/VMCore/LLVMContextImpl.h b/lib/VMCore/LLVMContextImpl.h index 23971aa..6ea4b48e 100644 --- a/lib/VMCore/LLVMContextImpl.h +++ b/lib/VMCore/LLVMContextImpl.h @@ -184,7 +184,7 @@ public: // Concrete/Abstract TypeDescriptions - We lazily calculate type descriptions // for types as they are needed. Because resolution of types must invalidate - // all of the abstract type descriptions, we keep them in a seperate map to + // all of the abstract type descriptions, we keep them in a separate map to // make this easy. 
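The net effect of the CastInst changes above: a same-length vector cast is now classified element by element, so <4 x i32> to <4 x float> resolves to SIToFP instead of tripping the old vector asserts. A hedged sketch, assuming V already has type <4 x i32>:

#include "llvm/DerivedTypes.h"
#include "llvm/InstrTypes.h"
using namespace llvm;

CastInst *intVecToFPVec(Value *V, Instruction *InsertPt) {
  LLVMContext &C = V->getType()->getContext();
  const Type *DstTy = VectorType::get(Type::getFloatTy(C), 4); // <4 x float>
  // The element types drive the choice now: signed int to fp is SIToFP.
  Instruction::CastOps Op =
      CastInst::getCastOpcode(V, /*SrcIsSigned=*/true,
                              DstTy, /*DestIsSigned=*/true);
  return CastInst::Create(Op, V, DstTy, "vconv", InsertPt);
}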
TypePrinting ConcreteTypeDescriptions; TypePrinting AbstractTypeDescriptions; diff --git a/lib/VMCore/Metadata.cpp b/lib/VMCore/Metadata.cpp index 84a0975..eb719e5 100644 --- a/lib/VMCore/Metadata.cpp +++ b/lib/VMCore/Metadata.cpp @@ -84,18 +84,18 @@ static MDNodeOperand *getOperandPtr(MDNode *N, unsigned Op) { return reinterpret_cast<MDNodeOperand*>(N+1)+Op; } -MDNode::MDNode(LLVMContext &C, Value *const *Vals, unsigned NumVals, - bool isFunctionLocal) +MDNode::MDNode(LLVMContext &C, ArrayRef<Value*> Vals, bool isFunctionLocal) : Value(Type::getMetadataTy(C), Value::MDNodeVal) { - NumOperands = NumVals; + NumOperands = Vals.size(); if (isFunctionLocal) setValueSubclassData(getSubclassDataFromValue() | FunctionLocalBit); // Initialize the operand list, which is co-allocated on the end of the node. + unsigned i = 0; for (MDNodeOperand *Op = getOperandPtr(this, 0), *E = Op+NumOperands; - Op != E; ++Op, ++Vals) - new (Op) MDNodeOperand(*Vals, this); + Op != E; ++Op, ++i) + new (Op) MDNodeOperand(Vals[i], this); } @@ -183,9 +183,8 @@ static bool isFunctionLocalValue(Value *V) { (isa<MDNode>(V) && cast<MDNode>(V)->isFunctionLocal()); } -MDNode *MDNode::getMDNode(LLVMContext &Context, Value *const *Vals, - unsigned NumVals, FunctionLocalness FL, - bool Insert) { +MDNode *MDNode::getMDNode(LLVMContext &Context, ArrayRef<Value*> Vals, + FunctionLocalness FL, bool Insert) { LLVMContextImpl *pImpl = Context.pImpl; // Add all the operand pointers. Note that we don't have to add the @@ -193,7 +192,7 @@ MDNode *MDNode::getMDNode(LLVMContext &Context, Value *const *Vals, // Note that if the operands are later nulled out, the node will be // removed from the uniquing map. FoldingSetNodeID ID; - for (unsigned i = 0; i != NumVals; ++i) + for (unsigned i = 0; i != Vals.size(); ++i) ID.AddPointer(Vals[i]); void *InsertPoint; @@ -205,7 +204,7 @@ MDNode *MDNode::getMDNode(LLVMContext &Context, Value *const *Vals, bool isFunctionLocal = false; switch (FL) { case FL_Unknown: - for (unsigned i = 0; i != NumVals; ++i) { + for (unsigned i = 0; i != Vals.size(); ++i) { Value *V = Vals[i]; if (!V) continue; if (isFunctionLocalValue(V)) { @@ -223,8 +222,8 @@ MDNode *MDNode::getMDNode(LLVMContext &Context, Value *const *Vals, } // Coallocate space for the node and Operands together, then placement new. - void *Ptr = malloc(sizeof(MDNode)+NumVals*sizeof(MDNodeOperand)); - N = new (Ptr) MDNode(Context, Vals, NumVals, isFunctionLocal); + void *Ptr = malloc(sizeof(MDNode)+Vals.size()*sizeof(MDNodeOperand)); + N = new (Ptr) MDNode(Context, Vals, isFunctionLocal); // InsertPoint will have been set by the FindNodeOrInsertPos call. pImpl->MDNodeSet.InsertNode(N, InsertPoint); @@ -233,26 +232,23 @@ MDNode *MDNode::getMDNode(LLVMContext &Context, Value *const *Vals, } MDNode *MDNode::get(LLVMContext &Context, ArrayRef<Value*> Vals) { - return getMDNode(Context, Vals.data(), Vals.size(), FL_Unknown); -} -MDNode *MDNode::get(LLVMContext &Context, Value*const* Vals, unsigned NumVals) { - return getMDNode(Context, Vals, NumVals, FL_Unknown); + return getMDNode(Context, Vals, FL_Unknown); } -MDNode *MDNode::getWhenValsUnresolved(LLVMContext &Context, Value *const *Vals, - unsigned NumVals, bool isFunctionLocal) { - return getMDNode(Context, Vals, NumVals, isFunctionLocal ? FL_Yes : FL_No); +MDNode *MDNode::getWhenValsUnresolved(LLVMContext &Context, + ArrayRef<Value*> Vals, + bool isFunctionLocal) { + return getMDNode(Context, Vals, isFunctionLocal ? 
FL_Yes : FL_No); } -MDNode *MDNode::getIfExists(LLVMContext &Context, Value *const *Vals, - unsigned NumVals) { - return getMDNode(Context, Vals, NumVals, FL_Unknown, false); +MDNode *MDNode::getIfExists(LLVMContext &Context, ArrayRef<Value*> Vals) { + return getMDNode(Context, Vals, FL_Unknown, false); } -MDNode *MDNode::getTemporary(LLVMContext &Context, Value *const *Vals, - unsigned NumVals) { - MDNode *N = (MDNode *)malloc(sizeof(MDNode)+NumVals*sizeof(MDNodeOperand)); - N = new (N) MDNode(Context, Vals, NumVals, FL_No); +MDNode *MDNode::getTemporary(LLVMContext &Context, ArrayRef<Value*> Vals) { + MDNode *N = + (MDNode *)malloc(sizeof(MDNode)+Vals.size()*sizeof(MDNodeOperand)); + N = new (N) MDNode(Context, Vals, FL_No); N->setValueSubclassData(N->getSubclassDataFromValue() | NotUniquedBit); LeakDetector::addGarbageObject(N); diff --git a/lib/VMCore/PassManager.cpp b/lib/VMCore/PassManager.cpp index ca4455a..5cf2905 100644 --- a/lib/VMCore/PassManager.cpp +++ b/lib/VMCore/PassManager.cpp @@ -449,9 +449,9 @@ namespace { static DebugInfoProbeInfo *TheDebugProbe; static void createDebugInfoProbe() { if (TheDebugProbe) return; - - // Constructed the first time this is called. This guarantees that the - // object will be constructed, if -enable-debug-info-probe is set, + + // Constructed the first time this is called. This guarantees that the + // object will be constructed, if -enable-debug-info-probe is set, // before static globals, thus it will be destroyed before them. static ManagedStatic<DebugInfoProbeInfo> DIP; TheDebugProbe = &*DIP; @@ -632,6 +632,7 @@ void PMTopLevelManager::schedulePass(Pass *P) { Pass *AnalysisPass = findAnalysisPass(*I); if (!AnalysisPass) { const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I); + assert(PI && "Expected required passes to be initialized"); AnalysisPass = PI->createPass(); if (P->getPotentialPassManagerType () == AnalysisPass->getPotentialPassManagerType()) @@ -686,6 +687,7 @@ Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) { // If Pass not found then check the interfaces implemented by Immutable Pass const PassInfo *PassInf = PassRegistry::getPassRegistry()->getPassInfo(PI); + assert(PassInf && "Expected all immutable passes to be initialized"); const std::vector<const PassInfo*> &ImmPI = PassInf->getInterfacesImplemented(); for (std::vector<const PassInfo*>::const_iterator II = ImmPI.begin(), @@ -727,9 +729,11 @@ void PMTopLevelManager::dumpArguments() const { for (SmallVector<ImmutablePass *, 8>::const_iterator I = ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I) if (const PassInfo *PI = - PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID())) + PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID())) { + assert(PI && "Expected all immutable passes to be initialized"); if (!PI->isAnalysisGroup()) dbgs() << " -" << PI->getPassArgument(); + } for (SmallVector<PMDataManager *, 8>::const_iterator I = PassManagers.begin(), E = PassManagers.end(); I != E; ++I) (*I)->dumpPassArguments(); @@ -982,7 +986,7 @@ void PMDataManager::add(Pass *P, bool ProcessAnalysis) { // Keep track of higher level analysis used by this manager. HigherLevelAnalysis.push_back(PRequired); } else - llvm_unreachable("Unable to accomodate Required Pass"); + llvm_unreachable("Unable to accommodate Required Pass"); } // Set P as P's last user until someone starts using P. 
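The PI assertions added above turn a silent null dereference into a diagnosable failure when a required pass was never registered. Standalone tools avoid it by running the initializers before building a pass manager; a minimal sketch, with the verifier standing in for any required pass:

#include "llvm/Analysis/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/PassRegistry.h"
using namespace llvm;

void runVerifierOn(Module &M) {
  PassRegistry &Registry = *PassRegistry::getPassRegistry();
  initializeCore(Registry);     // register core passes up front
  initializeAnalysis(Registry); // required analyses resolve via the registry
  PassManager PM;
  PM.add(createVerifierPass());
  PM.run(M);
}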
@@ -1183,6 +1187,12 @@ void PMDataManager::dumpAnalysisUsage(StringRef Msg, const Pass *P, for (unsigned i = 0; i != Set.size(); ++i) { if (i) dbgs() << ','; const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(Set[i]); + if (!PInf) { + // Some preserved passes, such as AliasAnalysis, may not be initialized by + // all drivers. + dbgs() << " Uninitialized Pass"; + continue; + } dbgs() << ' ' << PInf->getPassName(); } dbgs() << '\n'; diff --git a/lib/VMCore/PassRegistry.cpp b/lib/VMCore/PassRegistry.cpp index c97a170..fa92620 100644 --- a/lib/VMCore/PassRegistry.cpp +++ b/lib/VMCore/PassRegistry.cpp @@ -26,7 +26,7 @@ using namespace llvm; // FIXME: We use ManagedStatic to erase the pass registrar on shutdown. // Unfortunately, passes are registered with static ctors, and having -// llvm_shutdown clear this map prevents successful ressurection after +// llvm_shutdown clear this map prevents successful resurrection after // llvm_shutdown is run. Ideally we should find a solution so that we don't // leak the map, AND can still resurrect after shutdown. static ManagedStatic<PassRegistry> PassRegistryObj; diff --git a/lib/VMCore/Type.cpp b/lib/VMCore/Type.cpp index b15304c..566bb28 100644 --- a/lib/VMCore/Type.cpp +++ b/lib/VMCore/Type.cpp @@ -12,23 +12,7 @@ //===----------------------------------------------------------------------===// #include "LLVMContextImpl.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Constants.h" -#include "llvm/Assembly/Writer.h" -#include "llvm/LLVMContext.h" -#include "llvm/Metadata.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/SCCIterator.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Support/Compiler.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/ManagedStatic.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/Threading.h" #include <algorithm> #include <cstdarg> using namespace llvm; @@ -87,7 +71,9 @@ void Type::destroy() const { operator delete(const_cast<Type *>(this)); return; - } else if (const OpaqueType *opaque_this = dyn_cast<OpaqueType>(this)) { + } + + if (const OpaqueType *opaque_this = dyn_cast<OpaqueType>(this)) { LLVMContextImpl *pImpl = this->getContext().pImpl; pImpl->OpaqueTypes.erase(opaque_this); } @@ -116,15 +102,6 @@ const Type *Type::getPrimitiveType(LLVMContext &C, TypeID IDNumber) { } } -const Type *Type::getVAArgsPromotedType(LLVMContext &C) const { - if (ID == IntegerTyID && getSubclassData() < 32) - return Type::getInt32Ty(C); - else if (ID == FloatTyID) - return Type::getDoubleTy(C); - else - return this; -} - /// getScalarType - If this is a vector type, return the element type, /// otherwise return this. 
const Type *Type::getScalarType() const { @@ -197,6 +174,25 @@ bool Type::canLosslesslyBitCastTo(const Type *Ty) const { return false; // Other types have no identity values } +bool Type::isEmptyTy() const { + const ArrayType *ATy = dyn_cast<ArrayType>(this); + if (ATy) { + unsigned NumElements = ATy->getNumElements(); + return NumElements == 0 || ATy->getElementType()->isEmptyTy(); + } + + const StructType *STy = dyn_cast<StructType>(this); + if (STy) { + unsigned NumElements = STy->getNumElements(); + for (unsigned i = 0; i < NumElements; ++i) + if (!STy->getElementType(i)->isEmptyTy()) + return false; + return true; + } + + return false; +} + unsigned Type::getPrimitiveSizeInBits() const { switch (getTypeID()) { case Type::FloatTyID: return 32; @@ -243,8 +239,8 @@ bool Type::isSizedDerivedType() const { if (const ArrayType *ATy = dyn_cast<ArrayType>(this)) return ATy->getElementType()->isSized(); - if (const VectorType *PTy = dyn_cast<VectorType>(this)) - return PTy->getElementType()->isSized(); + if (const VectorType *VTy = dyn_cast<VectorType>(this)) + return VTy->getElementType()->isSized(); if (!this->isStructTy()) return false; @@ -463,11 +459,11 @@ bool FunctionType::isValidArgumentType(const Type *ArgTy) { FunctionType::FunctionType(const Type *Result, ArrayRef<const Type*> Params, bool IsVarArgs) - : DerivedType(Result->getContext(), FunctionTyID), isVarArgs(IsVarArgs) { + : DerivedType(Result->getContext(), FunctionTyID) { ContainedTys = reinterpret_cast<PATypeHandle*>(this+1); NumContainedTys = Params.size() + 1; // + 1 for result type assert(isValidReturnType(Result) && "invalid return type for function"); - + setSubclassData(IsVarArgs); bool isAbstract = Result->isAbstract(); new (&ContainedTys[0]) PATypeHandle(Result, this); @@ -523,7 +519,7 @@ VectorType::VectorType(const Type *ElType, unsigned NumEl) PointerType::PointerType(const Type *E, unsigned AddrSpace) : SequentialType(PointerTyID, E) { - AddressSpace = AddrSpace; + setSubclassData(AddrSpace); // Calculate whether or not this type is abstract setAbstract(E->isAbstract()); } @@ -836,6 +832,9 @@ FunctionValType FunctionValType::get(const FunctionType *FT) { return FunctionValType(FT->getReturnType(), ParamTypes, FT->isVarArg()); } +FunctionType *FunctionType::get(const Type *Result, bool isVarArg) { + return get(Result, ArrayRef<const Type *>(), isVarArg); +} // FunctionType::get - The factory function for the FunctionType class... FunctionType *FunctionType::get(const Type *ReturnType, @@ -912,9 +911,14 @@ bool VectorType::isValidElementType(const Type *ElemTy) { } //===----------------------------------------------------------------------===// -// Struct Type Factory... +// Struct Type Factory. // +StructType *StructType::get(LLVMContext &Context, bool isPacked) { + return get(Context, llvm::ArrayRef<const Type*>(), isPacked); +} + + StructType *StructType::get(LLVMContext &Context, ArrayRef<const Type*> ETypes, bool isPacked) { diff --git a/lib/VMCore/TypesContext.h b/lib/VMCore/TypesContext.h index 6fb53be..ad09478 100644 --- a/lib/VMCore/TypesContext.h +++ b/lib/VMCore/TypesContext.h @@ -370,7 +370,7 @@ public: // Remove the old entry form TypesByHash. If the hash values differ // now, remove it from the old place. Otherwise, continue scanning - // withing this hashcode to reduce work. + // within this hashcode to reduce work. 
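Type::isEmptyTy above recurses through arrays and structs, so a zero-length array, an empty struct, and any nesting of the two all report empty. A quick illustration, assuming the C++ API of this revision:

#include "llvm/DerivedTypes.h"
#include "llvm/LLVMContext.h"
#include <cassert>
using namespace llvm;

void emptyTypeExamples(LLVMContext &C) {
  assert(StructType::get(C, false)->isEmptyTy());              // {} is empty
  assert(ArrayType::get(Type::getInt32Ty(C), 0)->isEmptyTy()); // [0 x i32]
  assert(!Type::getInt32Ty(C)->isEmptyTy());                   // i32 is not
}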
if (NewTypeHash != OldTypeHash) { RemoveFromTypesByHash(OldTypeHash, Ty); } else { diff --git a/lib/VMCore/Verifier.cpp b/lib/VMCore/Verifier.cpp index 8b89110..139e035 100644 --- a/lib/VMCore/Verifier.cpp +++ b/lib/VMCore/Verifier.cpp @@ -1645,6 +1645,9 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) { Assert1(isa<ConstantInt>(CI.getArgOperand(3)), "alignment argument of memory intrinsics must be a constant int", &CI); + Assert1(isa<ConstantInt>(CI.getArgOperand(4)), + "isvolatile argument of memory intrinsics must be a constant int", + &CI); break; case Intrinsic::gcroot: case Intrinsic::gcwrite:
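The new Assert1 covers the five-operand memory intrinsics, whose trailing isvolatile flag must be a constant just like the alignment. Code built through IRBuilder satisfies this automatically, since the flag is emitted as a constant i1; a sketch:

#include "llvm/Support/IRBuilder.h"
using namespace llvm;

void zeroScratchBuffer(IRBuilder<> &B, Value *Ptr) {
  // memset(Ptr, 0, 64 bytes) at 16-byte alignment; isVolatile becomes a
  // ConstantInt i1 false operand, which is what the verifier now checks.
  B.CreateMemSet(Ptr, B.getInt8(0), B.getInt64(64), 16, /*isVolatile=*/false);
}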
