Diffstat (limited to 'lib')
747 files changed, 60627 insertions, 28055 deletions
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp index 95c834b..3b6aab1 100644 --- a/lib/Analysis/AliasAnalysis.cpp +++ b/lib/Analysis/AliasAnalysis.cpp @@ -25,6 +25,9 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Pass.h" #include "llvm/BasicBlock.h" #include "llvm/Function.h" @@ -356,6 +359,86 @@ AliasAnalysis::getModRefInfo(const AtomicRMWInst *RMW, const Location &Loc) { return ModRef; } +namespace { + /// Only find pointer captures which happen before the given instruction. Uses + /// the dominator tree to determine whether one instruction is before another. + struct CapturesBefore : public CaptureTracker { + CapturesBefore(const Instruction *I, DominatorTree *DT) + : BeforeHere(I), DT(DT), Captured(false) {} + + void tooManyUses() { Captured = true; } + + bool shouldExplore(Use *U) { + Instruction *I = cast<Instruction>(U->getUser()); + BasicBlock *BB = I->getParent(); + if (BeforeHere != I && + (!DT->isReachableFromEntry(BB) || DT->dominates(BeforeHere, I))) + return false; + return true; + } + + bool captured(Use *U) { + Instruction *I = cast<Instruction>(U->getUser()); + BasicBlock *BB = I->getParent(); + if (BeforeHere != I && + (!DT->isReachableFromEntry(BB) || DT->dominates(BeforeHere, I))) + return false; + Captured = true; + return true; + } + + const Instruction *BeforeHere; + DominatorTree *DT; + + bool Captured; + }; +} + +// FIXME: this is really just shoring-up a deficiency in alias analysis. +// BasicAA isn't willing to spend linear time determining whether an alloca +// was captured before or after this particular call, while we are. However, +// with a smarter AA in place, this test is just wasting compile time. +AliasAnalysis::ModRefResult +AliasAnalysis::callCapturesBefore(const Instruction *I, + const AliasAnalysis::Location &MemLoc, + DominatorTree *DT) { + if (!DT || !TD) return AliasAnalysis::ModRef; + + const Value *Object = GetUnderlyingObject(MemLoc.Ptr, TD); + if (!isIdentifiedObject(Object) || isa<GlobalValue>(Object) || + isa<Constant>(Object)) + return AliasAnalysis::ModRef; + + ImmutableCallSite CS(I); + if (!CS.getInstruction() || CS.getInstruction() == Object) + return AliasAnalysis::ModRef; + + CapturesBefore CB(I, DT); + llvm::PointerMayBeCaptured(Object, &CB); + if (CB.Captured) + return AliasAnalysis::ModRef; + + unsigned ArgNo = 0; + for (ImmutableCallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end(); + CI != CE; ++CI, ++ArgNo) { + // Only look at the no-capture or byval pointer arguments. If this + // pointer were passed to arguments that were neither of these, then it + // couldn't be no-capture. + if (!(*CI)->getType()->isPointerTy() || + (!CS.doesNotCapture(ArgNo) && !CS.isByValArgument(ArgNo))) + continue; + + // If this is a no-capture pointer argument, see if we can tell that it + // is impossible to alias the pointer we're checking. If not, we have to + // assume that the call could touch the pointer, even though it doesn't + // escape. 
+ if (!isNoAlias(AliasAnalysis::Location(*CI), + AliasAnalysis::Location(Object))) { + return AliasAnalysis::ModRef; + } + } + return AliasAnalysis::NoModRef; +} // AliasAnalysis destructor: DO NOT move this to the header file for // AliasAnalysis or else clients of the AliasAnalysis class may not depend on diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp index f80e2fb..92e8906 100644 --- a/lib/Analysis/AliasSetTracker.cpp +++ b/lib/Analysis/AliasSetTracker.cpp @@ -501,7 +501,7 @@ void AliasSetTracker::deleteValue(Value *PtrVal) { } // First, look up the PointerRec for this pointer. - PointerMapType::iterator I = PointerMap.find(PtrVal); + PointerMapType::iterator I = PointerMap.find_as(PtrVal); if (I == PointerMap.end()) return; // Noop // If we found one, remove the pointer from the alias set it is in. @@ -527,7 +527,7 @@ void AliasSetTracker::copyValue(Value *From, Value *To) { AA.copyValue(From, To); // First, look up the PointerRec for this pointer. - PointerMapType::iterator I = PointerMap.find(From); + PointerMapType::iterator I = PointerMap.find_as(From); if (I == PointerMap.end()) return; // Noop assert(I->second->hasAliasSet() && "Dead entry?"); @@ -536,7 +536,7 @@ void AliasSetTracker::copyValue(Value *From, Value *To) { if (Entry.hasAliasSet()) return; // Already in the tracker! // Add it to the alias set it aliases... - I = PointerMap.find(From); + I = PointerMap.find_as(From); AliasSet *AS = I->second->getAliasSet(*this); AS->addPointer(*this, Entry, I->second->getSize(), I->second->getTBAAInfo(), diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp index 20ecfd2..1d028c2 100644 --- a/lib/Analysis/BasicAliasAnalysis.cpp +++ b/lib/Analysis/BasicAliasAnalysis.cpp @@ -86,47 +86,10 @@ static bool isEscapeSource(const Value *V) { /// UnknownSize if unknown. static uint64_t getObjectSize(const Value *V, const TargetData &TD, bool RoundToAlign = false) { - Type *AccessTy; - unsigned Align; - if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) { - if (!GV->hasDefinitiveInitializer()) - return AliasAnalysis::UnknownSize; - AccessTy = GV->getType()->getElementType(); - Align = GV->getAlignment(); - } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) { - if (!AI->isArrayAllocation()) - AccessTy = AI->getType()->getElementType(); - else - return AliasAnalysis::UnknownSize; - Align = AI->getAlignment(); - } else if (const CallInst* CI = extractMallocCall(V)) { - if (!RoundToAlign && !isArrayMalloc(V, &TD)) - // The size is the argument to the malloc call. - if (const ConstantInt* C = dyn_cast<ConstantInt>(CI->getArgOperand(0))) - return C->getZExtValue(); - return AliasAnalysis::UnknownSize; - } else if (const Argument *A = dyn_cast<Argument>(V)) { - if (A->hasByValAttr()) { - AccessTy = cast<PointerType>(A->getType())->getElementType(); - Align = A->getParamAlignment(); - } else { - return AliasAnalysis::UnknownSize; - } - } else { - return AliasAnalysis::UnknownSize; - } - - if (!AccessTy->isSized()) - return AliasAnalysis::UnknownSize; - - uint64_t Size = TD.getTypeAllocSize(AccessTy); - // If there is an explicitly specified alignment, and we need to - // take alignment into account, round up the size. (If the alignment - // is implicit, getTypeAllocSize is sufficient.) 
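// Illustrative sketch, not part of the patch: the CaptureTracker callback
// protocol that CapturesBefore (above) plugs into. PointerMayBeCaptured()
// walks the uses of the pointer; shouldExplore() prunes uses that can only
// run after the query point, and captured() records any remaining capture.
// Object, I, and DT are assumed to be in scope as in callCapturesBefore().
CapturesBefore Tracker(I, DT);
llvm::PointerMayBeCaptured(Object, &Tracker);
if (Tracker.Captured) {
  // Some use reachable before I may let the pointer escape; the query
  // must stay conservative and answer ModRef.
}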
- if (RoundToAlign && Align) - Size = RoundUpToAlignment(Size, Align); - - return Size; + uint64_t Size; + if (getObjectSize(V, Size, &TD, RoundToAlign)) + return Size; + return AliasAnalysis::UnknownSize; } /// isObjectSmallerThan - Return true if we can prove that the object specified diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt index 2e3ec8b..96e68b4 100644 --- a/lib/Analysis/CMakeLists.txt +++ b/lib/Analysis/CMakeLists.txt @@ -12,9 +12,7 @@ add_llvm_library(LLVMAnalysis CaptureTracking.cpp CodeMetrics.cpp ConstantFolding.cpp - DIBuilder.cpp DbgInfoPrinter.cpp - DebugInfo.cpp DomPrinter.cpp DominanceFrontier.cpp IVUsers.cpp @@ -59,4 +57,6 @@ add_llvm_library(LLVMAnalysis ValueTracking.cpp ) +add_dependencies(LLVMAnalysis intrinsics_gen) + add_subdirectory(IPA) diff --git a/lib/Analysis/CaptureTracking.cpp b/lib/Analysis/CaptureTracking.cpp index dd33eeb..974b906 100644 --- a/lib/Analysis/CaptureTracking.cpp +++ b/lib/Analysis/CaptureTracking.cpp @@ -34,7 +34,7 @@ namespace { bool captured(Use *U) { if (isa<ReturnInst>(U->getUser()) && !ReturnCaptures) - return false; + return false; Captured = true; return true; diff --git a/lib/Analysis/CodeMetrics.cpp b/lib/Analysis/CodeMetrics.cpp index 316e7bc..acda34b 100644 --- a/lib/Analysis/CodeMetrics.cpp +++ b/lib/Analysis/CodeMetrics.cpp @@ -22,7 +22,11 @@ using namespace llvm; /// callIsSmall - If a call is likely to lower to a single target instruction, /// or is otherwise deemed small return true. /// TODO: Perhaps calls like memcpy, strcpy, etc? -bool llvm::callIsSmall(const Function *F) { +bool llvm::callIsSmall(ImmutableCallSite CS) { + if (isa<IntrinsicInst>(CS.getInstruction())) + return true; + + const Function *F = CS.getCalledFunction(); if (!F) return false; if (F->hasLocalLinkage()) return false; @@ -79,8 +83,24 @@ bool llvm::isInstructionFree(const Instruction *I, const TargetData *TD) { if (const CastInst *CI = dyn_cast<CastInst>(I)) { // Noop casts, including ptr <-> int, don't count. - if (CI->isLosslessCast() || isa<IntToPtrInst>(CI) || isa<PtrToIntInst>(CI)) + if (CI->isLosslessCast()) + return true; + + Value *Op = CI->getOperand(0); + // An inttoptr cast is free so long as the input is a legal integer type + // which doesn't contain values outside the range of a pointer. + if (isa<IntToPtrInst>(CI) && TD && + TD->isLegalInteger(Op->getType()->getScalarSizeInBits()) && + Op->getType()->getScalarSizeInBits() <= TD->getPointerSizeInBits()) return true; + + // A ptrtoint cast is free so long as the result is large enough to store + // the pointer, and a legal integer type. + if (isa<PtrToIntInst>(CI) && TD && + TD->isLegalInteger(Op->getType()->getScalarSizeInBits()) && + Op->getType()->getScalarSizeInBits() >= TD->getPointerSizeInBits()) + return true; + // trunc to a native type is free (assuming the target has compare and // shift-right of the same width). if (TD && isa<TruncInst>(CI) && @@ -126,7 +146,7 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB, isRecursive = true; } - if (!isa<IntrinsicInst>(II) && !callIsSmall(CS.getCalledFunction())) { + if (!callIsSmall(CS)) { // Each argument to a call takes on average one instruction to set up. 
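// Illustrative sketch, not part of the patch: BasicAA's getObjectSize()
// wrapper above now defers to the llvm::getObjectSize() entry point added
// in MemoryBuiltins.cpp later in this diff. A typical caller checks the
// boolean result and falls back to UnknownSize (V and TD in scope):
uint64_t Size;
if (!getObjectSize(V, Size, &TD, /*RoundToAlign=*/true))
  Size = AliasAnalysis::UnknownSize;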
NumInsts += CS.arg_size(); diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp index 7a0a4e1..7ced848 100644 --- a/lib/Analysis/ConstantFolding.cpp +++ b/lib/Analysis/ConstantFolding.cpp @@ -681,6 +681,7 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops, // This makes it easy to determine if the getelementptr is "inbounds". // Also, this helps GlobalOpt do SROA on GlobalVariables. Type *Ty = Ptr->getType(); + assert(Ty->isPointerTy() && "Forming regular GEP of non-pointer type"); SmallVector<Constant*, 32> NewIdxs; do { if (SequentialType *ATy = dyn_cast<SequentialType>(Ty)) { @@ -711,10 +712,17 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops, } Ty = ATy->getElementType(); } else if (StructType *STy = dyn_cast<StructType>(Ty)) { - // Determine which field of the struct the offset points into. The - // getZExtValue is at least as safe as the StructLayout API because we - // know the offset is within the struct at this point. + // If we end up with an offset that isn't valid for this struct type, we + // can't re-form this GEP in a regular form, so bail out. The pointer + // operand likely went through casts that are necessary to make the GEP + // sensible. const StructLayout &SL = *TD->getStructLayout(STy); + if (Offset.uge(SL.getSizeInBytes())) + break; + + // Determine which field of the struct the offset points into. The + // getZExtValue is fine as we've already ensured that the offset is + // within the range representable by the StructLayout API. unsigned ElIdx = SL.getElementContainingOffset(Offset.getZExtValue()); NewIdxs.push_back(ConstantInt::get(Type::getInt32Ty(Ty->getContext()), ElIdx)); @@ -772,14 +780,21 @@ Constant *llvm::ConstantFoldInstruction(Instruction *I, // all operands are constants. if (isa<UndefValue>(Incoming)) continue; - // If the incoming value is not a constant, or is a different constant to - // the one we saw previously, then give up. + // If the incoming value is not a constant, then give up. Constant *C = dyn_cast<Constant>(Incoming); - if (!C || (CommonValue && C != CommonValue)) + if (!C) + return 0; + // Fold the PHI's operands. + if (ConstantExpr *NewC = dyn_cast<ConstantExpr>(C)) + C = ConstantFoldConstantExpression(NewC, TD, TLI); + // If the incoming value is a different constant to + // the one we saw previously, then give up. + if (CommonValue && C != CommonValue) return 0; CommonValue = C; } + // If we reach here, all incoming values are the same constant or undef. return CommonValue ? CommonValue : UndefValue::get(PN->getType()); } @@ -787,12 +802,18 @@ Constant *llvm::ConstantFoldInstruction(Instruction *I, // Scan the operand list, checking to see if they are all constants, if so, // hand off to ConstantFoldInstOperands. SmallVector<Constant*, 8> Ops; - for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) - if (Constant *Op = dyn_cast<Constant>(*i)) - Ops.push_back(Op); - else + for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) { + Constant *Op = dyn_cast<Constant>(*i); + if (!Op) return 0; // All operands not constant! + // Fold the Instruction's operands. 
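// Sketch of the strengthened PHI rule in ConstantFoldInstruction above,
// written as a hypothetical standalone helper: undef incoming values are
// ignored, and every other incoming value must now fold (including
// through ConstantFoldConstantExpression) to one common constant.
static Constant *foldPHIToCommonConstant(PHINode *PN, const TargetData *TD,
                                         const TargetLibraryInfo *TLI) {
  Constant *CommonValue = 0;
  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
    Value *Incoming = PN->getIncomingValue(i);
    if (isa<UndefValue>(Incoming))
      continue;                         // undef is compatible with anything
    Constant *C = dyn_cast<Constant>(Incoming);
    if (!C)
      return 0;                         // a non-constant operand: give up
    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
      C = ConstantFoldConstantExpression(CE, TD, TLI);
    if (CommonValue && C != CommonValue)
      return 0;                         // two distinct constants: give up
    CommonValue = C;
  }
  // All incoming values were the same constant, or all were undef.
  return CommonValue ? CommonValue : UndefValue::get(PN->getType());
}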
+ if (ConstantExpr *NewCE = dyn_cast<ConstantExpr>(Op)) + Op = ConstantFoldConstantExpression(NewCE, TD, TLI); + + Ops.push_back(Op); + } + if (const CmpInst *CI = dyn_cast<CmpInst>(I)) return ConstantFoldCompareInstOperands(CI->getPredicate(), Ops[0], Ops[1], TD, TLI); diff --git a/lib/Analysis/DbgInfoPrinter.cpp b/lib/Analysis/DbgInfoPrinter.cpp index cd832ab..41cd34c 100644 --- a/lib/Analysis/DbgInfoPrinter.cpp +++ b/lib/Analysis/DbgInfoPrinter.cpp @@ -16,14 +16,14 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Pass.h" +#include "llvm/DebugInfo.h" #include "llvm/Function.h" #include "llvm/IntrinsicInst.h" #include "llvm/Metadata.h" #include "llvm/Module.h" -#include "llvm/Assembly/Writer.h" -#include "llvm/Analysis/DebugInfo.h" +#include "llvm/Pass.h" #include "llvm/Analysis/Passes.h" +#include "llvm/Assembly/Writer.h" #include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" diff --git a/lib/Analysis/IPA/CMakeLists.txt b/lib/Analysis/IPA/CMakeLists.txt index 8ffef29..34d6d1b 100644 --- a/lib/Analysis/IPA/CMakeLists.txt +++ b/lib/Analysis/IPA/CMakeLists.txt @@ -5,3 +5,5 @@ add_llvm_library(LLVMipa GlobalsModRef.cpp IPA.cpp ) + +add_dependencies(LLVMipa intrinsics_gen) diff --git a/lib/Analysis/IPA/CallGraphSCCPass.cpp b/lib/Analysis/IPA/CallGraphSCCPass.cpp index 963da75..449b7ee 100644 --- a/lib/Analysis/IPA/CallGraphSCCPass.cpp +++ b/lib/Analysis/IPA/CallGraphSCCPass.cpp @@ -246,7 +246,9 @@ bool CGPassManager::RefreshCallGraph(CallGraphSCC &CurSCC, for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { CallSite CS(cast<Value>(I)); - if (!CS || isa<IntrinsicInst>(I)) continue; + if (!CS) continue; + Function *Callee = CS.getCalledFunction(); + if (Callee && Callee->isIntrinsic()) continue; // If this call site already existed in the callgraph, just verify it // matches up to expectations and remove it from CallSites. diff --git a/lib/Analysis/IPA/GlobalsModRef.cpp b/lib/Analysis/IPA/GlobalsModRef.cpp index c1d8e3e..22f6e96 100644 --- a/lib/Analysis/IPA/GlobalsModRef.cpp +++ b/lib/Analysis/IPA/GlobalsModRef.cpp @@ -329,15 +329,8 @@ bool GlobalsModRef::AnalyzeIndirectGlobalMemory(GlobalValue *GV) { // Check the value being stored. Value *Ptr = GetUnderlyingObject(SI->getOperand(0)); - if (isMalloc(Ptr)) { - // Okay, easy case. - } else if (CallInst *CI = dyn_cast<CallInst>(Ptr)) { - Function *F = CI->getCalledFunction(); - if (!F || !F->isDeclaration()) return false; // Too hard to analyze. - if (F->getName() != "calloc") return false; // Not calloc. - } else { + if (!isAllocLikeFn(Ptr)) return false; // Too hard to analyze. - } // Analyze all uses of the allocation. If any of them are used in a // non-simple way (e.g. stored to another global) bail out. @@ -454,19 +447,18 @@ void GlobalsModRef::AnalyzeCallGraph(CallGraph &CG, Module &M) { for (inst_iterator II = inst_begin(SCC[i]->getFunction()), E = inst_end(SCC[i]->getFunction()); II != E && FunctionEffect != ModRef; ++II) - if (isa<LoadInst>(*II)) { + if (LoadInst *LI = dyn_cast<LoadInst>(&*II)) { FunctionEffect |= Ref; - if (cast<LoadInst>(*II).isVolatile()) + if (LI->isVolatile()) // Volatile loads may have side-effects, so mark them as writing // memory (for example, a flag inside the processor). 
FunctionEffect |= Mod; - } else if (isa<StoreInst>(*II)) { + } else if (StoreInst *SI = dyn_cast<StoreInst>(&*II)) { FunctionEffect |= Mod; - if (cast<StoreInst>(*II).isVolatile()) + if (SI->isVolatile()) // Treat volatile stores as reading memory somewhere. FunctionEffect |= Ref; - } else if (isMalloc(&cast<Instruction>(*II)) || - isFreeCall(&cast<Instruction>(*II))) { + } else if (isAllocationFn(&*II) || isFreeCall(&*II)) { FunctionEffect |= ModRef; } else if (IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(&*II)) { // The callgraph doesn't include intrinsic calls. diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp index b80966b..0a6682a 100644 --- a/lib/Analysis/IVUsers.cpp +++ b/lib/Analysis/IVUsers.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Target/TargetData.h" #include "llvm/Assembly/Writer.h" #include "llvm/ADT/STLExtras.h" @@ -120,6 +121,12 @@ bool IVUsers::AddUsersImpl(Instruction *I, if (!SE->isSCEVable(I->getType())) return false; // Void and FP expressions cannot be reduced. + // IVUsers is used by LSR which assumes that all SCEV expressions are safe to + // pass to SCEVExpander. Expressions are not safe to expand if they represent + // operations that are not safe to speculate, namely integer division. + if (!isa<PHINode>(I) && !isSafeToSpeculativelyExecute(I, TD)) + return false; + // LSR is not APInt clean, do not touch integers bigger than 64-bits. // Also avoid creating IVs of non-native types. For example, we don't want a // 64-bit IV in 32-bit code just because the loop has one 64-bit cast. diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index 3e3d2ab..a6bf4a8 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -178,7 +178,7 @@ bool CallAnalyzer::lookupSROAArgAndCost( /// \brief Disable SROA for the candidate marked by this cost iterator. /// -/// This markes the candidate as no longer viable for SROA, and adds the cost +/// This marks the candidate as no longer viable for SROA, and adds the cost /// savings associated with it back into the inline cost measurement. void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) { // If we're no longer able to perform SROA we need to undo its cost savings @@ -398,10 +398,7 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) { if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) SROAArgValues[&I] = SROAArg; - // A ptrtoint cast is free so long as the result is large enough to store the - // pointer, and a legal integer type. - return TD && TD->isLegalInteger(IntegerSize) && - IntegerSize >= TD->getPointerSizeInBits(); + return isInstructionFree(&I, TD); } bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) { @@ -428,10 +425,7 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) { if (lookupSROAArgAndCost(Op, SROAArg, CostIt)) SROAArgValues[&I] = SROAArg; - // An inttoptr cast is free so long as the input is a legal integer type - // which doesn't contain values outside the range of a pointer. - return TD && TD->isLegalInteger(IntegerSize) && - IntegerSize <= TD->getPointerSizeInBits(); + return isInstructionFree(&I, TD); } bool CallAnalyzer::visitCastInst(CastInst &I) { @@ -445,24 +439,7 @@ bool CallAnalyzer::visitCastInst(CastInst &I) { // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere. disableSROA(I.getOperand(0)); - // No-op casts don't have any cost. 
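// Illustrative sketch, not part of the patch: why the new IVUsers guard
// above matters. If a loop computes
//   %q = udiv i64 %n, %d
// and LSR hands %q's SCEV to SCEVExpander, the expanded division could
// execute on a path where %d == 0. The guard keeps any instruction that
// is not a PHI and not provably safe to speculate out of the IV-user set:
if (!isa<PHINode>(I) && !isSafeToSpeculativelyExecute(I, TD))
  return false; // e.g. udiv/sdiv/urem/srem with a non-constant divisor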
- if (I.isLosslessCast()) - return true; - - // trunc to a native type is free (assuming the target has compare and - // shift-right of the same width). - if (TD && isa<TruncInst>(I) && - TD->isLegalInteger(TD->getTypeSizeInBits(I.getType()))) - return true; - - // Result of a cmp instruction is often extended (to be used by other - // cmp instructions, logical or return instructions). These are usually - // no-ops on most sane targets. - if (isa<CmpInst>(I.getOperand(0))) - return true; - - // Assume the rest of the casts require work. - return false; + return isInstructionFree(&I, TD); } bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) { @@ -636,21 +613,11 @@ bool CallAnalyzer::visitCallSite(CallSite CS) { default: return Base::visitCallSite(CS); - case Intrinsic::dbg_declare: - case Intrinsic::dbg_value: - case Intrinsic::invariant_start: - case Intrinsic::invariant_end: - case Intrinsic::lifetime_start: - case Intrinsic::lifetime_end: case Intrinsic::memset: case Intrinsic::memcpy: case Intrinsic::memmove: - case Intrinsic::objectsize: - case Intrinsic::ptr_annotation: - case Intrinsic::var_annotation: - // SROA can usually chew through these intrinsics and they have no cost - // so don't pay the price of analyzing them in detail. - return true; + // SROA can usually chew through these intrinsics, but they aren't free. + return false; } } @@ -662,7 +629,7 @@ bool CallAnalyzer::visitCallSite(CallSite CS) { return false; } - if (!callIsSmall(F)) { + if (!callIsSmall(CS)) { // We account for the average 1 instruction per call argument setup // here. Cost += CS.arg_size() * InlineConstants::InstrCost; @@ -706,6 +673,11 @@ bool CallAnalyzer::visitCallSite(CallSite CS) { } bool CallAnalyzer::visitInstruction(Instruction &I) { + // Some instructions are free. All of the free intrinsics can also be + // handled by SROA, etc. + if (isInstructionFree(&I, TD)) + return true; + // We found something we don't understand or can't handle. Mark any SROA-able // values in the operand list as no longer viable. 
for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI) diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index 16e7a72..16a9a04 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -47,7 +47,7 @@ struct Query { const DominatorTree *DT; Query(const TargetData *td, const TargetLibraryInfo *tli, - const DominatorTree *dt) : TD(td), TLI(tli), DT(dt) {}; + const DominatorTree *dt) : TD(td), TLI(tli), DT(dt) {} }; static Value *SimplifyAndInst(Value *, Value *, const Query &, unsigned); diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp index 5ca2746..9140786 100644 --- a/lib/Analysis/LazyValueInfo.cpp +++ b/lib/Analysis/LazyValueInfo.cpp @@ -172,7 +172,7 @@ public: if (NewR.isEmptySet()) return markOverdefined(); - bool changed = Range == NewR; + bool changed = Range != NewR; Range = NewR; return changed; } @@ -457,8 +457,10 @@ void LazyValueInfoCache::eraseBlock(BasicBlock *BB) { void LazyValueInfoCache::solve() { while (!BlockValueStack.empty()) { std::pair<BasicBlock*, Value*> &e = BlockValueStack.top(); - if (solveBlockValue(e.second, e.first)) + if (solveBlockValue(e.second, e.first)) { + assert(BlockValueStack.top() == e); BlockValueStack.pop(); + } } } @@ -766,15 +768,10 @@ bool LazyValueInfoCache::solveBlockValueConstantRange(LVILatticeVal &BBLV, return true; } -/// getEdgeValue - This method attempts to infer more complex -bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom, - BasicBlock *BBTo, LVILatticeVal &Result) { - // If already a constant, there is nothing to compute. - if (Constant *VC = dyn_cast<Constant>(Val)) { - Result = LVILatticeVal::get(VC); - return true; - } - +/// \brief Compute the value of Val on the edge BBFrom -> BBTo. Returns false if +/// Val is not constrained on the edge. +static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom, + BasicBlock *BBTo, LVILatticeVal &Result) { // TODO: Handle more complex conditionals. If (v == 0 || v2 < 1) is false, we // know that v != 0. if (BranchInst *BI = dyn_cast<BranchInst>(BBFrom->getTerminator())) { @@ -818,7 +815,7 @@ bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom, ConstantInt *CI = dyn_cast<ConstantInt>(ICI->getOperand(1)); if (CI && (ICI->getOperand(0) == Val || NegOffset)) { // Calculate the range of values that would satisfy the comparison. - ConstantRange CmpRange(CI->getValue(), CI->getValue()+1); + ConstantRange CmpRange(CI->getValue()); ConstantRange TrueValues = ConstantRange::makeICmpRegion(ICI->getPredicate(), CmpRange); @@ -827,25 +824,8 @@ bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom, // If we're interested in the false dest, invert the condition. if (!isTrueDest) TrueValues = TrueValues.inverse(); - - // Figure out the possible values of the query BEFORE this branch. - if (!hasBlockValue(Val, BBFrom)) { - BlockValueStack.push(std::make_pair(BBFrom, Val)); - return false; - } - - LVILatticeVal InBlock = getBlockValue(Val, BBFrom); - if (!InBlock.isConstantRange()) { - Result = LVILatticeVal::getRange(TrueValues); - return true; - } - - // Find all potential values that satisfy both the input and output - // conditions. 
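// Sketch of the one-character LazyValueInfo fix above: markConstantRange()
// must report whether the lattice value actually changed, and the old
// `Range == NewR` reported the opposite, so a caller could mistake a
// narrowing update for a fixed point. With hypothetical 32-bit ranges:
ConstantRange Range(APInt(32, 0), APInt(32, 10)); // current:  [0, 10)
ConstantRange NewR (APInt(32, 0), APInt(32, 5));  // narrowed: [0, 5)
bool changed = Range != NewR; // true: propagation must continue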
- ConstantRange PossibleValues = - TrueValues.intersectWith(InBlock.getConstantRange()); - - Result = LVILatticeVal::getRange(PossibleValues); + + Result = LVILatticeVal::getRange(TrueValues); return true; } } @@ -855,40 +835,71 @@ bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom, // If the edge was formed by a switch on the value, then we may know exactly // what it is. if (SwitchInst *SI = dyn_cast<SwitchInst>(BBFrom->getTerminator())) { - if (SI->getCondition() == Val) { - // We don't know anything in the default case. - if (SI->getDefaultDest() == BBTo) { - Result.markOverdefined(); - return true; - } - - // We only know something if there is exactly one value that goes from - // BBFrom to BBTo. - unsigned NumEdges = 0; - ConstantInt *EdgeVal = 0; - for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); - i != e; ++i) { - if (i.getCaseSuccessor() != BBTo) continue; - if (NumEdges++) break; - EdgeVal = i.getCaseValue(); - } - assert(EdgeVal && "Missing successor?"); - if (NumEdges == 1) { - Result = LVILatticeVal::get(EdgeVal); - return true; - } + if (SI->getCondition() != Val) + return false; + + bool DefaultCase = SI->getDefaultDest() == BBTo; + unsigned BitWidth = Val->getType()->getIntegerBitWidth(); + ConstantRange EdgesVals(BitWidth, DefaultCase/*isFullSet*/); + + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) { + ConstantRange EdgeVal(i.getCaseValue()->getValue()); + if (DefaultCase) + EdgesVals = EdgesVals.difference(EdgeVal); + else if (i.getCaseSuccessor() == BBTo) + EdgesVals = EdgesVals.unionWith(EdgeVal); } - } - - // Otherwise see if the value is known in the block. - if (hasBlockValue(Val, BBFrom)) { - Result = getBlockValue(Val, BBFrom); + Result = LVILatticeVal::getRange(EdgesVals); return true; } - BlockValueStack.push(std::make_pair(BBFrom, Val)); return false; } +/// \brief Compute the value of Val on the edge BBFrom -> BBTo, or the value at +/// the basic block if the edge does not constraint Val. +bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom, + BasicBlock *BBTo, LVILatticeVal &Result) { + // If already a constant, there is nothing to compute. + if (Constant *VC = dyn_cast<Constant>(Val)) { + Result = LVILatticeVal::get(VC); + return true; + } + + if (getEdgeValueLocal(Val, BBFrom, BBTo, Result)) { + if (!Result.isConstantRange() || + Result.getConstantRange().getSingleElement()) + return true; + + // FIXME: this check should be moved to the beginning of the function when + // LVI better supports recursive values. Even for the single value case, we + // can intersect to detect dead code (an empty range). + if (!hasBlockValue(Val, BBFrom)) { + BlockValueStack.push(std::make_pair(BBFrom, Val)); + return false; + } + + // Try to intersect ranges of the BB and the constraint on the edge. 
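// Illustrative sketch, not part of the patch: the new switch handling
// above folds all edges into one ConstantRange. For a hypothetical
//   switch i32 %v: case 1 -> BBTo, case 3 -> BBTo, default -> elsewhere
// the non-default edge starts from the empty set and unions each case:
ConstantRange EdgesVals(32, /*isFullSet=*/false);
EdgesVals = EdgesVals.unionWith(ConstantRange(APInt(32, 1)));
EdgesVals = EdgesVals.unionWith(ConstantRange(APInt(32, 3)));
// The default edge instead starts from the full set and calls
// difference() with every case value.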
+ LVILatticeVal InBlock = getBlockValue(Val, BBFrom); + if (!InBlock.isConstantRange()) + return true; + + ConstantRange Range = + Result.getConstantRange().intersectWith(InBlock.getConstantRange()); + Result = LVILatticeVal::getRange(Range); + return true; + } + + if (!hasBlockValue(Val, BBFrom)) { + BlockValueStack.push(std::make_pair(BBFrom, Val)); + return false; + } + + // if we couldn't compute the value on the edge, use the value from the BB + Result = getBlockValue(Val, BBFrom); + return true; +} + LVILatticeVal LazyValueInfoCache::getValueInBlock(Value *V, BasicBlock *BB) { DEBUG(dbgs() << "LVI Getting block end value " << *V << " at '" << BB->getName() << "'\n"); diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp index f7a60a1..20c33a3 100644 --- a/lib/Analysis/LoopInfo.cpp +++ b/lib/Analysis/LoopInfo.cpp @@ -18,6 +18,7 @@ #include "llvm/Constants.h" #include "llvm/Instructions.h" #include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/LoopInfoImpl.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Assembly/Writer.h" @@ -29,6 +30,10 @@ #include <algorithm> using namespace llvm; +// Explicitly instantiate methods in LoopInfoImpl.h for IR-level Loops. +template class llvm::LoopBase<BasicBlock, Loop>; +template class llvm::LoopInfoBase<BasicBlock, Loop>; + // Always verify loopinfo if expensive checking is enabled. #ifdef XDEBUG static bool VerifyLoopInfo = true; @@ -507,7 +512,7 @@ Loop *UnloopUpdater::getNearestLoop(BasicBlock *BB, Loop *BBLoop) { // bool LoopInfo::runOnFunction(Function &) { releaseMemory(); - LI.Calculate(getAnalysis<DominatorTree>().getBase()); // Update + LI.Analyze(getAnalysis<DominatorTree>().getBase()); return false; } @@ -589,9 +594,6 @@ void LoopInfo::verifyAnalysis() const { } // Verify that blocks are mapped to valid loops. - // - // FIXME: With an up-to-date DFS (see LoopIterator.h) and DominatorTree, we - // could also verify that the blocks are still in the correct loops. for (DenseMap<BasicBlock*, Loop*>::const_iterator I = LI.BBMap.begin(), E = LI.BBMap.end(); I != E; ++I) { assert(Loops.count(I->second) && "orphaned loop"); diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp index aba700a..1540112 100644 --- a/lib/Analysis/LoopPass.cpp +++ b/lib/Analysis/LoopPass.cpp @@ -162,7 +162,7 @@ void LPPassManager::deleteSimpleAnalysisValue(Value *V, Loop *L) { // Recurse through all subloops and all loops into LQ. static void addLoopIntoQueue(Loop *L, std::deque<Loop *> &LQ) { LQ.push_back(L); - for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) + for (Loop::reverse_iterator I = L->rbegin(), E = L->rend(); I != E; ++I) addLoopIntoQueue(*I, LQ); } @@ -183,8 +183,12 @@ bool LPPassManager::runOnFunction(Function &F) { // Collect inherited analysis from Module level pass manager. populateInheritedAnalysis(TPM->activeStack); - // Populate Loop Queue - for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) + // Populate the loop queue in reverse program order. There is no clear need to + // process sibling loops in either forward or reverse order. There may be some + // advantage in deleting uses in a later loop before optimizing the + // definitions in an earlier loop. If we find a clear reason to process in + // forward order, then a forward variant of LoopPassManager should be created. 
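// Sketch of the edge/block intersection performed above: if the edge
// constrains %v to [0, 10) while the value at entry to BBFrom is already
// known to lie in [5, 100), the refined result is their intersection.
// Hypothetical 32-bit values:
ConstantRange EdgeCR (APInt(32, 0), APInt(32, 10));
ConstantRange BlockCR(APInt(32, 5), APInt(32, 100));
ConstantRange Refined = EdgeCR.intersectWith(BlockCR); // [5, 10)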
+ for (LoopInfo::reverse_iterator I = LI->rbegin(), E = LI->rend(); I != E; ++I) addLoopIntoQueue(*I, LQ); if (LQ.empty()) // No loops, skip calling finalizers diff --git a/lib/Analysis/MemDepPrinter.cpp b/lib/Analysis/MemDepPrinter.cpp index 22414b3..8578a63 100644 --- a/lib/Analysis/MemDepPrinter.cpp +++ b/lib/Analysis/MemDepPrinter.cpp @@ -32,7 +32,7 @@ namespace { Unknown }; - static const char* DepTypeStr[]; + static const char *const DepTypeStr[]; typedef PointerIntPair<const Instruction *, 2, DepType> InstTypePair; typedef std::pair<InstTypePair, const BasicBlock *> Dep; @@ -88,7 +88,7 @@ FunctionPass *llvm::createMemDepPrinter() { return new MemDepPrinter(); } -const char* MemDepPrinter::DepTypeStr[] +const char *const MemDepPrinter::DepTypeStr[] = {"Clobber", "Def", "NonFuncLocal", "Unknown"}; bool MemDepPrinter::runOnFunction(Function &F) { diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp index b145650..8d99ec3 100644 --- a/lib/Analysis/MemoryBuiltins.cpp +++ b/lib/Analysis/MemoryBuiltins.cpp @@ -12,80 +12,168 @@ // //===----------------------------------------------------------------------===// +#define DEBUG_TYPE "memory-builtins" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/MemoryBuiltins.h" -#include "llvm/Constants.h" +#include "llvm/GlobalVariable.h" #include "llvm/Instructions.h" +#include "llvm/Intrinsics.h" +#include "llvm/Metadata.h" #include "llvm/Module.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; -//===----------------------------------------------------------------------===// -// malloc Call Utility Functions. -// +enum AllocType { + MallocLike = 1<<0, // allocates + CallocLike = 1<<1, // allocates + bzero + ReallocLike = 1<<2, // reallocates + StrDupLike = 1<<3, + AllocLike = MallocLike | CallocLike | StrDupLike, + AnyAlloc = MallocLike | CallocLike | ReallocLike | StrDupLike +}; + +struct AllocFnsTy { + const char *Name; + AllocType AllocTy; + unsigned char NumParams; + // First and Second size parameters (or -1 if unused) + signed char FstParam, SndParam; +}; + +// FIXME: certain users need more information. E.g., SimplifyLibCalls needs to +// know which functions are nounwind, noalias, nocapture parameters, etc. +static const AllocFnsTy AllocationFnData[] = { + {"malloc", MallocLike, 1, 0, -1}, + {"valloc", MallocLike, 1, 0, -1}, + {"_Znwj", MallocLike, 1, 0, -1}, // new(unsigned int) + {"_ZnwjRKSt9nothrow_t", MallocLike, 2, 0, -1}, // new(unsigned int, nothrow) + {"_Znwm", MallocLike, 1, 0, -1}, // new(unsigned long) + {"_ZnwmRKSt9nothrow_t", MallocLike, 2, 0, -1}, // new(unsigned long, nothrow) + {"_Znaj", MallocLike, 1, 0, -1}, // new[](unsigned int) + {"_ZnajRKSt9nothrow_t", MallocLike, 2, 0, -1}, // new[](unsigned int, nothrow) + {"_Znam", MallocLike, 1, 0, -1}, // new[](unsigned long) + {"_ZnamRKSt9nothrow_t", MallocLike, 2, 0, -1}, // new[](unsigned long, nothrow) + {"posix_memalign", MallocLike, 3, 2, -1}, + {"calloc", CallocLike, 2, 0, 1}, + {"realloc", ReallocLike, 2, 1, -1}, + {"reallocf", ReallocLike, 2, 1, -1}, + {"strdup", StrDupLike, 1, -1, -1}, + {"strndup", StrDupLike, 2, -1, -1} +}; + -/// isMalloc - Returns true if the value is either a malloc call or a -/// bitcast of the result of a malloc call. 
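// Sketch of how a row of the AllocationFnData table above is read: the
// calloc entry {"calloc", CallocLike, 2, 0, 1} means the prototype takes
// two parameters and the allocation size is parameter 0 * parameter 1;
// -1 marks an unused size parameter. A hypothetical extra allocator
// (not part of the patch) would be described the same way:
static const AllocFnsTy MyAllocFnData =
  {"my_alloc", MallocLike, 1, 0, -1}; // one parameter, size is parameter 0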
-bool llvm::isMalloc(const Value *I) { - return extractMallocCall(I) || extractMallocCallFromBitCast(I); +static Function *getCalledFunction(const Value *V, bool LookThroughBitCast) { + if (LookThroughBitCast) + V = V->stripPointerCasts(); + + CallSite CS(const_cast<Value*>(V)); + if (!CS.getInstruction()) + return 0; + + Function *Callee = CS.getCalledFunction(); + if (!Callee || !Callee->isDeclaration()) + return 0; + return Callee; } -static bool isMallocCall(const CallInst *CI) { - if (!CI) - return false; +/// \brief Returns the allocation data for the given value if it is a call to a +/// known allocation function, and NULL otherwise. +static const AllocFnsTy *getAllocationData(const Value *V, AllocType AllocTy, + bool LookThroughBitCast = false) { + Function *Callee = getCalledFunction(V, LookThroughBitCast); + if (!Callee) + return 0; - Function *Callee = CI->getCalledFunction(); - if (Callee == 0 || !Callee->isDeclaration()) - return false; - if (Callee->getName() != "malloc" && - Callee->getName() != "_Znwj" && // operator new(unsigned int) - Callee->getName() != "_Znwm" && // operator new(unsigned long) - Callee->getName() != "_Znaj" && // operator new[](unsigned int) - Callee->getName() != "_Znam") // operator new[](unsigned long) - return false; + unsigned i = 0; + bool found = false; + for ( ; i < array_lengthof(AllocationFnData); ++i) { + if (Callee->getName() == AllocationFnData[i].Name) { + found = true; + break; + } + } + if (!found) + return 0; - // Check malloc prototype. - // FIXME: workaround for PR5130, this will be obsolete when a nobuiltin - // attribute will exist. + const AllocFnsTy *FnData = &AllocationFnData[i]; + if ((FnData->AllocTy & AllocTy) == 0) + return 0; + + // Check function prototype. + // FIXME: Check the nobuiltin metadata?? (PR5130) + int FstParam = FnData->FstParam; + int SndParam = FnData->SndParam; FunctionType *FTy = Callee->getFunctionType(); - return FTy->getReturnType() == Type::getInt8PtrTy(FTy->getContext()) && - FTy->getNumParams() == 1 && - (FTy->getParamType(0)->isIntegerTy(32) || - FTy->getParamType(0)->isIntegerTy(64)); + + if (FTy->getReturnType() == Type::getInt8PtrTy(FTy->getContext()) && + FTy->getNumParams() == FnData->NumParams && + (FstParam < 0 || + (FTy->getParamType(FstParam)->isIntegerTy(32) || + FTy->getParamType(FstParam)->isIntegerTy(64))) && + (SndParam < 0 || + FTy->getParamType(SndParam)->isIntegerTy(32) || + FTy->getParamType(SndParam)->isIntegerTy(64))) + return FnData; + return 0; } -/// extractMallocCall - Returns the corresponding CallInst if the instruction -/// is a malloc call. Since CallInst::CreateMalloc() only creates calls, we -/// ignore InvokeInst here. -const CallInst *llvm::extractMallocCall(const Value *I) { - const CallInst *CI = dyn_cast<CallInst>(I); - return (isMallocCall(CI)) ? CI : NULL; +static bool hasNoAliasAttr(const Value *V, bool LookThroughBitCast) { + ImmutableCallSite CS(LookThroughBitCast ? V->stripPointerCasts() : V); + return CS && CS.hasFnAttr(Attribute::NoAlias); } -CallInst *llvm::extractMallocCall(Value *I) { - CallInst *CI = dyn_cast<CallInst>(I); - return (isMallocCall(CI)) ? CI : NULL; + +/// \brief Tests if a value is a call or invoke to a library function that +/// allocates or reallocates memory (either malloc, calloc, realloc, or strdup +/// like). 
+bool llvm::isAllocationFn(const Value *V, bool LookThroughBitCast) { + return getAllocationData(V, AnyAlloc, LookThroughBitCast); } -static bool isBitCastOfMallocCall(const BitCastInst *BCI) { - if (!BCI) - return false; - - return isMallocCall(dyn_cast<CallInst>(BCI->getOperand(0))); +/// \brief Tests if a value is a call or invoke to a function that returns a +/// NoAlias pointer (including malloc/calloc/realloc/strdup-like functions). +bool llvm::isNoAliasFn(const Value *V, bool LookThroughBitCast) { + // it's safe to consider realloc as noalias since accessing the original + // pointer is undefined behavior + return isAllocationFn(V, LookThroughBitCast) || + hasNoAliasAttr(V, LookThroughBitCast); } -/// extractMallocCallFromBitCast - Returns the corresponding CallInst if the -/// instruction is a bitcast of the result of a malloc call. -CallInst *llvm::extractMallocCallFromBitCast(Value *I) { - BitCastInst *BCI = dyn_cast<BitCastInst>(I); - return (isBitCastOfMallocCall(BCI)) ? cast<CallInst>(BCI->getOperand(0)) - : NULL; +/// \brief Tests if a value is a call or invoke to a library function that +/// allocates uninitialized memory (such as malloc). +bool llvm::isMallocLikeFn(const Value *V, bool LookThroughBitCast) { + return getAllocationData(V, MallocLike, LookThroughBitCast); } -const CallInst *llvm::extractMallocCallFromBitCast(const Value *I) { - const BitCastInst *BCI = dyn_cast<BitCastInst>(I); - return (isBitCastOfMallocCall(BCI)) ? cast<CallInst>(BCI->getOperand(0)) - : NULL; +/// \brief Tests if a value is a call or invoke to a library function that +/// allocates zero-filled memory (such as calloc). +bool llvm::isCallocLikeFn(const Value *V, bool LookThroughBitCast) { + return getAllocationData(V, CallocLike, LookThroughBitCast); +} + +/// \brief Tests if a value is a call or invoke to a library function that +/// allocates memory (either malloc, calloc, or strdup like). +bool llvm::isAllocLikeFn(const Value *V, bool LookThroughBitCast) { + return getAllocationData(V, AllocLike, LookThroughBitCast); +} + +/// \brief Tests if a value is a call or invoke to a library function that +/// reallocates memory (such as realloc). +bool llvm::isReallocLikeFn(const Value *V, bool LookThroughBitCast) { + return getAllocationData(V, ReallocLike, LookThroughBitCast); +} + +/// extractMallocCall - Returns the corresponding CallInst if the instruction +/// is a malloc call. Since CallInst::CreateMalloc() only creates calls, we +/// ignore InvokeInst here. +const CallInst *llvm::extractMallocCall(const Value *I) { + return isMallocLikeFn(I) ? dyn_cast<CallInst>(I) : 0; } static Value *computeArraySize(const CallInst *CI, const TargetData *TD, @@ -134,7 +222,7 @@ const CallInst *llvm::isArrayMalloc(const Value *I, const TargetData *TD) { /// 1: PointerType is the bitcast's result type. /// >1: Unique PointerType cannot be determined, return NULL. PointerType *llvm::getMallocType(const CallInst *CI) { - assert(isMalloc(CI) && "getMallocType and not malloc call"); + assert(isMallocLikeFn(CI) && "getMallocType and not malloc call"); PointerType *MallocType = NULL; unsigned NumOfBitCastUses = 0; @@ -176,13 +264,17 @@ Type *llvm::getMallocAllocatedType(const CallInst *CI) { /// determined. 
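// Illustrative sketch, not part of the patch: classifying a value with
// the new predicates defined above, via a hypothetical helper:
static const char *classifyAllocCall(const Value *V) {
  if (isMallocLikeFn(V))  return "malloc-like: uninitialized memory";
  if (isCallocLikeFn(V))  return "calloc-like: zero-filled memory";
  if (isReallocLikeFn(V)) return "realloc-like: resizes an allocation";
  if (isAllocationFn(V))  return "other allocator (e.g. strdup-like)";
  return "not a recognized allocation call";
}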
Value *llvm::getMallocArraySize(CallInst *CI, const TargetData *TD, bool LookThroughSExt) { - assert(isMalloc(CI) && "getMallocArraySize and not malloc call"); + assert(isMallocLikeFn(CI) && "getMallocArraySize and not malloc call"); return computeArraySize(CI, TD, LookThroughSExt); } -//===----------------------------------------------------------------------===// -// free Call Utility Functions. -// + +/// extractCallocCall - Returns the corresponding CallInst if the instruction +/// is a calloc call. +const CallInst *llvm::extractCallocCall(const Value *I) { + return isCallocLikeFn(I) ? cast<CallInst>(I) : 0; +} + /// isFreeCall - Returns non-null if the value is a call to the builtin free() const CallInst *llvm::isFreeCall(const Value *I) { @@ -211,3 +303,417 @@ const CallInst *llvm::isFreeCall(const Value *I) { return CI; } + + + +//===----------------------------------------------------------------------===// +// Utility functions to compute size of objects. +// + + +/// \brief Compute the size of the object pointed by Ptr. Returns true and the +/// object size in Size if successful, and false otherwise. +/// If RoundToAlign is true, then Size is rounded up to the aligment of allocas, +/// byval arguments, and global variables. +bool llvm::getObjectSize(const Value *Ptr, uint64_t &Size, const TargetData *TD, + bool RoundToAlign) { + if (!TD) + return false; + + ObjectSizeOffsetVisitor Visitor(TD, Ptr->getContext(), RoundToAlign); + SizeOffsetType Data = Visitor.compute(const_cast<Value*>(Ptr)); + if (!Visitor.bothKnown(Data)) + return false; + + APInt ObjSize = Data.first, Offset = Data.second; + // check for overflow + if (Offset.slt(0) || ObjSize.ult(Offset)) + Size = 0; + else + Size = (ObjSize - Offset).getZExtValue(); + return true; +} + + +STATISTIC(ObjectVisitorArgument, + "Number of arguments with unsolved size and offset"); +STATISTIC(ObjectVisitorLoad, + "Number of load instructions with unsolved size and offset"); + + +APInt ObjectSizeOffsetVisitor::align(APInt Size, uint64_t Align) { + if (RoundToAlign && Align) + return APInt(IntTyBits, RoundUpToAlignment(Size.getZExtValue(), Align)); + return Size; +} + +ObjectSizeOffsetVisitor::ObjectSizeOffsetVisitor(const TargetData *TD, + LLVMContext &Context, + bool RoundToAlign) +: TD(TD), RoundToAlign(RoundToAlign) { + IntegerType *IntTy = TD->getIntPtrType(Context); + IntTyBits = IntTy->getBitWidth(); + Zero = APInt::getNullValue(IntTyBits); +} + +SizeOffsetType ObjectSizeOffsetVisitor::compute(Value *V) { + V = V->stripPointerCasts(); + + if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) + return visitGEPOperator(*GEP); + if (Instruction *I = dyn_cast<Instruction>(V)) + return visit(*I); + if (Argument *A = dyn_cast<Argument>(V)) + return visitArgument(*A); + if (ConstantPointerNull *P = dyn_cast<ConstantPointerNull>(V)) + return visitConstantPointerNull(*P); + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) + return visitGlobalVariable(*GV); + if (UndefValue *UV = dyn_cast<UndefValue>(V)) + return visitUndefValue(*UV); + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) + if (CE->getOpcode() == Instruction::IntToPtr) + return unknown(); // clueless + + DEBUG(dbgs() << "ObjectSizeOffsetVisitor::compute() unhandled value: " << *V + << '\n'); + return unknown(); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) { + if (!I.getAllocatedType()->isSized()) + return unknown(); + + APInt Size(IntTyBits, TD->getTypeAllocSize(I.getAllocatedType())); + if (!I.isArrayAllocation()) + return 
std::make_pair(align(Size, I.getAlignment()), Zero); + + Value *ArraySize = I.getArraySize(); + if (const ConstantInt *C = dyn_cast<ConstantInt>(ArraySize)) { + Size *= C->getValue().zextOrSelf(IntTyBits); + return std::make_pair(align(Size, I.getAlignment()), Zero); + } + return unknown(); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitArgument(Argument &A) { + // no interprocedural analysis is done at the moment + if (!A.hasByValAttr()) { + ++ObjectVisitorArgument; + return unknown(); + } + PointerType *PT = cast<PointerType>(A.getType()); + APInt Size(IntTyBits, TD->getTypeAllocSize(PT->getElementType())); + return std::make_pair(align(Size, A.getParamAlignment()), Zero); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitCallSite(CallSite CS) { + const AllocFnsTy *FnData = getAllocationData(CS.getInstruction(), AnyAlloc); + if (!FnData) + return unknown(); + + // handle strdup-like functions separately + if (FnData->AllocTy == StrDupLike) { + // TODO + return unknown(); + } + + ConstantInt *Arg = dyn_cast<ConstantInt>(CS.getArgument(FnData->FstParam)); + if (!Arg) + return unknown(); + + APInt Size = Arg->getValue().zextOrSelf(IntTyBits); + // size determined by just 1 parameter + if (FnData->SndParam < 0) + return std::make_pair(Size, Zero); + + Arg = dyn_cast<ConstantInt>(CS.getArgument(FnData->SndParam)); + if (!Arg) + return unknown(); + + Size *= Arg->getValue().zextOrSelf(IntTyBits); + return std::make_pair(Size, Zero); + + // TODO: handle more standard functions (+ wchar cousins): + // - strdup / strndup + // - strcpy / strncpy + // - strcat / strncat + // - memcpy / memmove + // - strcat / strncat + // - memset +} + +SizeOffsetType +ObjectSizeOffsetVisitor::visitConstantPointerNull(ConstantPointerNull&) { + return std::make_pair(Zero, Zero); +} + +SizeOffsetType +ObjectSizeOffsetVisitor::visitExtractElementInst(ExtractElementInst&) { + return unknown(); +} + +SizeOffsetType +ObjectSizeOffsetVisitor::visitExtractValueInst(ExtractValueInst&) { + // Easy cases were already folded by previous passes. + return unknown(); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitGEPOperator(GEPOperator &GEP) { + SizeOffsetType PtrData = compute(GEP.getPointerOperand()); + if (!bothKnown(PtrData) || !GEP.hasAllConstantIndices()) + return unknown(); + + SmallVector<Value*, 8> Ops(GEP.idx_begin(), GEP.idx_end()); + APInt Offset(IntTyBits,TD->getIndexedOffset(GEP.getPointerOperandType(),Ops)); + return std::make_pair(PtrData.first, PtrData.second + Offset); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitGlobalVariable(GlobalVariable &GV){ + if (!GV.hasDefinitiveInitializer()) + return unknown(); + + APInt Size(IntTyBits, TD->getTypeAllocSize(GV.getType()->getElementType())); + return std::make_pair(align(Size, GV.getAlignment()), Zero); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitIntToPtrInst(IntToPtrInst&) { + // clueless + return unknown(); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitLoadInst(LoadInst&) { + ++ObjectVisitorLoad; + return unknown(); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitPHINode(PHINode&) { + // too complex to analyze statically. 
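// Illustrative sketch, not part of the patch: querying the visitor
// directly, as llvm::getObjectSize() does above. compute() yields a
// (size, offset) pair of APInts and bothKnown() rejects unknown halves.
// Ptr and TD are assumed to be in scope:
ObjectSizeOffsetVisitor Visitor(TD, Ptr->getContext(), /*RoundToAlign=*/false);
SizeOffsetType Data = Visitor.compute(const_cast<Value*>(Ptr));
if (Visitor.bothKnown(Data)) {
  APInt ObjSize = Data.first, Offset = Data.second;
  // e.g. a GEP 4 bytes into a 16-byte alloca yields (16, 4)
}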
+ return unknown(); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitSelectInst(SelectInst &I) { + SizeOffsetType TrueSide = compute(I.getTrueValue()); + SizeOffsetType FalseSide = compute(I.getFalseValue()); + if (bothKnown(TrueSide) && bothKnown(FalseSide) && TrueSide == FalseSide) + return TrueSide; + return unknown(); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitUndefValue(UndefValue&) { + return std::make_pair(Zero, Zero); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitInstruction(Instruction &I) { + DEBUG(dbgs() << "ObjectSizeOffsetVisitor unknown instruction:" << I << '\n'); + return unknown(); +} + + +ObjectSizeOffsetEvaluator::ObjectSizeOffsetEvaluator(const TargetData *TD, + LLVMContext &Context) +: TD(TD), Context(Context), Builder(Context, TargetFolder(TD)), +Visitor(TD, Context) { + IntTy = TD->getIntPtrType(Context); + Zero = ConstantInt::get(IntTy, 0); +} + +SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute(Value *V) { + SizeOffsetEvalType Result = compute_(V); + + if (!bothKnown(Result)) { + // erase everything that was computed in this iteration from the cache, so + // that no dangling references are left behind. We could be a bit smarter if + // we kept a dependency graph. It's probably not worth the complexity. + for (PtrSetTy::iterator I=SeenVals.begin(), E=SeenVals.end(); I != E; ++I) { + CacheMapTy::iterator CacheIt = CacheMap.find(*I); + // non-computable results can be safely cached + if (CacheIt != CacheMap.end() && anyKnown(CacheIt->second)) + CacheMap.erase(CacheIt); + } + } + + SeenVals.clear(); + return Result; +} + +SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute_(Value *V) { + SizeOffsetType Const = Visitor.compute(V); + if (Visitor.bothKnown(Const)) + return std::make_pair(ConstantInt::get(Context, Const.first), + ConstantInt::get(Context, Const.second)); + + V = V->stripPointerCasts(); + + // check cache + CacheMapTy::iterator CacheIt = CacheMap.find(V); + if (CacheIt != CacheMap.end()) + return CacheIt->second; + + // always generate code immediately before the instruction being + // processed, so that the generated code dominates the same BBs + Instruction *PrevInsertPoint = Builder.GetInsertPoint(); + if (Instruction *I = dyn_cast<Instruction>(V)) + Builder.SetInsertPoint(I); + + // record the pointers that were handled in this run, so that they can be + // cleaned later if something fails + SeenVals.insert(V); + + // now compute the size and offset + SizeOffsetEvalType Result; + if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) { + Result = visitGEPOperator(*GEP); + } else if (Instruction *I = dyn_cast<Instruction>(V)) { + Result = visit(*I); + } else if (isa<Argument>(V) || + (isa<ConstantExpr>(V) && + cast<ConstantExpr>(V)->getOpcode() == Instruction::IntToPtr) || + isa<GlobalVariable>(V)) { + // ignore values where we cannot do more than what ObjectSizeVisitor can + Result = unknown(); + } else { + DEBUG(dbgs() << "ObjectSizeOffsetEvaluator::compute() unhandled value: " + << *V << '\n'); + Result = unknown(); + } + + if (PrevInsertPoint) + Builder.SetInsertPoint(PrevInsertPoint); + + // Don't reuse CacheIt since it may be invalid at this point. 
+ CacheMap[V] = Result; + return Result; +} + +SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitAllocaInst(AllocaInst &I) { + if (!I.getAllocatedType()->isSized()) + return unknown(); + + // must be a VLA + assert(I.isArrayAllocation()); + Value *ArraySize = I.getArraySize(); + Value *Size = ConstantInt::get(ArraySize->getType(), + TD->getTypeAllocSize(I.getAllocatedType())); + Size = Builder.CreateMul(Size, ArraySize); + return std::make_pair(Size, Zero); +} + +SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitCallSite(CallSite CS) { + const AllocFnsTy *FnData = getAllocationData(CS.getInstruction(), AnyAlloc); + if (!FnData) + return unknown(); + + // handle strdup-like functions separately + if (FnData->AllocTy == StrDupLike) { + // TODO + return unknown(); + } + + Value *FirstArg = CS.getArgument(FnData->FstParam); + FirstArg = Builder.CreateZExt(FirstArg, IntTy); + if (FnData->SndParam < 0) + return std::make_pair(FirstArg, Zero); + + Value *SecondArg = CS.getArgument(FnData->SndParam); + SecondArg = Builder.CreateZExt(SecondArg, IntTy); + Value *Size = Builder.CreateMul(FirstArg, SecondArg); + return std::make_pair(Size, Zero); + + // TODO: handle more standard functions (+ wchar cousins): + // - strdup / strndup + // - strcpy / strncpy + // - strcat / strncat + // - memcpy / memmove + // - strcat / strncat + // - memset +} + +SizeOffsetEvalType +ObjectSizeOffsetEvaluator::visitExtractElementInst(ExtractElementInst&) { + return unknown(); +} + +SizeOffsetEvalType +ObjectSizeOffsetEvaluator::visitExtractValueInst(ExtractValueInst&) { + return unknown(); +} + +SizeOffsetEvalType +ObjectSizeOffsetEvaluator::visitGEPOperator(GEPOperator &GEP) { + SizeOffsetEvalType PtrData = compute_(GEP.getPointerOperand()); + if (!bothKnown(PtrData)) + return unknown(); + + Value *Offset = EmitGEPOffset(&Builder, *TD, &GEP, /*NoAssumptions=*/true); + Offset = Builder.CreateAdd(PtrData.second, Offset); + return std::make_pair(PtrData.first, Offset); +} + +SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitIntToPtrInst(IntToPtrInst&) { + // clueless + return unknown(); +} + +SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitLoadInst(LoadInst&) { + return unknown(); +} + +SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitPHINode(PHINode &PHI) { + // create 2 PHIs: one for size and another for offset + PHINode *SizePHI = Builder.CreatePHI(IntTy, PHI.getNumIncomingValues()); + PHINode *OffsetPHI = Builder.CreatePHI(IntTy, PHI.getNumIncomingValues()); + + // insert right away in the cache to handle recursive PHIs + CacheMap[&PHI] = std::make_pair(SizePHI, OffsetPHI); + + // compute offset/size for each PHI incoming pointer + for (unsigned i = 0, e = PHI.getNumIncomingValues(); i != e; ++i) { + Builder.SetInsertPoint(PHI.getIncomingBlock(i)->getFirstInsertionPt()); + SizeOffsetEvalType EdgeData = compute_(PHI.getIncomingValue(i)); + + if (!bothKnown(EdgeData)) { + OffsetPHI->replaceAllUsesWith(UndefValue::get(IntTy)); + OffsetPHI->eraseFromParent(); + SizePHI->replaceAllUsesWith(UndefValue::get(IntTy)); + SizePHI->eraseFromParent(); + return unknown(); + } + SizePHI->addIncoming(EdgeData.first, PHI.getIncomingBlock(i)); + OffsetPHI->addIncoming(EdgeData.second, PHI.getIncomingBlock(i)); + } + + Value *Size = SizePHI, *Offset = OffsetPHI, *Tmp; + if ((Tmp = SizePHI->hasConstantValue())) { + Size = Tmp; + SizePHI->replaceAllUsesWith(Size); + SizePHI->eraseFromParent(); + } + if ((Tmp = OffsetPHI->hasConstantValue())) { + Offset = Tmp; + OffsetPHI->replaceAllUsesWith(Offset); + OffsetPHI->eraseFromParent(); 
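// Illustrative sketch, not part of the patch: the evaluator emits IR that
// computes an object's size and offset at run time when they are not
// compile-time constants (a VLA alloca, for instance, becomes a mul of
// the element size by the array length). Assumes Ptr, TD, and Context:
ObjectSizeOffsetEvaluator Eval(TD, Context);
SizeOffsetEvalType SizeOffset = Eval.compute(Ptr);
if (Eval.bothKnown(SizeOffset)) {
  Value *SizeV = SizeOffset.first;   // may be a freshly inserted instruction
  Value *OffsetV = SizeOffset.second;
}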
+ } + return std::make_pair(Size, Offset); +} + +SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitSelectInst(SelectInst &I) { + SizeOffsetEvalType TrueSide = compute_(I.getTrueValue()); + SizeOffsetEvalType FalseSide = compute_(I.getFalseValue()); + + if (!bothKnown(TrueSide) || !bothKnown(FalseSide)) + return unknown(); + if (TrueSide == FalseSide) + return TrueSide; + + Value *Size = Builder.CreateSelect(I.getCondition(), TrueSide.first, + FalseSide.first); + Value *Offset = Builder.CreateSelect(I.getCondition(), TrueSide.second, + FalseSide.second); + return std::make_pair(Size, Offset); +} + +SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitInstruction(Instruction &I) { + DEBUG(dbgs() << "ObjectSizeOffsetEvaluator unknown instruction:" << I <<'\n'); + return unknown(); +} diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp index 3a544f3..7fb154d 100644 --- a/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -16,13 +16,11 @@ #define DEBUG_TYPE "memdep" #include "llvm/Analysis/MemoryDependenceAnalysis.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/Function.h" #include "llvm/LLVMContext.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" @@ -339,86 +337,6 @@ getLoadLoadClobberFullWidthSize(const Value *MemLocBase, int64_t MemLocOffs, } } -namespace { - /// Only find pointer captures which happen before the given instruction. Uses - /// the dominator tree to determine whether one instruction is before another. - struct CapturesBefore : public CaptureTracker { - CapturesBefore(const Instruction *I, DominatorTree *DT) - : BeforeHere(I), DT(DT), Captured(false) {} - - void tooManyUses() { Captured = true; } - - bool shouldExplore(Use *U) { - Instruction *I = cast<Instruction>(U->getUser()); - BasicBlock *BB = I->getParent(); - if (BeforeHere != I && - (!DT->isReachableFromEntry(BB) || DT->dominates(BeforeHere, I))) - return false; - return true; - } - - bool captured(Use *U) { - Instruction *I = cast<Instruction>(U->getUser()); - BasicBlock *BB = I->getParent(); - if (BeforeHere != I && - (!DT->isReachableFromEntry(BB) || DT->dominates(BeforeHere, I))) - return false; - Captured = true; - return true; - } - - const Instruction *BeforeHere; - DominatorTree *DT; - - bool Captured; - }; -} - -AliasAnalysis::ModRefResult -MemoryDependenceAnalysis::getModRefInfo(const Instruction *Inst, - const AliasAnalysis::Location &MemLoc) { - AliasAnalysis::ModRefResult MR = AA->getModRefInfo(Inst, MemLoc); - if (MR != AliasAnalysis::ModRef) return MR; - - // FIXME: this is really just shoring-up a deficiency in alias analysis. - // BasicAA isn't willing to spend linear time determining whether an alloca - // was captured before or after this particular call, while we are. However, - // with a smarter AA in place, this test is just wasting compile time. 
- if (!DT) return AliasAnalysis::ModRef; - const Value *Object = GetUnderlyingObject(MemLoc.Ptr, TD); - if (!isIdentifiedObject(Object) || isa<GlobalValue>(Object)) - return AliasAnalysis::ModRef; - ImmutableCallSite CS(Inst); - if (!CS.getInstruction()) return AliasAnalysis::ModRef; - - CapturesBefore CB(Inst, DT); - llvm::PointerMayBeCaptured(Object, &CB); - - if (isa<Constant>(Object) || CS.getInstruction() == Object || CB.Captured) - return AliasAnalysis::ModRef; - - unsigned ArgNo = 0; - for (ImmutableCallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end(); - CI != CE; ++CI, ++ArgNo) { - // Only look at the no-capture or byval pointer arguments. If this - // pointer were passed to arguments that were neither of these, then it - // couldn't be no-capture. - if (!(*CI)->getType()->isPointerTy() || - (!CS.doesNotCapture(ArgNo) && !CS.isByValArgument(ArgNo))) - continue; - - // If this is a no-capture pointer argument, see if we can tell that it - // is impossible to alias the pointer we're checking. If not, we have to - // assume that the call could touch the pointer, even though it doesn't - // escape. - if (!AA->isNoAlias(AliasAnalysis::Location(*CI), - AliasAnalysis::Location(Object))) { - return AliasAnalysis::ModRef; - } - } - return AliasAnalysis::NoModRef; -} - /// getPointerDependencyFrom - Return the instruction on which a memory /// location depends. If isLoad is true, this routine ignores may-aliases with /// read-only operations. If isLoad is false, this routine ignores may-aliases @@ -556,8 +474,7 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad, // a subsequent bitcast of the malloc call result. There can be stores to // the malloced memory between the malloc call and its bitcast uses, and we // need to continue scanning until the malloc call. - if (isa<AllocaInst>(Inst) || - (isa<CallInst>(Inst) && extractMallocCall(Inst))) { + if (isa<AllocaInst>(Inst) || isNoAliasFn(Inst)) { const Value *AccessPtr = GetUnderlyingObject(MemLoc.Ptr, TD); if (AccessPtr == Inst || AA->isMustAlias(Inst, AccessPtr)) @@ -566,7 +483,11 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad, } // See if this instruction (e.g. a call or vaarg) mod/ref's the pointer. - switch (getModRefInfo(Inst, MemLoc)) { + AliasAnalysis::ModRefResult MR = AA->getModRefInfo(Inst, MemLoc); + // If necessary, perform additional analysis. + if (MR == AliasAnalysis::ModRef) + MR = AA->callCapturesBefore(Inst, MemLoc, DT); + switch (MR) { case AliasAnalysis::NoModRef: // If the call has no effect on the queried pointer, just ignore it. continue; @@ -984,7 +905,7 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer, if (!Pair.second) { if (CacheInfo->Size < Loc.Size) { // The query's Size is greater than the cached one. Throw out the - // cached data and procede with the query at the greater size. + // cached data and proceed with the query at the greater size. 
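The callCapturesBefore query that replaces this code (and its layered use in getPointerDependencyFrom above, where it runs only after getModRefInfo has already answered ModRef) rests on a simple fact about escapes, illustrated in plain C++; opaque() and use() are assumed external functions:

```cpp
// Only use() ever learns &x, so calls before that point cannot touch x.
extern void opaque();
extern void use(int *p);

int example() {
  int x = 1;
  opaque();  // &x has not escaped yet: provably NoModRef on x
  x = 2;
  use(&x);   // first capture of &x
  opaque();  // from here on, calls must conservatively be ModRef on x
  return x;
}
```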
CacheInfo->Pair = BBSkipFirstBlockPair(); CacheInfo->Size = Loc.Size; for (NonLocalDepInfo::iterator DI = CacheInfo->NonLocalDeps.begin(), diff --git a/lib/Analysis/ModuleDebugInfoPrinter.cpp b/lib/Analysis/ModuleDebugInfoPrinter.cpp index e7e999c..f8c7514 100644 --- a/lib/Analysis/ModuleDebugInfoPrinter.cpp +++ b/lib/Analysis/ModuleDebugInfoPrinter.cpp @@ -16,10 +16,10 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Assembly/Writer.h" -#include "llvm/Pass.h" +#include "llvm/DebugInfo.h" #include "llvm/Function.h" +#include "llvm/Pass.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/ADT/Statistic.h" diff --git a/lib/Analysis/PathNumbering.cpp b/lib/Analysis/PathNumbering.cpp index 80c5222..d4ad726 100644 --- a/lib/Analysis/PathNumbering.cpp +++ b/lib/Analysis/PathNumbering.cpp @@ -31,11 +31,11 @@ #include "llvm/Instructions.h" #include "llvm/Module.h" #include "llvm/Pass.h" +#include "llvm/TypeBuilder.h" #include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/TypeBuilder.h" #include "llvm/Support/raw_ostream.h" #include <queue> diff --git a/lib/Analysis/ProfileInfoLoader.cpp b/lib/Analysis/ProfileInfoLoader.cpp index eaa38da..5c7c97c 100644 --- a/lib/Analysis/ProfileInfoLoader.cpp +++ b/lib/Analysis/ProfileInfoLoader.cpp @@ -83,10 +83,8 @@ const unsigned ProfileInfoLoader::Uncounted = ~0U; // program if the file is invalid or broken. // ProfileInfoLoader::ProfileInfoLoader(const char *ToolName, - const std::string &Filename, - Module &TheModule) : - Filename(Filename), - M(TheModule), Warned(false) { + const std::string &Filename) + : Filename(Filename) { FILE *F = fopen(Filename.c_str(), "rb"); if (F == 0) { errs() << ToolName << ": Error opening '" << Filename << "': "; diff --git a/lib/Analysis/ProfileInfoLoaderPass.cpp b/lib/Analysis/ProfileInfoLoaderPass.cpp index c4da807..5ecf052 100644 --- a/lib/Analysis/ProfileInfoLoaderPass.cpp +++ b/lib/Analysis/ProfileInfoLoaderPass.cpp @@ -152,7 +152,7 @@ void LoaderPass::readEdge(ProfileInfo::Edge e, } bool LoaderPass::runOnModule(Module &M) { - ProfileInfoLoader PIL("profile-loader", Filename, M); + ProfileInfoLoader PIL("profile-loader", Filename); EdgeInformation.clear(); std::vector<unsigned> Counters = PIL.getRawEdgeCounts(); diff --git a/lib/Analysis/RegionInfo.cpp b/lib/Analysis/RegionInfo.cpp index b507b1e..5f4458b 100644 --- a/lib/Analysis/RegionInfo.cpp +++ b/lib/Analysis/RegionInfo.cpp @@ -47,7 +47,7 @@ static cl::opt<enum Region::PrintStyle> printStyle("print-region-style", cl::values( clEnumValN(Region::PrintNone, "none", "print no details"), clEnumValN(Region::PrintBB, "bb", - "print regions in detail with block_iterator"), + "print regions in detail with block_node_iterator"), clEnumValN(Region::PrintRN, "rn", "print regions in detail with element_iterator"), clEnumValEnd)); @@ -246,22 +246,38 @@ void Region::verifyRegionNest() const { verifyRegion(); } -Region::block_iterator Region::block_begin() { +Region::block_node_iterator Region::block_node_begin() { return GraphTraits<FlatIt<Region*> >::nodes_begin(this); } -Region::block_iterator Region::block_end() { +Region::block_node_iterator Region::block_node_end() { return GraphTraits<FlatIt<Region*> >::nodes_end(this); } -Region::const_block_iterator Region::block_begin() const { 
+Region::const_block_node_iterator Region::block_node_begin() const { return GraphTraits<FlatIt<const Region*> >::nodes_begin(this); } -Region::const_block_iterator Region::block_end() const { +Region::const_block_node_iterator Region::block_node_end() const { return GraphTraits<FlatIt<const Region*> >::nodes_end(this); } +Region::block_iterator Region::block_begin() { + return block_node_begin(); +} + +Region::block_iterator Region::block_end() { + return block_node_end(); +} + +Region::const_block_iterator Region::block_begin() const { + return block_node_begin(); +} + +Region::const_block_iterator Region::block_end() const { + return block_node_end(); +} + Region::element_iterator Region::element_begin() { return GraphTraits<Region*>::nodes_begin(this); } @@ -425,7 +441,9 @@ void Region::print(raw_ostream &OS, bool print_tree, unsigned level, OS.indent(level*2 + 2); if (Style == PrintBB) { - for (const_block_iterator I = block_begin(), E = block_end(); I!=E; ++I) + for (const_block_node_iterator I = block_node_begin(), + E = block_node_end(); + I != E; ++I) OS << **I << ", "; // TODO: remove the last "," } else if (Style == PrintRN) { for (const_element_iterator I = element_begin(), E = element_end(); I!=E; ++I) diff --git a/lib/Analysis/RegionPass.cpp b/lib/Analysis/RegionPass.cpp index 3a3529b..c97b5eb 100644 --- a/lib/Analysis/RegionPass.cpp +++ b/lib/Analysis/RegionPass.cpp @@ -195,7 +195,8 @@ public: virtual bool runOnRegion(Region *R, RGPassManager &RGM) { Out << Banner; - for (Region::block_iterator I = R->block_begin(), E = R->block_end(); + for (Region::block_node_iterator I = R->block_node_begin(), + E = R->block_node_end(); I != E; ++I) (*I)->getEntry()->print(Out); diff --git a/lib/Analysis/RegionPrinter.cpp b/lib/Analysis/RegionPrinter.cpp index a1730b0..8b23cc7 100644 --- a/lib/Analysis/RegionPrinter.cpp +++ b/lib/Analysis/RegionPrinter.cpp @@ -122,13 +122,11 @@ struct DOTGraphTraits<RegionInfo*> : public DOTGraphTraits<RegionNode*> { RegionInfo *RI = R->getRegionInfo(); for (Region::const_block_iterator BI = R->block_begin(), - BE = R->block_end(); BI != BE; ++BI) { - BasicBlock *BB = (*BI)->getNodeAs<BasicBlock>(); - if (RI->getRegionFor(BB) == R) + BE = R->block_end(); BI != BE; ++BI) + if (RI->getRegionFor(*BI) == R) O.indent(2 * (depth + 1)) << "Node" - << static_cast<const void*>(RI->getTopLevelRegion()->getBBNode(BB)) + << static_cast<const void*>(RI->getTopLevelRegion()->getBBNode(*BI)) << ";\n"; - } O.indent(2 * depth) << "}\n"; } diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 205227c..f0f3b1c 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -826,8 +826,7 @@ const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op, // Fold if the operand is constant. if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op)) return getConstant( - cast<ConstantInt>(ConstantExpr::getTrunc(SC->getValue(), - getEffectiveSCEVType(Ty)))); + cast<ConstantInt>(ConstantExpr::getTrunc(SC->getValue(), Ty))); // trunc(trunc(x)) --> trunc(x) if (const SCEVTruncateExpr *ST = dyn_cast<SCEVTruncateExpr>(Op)) @@ -879,13 +878,6 @@ const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op, return getAddRecExpr(Operands, AddRec->getLoop(), SCEV::FlagAnyWrap); } - // As a special case, fold trunc(undef) to undef. We don't want to - // know too much about SCEVUnknowns, but this special case is handy - // and harmless. 
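With the RegionInfo split above, block_node_iterator keeps the old RegionNode-based traversal while block_iterator now yields BasicBlock* directly, which is what lets the RegionPrinter hunk below drop its getNodeAs<BasicBlock>() unwrapping. A minimal sketch against the 3.1-era header (countBlocksIn is a made-up helper):

```cpp
#include "llvm/Analysis/RegionInfo.h"

// Count the basic blocks in a region using the new block_iterator.
unsigned countBlocksIn(llvm::Region *R) {
  unsigned N = 0;
  for (llvm::Region::block_iterator BI = R->block_begin(),
                                    BE = R->block_end();
       BI != BE; ++BI) {
    llvm::BasicBlock *BB = *BI; // yields BasicBlock* directly now
    (void)BB;
    ++N;
  }
  return N;
}
```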
- if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Op)) - if (isa<UndefValue>(U->getValue())) - return getSCEV(UndefValue::get(Ty)); - // The cast wasn't folded; create an explicit cast node. We can reuse // the existing insert position since if we get here, we won't have // made any changes which would invalidate it. @@ -906,8 +898,7 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op, // Fold if the operand is constant. if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op)) return getConstant( - cast<ConstantInt>(ConstantExpr::getZExt(SC->getValue(), - getEffectiveSCEVType(Ty)))); + cast<ConstantInt>(ConstantExpr::getZExt(SC->getValue(), Ty))); // zext(zext(x)) --> zext(x) if (const SCEVZeroExtendExpr *SZ = dyn_cast<SCEVZeroExtendExpr>(Op)) @@ -976,12 +967,15 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *WideTy = IntegerType::get(getContext(), BitWidth * 2); // Check whether Start+Step*MaxBECount has no unsigned overflow. const SCEV *ZMul = getMulExpr(CastedMaxBECount, Step); - const SCEV *Add = getAddExpr(Start, ZMul); + const SCEV *ZAdd = getZeroExtendExpr(getAddExpr(Start, ZMul), WideTy); + const SCEV *WideStart = getZeroExtendExpr(Start, WideTy); + const SCEV *WideMaxBECount = + getZeroExtendExpr(CastedMaxBECount, WideTy); const SCEV *OperandExtendedAdd = - getAddExpr(getZeroExtendExpr(Start, WideTy), - getMulExpr(getZeroExtendExpr(CastedMaxBECount, WideTy), + getAddExpr(WideStart, + getMulExpr(WideMaxBECount, getZeroExtendExpr(Step, WideTy))); - if (getZeroExtendExpr(Add, WideTy) == OperandExtendedAdd) { + if (ZAdd == OperandExtendedAdd) { // Cache knowledge of AR NUW, which is propagated to this AddRec. const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNUW); // Return the expression with the addrec on the outside. @@ -991,13 +985,11 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op, } // Similar to above, only this time treat the step value as signed. // This covers loops that count down. - const SCEV *SMul = getMulExpr(CastedMaxBECount, Step); - Add = getAddExpr(Start, SMul); OperandExtendedAdd = - getAddExpr(getZeroExtendExpr(Start, WideTy), - getMulExpr(getZeroExtendExpr(CastedMaxBECount, WideTy), + getAddExpr(WideStart, + getMulExpr(WideMaxBECount, getSignExtendExpr(Step, WideTy))); - if (getZeroExtendExpr(Add, WideTy) == OperandExtendedAdd) { + if (ZAdd == OperandExtendedAdd) { // Cache knowledge of AR NW, which is propagated to this AddRec. // Negative step causes unsigned wrap, but it still can't self-wrap. const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNW); @@ -1164,8 +1156,7 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, // Fold if the operand is constant. if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op)) return getConstant( - cast<ConstantInt>(ConstantExpr::getSExt(SC->getValue(), - getEffectiveSCEVType(Ty)))); + cast<ConstantInt>(ConstantExpr::getSExt(SC->getValue(), Ty))); // sext(sext(x)) --> sext(x) if (const SCEVSignExtendExpr *SS = dyn_cast<SCEVSignExtendExpr>(Op)) @@ -1242,12 +1233,15 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *WideTy = IntegerType::get(getContext(), BitWidth * 2); // Check whether Start+Step*MaxBECount has no signed overflow. 
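The zero-extend rewrite above (and the sign-extend twin that follows) proves no-unsigned-wrap by recomputing Start + Step*MaxBECount in a type twice as wide and comparing it against the narrow result extended afterwards; the two agree exactly when nothing wrapped. The same test in plain integer arithmetic, as a minimal sketch with uint8_t standing in for the AddRec's type:

```cpp
#include <cstdint>

// uint8_t plays the AddRec's type, uint16_t the double-width WideTy. The
// real code trims MaxBECount first so the wide product itself cannot wrap.
bool wrapsU8(uint8_t Start, uint8_t Step, uint8_t N) {
  uint8_t Narrow = (uint8_t)(Start + Step * N);         // narrow math, may wrap
  uint16_t ZAdd = Narrow;                               // zext(Start + Step*N)
  uint16_t Wide = (uint16_t)Start + (uint16_t)Step * N; // zext first, then add
  return ZAdd != Wide; // equal iff the narrow computation did not wrap
}

int main() {
  // 10 + 5*2 = 20 fits; 200 + 30*2 = 260 wraps to 4 in 8 bits.
  return (!wrapsU8(10, 5, 2) && wrapsU8(200, 30, 2)) ? 0 : 1;
}
```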
const SCEV *SMul = getMulExpr(CastedMaxBECount, Step); - const SCEV *Add = getAddExpr(Start, SMul); + const SCEV *SAdd = getSignExtendExpr(getAddExpr(Start, SMul), WideTy); + const SCEV *WideStart = getSignExtendExpr(Start, WideTy); + const SCEV *WideMaxBECount = + getZeroExtendExpr(CastedMaxBECount, WideTy); const SCEV *OperandExtendedAdd = - getAddExpr(getSignExtendExpr(Start, WideTy), - getMulExpr(getZeroExtendExpr(CastedMaxBECount, WideTy), + getAddExpr(WideStart, + getMulExpr(WideMaxBECount, getSignExtendExpr(Step, WideTy))); - if (getSignExtendExpr(Add, WideTy) == OperandExtendedAdd) { + if (SAdd == OperandExtendedAdd) { // Cache knowledge of AR NSW, which is propagated to this AddRec. const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW); // Return the expression with the addrec on the outside. @@ -1257,13 +1251,11 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, } // Similar to above, only this time treat the step value as unsigned. // This covers loops that count up with an unsigned step. - const SCEV *UMul = getMulExpr(CastedMaxBECount, Step); - Add = getAddExpr(Start, UMul); OperandExtendedAdd = - getAddExpr(getSignExtendExpr(Start, WideTy), - getMulExpr(getZeroExtendExpr(CastedMaxBECount, WideTy), + getAddExpr(WideStart, + getMulExpr(WideMaxBECount, getZeroExtendExpr(Step, WideTy))); - if (getSignExtendExpr(Add, WideTy) == OperandExtendedAdd) { + if (SAdd == OperandExtendedAdd) { // Cache knowledge of AR NSW, which is propagated to this AddRec. const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW); // Return the expression with the addrec on the outside. @@ -1345,13 +1337,6 @@ const SCEV *ScalarEvolution::getAnyExtendExpr(const SCEV *Op, return getAddRecExpr(Ops, AR->getLoop(), SCEV::FlagNW); } - // As a special case, fold anyext(undef) to undef. We don't want to - // know too much about SCEVUnknowns, but this special case is handy - // and harmless. - if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Op)) - if (isa<UndefValue>(U->getValue())) - return getSCEV(UndefValue::get(Ty)); - // If the expression is obviously signed, use the sext cast value. if (isa<SCEVSMaxExpr>(Op)) return SExt; @@ -1839,7 +1824,7 @@ static uint64_t umul_ov(uint64_t i, uint64_t j, bool &Overflow) { /// Compute the result of "n choose k", the binomial coefficient. If an /// intermediate computation overflows, Overflow will be set and the return will -/// be garbage. Overflow is not cleared on absense of overflow. +/// be garbage. Overflow is not cleared on absence of overflow. static uint64_t Choose(uint64_t n, uint64_t k, bool &Overflow) { // We use the multiplicative formula: // n(n-1)(n-2)...(n-(k-1)) / k(k-1)(k-2)...1 . @@ -2038,63 +2023,67 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops, for (unsigned OtherIdx = Idx+1; OtherIdx < Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]); ++OtherIdx) { - if (AddRecLoop == cast<SCEVAddRecExpr>(Ops[OtherIdx])->getLoop()) { - // {A1,+,A2,+,...,+,An}<L> * {B1,+,B2,+,...,+,Bn}<L> - // = {x=1 in [ sum y=x..2x [ sum z=max(y-x, y-n)..min(x,n) [ - // choose(x, 2x)*choose(2x-y, x-z)*A_{y-z}*B_z - // ]]],+,...up to x=2n}. - // Note that the arguments to choose() are always integers with values - // known at compile time, never SCEV objects. - // - // The implementation avoids pointless extra computations when the two - // addrec's are of different length (mathematically, it's equivalent to - // an infinite stream of zeros on the right). 
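Choose(), whose comment is touched up above, feeds the addrec-multiplication rule that follows. The multiplicative formula can be evaluated so that every intermediate value is an exact integer, with overflow merely latched, matching the documented "not cleared on absence of overflow" contract. A standalone sketch, not the LLVM implementation:

```cpp
#include <cstdint>

// The running value after step i equals C(n-k+i, i), which is always an
// exact integer, so the division below never truncates.
static uint64_t choose(uint64_t n, uint64_t k, bool &Overflow) {
  if (k > n) return 0;
  if (k > n - k) k = n - k; // C(n, k) == C(n, n - k)
  uint64_t r = 1;
  for (uint64_t i = 1; i <= k; ++i) {
    uint64_t f = n - k + i;
    uint64_t t = r * f;
    if (r != 0 && t / r != f)
      Overflow = true; // latched, never cleared, as documented above
    r = t / i;
  }
  return r;
}
```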
- bool OpsModified = false; - for (; OtherIdx != Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]); - ++OtherIdx) - if (const SCEVAddRecExpr *OtherAddRec = - dyn_cast<SCEVAddRecExpr>(Ops[OtherIdx])) - if (OtherAddRec->getLoop() == AddRecLoop) { - bool Overflow = false; - Type *Ty = AddRec->getType(); - bool LargerThan64Bits = getTypeSizeInBits(Ty) > 64; - SmallVector<const SCEV*, 7> AddRecOps; - for (int x = 0, xe = AddRec->getNumOperands() + - OtherAddRec->getNumOperands() - 1; - x != xe && !Overflow; ++x) { - const SCEV *Term = getConstant(Ty, 0); - for (int y = x, ye = 2*x+1; y != ye && !Overflow; ++y) { - uint64_t Coeff1 = Choose(x, 2*x - y, Overflow); - for (int z = std::max(y-x, y-(int)AddRec->getNumOperands()+1), - ze = std::min(x+1, (int)OtherAddRec->getNumOperands()); - z < ze && !Overflow; ++z) { - uint64_t Coeff2 = Choose(2*x - y, x-z, Overflow); - uint64_t Coeff; - if (LargerThan64Bits) - Coeff = umul_ov(Coeff1, Coeff2, Overflow); - else - Coeff = Coeff1*Coeff2; - const SCEV *CoeffTerm = getConstant(Ty, Coeff); - const SCEV *Term1 = AddRec->getOperand(y-z); - const SCEV *Term2 = OtherAddRec->getOperand(z); - Term = getAddExpr(Term, getMulExpr(CoeffTerm, Term1,Term2)); - } - } - AddRecOps.push_back(Term); - } - if (!Overflow) { - const SCEV *NewAddRec = getAddRecExpr(AddRecOps, - AddRec->getLoop(), - SCEV::FlagAnyWrap); - if (Ops.size() == 2) return NewAddRec; - Ops[Idx] = AddRec = cast<SCEVAddRecExpr>(NewAddRec); - Ops.erase(Ops.begin() + OtherIdx); --OtherIdx; - OpsModified = true; - } + if (AddRecLoop != cast<SCEVAddRecExpr>(Ops[OtherIdx])->getLoop()) + continue; + + // {A1,+,A2,+,...,+,An}<L> * {B1,+,B2,+,...,+,Bn}<L> + // = {x=1 in [ sum y=x..2x [ sum z=max(y-x, y-n)..min(x,n) [ + // choose(x, 2x)*choose(2x-y, x-z)*A_{y-z}*B_z + // ]]],+,...up to x=2n}. + // Note that the arguments to choose() are always integers with values + // known at compile time, never SCEV objects. + // + // The implementation avoids pointless extra computations when the two + // addrec's are of different length (mathematically, it's equivalent to + // an infinite stream of zeros on the right). 
+ bool OpsModified = false; + for (; OtherIdx != Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]); + ++OtherIdx) { + const SCEVAddRecExpr *OtherAddRec = + dyn_cast<SCEVAddRecExpr>(Ops[OtherIdx]); + if (!OtherAddRec || OtherAddRec->getLoop() != AddRecLoop) + continue; + + bool Overflow = false; + Type *Ty = AddRec->getType(); + bool LargerThan64Bits = getTypeSizeInBits(Ty) > 64; + SmallVector<const SCEV*, 7> AddRecOps; + for (int x = 0, xe = AddRec->getNumOperands() + + OtherAddRec->getNumOperands() - 1; x != xe && !Overflow; ++x) { + const SCEV *Term = getConstant(Ty, 0); + for (int y = x, ye = 2*x+1; y != ye && !Overflow; ++y) { + uint64_t Coeff1 = Choose(x, 2*x - y, Overflow); + for (int z = std::max(y-x, y-(int)AddRec->getNumOperands()+1), + ze = std::min(x+1, (int)OtherAddRec->getNumOperands()); + z < ze && !Overflow; ++z) { + uint64_t Coeff2 = Choose(2*x - y, x-z, Overflow); + uint64_t Coeff; + if (LargerThan64Bits) + Coeff = umul_ov(Coeff1, Coeff2, Overflow); + else + Coeff = Coeff1*Coeff2; + const SCEV *CoeffTerm = getConstant(Ty, Coeff); + const SCEV *Term1 = AddRec->getOperand(y-z); + const SCEV *Term2 = OtherAddRec->getOperand(z); + Term = getAddExpr(Term, getMulExpr(CoeffTerm, Term1,Term2)); } - if (OpsModified) - return getMulExpr(Ops); + } + AddRecOps.push_back(Term); + } + if (!Overflow) { + const SCEV *NewAddRec = getAddRecExpr(AddRecOps, AddRec->getLoop(), + SCEV::FlagAnyWrap); + if (Ops.size() == 2) return NewAddRec; + Ops[Idx] = NewAddRec; + Ops.erase(Ops.begin() + OtherIdx); --OtherIdx; + OpsModified = true; + AddRec = dyn_cast<SCEVAddRecExpr>(NewAddRec); + if (!AddRec) + break; + } } + if (OpsModified) + return getMulExpr(Ops); } // Otherwise couldn't fold anything into this recurrence. Move onto the @@ -2723,7 +2712,7 @@ const SCEV *ScalarEvolution::getCouldNotCompute() { const SCEV *ScalarEvolution::getSCEV(Value *V) { assert(isSCEVable(V->getType()) && "Value is not SCEVable!"); - ValueExprMapType::const_iterator I = ValueExprMap.find(V); + ValueExprMapType::const_iterator I = ValueExprMap.find_as(V); if (I != ValueExprMap.end()) return I->second; const SCEV *S = createSCEV(V); @@ -2960,7 +2949,7 @@ ScalarEvolution::ForgetSymbolicName(Instruction *PN, const SCEV *SymName) { if (!Visited.insert(I)) continue; ValueExprMapType::iterator It = - ValueExprMap.find(static_cast<Value *>(I)); + ValueExprMap.find_as(static_cast<Value *>(I)); if (It != ValueExprMap.end()) { const SCEV *Old = It->second; @@ -3017,7 +3006,7 @@ const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) { if (BEValueV && StartValueV) { // While we are analyzing this PHI node, handle its value symbolically. 
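The find_as calls introduced above (and in forgetLoop/forgetValue below) use DenseMap's alternate-key lookup: the map stays keyed by SCEVCallbackVH, but queries pass a raw Value* and never construct a value handle. The pattern, reduced to a self-contained example where OwnedKey is a hypothetical stand-in for the expensive key type:

```cpp
#include "llvm/ADT/DenseMap.h"
#include <cstring>

struct OwnedKey { const char *Str; };

namespace llvm {
template <> struct DenseMapInfo<OwnedKey> {
  static OwnedKey getEmptyKey() { OwnedKey K = { (const char *)-1 }; return K; }
  static OwnedKey getTombstoneKey() { OwnedKey K = { (const char *)-2 }; return K; }
  static bool isSentinel(const OwnedKey &K) {
    return K.Str == (const char *)-1 || K.Str == (const char *)-2;
  }
  // Hash/compare for the stored key type...
  static unsigned getHashValue(const OwnedKey &K) { return getHashValue(K.Str); }
  static bool isEqual(const OwnedKey &A, const OwnedKey &B) {
    if (isSentinel(A) || isSentinel(B)) return A.Str == B.Str;
    return std::strcmp(A.Str, B.Str) == 0;
  }
  // ...plus the overloads that make find_as(const char *) legal.
  static unsigned getHashValue(const char *S) {
    unsigned H = 5381;
    while (*S) H = 33 * H + (unsigned char)*S++;
    return H;
  }
  static bool isEqual(const char *A, const OwnedKey &B) {
    return !isSentinel(B) && std::strcmp(A, B.Str) == 0;
  }
};
}

int main() {
  llvm::DenseMap<OwnedKey, int> M;
  OwnedKey K = { "exit-count" };
  M[K] = 7;
  // No OwnedKey is materialized for the query, just as ValueExprMap.find_as(V)
  // avoids constructing a SCEVCallbackVH per lookup.
  return M.find_as("exit-count") != M.end() ? 0 : 1;
}
```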
const SCEV *SymbolicName = getUnknown(PN); - assert(ValueExprMap.find(PN) == ValueExprMap.end() && + assert(ValueExprMap.find_as(PN) == ValueExprMap.end() && "PHI node already processed?"); ValueExprMap.insert(std::make_pair(SCEVCallbackVH(PN, this), SymbolicName)); @@ -4081,7 +4070,7 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) { if (!Visited.insert(I)) continue; ValueExprMapType::iterator It = - ValueExprMap.find(static_cast<Value *>(I)); + ValueExprMap.find_as(static_cast<Value *>(I)); if (It != ValueExprMap.end()) { const SCEV *Old = It->second; @@ -4132,7 +4121,8 @@ void ScalarEvolution::forgetLoop(const Loop *L) { Instruction *I = Worklist.pop_back_val(); if (!Visited.insert(I)) continue; - ValueExprMapType::iterator It = ValueExprMap.find(static_cast<Value *>(I)); + ValueExprMapType::iterator It = + ValueExprMap.find_as(static_cast<Value *>(I)); if (It != ValueExprMap.end()) { forgetMemoizedResults(It->second); ValueExprMap.erase(It); @@ -4165,7 +4155,8 @@ void ScalarEvolution::forgetValue(Value *V) { I = Worklist.pop_back_val(); if (!Visited.insert(I)) continue; - ValueExprMapType::iterator It = ValueExprMap.find(static_cast<Value *>(I)); + ValueExprMapType::iterator It = + ValueExprMap.find_as(static_cast<Value *>(I)); if (It != ValueExprMap.end()) { forgetMemoizedResults(It->second); ValueExprMap.erase(It); @@ -5481,7 +5472,7 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L) { // to 0, it must be counting down to equal 0. Consequently, N = Start / -Step. // We have not yet seen any such cases. const SCEVConstant *StepC = dyn_cast<SCEVConstant>(Step); - if (StepC == 0) + if (StepC == 0 || StepC->getValue()->equalsInt(0)) return getCouldNotCompute(); // For positive steps (counting up until unsigned overflow): @@ -5602,9 +5593,14 @@ static bool HasSameValue(const SCEV *A, const SCEV *B) { /// predicate Pred. Return true iff any changes were made. /// bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred, - const SCEV *&LHS, const SCEV *&RHS) { + const SCEV *&LHS, const SCEV *&RHS, + unsigned Depth) { bool Changed = false; + // If we hit the max recursion limit bail out. + if (Depth >= 3) + return false; + // Canonicalize a constant to the right side. if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS)) { // Check for both operands constant. @@ -5642,6 +5638,16 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred, default: llvm_unreachable("Unexpected ICmpInst::Predicate value!"); case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_NE: + // Fold ((-1) * %a) + %b == 0 (equivalent to %b-%a == 0) into %a == %b. + if (!RA) + if (const SCEVAddExpr *AE = dyn_cast<SCEVAddExpr>(LHS)) + if (const SCEVMulExpr *ME = dyn_cast<SCEVMulExpr>(AE->getOperand(0))) + if (AE->getNumOperands() == 2 && ME->getNumOperands() == 2 && + ME->getOperand(0)->isAllOnesValue()) { + RHS = AE->getOperand(1); + LHS = ME->getOperand(1); + Changed = true; + } break; case ICmpInst::ICMP_UGE: if ((RA - 1).isMinValue()) { @@ -5843,6 +5849,11 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred, // TODO: More simplifications are possible here. + // Recursively simplify until we either hit a recursion limit or nothing + // changes. + if (Changed) + return SimplifyICmpOperands(Pred, LHS, RHS, Depth+1); + return Changed; trivially_true: @@ -6040,12 +6051,34 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L, return false; } +/// RAII wrapper to prevent recursive application of isImpliedCond. 
+/// ScalarEvolution's PendingLoopPredicates set must be empty unless we are +/// currently evaluating isImpliedCond. +struct MarkPendingLoopPredicate { + Value *Cond; + DenseSet<Value*> &LoopPreds; + bool Pending; + + MarkPendingLoopPredicate(Value *C, DenseSet<Value*> &LP) + : Cond(C), LoopPreds(LP) { + Pending = !LoopPreds.insert(Cond).second; + } + ~MarkPendingLoopPredicate() { + if (!Pending) + LoopPreds.erase(Cond); + } +}; + /// isImpliedCond - Test whether the condition described by Pred, LHS, /// and RHS is true whenever the given Cond value evaluates to true. bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, Value *FoundCondValue, bool Inverse) { + MarkPendingLoopPredicate Mark(FoundCondValue, PendingLoopPredicates); + if (Mark.Pending) + return false; + // Recursively handle And and Or conditions. if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FoundCondValue)) { if (BO->getOpcode() == Instruction::And) { @@ -6572,6 +6605,8 @@ void ScalarEvolution::releaseMemory() { I->second.clear(); } + assert(PendingLoopPredicates.empty() && "isImpliedCond garbage"); + BackedgeTakenCounts.clear(); ConstantEvolutionLoopExitValue.clear(); ValuesAtScopes.clear(); @@ -6859,44 +6894,27 @@ bool ScalarEvolution::properlyDominates(const SCEV *S, const BasicBlock *BB) { return getBlockDisposition(S, BB) == ProperlyDominatesBlock; } -bool ScalarEvolution::hasOperand(const SCEV *S, const SCEV *Op) const { - switch (S->getSCEVType()) { - case scConstant: - return false; - case scTruncate: - case scZeroExtend: - case scSignExtend: { - const SCEVCastExpr *Cast = cast<SCEVCastExpr>(S); - const SCEV *CastOp = Cast->getOperand(); - return Op == CastOp || hasOperand(CastOp, Op); - } - case scAddRecExpr: - case scAddExpr: - case scMulExpr: - case scUMaxExpr: - case scSMaxExpr: { - const SCEVNAryExpr *NAry = cast<SCEVNAryExpr>(S); - for (SCEVNAryExpr::op_iterator I = NAry->op_begin(), E = NAry->op_end(); - I != E; ++I) { - const SCEV *NAryOp = *I; - if (NAryOp == Op || hasOperand(NAryOp, Op)) - return true; - } - return false; - } - case scUDivExpr: { - const SCEVUDivExpr *UDiv = cast<SCEVUDivExpr>(S); - const SCEV *LHS = UDiv->getLHS(), *RHS = UDiv->getRHS(); - return LHS == Op || hasOperand(LHS, Op) || - RHS == Op || hasOperand(RHS, Op); - } - case scUnknown: - return false; - case scCouldNotCompute: - llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); - default: - llvm_unreachable("Unknown SCEV kind!"); +namespace { +// Search for a SCEV expression node within an expression tree. +// Implements SCEVTraversal::Visitor. +struct SCEVSearch { + const SCEV *Node; + bool IsFound; + + SCEVSearch(const SCEV *N): Node(N), IsFound(false) {} + + bool follow(const SCEV *S) { + IsFound |= (S == Node); + return !IsFound; } + bool isDone() const { return IsFound; } +}; +} + +bool ScalarEvolution::hasOperand(const SCEV *S, const SCEV *Op) const { + SCEVSearch Search(Op); + visitAll(S, Search); + return Search.IsFound; } void ScalarEvolution::forgetMemoizedResults(const SCEV *S) { diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp index 69507be..62710c5 100644 --- a/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/lib/Analysis/ScalarEvolutionExpander.cpp @@ -37,7 +37,7 @@ Value *SCEVExpander::ReuseOrCreateCast(Value *V, Type *Ty, // We use this precondition to produce a cast that will dominate all its // uses. 
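Both SCEVSearch above and SCEVFindUnsafe later in this commit implement the same small traversal protocol: follow() decides whether to descend into a node's operands and isDone() lets the walk stop early. Reduced to a toy worklist over a hypothetical Node type:

```cpp
#include <vector>

// Node is a made-up tree; llvm's SCEVTraversal walks SCEV operands the same
// way, with a worklist instead of recursion.
struct Node { int Tag; std::vector<const Node *> Kids; };

template <typename Visitor>
void visitAllNodes(const Node *Root, Visitor &V) {
  std::vector<const Node *> Work(1, Root);
  while (!Work.empty() && !V.isDone()) {
    const Node *N = Work.back();
    Work.pop_back();
    if (!V.follow(N))
      continue; // visitor declined to descend
    for (size_t i = 0; i != N->Kids.size(); ++i)
      Work.push_back(N->Kids[i]);
  }
}

// Analogue of SCEVSearch: find a particular node anywhere in the tree.
struct FindTag {
  int Tag;
  bool Found;
  explicit FindTag(int T) : Tag(T), Found(false) {}
  bool follow(const Node *N) {
    Found |= (N->Tag == Tag);
    return !Found; // no need to descend once found
  }
  bool isDone() const { return Found; }
};
```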
In particular, this is crucial for the case where the builder's // insertion point *is* the point where we were asked to put the cast. - // Since we don't know the the builder's insertion point is actually + // Since we don't know the builder's insertion point is actually // where the uses will be added (only that it dominates it), we are // not allowed to move it. BasicBlock::iterator BIP = Builder.GetInsertPoint(); @@ -955,7 +955,8 @@ bool SCEVExpander::hoistIVInc(Instruction *IncV, Instruction *InsertPos) { // InsertPos must itself dominate IncV so that IncV's new position satisfies // its existing users. - if (!SE.DT->dominates(InsertPos->getParent(), IncV->getParent())) + if (isa<PHINode>(InsertPos) + || !SE.DT->dominates(InsertPos->getParent(), IncV->getParent())) return false; // Check that the chain of IV operands leading back to Phi can be hoisted. @@ -1699,3 +1700,44 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT, } return NumElim; } + +namespace { +// Search for a SCEV subexpression that is not safe to expand. Any expression +// that may expand to a !isSafeToSpeculativelyExecute value is unsafe, namely +// UDiv expressions. We don't know if the UDiv is derived from an IR divide +// instruction, but the important thing is that we prove the denominator is +// nonzero before expansion. +// +// IVUsers already checks that IV-derived expressions are safe. So this check is +// only needed when the expression includes some subexpression that is not IV +// derived. +// +// Currently, we only allow division by a nonzero constant here. If this is +// inadequate, we could easily allow division by SCEVUnknown by using +// ValueTracking to check isKnownNonZero(). +struct SCEVFindUnsafe { + bool IsUnsafe; + + SCEVFindUnsafe(): IsUnsafe(false) {} + + bool follow(const SCEV *S) { + const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S); + if (!D) + return true; + const SCEVConstant *SC = dyn_cast<SCEVConstant>(D->getRHS()); + if (SC && !SC->getValue()->isZero()) + return true; + IsUnsafe = true; + return false; + } + bool isDone() const { return IsUnsafe; } +}; +} + +namespace llvm { +bool isSafeToExpand(const SCEV *S) { + SCEVFindUnsafe Search; + visitAll(S, Search); + return !Search.IsUnsafe; +} +} diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index a430f62..cea34e1 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -564,7 +564,7 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, Depth+1); // If it's known zero, our sign bit is also zero. if (LHSKnownZero.isNegative()) - KnownZero |= LHSKnownZero; + KnownZero.setBit(BitWidth - 1); } break; @@ -694,7 +694,7 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, // taking conservative care to avoid excessive recursion. if (Depth < MaxDepth - 1 && !KnownZero && !KnownOne) { // Skip if every incoming value references to ourself. 
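The rationale for SCEVFindUnsafe above, in miniature: a udiv whose divisor is not a known-nonzero constant must not be expanded at an arbitrary point, because expansion can hoist it past the guard that made it safe. A plain C++ stand-in:

```cpp
// The SCEV being expanded corresponds to the division inside the branch.
unsigned guarded(unsigned a, unsigned b) {
  unsigned q = 0;
  if (b != 0)
    q = a / b; // safe: dominated by the b != 0 test
  return q;
}
// An expansion point above the test would execute a / b unconditionally and
// divide by zero whenever b == 0 -- exactly what isSafeToExpand rules out by
// only admitting udivs with a nonzero constant divisor.
```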
- if (P->hasConstantValue() == P) + if (dyn_cast_or_null<UndefValue>(P->hasConstantValue())) break; KnownZero = APInt::getAllOnesValue(BitWidth); @@ -1796,6 +1796,37 @@ llvm::GetUnderlyingObject(Value *V, const TargetData *TD, unsigned MaxLookup) { return V; } +void +llvm::GetUnderlyingObjects(Value *V, + SmallVectorImpl<Value *> &Objects, + const TargetData *TD, + unsigned MaxLookup) { + SmallPtrSet<Value *, 4> Visited; + SmallVector<Value *, 4> Worklist; + Worklist.push_back(V); + do { + Value *P = Worklist.pop_back_val(); + P = GetUnderlyingObject(P, TD, MaxLookup); + + if (!Visited.insert(P)) + continue; + + if (SelectInst *SI = dyn_cast<SelectInst>(P)) { + Worklist.push_back(SI->getTrueValue()); + Worklist.push_back(SI->getFalseValue()); + continue; + } + + if (PHINode *PN = dyn_cast<PHINode>(P)) { + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + Worklist.push_back(PN->getIncomingValue(i)); + continue; + } + + Objects.push_back(P); + } while (!Worklist.empty()); +} + /// onlyUsedByLifetimeMarkers - Return true if the only users of this pointer /// are lifetime markers. /// diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp index 8818168..670c1bb 100644 --- a/lib/AsmParser/LLLexer.cpp +++ b/lib/AsmParser/LLLexer.cpp @@ -474,6 +474,9 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(extern_weak); KEYWORD(external); KEYWORD(thread_local); + KEYWORD(localdynamic); + KEYWORD(initialexec); + KEYWORD(localexec); KEYWORD(zeroinitializer); KEYWORD(undef); KEYWORD(null); @@ -673,11 +676,12 @@ lltok::Kind LLLexer::LexIdentifier() { /// HexFP80Constant 0xK[0-9A-Fa-f]+ /// HexFP128Constant 0xL[0-9A-Fa-f]+ /// HexPPC128Constant 0xM[0-9A-Fa-f]+ +/// HexHalfConstant 0xH[0-9A-Fa-f]+ lltok::Kind LLLexer::Lex0x() { CurPtr = TokStart + 2; char Kind; - if (CurPtr[0] >= 'K' && CurPtr[0] <= 'M') { + if ((CurPtr[0] >= 'K' && CurPtr[0] <= 'M') || CurPtr[0] == 'H') { Kind = *CurPtr++; } else { Kind = 'J'; @@ -718,6 +722,9 @@ lltok::Kind LLLexer::Lex0x() { HexToIntPair(TokStart+3, CurPtr, Pair); APFloatVal = APFloat(APInt(128, Pair)); return lltok::APFloat; + case 'H': + APFloatVal = APFloat(APInt(16,HexIntToVal(TokStart+3, CurPtr))); + return lltok::APFloat; } } diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp index 068be3d..095b7c5 100644 --- a/lib/AsmParser/LLParser.cpp +++ b/lib/AsmParser/LLParser.cpp @@ -645,12 +645,13 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, unsigned Linkage, bool HasLinkage, unsigned Visibility) { unsigned AddrSpace; - bool ThreadLocal, IsConstant, UnnamedAddr; + bool IsConstant, UnnamedAddr; + GlobalVariable::ThreadLocalMode TLM; LocTy UnnamedAddrLoc; LocTy TyLoc; Type *Ty = 0; - if (ParseOptionalToken(lltok::kw_thread_local, ThreadLocal) || + if (ParseOptionalThreadLocal(TLM) || ParseOptionalAddrSpace(AddrSpace) || ParseOptionalToken(lltok::kw_unnamed_addr, UnnamedAddr, &UnnamedAddrLoc) || @@ -691,7 +692,8 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, if (GV == 0) { GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage, 0, - Name, 0, false, AddrSpace); + Name, 0, GlobalVariable::NotThreadLocal, + AddrSpace); } else { if (GV->getType()->getElementType() != Ty) return Error(TyLoc, @@ -710,7 +712,7 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, GV->setConstant(IsConstant); GV->setLinkage((GlobalValue::LinkageTypes)Linkage); GV->setVisibility((GlobalValue::VisibilityTypes)Visibility); - GV->setThreadLocal(ThreadLocal); + 
GV->setThreadLocalMode(TLM); GV->setUnnamedAddr(UnnamedAddr); // Parse attributes on the global. @@ -858,6 +860,46 @@ bool LLParser::ParseUInt32(unsigned &Val) { return false; } +/// ParseTLSModel +/// := 'localdynamic' +/// := 'initialexec' +/// := 'localexec' +bool LLParser::ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM) { + switch (Lex.getKind()) { + default: + return TokError("expected localdynamic, initialexec or localexec"); + case lltok::kw_localdynamic: + TLM = GlobalVariable::LocalDynamicTLSModel; + break; + case lltok::kw_initialexec: + TLM = GlobalVariable::InitialExecTLSModel; + break; + case lltok::kw_localexec: + TLM = GlobalVariable::LocalExecTLSModel; + break; + } + + Lex.Lex(); + return false; +} + +/// ParseOptionalThreadLocal +/// := /*empty*/ +/// := 'thread_local' +/// := 'thread_local' '(' tlsmodel ')' +bool LLParser::ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode &TLM) { + TLM = GlobalVariable::NotThreadLocal; + if (!EatIfPresent(lltok::kw_thread_local)) + return false; + + TLM = GlobalVariable::GeneralDynamicTLSModel; + if (Lex.getKind() == lltok::lparen) { + Lex.Lex(); + return ParseTLSModel(TLM) || + ParseToken(lltok::rparen, "expected ')' after thread local model"); + } + return false; +} /// ParseOptionalAddrSpace /// := /*empty*/ @@ -2692,7 +2734,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { if (FuncAttrs != Attribute::None) Attrs.push_back(AttributeWithIndex::get(~0, FuncAttrs)); - AttrListPtr PAL = AttrListPtr::get(Attrs.begin(), Attrs.end()); + AttrListPtr PAL = AttrListPtr::get(Attrs); if (PAL.paramHasAttr(1, Attribute::StructRet) && !RetType->isVoidTy()) return Error(RetTypeLoc, "functions with 'sret' argument must return void"); @@ -3239,7 +3281,7 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) { Attrs.push_back(AttributeWithIndex::get(~0, FnAttrs)); // Finish off the Attributes and check them - AttrListPtr PAL = AttrListPtr::get(Attrs.begin(), Attrs.end()); + AttrListPtr PAL = AttrListPtr::get(Attrs); InvokeInst *II = InvokeInst::Create(Callee, NormalBB, UnwindBB, Args); II->setCallingConv(CC); @@ -3635,7 +3677,7 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, Attrs.push_back(AttributeWithIndex::get(~0, FnAttrs)); // Finish off the Attributes and check them - AttrListPtr PAL = AttrListPtr::get(Attrs.begin(), Attrs.end()); + AttrListPtr PAL = AttrListPtr::get(Attrs); CallInst *CI = CallInst::Create(Callee, Args); CI->setTailCall(isTail); diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h index dda8808..257c726 100644 --- a/lib/AsmParser/LLParser.h +++ b/lib/AsmParser/LLParser.h @@ -171,6 +171,9 @@ namespace llvm { Loc = Lex.getLoc(); return ParseUInt32(Val); } + + bool ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM); + bool ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode &TLM); bool ParseOptionalAddrSpace(unsigned &AddrSpace); bool ParseOptionalAttrs(Attributes &Attrs, unsigned AttrKind); bool ParseOptionalLinkage(unsigned &Linkage, bool &HasLinkage); diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h index adf5d4f..0461e7b 100644 --- a/lib/AsmParser/LLToken.h +++ b/lib/AsmParser/LLToken.h @@ -44,13 +44,14 @@ namespace lltok { kw_unnamed_addr, kw_extern_weak, kw_external, kw_thread_local, + kw_localdynamic, kw_initialexec, kw_localexec, kw_zeroinitializer, kw_undef, kw_null, kw_to, kw_tail, kw_target, kw_triple, - kw_unwind, + kw_unwind, kw_deplibs, kw_datalayout, kw_volatile, diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp 
b/lib/Bitcode/Reader/BitcodeReader.cpp index e399040..4ffee38 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -28,6 +28,10 @@ #include "llvm/OperandTraits.h" using namespace llvm; +enum { + SWITCH_INST_MAGIC = 0x4B5 // May 2012 => 1205 => Hex +}; + void BitcodeReader::materializeForwardReferencedFunctions() { while (!BlockAddrFwdRefs.empty()) { Function *F = BlockAddrFwdRefs.begin()->first; @@ -57,7 +61,7 @@ void BitcodeReader::FreeState() { /// ConvertToString - Convert a string from a record into an std::string, return /// true on failure. template<typename StrTy> -static bool ConvertToString(SmallVector<uint64_t, 64> &Record, unsigned Idx, +static bool ConvertToString(ArrayRef<uint64_t> Record, unsigned Idx, StrTy &Result) { if (Idx > Record.size()) return true; @@ -98,6 +102,17 @@ static GlobalValue::VisibilityTypes GetDecodedVisibility(unsigned Val) { } } +static GlobalVariable::ThreadLocalMode GetDecodedThreadLocalMode(unsigned Val) { + switch (Val) { + case 0: return GlobalVariable::NotThreadLocal; + default: // Map unknown non-zero value to general dynamic. + case 1: return GlobalVariable::GeneralDynamicTLSModel; + case 2: return GlobalVariable::LocalDynamicTLSModel; + case 3: return GlobalVariable::InitialExecTLSModel; + case 4: return GlobalVariable::LocalExecTLSModel; + } +} + static int GetDecodedCastOpcode(unsigned Val) { switch (Val) { default: return -1; @@ -458,61 +473,19 @@ bool BitcodeReader::ParseAttributeBlock() { if (Record.size() & 1) return Error("Invalid ENTRY record"); - // FIXME : Remove this autoupgrade code in LLVM 3.0. - // If Function attributes are using index 0 then transfer them - // to index ~0. Index 0 is used for return value attributes but used to be - // used for function attributes. - Attributes RetAttribute; - Attributes FnAttribute; for (unsigned i = 0, e = Record.size(); i != e; i += 2) { - // FIXME: remove in LLVM 3.0 - // The alignment is stored as a 16-bit raw value from bits 31--16. - // We shift the bits above 31 down by 11 bits. - - unsigned Alignment = (Record[i+1] & (0xffffull << 16)) >> 16; - if (Alignment && !isPowerOf2_32(Alignment)) - return Error("Alignment is not a power of two."); - - Attributes ReconstitutedAttr(Record[i+1] & 0xffff); - if (Alignment) - ReconstitutedAttr |= Attribute::constructAlignmentFromInt(Alignment); - ReconstitutedAttr |= - Attributes((Record[i+1] & (0xffffull << 32)) >> 11); - + Attributes ReconstitutedAttr = + Attribute::decodeLLVMAttributesForBitcode(Record[i+1]); Record[i+1] = ReconstitutedAttr.Raw(); - if (Record[i] == 0) - RetAttribute = ReconstitutedAttr; - else if (Record[i] == ~0U) - FnAttribute = ReconstitutedAttr; - } - - Attributes OldRetAttrs = (Attribute::NoUnwind|Attribute::NoReturn| - Attribute::ReadOnly|Attribute::ReadNone); - - if (FnAttribute == Attribute::None && RetAttribute != Attribute::None && - (RetAttribute & OldRetAttrs)) { - if (FnAttribute == Attribute::None) { // add a slot so they get added. 
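GetDecodedThreadLocalMode below is the bitcode half of the TLS-model support threaded through the LLParser earlier in this commit: textual IR such as "@x = thread_local(initialexec) global i32 0" now round-trips as an enum rather than a single bool. On the C++ API side the mode rides on the GlobalVariable constructor, as in this minimal sketch against the 3.1-era headers (they moved under llvm/IR/ in later releases):

```cpp
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Constants.h"

int main() {
  llvm::LLVMContext Ctx;
  llvm::Module M("tls-demo", Ctx);
  llvm::Type *I32 = llvm::Type::getInt32Ty(Ctx);
  // Equivalent of "@x = thread_local(initialexec) global i32 0".
  new llvm::GlobalVariable(M, I32, /*isConstant=*/false,
                           llvm::GlobalValue::ExternalLinkage,
                           llvm::ConstantInt::get(I32, 0), "x",
                           /*InsertBefore=*/0,
                           llvm::GlobalVariable::InitialExecTLSModel);
  return 0; // the module owns and deletes the global
}
```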
- Record.push_back(~0U); - Record.push_back(0); - } - - FnAttribute |= RetAttribute & OldRetAttrs; - RetAttribute &= ~OldRetAttrs; } for (unsigned i = 0, e = Record.size(); i != e; i += 2) { - if (Record[i] == 0) { - if (RetAttribute != Attribute::None) - Attrs.push_back(AttributeWithIndex::get(0, RetAttribute)); - } else if (Record[i] == ~0U) { - if (FnAttribute != Attribute::None) - Attrs.push_back(AttributeWithIndex::get(~0U, FnAttribute)); - } else if (Attributes(Record[i+1]) != Attribute::None) + if (Attributes(Record[i+1]) != Attribute::None) Attrs.push_back(AttributeWithIndex::get(Record[i], Attributes(Record[i+1]))); } - MAttributes.push_back(AttrListPtr::get(Attrs.begin(), Attrs.end())); + MAttributes.push_back(AttrListPtr::get(Attrs)); Attrs.clear(); break; } @@ -621,7 +594,7 @@ bool BitcodeReader::ParseTypeTableBody() { break; } case bitc::TYPE_CODE_FUNCTION_OLD: { - // FIXME: attrid is dead, remove it in LLVM 3.0 + // FIXME: attrid is dead, remove it in LLVM 4.0 // FUNCTION: [vararg, attrid, retty, paramty x N] if (Record.size() < 3) return Error("Invalid FUNCTION type record"); @@ -851,11 +824,7 @@ bool BitcodeReader::ParseMetadata() { break; case bitc::METADATA_NAME: { // Read named of the named metadata. - unsigned NameLength = Record.size(); - SmallString<8> Name; - Name.resize(NameLength); - for (unsigned i = 0; i != NameLength; ++i) - Name[i] = Record[i]; + SmallString<8> Name(Record.begin(), Record.end()); Record.clear(); Code = Stream.ReadCode(); @@ -899,26 +868,18 @@ bool BitcodeReader::ParseMetadata() { break; } case bitc::METADATA_STRING: { - unsigned MDStringLength = Record.size(); - SmallString<8> String; - String.resize(MDStringLength); - for (unsigned i = 0; i != MDStringLength; ++i) - String[i] = Record[i]; - Value *V = MDString::get(Context, - StringRef(String.data(), String.size())); + SmallString<8> String(Record.begin(), Record.end()); + Value *V = MDString::get(Context, String); MDValueList.AssignValue(V, NextMDValueNo++); break; } case bitc::METADATA_KIND: { - unsigned RecordLength = Record.size(); - if (Record.empty() || RecordLength < 2) + if (Record.size() < 2) return Error("Invalid METADATA_KIND record"); - SmallString<8> Name; - Name.resize(RecordLength-1); + unsigned Kind = Record[0]; - for (unsigned i = 1; i != RecordLength; ++i) - Name[i-1] = Record[i]; - + SmallString<8> Name(Record.begin()+1, Record.end()); + unsigned NewKind = TheModule->getMDKindID(Name.str()); if (!MDKindMap.insert(std::make_pair(Kind, NewKind)).second) return Error("Conflicting METADATA_KIND records"); @@ -977,6 +938,14 @@ bool BitcodeReader::ResolveGlobalAndAliasInits() { return false; } +static APInt ReadWideAPInt(ArrayRef<uint64_t> Vals, unsigned TypeBits) { + SmallVector<uint64_t, 8> Words(Vals.size()); + std::transform(Vals.begin(), Vals.end(), Words.begin(), + DecodeSignRotatedValue); + + return APInt(TypeBits, Words); +} + bool BitcodeReader::ParseConstants() { if (Stream.EnterSubBlock(bitc::CONSTANTS_BLOCK_ID)) return Error("Malformed block record"); @@ -1032,14 +1001,10 @@ bool BitcodeReader::ParseConstants() { if (!CurTy->isIntegerTy() || Record.empty()) return Error("Invalid WIDE_INTEGER record"); - unsigned NumWords = Record.size(); - SmallVector<uint64_t, 8> Words; - Words.resize(NumWords); - for (unsigned i = 0; i != NumWords; ++i) - Words[i] = DecodeSignRotatedValue(Record[i]); - V = ConstantInt::get(Context, - APInt(cast<IntegerType>(CurTy)->getBitWidth(), - Words)); + APInt VInt = ReadWideAPInt(Record, + cast<IntegerType>(CurTy)->getBitWidth()); + V = 
ConstantInt::get(Context, VInt); + break; } case bitc::CST_CODE_FLOAT: { // FLOAT: [fpval] @@ -1098,10 +1063,7 @@ bool BitcodeReader::ParseConstants() { if (Record.empty()) return Error("Invalid CST_STRING record"); - unsigned Size = Record.size(); - SmallString<16> Elts; - for (unsigned i = 0; i != Size; ++i) - Elts.push_back(Record[i]); + SmallString<16> Elts(Record.begin(), Record.end()); V = ConstantDataArray::getString(Context, Elts, BitCode == bitc::CST_CODE_CSTRING); break; @@ -1138,23 +1100,16 @@ bool BitcodeReader::ParseConstants() { else V = ConstantDataArray::get(Context, Elts); } else if (EltTy->isFloatTy()) { - SmallVector<float, 16> Elts; - for (unsigned i = 0; i != Size; ++i) { - union { uint32_t I; float F; }; - I = Record[i]; - Elts.push_back(F); - } + SmallVector<float, 16> Elts(Size); + std::transform(Record.begin(), Record.end(), Elts.begin(), BitsToFloat); if (isa<VectorType>(CurTy)) V = ConstantDataVector::get(Context, Elts); else V = ConstantDataArray::get(Context, Elts); } else if (EltTy->isDoubleTy()) { - SmallVector<double, 16> Elts; - for (unsigned i = 0; i != Size; ++i) { - union { uint64_t I; double F; }; - I = Record[i]; - Elts.push_back(F); - } + SmallVector<double, 16> Elts(Size); + std::transform(Record.begin(), Record.end(), Elts.begin(), + BitsToDouble); if (isa<VectorType>(CurTy)) V = ConstantDataVector::get(Context, Elts); else @@ -1600,9 +1555,10 @@ bool BitcodeReader::ParseModule(bool Resume) { GlobalValue::VisibilityTypes Visibility = GlobalValue::DefaultVisibility; if (Record.size() > 6) Visibility = GetDecodedVisibility(Record[6]); - bool isThreadLocal = false; + + GlobalVariable::ThreadLocalMode TLM = GlobalVariable::NotThreadLocal; if (Record.size() > 7) - isThreadLocal = Record[7]; + TLM = GetDecodedThreadLocalMode(Record[7]); bool UnnamedAddr = false; if (Record.size() > 8) @@ -1610,12 +1566,11 @@ bool BitcodeReader::ParseModule(bool Resume) { GlobalVariable *NewGV = new GlobalVariable(*TheModule, Ty, isConstant, Linkage, 0, "", 0, - isThreadLocal, AddressSpace); + TLM, AddressSpace); NewGV->setAlignment(Alignment); if (!Section.empty()) NewGV->setSection(Section); NewGV->setVisibility(Visibility); - NewGV->setThreadLocal(isThreadLocal); NewGV->setUnnamedAddr(UnnamedAddr); ValueList.push_back(NewGV); @@ -1732,7 +1687,7 @@ bool BitcodeReader::ParseBitcodeInto(Module *M) { // have to read and ignore these final 4 bytes :-( if (Stream.GetAbbrevIDWidth() == 2 && Code == 2 && Stream.Read(6) == 2 && Stream.Read(24) == 0xa0a0a && - Stream.AtEndOfStream()) + Stream.AtEndOfStream()) return false; return Error("Invalid record at top-level"); @@ -2271,6 +2226,65 @@ bool BitcodeReader::ParseFunctionBody(Function *F) { break; } case bitc::FUNC_CODE_INST_SWITCH: { // SWITCH: [opty, op0, op1, ...] + // Check magic + if ((Record[0] >> 16) == SWITCH_INST_MAGIC) { + // New SwitchInst format with case ranges. 
+ + Type *OpTy = getTypeByID(Record[1]); + unsigned ValueBitWidth = cast<IntegerType>(OpTy)->getBitWidth(); + + Value *Cond = getFnValueByID(Record[2], OpTy); + BasicBlock *Default = getBasicBlock(Record[3]); + if (OpTy == 0 || Cond == 0 || Default == 0) + return Error("Invalid SWITCH record"); + + unsigned NumCases = Record[4]; + + SwitchInst *SI = SwitchInst::Create(Cond, Default, NumCases); + InstructionList.push_back(SI); + + unsigned CurIdx = 5; + for (unsigned i = 0; i != NumCases; ++i) { + IntegersSubsetToBB CaseBuilder; + unsigned NumItems = Record[CurIdx++]; + for (unsigned ci = 0; ci != NumItems; ++ci) { + bool isSingleNumber = Record[CurIdx++]; + + APInt Low; + unsigned ActiveWords = 1; + if (ValueBitWidth > 64) + ActiveWords = Record[CurIdx++]; + Low = ReadWideAPInt(makeArrayRef(&Record[CurIdx], ActiveWords), + ValueBitWidth); + CurIdx += ActiveWords; + + if (!isSingleNumber) { + ActiveWords = 1; + if (ValueBitWidth > 64) + ActiveWords = Record[CurIdx++]; + APInt High = + ReadWideAPInt(makeArrayRef(&Record[CurIdx], ActiveWords), + ValueBitWidth); + + CaseBuilder.add(IntItem::fromType(OpTy, Low), + IntItem::fromType(OpTy, High)); + CurIdx += ActiveWords; + } else + CaseBuilder.add(IntItem::fromType(OpTy, Low)); + } + BasicBlock *DestBB = getBasicBlock(Record[CurIdx++]); + IntegersSubset Case = CaseBuilder.getCase(); + SI->addCase(Case, DestBB); + } + uint16_t Hash = SI->hash(); + if (Hash != (Record[0] & 0xFFFF)) + return Error("Invalid SWITCH record"); + I = SI; + break; + } + + // Old SwitchInst format without case ranges. + if (Record.size() < 3 || (Record.size() & 1) == 0) return Error("Invalid SWITCH record"); Type *OpTy = getTypeByID(Record[0]); diff --git a/lib/Bitcode/Reader/CMakeLists.txt b/lib/Bitcode/Reader/CMakeLists.txt index 693d431..dfe7e10 100644 --- a/lib/Bitcode/Reader/CMakeLists.txt +++ b/lib/Bitcode/Reader/CMakeLists.txt @@ -2,3 +2,5 @@ add_llvm_library(LLVMBitReader BitReader.cpp BitcodeReader.cpp ) + +add_dependencies(LLVMBitReader intrinsics_gen) diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index b25d2e9..5b1725f 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -62,7 +62,10 @@ enum { FUNCTION_INST_CAST_ABBREV, FUNCTION_INST_RET_VOID_ABBREV, FUNCTION_INST_RET_VAL_ABBREV, - FUNCTION_INST_UNREACHABLE_ABBREV + FUNCTION_INST_UNREACHABLE_ABBREV, + + // SwitchInst Magic + SWITCH_INST_MAGIC = 0x4B5 // May 2012 => 1205 => Hex }; static unsigned GetEncodedCastOpcode(unsigned Opcode) { @@ -174,18 +177,7 @@ static void WriteAttributeTable(const ValueEnumerator &VE, for (unsigned i = 0, e = A.getNumSlots(); i != e; ++i) { const AttributeWithIndex &PAWI = A.getSlot(i); Record.push_back(PAWI.Index); - - // FIXME: remove in LLVM 3.0 - // Store the alignment in the bitcode as a 16-bit raw value instead of a - // 5-bit log2 encoded value. Shift the bits above the alignment up by - // 11 bits. 
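For reference, the record shape the new-format switch reader above walks, reconstructed from the code (the field names are descriptive, not from a spec):

```cpp
#include <cstdint>

// Record[0] = (SWITCH_INST_MAGIC << 16) | hash(switch)  ; 0x4B5 marks the new form
// Record[1] = condition type ID
// Record[2] = condition value ID
// Record[3] = default destination block ID
// Record[4] = NumCases, then per case:
//   NumItems, then per item:
//     IsSingleNumber, [ActiveWords when the type is wider than 64 bits],
//     Low word(s) [, High word(s) unless IsSingleNumber]
//   and finally the case's successor block ID.

// How the reader distinguishes and validates the new form:
bool isNewSwitchRecord(uint64_t R0, uint16_t InstHash) {
  const uint64_t SWITCH_INST_MAGIC = 0x4B5; // May 2012 => 1205 => Hex
  return (R0 >> 16) == SWITCH_INST_MAGIC && (R0 & 0xFFFF) == InstHash;
}
```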
- uint64_t FauxAttr = PAWI.Attrs.Raw() & 0xffff; - if (PAWI.Attrs & Attribute::Alignment) - FauxAttr |= (1ull<<16)<< - (((PAWI.Attrs & Attribute::Alignment).Raw()-1) >> 16); - FauxAttr |= (PAWI.Attrs.Raw() & (0x3FFull << 21)) << 11; - - Record.push_back(FauxAttr); + Record.push_back(Attribute::encodeLLVMAttributesForBitcode(PAWI.Attrs)); } Stream.EmitRecord(bitc::PARAMATTR_CODE_ENTRY, Record); @@ -387,6 +379,17 @@ static unsigned getEncodedVisibility(const GlobalValue *GV) { llvm_unreachable("Invalid visibility"); } +static unsigned getEncodedThreadLocalMode(const GlobalVariable *GV) { + switch (GV->getThreadLocalMode()) { + case GlobalVariable::NotThreadLocal: return 0; + case GlobalVariable::GeneralDynamicTLSModel: return 1; + case GlobalVariable::LocalDynamicTLSModel: return 2; + case GlobalVariable::InitialExecTLSModel: return 3; + case GlobalVariable::LocalExecTLSModel: return 4; + } + llvm_unreachable("Invalid TLS model"); +} + // Emit top-level description of module, including target triple, inline asm, // descriptors for global variables, and function prototype info. static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE, @@ -495,7 +498,7 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE, GV->getVisibility() != GlobalValue::DefaultVisibility || GV->hasUnnamedAddr()) { Vals.push_back(getEncodedVisibility(GV)); - Vals.push_back(GV->isThreadLocal()); + Vals.push_back(getEncodedThreadLocalMode(GV)); Vals.push_back(GV->hasUnnamedAddr()); } else { AbbrevToUse = SimpleGVarAbbrev; @@ -719,6 +722,41 @@ static void WriteModuleMetadataStore(const Module *M, BitstreamWriter &Stream) { Stream.ExitBlock(); } +static void EmitAPInt(SmallVectorImpl<uint64_t> &Vals, + unsigned &Code, unsigned &AbbrevToUse, const APInt &Val, + bool EmitSizeForWideNumbers = false + ) { + if (Val.getBitWidth() <= 64) { + uint64_t V = Val.getSExtValue(); + if ((int64_t)V >= 0) + Vals.push_back(V << 1); + else + Vals.push_back((-V << 1) | 1); + Code = bitc::CST_CODE_INTEGER; + AbbrevToUse = CONSTANTS_INTEGER_ABBREV; + } else { + // Wide integers, > 64 bits in size. + // We have an arbitrary precision integer value to write whose + // bit width is > 64. However, in canonical unsigned integer + // format it is likely that the high bits are going to be zero. + // So, we only write the number of active words. + unsigned NWords = Val.getActiveWords(); + + if (EmitSizeForWideNumbers) + Vals.push_back(NWords); + + const uint64_t *RawWords = Val.getRawData(); + for (unsigned i = 0; i != NWords; ++i) { + int64_t V = RawWords[i]; + if (V >= 0) + Vals.push_back(V << 1); + else + Vals.push_back((-V << 1) | 1); + } + Code = bitc::CST_CODE_WIDE_INTEGER; + } +} + static void WriteConstants(unsigned FirstVal, unsigned LastVal, const ValueEnumerator &VE, BitstreamWriter &Stream, bool isGlobal) { @@ -801,30 +839,7 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal, } else if (isa<UndefValue>(C)) { Code = bitc::CST_CODE_UNDEF; } else if (const ConstantInt *IV = dyn_cast<ConstantInt>(C)) { - if (IV->getBitWidth() <= 64) { - uint64_t V = IV->getSExtValue(); - if ((int64_t)V >= 0) - Record.push_back(V << 1); - else - Record.push_back((-V << 1) | 1); - Code = bitc::CST_CODE_INTEGER; - AbbrevToUse = CONSTANTS_INTEGER_ABBREV; - } else { // Wide integers, > 64 bits in size. - // We have an arbitrary precision integer value to write whose - // bit width is > 64. However, in canonical unsigned integer - // format it is likely that the high bits are going to be zero. 
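EmitAPInt, defined below, keeps the existing per-word sign rotation: a non-negative word is stored as V << 1, a negative one as (-V << 1) | 1, so small magnitudes stay small and VBR-encode compactly. A round-trip sketch of the scheme, mirroring DecodeSignRotatedValue on the reader side:

```cpp
#include <cstdint>

uint64_t encodeSignRotated(int64_t V) {
  // For INT64_MIN the real writer relies on two's-complement wrap to emit
  // the reserved value 1 ("-0").
  return V >= 0 ? (uint64_t)V << 1 : ((uint64_t)-V << 1) | 1;
}

int64_t decodeSignRotated(uint64_t V) {
  if ((V & 1) == 0)
    return (int64_t)(V >> 1);
  if (V != 1)
    return -(int64_t)(V >> 1);
  return (int64_t)(1ULL << 63); // "-0" encodes the minimum signed value
}

int main() {
  for (int64_t v = -1000; v <= 1000; ++v)
    if (decodeSignRotated(encodeSignRotated(v)) != v)
      return 1;
  return 0;
}
```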
- // So, we only write the number of active words. - unsigned NWords = IV->getValue().getActiveWords(); - const uint64_t *RawWords = IV->getValue().getRawData(); - for (unsigned i = 0; i != NWords; ++i) { - int64_t V = RawWords[i]; - if (V >= 0) - Record.push_back(V << 1); - else - Record.push_back((-V << 1) | 1); - } - Code = bitc::CST_CODE_WIDE_INTEGER; - } + EmitAPInt(Record, Code, AbbrevToUse, IV->getValue()); } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) { Code = bitc::CST_CODE_FLOAT; Type *Ty = CFP->getType(); @@ -1137,16 +1152,63 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, break; case Instruction::Switch: { + // Redefine Vals, since here we need to use 64 bit values + // explicitly to store large APInt numbers. + SmallVector<uint64_t, 128> Vals64; + Code = bitc::FUNC_CODE_INST_SWITCH; SwitchInst &SI = cast<SwitchInst>(I); - Vals.push_back(VE.getTypeID(SI.getCondition()->getType())); - Vals.push_back(VE.getValueID(SI.getCondition())); - Vals.push_back(VE.getValueID(SI.getDefaultDest())); + + uint32_t SwitchRecordHeader = SI.hash() | (SWITCH_INST_MAGIC << 16); + Vals64.push_back(SwitchRecordHeader); + + Vals64.push_back(VE.getTypeID(SI.getCondition()->getType())); + Vals64.push_back(VE.getValueID(SI.getCondition())); + Vals64.push_back(VE.getValueID(SI.getDefaultDest())); + Vals64.push_back(SI.getNumCases()); for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i) { - Vals.push_back(VE.getValueID(i.getCaseValue())); - Vals.push_back(VE.getValueID(i.getCaseSuccessor())); + IntegersSubset& CaseRanges = i.getCaseValueEx(); + unsigned Code, Abbrev; // will unused. + + if (CaseRanges.isSingleNumber()) { + Vals64.push_back(1/*NumItems = 1*/); + Vals64.push_back(true/*IsSingleNumber = true*/); + EmitAPInt(Vals64, Code, Abbrev, CaseRanges.getSingleNumber(0), true); + } else { + + Vals64.push_back(CaseRanges.getNumItems()); + + if (CaseRanges.isSingleNumbersOnly()) { + for (unsigned ri = 0, rn = CaseRanges.getNumItems(); + ri != rn; ++ri) { + + Vals64.push_back(true/*IsSingleNumber = true*/); + + EmitAPInt(Vals64, Code, Abbrev, + CaseRanges.getSingleNumber(ri), true); + } + } else + for (unsigned ri = 0, rn = CaseRanges.getNumItems(); + ri != rn; ++ri) { + IntegersSubset::Range r = CaseRanges.getItem(ri); + bool IsSingleNumber = CaseRanges.isSingleNumber(ri); + + Vals64.push_back(IsSingleNumber); + + EmitAPInt(Vals64, Code, Abbrev, r.getLow(), true); + if (!IsSingleNumber) + EmitAPInt(Vals64, Code, Abbrev, r.getHigh(), true); + } + } + Vals64.push_back(VE.getValueID(i.getCaseSuccessor())); } + + Stream.EmitRecord(Code, Vals64, AbbrevToUse); + + // Also do expected action - clear external Vals collection: + Vals.clear(); + return; } break; case Instruction::IndirectBr: diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 822a564..205480a 100644 --- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -16,10 +16,10 @@ #define DEBUG_TYPE "post-RA-sched" #include "AggressiveAntiDepBreaker.h" -#include "RegisterClassInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetInstrInfo.h" @@ -157,8 +157,8 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { // In a return block, examine the function live-out 
regs. for (MachineRegisterInfo::liveout_iterator I = MRI.liveout_begin(), E = MRI.liveout_end(); I != E; ++I) { - for (const uint16_t *Alias = TRI->getOverlaps(*I); - unsigned Reg = *Alias; ++Alias) { + for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { + unsigned Reg = *AI; State->UnionGroups(Reg, 0); KillIndices[Reg] = BB->size(); DefIndices[Reg] = ~0u; @@ -173,8 +173,8 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { SE = BB->succ_end(); SI != SE; ++SI) for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(), E = (*SI)->livein_end(); I != E; ++I) { - for (const uint16_t *Alias = TRI->getOverlaps(*I); - unsigned Reg = *Alias; ++Alias) { + for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { + unsigned Reg = *AI; State->UnionGroups(Reg, 0); KillIndices[Reg] = BB->size(); DefIndices[Reg] = ~0u; @@ -189,8 +189,8 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { for (const uint16_t *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) { unsigned Reg = *I; if (!IsReturnBlock && !Pristine.test(Reg)) continue; - for (const uint16_t *Alias = TRI->getOverlaps(Reg); - unsigned AliasReg = *Alias; ++Alias) { + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { + unsigned AliasReg = *AI; State->UnionGroups(AliasReg, 0); KillIndices[AliasReg] = BB->size(); DefIndices[AliasReg] = ~0u; @@ -265,10 +265,8 @@ void AggressiveAntiDepBreaker::GetPassthruRegs(MachineInstr *MI, IsImplicitDefUse(MI, MO)) { const unsigned Reg = MO.getReg(); PassthruRegs.insert(Reg); - for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) { - PassthruRegs.insert(*Subreg); - } + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + PassthruRegs.insert(*SubRegs); } } } @@ -333,9 +331,8 @@ void AggressiveAntiDepBreaker::HandleLastUse(unsigned Reg, unsigned KillIdx, DEBUG(dbgs() << "->g" << State->GetGroup(Reg) << tag); } // Repeat for subregisters. - for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) { - unsigned SubregReg = *Subreg; + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + unsigned SubregReg = *SubRegs; if (!State->IsLive(SubregReg)) { KillIndices[SubregReg] = KillIdx; DefIndices[SubregReg] = ~0u; @@ -392,8 +389,8 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI, // Any aliased that are live at this point are completely or // partially defined here, so group those aliases with Reg. - for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { - unsigned AliasReg = *Alias; + for (MCRegAliasIterator AI(Reg, TRI, false); AI.isValid(); ++AI) { + unsigned AliasReg = *AI; if (State->IsLive(AliasReg)) { State->UnionGroups(Reg, AliasReg); DEBUG(dbgs() << "->g" << State->GetGroup(Reg) << "(via " << @@ -404,7 +401,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI, // Note register reference... const TargetRegisterClass *RC = NULL; if (i < MI->getDesc().getNumOperands()) - RC = TII->getRegClass(MI->getDesc(), i, TRI); + RC = TII->getRegClass(MI->getDesc(), i, TRI, MF); AggressiveAntiDepState::RegisterReference RR = { &MO, RC }; RegRefs.insert(std::make_pair(Reg, RR)); } @@ -423,9 +420,8 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI, continue; // Update def for Reg and aliases. 
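The register-walk rewrites in this file recur throughout the commit: sentinel-terminated uint16_t lists from getOverlaps / getAliasSet / getSubRegisters give way to MCRegAliasIterator, MCSubRegIterator, and MCSuperRegIterator, all sharing an isValid() / operator++ / operator* protocol. A toy model of the pattern, with a stand-in iterator over a 0-terminated list instead of the real MCRegisterInfo tables:

    #include <cstdint>
    #include <cstdio>

    // Stand-in for the MC register iterator protocol; register 0
    // terminates the list, as in the old uint16_t arrays.
    class ToyRegIterator {
      const uint16_t *Pos;
    public:
      explicit ToyRegIterator(const uint16_t *List) : Pos(List) {}
      bool isValid() const { return *Pos != 0; }
      void operator++() { ++Pos; }
      unsigned operator*() const { return *Pos; }
    };

    int main() {
      static const uint16_t Overlaps[] = {5, 6, 7, 0};
      // Old style: for (const uint16_t *A = Overlaps; unsigned R = *A; ++A)
      for (ToyRegIterator AI(Overlaps); AI.isValid(); ++AI)
        std::printf("overlapping reg %u\n", *AI);
      return 0;
    }

The real iterators also take an IncludeSelf flag, which is why several hunks can drop the separate insertion of Reg itself before the loop.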
- for (const uint16_t *Alias = TRI->getOverlaps(Reg); - unsigned AliasReg = *Alias; ++Alias) - DefIndices[AliasReg] = Count; + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + DefIndices[*AI] = Count; } } @@ -479,7 +475,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI, // Note register reference... const TargetRegisterClass *RC = NULL; if (i < MI->getDesc().getNumOperands()) - RC = TII->getRegClass(MI->getDesc(), i, TRI); + RC = TII->getRegClass(MI->getDesc(), i, TRI, MF); AggressiveAntiDepState::RegisterReference RR = { &MO, RC }; RegRefs.insert(std::make_pair(Reg, RR)); } @@ -678,9 +674,8 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters( goto next_super_reg; } else { bool found = false; - for (const uint16_t *Alias = TRI->getAliasSet(NewReg); - *Alias; ++Alias) { - unsigned AliasReg = *Alias; + for (MCRegAliasIterator AI(NewReg, TRI, false); AI.isValid(); ++AI) { + unsigned AliasReg = *AI; if (State->IsLive(AliasReg) || (KillIndices[Reg] > DefIndices[AliasReg])) { DEBUG(dbgs() << "(alias " << TRI->getName(AliasReg) << " live)"); diff --git a/lib/CodeGen/AllocationOrder.cpp b/lib/CodeGen/AllocationOrder.cpp index 87f6431..32ad34a 100644 --- a/lib/CodeGen/AllocationOrder.cpp +++ b/lib/CodeGen/AllocationOrder.cpp @@ -15,9 +15,9 @@ //===----------------------------------------------------------------------===// #include "AllocationOrder.h" -#include "RegisterClassInfo.h" #include "VirtRegMap.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" using namespace llvm; diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp index 00874d4..447f398 100644 --- a/lib/CodeGen/Analysis.cpp +++ b/lib/CodeGen/Analysis.cpp @@ -203,6 +203,63 @@ ISD::CondCode llvm::getICmpCondCode(ICmpInst::Predicate Pred) { } } + +/// getNoopInput - If V is a noop (i.e., lowers to no machine code), look +/// through it (and any transitive noop operands to it) and return its input +/// value. This is used to determine if a tail call can be formed. +/// +static const Value *getNoopInput(const Value *V, const TargetLowering &TLI) { + // If V is not an instruction, it can't be looked through. + const Instruction *I = dyn_cast<Instruction>(V); + if (I == 0 || !I->hasOneUse() || I->getNumOperands() == 0) return V; + + Value *Op = I->getOperand(0); + + // Look through truly no-op truncates. + if (isa<TruncInst>(I) && + TLI.isTruncateFree(I->getOperand(0)->getType(), I->getType())) + return getNoopInput(I->getOperand(0), TLI); + + // Look through truly no-op bitcasts. + if (isa<BitCastInst>(I)) { + // No type change at all. + if (Op->getType() == I->getType()) + return getNoopInput(Op, TLI); + + // Pointer to pointer cast. + if (Op->getType()->isPointerTy() && I->getType()->isPointerTy()) + return getNoopInput(Op, TLI); + + if (isa<VectorType>(Op->getType()) && isa<VectorType>(I->getType()) && + TLI.isTypeLegal(EVT::getEVT(Op->getType())) && + TLI.isTypeLegal(EVT::getEVT(I->getType()))) + return getNoopInput(Op, TLI); + } + + // Look through inttoptr. + if (isa<IntToPtrInst>(I) && !isa<VectorType>(I->getType())) { + // Make sure this isn't a truncating or extending cast. We could support + // this eventually, but don't bother for now. + if (TLI.getPointerTy().getSizeInBits() == + cast<IntegerType>(Op->getType())->getBitWidth()) + return getNoopInput(Op, TLI); + } + + // Look through ptrtoint. + if (isa<PtrToIntInst>(I) && !isa<VectorType>(I->getType())) { + // Make sure this isn't a truncating or extending cast. 
We could support + // this eventually, but don't bother for now. + if (TLI.getPointerTy().getSizeInBits() == + cast<IntegerType>(I->getType())->getBitWidth()) + return getNoopInput(Op, TLI); + } + + + // Otherwise it's not something we can look through. + return V; +} + + /// Test if the given instruction is in a position to be optimized /// with a tail-call. This roughly means that it's in a block with /// a return and there's nothing that needs to be scheduled @@ -226,7 +283,8 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, Attributes CalleeRetAttr, // been fully understood. if (!Ret && (!TLI.getTargetMachine().Options.GuaranteedTailCallOpt || - !isa<UnreachableInst>(Term))) return false; + !isa<UnreachableInst>(Term))) + return false; // If I will have a chain, make sure no other instruction that will have a // chain interposes between I and the return. @@ -264,28 +322,28 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, Attributes CalleeRetAttr, return false; // Otherwise, make sure the unmodified return value of I is the return value. - for (const Instruction *U = dyn_cast<Instruction>(Ret->getOperand(0)); ; - U = dyn_cast<Instruction>(U->getOperand(0))) { - if (!U) - return false; - if (!U->hasOneUse()) + // We handle two cases: multiple return values + scalars. + Value *RetVal = Ret->getOperand(0); + if (!isa<InsertValueInst>(RetVal) || !isa<StructType>(RetVal->getType())) + // Handle scalars first. + return getNoopInput(Ret->getOperand(0), TLI) == I; + + // If this is an aggregate return, look through the insert/extract values and + // see if each is transparent. + for (unsigned i = 0, e =cast<StructType>(RetVal->getType())->getNumElements(); + i != e; ++i) { + const Value *InScalar = FindInsertedValue(RetVal, i); + if (InScalar == 0) return false; + InScalar = getNoopInput(InScalar, TLI); + + // If the scalar value being inserted is an extractvalue of the right index + // from the call, then everything is good. + const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(InScalar); + if (EVI == 0 || EVI->getOperand(0) != I || EVI->getNumIndices() != 1 || + EVI->getIndices()[0] != i) return false; - if (U == I) - break; - // Check for a truly no-op truncate. - if (isa<TruncInst>(U) && - TLI.isTruncateFree(U->getOperand(0)->getType(), U->getType())) - continue; - // Check for a truly no-op bitcast. - if (isa<BitCastInst>(U) && - (U->getOperand(0)->getType() == U->getType() || - (U->getOperand(0)->getType()->isPointerTy() && - U->getType()->isPointerTy()))) - continue; - // Otherwise it's not a true no-op. 
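The hand-rolled walk being deleted in this hunk is what getNoopInput above generalizes: step through transitive single-use operations that lower to no machine code and compare the underlying value against the call. A toy model of the recursion over stub nodes, not real LLVM IR:

    // Stub node: one operand, a use count, and a flag for whether the
    // node lowers to no machine code (free truncate, pointer bitcast,
    // same-width inttoptr/ptrtoint).
    struct Node {
      Node *Operand;
      unsigned Uses;
      bool IsNoop;
    };

    // Walk through transitive single-use no-ops and return the input.
    static const Node *getNoopInput(const Node *V) {
      if (!V->Operand || V->Uses != 1 || !V->IsNoop)
        return V;
      return getNoopInput(V->Operand);
    }

    int main() {
      Node Call  = {nullptr, 1, false};
      Node Cast1 = {&Call,   1, true};
      Node Cast2 = {&Cast1,  1, true};
      return getNoopInput(&Cast2) == &Call ? 0 : 1;  // tail call still viable
    }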
- return false; } - + return true; } diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp index b60fda8..bf5d8c4 100644 --- a/lib/CodeGen/AsmPrinter/ARMException.cpp +++ b/lib/CodeGen/AsmPrinter/ARMException.cpp @@ -44,9 +44,7 @@ EnableARMEHABIDescriptors("arm-enable-ehabi-descriptors", cl::Hidden, ARMException::ARMException(AsmPrinter *A) - : DwarfException(A), - shouldEmitTable(false), shouldEmitMoves(false), shouldEmitTableModule(false) - {} + : DwarfException(A) {} ARMException::~ARMException() {} diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index f6cde98..b7fc663 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -16,6 +16,7 @@ #if !defined(ANDROID_TARGET_BUILD) || defined(ANDROID_ENGINEERING_BUILD) # include "DwarfDebug.h" # include "DwarfException.h" +# include "llvm/DebugInfo.h" #endif // !ANDROID_TARGET_BUILD || ANDROID_ENGINEERING_BUILD #include "llvm/Module.h" #include "llvm/CodeGen/GCMetadataPrinter.h" @@ -26,7 +27,6 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" @@ -484,10 +484,8 @@ void AsmPrinter::EmitFunctionHeader() { void AsmPrinter::EmitFunctionEntryLabel() { // The function label could have already been emitted if two symbols end up // conflicting due to asm renaming. Detect this and emit an error. - if (CurrentFnSym->isUndefined()) { - OutStreamer.ForceCodeRegion(); + if (CurrentFnSym->isUndefined()) return OutStreamer.EmitLabel(CurrentFnSym); - } report_fatal_error("'" + Twine(CurrentFnSym->getName()) + "' label emitted multiple times to assembly file"); @@ -624,7 +622,7 @@ bool AsmPrinter::needsSEHMoves() { } bool AsmPrinter::needsRelocationsForDwarfStringPool() const { - return MAI->doesDwarfUseRelocationsForStringPool(); + return MAI->doesDwarfUseRelocationsAcrossSections(); } void AsmPrinter::emitPrologLabel(const MachineInstr &MI) { @@ -813,8 +811,8 @@ void AsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const { const TargetRegisterInfo *TRI = TM.getRegisterInfo(); int Reg = TRI->getDwarfRegNum(MLoc.getReg(), false); - for (const uint16_t *SR = TRI->getSuperRegisters(MLoc.getReg()); - *SR && Reg < 0; ++SR) { + for (MCSuperRegIterator SR(MLoc.getReg(), TRI); SR.isValid() && Reg < 0; + ++SR) { Reg = TRI->getDwarfRegNum(*SR, false); // FIXME: Get the bit range this register uses of the superregister // so that we can produce a DW_OP_bit_piece @@ -1102,15 +1100,6 @@ void AsmPrinter::EmitJumpTableInfo() { EmitAlignment(Log2_32(MJTI->getEntryAlignment(*TM.getTargetData()))); - // If we know the form of the jump table, go ahead and tag it as such. 
- if (!JTInDiffSection) { - if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32) { - OutStreamer.EmitJumpTable32Region(); - } else { - OutStreamer.EmitDataRegion(); - } - } - for (unsigned JTI = 0, e = JT.size(); JTI != e; ++JTI) { const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; @@ -1416,13 +1405,14 @@ void AsmPrinter::EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset, unsigned Size) const { - // Emit Label+Offset - const MCExpr *Plus = - MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Label, OutContext), - MCConstantExpr::Create(Offset, OutContext), - OutContext); + // Emit Label+Offset (or just Label if Offset is zero) + const MCExpr *Expr = MCSymbolRefExpr::Create(Label, OutContext); + if (Offset) + Expr = MCBinaryExpr::CreateAdd(Expr, + MCConstantExpr::Create(Offset, OutContext), + OutContext); - OutStreamer.EmitValue(Plus, 4, 0/*AddrSpace*/); + OutStreamer.EmitValue(Expr, Size, 0/*AddrSpace*/); } diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index e9e9335..711375b 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -329,11 +329,11 @@ void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const { OpNo += InlineAsm::getNumOperandRegisters(OpFlags) + 1; } - // We may have a location metadata attached to the end of the - // instruction, and at no point should see metadata at any - // other point while processing. It's an error if so. + // We may have a location metadata attached to the end of the + // instruction, and at no point should see metadata at any + // other point while processing. It's an error if so. if (OpNo >= MI->getNumOperands() || - MI->getOperand(OpNo).isMetadata()) { + MI->getOperand(OpNo).isMetadata()) { Error = true; } else { unsigned OpFlags = MI->getOperand(OpNo).getImm(); @@ -413,9 +413,28 @@ void AsmPrinter::PrintSpecial(const MachineInstr *MI, raw_ostream &OS, /// instruction, using the specified assembler variant. Targets should /// override this to format as appropriate. bool AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) { - // Target doesn't support this yet! + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) { + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + const MachineOperand &MO = MI->getOperand(OpNo); + switch (ExtraCode[0]) { + default: + return true; // Unknown modifier. + case 'c': // Substitute immediate value without immediate syntax + if (MO.getType() != MachineOperand::MO_Immediate) + return true; + O << MO.getImm(); + return false; + case 'n': // Negate the immediate constant. 
+ if (MO.getType() != MachineOperand::MO_Immediate) + return true; + O << -MO.getImm(); + return false; + } + } return true; } diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index cc5b642..d231665 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -17,9 +17,9 @@ #include "DwarfCompileUnit.h" #include "DwarfDebug.h" #include "llvm/Constants.h" +#include "llvm/DIBuilder.h" #include "llvm/GlobalVariable.h" #include "llvm/Instructions.h" -#include "llvm/Analysis/DIBuilder.h" #include "llvm/Support/Debug.h" #include "llvm/Target/Mangler.h" #include "llvm/Target/TargetData.h" @@ -33,7 +33,7 @@ using namespace llvm; /// CompileUnit - Compile unit constructor. CompileUnit::CompileUnit(unsigned I, unsigned L, DIE *D, AsmPrinter *A, - DwarfDebug *DW) + DwarfDebug *DW) : ID(I), Language(L), CUDie(D), Asm(A), DD(DW), IndexTyDie(0) { DIEIntegerOne = new (DIEValueAllocator) DIEInteger(1); } @@ -198,7 +198,7 @@ void CompileUnit::addSourceLine(DIE *Die, DIObjCProperty Ty) { return; DIFile File = Ty.getFile(); unsigned FileID = DD->GetOrCreateSourceID(File.getFilename(), - File.getDirectory()); + File.getDirectory()); assert(FileID && "Invalid file id"); addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID); addUInt(Die, dwarf::DW_AT_decl_line, 0, Line); @@ -308,7 +308,8 @@ void CompileUnit::addComplexAddress(DbgVariable *&DV, DIE *Die, addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst); addUInt(Block, 0, dwarf::DW_FORM_udata, DV->getAddrElement(++i)); } else if (Element == DIBuilder::OpDeref) { - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref); + if (!Location.isReg()) + addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref); } else llvm_unreachable("unknown DIBuilder Opcode"); } @@ -418,27 +419,12 @@ void CompileUnit::addBlockByrefAddress(DbgVariable *&DV, DIE *Die, // Decode the original location, and use that as the start of the byref // variable's location. - const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo(); - unsigned Reg = RI->getDwarfRegNum(Location.getReg(), false); DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); - if (Location.isReg()) { - if (Reg < 32) - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + Reg); - else { - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_regx); - addUInt(Block, 0, dwarf::DW_FORM_udata, Reg); - } - } else { - if (Reg < 32) - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + Reg); - else { - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx); - addUInt(Block, 0, dwarf::DW_FORM_udata, Reg); - } - - addUInt(Block, 0, dwarf::DW_FORM_sdata, Location.getOffset()); - } + if (Location.isReg()) + addRegisterOp(Block, Location.getReg()); + else + addRegisterOffset(Block, Location.getReg(), Location.getOffset()); // If we started with a pointer to the __Block_byref... struct, then // the first thing we need to do is dereference the pointer (DW_OP_deref). @@ -646,8 +632,7 @@ DIE *CompileUnit::getOrCreateTypeDIE(const MDNode *TyNode) { } /// addType - Add a new type attribute to the specified entity. 
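A little above, addBlockByrefAddress stops open-coding its DWARF expression bytes in favor of the shared addRegisterOp / addRegisterOffset helpers. The encoding rule those helpers implement, sketched standalone with real DWARF opcode values but a plain byte vector standing in for DIEBlock, and ULEB128/SLEB128 emission reduced to single bytes:

    #include <cstdint>
    #include <vector>

    enum : uint8_t {
      DW_OP_reg0  = 0x50, DW_OP_regx  = 0x90,
      DW_OP_breg0 = 0x70, DW_OP_bregx = 0x92
    };

    // Registers 0-31 get compact one-byte opcodes; larger numbers use
    // the forms that carry the register number as an operand.
    static void addRegisterOp(std::vector<uint8_t> &Expr, unsigned Reg) {
      if (Reg < 32) {
        Expr.push_back(DW_OP_reg0 + Reg);
      } else {
        Expr.push_back(DW_OP_regx);
        Expr.push_back((uint8_t)Reg);        // ULEB128 in real emission
      }
    }

    static void addRegisterOffset(std::vector<uint8_t> &Expr, unsigned Reg,
                                  int8_t Offset) {
      if (Reg < 32) {
        Expr.push_back(DW_OP_breg0 + Reg);
      } else {
        Expr.push_back(DW_OP_bregx);
        Expr.push_back((uint8_t)Reg);        // ULEB128 in real emission
      }
      Expr.push_back((uint8_t)Offset);       // SLEB128 in real emission
    }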
-void CompileUnit::addType(DIE *Entity, DIType Ty, - unsigned Attribute) { +void CompileUnit::addType(DIE *Entity, DIType Ty, unsigned Attribute) { if (!Ty.Verify()) return; @@ -776,6 +761,11 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { Buffer.addChild(ElemDie); } } + DIType DTy = CTy.getTypeDerivedFrom(); + if (DTy.Verify()) { + addType(&Buffer, DTy); + addUInt(&Buffer, dwarf::DW_AT_enum_class, dwarf::DW_FORM_flag, 1); + } } break; case dwarf::DW_TAG_subroutine_type: { @@ -801,9 +791,9 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { // Add prototype flag if we're dealing with a C language and the // function has been prototyped. if (isPrototyped && - (Language == dwarf::DW_LANG_C89 || - Language == dwarf::DW_LANG_C99 || - Language == dwarf::DW_LANG_ObjC)) + (Language == dwarf::DW_LANG_C89 || + Language == dwarf::DW_LANG_C99 || + Language == dwarf::DW_LANG_ObjC)) addUInt(&Buffer, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1); } break; @@ -846,19 +836,19 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { addUInt(ElemDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1); addSourceLine(ElemDie, DV); } else if (Element.isDerivedType()) { - DIDerivedType DDTy(Element); - if (DDTy.getTag() == dwarf::DW_TAG_friend) { - ElemDie = new DIE(dwarf::DW_TAG_friend); - addType(ElemDie, DDTy.getTypeDerivedFrom(), dwarf::DW_AT_friend); - } else - ElemDie = createMemberDIE(DIDerivedType(Element)); + DIDerivedType DDTy(Element); + if (DDTy.getTag() == dwarf::DW_TAG_friend) { + ElemDie = new DIE(dwarf::DW_TAG_friend); + addType(ElemDie, DDTy.getTypeDerivedFrom(), dwarf::DW_AT_friend); + } else + ElemDie = createMemberDIE(DIDerivedType(Element)); } else if (Element.isObjCProperty()) { DIObjCProperty Property(Element); ElemDie = new DIE(Property.getTag()); StringRef PropertyName = Property.getObjCPropertyName(); addString(ElemDie, dwarf::DW_AT_APPLE_property_name, PropertyName); - addType(ElemDie, Property.getType()); - addSourceLine(ElemDie, Property); + addType(ElemDie, Property.getType()); + addSourceLine(ElemDie, Property); StringRef GetterName = Property.getObjCPropertyGetterName(); if (!GetterName.empty()) addString(ElemDie, dwarf::DW_AT_APPLE_property_getter, GetterName); @@ -925,19 +915,21 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { if (!Name.empty()) addString(&Buffer, dwarf::DW_AT_name, Name); - if (Tag == dwarf::DW_TAG_enumeration_type || Tag == dwarf::DW_TAG_class_type - || Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type) - { + if (Tag == dwarf::DW_TAG_enumeration_type || + Tag == dwarf::DW_TAG_class_type || + Tag == dwarf::DW_TAG_structure_type || + Tag == dwarf::DW_TAG_union_type) { // Add size if non-zero (derived types might be zero-sized.) + // TODO: Do we care about size for enum forward declarations? if (Size) addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size); - else { + else if (!CTy.isForwardDecl()) // Add zero size if it is not a forward declaration. - if (CTy.isForwardDecl()) - addUInt(&Buffer, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1); - else - addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, 0); - } + addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, 0); + + // If we're a forward decl, say so. + if (CTy.isForwardDecl()) + addUInt(&Buffer, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1); // Add source line info if available. 
if (!CTy.isForwardDecl()) @@ -968,7 +960,7 @@ CompileUnit::getOrCreateTemplateTypeParameterDIE(DITemplateTypeParameter TP) { /// getOrCreateTemplateValueParameterDIE - Find existing DIE or create new DIE /// for the given DITemplateValueParameter. DIE * -CompileUnit::getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TPV) { +CompileUnit::getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TPV){ DIE *ParamDIE = getDIE(TPV); if (ParamDIE) return ParamDIE; @@ -1015,17 +1007,17 @@ DIE *CompileUnit::getOrCreateSubprogramDIE(DISubprogram SP) { if (SPDie) return SPDie; + SPDie = new DIE(dwarf::DW_TAG_subprogram); + + // DW_TAG_inlined_subroutine may refer to this DIE. + insertDIE(SP, SPDie); + DISubprogram SPDecl = SP.getFunctionDeclaration(); DIE *DeclDie = NULL; if (SPDecl.isSubprogram()) { DeclDie = getOrCreateSubprogramDIE(SPDecl); } - SPDie = new DIE(dwarf::DW_TAG_subprogram); - - // DW_TAG_inlined_subroutine may refer to this DIE. - insertDIE(SP, SPDie); - // Add to context owner. addToContextOwner(SPDie, SP.getContext()); @@ -1240,7 +1232,8 @@ void CompileUnit::createGlobalVariableDIE(const MDNode *N) { } /// constructSubrangeDIE - Construct subrange DIE from DISubrange. -void CompileUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy){ +void CompileUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR, + DIE *IndexTy) { DIE *DW_Subrange = new DIE(dwarf::DW_TAG_subrange_type); addDIEEntry(DW_Subrange, dwarf::DW_AT_type, dwarf::DW_FORM_ref4, IndexTy); uint64_t L = SR.getLo(); diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index 45e407e..b4ff9e8 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -15,7 +15,7 @@ #define CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H #include "DIE.h" -#include "llvm/Analysis/DebugInfo.h" +#include "llvm/DebugInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/OwningPtr.h" diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index cb78878..649684a 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -17,9 +17,10 @@ #include "DwarfAccelTable.h" #include "DwarfCompileUnit.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" +#include "llvm/DIBuilder.h" #include "llvm/Module.h" #include "llvm/Instructions.h" -#include "llvm/ADT/Triple.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/MC/MCAsmInfo.h" @@ -32,11 +33,10 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetOptions.h" -#include "llvm/Analysis/DebugInfo.h" -#include "llvm/Analysis/DIBuilder.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Triple.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -117,7 +117,6 @@ DIType DbgVariable::getType() const { if (getName() == DT.getName()) return (DT.getTypeDerivedFrom()); } - return Ty; } return Ty; } @@ -127,6 +126,7 @@ DIType DbgVariable::getType() const { DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) : Asm(A), MMI(Asm->MMI), FirstCU(0), AbbreviationsSet(InitAbbreviationsSetSize), + SourceIdMap(DIEValueAllocator), StringPool(DIEValueAllocator), PrevLabel(NULL) { NextStringPoolNumber = 0; @@ -566,7 +566,7 @@ CompileUnit 
*DwarfDebug::constructCompileUnit(const MDNode *N) { NewCU->addUInt(Die, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, 0); // DW_AT_stmt_list is a offset of line number information for this // compile unit in debug_line section. - if (Asm->MAI->doesDwarfRequireRelocationForSectionOffset()) + if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) NewCU->addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, Asm->GetTempSymbol("section_line")); else @@ -1310,8 +1310,9 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { MOE = MI->operands_end(); MOI != MOE; ++MOI) { if (!MOI->isReg() || !MOI->isDef() || !MOI->getReg()) continue; - for (const uint16_t *AI = TRI->getOverlaps(MOI->getReg()); - unsigned Reg = *AI; ++AI) { + for (MCRegAliasIterator AI(MOI->getReg(), TRI, true); + AI.isValid(); ++AI) { + unsigned Reg = *AI; const MDNode *Var = LiveUserVar[Reg]; if (!Var) continue; @@ -1381,7 +1382,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { MF->getFunction()->getContext()); recordSourceLine(FnStartDL.getLine(), FnStartDL.getCol(), FnStartDL.getScope(MF->getFunction()->getContext()), - 0); + DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0); } } @@ -1421,6 +1422,12 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { DIVariable DV(Variables.getElement(i)); if (!DV || !DV.Verify() || !ProcessedVars.insert(DV)) continue; + // Check that DbgVariable for DV wasn't created earlier, when + // findAbstractVariable() was called for inlined instance of DV. + LLVMContext &Ctx = DV->getContext(); + DIVariable CleanDV = cleanseInlinedVariable(DV, Ctx); + if (AbstractVariables.lookup(CleanDV)) + continue; if (LexicalScope *Scope = LScopes.findAbstractScope(DV.getContext())) addScopeVariable(Scope, new DbgVariable(DV, NULL)); } @@ -1623,7 +1630,7 @@ void DwarfDebug::emitDIE(DIE *Die) { // DW_AT_range Value encodes offset in debug_range section. 
DIEInteger *V = cast<DIEInteger>(Values[i]); - if (Asm->MAI->doesDwarfUseLabelOffsetForRanges()) { + if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) { Asm->EmitLabelPlusOffset(DwarfDebugRangeSectionSym, V->getValue(), 4); @@ -1636,10 +1643,14 @@ void DwarfDebug::emitDIE(DIE *Die) { break; } case dwarf::DW_AT_location: { - if (DIELabel *L = dyn_cast<DIELabel>(Values[i])) - Asm->EmitLabelDifference(L->getValue(), DwarfDebugLocSectionSym, 4); - else + if (DIELabel *L = dyn_cast<DIELabel>(Values[i])) { + if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) + Asm->EmitLabelReference(L->getValue(), 4); + else + Asm->EmitLabelDifference(L->getValue(), DwarfDebugLocSectionSym, 4); + } else { Values[i]->EmitValue(Asm, Form); + } break; } case dwarf::DW_AT_accessibility: { @@ -2049,9 +2060,11 @@ void DwarfDebug::emitDebugLoc() { if (Element == DIBuilder::OpPlus) { Asm->EmitInt8(dwarf::DW_OP_plus_uconst); Asm->EmitULEB128(DV.getAddrElement(++i)); - } else if (Element == DIBuilder::OpDeref) - Asm->EmitInt8(dwarf::DW_OP_deref); - else llvm_unreachable("unknown Opcode found in complex address"); + } else if (Element == DIBuilder::OpDeref) { + if (!Entry.Loc.isReg()) + Asm->EmitInt8(dwarf::DW_OP_deref); + } else + llvm_unreachable("unknown Opcode found in complex address"); } } } diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index 83f30f5..d1d6512 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -14,11 +14,11 @@ #ifndef CODEGEN_ASMPRINTER_DWARFDEBUG_H__ #define CODEGEN_ASMPRINTER_DWARFDEBUG_H__ +#include "DIE.h" +#include "llvm/DebugInfo.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/LexicalScopes.h" #include "llvm/MC/MachineLocation.h" -#include "llvm/Analysis/DebugInfo.h" -#include "DIE.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/SmallPtrSet.h" @@ -188,6 +188,9 @@ class DwarfDebug { /// MMI - Collected machine module information. MachineModuleInfo *MMI; + /// DIEValueAllocator - All DIEValues are allocated through this allocator. + BumpPtrAllocator DIEValueAllocator; + //===--------------------------------------------------------------------===// // Attributes used to construct specific Dwarf sections. // @@ -210,11 +213,11 @@ class DwarfDebug { /// SourceIdMap - Source id map, i.e. pair of source filename and directory, /// separated by a zero byte, mapped to a unique id. - StringMap<unsigned> SourceIdMap; + StringMap<unsigned, BumpPtrAllocator&> SourceIdMap; /// StringPool - A String->Symbol mapping of strings used by indirect /// references. - StringMap<std::pair<MCSymbol*, unsigned> > StringPool; + StringMap<std::pair<MCSymbol*, unsigned>, BumpPtrAllocator&> StringPool; unsigned NextStringPoolNumber; /// SectionMap - Provides a unique id per text section. @@ -232,7 +235,7 @@ class DwarfDebug { /// ScopeVariables - Collection of dbg variables of a scope. DenseMap<LexicalScope *, SmallVector<DbgVariable *, 8> > ScopeVariables; - /// AbstractVariables - Collection on abstract variables. + /// AbstractVariables - Collection of abstract variables. DenseMap<const MDNode *, DbgVariable *> AbstractVariables; /// DotDebugLocEntries - Collection of DotDebugLocEntry. @@ -292,9 +295,6 @@ class DwarfDebug { std::vector<FunctionDebugFrameInfo> DebugFrames; - // DIEValueAllocator - All DIEValues are allocated through this allocator. 
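This DwarfDebug.h hunk moves DIEValueAllocator above the maps for a concrete reason: SourceIdMap and StringPool now take a BumpPtrAllocator& and C++ constructs members in declaration order, so the allocator must exist before the maps are built. A reduced illustration with stand-in types in place of BumpPtrAllocator and StringMap:

    #include <map>
    #include <string>

    struct Arena {};   // stand-in for BumpPtrAllocator

    // Stand-in for StringMap<V, BumpPtrAllocator&>: holds a reference to
    // the arena it allocates from.
    template <typename V> struct ArenaStringMap {
      explicit ArenaStringMap(Arena &A) : A(A) {}
      Arena &A;
      std::map<std::string, V> Impl;
    };

    struct DwarfDebugStub {
      Arena DIEValueAllocator;                // must be declared first
      ArenaStringMap<unsigned> SourceIdMap;   // references the arena above
      DwarfDebugStub() : SourceIdMap(DIEValueAllocator) {}
    };

    int main() { DwarfDebugStub D; (void)D; return 0; }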
- BumpPtrAllocator DIEValueAllocator; - // Section Symbols: these are assembler temporary labels that are emitted at // the beginning of each supported dwarf section. These are used to form // section offsets and are created by EmitSectionLabels. @@ -333,9 +333,6 @@ private: /// of the function. DIE *constructInlinedScopeDIE(CompileUnit *TheCU, LexicalScope *Scope); - /// constructVariableDIE - Construct a DIE for the given DbgVariable. - DIE *constructVariableDIE(DbgVariable *DV, LexicalScope *S); - /// constructScopeDIE - Construct a DIE for this scope. DIE *constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope); @@ -517,9 +514,6 @@ public: /// in the SourceIds map. unsigned GetOrCreateSourceID(StringRef DirName, StringRef FullName); - /// createSubprogramDIE - Create new DIE using SP. - DIE *createSubprogramDIE(DISubprogram SP); - /// getStringPool - returns the entry into the start of the pool. MCSymbol *getStringPool(); diff --git a/lib/CodeGen/AsmPrinter/DwarfException.h b/lib/CodeGen/AsmPrinter/DwarfException.h index b5f86ab..75f6056 100644 --- a/lib/CodeGen/AsmPrinter/DwarfException.h +++ b/lib/CodeGen/AsmPrinter/DwarfException.h @@ -175,17 +175,6 @@ public: }; class ARMException : public DwarfException { - /// shouldEmitTable - Per-function flag to indicate if EH tables should - /// be emitted. - bool shouldEmitTable; - - /// shouldEmitMoves - Per-function flag to indicate if frame moves info - /// should be emitted. - bool shouldEmitMoves; - - /// shouldEmitTableModule - Per-module flag to indicate if EH tables - /// should be emitted. - bool shouldEmitTableModule; public: //===--------------------------------------------------------------------===// // Main entry points. diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index ef1d2ba..fb65bb7 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -137,9 +137,8 @@ bool BranchFolder::OptimizeImpDefsBlock(MachineBasicBlock *MBB) { break; unsigned Reg = I->getOperand(0).getReg(); ImpDefRegs.insert(Reg); - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) - ImpDefRegs.insert(SubReg); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + ImpDefRegs.insert(*SubRegs); ++I; } if (ImpDefRegs.empty()) @@ -188,7 +187,7 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF, // Use a RegScavenger to help update liveness when required. MachineRegisterInfo &MRI = MF.getRegInfo(); - if (MRI.tracksLiveness() && TRI->requiresRegisterScavenging(MF)) + if (MRI.tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF)) RS = new RegScavenger(); else MRI.invalidateLiveness(); @@ -819,10 +818,8 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB, } bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { - - if (!EnableTailMerge) return false; - bool MadeChange = false; + if (!EnableTailMerge) return MadeChange; // First find blocks with no successors. MergePotentials.clear(); @@ -839,6 +836,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { if (MergePotentials.size() == TailMergeThreshold) for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i) TriedMerging.insert(MergePotentials[i].getBlock()); + // See if we can do any tail merging on those. 
if (MergePotentials.size() >= 2) MadeChange |= TryTailMergeBlocks(NULL, NULL); @@ -864,88 +862,97 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end(); I != E; ++I) { - if (I->pred_size() >= 2) { - SmallPtrSet<MachineBasicBlock *, 8> UniquePreds; - MachineBasicBlock *IBB = I; - MachineBasicBlock *PredBB = prior(I); - MergePotentials.clear(); - for (MachineBasicBlock::pred_iterator P = I->pred_begin(), - E2 = I->pred_end(); - P != E2 && MergePotentials.size() < TailMergeThreshold; ++P) { - MachineBasicBlock *PBB = *P; - if (TriedMerging.count(PBB)) - continue; - // Skip blocks that loop to themselves, can't tail merge these. - if (PBB == IBB) - continue; - // Visit each predecessor only once. - if (!UniquePreds.insert(PBB)) - continue; - // Skip blocks which may jump to a landing pad. Can't tail merge these. - if (PBB->getLandingPadSuccessor()) - continue; - MachineBasicBlock *TBB = 0, *FBB = 0; - SmallVector<MachineOperand, 4> Cond; - if (!TII->AnalyzeBranch(*PBB, TBB, FBB, Cond, true)) { - // Failing case: IBB is the target of a cbr, and - // we cannot reverse the branch. - SmallVector<MachineOperand, 4> NewCond(Cond); - if (!Cond.empty() && TBB == IBB) { - if (TII->ReverseBranchCondition(NewCond)) + if (I->pred_size() < 2) continue; + SmallPtrSet<MachineBasicBlock *, 8> UniquePreds; + MachineBasicBlock *IBB = I; + MachineBasicBlock *PredBB = prior(I); + MergePotentials.clear(); + for (MachineBasicBlock::pred_iterator P = I->pred_begin(), + E2 = I->pred_end(); + P != E2 && MergePotentials.size() < TailMergeThreshold; ++P) { + MachineBasicBlock *PBB = *P; + if (TriedMerging.count(PBB)) + continue; + + // Skip blocks that loop to themselves, can't tail merge these. + if (PBB == IBB) + continue; + + // Visit each predecessor only once. + if (!UniquePreds.insert(PBB)) + continue; + + // Skip blocks which may jump to a landing pad. Can't tail merge these. + if (PBB->getLandingPadSuccessor()) + continue; + + MachineBasicBlock *TBB = 0, *FBB = 0; + SmallVector<MachineOperand, 4> Cond; + if (!TII->AnalyzeBranch(*PBB, TBB, FBB, Cond, true)) { + // Failing case: IBB is the target of a cbr, and we cannot reverse the + // branch. + SmallVector<MachineOperand, 4> NewCond(Cond); + if (!Cond.empty() && TBB == IBB) { + if (TII->ReverseBranchCondition(NewCond)) + continue; + // This is the QBB case described above + if (!FBB) + FBB = llvm::next(MachineFunction::iterator(PBB)); + } + + // Failing case: the only way IBB can be reached from PBB is via + // exception handling. Happens for landing pads. Would be nice to have + // a bit in the edge so we didn't have to do all this. + if (IBB->isLandingPad()) { + MachineFunction::iterator IP = PBB; IP++; + MachineBasicBlock *PredNextBB = NULL; + if (IP != MF.end()) + PredNextBB = IP; + if (TBB == NULL) { + if (IBB != PredNextBB) // fallthrough + continue; + } else if (FBB) { + if (TBB != IBB && FBB != IBB) // cbr then ubr + continue; + } else if (Cond.empty()) { + if (TBB != IBB) // ubr + continue; + } else { + if (TBB != IBB && IBB != PredNextBB) // cbr continue; - // This is the QBB case described above - if (!FBB) - FBB = llvm::next(MachineFunction::iterator(PBB)); - } - // Failing case: the only way IBB can be reached from PBB is via - // exception handling. Happens for landing pads. Would be nice - // to have a bit in the edge so we didn't have to do all this. 
- if (IBB->isLandingPad()) { - MachineFunction::iterator IP = PBB; IP++; - MachineBasicBlock *PredNextBB = NULL; - if (IP != MF.end()) - PredNextBB = IP; - if (TBB == NULL) { - if (IBB != PredNextBB) // fallthrough - continue; - } else if (FBB) { - if (TBB != IBB && FBB != IBB) // cbr then ubr - continue; - } else if (Cond.empty()) { - if (TBB != IBB) // ubr - continue; - } else { - if (TBB != IBB && IBB != PredNextBB) // cbr - continue; - } - } - // Remove the unconditional branch at the end, if any. - if (TBB && (Cond.empty() || FBB)) { - DebugLoc dl; // FIXME: this is nowhere - TII->RemoveBranch(*PBB); - if (!Cond.empty()) - // reinsert conditional branch only, for now - TII->InsertBranch(*PBB, (TBB == IBB) ? FBB : TBB, 0, NewCond, dl); } - MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(PBB), *P)); } + + // Remove the unconditional branch at the end, if any. + if (TBB && (Cond.empty() || FBB)) { + DebugLoc dl; // FIXME: this is nowhere + TII->RemoveBranch(*PBB); + if (!Cond.empty()) + // reinsert conditional branch only, for now + TII->InsertBranch(*PBB, (TBB == IBB) ? FBB : TBB, 0, NewCond, dl); + } + + MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(PBB), *P)); } - // If this is a large problem, avoid visiting the same basic blocks - // multiple times. - if (MergePotentials.size() == TailMergeThreshold) - for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i) - TriedMerging.insert(MergePotentials[i].getBlock()); - if (MergePotentials.size() >= 2) - MadeChange |= TryTailMergeBlocks(IBB, PredBB); - // Reinsert an unconditional branch if needed. - // The 1 below can occur as a result of removing blocks in - // TryTailMergeBlocks. - PredBB = prior(I); // this may have been changed in TryTailMergeBlocks - if (MergePotentials.size() == 1 && - MergePotentials.begin()->getBlock() != PredBB) - FixTail(MergePotentials.begin()->getBlock(), IBB, TII); } + + // If this is a large problem, avoid visiting the same basic blocks multiple + // times. + if (MergePotentials.size() == TailMergeThreshold) + for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i) + TriedMerging.insert(MergePotentials[i].getBlock()); + + if (MergePotentials.size() >= 2) + MadeChange |= TryTailMergeBlocks(IBB, PredBB); + + // Reinsert an unconditional branch if needed. The 1 below can occur as a + // result of removing blocks in TryTailMergeBlocks. + PredBB = prior(I); // this may have been changed in TryTailMergeBlocks + if (MergePotentials.size() == 1 && + MergePotentials.begin()->getBlock() != PredBB) + FixTail(MergePotentials.begin()->getBlock(), IBB, TII); } + return MadeChange; } @@ -1459,7 +1466,7 @@ static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB, } /// findHoistingInsertPosAndDeps - Find the location to move common instructions -/// in successors to. The location is ususally just before the terminator, +/// in successors to. The location is usually just before the terminator, /// however if the terminator is a conditional branch and its previous /// instruction is the flag setting instruction, the previous instruction is /// the preferred location. 
This function also gathers uses and defs of the @@ -1483,9 +1490,8 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, if (!Reg) continue; if (MO.isUse()) { - Uses.insert(Reg); - for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS) - Uses.insert(*AS); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + Uses.insert(*AI); } else if (!MO.isDead()) // Don't try to hoist code in the rare case the terminator defines a // register that is later used. @@ -1545,18 +1551,16 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, if (!Reg) continue; if (MO.isUse()) { - Uses.insert(Reg); - for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS) - Uses.insert(*AS); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + Uses.insert(*AI); } else { if (Uses.count(Reg)) { Uses.erase(Reg); - for (const uint16_t *SR = TRI->getSubRegisters(Reg); *SR; ++SR) - Uses.erase(*SR); // Use getSubRegisters to be conservative + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + Uses.erase(*SubRegs); // Use sub-registers to be conservative } - Defs.insert(Reg); - for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS) - Defs.insert(*AS); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + Defs.insert(*AI); } } @@ -1683,8 +1687,8 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { unsigned Reg = MO.getReg(); if (!Reg || !LocalDefsSet.count(Reg)) continue; - for (const uint16_t *OR = TRI->getOverlaps(Reg); *OR; ++OR) - LocalDefsSet.erase(*OR); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + LocalDefsSet.erase(*AI); } // Track local defs so we can update liveins. @@ -1696,8 +1700,8 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { if (!Reg) continue; LocalDefs.push_back(Reg); - for (const uint16_t *OR = TRI->getOverlaps(Reg); *OR; ++OR) - LocalDefsSet.insert(*OR); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + LocalDefsSet.insert(*AI); } HasDups = true; diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index 21729cd..d240389 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -11,6 +11,7 @@ add_llvm_library(LLVMCodeGen DeadMachineInstructionElim.cpp DFAPacketizer.cpp DwarfEHPrepare.cpp + EarlyIfConversion.cpp EdgeBundles.cpp ExecutionDepsFix.cpp ExpandISelPseudos.cpp @@ -30,6 +31,7 @@ add_llvm_library(LLVMCodeGen LiveInterval.cpp LiveIntervalAnalysis.cpp LiveIntervalUnion.cpp + LiveRegMatrix.cpp LiveStackAnalysis.cpp LiveVariables.cpp LiveRangeCalc.cpp @@ -77,8 +79,8 @@ add_llvm_library(LLVMCodeGen RegAllocPBQP.cpp RegisterClassInfo.cpp RegisterCoalescer.cpp + RegisterPressure.cpp RegisterScavenging.cpp - RenderMachineFunction.cpp ScheduleDAG.cpp ScheduleDAGInstrs.cpp ScheduleDAGPrinter.cpp @@ -103,5 +105,7 @@ add_llvm_library(LLVMCodeGen VirtRegMap.cpp ) +add_dependencies(LLVMCodeGen intrinsics_gen) + add_subdirectory(SelectionDAG) add_subdirectory(AsmPrinter) diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp index ea16a25..939af3f 100644 --- a/lib/CodeGen/CalcSpillWeights.cpp +++ b/lib/CodeGen/CalcSpillWeights.cpp @@ -39,18 +39,20 @@ void CalculateSpillWeights::getAnalysisUsage(AnalysisUsage &au) const { MachineFunctionPass::getAnalysisUsage(au); } -bool CalculateSpillWeights::runOnMachineFunction(MachineFunction &fn) { +bool CalculateSpillWeights::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********** Compute 
Spill Weights **********\n" << "********** Function: " - << fn.getFunction()->getName() << '\n'); - - LiveIntervals &lis = getAnalysis<LiveIntervals>(); - VirtRegAuxInfo vrai(fn, lis, getAnalysis<MachineLoopInfo>()); - for (LiveIntervals::iterator I = lis.begin(), E = lis.end(); I != E; ++I) { - LiveInterval &li = *I->second; - if (TargetRegisterInfo::isVirtualRegister(li.reg)) - vrai.CalculateWeightAndHint(li); + << MF.getFunction()->getName() << '\n'); + + LiveIntervals &LIS = getAnalysis<LiveIntervals>(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + VirtRegAuxInfo VRAI(MF, LIS, getAnalysis<MachineLoopInfo>()); + for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + if (MRI.reg_nodbg_empty(Reg)) + continue; + VRAI.CalculateWeightAndHint(LIS.getInterval(Reg)); } return false; } @@ -86,6 +88,27 @@ static unsigned copyHint(const MachineInstr *mi, unsigned reg, return tri.getMatchingSuperReg(hreg, sub, rc); } +// Check if all values in LI are rematerializable +static bool isRematerializable(const LiveInterval &LI, + const LiveIntervals &LIS, + const TargetInstrInfo &TII) { + for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end(); + I != E; ++I) { + const VNInfo *VNI = *I; + if (VNI->isUnused()) + continue; + if (VNI->isPHIDef()) + return false; + + MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def); + assert(MI && "Dead valno in interval"); + + if (!TII.isTriviallyReMaterializable(MI, LIS.getAliasAnalysis())) + return false; + } + return true; +} + void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) { MachineRegisterInfo &mri = MF.getRegInfo(); const TargetRegisterInfo &tri = *MF.getTarget().getRegisterInfo(); @@ -171,17 +194,11 @@ void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) { } // If all of the definitions of the interval are re-materializable, - // it is a preferred candidate for spilling. If none of the defs are - // loads, then it's potentially very cheap to re-materialize. + // it is a preferred candidate for spilling. // FIXME: this gets much more complicated once we support non-trivial // re-materialization. - bool isLoad = false; - if (LIS.isReMaterializable(li, 0, isLoad)) { - if (isLoad) - totalWeight *= 0.9F; - else - totalWeight *= 0.5F; - } + if (isRematerializable(li, LIS, *MF.getTarget().getInstrInfo())) + totalWeight *= 0.5F; li.weight = normalizeSpillWeight(totalWeight, li.getSize()); } diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp index 2b7dfdb..0b747fd 100644 --- a/lib/CodeGen/CallingConvLower.cpp +++ b/lib/CodeGen/CallingConvLower.cpp @@ -49,8 +49,7 @@ void CCState::HandleByVal(unsigned ValNo, MVT ValVT, Size = MinSize; if (MinAlign > (int)Align) Align = MinAlign; - if (MF.getFrameInfo()->getMaxAlignment() < Align) - MF.getFrameInfo()->setMaxAlignment(Align); + MF.getFrameInfo()->ensureMaxAlignment(Align); TM.getTargetLowering()->HandleByVal(this, Size); unsigned Offset = AllocateStack(Size, Align); addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); @@ -58,9 +57,8 @@ void CCState::HandleByVal(unsigned ValNo, MVT ValVT, /// MarkAllocated - Mark a register and all of its aliases as allocated. 
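The new isRematerializable walk above is self-contained enough to restate: every live value number must be defined by an instruction the target can trivially recompute, and PHI-defined values are rejected outright since no single instruction produces them. A mirror of that structure over stub value-number records:

    #include <vector>

    // Stub VNInfo: whether the value is unused, PHI-defined, and whether
    // its defining instruction is trivially rematerializable.
    struct VNInfoStub {
      bool Unused;
      bool PHIDef;
      bool TriviallyRemat;
    };

    static bool isRematerializable(const std::vector<VNInfoStub> &VNIs) {
      for (const VNInfoStub &VNI : VNIs) {
        if (VNI.Unused)
          continue;                  // dead value numbers don't matter
        if (VNI.PHIDef || !VNI.TriviallyRemat)
          return false;
      }
      return true;
    }

If the check passes, the spill weight is simply halved; the old distinction that gave loads a milder 0.9 discount disappears along with LiveIntervals::isReMaterializable.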
void CCState::MarkAllocated(unsigned Reg) { - for (const uint16_t *Alias = TRI.getOverlaps(Reg); - unsigned Reg = *Alias; ++Alias) - UsedRegs[Reg/32] |= 1 << (Reg&31); + for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI) + UsedRegs[*AI/32] |= 1 << (*AI&31); } /// AnalyzeFormalArguments - Analyze an array of argument values, diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp index a81bb5c..fb2c2e8 100644 --- a/lib/CodeGen/CodeGen.cpp +++ b/lib/CodeGen/CodeGen.cpp @@ -23,6 +23,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeCalculateSpillWeightsPass(Registry); initializeCodePlacementOptPass(Registry); initializeDeadMachineInstructionElimPass(Registry); + initializeEarlyIfConverterPass(Registry); initializeExpandPostRAPass(Registry); initializeExpandISelPseudosPass(Registry); initializeFinalizeMachineBundlesPass(Registry); @@ -53,7 +54,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeProcessImplicitDefsPass(Registry); initializePEIPass(Registry); initializeRegisterCoalescerPass(Registry); - initializeRenderMachineFunctionPass(Registry); initializeSlotIndexesPass(Registry); initializeStackProtectorPass(Registry); initializeStackSlotColoringPass(Registry); @@ -65,7 +65,9 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeUnreachableBlockElimPass(Registry); initializeUnreachableMachineBlockElimPass(Registry); initializeVirtRegMapPass(Registry); + initializeVirtRegRewriterPass(Registry); initializeLowerIntrinsicsPass(Registry); + initializeMachineFunctionPrinterPassPass(Registry); } void LLVMInitializeCodeGen(LLVMPassRegistryRef R) { diff --git a/lib/CodeGen/CodePlacementOpt.cpp b/lib/CodeGen/CodePlacementOpt.cpp index c13c05e..99233df 100644 --- a/lib/CodeGen/CodePlacementOpt.cpp +++ b/lib/CodeGen/CodePlacementOpt.cpp @@ -201,7 +201,7 @@ bool CodePlacementOpt::EliminateUnconditionalJumpsToTop(MachineFunction &MF, // fallthrough edge. if (!Prior->isSuccessor(End)) goto next_pred; - // Otherwise we can stop scanning and procede to move the blocks. + // Otherwise we can stop scanning and proceed to move the blocks. break; } // If we hit a switch or something complicated, don't move anything diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp index bad5010..a9de1c7 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -62,17 +62,11 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { // In a return block, examine the function live-out regs. for (MachineRegisterInfo::liveout_iterator I = MRI.liveout_begin(), E = MRI.liveout_end(); I != E; ++I) { - unsigned Reg = *I; - Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); - KillIndices[Reg] = BBSize; - DefIndices[Reg] = ~0u; - - // Repeat, for all aliases. 
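MarkAllocated's bookkeeping, now applied to every alias through the iterator, packs one bit per register into 32-bit words: word index Reg/32, bit index Reg&31. The packing as a standalone sketch:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    static void markAllocated(std::vector<uint32_t> &Used, unsigned Reg) {
      Used[Reg / 32] |= 1u << (Reg & 31);
    }

    static bool isAllocated(const std::vector<uint32_t> &Used, unsigned Reg) {
      return (Used[Reg / 32] >> (Reg & 31)) & 1u;
    }

    int main() {
      std::vector<uint32_t> Used(8, 0);   // room for 256 registers
      markAllocated(Used, 37);            // word 1, bit 5
      assert(isAllocated(Used, 37) && !isAllocated(Used, 36));
      return 0;
    }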
- for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { - unsigned AliasReg = *Alias; - Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1); - KillIndices[AliasReg] = BBSize; - DefIndices[AliasReg] = ~0u; + for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { + unsigned Reg = *AI; + Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); + KillIndices[Reg] = BBSize; + DefIndices[Reg] = ~0u; } } } @@ -84,17 +78,11 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { SE = BB->succ_end(); SI != SE; ++SI) for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(), E = (*SI)->livein_end(); I != E; ++I) { - unsigned Reg = *I; - Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); - KillIndices[Reg] = BBSize; - DefIndices[Reg] = ~0u; - - // Repeat, for all aliases. - for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { - unsigned AliasReg = *Alias; - Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1); - KillIndices[AliasReg] = BBSize; - DefIndices[AliasReg] = ~0u; + for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { + unsigned Reg = *AI; + Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); + KillIndices[Reg] = BBSize; + DefIndices[Reg] = ~0u; } } @@ -104,18 +92,12 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { const MachineFrameInfo *MFI = MF.getFrameInfo(); BitVector Pristine = MFI->getPristineRegs(BB); for (const uint16_t *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) { - unsigned Reg = *I; - if (!IsReturnBlock && !Pristine.test(Reg)) continue; - Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); - KillIndices[Reg] = BBSize; - DefIndices[Reg] = ~0u; - - // Repeat, for all aliases. - for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { - unsigned AliasReg = *Alias; - Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1); - KillIndices[AliasReg] = BBSize; - DefIndices[AliasReg] = ~0u; + if (!IsReturnBlock && !Pristine.test(*I)) continue; + for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { + unsigned Reg = *AI; + Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); + KillIndices[Reg] = BBSize; + DefIndices[Reg] = ~0u; } } } @@ -208,7 +190,7 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) { const TargetRegisterClass *NewRC = 0; if (i < MI->getDesc().getNumOperands()) - NewRC = TII->getRegClass(MI->getDesc(), i, TRI); + NewRC = TII->getRegClass(MI->getDesc(), i, TRI, MF); // For now, only allow the register to be changed if its register // class is consistent across all uses. @@ -218,11 +200,11 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) { Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); // Now check for aliases. - for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { + for (MCRegAliasIterator AI(Reg, TRI, false); AI.isValid(); ++AI) { // If an alias of the reg is used during the live range, give up. // Note that this allows us to skip checking if AntiDepReg // overlaps with any of the aliases, among other things. 
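Throughout CriticalAntiDepBreaker, reinterpret_cast<TargetRegisterClass *>(-1) serves as a sentinel meaning the register's class is inconsistent across uses, so renaming is off the table. The merge rule behind all those assignments, sketched with a stand-in class type:

    struct RCStub {};   // stand-in for TargetRegisterClass

    static RCStub *const ConflictRC = reinterpret_cast<RCStub *>(-1);

    // nullptr = not seen yet; ConflictRC = classes disagree across uses.
    static RCStub *mergeRegClass(RCStub *Seen, RCStub *New) {
      if (!Seen)
        return New;          // first reference decides the class
      if (Seen == New)
        return Seen;         // still consistent
      return ConflictRC;     // mixed classes: never rename this register
    }

    int main() {
      RCStub GPR, FPR;
      RCStub *C = mergeRegClass(nullptr, &GPR);
      C = mergeRegClass(C, &FPR);
      return C == ConflictRC ? 0 : 1;
    }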
- unsigned AliasReg = *Alias; + unsigned AliasReg = *AI; if (Classes[AliasReg]) { Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1); Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); @@ -236,9 +218,8 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) { if (MO.isUse() && Special) { if (!KeepRegs.test(Reg)) { KeepRegs.set(Reg); - for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) - KeepRegs.set(*Subreg); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + KeepRegs.set(*SubRegs); } } } @@ -247,7 +228,7 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) { void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, unsigned Count) { // Update liveness. - // Proceding upwards, registers that are defed but not used in this + // Proceeding upwards, registers that are defed but not used in this // instruction are now dead. if (!TII->isPredicated(MI)) { @@ -282,9 +263,8 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, Classes[Reg] = 0; RegRefs.erase(Reg); // Repeat, for all subregs. - for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) { - unsigned SubregReg = *Subreg; + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + unsigned SubregReg = *SubRegs; DefIndices[SubregReg] = Count; KillIndices[SubregReg] = ~0u; KeepRegs.reset(SubregReg); @@ -292,11 +272,8 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, RegRefs.erase(SubregReg); } // Conservatively mark super-registers as unusable. - for (const uint16_t *Super = TRI->getSuperRegisters(Reg); - *Super; ++Super) { - unsigned SuperReg = *Super; - Classes[SuperReg] = reinterpret_cast<TargetRegisterClass *>(-1); - } + for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR) + Classes[*SR] = reinterpret_cast<TargetRegisterClass *>(-1); } } for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { @@ -308,7 +285,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, const TargetRegisterClass *NewRC = 0; if (i < MI->getDesc().getNumOperands()) - NewRC = TII->getRegClass(MI->getDesc(), i, TRI); + NewRC = TII->getRegClass(MI->getDesc(), i, TRI, MF); // For now, only allow the register to be changed if its register // class is consistent across all uses. @@ -328,8 +305,8 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, "Kill and Def maps aren't consistent for Reg!"); } // Repeat, for all aliases. 
- for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { - unsigned AliasReg = *Alias; + for (MCRegAliasIterator AI(Reg, TRI, false); AI.isValid(); ++AI) { + unsigned AliasReg = *AI; if (KillIndices[AliasReg] == ~0u) { KillIndices[AliasReg] = Count; DefIndices[AliasReg] = ~0u; diff --git a/lib/CodeGen/CriticalAntiDepBreaker.h b/lib/CodeGen/CriticalAntiDepBreaker.h index 7746259..ad95c48 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.h +++ b/lib/CodeGen/CriticalAntiDepBreaker.h @@ -17,11 +17,11 @@ #define LLVM_CODEGEN_CRITICALANTIDEPBREAKER_H #include "AntiDepBreaker.h" -#include "RegisterClassInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/ADT/BitVector.h" #include <map> diff --git a/lib/CodeGen/DFAPacketizer.cpp b/lib/CodeGen/DFAPacketizer.cpp index 5ff641c..ff2f113 100644 --- a/lib/CodeGen/DFAPacketizer.cpp +++ b/lib/CodeGen/DFAPacketizer.cpp @@ -23,10 +23,10 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/CodeGen/DFAPacketizer.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBundle.h" -#include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/MC/MCInstrItineraries.h" using namespace llvm; @@ -100,22 +100,23 @@ void DFAPacketizer::reserveResources(llvm::MachineInstr *MI) { reserveResources(&MID); } -namespace { +namespace llvm { // DefaultVLIWScheduler - This class extends ScheduleDAGInstrs and overrides // Schedule method to build the dependence graph. class DefaultVLIWScheduler : public ScheduleDAGInstrs { public: DefaultVLIWScheduler(MachineFunction &MF, MachineLoopInfo &MLI, - MachineDominatorTree &MDT, bool IsPostRA); + MachineDominatorTree &MDT, bool IsPostRA); // Schedule - Actual scheduling work. void schedule(); }; -} // end anonymous namespace +} DefaultVLIWScheduler::DefaultVLIWScheduler( MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT, bool IsPostRA) : ScheduleDAGInstrs(MF, MLI, MDT, IsPostRA) { + CanHandleTerminators = true; } void DefaultVLIWScheduler::schedule() { @@ -129,49 +130,25 @@ VLIWPacketizerList::VLIWPacketizerList( bool IsPostRA) : TM(MF.getTarget()), MF(MF) { TII = TM.getInstrInfo(); ResourceTracker = TII->CreateTargetScheduleState(&TM, 0); - SchedulerImpl = new DefaultVLIWScheduler(MF, MLI, MDT, IsPostRA); + VLIWScheduler = new DefaultVLIWScheduler(MF, MLI, MDT, IsPostRA); } // VLIWPacketizerList Dtor VLIWPacketizerList::~VLIWPacketizerList() { - delete SchedulerImpl; - delete ResourceTracker; -} - -// ignorePseudoInstruction - ignore pseudo instructions. -bool VLIWPacketizerList::ignorePseudoInstruction(MachineInstr *MI, - MachineBasicBlock *MBB) { - if (MI->isDebugValue()) - return true; - - if (TII->isSchedulingBoundary(MI, MBB, MF)) - return true; - - return false; -} - -// isSoloInstruction - return true if instruction I must end previous -// packet. -bool VLIWPacketizerList::isSoloInstruction(MachineInstr *I) { - if (I->isInlineAsm()) - return true; - - return false; -} + if (VLIWScheduler) + delete VLIWScheduler; -// addToPacket - Add I to the current packet and reserve resource. 
-void VLIWPacketizerList::addToPacket(MachineInstr *MI) { - CurrentPacketMIs.push_back(MI); - ResourceTracker->reserveResources(MI); + if (ResourceTracker) + delete ResourceTracker; } // endPacket - End the current packet, bundle packet instructions and reset // DFA state. void VLIWPacketizerList::endPacket(MachineBasicBlock *MBB, - MachineInstr *I) { + MachineInstr *MI) { if (CurrentPacketMIs.size() > 1) { MachineInstr *MIFirst = CurrentPacketMIs.front(); - finalizeBundle(*MBB, MIFirst, I); + finalizeBundle(*MBB, MIFirst, MI); } CurrentPacketMIs.clear(); ResourceTracker->clearResources(); @@ -181,31 +158,35 @@ void VLIWPacketizerList::endPacket(MachineBasicBlock *MBB, void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB, MachineBasicBlock::iterator BeginItr, MachineBasicBlock::iterator EndItr) { - assert(MBB->end() == EndItr && "Bad EndIndex"); - - SchedulerImpl->enterRegion(MBB, BeginItr, EndItr, MBB->size()); - - // Build the DAG without reordering instructions. - SchedulerImpl->schedule(); - - // Remember scheduling units. - SUnits = SchedulerImpl->SUnits; + assert(VLIWScheduler && "VLIW Scheduler is not initialized!"); + VLIWScheduler->startBlock(MBB); + VLIWScheduler->enterRegion(MBB, BeginItr, EndItr, MBB->size()); + VLIWScheduler->schedule(); + + // Generate MI -> SU map. + MIToSUnit.clear(); + for (unsigned i = 0, e = VLIWScheduler->SUnits.size(); i != e; ++i) { + SUnit *SU = &VLIWScheduler->SUnits[i]; + MIToSUnit[SU->getInstr()] = SU; + } // The main packetizer loop. for (; BeginItr != EndItr; ++BeginItr) { MachineInstr *MI = BeginItr; - // Ignore pseudo instructions. - if (ignorePseudoInstruction(MI, MBB)) - continue; + this->initPacketizerState(); // End the current packet if needed. - if (isSoloInstruction(MI)) { + if (this->isSoloInstruction(MI)) { endPacket(MBB, MI); continue; } - SUnit *SUI = SchedulerImpl->getSUnit(MI); + // Ignore pseudo instructions. + if (this->ignorePseudoInstruction(MI, MBB)) + continue; + + SUnit *SUI = MIToSUnit[MI]; assert(SUI && "Missing SUnit Info!"); // Ask DFA if machine resource is available for MI. @@ -215,13 +196,13 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB, for (std::vector<MachineInstr*>::iterator VI = CurrentPacketMIs.begin(), VE = CurrentPacketMIs.end(); VI != VE; ++VI) { MachineInstr *MJ = *VI; - SUnit *SUJ = SchedulerImpl->getSUnit(MJ); + SUnit *SUJ = MIToSUnit[MJ]; assert(SUJ && "Missing SUnit Info!"); // Is it legal to packetize SUI and SUJ together. - if (!isLegalToPacketizeTogether(SUI, SUJ)) { + if (!this->isLegalToPacketizeTogether(SUI, SUJ)) { // Allow packetization if dependency can be pruned. - if (!isLegalToPruneDependencies(SUI, SUJ)) { + if (!this->isLegalToPruneDependencies(SUI, SUJ)) { // End the packet if dependency cannot be pruned. endPacket(MBB, MI); break; @@ -234,11 +215,11 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB, } // Add MI to the current packet. - addToPacket(MI); + BeginItr = this->addToPacket(MI); } // For all instructions in BB. // End any packet left behind. 
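// Sketch, not from this patch: the rewritten PacketizeMIs loop below drives
// packetization purely through the virtual hooks, so a minimal target
// packetizer only overrides the queries. The class name and the dependence
// test are illustrative; the base constructor arguments are assumed from
// their use in this file.
//
//   class MyTargetPacketizer : public VLIWPacketizerList {
//   public:
//     MyTargetPacketizer(MachineFunction &MF, MachineLoopInfo &MLI,
//                        MachineDominatorTree &MDT, bool IsPostRA)
//       : VLIWPacketizerList(MF, MLI, MDT, IsPostRA) {}
//
//     // Inline asm always terminates the packet being built.
//     virtual bool isSoloInstruction(MachineInstr *MI) {
//       return MI->isInlineAsm();
//     }
//
//     // Debug values and scheduling boundaries take no packet slot.
//     virtual bool ignorePseudoInstruction(MachineInstr *MI,
//                                          MachineBasicBlock *MBB) {
//       return MI->isDebugValue() || TII->isSchedulingBoundary(MI, MBB, MF);
//     }
//
//     // Legal to bundle unless the DAG has a dependence edge SUJ -> SUI.
//     virtual bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
//       return !SUJ->isSucc(SUI);
//     }
//   };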
endPacket(MBB, EndItr); - - SchedulerImpl->exitRegion(); + VLIWScheduler->exitRegion(); + VLIWScheduler->finishBlock(); } diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp index aa10d1d..b4394e8 100644 --- a/lib/CodeGen/DeadMachineInstructionElim.cpp +++ b/lib/CodeGen/DeadMachineInstructionElim.cpp @@ -171,9 +171,8 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) { // Check the subreg set, not the alias set, because a def // of a super-register may still be partially live after // this def. - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - *SubRegs; ++SubRegs) - LivePhysRegs.reset(*SubRegs); + for (MCSubRegIterator SR(Reg, TRI); SR.isValid(); ++SR) + LivePhysRegs.reset(*SR); } } else if (MO.isRegMask()) { // Register mask of preserved registers. All clobbers are dead. @@ -187,10 +186,8 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) { if (MO.isReg() && MO.isUse()) { unsigned Reg = MO.getReg(); if (TargetRegisterInfo::isPhysicalRegister(Reg)) { - LivePhysRegs.set(Reg); - for (const uint16_t *AliasSet = TRI->getAliasSet(Reg); - *AliasSet; ++AliasSet) - LivePhysRegs.set(*AliasSet); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + LivePhysRegs.set(*AI); } } } diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp index 944dd4f..7095624 100644 --- a/lib/CodeGen/DwarfEHPrepare.cpp +++ b/lib/CodeGen/DwarfEHPrepare.cpp @@ -39,7 +39,7 @@ namespace { Constant *RewindFunction; bool InsertUnwindResumeCalls(Function &Fn); - Instruction *GetExceptionObject(ResumeInst *RI); + Value *GetExceptionObject(ResumeInst *RI); public: static char ID; // Pass identification, replacement for typeid. @@ -68,9 +68,9 @@ FunctionPass *llvm::createDwarfEHPass(const TargetMachine *tm) { /// GetExceptionObject - Return the exception object from the value passed into /// the 'resume' instruction (typically an aggregate). Clean up any dead /// instructions, including the 'resume' instruction. -Instruction *DwarfEHPrepare::GetExceptionObject(ResumeInst *RI) { +Value *DwarfEHPrepare::GetExceptionObject(ResumeInst *RI) { Value *V = RI->getOperand(0); - Instruction *ExnObj = 0; + Value *ExnObj = 0; InsertValueInst *SelIVI = dyn_cast<InsertValueInst>(V); LoadInst *SelLoad = 0; InsertValueInst *ExcIVI = 0; @@ -81,7 +81,7 @@ Instruction *DwarfEHPrepare::GetExceptionObject(ResumeInst *RI) { ExcIVI = dyn_cast<InsertValueInst>(SelIVI->getOperand(0)); if (ExcIVI && isa<UndefValue>(ExcIVI->getOperand(0)) && ExcIVI->getNumIndices() == 1 && *ExcIVI->idx_begin() == 0) { - ExnObj = cast<Instruction>(ExcIVI->getOperand(1)); + ExnObj = ExcIVI->getOperand(1); SelLoad = dyn_cast<LoadInst>(SelIVI->getOperand(1)); EraseIVIs = true; } @@ -139,7 +139,7 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) { // _Unwind_Resume to the end of the single resume block. ResumeInst *RI = Resumes.front(); BasicBlock *UnwindBB = RI->getParent(); - Instruction *ExnObj = GetExceptionObject(RI); + Value *ExnObj = GetExceptionObject(RI); // Call the _Unwind_Resume function. 
CallInst *CI = CallInst::Create(RewindFunction, ExnObj, "", UnwindBB);
@@ -162,7 +162,7 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) {
 BasicBlock *Parent = RI->getParent();
 BranchInst::Create(UnwindBB, Parent);
- Instruction *ExnObj = GetExceptionObject(RI);
+ Value *ExnObj = GetExceptionObject(RI);
 PN->addIncoming(ExnObj, Parent);
 ++NumResumesLowered;
diff --git a/lib/CodeGen/EarlyIfConversion.cpp b/lib/CodeGen/EarlyIfConversion.cpp
new file mode 100644
index 0000000..9840a40
--- /dev/null
+++ b/lib/CodeGen/EarlyIfConversion.cpp
@@ -0,0 +1,618 @@
+//===-- EarlyIfConversion.cpp - If-conversion on SSA form machine code ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Early if-conversion is for out-of-order CPUs that don't have a lot of
+// predicable instructions. The goal is to eliminate conditional branches that
+// may mispredict.
+//
+// Instructions from both sides of the branch are executed speculatively, and a
+// cmov instruction selects the result.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "early-ifcvt"
+#include "llvm/Function.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+// Absolute maximum number of instructions allowed per speculated block.
+// This bypasses all other heuristics, so it should be set fairly high.
+static cl::opt<unsigned>
+BlockInstrLimit("early-ifcvt-limit", cl::init(30), cl::Hidden,
+ cl::desc("Maximum number of instructions per speculated block."));
+
+// Stress testing mode - disable heuristics.
+static cl::opt<bool> Stress("stress-early-ifcvt", cl::Hidden,
+ cl::desc("Turn all knobs to 11"));
+
+typedef SmallSetVector<MachineBasicBlock*, 8> BlockSetVector;
+
+//===----------------------------------------------------------------------===//
+// SSAIfConv
+//===----------------------------------------------------------------------===//
+//
+// The SSAIfConv class performs if-conversion on SSA form machine code after
+// determining if it is possible. The class contains no heuristics; external
+// code should be used to determine when if-conversion is a good idea.
+//
+// SSAIfConv can convert both triangles and diamonds:
+//
+// Triangle: Head Diamond: Head
+// | \ / \_
+// | \ / |
+// | [TF]BB FBB TBB
+// | / \ /
+// | / \ /
+// Tail Tail
+//
+// Instructions in the conditional blocks TBB and/or FBB are spliced into the
+// Head block, and phis in the Tail block are converted to select instructions.
+//
+namespace {
+class SSAIfConv {
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+
+public:
+ /// The block containing the conditional branch.
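// For a concrete picture of the diamond above, this is roughly the source
// pattern the pass targets (illustrative C++ only, not part of the patch):
//
//   int pick(int a, int b, bool c) {
//     int x;
//     if (c)         // Head: compare and conditional branch
//       x = a + 1;   // TBB
//     else
//       x = b - 1;   // FBB
//     return x;      // Tail: phi(x) becomes a select, then a cmov
//   }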
+ MachineBasicBlock *Head;
+
+ /// The block containing phis after the if-then-else.
+ MachineBasicBlock *Tail;
+
+ /// The 'true' conditional block as determined by AnalyzeBranch.
+ MachineBasicBlock *TBB;
+
+ /// The 'false' conditional block as determined by AnalyzeBranch.
+ MachineBasicBlock *FBB;
+
+ /// isTriangle - When there is no 'else' block, either TBB or FBB will be
+ /// equal to Tail.
+ bool isTriangle() const { return TBB == Tail || FBB == Tail; }
+
+ /// Information about each phi in the Tail block.
+ struct PHIInfo {
+ MachineInstr *PHI;
+ unsigned TReg, FReg;
+ // Latencies from Cond+Branch, TReg, and FReg to DstReg.
+ int CondCycles, TCycles, FCycles;
+
+ PHIInfo(MachineInstr *phi)
+ : PHI(phi), TReg(0), FReg(0), CondCycles(0), TCycles(0), FCycles(0) {}
+ };
+
+ SmallVector<PHIInfo, 8> PHIs;
+
+private:
+ /// The branch condition determined by AnalyzeBranch.
+ SmallVector<MachineOperand, 4> Cond;
+
+ /// Instructions in Head that define values used by the conditional blocks.
+ /// The hoisted instructions must be inserted after these instructions.
+ SmallPtrSet<MachineInstr*, 8> InsertAfter;
+
+ /// Register units clobbered by the conditional blocks.
+ BitVector ClobberedRegUnits;
+
+ // Scratch pad for findInsertionPoint.
+ SparseSet<unsigned> LiveRegUnits;
+
+ /// Insertion point in Head for speculatively executed instructions from TBB
+ /// and FBB.
+ MachineBasicBlock::iterator InsertionPoint;
+
+ /// Return true if all non-terminator instructions in MBB can be safely
+ /// speculated.
+ bool canSpeculateInstrs(MachineBasicBlock *MBB);
+
+ /// Find a valid insertion point in Head.
+ bool findInsertionPoint();
+
+public:
+ /// runOnMachineFunction - Initialize per-function data structures.
+ void runOnMachineFunction(MachineFunction &MF) {
+ TII = MF.getTarget().getInstrInfo();
+ TRI = MF.getTarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ LiveRegUnits.clear();
+ LiveRegUnits.setUniverse(TRI->getNumRegUnits());
+ ClobberedRegUnits.clear();
+ ClobberedRegUnits.resize(TRI->getNumRegUnits());
+ }
+
+ /// canConvertIf - If the sub-CFG headed by MBB can be if-converted,
+ /// initialize the internal state, and return true.
+ bool canConvertIf(MachineBasicBlock *MBB);
+
+ /// convertIf - If-convert the last block passed to canConvertIf(), assuming
+ /// it is possible. Add any erased blocks to RemovedBlocks.
+ void convertIf(SmallVectorImpl<MachineBasicBlock*> &RemovedBlocks);
+};
+} // end anonymous namespace
+
+
+/// canSpeculateInstrs - Returns true if all the instructions in MBB can safely
+/// be speculated. The terminators are not considered.
+///
+/// If instructions use any values that are defined in the head basic block,
+/// the defining instructions are added to InsertAfter.
+///
+/// Any clobbered regunits are added to ClobberedRegUnits.
+///
+bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) {
+ // Reject any live-in physregs. It's probably CPSR/EFLAGS, and very hard to
+ // get right.
+ if (!MBB->livein_empty()) {
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has live-ins.\n");
+ return false;
+ }
+
+ unsigned InstrCount = 0;
+
+ // Check all instructions, except the terminators. It is assumed that
+ // terminators never have side effects or define any used register values.
+ for (MachineBasicBlock::iterator I = MBB->begin(),
+ E = MBB->getFirstTerminator(); I != E; ++I) {
+ if (I->isDebugValue())
+ continue;
+
+ if (++InstrCount > BlockInstrLimit && !Stress) {
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has more than "
+ << BlockInstrLimit << " instructions.\n");
+ return false;
+ }
+
+ // There shouldn't normally be any phis in a single-predecessor block.
+ if (I->isPHI()) {
+ DEBUG(dbgs() << "Can't hoist: " << *I);
+ return false;
+ }
+
+ // Don't speculate loads. Note that it may be possible and desirable to
+ // speculate GOT or constant pool loads that are guaranteed not to trap,
+ // but we don't support that for now.
+ if (I->mayLoad()) {
+ DEBUG(dbgs() << "Won't speculate load: " << *I);
+ return false;
+ }
+
+ // We never speculate stores, so an AA pointer isn't necessary.
+ bool DontMoveAcrossStore = true;
+ if (!I->isSafeToMove(TII, 0, DontMoveAcrossStore)) {
+ DEBUG(dbgs() << "Can't speculate: " << *I);
+ return false;
+ }
+
+ // Check for any dependencies on Head instructions.
+ for (MIOperands MO(I); MO.isValid(); ++MO) {
+ if (MO->isRegMask()) {
+ DEBUG(dbgs() << "Won't speculate regmask: " << *I);
+ return false;
+ }
+ if (!MO->isReg())
+ continue;
+ unsigned Reg = MO->getReg();
+
+ // Remember clobbered regunits.
+ if (MO->isDef() && TargetRegisterInfo::isPhysicalRegister(Reg))
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+ ClobberedRegUnits.set(*Units);
+
+ if (!MO->readsReg() || !TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ MachineInstr *DefMI = MRI->getVRegDef(Reg);
+ if (!DefMI || DefMI->getParent() != Head)
+ continue;
+ if (InsertAfter.insert(DefMI))
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << " depends on " << *DefMI);
+ if (DefMI->isTerminator()) {
+ DEBUG(dbgs() << "Can't insert instructions below terminator.\n");
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+
+/// Find an insertion point in Head for the speculated instructions. The
+/// insertion point must be:
+///
+/// 1. Before any terminators.
+/// 2. After any instructions in InsertAfter.
+/// 3. Not have any clobbered regunits live.
+///
+/// This function sets InsertionPoint and returns true when successful; it
+/// returns false if no valid insertion point could be found.
+///
+bool SSAIfConv::findInsertionPoint() {
+ // Keep track of live regunits before the current position.
+ // Only track RegUnits that are also in ClobberedRegUnits.
+ LiveRegUnits.clear();
+ SmallVector<unsigned, 8> Reads;
+ MachineBasicBlock::iterator FirstTerm = Head->getFirstTerminator();
+ MachineBasicBlock::iterator I = Head->end();
+ MachineBasicBlock::iterator B = Head->begin();
+ while (I != B) {
+ --I;
+ // Some of the conditional code depends on I.
+ if (InsertAfter.count(I)) {
+ DEBUG(dbgs() << "Can't insert code after " << *I);
+ return false;
+ }
+
+ // Update live regunits.
+ for (MIOperands MO(I); MO.isValid(); ++MO) {
+ // We're ignoring regmask operands. That is conservatively correct.
+ if (!MO->isReg())
+ continue;
+ unsigned Reg = MO->getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ // I clobbers Reg, so it isn't live before I.
+ if (MO->isDef())
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+ LiveRegUnits.erase(*Units);
+ // Unless I reads Reg.
+ if (MO->readsReg())
+ Reads.push_back(Reg);
+ }
+ // Anything read by I is live before I.
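// The loop below drains Reads into LiveRegUnits; the whole scan is ordinary
// backward liveness restricted to the clobbered units. Stripped of the LLVM
// types, the invariant looks like this self-contained sketch (Instr and its
// fields are hypothetical):
//
//   #include <set>
//   #include <vector>
//   struct Instr { std::vector<unsigned> defs, uses; };
//
//   // Walk the block bottom-up, maintaining the set of registers live
//   // before the current position: defs kill liveness, uses create it.
//   std::set<unsigned> liveBefore(const std::vector<Instr> &Block) {
//     std::set<unsigned> Live;
//     for (auto I = Block.rbegin(), E = Block.rend(); I != E; ++I) {
//       for (unsigned R : I->defs) Live.erase(R);  // clobbered above I
//       for (unsigned R : I->uses) Live.insert(R); // read, so live above I
//     }
//     return Live;
//   }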
+ while (!Reads.empty()) + for (MCRegUnitIterator Units(Reads.pop_back_val(), TRI); Units.isValid(); + ++Units) + if (ClobberedRegUnits.test(*Units)) + LiveRegUnits.insert(*Units); + + // We can't insert before a terminator. + if (I != FirstTerm && I->isTerminator()) + continue; + + // Some of the clobbered registers are live before I, not a valid insertion + // point. + if (!LiveRegUnits.empty()) { + DEBUG({ + dbgs() << "Would clobber"; + for (SparseSet<unsigned>::const_iterator + i = LiveRegUnits.begin(), e = LiveRegUnits.end(); i != e; ++i) + dbgs() << ' ' << PrintRegUnit(*i, TRI); + dbgs() << " live before " << *I; + }); + continue; + } + + // This is a valid insertion point. + InsertionPoint = I; + DEBUG(dbgs() << "Can insert before " << *I); + return true; + } + DEBUG(dbgs() << "No legal insertion point found.\n"); + return false; +} + + + +/// canConvertIf - analyze the sub-cfg rooted in MBB, and return true if it is +/// a potential candidate for if-conversion. Fill out the internal state. +/// +bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB) { + Head = MBB; + TBB = FBB = Tail = 0; + + if (Head->succ_size() != 2) + return false; + MachineBasicBlock *Succ0 = Head->succ_begin()[0]; + MachineBasicBlock *Succ1 = Head->succ_begin()[1]; + + // Canonicalize so Succ0 has MBB as its single predecessor. + if (Succ0->pred_size() != 1) + std::swap(Succ0, Succ1); + + if (Succ0->pred_size() != 1 || Succ0->succ_size() != 1) + return false; + + // We could support additional Tail predecessors by updating phis instead of + // eliminating them. Let's see an example where it matters first. + Tail = Succ0->succ_begin()[0]; + if (Tail->pred_size() != 2) + return false; + + // This is not a triangle. + if (Tail != Succ1) { + // Check for a diamond. We won't deal with any critical edges. + if (Succ1->pred_size() != 1 || Succ1->succ_size() != 1 || + Succ1->succ_begin()[0] != Tail) + return false; + DEBUG(dbgs() << "\nDiamond: BB#" << Head->getNumber() + << " -> BB#" << Succ0->getNumber() + << "/BB#" << Succ1->getNumber() + << " -> BB#" << Tail->getNumber() << '\n'); + + // Live-in physregs are tricky to get right when speculating code. + if (!Tail->livein_empty()) { + DEBUG(dbgs() << "Tail has live-ins.\n"); + return false; + } + } else { + DEBUG(dbgs() << "\nTriangle: BB#" << Head->getNumber() + << " -> BB#" << Succ0->getNumber() + << " -> BB#" << Tail->getNumber() << '\n'); + } + + // This is a triangle or a diamond. + // If Tail doesn't have any phis, there must be side effects. + if (Tail->empty() || !Tail->front().isPHI()) { + DEBUG(dbgs() << "No phis in tail.\n"); + return false; + } + + // The branch we're looking to eliminate must be analyzable. + Cond.clear(); + if (TII->AnalyzeBranch(*Head, TBB, FBB, Cond)) { + DEBUG(dbgs() << "Branch not analyzable.\n"); + return false; + } + + // This is weird, probably some sort of degenerate CFG. + if (!TBB) { + DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch.\n"); + return false; + } + + // AnalyzeBranch doesn't set FBB on a fall-through branch. + // Make sure it is always set. + FBB = TBB == Succ0 ? Succ1 : Succ0; + + // Any phis in the tail block must be convertible to selects. + PHIs.clear(); + MachineBasicBlock *TPred = TBB == Tail ? Head : TBB; + MachineBasicBlock *FPred = FBB == Tail ? Head : FBB; + for (MachineBasicBlock::iterator I = Tail->begin(), E = Tail->end(); + I != E && I->isPHI(); ++I) { + PHIs.push_back(&*I); + PHIInfo &PI = PHIs.back(); + // Find PHI operands corresponding to TPred and FPred. 
+ for (unsigned i = 1; i != PI.PHI->getNumOperands(); i += 2) { + if (PI.PHI->getOperand(i+1).getMBB() == TPred) + PI.TReg = PI.PHI->getOperand(i).getReg(); + if (PI.PHI->getOperand(i+1).getMBB() == FPred) + PI.FReg = PI.PHI->getOperand(i).getReg(); + } + assert(TargetRegisterInfo::isVirtualRegister(PI.TReg) && "Bad PHI"); + assert(TargetRegisterInfo::isVirtualRegister(PI.FReg) && "Bad PHI"); + + // Get target information. + if (!TII->canInsertSelect(*Head, Cond, PI.TReg, PI.FReg, + PI.CondCycles, PI.TCycles, PI.FCycles)) { + DEBUG(dbgs() << "Can't convert: " << *PI.PHI); + return false; + } + } + + // Check that the conditional instructions can be speculated. + InsertAfter.clear(); + ClobberedRegUnits.reset(); + if (TBB != Tail && !canSpeculateInstrs(TBB)) + return false; + if (FBB != Tail && !canSpeculateInstrs(FBB)) + return false; + + // Try to find a valid insertion point for the speculated instructions in the + // head basic block. + if (!findInsertionPoint()) + return false; + + return true; +} + + +/// convertIf - Execute the if conversion after canConvertIf has determined the +/// feasibility. +/// +/// Any basic blocks erased will be added to RemovedBlocks. +/// +void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock*> &RemovedBlocks) { + assert(Head && Tail && TBB && FBB && "Call canConvertIf first."); + + // Move all instructions into Head, except for the terminators. + if (TBB != Tail) + Head->splice(InsertionPoint, TBB, TBB->begin(), TBB->getFirstTerminator()); + if (FBB != Tail) + Head->splice(InsertionPoint, FBB, FBB->begin(), FBB->getFirstTerminator()); + + MachineBasicBlock::iterator FirstTerm = Head->getFirstTerminator(); + assert(FirstTerm != Head->end() && "No terminators"); + DebugLoc HeadDL = FirstTerm->getDebugLoc(); + + // Convert all PHIs to select instructions inserted before FirstTerm. + for (unsigned i = 0, e = PHIs.size(); i != e; ++i) { + PHIInfo &PI = PHIs[i]; + DEBUG(dbgs() << "If-converting " << *PI.PHI); + assert(PI.PHI->getNumOperands() == 5 && "Unexpected PHI operands."); + unsigned DstReg = PI.PHI->getOperand(0).getReg(); + TII->insertSelect(*Head, FirstTerm, HeadDL, DstReg, Cond, PI.TReg, PI.FReg); + DEBUG(dbgs() << " --> " << *llvm::prior(FirstTerm)); + PI.PHI->eraseFromParent(); + PI.PHI = 0; + } + + // Fix up the CFG, temporarily leave Head without any successors. + Head->removeSuccessor(TBB); + Head->removeSuccessor(FBB); + if (TBB != Tail) + TBB->removeSuccessor(Tail); + if (FBB != Tail) + FBB->removeSuccessor(Tail); + + // Fix up Head's terminators. + // It should become a single branch or a fallthrough. + TII->RemoveBranch(*Head); + + // Erase the now empty conditional blocks. It is likely that Head can fall + // through to Tail, and we can join the two blocks. + if (TBB != Tail) { + RemovedBlocks.push_back(TBB); + TBB->eraseFromParent(); + } + if (FBB != Tail) { + RemovedBlocks.push_back(FBB); + FBB->eraseFromParent(); + } + + assert(Head->succ_empty() && "Additional head successors?"); + if (Head->isLayoutSuccessor(Tail)) { + // Splice Tail onto the end of Head. + DEBUG(dbgs() << "Joining tail BB#" << Tail->getNumber() + << " into head BB#" << Head->getNumber() << '\n'); + Head->splice(Head->end(), Tail, + Tail->begin(), Tail->end()); + Head->transferSuccessorsAndUpdatePHIs(Tail); + RemovedBlocks.push_back(Tail); + Tail->eraseFromParent(); + } else { + // We need a branch to Tail, let code placement work it out later. 
+ DEBUG(dbgs() << "Converting to unconditional branch.\n"); + SmallVector<MachineOperand, 0> EmptyCond; + TII->InsertBranch(*Head, Tail, 0, EmptyCond, HeadDL); + Head->addSuccessor(Tail); + } + DEBUG(dbgs() << *Head); +} + + +//===----------------------------------------------------------------------===// +// EarlyIfConverter Pass +//===----------------------------------------------------------------------===// + +namespace { +class EarlyIfConverter : public MachineFunctionPass { + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineRegisterInfo *MRI; + MachineDominatorTree *DomTree; + MachineLoopInfo *Loops; + SSAIfConv IfConv; + +public: + static char ID; + EarlyIfConverter() : MachineFunctionPass(ID) {} + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnMachineFunction(MachineFunction &MF); + +private: + bool tryConvertIf(MachineBasicBlock*); + void updateDomTree(ArrayRef<MachineBasicBlock*> Removed); + void updateLoops(ArrayRef<MachineBasicBlock*> Removed); +}; +} // end anonymous namespace + +char EarlyIfConverter::ID = 0; +char &llvm::EarlyIfConverterID = EarlyIfConverter::ID; + +INITIALIZE_PASS_BEGIN(EarlyIfConverter, + "early-ifcvt", "Early If Converter", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(EarlyIfConverter, + "early-ifcvt", "Early If Converter", false, false) + +void EarlyIfConverter::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineBranchProbabilityInfo>(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +/// Update the dominator tree after if-conversion erased some blocks. +void EarlyIfConverter::updateDomTree(ArrayRef<MachineBasicBlock*> Removed) { + // convertIf can remove TBB, FBB, and Tail can be merged into Head. + // TBB and FBB should not dominate any blocks. + // Tail children should be transferred to Head. + MachineDomTreeNode *HeadNode = DomTree->getNode(IfConv.Head); + for (unsigned i = 0, e = Removed.size(); i != e; ++i) { + MachineDomTreeNode *Node = DomTree->getNode(Removed[i]); + assert(Node != HeadNode && "Cannot erase the head node"); + while (Node->getNumChildren()) { + assert(Node->getBlock() == IfConv.Tail && "Unexpected children"); + DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode); + } + DomTree->eraseNode(Removed[i]); + } +} + +/// Update LoopInfo after if-conversion. +void EarlyIfConverter::updateLoops(ArrayRef<MachineBasicBlock*> Removed) { + if (!Loops) + return; + // If-conversion doesn't change loop structure, and it doesn't mess with back + // edges, so updating LoopInfo is simply removing the dead blocks. + for (unsigned i = 0, e = Removed.size(); i != e; ++i) + Loops->removeBlock(Removed[i]); +} + +/// Attempt repeated if-conversion on MBB, return true if successful. +/// +bool EarlyIfConverter::tryConvertIf(MachineBasicBlock *MBB) { + bool Changed = false; + while (IfConv.canConvertIf(MBB)) { + // If-convert MBB and update analyses. 
+ SmallVector<MachineBasicBlock*, 4> RemovedBlocks;
+ IfConv.convertIf(RemovedBlocks);
+ Changed = true;
+ updateDomTree(RemovedBlocks);
+ updateLoops(RemovedBlocks);
+ }
+ return Changed;
+}
+
+bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "********** EARLY IF-CONVERSION **********\n"
+ << "********** Function: "
+ << ((Value*)MF.getFunction())->getName() << '\n');
+ TII = MF.getTarget().getInstrInfo();
+ TRI = MF.getTarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+ Loops = getAnalysisIfAvailable<MachineLoopInfo>();
+
+ bool Changed = false;
+ IfConv.runOnMachineFunction(MF);
+
+ // Visit blocks in dominator tree post-order. The post-order enables nested
+ // if-conversion in a single pass. The tryConvertIf() function may erase
+ // blocks, but only blocks dominated by the head block. This makes it safe to
+ // update the dominator tree while the post-order iterator is still active.
+ for (po_iterator<MachineDominatorTree*>
+ I = po_begin(DomTree), E = po_end(DomTree); I != E; ++I)
+ if (tryConvertIf(I->getBlock()))
+ Changed = true;
+
+ MF.verify(this, "After early if-conversion");
+ return Changed;
+}
diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index a48c540..fee8e47 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -59,7 +59,7 @@ struct DomainValue {
 // Pointer to the next DomainValue in a chain. When two DomainValues are
 // merged, Victim.Next is set to point to Victor, so old DomainValue
- // references can be updated by folowing the chain.
+ // references can be updated by following the chain.
 DomainValue *Next;
 // Twiddleable instructions using or defining these registers.
@@ -666,7 +666,8 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
 // or -1.
 AliasMap.resize(TRI->getNumRegs(), -1);
 for (unsigned i = 0, e = RC->getNumRegs(); i != e; ++i)
- for (const uint16_t *AI = TRI->getOverlaps(RC->getRegister(i)); *AI; ++AI)
+ for (MCRegAliasIterator AI(RC->getRegister(i), TRI, true);
+ AI.isValid(); ++AI)
 AliasMap[*AI] = i;
 }
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index 75ae5b9..4214ba1 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -18,6 +18,7 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
@@ -155,7 +156,9 @@ namespace {
 const TargetRegisterInfo *TRI;
 const InstrItineraryData *InstrItins;
 const MachineBranchProbabilityInfo *MBPI;
+ MachineRegisterInfo *MRI;
+ bool PreRegAlloc;
 bool MadeChange;
 int FnNum;
 public:
@@ -263,14 +266,20 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
 TII = MF.getTarget().getInstrInfo();
 TRI = MF.getTarget().getRegisterInfo();
 MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+ MRI = &MF.getRegInfo();
 InstrItins = MF.getTarget().getInstrItineraryData();
 if (!TII) return false;
- // Tail merge tends to expose more if-conversion opportunities.
- BranchFolder BF(true, false);
- bool BFChange = BF.OptimizeFunction(MF, TII,
+ PreRegAlloc = MRI->isSSA();
+
+ bool BFChange = false;
+ if (!PreRegAlloc) {
+ // Tail merge tends to expose more if-conversion opportunities.
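// The gate above is the whole mechanism: the pass now infers its position in
// the pipeline from whether the function is still in SSA form, instead of
// from a constructor flag. A sketch of the pattern (runTailMerge() is a
// placeholder for the BranchFolder invocation below, not a real API):
//
//   bool PreRegAlloc = MRI->isSSA(); // regalloc rewrites vregs, ending SSA
//   if (!PreRegAlloc)
//     runTailMerge(); // tail merging only happens in the post-RA position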
+ BranchFolder BF(true, false);
+ BFChange = BF.OptimizeFunction(MF, TII,
 MF.getTarget().getRegisterInfo(),
 getAnalysisIfAvailable<MachineModuleInfo>());
+ }
 DEBUG(dbgs() << "\nIfcvt: function (" << ++FnNum << ") \'"
 << MF.getFunction()->getName() << "\'");
@@ -621,7 +630,7 @@ void IfConverter::ScanInstructions(BBInfo &BBI) {
 if (BBI.IsDone)
 return;
- bool AlreadyPredicated = BBI.Predicate.size() > 0;
+ bool AlreadyPredicated = !BBI.Predicate.empty();
 // First analyze the end of BB branches.
 BBI.TrueBB = BBI.FalseBB = NULL;
 BBI.BrCond.clear();
@@ -786,8 +795,8 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB,
 unsigned Dups = 0;
 unsigned Dups2 = 0;
- bool TNeedSub = TrueBBI.Predicate.size() > 0;
- bool FNeedSub = FalseBBI.Predicate.size() > 0;
+ bool TNeedSub = !TrueBBI.Predicate.empty();
+ bool FNeedSub = !FalseBBI.Predicate.empty();
 bool Enqueued = false;
 BranchProbability Prediction = MBPI->getEdgeProbability(BB, TrueBBI.BB);
@@ -962,9 +971,8 @@ static void InitPredRedefs(MachineBasicBlock *BB, SmallSet<unsigned,4> &Redefs,
 E = BB->livein_end(); I != E; ++I) {
 unsigned Reg = *I;
 Redefs.insert(Reg);
- for (const uint16_t *Subreg = TRI->getSubRegisters(Reg);
- *Subreg; ++Subreg)
- Redefs.insert(*Subreg);
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ Redefs.insert(*SubRegs);
 }
 }
@@ -983,8 +991,8 @@ static void UpdatePredRedefs(MachineInstr *MI, SmallSet<unsigned,4> &Redefs,
 Defs.push_back(Reg);
 else if (MO.isKill()) {
 Redefs.erase(Reg);
- for (const uint16_t *SR = TRI->getSubRegisters(Reg); *SR; ++SR)
- Redefs.erase(*SR);
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ Redefs.erase(*SubRegs);
 }
 }
 for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
@@ -993,11 +1001,12 @@ static void UpdatePredRedefs(MachineInstr *MI, SmallSet<unsigned,4> &Redefs,
 if (AddImpUse)
 // Treat predicated update as read + write.
 MI->addOperand(MachineOperand::CreateReg(Reg, false/*IsDef*/,
- true/*IsImp*/,false/*IsKill*/));
+ true/*IsImp*/,false/*IsKill*/,
+ false/*IsDead*/,true/*IsUndef*/));
 } else {
 Redefs.insert(Reg);
- for (const uint16_t *SR = TRI->getSubRegisters(Reg); *SR; ++SR)
- Redefs.insert(*SR);
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ Redefs.insert(*SubRegs);
 }
 }
 }
@@ -1335,8 +1344,8 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
 // These are defined before ctrl flow reaches the 'false' instructions.
 // They cannot be modified by the 'true' instructions.
ExtUses.insert(Reg); - for (const uint16_t *SR = TRI->getSubRegisters(Reg); *SR; ++SR) - ExtUses.insert(*SR); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + ExtUses.insert(*SubRegs); } } @@ -1344,8 +1353,8 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind, unsigned Reg = Defs[i]; if (!ExtUses.count(Reg)) { RedefsByFalse.insert(Reg); - for (const uint16_t *SR = TRI->getSubRegisters(Reg); *SR; ++SR) - RedefsByFalse.insert(*SR); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + RedefsByFalse.insert(*SubRegs); } } } diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp index d5ea666..07e37af 100644 --- a/lib/CodeGen/InlineSpiller.cpp +++ b/lib/CodeGen/InlineSpiller.cpp @@ -52,7 +52,6 @@ static cl::opt<bool> DisableHoisting("disable-spill-hoist", cl::Hidden, namespace { class InlineSpiller : public Spiller { - MachineFunctionPass &Pass; MachineFunction &MF; LiveIntervals &LIS; LiveStacks &LSS; @@ -137,8 +136,7 @@ public: InlineSpiller(MachineFunctionPass &pass, MachineFunction &mf, VirtRegMap &vrm) - : Pass(pass), - MF(mf), + : MF(mf), LIS(pass.getAnalysis<LiveIntervals>()), LSS(pass.getAnalysis<LiveStacks>()), AA(&pass.getAnalysis<AliasAnalysis>()), @@ -578,11 +576,11 @@ MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI, if (unsigned SrcReg = isFullCopyOf(MI, Reg)) { if (isSibling(SrcReg)) { LiveInterval &SrcLI = LIS.getInterval(SrcReg); - LiveRange *SrcLR = SrcLI.getLiveRangeContaining(VNI->def.getRegSlot(true)); - assert(SrcLR && "Copy from non-existing value"); + LiveRangeQuery SrcQ(SrcLI, VNI->def); + assert(SrcQ.valueIn() && "Copy from non-existing value"); // Check if this COPY kills its source. - SVI->second.KillsSource = (SrcLR->end == VNI->def); - VNInfo *SrcVNI = SrcLR->valno; + SVI->second.KillsSource = SrcQ.isKill(); + VNInfo *SrcVNI = SrcQ.valueIn(); DEBUG(dbgs() << "copy of " << PrintReg(SrcReg) << ':' << SrcVNI->id << '@' << SrcVNI->def << " kill=" << unsigned(SVI->second.KillsSource) << '\n'); @@ -1083,6 +1081,10 @@ void InlineSpiller::insertReload(LiveInterval &NewLI, MRI.getRegClass(NewLI.reg), &TRI); --MI; // Point to load instruction. SlotIndex LoadIdx = LIS.InsertMachineInstrInMaps(MI).getRegSlot(); + // Some (out-of-tree) targets have EC reload instructions. 
+ if (MachineOperand *MO = MI->findRegisterDefOperand(NewLI.reg)) + if (MO->isEarlyClobber()) + LoadIdx = LoadIdx.getRegSlot(true); DEBUG(dbgs() << "\treload: " << LoadIdx << '\t' << *MI); VNInfo *LoadVNI = NewLI.getNextValue(LoadIdx, LIS.getVNInfoAllocator()); NewLI.addRange(LiveRange(LoadIdx, Idx, LoadVNI)); @@ -1275,8 +1277,8 @@ void InlineSpiller::spill(LiveRangeEdit &edit) { DEBUG(dbgs() << "Inline spilling " << MRI.getRegClass(edit.getReg())->getName() - << ':' << edit.getParent() << "\nFrom original " - << LIS.getInterval(Original) << '\n'); + << ':' << PrintReg(edit.getReg()) << ' ' << edit.getParent() + << "\nFrom original " << LIS.getInterval(Original) << '\n'); assert(edit.getParent().isSpillable() && "Attempting to spill already spilled value."); assert(DeadDefs.empty() && "Previous spill didn't remove dead defs"); diff --git a/lib/CodeGen/InterferenceCache.cpp b/lib/CodeGen/InterferenceCache.cpp index 8368b58..1541bf0 100644 --- a/lib/CodeGen/InterferenceCache.cpp +++ b/lib/CodeGen/InterferenceCache.cpp @@ -39,7 +39,7 @@ InterferenceCache::Entry *InterferenceCache::get(unsigned PhysReg) { unsigned E = PhysRegEntries[PhysReg]; if (E < CacheEntries && Entries[E].getPhysReg() == PhysReg) { if (!Entries[E].valid(LIUArray, TRI)) - Entries[E].revalidate(); + Entries[E].revalidate(LIUArray, TRI); return &Entries[E]; } // No valid entry exists, pick the next round-robin entry. @@ -61,13 +61,15 @@ InterferenceCache::Entry *InterferenceCache::get(unsigned PhysReg) { } /// revalidate - LIU contents have changed, update tags. -void InterferenceCache::Entry::revalidate() { +void InterferenceCache::Entry::revalidate(LiveIntervalUnion *LIUArray, + const TargetRegisterInfo *TRI) { // Invalidate all block entries. ++Tag; // Invalidate all iterators. PrevPos = SlotIndex(); - for (unsigned i = 0, e = Aliases.size(); i != e; ++i) - Aliases[i].second = Aliases[i].first->getTag(); + unsigned i = 0; + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units, ++i) + RegUnits[i].VirtTag = LIUArray[*Units].getTag(); } void InterferenceCache::Entry::reset(unsigned physReg, @@ -79,28 +81,23 @@ void InterferenceCache::Entry::reset(unsigned physReg, ++Tag; PhysReg = physReg; Blocks.resize(MF->getNumBlockIDs()); - Aliases.clear(); - for (const uint16_t *AS = TRI->getOverlaps(PhysReg); *AS; ++AS) { - LiveIntervalUnion *LIU = LIUArray + *AS; - Aliases.push_back(std::make_pair(LIU, LIU->getTag())); - } // Reset iterators. PrevPos = SlotIndex(); - unsigned e = Aliases.size(); - Iters.resize(e); - for (unsigned i = 0; i != e; ++i) - Iters[i].setMap(Aliases[i].first->getMap()); + RegUnits.clear(); + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + RegUnits.push_back(LIUArray[*Units]); + RegUnits.back().Fixed = &LIS->getRegUnit(*Units); + } } bool InterferenceCache::Entry::valid(LiveIntervalUnion *LIUArray, const TargetRegisterInfo *TRI) { - unsigned i = 0, e = Aliases.size(); - for (const uint16_t *AS = TRI->getOverlaps(PhysReg); *AS; ++AS, ++i) { - LiveIntervalUnion *LIU = LIUArray + *AS; - if (i == e || Aliases[i].first != LIU) + unsigned i = 0, e = RegUnits.size(); + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units, ++i) { + if (i == e) return false; - if (LIU->changedSince(Aliases[i].second)) + if (LIUArray[*Units].changedSince(RegUnits[i].VirtTag)) return false; } return i == e; @@ -112,12 +109,20 @@ void InterferenceCache::Entry::update(unsigned MBBNum) { // Use advanceTo only when possible. 
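// The comment above names a deliberate pattern: a cached iterator into a
// sorted sequence is advanced cheaply when queries move forward, and only a
// backwards query pays for a fresh binary search. A generic sketch of the
// same idea (Cursor is hypothetical, not an LLVM type):
//
//   #include <algorithm>
//   #include <vector>
//   struct Cursor {
//     const std::vector<int> *Seq;         // sorted sequence
//     std::vector<int>::const_iterator It; // cached position
//     int PrevPos = -1;                    // last query, -1 = none yet
//
//     // Position It at the first element >= Pos.
//     void moveTo(int Pos) {
//       if (PrevPos < 0 || Pos < PrevPos)  // cold start or moving backward
//         It = std::lower_bound(Seq->begin(), Seq->end(), Pos); // find()
//       else                               // moving forward: reuse It
//         It = std::lower_bound(It, Seq->end(), Pos);  // advanceTo()
//       PrevPos = Pos;
//     }
//   };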
if (PrevPos != Start) { - if (!PrevPos.isValid() || Start < PrevPos) - for (unsigned i = 0, e = Iters.size(); i != e; ++i) - Iters[i].find(Start); - else - for (unsigned i = 0, e = Iters.size(); i != e; ++i) - Iters[i].advanceTo(Start); + if (!PrevPos.isValid() || Start < PrevPos) { + for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) { + RegUnitInfo &RUI = RegUnits[i]; + RUI.VirtI.find(Start); + RUI.FixedI = RUI.Fixed->find(Start); + } + } else { + for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) { + RegUnitInfo &RUI = RegUnits[i]; + RUI.VirtI.advanceTo(Start); + if (RUI.FixedI != RUI.Fixed->end()) + RUI.FixedI = RUI.Fixed->advanceTo(RUI.FixedI, Start); + } + } PrevPos = Start; } @@ -129,9 +134,9 @@ void InterferenceCache::Entry::update(unsigned MBBNum) { BI->Tag = Tag; BI->First = BI->Last = SlotIndex(); - // Check for first interference. - for (unsigned i = 0, e = Iters.size(); i != e; ++i) { - Iter &I = Iters[i]; + // Check for first interference from virtregs. + for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) { + LiveIntervalUnion::SegmentIter &I = RegUnits[i].VirtI; if (!I.valid()) continue; SlotIndex StartI = I.start(); @@ -141,6 +146,19 @@ void InterferenceCache::Entry::update(unsigned MBBNum) { BI->First = StartI; } + // Same thing for fixed interference. + for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) { + LiveInterval::const_iterator I = RegUnits[i].FixedI; + LiveInterval::const_iterator E = RegUnits[i].Fixed->end(); + if (I == E) + continue; + SlotIndex StartI = I->start; + if (StartI >= Stop) + continue; + if (!BI->First.isValid() || StartI < BI->First) + BI->First = StartI; + } + // Also check for register mask interference. RegMaskSlots = LIS->getRegMaskSlotsInBlock(MBBNum); RegMaskBits = LIS->getRegMaskBitsInBlock(MBBNum); @@ -168,8 +186,8 @@ void InterferenceCache::Entry::update(unsigned MBBNum) { } // Check for last interference in block. - for (unsigned i = 0, e = Iters.size(); i != e; ++i) { - Iter &I = Iters[i]; + for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) { + LiveIntervalUnion::SegmentIter &I = RegUnits[i].VirtI; if (!I.valid() || I.start() >= Stop) continue; I.advanceTo(Stop); @@ -183,6 +201,23 @@ void InterferenceCache::Entry::update(unsigned MBBNum) { ++I; } + // Fixed interference. + for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) { + LiveInterval::iterator &I = RegUnits[i].FixedI; + LiveInterval *LI = RegUnits[i].Fixed; + if (I == LI->end() || I->start >= Stop) + continue; + I = LI->advanceTo(I, Stop); + bool Backup = I == LI->end() || I->start >= Stop; + if (Backup) + --I; + SlotIndex StopI = I->end; + if (!BI->Last.isValid() || StopI > BI->Last) + BI->Last = StopI; + if (Backup) + ++I; + } + // Also check for register mask interference. SlotIndex Limit = BI->Last.isValid() ? BI->Last : Start; for (unsigned i = RegMaskSlots.size(); diff --git a/lib/CodeGen/InterferenceCache.h b/lib/CodeGen/InterferenceCache.h index 485a325..3c928a5 100644 --- a/lib/CodeGen/InterferenceCache.h +++ b/lib/CodeGen/InterferenceCache.h @@ -7,7 +7,8 @@ // //===----------------------------------------------------------------------===// // -// InterferenceCache remembers per-block interference in LiveIntervalUnions. +// InterferenceCache remembers per-block interference from LiveIntervalUnions, +// fixed RegUnit interference, and register masks. // //===----------------------------------------------------------------------===// @@ -59,14 +60,31 @@ class InterferenceCache { /// PrevPos - The previous position the iterators were moved to. 
SlotIndex PrevPos;
- /// AliasTags - A LiveIntervalUnion pointer and tag for each alias of
- /// PhysReg.
- SmallVector<std::pair<LiveIntervalUnion*, unsigned>, 8> Aliases;
+ /// RegUnitInfo - Information tracked about each RegUnit in PhysReg.
+ /// When PrevPos is set, the iterators are valid as if advanceTo(PrevPos)
+ /// had just been called.
+ struct RegUnitInfo {
+ /// Iterator pointing into the LiveIntervalUnion containing virtual
+ /// register interference.
+ LiveIntervalUnion::SegmentIter VirtI;
- typedef LiveIntervalUnion::SegmentIter Iter;
+ /// Tag of the LIU last time we looked.
+ unsigned VirtTag;
- /// Iters - an iterator for each alias
- SmallVector<Iter, 8> Iters;
+ /// Fixed interference in RegUnit.
+ LiveInterval *Fixed;
+
+ /// Iterator pointing into the fixed RegUnit interference.
+ LiveInterval::iterator FixedI;
+
+ RegUnitInfo(LiveIntervalUnion &LIU) : VirtTag(LIU.getTag()), Fixed(0) {
+ VirtI.setMap(LIU.getMap());
+ }
+ };
+
+ /// Info for each RegUnit in PhysReg. It is very rare for a PhysReg to have
+ /// more than 4 RegUnits.
+ SmallVector<RegUnitInfo, 4> RegUnits;
 /// Blocks - Interference for each block in the function.
 SmallVector<BlockInterference, 8> Blocks;
@@ -91,7 +109,7 @@ class InterferenceCache {
 bool hasRefs() const { return RefCount > 0; }
- void revalidate();
+ void revalidate(LiveIntervalUnion *LIUArray, const TargetRegisterInfo *TRI);
 /// valid - Return true if this is a valid entry for physReg.
 bool valid(LiveIntervalUnion *LIUArray, const TargetRegisterInfo *TRI);
diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp
index a9ca42f..8d2282a 100644
--- a/lib/CodeGen/IntrinsicLowering.cpp
+++ b/lib/CodeGen/IntrinsicLowering.cpp
@@ -11,17 +11,17 @@
 //
 //===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/IntrinsicLowering.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
+#include "llvm/IRBuilder.h"
 #include "llvm/Module.h"
 #include "llvm/Type.h"
-#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/IRBuilder.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetData.h"
-#include "llvm/ADT/SmallVector.h"
 using namespace llvm;
 template <class ArgIt>
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index a1f479a..cac0c83 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -13,6 +13,7 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/PassManager.h"
+#include "llvm/Assembly/PrintModulePass.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
@@ -78,40 +79,15 @@ LLVMTargetMachine::LLVMTargetMachine(const Target &T, StringRef Triple,
 "and that InitializeAllTargetMCs() is being invoked!");
 }
-/// Turn exception handling constructs into something the code generators can
-/// handle.
-static void addPassesToHandleExceptions(TargetMachine *TM,
- PassManagerBase &PM) {
- switch (TM->getMCAsmInfo()->getExceptionHandlingType()) {
- case ExceptionHandling::SjLj:
- // SjLj piggy-backs on dwarf for this bit. The cleanups done apply to both
- // Dwarf EH prepare needs to be run after SjLj prepare. Otherwise,
- // catch info can get misplaced when a selector ends up more than one block
- // removed from the parent invoke(s).
This could happen when a landing
- // pad is shared by multiple invokes and is also a target of a normal
- // edge from elsewhere.
- PM.add(createSjLjEHPreparePass(TM->getTargetLowering()));
- // FALLTHROUGH
- case ExceptionHandling::DwarfCFI:
- case ExceptionHandling::ARM:
- case ExceptionHandling::Win64:
- PM.add(createDwarfEHPass(TM));
- break;
- case ExceptionHandling::None:
- PM.add(createLowerInvokePass(TM->getTargetLowering()));
-
- // The lower invoke pass may create unreachable code. Remove it.
- PM.add(createUnreachableBlockEliminationPass());
- break;
- }
-}
-
 /// addPassesToX helper drives creation and initialization of TargetPassConfig.
 static MCContext *addPassesToGenerateCode(LLVMTargetMachine *TM,
 PassManagerBase &PM,
- bool DisableVerify) {
+ bool DisableVerify,
+ AnalysisID StartAfter,
+ AnalysisID StopAfter) {
 // Targets may override createPassConfig to provide a target-specific subclass.
 TargetPassConfig *PassConfig = TM->createPassConfig(PM);
+ PassConfig->setStartStopPasses(StartAfter, StopAfter);
 // Set PassConfig options provided by TargetMachine.
 PassConfig->setDisableVerify(DisableVerify);
@@ -120,7 +96,7 @@ static MCContext *addPassesToGenerateCode(LLVMTargetMachine *TM,
 PassConfig->addIRPasses();
- addPassesToHandleExceptions(TM, PM);
+ PassConfig->addPassesToHandleExceptions();
 PassConfig->addISelPrepare();
@@ -155,16 +131,30 @@ static MCContext *addPassesToGenerateCode(LLVMTargetMachine *TM,
 bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
 formatted_raw_ostream &Out,
 CodeGenFileType FileType,
- bool DisableVerify) {
+ bool DisableVerify,
+ AnalysisID StartAfter,
+ AnalysisID StopAfter) {
 // Add common CodeGen passes.
- MCContext *Context = addPassesToGenerateCode(this, PM, DisableVerify);
+ MCContext *Context = addPassesToGenerateCode(this, PM, DisableVerify,
+ StartAfter, StopAfter);
 if (!Context)
 return true;
+ if (StopAfter) {
+ // FIXME: The intent is that this should eventually write out a YAML file,
+ // containing the LLVM IR, the machine-level IR (when stopping after a
+ // machine-level pass), and whatever other information is needed to
+ // deserialize the code and resume compilation. For now, just write the
+ // LLVM IR.
+ PM.add(createPrintModulePass(&Out));
+ return false;
+ }
+
 if (hasMCSaveTempLabels())
 Context->setAllowTemporaryLabels(false);
 const MCAsmInfo &MAI = *getMCAsmInfo();
+ const MCRegisterInfo &MRI = *getRegisterInfo();
 const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
 OwningPtr<MCStreamer> AsmStreamer;
@@ -180,7 +170,8 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
 MCAsmBackend *MAB = 0;
 if (ShowMCEncoding) {
 const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
- MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), STI, *Context);
+ MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI, STI,
+ *Context);
 MAB = getTarget().createMCAsmBackend(getTargetTriple());
 }
@@ -198,8 +189,8 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
 case CGFT_ObjectFile: {
 // Create the code emitter for the target if it exists. If not, .o file
 // emission fails.
- MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), STI, - *Context); + MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI, + STI, *Context); MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple()); if (MCE == 0 || MAB == 0) return true; @@ -242,7 +233,7 @@ bool LLVMTargetMachine::addPassesToEmitMachineCode(PassManagerBase &PM, JITCodeEmitter &JCE, bool DisableVerify) { // Add common CodeGen passes. - MCContext *Context = addPassesToGenerateCode(this, PM, DisableVerify); + MCContext *Context = addPassesToGenerateCode(this, PM, DisableVerify, 0, 0); if (!Context) return true; @@ -262,7 +253,7 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, raw_ostream &Out, bool DisableVerify) { // Add common CodeGen passes. - Ctx = addPassesToGenerateCode(this, PM, DisableVerify); + Ctx = addPassesToGenerateCode(this, PM, DisableVerify, 0, 0); if (!Ctx) return true; @@ -271,9 +262,10 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, // Create the code emitter for the target if it exists. If not, .o file // emission fails. + const MCRegisterInfo &MRI = *getRegisterInfo(); const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>(); - MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(),STI, - *Ctx); + MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI, + STI, *Ctx); MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple()); if (MCE == 0 || MAB == 0) return true; diff --git a/lib/CodeGen/LexicalScopes.cpp b/lib/CodeGen/LexicalScopes.cpp index f1abcbb..6b6b9d0 100644 --- a/lib/CodeGen/LexicalScopes.cpp +++ b/lib/CodeGen/LexicalScopes.cpp @@ -16,8 +16,8 @@ #define DEBUG_TYPE "lexicalscopes" #include "llvm/CodeGen/LexicalScopes.h" +#include "llvm/DebugInfo.h" #include "llvm/Function.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/Support/Debug.h" diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp index 2187833..d631726 100644 --- a/lib/CodeGen/LiveDebugVariables.cpp +++ b/lib/CodeGen/LiveDebugVariables.cpp @@ -23,9 +23,9 @@ #include "LiveDebugVariables.h" #include "VirtRegMap.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/Metadata.h" #include "llvm/Value.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/ADT/IntervalMap.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LexicalScopes.h" @@ -243,7 +243,7 @@ public: /// computeIntervals - Compute the live intervals of all locations after /// collecting all their def points. - void computeIntervals(MachineRegisterInfo &MRI, + void computeIntervals(MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, LiveIntervals &LIS, MachineDominatorTree &MDT, UserValueScopes &UVS); @@ -618,6 +618,7 @@ UserValue::addDefsFromCopies(LiveInterval *LI, unsigned LocNo, void UserValue::computeIntervals(MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, LiveIntervals &LIS, MachineDominatorTree &MDT, UserValueScopes &UVS) { @@ -634,15 +635,32 @@ UserValue::computeIntervals(MachineRegisterInfo &MRI, unsigned LocNo = Defs[i].second; const MachineOperand &Loc = locations[LocNo]; + if (!Loc.isReg()) { + extendDef(Idx, LocNo, 0, 0, 0, LIS, MDT, UVS); + continue; + } + // Register locations are constrained to where the register value is live. 
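// The hunk below splits the old single code path in two: virtual registers
// keep using their own LiveInterval, while a physical register is now
// approximated by the live range of its first register unit. Regunits are
// the aliasing-free atoms shared by overlapping registers, so dereferencing
// a freshly built iterator is the idiom for "first unit of Reg" (sketch,
// names as used in the hunk):
//
//   unsigned Unit = *MCRegUnitIterator(Loc.getReg(), &TRI); // first unit
//   LiveInterval *LI = &LIS.getRegUnit(Unit);               // its live range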
- if (Loc.isReg() && LIS.hasInterval(Loc.getReg())) { - LiveInterval *LI = &LIS.getInterval(Loc.getReg()); - const VNInfo *VNI = LI->getVNInfoAt(Idx); + if (TargetRegisterInfo::isVirtualRegister(Loc.getReg())) { + LiveInterval *LI = 0; + const VNInfo *VNI = 0; + if (LIS.hasInterval(Loc.getReg())) { + LI = &LIS.getInterval(Loc.getReg()); + VNI = LI->getVNInfoAt(Idx); + } SmallVector<SlotIndex, 16> Kills; extendDef(Idx, LocNo, LI, VNI, &Kills, LIS, MDT, UVS); - addDefsFromCopies(LI, LocNo, Kills, Defs, MRI, LIS); - } else - extendDef(Idx, LocNo, 0, 0, 0, LIS, MDT, UVS); + if (LI) + addDefsFromCopies(LI, LocNo, Kills, Defs, MRI, LIS); + continue; + } + + // For physregs, use the live range of the first regunit as a guide. + unsigned Unit = *MCRegUnitIterator(Loc.getReg(), &TRI); + LiveInterval *LI = &LIS.getRegUnit(Unit); + const VNInfo *VNI = LI->getVNInfoAt(Idx); + // Don't track copies from physregs, it is too expensive. + extendDef(Idx, LocNo, LI, VNI, 0, LIS, MDT, UVS); } // Finally, erase all the undefs. @@ -656,7 +674,7 @@ UserValue::computeIntervals(MachineRegisterInfo &MRI, void LDVImpl::computeIntervals() { for (unsigned i = 0, e = userValues.size(); i != e; ++i) { UserValueScopes UVS(userValues[i]->getDebugLoc(), LS); - userValues[i]->computeIntervals(MF->getRegInfo(), *LIS, *MDT, UVS); + userValues[i]->computeIntervals(MF->getRegInfo(), *TRI, *LIS, *MDT, UVS); userValues[i]->mapVirtRegs(this); } } @@ -721,7 +739,8 @@ renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx) { if (TargetRegisterInfo::isVirtualRegister(NewReg)) mapVirtReg(NewReg, UV); - virtRegToEqClass.erase(OldReg); + if (OldReg != NewReg) + virtRegToEqClass.erase(OldReg); do { UV->renameRegister(OldReg, NewReg, SubIdx, TRI); diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp index ac18843..01077db 100644 --- a/lib/CodeGen/LiveInterval.cpp +++ b/lib/CodeGen/LiveInterval.cpp @@ -48,6 +48,26 @@ LiveInterval::iterator LiveInterval::find(SlotIndex Pos) { return I; } +VNInfo *LiveInterval::createDeadDef(SlotIndex Def, + VNInfo::Allocator &VNInfoAllocator) { + assert(!Def.isDead() && "Cannot define a value at the dead slot"); + iterator I = find(Def); + if (I == end()) { + VNInfo *VNI = getNextValue(Def, VNInfoAllocator); + ranges.push_back(LiveRange(Def, Def.getDeadSlot(), VNI)); + return VNI; + } + if (SlotIndex::isSameInstr(Def, I->start)) { + assert(I->start == Def && "Cannot insert def, already live"); + assert(I->valno->def == Def && "Inconsistent existing value def"); + return I->valno; + } + assert(SlotIndex::isEarlierInstr(Def, I->start) && "Already live at def"); + VNInfo *VNI = getNextValue(Def, VNInfoAllocator); + ranges.insert(I, LiveRange(Def, Def.getDeadSlot(), VNI)); + return VNI; +} + /// killedInRange - Return true if the interval has kills in [Start,End). bool LiveInterval::killedInRange(SlotIndex Start, SlotIndex End) const { Ranges::const_iterator r = @@ -176,16 +196,16 @@ void LiveInterval::extendIntervalEndTo(Ranges::iterator I, SlotIndex NewEnd) { // If NewEnd was in the middle of an interval, make sure to get its endpoint. I->end = std::max(NewEnd, prior(MergeTo)->end); - // Erase any dead ranges. - ranges.erase(llvm::next(I), MergeTo); - // If the newly formed range now touches the range after it and if they have // the same value number, merge the two ranges into one range. 
- Ranges::iterator Next = llvm::next(I);
- if (Next != ranges.end() && Next->start <= I->end && Next->valno == ValNo) {
- I->end = Next->end;
- ranges.erase(Next);
+ if (MergeTo != ranges.end() && MergeTo->start <= I->end &&
+ MergeTo->valno == ValNo) {
+ I->end = MergeTo->end;
+ ++MergeTo;
 }
+
+ // Erase any dead ranges.
+ ranges.erase(llvm::next(I), MergeTo);
 }
@@ -353,18 +373,6 @@ void LiveInterval::removeValNo(VNInfo *ValNo) {
 markValNoForDeletion(ValNo);
 }
-/// findDefinedVNInfo - Find the VNInfo defined by the specified
-/// index (register interval).
-VNInfo *LiveInterval::findDefinedVNInfoForRegInt(SlotIndex Idx) const {
- for (LiveInterval::const_vni_iterator i = vni_begin(), e = vni_end();
- i != e; ++i) {
- if ((*i)->def == Idx)
- return *i;
- }
-
- return 0;
-}
-
 /// join - Join two live intervals (this, and other) together. This applies
 /// mappings to the value numbers in the LHS/RHS intervals as specified. If
 /// the intervals are not joinable, this aborts.
@@ -373,6 +381,8 @@ void LiveInterval::join(LiveInterval &Other,
 const int *RHSValNoAssignments,
 SmallVector<VNInfo*, 16> &NewVNInfo,
 MachineRegisterInfo *MRI) {
+ verify();
+
 // Determine if any of our live range values are mapped. This is uncommon, so
 // we want to avoid the interval scan if not.
 bool MustMapCurValNos = false;
@@ -440,16 +450,148 @@ void LiveInterval::join(LiveInterval &Other,
 valnos.resize(NumNewVals); // shrinkify
 // Okay, now insert the RHS live ranges into the LHS.
- iterator InsertPos = begin();
 unsigned RangeNo = 0;
 for (iterator I = Other.begin(), E = Other.end(); I != E; ++I, ++RangeNo) {
 // Map the valno in the other live range to the current live range.
 I->valno = NewVNInfo[OtherAssignments[RangeNo]];
 assert(I->valno && "Adding a dead range?");
- InsertPos = addRangeFrom(*I, InsertPos);
+ }
+ mergeIntervalRanges(Other);
+
+ verify();
+}
+
+/// \brief Helper function for merging in another LiveInterval's ranges.
+///
+/// This is a helper routine implementing an efficient merge of another
+/// LiveInterval's ranges into the current interval.
+///
+/// \param LHSValNo If non-NULL, set as the new value number for every range
+/// from RHS which is merged into the LHS.
+/// \param RHSValNo If non-NULL, then only ranges in RHS whose original value
+/// number matches this value number will be merged into LHS.
+void LiveInterval::mergeIntervalRanges(const LiveInterval &RHS,
+ VNInfo *LHSValNo,
+ const VNInfo *RHSValNo) {
+ if (RHS.empty())
+ return;
+
+ // Ensure we're starting with a valid range. Note that we don't verify RHS
+ // because it may have had its value numbers adjusted in preparation for
+ // merging.
+ verify();
+
+ // The strategy for merging these efficiently is as follows:
+ //
+ // 1) Find the beginning of the impacted ranges in the LHS.
+ // 2) Create a new, merged sub-sequence of ranges merging from the position in
+ // #1 until either LHS or RHS is exhausted. Any part of LHS between RHS
+ // entries being merged will be copied into this new range.
+ // 3) Replace the relevant section in LHS with these newly merged ranges.
+ // 4) Append any remaining ranges from RHS if LHS is exhausted in #2.
+ //
+ // We don't follow the typical in-place merge strategy for sorted ranges of
+ // appending the new ranges to the back and then using std::inplace_merge
+ // because one step of the merge can both mutate the original elements and
+ // remove elements from the original. Essentially, because the merge includes
+ // collapsing overlapping ranges, a more complex approach is required.
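// Stripped of value numbers, that strategy reads as the following
// self-contained editorial sketch (half-open [start,end) ranges; step 3 is
// simplified to rebuild the whole tail rather than only the impacted middle):
//
//   #include <algorithm>
//   #include <utility>
//   #include <vector>
//   typedef std::pair<int, int> Range; // half-open [start, end)
//
//   void mergeRanges(std::vector<Range> &LHS, const std::vector<Range> &RHS) {
//     if (RHS.empty()) return;
//     // 1) Binary-search for the first LHS range that can interact with
//     //    RHS, backing up one so the preceding range can be extended.
//     std::vector<Range>::iterator LI =
//         std::upper_bound(LHS.begin(), LHS.end(), RHS.front());
//     if (LI != LHS.begin()) --LI;
//     std::vector<Range>::iterator ReplaceBegin = LI, LE = LHS.end();
//     std::vector<Range>::const_iterator RI = RHS.begin();
//     // 2) Merge both sequences into NewRanges, coalescing ranges that
//     //    touch or overlap.
//     std::vector<Range> NewRanges;
//     auto push = [&NewRanges](const Range &R) {
//       if (!NewRanges.empty() && R.first <= NewRanges.back().second)
//         NewRanges.back().second = std::max(NewRanges.back().second,
//                                            R.second);
//       else
//         NewRanges.push_back(R);
//     };
//     while (LI != LE && RI != RHS.end())
//       push(*RI < *LI ? *RI++ : *LI++); // earliest start goes first
//     // 4) Append whatever survives of either side.
//     while (LI != LE) push(*LI++);
//     while (RI != RHS.end()) push(*RI++);
//     // 3) Splice the merged subsequence over the region it replaces.
//     LHS.erase(ReplaceBegin, LHS.end());
//     LHS.insert(LHS.end(), NewRanges.begin(), NewRanges.end());
//   }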
+ + // We do an initial binary search to optimize for a common pattern: a large + // LHS, and a very small RHS. + const_iterator RI = RHS.begin(), RE = RHS.end(); + iterator LE = end(), LI = std::upper_bound(begin(), LE, *RI); + + // Merge into NewRanges until one of the ranges is exhausted. + SmallVector<LiveRange, 4> NewRanges; + + // Keep track of where to begin the replacement. + iterator ReplaceI = LI; + + // If there are preceding ranges in the LHS, put the last one into NewRanges + // so we can optionally extend it. Adjust the replacement point accordingly. + if (LI != begin()) { + ReplaceI = llvm::prior(LI); + NewRanges.push_back(*ReplaceI); + } + + // Now loop over the mergable portions of both LHS and RHS, merging into + // NewRanges. + while (LI != LE && RI != RE) { + // Skip incoming ranges with the wrong value. + if (RHSValNo && RI->valno != RHSValNo) { + ++RI; + continue; + } + + // Select the first range. We pick the earliest start point, and then the + // largest range. + LiveRange R = *LI; + if (*RI < R) { + R = *RI; + ++RI; + if (LHSValNo) + R.valno = LHSValNo; + } else { + ++LI; + } + + if (NewRanges.empty()) { + NewRanges.push_back(R); + continue; + } + + LiveRange &LastR = NewRanges.back(); + if (R.valno == LastR.valno) { + // Try to merge this range into the last one. + if (R.start <= LastR.end) { + LastR.end = std::max(LastR.end, R.end); + continue; + } + } else { + // We can't merge ranges across a value number. + assert(R.start >= LastR.end && + "Cannot overlap two LiveRanges with differing ValID's"); + } + + // If all else fails, just append the range. + NewRanges.push_back(R); + } + assert(RI == RE || LI == LE); + + // Check for being able to merge into the trailing sequence of ranges on the LHS. + if (!NewRanges.empty()) + for (; LI != LE && (LI->valno == NewRanges.back().valno && + LI->start <= NewRanges.back().end); + ++LI) + NewRanges.back().end = std::max(NewRanges.back().end, LI->end); + + // Replace the ranges in the LHS with the newly merged ones. It would be + // really nice if there were a move-supporting 'replace' directly in + // SmallVector, but as there is not, we pay the price of copies to avoid + // wasted memory allocations. + SmallVectorImpl<LiveRange>::iterator NRI = NewRanges.begin(), + NRE = NewRanges.end(); + for (; ReplaceI != LI && NRI != NRE; ++ReplaceI, ++NRI) + *ReplaceI = *NRI; + if (NRI == NRE) + ranges.erase(ReplaceI, LI); + else + ranges.insert(LI, NRI, NRE); + + // And finally insert any trailing end of RHS (if we have one). + for (; RI != RE; ++RI) { + LiveRange R = *RI; + if (LHSValNo) + R.valno = LHSValNo; + if (!ranges.empty() && + ranges.back().valno == R.valno && R.start <= ranges.back().end) + ranges.back().end = std::max(ranges.back().end, R.end); + else + ranges.push_back(R); } - ComputeJoinedWeight(Other); + // Ensure we finished with a valid new sequence of ranges. + verify(); } /// MergeRangesInAsValue - Merge all of the intervals in RHS into this live @@ -458,38 +600,20 @@ void LiveInterval::join(LiveInterval &Other, /// the overlapping LiveRanges have the specified value number. void LiveInterval::MergeRangesInAsValue(const LiveInterval &RHS, VNInfo *LHSValNo) { - // TODO: Make this more efficient. - iterator InsertPos = begin(); - for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I) { - // Map the valno in the other live range to the current live range. 
-    LiveRange Tmp = *I;
-    Tmp.valno = LHSValNo;
-    InsertPos = addRangeFrom(Tmp, InsertPos);
-  }
+  mergeIntervalRanges(RHS, LHSValNo);
 }
 
-
 /// MergeValueInAsValue - Merge all of the live ranges of a specific val#
 /// in RHS into this live interval as the specified value number.
 /// The LiveRanges in RHS are allowed to overlap with LiveRanges in the
 /// current interval; it will replace the value numbers of the overlapped
 /// live ranges with the specified value number.
-void LiveInterval::MergeValueInAsValue(
-                                    const LiveInterval &RHS,
-                                    const VNInfo *RHSValNo, VNInfo *LHSValNo) {
-  // TODO: Make this more efficient.
-  iterator InsertPos = begin();
-  for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I) {
-    if (I->valno != RHSValNo)
-      continue;
-    // Map the valno in the other live range to the current live range.
-    LiveRange Tmp = *I;
-    Tmp.valno = LHSValNo;
-    InsertPos = addRangeFrom(Tmp, InsertPos);
-  }
+void LiveInterval::MergeValueInAsValue(const LiveInterval &RHS,
+                                       const VNInfo *RHSValNo,
+                                       VNInfo *LHSValNo) {
+  mergeIntervalRanges(RHS, LHSValNo, RHSValNo);
 }
 
-
 /// MergeValueNumberInto - This method is called when two value numbers
 /// are found to be equivalent.  This eliminates V1, replacing all
 /// LiveRanges with the V1 value number with the V2 value number.  This can
@@ -569,6 +693,8 @@ void LiveInterval::Copy(const LiveInterval &RHS,
     const LiveRange &LR = RHS.ranges[i];
     addRange(LiveRange(LR.start, LR.end, getValNumInfo(LR.valno->id)));
   }
+
+  verify();
 }
 
 unsigned LiveInterval::getSize() const {
@@ -578,29 +704,6 @@ unsigned LiveInterval::getSize() const {
   return Sum;
 }
 
-/// ComputeJoinedWeight - Set the weight of a live interval Joined
-/// after Other has been merged into it.
-void LiveInterval::ComputeJoinedWeight(const LiveInterval &Other) {
-  // If either of these intervals was spilled, the weight is the
-  // weight of the non-spilled interval.  This can only happen with
-  // iterative coalescers.
- - if (Other.weight != HUGE_VALF) { - weight += Other.weight; - } - else if (weight == HUGE_VALF && - !TargetRegisterInfo::isPhysicalRegister(reg)) { - // Remove this assert if you have an iterative coalescer - assert(0 && "Joining to spilled interval"); - weight = Other.weight; - } - else { - // Otherwise the weight stays the same - // Remove this assert if you have an iterative coalescer - assert(0 && "Joining from spilled interval"); - } -} - raw_ostream& llvm::operator<<(raw_ostream& os, const LiveRange &LR) { return os << '[' << LR.start << ',' << LR.end << ':' << LR.valno->id << ")"; } @@ -609,15 +712,10 @@ void LiveRange::dump() const { dbgs() << *this << "\n"; } -void LiveInterval::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const { - OS << PrintReg(reg, TRI); - if (weight != 0) - OS << ',' << weight; - +void LiveInterval::print(raw_ostream &OS) const { if (empty()) - OS << " EMPTY"; + OS << "EMPTY"; else { - OS << " = "; for (LiveInterval::Ranges::const_iterator I = ranges.begin(), E = ranges.end(); I != E; ++I) { OS << *I; @@ -651,6 +749,23 @@ void LiveInterval::dump() const { dbgs() << *this << "\n"; } +#ifndef NDEBUG +void LiveInterval::verify() const { + for (const_iterator I = begin(), E = end(); I != E; ++I) { + assert(I->start.isValid()); + assert(I->end.isValid()); + assert(I->start < I->end); + assert(I->valno != 0); + assert(I->valno == valnos[I->valno->id]); + if (llvm::next(I) != E) { + assert(I->end <= llvm::next(I)->start); + if (I->end == llvm::next(I)->start) + assert(I->valno != llvm::next(I)->valno); + } + } +} +#endif + void LiveRange::print(raw_ostream &os) const { os << *this; @@ -718,7 +833,10 @@ void ConnectedVNInfoEqClasses::Distribute(LiveInterval *LIV[], SlotIndex Idx = LIS.getInstructionIndex(MI); Idx = Idx.getRegSlot(MO.isUse()); const VNInfo *VNI = LI.getVNInfoAt(Idx); - assert(VNI && "Interval not live at use."); + // FIXME: We should be able to assert(VNI) here, but the coalescer leaves + // dangling defs around. + if (!VNI) + continue; MO.setReg(LIV[getEqClass(VNI)]->reg); } diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp index 934cc12..819707f 100644 --- a/lib/CodeGen/LiveIntervalAnalysis.cpp +++ b/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -20,30 +20,24 @@ #include "llvm/Value.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" +#include "LiveRangeCalc.h" #include <algorithm> #include <limits> #include <cmath> using namespace llvm; -// Hidden options for help debugging. 
-static cl::opt<bool> DisableReMat("disable-rematerialization", - cl::init(false), cl::Hidden); - -STATISTIC(numIntervals , "Number of original intervals"); - char LiveIntervals::ID = 0; INITIALIZE_PASS_BEGIN(LiveIntervals, "liveintervals", "Live Interval Analysis", false, false) @@ -61,23 +55,35 @@ void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<LiveVariables>(); AU.addPreserved<LiveVariables>(); AU.addPreservedID(MachineLoopInfoID); + AU.addRequiredTransitiveID(MachineDominatorsID); AU.addPreservedID(MachineDominatorsID); AU.addPreserved<SlotIndexes>(); AU.addRequiredTransitive<SlotIndexes>(); MachineFunctionPass::getAnalysisUsage(AU); } +LiveIntervals::LiveIntervals() : MachineFunctionPass(ID), + DomTree(0), LRCalc(0) { + initializeLiveIntervalsPass(*PassRegistry::getPassRegistry()); +} + +LiveIntervals::~LiveIntervals() { + delete LRCalc; +} + void LiveIntervals::releaseMemory() { // Free the live intervals themselves. - for (DenseMap<unsigned, LiveInterval*>::iterator I = r2iMap_.begin(), - E = r2iMap_.end(); I != E; ++I) - delete I->second; - - r2iMap_.clear(); + for (unsigned i = 0, e = VirtRegIntervals.size(); i != e; ++i) + delete VirtRegIntervals[TargetRegisterInfo::index2VirtReg(i)]; + VirtRegIntervals.clear(); RegMaskSlots.clear(); RegMaskBits.clear(); RegMaskBlocks.clear(); + for (unsigned i = 0, e = RegUnitIntervals.size(); i != e; ++i) + delete RegUnitIntervals[i]; + RegUnitIntervals.clear(); + // Release VNInfo memory regions, VNInfo objects don't need to be dtor'd. VNInfoAllocator.Reset(); } @@ -85,20 +91,22 @@ void LiveIntervals::releaseMemory() { /// runOnMachineFunction - Register allocate the whole function /// bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) { - mf_ = &fn; - mri_ = &mf_->getRegInfo(); - tm_ = &fn.getTarget(); - tri_ = tm_->getRegisterInfo(); - tii_ = tm_->getInstrInfo(); - aa_ = &getAnalysis<AliasAnalysis>(); - lv_ = &getAnalysis<LiveVariables>(); - indexes_ = &getAnalysis<SlotIndexes>(); - allocatableRegs_ = tri_->getAllocatableSet(fn); - reservedRegs_ = tri_->getReservedRegs(fn); + MF = &fn; + MRI = &MF->getRegInfo(); + TM = &fn.getTarget(); + TRI = TM->getRegisterInfo(); + TII = TM->getInstrInfo(); + AA = &getAnalysis<AliasAnalysis>(); + LV = &getAnalysis<LiveVariables>(); + Indexes = &getAnalysis<SlotIndexes>(); + DomTree = &getAnalysis<MachineDominatorTree>(); + if (!LRCalc) + LRCalc = new LiveRangeCalc(); + AllocatableRegs = TRI->getAllocatableSet(fn); + ReservedRegs = TRI->getReservedRegs(fn); computeIntervals(); - - numIntervals += getNumIntervals(); + computeLiveInRegUnits(); DEBUG(dump()); return true; @@ -108,27 +116,24 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) { void LiveIntervals::print(raw_ostream &OS, const Module* ) const { OS << "********** INTERVALS **********\n"; - // Dump the physregs. - for (unsigned Reg = 1, RegE = tri_->getNumRegs(); Reg != RegE; ++Reg) - if (const LiveInterval *LI = r2iMap_.lookup(Reg)) { - LI->print(OS, tri_); - OS << '\n'; - } + // Dump the regunits. + for (unsigned i = 0, e = RegUnitIntervals.size(); i != e; ++i) + if (LiveInterval *LI = RegUnitIntervals[i]) + OS << PrintRegUnit(i, TRI) << " = " << *LI << '\n'; // Dump the virtregs. 
- for (unsigned Reg = 0, RegE = mri_->getNumVirtRegs(); Reg != RegE; ++Reg) - if (const LiveInterval *LI = - r2iMap_.lookup(TargetRegisterInfo::index2VirtReg(Reg))) { - LI->print(OS, tri_); - OS << '\n'; - } + for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + if (hasInterval(Reg)) + OS << PrintReg(Reg) << " = " << getInterval(Reg) << '\n'; + } printInstrs(OS); } void LiveIntervals::printInstrs(raw_ostream &OS) const { OS << "********** MACHINEINSTRS **********\n"; - mf_->print(OS, indexes_); + MF->print(OS, Indexes); } void LiveIntervals::dumpInstrs() const { @@ -176,13 +181,13 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, MachineOperand& MO, unsigned MOIdx, LiveInterval &interval) { - DEBUG(dbgs() << "\t\tregister: " << PrintReg(interval.reg, tri_)); + DEBUG(dbgs() << "\t\tregister: " << PrintReg(interval.reg, TRI)); // Virtual registers may be defined multiple times (due to phi // elimination and 2-addr elimination). Much of what we do only has to be // done once for the vreg. We use an empty interval to detect the first // time we see a vreg. - LiveVariables::VarInfo& vi = lv_->getVarInfo(interval.reg); + LiveVariables::VarInfo& vi = LV->getVarInfo(interval.reg); if (interval.empty()) { // Get the Idx of the defining instructions. SlotIndex defIndex = MIIdx.getRegSlot(MO.isEarlyClobber()); @@ -226,11 +231,11 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, DEBUG(dbgs() << " +" << NewLR); interval.addRange(NewLR); - bool PHIJoin = lv_->isPHIJoin(interval.reg); + bool PHIJoin = LV->isPHIJoin(interval.reg); if (PHIJoin) { - // A phi join register is killed at the end of the MBB and revived as a new - // valno in the killing blocks. + // A phi join register is killed at the end of the MBB and revived as a + // new valno in the killing blocks. assert(vi.AliveBlocks.empty() && "Phi join can't pass through blocks"); DEBUG(dbgs() << " phi-join"); ValNo->setHasPHIKill(true); @@ -240,8 +245,9 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, // live interval. for (SparseBitVector<>::iterator I = vi.AliveBlocks.begin(), E = vi.AliveBlocks.end(); I != E; ++I) { - MachineBasicBlock *aliveBlock = mf_->getBlockNumbered(*I); - LiveRange LR(getMBBStartIdx(aliveBlock), getMBBEndIdx(aliveBlock), ValNo); + MachineBasicBlock *aliveBlock = MF->getBlockNumbered(*I); + LiveRange LR(getMBBStartIdx(aliveBlock), getMBBEndIdx(aliveBlock), + ValNo); interval.addRange(LR); DEBUG(dbgs() << " +" << LR); } @@ -319,11 +325,8 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, interval.addRange(LiveRange(RedefIndex, RedefIndex.getDeadSlot(), OldValNo)); - DEBUG({ - dbgs() << " RESULT: "; - interval.print(dbgs(), tri_); - }); - } else if (lv_->isPHIJoin(interval.reg)) { + DEBUG(dbgs() << " RESULT: " << interval); + } else if (LV->isPHIJoin(interval.reg)) { // In the case of PHI elimination, each variable definition is only // live until the end of the block. We've already taken care of the // rest of the live range. 
@@ -347,101 +350,6 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, DEBUG(dbgs() << '\n'); } -static bool isRegLiveIntoSuccessor(const MachineBasicBlock *MBB, unsigned Reg) { - for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(), - SE = MBB->succ_end(); - SI != SE; ++SI) { - const MachineBasicBlock* succ = *SI; - if (succ->isLiveIn(Reg)) - return true; - } - return false; -} - -void LiveIntervals::handlePhysicalRegisterDef(MachineBasicBlock *MBB, - MachineBasicBlock::iterator mi, - SlotIndex MIIdx, - MachineOperand& MO, - LiveInterval &interval) { - DEBUG(dbgs() << "\t\tregister: " << PrintReg(interval.reg, tri_)); - - SlotIndex baseIndex = MIIdx; - SlotIndex start = baseIndex.getRegSlot(MO.isEarlyClobber()); - SlotIndex end = start; - - // If it is not used after definition, it is considered dead at - // the instruction defining it. Hence its interval is: - // [defSlot(def), defSlot(def)+1) - // For earlyclobbers, the defSlot was pushed back one; the extra - // advance below compensates. - if (MO.isDead()) { - DEBUG(dbgs() << " dead"); - end = start.getDeadSlot(); - goto exit; - } - - // If it is not dead on definition, it must be killed by a - // subsequent instruction. Hence its interval is: - // [defSlot(def), useSlot(kill)+1) - baseIndex = baseIndex.getNextIndex(); - while (++mi != MBB->end()) { - - if (mi->isDebugValue()) - continue; - if (getInstructionFromIndex(baseIndex) == 0) - baseIndex = indexes_->getNextNonNullIndex(baseIndex); - - if (mi->killsRegister(interval.reg, tri_)) { - DEBUG(dbgs() << " killed"); - end = baseIndex.getRegSlot(); - goto exit; - } else { - int DefIdx = mi->findRegisterDefOperandIdx(interval.reg,false,false,tri_); - if (DefIdx != -1) { - if (mi->isRegTiedToUseOperand(DefIdx)) { - // Two-address instruction. - end = baseIndex.getRegSlot(mi->getOperand(DefIdx).isEarlyClobber()); - } else { - // Another instruction redefines the register before it is ever read. - // Then the register is essentially dead at the instruction that - // defines it. Hence its interval is: - // [defSlot(def), defSlot(def)+1) - DEBUG(dbgs() << " dead"); - end = start.getDeadSlot(); - } - goto exit; - } - } - - baseIndex = baseIndex.getNextIndex(); - } - - // If we get here the register *should* be live out. - assert(!isAllocatable(interval.reg) && "Physregs shouldn't be live out!"); - - // FIXME: We need saner rules for reserved regs. - if (isReserved(interval.reg)) { - end = start.getDeadSlot(); - } else { - // Unreserved, unallocable registers like EFLAGS can be live across basic - // block boundaries. - assert(isRegLiveIntoSuccessor(MBB, interval.reg) && - "Unreserved reg not live-out?"); - end = getMBBEndIdx(MBB); - } -exit: - assert(start < end && "did not find end of interval?"); - - // Already exists? Extend old live interval. 
- VNInfo *ValNo = interval.getVNInfoAt(start); - bool Extend = ValNo != 0; - if (!Extend) - ValNo = interval.getNextValue(start, VNInfoAllocator); - LiveRange LR(start, end, ValNo); - interval.addRange(LR); - DEBUG(dbgs() << " +" << LR << '\n'); -} - void LiveIntervals::handleRegisterDef(MachineBasicBlock *MBB, MachineBasicBlock::iterator MI, SlotIndex MIIdx, @@ -450,93 +358,6 @@ void LiveIntervals::handleRegisterDef(MachineBasicBlock *MBB, if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) handleVirtualRegisterDef(MBB, MI, MIIdx, MO, MOIdx, getOrCreateInterval(MO.getReg())); - else - handlePhysicalRegisterDef(MBB, MI, MIIdx, MO, - getOrCreateInterval(MO.getReg())); -} - -void LiveIntervals::handleLiveInRegister(MachineBasicBlock *MBB, - SlotIndex MIIdx, - LiveInterval &interval) { - assert(TargetRegisterInfo::isPhysicalRegister(interval.reg) && - "Only physical registers can be live in."); - assert((!isAllocatable(interval.reg) || MBB->getParent()->begin() || - MBB->isLandingPad()) && - "Allocatable live-ins only valid for entry blocks and landing pads."); - - DEBUG(dbgs() << "\t\tlivein register: " << PrintReg(interval.reg, tri_)); - - // Look for kills, if it reaches a def before it's killed, then it shouldn't - // be considered a livein. - MachineBasicBlock::iterator mi = MBB->begin(); - MachineBasicBlock::iterator E = MBB->end(); - // Skip over DBG_VALUE at the start of the MBB. - if (mi != E && mi->isDebugValue()) { - while (++mi != E && mi->isDebugValue()) - ; - if (mi == E) - // MBB is empty except for DBG_VALUE's. - return; - } - - SlotIndex baseIndex = MIIdx; - SlotIndex start = baseIndex; - if (getInstructionFromIndex(baseIndex) == 0) - baseIndex = indexes_->getNextNonNullIndex(baseIndex); - - SlotIndex end = baseIndex; - bool SeenDefUse = false; - - while (mi != E) { - if (mi->killsRegister(interval.reg, tri_)) { - DEBUG(dbgs() << " killed"); - end = baseIndex.getRegSlot(); - SeenDefUse = true; - break; - } else if (mi->modifiesRegister(interval.reg, tri_)) { - // Another instruction redefines the register before it is ever read. - // Then the register is essentially dead at the instruction that defines - // it. Hence its interval is: - // [defSlot(def), defSlot(def)+1) - DEBUG(dbgs() << " dead"); - end = start.getDeadSlot(); - SeenDefUse = true; - break; - } - - while (++mi != E && mi->isDebugValue()) - // Skip over DBG_VALUE. - ; - if (mi != E) - baseIndex = indexes_->getNextNonNullIndex(baseIndex); - } - - // Live-in register might not be used at all. - if (!SeenDefUse) { - if (isAllocatable(interval.reg) || - !isRegLiveIntoSuccessor(MBB, interval.reg)) { - // Allocatable registers are never live through. - // Non-allocatable registers that aren't live into any successors also - // aren't live through. - DEBUG(dbgs() << " dead"); - return; - } else { - // If we get here the register is non-allocatable and live into some - // successor. We'll conservatively assume it's live-through. 
- DEBUG(dbgs() << " live through"); - end = getMBBEndIdx(MBB); - } - } - - SlotIndex defIdx = getMBBStartIdx(MBB); - assert(getInstructionFromIndex(defIdx) == 0 && - "PHI def index points at actual instruction."); - VNInfo *vni = interval.getNextValue(defIdx, VNInfoAllocator); - vni->setIsPHIDef(true); - LiveRange LR(start, end, vni); - - interval.addRange(LR); - DEBUG(dbgs() << " +" << LR << '\n'); } /// computeIntervals - computes the live intervals for virtual @@ -546,12 +367,12 @@ void LiveIntervals::handleLiveInRegister(MachineBasicBlock *MBB, void LiveIntervals::computeIntervals() { DEBUG(dbgs() << "********** COMPUTING LIVE INTERVALS **********\n" << "********** Function: " - << ((Value*)mf_->getFunction())->getName() << '\n'); + << ((Value*)MF->getFunction())->getName() << '\n'); - RegMaskBlocks.resize(mf_->getNumBlockIDs()); + RegMaskBlocks.resize(MF->getNumBlockIDs()); SmallVector<unsigned, 8> UndefUses; - for (MachineFunction::iterator MBBI = mf_->begin(), E = mf_->end(); + for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); MBBI != E; ++MBBI) { MachineBasicBlock *MBB = MBBI; RegMaskBlocks[MBB->getNumber()].first = RegMaskSlots.size(); @@ -564,22 +385,16 @@ void LiveIntervals::computeIntervals() { DEBUG(dbgs() << "BB#" << MBB->getNumber() << ":\t\t# derived from " << MBB->getName() << "\n"); - // Create intervals for live-ins to this BB first. - for (MachineBasicBlock::livein_iterator LI = MBB->livein_begin(), - LE = MBB->livein_end(); LI != LE; ++LI) { - handleLiveInRegister(MBB, MIIndex, getOrCreateInterval(*LI)); - } - // Skip over empty initial indices. if (getInstructionFromIndex(MIIndex) == 0) - MIIndex = indexes_->getNextNonNullIndex(MIIndex); + MIIndex = Indexes->getNextNonNullIndex(MIIndex); for (MachineBasicBlock::iterator MI = MBB->begin(), miEnd = MBB->end(); MI != miEnd; ++MI) { DEBUG(dbgs() << MIIndex << "\t" << *MI); if (MI->isDebugValue()) continue; - assert(indexes_->getInstructionFromIndex(MIIndex) == MI && + assert(Indexes->getInstructionFromIndex(MIIndex) == MI && "Lost SlotIndex synchronization"); // Handle defs. @@ -593,7 +408,7 @@ void LiveIntervals::computeIntervals() { continue; } - if (!MO.isReg() || !MO.getReg()) + if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) continue; // handle register defs - build intervals @@ -604,7 +419,7 @@ void LiveIntervals::computeIntervals() { } // Move to the next instr slot. - MIIndex = indexes_->getNextNonNullIndex(MIIndex); + MIIndex = Indexes->getNextNonNullIndex(MIIndex); } // Compute the number of register mask instructions in this block. @@ -626,14 +441,104 @@ LiveInterval* LiveIntervals::createInterval(unsigned reg) { return new LiveInterval(reg, Weight); } -/// dupInterval - Duplicate a live interval. The caller is responsible for -/// managing the allocated memory. -LiveInterval* LiveIntervals::dupInterval(LiveInterval *li) { - LiveInterval *NewLI = createInterval(li->reg); - NewLI->Copy(*li, mri_, getVNInfoAllocator()); - return NewLI; + +//===----------------------------------------------------------------------===// +// Register Unit Liveness +//===----------------------------------------------------------------------===// +// +// Fixed interference typically comes from ABI boundaries: Function arguments +// and return values are passed in fixed registers, and so are exception +// pointers entering landing pads. Certain instructions require values to be +// present in specific registers. That is also represented through fixed +// interference. 
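+//
+// As an illustrative sketch (not tied to any particular target): a call that
+// returns its result in a fixed physical register gives that register's units
+// a live range at the call site, so the allocator must keep any virtual
+// register whose live range overlaps the call out of registers sharing those
+// units.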
+// + +/// computeRegUnitInterval - Compute the live interval of a register unit, based +/// on the uses and defs of aliasing registers. The interval should be empty, +/// or contain only dead phi-defs from ABI blocks. +void LiveIntervals::computeRegUnitInterval(LiveInterval *LI) { + unsigned Unit = LI->reg; + + assert(LRCalc && "LRCalc not initialized."); + LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator()); + + // The physregs aliasing Unit are the roots and their super-registers. + // Create all values as dead defs before extending to uses. Note that roots + // may share super-registers. That's OK because createDeadDefs() is + // idempotent. It is very rare for a register unit to have multiple roots, so + // uniquing super-registers is probably not worthwhile. + for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) { + unsigned Root = *Roots; + if (!MRI->reg_empty(Root)) + LRCalc->createDeadDefs(LI, Root); + for (MCSuperRegIterator Supers(Root, TRI); Supers.isValid(); ++Supers) { + if (!MRI->reg_empty(*Supers)) + LRCalc->createDeadDefs(LI, *Supers); + } + } + + // Now extend LI to reach all uses. + // Ignore uses of reserved registers. We only track defs of those. + for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) { + unsigned Root = *Roots; + if (!isReserved(Root) && !MRI->reg_empty(Root)) + LRCalc->extendToUses(LI, Root); + for (MCSuperRegIterator Supers(Root, TRI); Supers.isValid(); ++Supers) { + unsigned Reg = *Supers; + if (!isReserved(Reg) && !MRI->reg_empty(Reg)) + LRCalc->extendToUses(LI, Reg); + } + } +} + + +/// computeLiveInRegUnits - Precompute the live ranges of any register units +/// that are live-in to an ABI block somewhere. Register values can appear +/// without a corresponding def when entering the entry block or a landing pad. +/// +void LiveIntervals::computeLiveInRegUnits() { + RegUnitIntervals.resize(TRI->getNumRegUnits()); + DEBUG(dbgs() << "Computing live-in reg-units in ABI blocks.\n"); + + // Keep track of the intervals allocated. + SmallVector<LiveInterval*, 8> NewIntvs; + + // Check all basic blocks for live-ins. + for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end(); + MFI != MFE; ++MFI) { + const MachineBasicBlock *MBB = MFI; + + // We only care about ABI blocks: Entry + landing pads. + if ((MFI != MF->begin() && !MBB->isLandingPad()) || MBB->livein_empty()) + continue; + + // Create phi-defs at Begin for all live-in registers. + SlotIndex Begin = Indexes->getMBBStartIdx(MBB); + DEBUG(dbgs() << Begin << "\tBB#" << MBB->getNumber()); + for (MachineBasicBlock::livein_iterator LII = MBB->livein_begin(), + LIE = MBB->livein_end(); LII != LIE; ++LII) { + for (MCRegUnitIterator Units(*LII, TRI); Units.isValid(); ++Units) { + unsigned Unit = *Units; + LiveInterval *Intv = RegUnitIntervals[Unit]; + if (!Intv) { + Intv = RegUnitIntervals[Unit] = new LiveInterval(Unit, HUGE_VALF); + NewIntvs.push_back(Intv); + } + VNInfo *VNI = Intv->createDeadDef(Begin, getVNInfoAllocator()); + (void)VNI; + DEBUG(dbgs() << ' ' << PrintRegUnit(Unit, TRI) << '#' << VNI->id); + } + } + DEBUG(dbgs() << '\n'); + } + DEBUG(dbgs() << "Created " << NewIntvs.size() << " new intervals.\n"); + + // Compute the 'normal' part of the intervals. + for (unsigned i = 0, e = NewIntvs.size(); i != e; ++i) + computeRegUnitInterval(NewIntvs[i]); } + /// shrinkToUses - After removing some uses of a register, shrink its live /// range to just the remaining uses. 
/// range to just the remaining uses. This method does not compute reaching
/// defs for new uses, and it doesn't remove dead defs.
@@ -649,14 +554,13 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
   SmallPtrSet<MachineBasicBlock*, 16> LiveOut;
 
   // Visit all instructions reading li->reg.
-  for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(li->reg);
+  for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(li->reg);
        MachineInstr *UseMI = I.skipInstruction();) {
     if (UseMI->isDebugValue() || !UseMI->readsVirtualRegister(li->reg))
       continue;
     SlotIndex Idx = getInstructionIndex(UseMI).getRegSlot();
-    // Note: This intentionally picks up the wrong VNI in case of an EC redef.
-    // See below.
-    VNInfo *VNI = li->getVNInfoBefore(Idx);
+    LiveRangeQuery LRQ(*li, Idx);
+    VNInfo *VNI = LRQ.valueIn();
     if (!VNI) {
       // This shouldn't happen: readsVirtualRegister returns true, but there is
       // no live value. It is likely caused by a target getting <undef> flags
@@ -667,13 +571,10 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
       continue;
     }
     // Special case: An early-clobber tied operand reads and writes the
-    // register one slot early. The getVNInfoBefore call above would have
-    // picked up the value defined by UseMI. Adjust the kill slot and value.
-    if (SlotIndex::isSameInstr(VNI->def, Idx)) {
-      Idx = VNI->def;
-      VNI = li->getVNInfoBefore(Idx);
-      assert(VNI && "Early-clobber tied value not available");
-    }
+    // register one slot early.
+    if (VNInfo *DefVNI = LRQ.valueDefined())
+      Idx = DefVNI->def;
+
     WorkList.push_back(std::make_pair(Idx, VNI));
   }
 
@@ -755,7 +656,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
     // This is a dead def. Make sure the instruction knows.
     MachineInstr *MI = getInstructionFromIndex(VNI->def);
     assert(MI && "No instruction defining live value");
-    MI->addRegisterDead(li->reg, tri_);
+    MI->addRegisterDead(li->reg, TRI);
     if (dead && MI->allDefsAreDead()) {
       DEBUG(dbgs() << "All defs dead: " << VNI->def << '\t' << *MI);
       dead->push_back(MI);
@@ -775,13 +676,11 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
 //
 void LiveIntervals::addKillFlags() {
-  for (iterator I = begin(), E = end(); I != E; ++I) {
-    unsigned Reg = I->first;
-    if (TargetRegisterInfo::isPhysicalRegister(Reg))
-      continue;
-    if (mri_->reg_nodbg_empty(Reg))
+  for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    if (MRI->reg_nodbg_empty(Reg))
       continue;
-    LiveInterval *LI = I->second;
+    LiveInterval *LI = &getInterval(Reg);
 
     // Every instruction that kills Reg corresponds to a live range end point.
     for (LiveInterval::iterator RI = LI->begin(), RE = LI->end(); RI != RE;
@@ -797,101 +696,6 @@ void LiveIntervals::addKillFlags() {
   }
 }
 
-/// getReMatImplicitUse - If the remat definition MI has one (for now, we only
-/// allow one) virtual register operand, then its uses are implicitly using
-/// the register. Returns the virtual register.
-unsigned LiveIntervals::getReMatImplicitUse(const LiveInterval &li,
-                                            MachineInstr *MI) const {
-  unsigned RegOp = 0;
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    MachineOperand &MO = MI->getOperand(i);
-    if (!MO.isReg() || !MO.isUse())
-      continue;
-    unsigned Reg = MO.getReg();
-    if (Reg == 0 || Reg == li.reg)
-      continue;
-
-    if (TargetRegisterInfo::isPhysicalRegister(Reg) && !isAllocatable(Reg))
-      continue;
-    RegOp = MO.getReg();
-    break; // Found vreg operand - leave the loop.
- } - return RegOp; -} - -/// isValNoAvailableAt - Return true if the val# of the specified interval -/// which reaches the given instruction also reaches the specified use index. -bool LiveIntervals::isValNoAvailableAt(const LiveInterval &li, MachineInstr *MI, - SlotIndex UseIdx) const { - VNInfo *UValNo = li.getVNInfoAt(UseIdx); - return UValNo && UValNo == li.getVNInfoAt(getInstructionIndex(MI)); -} - -/// isReMaterializable - Returns true if the definition MI of the specified -/// val# of the specified interval is re-materializable. -bool -LiveIntervals::isReMaterializable(const LiveInterval &li, - const VNInfo *ValNo, MachineInstr *MI, - const SmallVectorImpl<LiveInterval*> *SpillIs, - bool &isLoad) { - if (DisableReMat) - return false; - - if (!tii_->isTriviallyReMaterializable(MI, aa_)) - return false; - - // Target-specific code can mark an instruction as being rematerializable - // if it has one virtual reg use, though it had better be something like - // a PIC base register which is likely to be live everywhere. - unsigned ImpUse = getReMatImplicitUse(li, MI); - if (ImpUse) { - const LiveInterval &ImpLi = getInterval(ImpUse); - for (MachineRegisterInfo::use_nodbg_iterator - ri = mri_->use_nodbg_begin(li.reg), re = mri_->use_nodbg_end(); - ri != re; ++ri) { - MachineInstr *UseMI = &*ri; - SlotIndex UseIdx = getInstructionIndex(UseMI); - if (li.getVNInfoAt(UseIdx) != ValNo) - continue; - if (!isValNoAvailableAt(ImpLi, MI, UseIdx)) - return false; - } - - // If a register operand of the re-materialized instruction is going to - // be spilled next, then it's not legal to re-materialize this instruction. - if (SpillIs) - for (unsigned i = 0, e = SpillIs->size(); i != e; ++i) - if (ImpUse == (*SpillIs)[i]->reg) - return false; - } - return true; -} - -/// isReMaterializable - Returns true if every definition of MI of every -/// val# of the specified interval is re-materializable. -bool -LiveIntervals::isReMaterializable(const LiveInterval &li, - const SmallVectorImpl<LiveInterval*> *SpillIs, - bool &isLoad) { - isLoad = false; - for (LiveInterval::const_vni_iterator i = li.vni_begin(), e = li.vni_end(); - i != e; ++i) { - const VNInfo *VNI = *i; - if (VNI->isUnused()) - continue; // Dead val#. - // Is the def for the val# rematerializable? - MachineInstr *ReMatDefMI = getInstructionFromIndex(VNI->def); - if (!ReMatDefMI) - return false; - bool DefIsLoad = false; - if (!ReMatDefMI || - !isReMaterializable(li, VNI, ReMatDefMI, SpillIs, DefIsLoad)) - return false; - isLoad |= DefIsLoad; - } - return true; -} - MachineBasicBlock* LiveIntervals::intervalIsInOneMBB(const LiveInterval &LI) const { // A local live range must be fully contained inside the block, meaning it is @@ -911,8 +715,8 @@ LiveIntervals::intervalIsInOneMBB(const LiveInterval &LI) const { // getMBBFromIndex doesn't need to search the MBB table when both indexes // belong to proper instructions. - MachineBasicBlock *MBB1 = indexes_->getMBBFromIndex(Start); - MachineBasicBlock *MBB2 = indexes_->getMBBFromIndex(Stop); + MachineBasicBlock *MBB1 = Indexes->getMBBFromIndex(Start); + MachineBasicBlock *MBB2 = Indexes->getMBBFromIndex(Stop); return MBB1 == MBB2 ? MBB1 : NULL; } @@ -990,7 +794,7 @@ bool LiveIntervals::checkRegMaskInterference(LiveInterval &LI, if (!Found) { // This is the first overlap. Initialize UsableRegs to all ones. UsableRegs.clear(); - UsableRegs.resize(tri_->getNumRegs(), true); + UsableRegs.resize(TRI->getNumRegs(), true); Found = true; } // Remove usable registers clobbered by this mask. 
@@ -1101,6 +905,9 @@ public: BundleRanges BR = createBundleRanges(Entering, Internal, Exiting); + Entering.clear(); + Internal.clear(); + Exiting.clear(); collectRanges(MI, Entering, Internal, Exiting, hasRegMaskOp, OldIdx); assert(!hasRegMaskOp && "Can't have RegMask operand in bundle."); @@ -1176,78 +983,44 @@ private: // TODO: Currently we're skipping uses that are reserved or have no // interval, but we're not updating their kills. This should be // fixed. - if (!LIS.hasInterval(Reg) || - (TargetRegisterInfo::isPhysicalRegister(Reg) && LIS.isReserved(Reg))) + if (TargetRegisterInfo::isPhysicalRegister(Reg) && LIS.isReserved(Reg)) continue; - LiveInterval* LI = &LIS.getInterval(Reg); - - if (MO.readsReg()) { - LiveRange* LR = LI->getLiveRangeContaining(OldIdx); - if (LR != 0) - Entering.insert(std::make_pair(LI, LR)); - } - if (MO.isDef()) { - if (MO.isEarlyClobber()) { - LiveRange* LR = LI->getLiveRangeContaining(OldIdx.getRegSlot(true)); - assert(LR != 0 && "No EC range?"); - if (LR->end > OldIdx.getDeadSlot()) - Exiting.insert(std::make_pair(LI, LR)); - else - Internal.insert(std::make_pair(LI, LR)); - } else if (MO.isDead()) { - LiveRange* LR = LI->getLiveRangeContaining(OldIdx.getRegSlot()); - assert(LR != 0 && "No dead-def range?"); - Internal.insert(std::make_pair(LI, LR)); - } else { - LiveRange* LR = LI->getLiveRangeContaining(OldIdx.getDeadSlot()); - assert(LR && LR->end > OldIdx.getDeadSlot() && - "Non-dead-def should have live range exiting."); - Exiting.insert(std::make_pair(LI, LR)); - } + // Collect ranges for register units. These live ranges are computed on + // demand, so just skip any that haven't been computed yet. + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units) + if (LiveInterval *LI = LIS.getCachedRegUnit(*Units)) + collectRanges(MO, LI, Entering, Internal, Exiting, OldIdx); + } else { + // Collect ranges for individual virtual registers. + collectRanges(MO, &LIS.getInterval(Reg), + Entering, Internal, Exiting, OldIdx); } } } - // Collect IntRangePairs for all operands of MI that may need fixing. - void collectRangesInBundle(MachineInstr* MI, RangeSet& Entering, - RangeSet& Exiting, SlotIndex MIStartIdx, - SlotIndex MIEndIdx) { - for (MachineInstr::mop_iterator MOI = MI->operands_begin(), - MOE = MI->operands_end(); - MOI != MOE; ++MOI) { - const MachineOperand& MO = *MOI; - assert(!MO.isRegMask() && "Can't have RegMasks in bundles."); - if (!MO.isReg() || MO.getReg() == 0) - continue; - - unsigned Reg = MO.getReg(); - - // TODO: Currently we're skipping uses that are reserved or have no - // interval, but we're not updating their kills. This should be - // fixed. 
- if (!LIS.hasInterval(Reg) || - (TargetRegisterInfo::isPhysicalRegister(Reg) && LIS.isReserved(Reg))) - continue; - - LiveInterval* LI = &LIS.getInterval(Reg); - - if (MO.readsReg()) { - LiveRange* LR = LI->getLiveRangeContaining(MIStartIdx); - if (LR != 0) - Entering.insert(std::make_pair(LI, LR)); - } - if (MO.isDef()) { - assert(!MO.isEarlyClobber() && "Early clobbers not allowed in bundles."); - assert(!MO.isDead() && "Dead-defs not allowed in bundles."); - LiveRange* LR = LI->getLiveRangeContaining(MIEndIdx.getDeadSlot()); - assert(LR != 0 && "Internal ranges not allowed in bundles."); + void collectRanges(const MachineOperand &MO, LiveInterval *LI, + RangeSet &Entering, RangeSet &Internal, RangeSet &Exiting, + SlotIndex OldIdx) { + if (MO.readsReg()) { + LiveRange* LR = LI->getLiveRangeContaining(OldIdx); + if (LR != 0) + Entering.insert(std::make_pair(LI, LR)); + } + if (MO.isDef()) { + LiveRange* LR = LI->getLiveRangeContaining(OldIdx.getRegSlot()); + assert(LR != 0 && "No live range for def?"); + if (LR->end > OldIdx.getDeadSlot()) Exiting.insert(std::make_pair(LI, LR)); - } + else + Internal.insert(std::make_pair(LI, LR)); } } - BundleRanges createBundleRanges(RangeSet& Entering, RangeSet& Internal, RangeSet& Exiting) { + BundleRanges createBundleRanges(RangeSet& Entering, + RangeSet& Internal, + RangeSet& Exiting) { BundleRanges BR; for (RangeSet::iterator EI = Entering.begin(), EE = Entering.end(); @@ -1284,7 +1057,8 @@ private: return; // Bail out if we don't have kill flags on the old register. MachineInstr* NewKillMI = LIS.getInstructionFromIndex(newKillIdx); assert(OldKillMI->killsRegister(reg) && "Old 'kill' instr isn't a kill."); - assert(!NewKillMI->killsRegister(reg) && "New kill instr is already a kill."); + assert(!NewKillMI->killsRegister(reg) && + "New kill instr is already a kill."); OldKillMI->clearRegisterKills(reg, &TRI); NewKillMI->addRegisterKilled(reg, &TRI); } @@ -1523,22 +1297,23 @@ private: }; void LiveIntervals::handleMove(MachineInstr* MI) { - SlotIndex OldIndex = indexes_->getInstructionIndex(MI); - indexes_->removeMachineInstrFromMaps(MI); + SlotIndex OldIndex = Indexes->getInstructionIndex(MI); + Indexes->removeMachineInstrFromMaps(MI); SlotIndex NewIndex = MI->isInsideBundle() ? 
- indexes_->getInstructionIndex(MI) : - indexes_->insertMachineInstrInMaps(MI); + Indexes->getInstructionIndex(MI) : + Indexes->insertMachineInstrInMaps(MI); assert(getMBBStartIdx(MI->getParent()) <= OldIndex && OldIndex < getMBBEndIdx(MI->getParent()) && "Cannot handle moves across basic block boundaries."); assert(!MI->isBundled() && "Can't handle bundled instructions yet."); - HMEditor HME(*this, *mri_, *tri_, NewIndex); + HMEditor HME(*this, *MRI, *TRI, NewIndex); HME.moveAllRangesFrom(MI, OldIndex); } -void LiveIntervals::handleMoveIntoBundle(MachineInstr* MI, MachineInstr* BundleStart) { - SlotIndex NewIndex = indexes_->getInstructionIndex(BundleStart); - HMEditor HME(*this, *mri_, *tri_, NewIndex); +void LiveIntervals::handleMoveIntoBundle(MachineInstr* MI, + MachineInstr* BundleStart) { + SlotIndex NewIndex = Indexes->getInstructionIndex(BundleStart); + HMEditor HME(*this, *MRI, *TRI, NewIndex); HME.moveAllRangesInto(MI, BundleStart); } diff --git a/lib/CodeGen/LiveIntervalUnion.cpp b/lib/CodeGen/LiveIntervalUnion.cpp index 60a6880..dadd02b 100644 --- a/lib/CodeGen/LiveIntervalUnion.cpp +++ b/lib/CodeGen/LiveIntervalUnion.cpp @@ -81,7 +81,6 @@ void LiveIntervalUnion::extract(LiveInterval &VirtReg) { void LiveIntervalUnion::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const { - OS << "LIU " << PrintReg(RepReg, TRI); if (empty()) { OS << " empty\n"; return; @@ -209,3 +208,26 @@ bool LiveIntervalUnion::Query::checkLoopInterference(MachineLoopRange *Loop) { VRI = VirtReg->advanceTo(VRI, Overlaps.start()); } } + +void LiveIntervalUnion::Array::init(LiveIntervalUnion::Allocator &Alloc, + unsigned NSize) { + // Reuse existing allocation. + if (NSize == Size) + return; + clear(); + Size = NSize; + LIUs = static_cast<LiveIntervalUnion*>( + malloc(sizeof(LiveIntervalUnion)*NSize)); + for (unsigned i = 0; i != Size; ++i) + new(LIUs + i) LiveIntervalUnion(Alloc); +} + +void LiveIntervalUnion::Array::clear() { + if (!LIUs) + return; + for (unsigned i = 0; i != Size; ++i) + LIUs[i].~LiveIntervalUnion(); + free(LIUs); + Size = 0; + LIUs = 0; +} diff --git a/lib/CodeGen/LiveIntervalUnion.h b/lib/CodeGen/LiveIntervalUnion.h index dbf5ac1..cd4e690 100644 --- a/lib/CodeGen/LiveIntervalUnion.h +++ b/lib/CodeGen/LiveIntervalUnion.h @@ -60,13 +60,11 @@ public: class Query; private: - const unsigned RepReg; // representative register number unsigned Tag; // unique tag for current contents. LiveSegments Segments; // union of virtual reg segments public: - LiveIntervalUnion(unsigned r, Allocator &a) : RepReg(r), Tag(0), Segments(a) - {} + explicit LiveIntervalUnion(Allocator &a) : Tag(0), Segments(a) {} // Iterate over all segments in the union of live virtual registers ordered // by their starting position. @@ -183,6 +181,28 @@ public: Query(const Query&); // DO NOT IMPLEMENT void operator=(const Query&); // DO NOT IMPLEMENT }; + + // Array of LiveIntervalUnions. + class Array { + unsigned Size; + LiveIntervalUnion *LIUs; + public: + Array() : Size(0), LIUs(0) {} + ~Array() { clear(); } + + // Initialize the array to have Size entries. + // Reuse an existing allocation if the size matches. 
+ void init(LiveIntervalUnion::Allocator&, unsigned Size); + + unsigned size() const { return Size; } + + void clear(); + + LiveIntervalUnion& operator[](unsigned idx) { + assert(idx < Size && "idx out of bounds"); + return LIUs[idx]; + } + }; }; } // end namespace llvm diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp index d8ab791..9384075 100644 --- a/lib/CodeGen/LiveRangeCalc.cpp +++ b/lib/CodeGen/LiveRangeCalc.cpp @@ -14,10 +14,19 @@ #define DEBUG_TYPE "regalloc" #include "LiveRangeCalc.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" using namespace llvm; -void LiveRangeCalc::reset(const MachineFunction *MF) { +void LiveRangeCalc::reset(const MachineFunction *MF, + SlotIndexes *SI, + MachineDominatorTree *MDT, + VNInfo::Allocator *VNIA) { + MRI = &MF->getRegInfo(); + Indexes = SI; + DomTree = MDT; + Alloc = VNIA; + unsigned N = MF->getNumBlockIDs(); Seen.clear(); Seen.resize(N); @@ -26,8 +35,73 @@ void LiveRangeCalc::reset(const MachineFunction *MF) { } +void LiveRangeCalc::createDeadDefs(LiveInterval *LI, unsigned Reg) { + assert(MRI && Indexes && "call reset() first"); + + // Visit all def operands. If the same instruction has multiple defs of Reg, + // LI->createDeadDef() will deduplicate. + for (MachineRegisterInfo::def_iterator + I = MRI->def_begin(Reg), E = MRI->def_end(); I != E; ++I) { + const MachineInstr *MI = &*I; + // Find the corresponding slot index. + SlotIndex Idx; + if (MI->isPHI()) + // PHI defs begin at the basic block start index. + Idx = Indexes->getMBBStartIdx(MI->getParent()); + else + // Instructions are either normal 'r', or early clobber 'e'. + Idx = Indexes->getInstructionIndex(MI) + .getRegSlot(I.getOperand().isEarlyClobber()); + + // Create the def in LI. This may find an existing def. + VNInfo *VNI = LI->createDeadDef(Idx, *Alloc); + VNI->setIsPHIDef(MI->isPHI()); + } +} + + +void LiveRangeCalc::extendToUses(LiveInterval *LI, unsigned Reg) { + assert(MRI && Indexes && "call reset() first"); + + // Visit all operands that read Reg. This may include partial defs. + for (MachineRegisterInfo::reg_nodbg_iterator I = MRI->reg_nodbg_begin(Reg), + E = MRI->reg_nodbg_end(); I != E; ++I) { + const MachineOperand &MO = I.getOperand(); + if (!MO.readsReg()) + continue; + // MI is reading Reg. We may have visited MI before if it happens to be + // reading Reg multiple times. That is OK, extend() is idempotent. + const MachineInstr *MI = &*I; + + // Find the SlotIndex being read. + SlotIndex Idx; + if (MI->isPHI()) { + assert(!MO.isDef() && "Cannot handle PHI def of partial register."); + // PHI operands are paired: (Reg, PredMBB). + // Extend the live range to be live-out from PredMBB. + Idx = Indexes->getMBBEndIdx(MI->getOperand(I.getOperandNo()+1).getMBB()); + } else { + // This is a normal instruction. + Idx = Indexes->getInstructionIndex(MI).getRegSlot(); + // Check for early-clobber redefs. + unsigned DefIdx; + if (MO.isDef()) { + if (MO.isEarlyClobber()) + Idx = Idx.getRegSlot(true); + } else if (MI->isRegTiedToDefOperand(I.getOperandNo(), &DefIdx)) { + // FIXME: This would be a lot easier if tied early-clobber uses also + // had an early-clobber flag. + if (MI->getOperand(DefIdx).isEarlyClobber()) + Idx = Idx.getRegSlot(true); + } + } + extend(LI, Idx, Reg); + } +} + + // Transfer information from the LiveIn vector to the live ranges. 
-void LiveRangeCalc::updateLiveIns(VNInfo *OverrideVNI, SlotIndexes *Indexes) { +void LiveRangeCalc::updateLiveIns(VNInfo *OverrideVNI) { for (SmallVectorImpl<LiveInBlock>::iterator I = LiveIn.begin(), E = LiveIn.end(); I != E; ++I) { if (!I->DomNode) @@ -56,9 +130,7 @@ void LiveRangeCalc::updateLiveIns(VNInfo *OverrideVNI, SlotIndexes *Indexes) { void LiveRangeCalc::extend(LiveInterval *LI, SlotIndex Kill, - SlotIndexes *Indexes, - MachineDominatorTree *DomTree, - VNInfo::Allocator *Alloc) { + unsigned PhysReg) { assert(LI && "Missing live range"); assert(Kill.isValid() && "Invalid SlotIndex"); assert(Indexes && "Missing SlotIndexes"); @@ -75,34 +147,31 @@ void LiveRangeCalc::extend(LiveInterval *LI, // multiple values, and we may need to create even more phi-defs to preserve // VNInfo SSA form. Perform a search for all predecessor blocks where we // know the dominating VNInfo. - VNInfo *VNI = findReachingDefs(LI, KillMBB, Kill, Indexes, DomTree); + VNInfo *VNI = findReachingDefs(LI, KillMBB, Kill, PhysReg); // When there were multiple different values, we may need new PHIs. if (!VNI) - updateSSA(Indexes, DomTree, Alloc); + updateSSA(); - updateLiveIns(VNI, Indexes); + updateLiveIns(VNI); } // This function is called by a client after using the low-level API to add // live-out and live-in blocks. The unique value optimization is not // available, SplitEditor::transferValues handles that case directly anyway. -void LiveRangeCalc::calculateValues(SlotIndexes *Indexes, - MachineDominatorTree *DomTree, - VNInfo::Allocator *Alloc) { +void LiveRangeCalc::calculateValues() { assert(Indexes && "Missing SlotIndexes"); assert(DomTree && "Missing dominator tree"); - updateSSA(Indexes, DomTree, Alloc); - updateLiveIns(0, Indexes); + updateSSA(); + updateLiveIns(0); } VNInfo *LiveRangeCalc::findReachingDefs(LiveInterval *LI, MachineBasicBlock *KillMBB, SlotIndex Kill, - SlotIndexes *Indexes, - MachineDominatorTree *DomTree) { + unsigned PhysReg) { // Blocks where LI should be live-in. SmallVector<MachineBasicBlock*, 16> WorkList(1, KillMBB); @@ -113,7 +182,22 @@ VNInfo *LiveRangeCalc::findReachingDefs(LiveInterval *LI, // Using Seen as a visited set, perform a BFS for all reaching defs. for (unsigned i = 0; i != WorkList.size(); ++i) { MachineBasicBlock *MBB = WorkList[i]; - assert(!MBB->pred_empty() && "Value live-in to entry block?"); + +#ifndef NDEBUG + if (MBB->pred_empty()) { + MBB->getParent()->verify(); + llvm_unreachable("Use not jointly dominated by defs."); + } + + if (TargetRegisterInfo::isPhysicalRegister(PhysReg) && + !MBB->isLiveIn(PhysReg)) { + MBB->getParent()->verify(); + errs() << "The register needs to be live in to BB#" << MBB->getNumber() + << ", but is missing from the live-in list.\n"; + llvm_unreachable("Invalid global physical register"); + } +#endif + for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(), PE = MBB->pred_end(); PI != PE; ++PI) { MachineBasicBlock *Pred = *PI; @@ -168,9 +252,7 @@ VNInfo *LiveRangeCalc::findReachingDefs(LiveInterval *LI, // This is essentially the same iterative algorithm that SSAUpdater uses, // except we already have a dominator tree, so we don't have to recompute it. 
-void LiveRangeCalc::updateSSA(SlotIndexes *Indexes, - MachineDominatorTree *DomTree, - VNInfo::Allocator *Alloc) { +void LiveRangeCalc::updateSSA() { assert(Indexes && "Missing SlotIndexes"); assert(DomTree && "Missing dominator tree"); diff --git a/lib/CodeGen/LiveRangeCalc.h b/lib/CodeGen/LiveRangeCalc.h index b8c8585..909829b 100644 --- a/lib/CodeGen/LiveRangeCalc.h +++ b/lib/CodeGen/LiveRangeCalc.h @@ -34,6 +34,11 @@ template <class NodeT> class DomTreeNodeBase; typedef DomTreeNodeBase<MachineBasicBlock> MachineDomTreeNode; class LiveRangeCalc { + const MachineRegisterInfo *MRI; + SlotIndexes *Indexes; + MachineDominatorTree *DomTree; + VNInfo::Allocator *Alloc; + /// Seen - Bit vector of active entries in LiveOut, also used as a visited /// set by findReachingDefs. One entry per basic block, indexed by block /// number. This is kept as a separate bit vector because it can be cleared @@ -100,26 +105,27 @@ class LiveRangeCalc { /// to be live-in are added to LiveIn. If a unique reaching def is found, /// its value is returned, if Kill is jointly dominated by multiple values, /// NULL is returned. + /// + /// PhysReg, when set, is used to verify live-in lists on basic blocks. VNInfo *findReachingDefs(LiveInterval *LI, MachineBasicBlock *KillMBB, SlotIndex Kill, - SlotIndexes *Indexes, - MachineDominatorTree *DomTree); + unsigned PhysReg); /// updateSSA - Compute the values that will be live in to all requested /// blocks in LiveIn. Create PHI-def values as required to preserve SSA form. /// /// Every live-in block must be jointly dominated by the added live-out /// blocks. No values are read from the live ranges. - void updateSSA(SlotIndexes *Indexes, - MachineDominatorTree *DomTree, - VNInfo::Allocator *Alloc); + void updateSSA(); /// updateLiveIns - Add liveness as specified in the LiveIn vector, using VNI /// as a wildcard value for LiveIn entries without a value. - void updateLiveIns(VNInfo *VNI, SlotIndexes*); + void updateLiveIns(VNInfo *VNI); public: + LiveRangeCalc() : MRI(0), Indexes(0), DomTree(0), Alloc(0) {} + //===--------------------------------------------------------------------===// // High-level interface. //===--------------------------------------------------------------------===// @@ -132,14 +138,14 @@ public: /// that may overlap a previously computed live range, and before the first /// live range in a function. If live ranges are not known to be /// non-overlapping, call reset before each. - void reset(const MachineFunction *MF); + void reset(const MachineFunction *MF, + SlotIndexes*, + MachineDominatorTree*, + VNInfo::Allocator*); /// calculate - Calculate the live range of a virtual register from its defs /// and uses. LI must be empty with no values. - void calculate(LiveInterval *LI, - MachineRegisterInfo *MRI, - SlotIndexes *Indexes, - VNInfo::Allocator *Alloc); + void calculate(LiveInterval *LI); //===--------------------------------------------------------------------===// // Mid-level interface. @@ -154,21 +160,30 @@ public: /// Kill is not dominated by a single existing value, PHI-defs are inserted /// as required to preserve SSA form. If Kill is known to be dominated by a /// single existing value, Alloc may be null. - void extend(LiveInterval *LI, - SlotIndex Kill, - SlotIndexes *Indexes, - MachineDominatorTree *DomTree, - VNInfo::Allocator *Alloc); + /// + /// PhysReg, when set, is used to verify live-in lists on basic blocks. 
+  void extend(LiveInterval *LI, SlotIndex Kill, unsigned PhysReg = 0);
+
+  /// createDeadDefs - Create a dead def in LI for every def operand of Reg.
+  /// Each instruction defining Reg gets a new VNInfo with a corresponding
+  /// minimal live range.
+  void createDeadDefs(LiveInterval *LI, unsigned Reg);
 
-  /// extendToUses - Extend the live range of LI to reach all uses.
+  /// createDeadDefs - Create a dead def in LI for every def of LI->reg.
+  void createDeadDefs(LiveInterval *LI) {
+    createDeadDefs(LI, LI->reg);
+  }
+
+  /// extendToUses - Extend the live range of LI to reach all uses of Reg.
   ///
   /// All uses must be jointly dominated by existing liveness. PHI-defs are
   /// inserted as needed to preserve SSA form.
-  void extendToUses(LiveInterval *LI,
-                    MachineRegisterInfo *MRI,
-                    SlotIndexes *Indexes,
-                    MachineDominatorTree *DomTree,
-                    VNInfo::Allocator *Alloc);
+  void extendToUses(LiveInterval *LI, unsigned Reg);
+
+  /// extendToUses - Extend the live range of LI to reach all uses of LI->reg.
+  void extendToUses(LiveInterval *LI) {
+    extendToUses(LI, LI->reg);
+  }
 
 //===--------------------------------------------------------------------===//
 // Low-level interface.
@@ -216,9 +231,7 @@ public:
   ///
   /// Every predecessor of a live-in block must have been given a value with
   /// setLiveOutValue; the value may be null for live-through blocks.
-  void calculateValues(SlotIndexes *Indexes,
-                       MachineDominatorTree *DomTree,
-                       VNInfo::Allocator *Alloc);
+  void calculateValues();
 };
 
 } // end namespace llvm
diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp
index 695f536..896fdbf 100644
--- a/lib/CodeGen/LiveRangeEdit.cpp
+++ b/lib/CodeGen/LiveRangeEdit.cpp
@@ -38,7 +38,7 @@ LiveInterval &LiveRangeEdit::createFrom(unsigned OldReg) {
     VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
   }
   LiveInterval &LI = LIS.getOrCreateInterval(VReg);
-  newRegs_.push_back(&LI);
+  NewRegs.push_back(&LI);
   return LI;
 }
 
@@ -46,16 +46,16 @@ bool LiveRangeEdit::checkRematerializable(VNInfo *VNI,
                                           const MachineInstr *DefMI,
                                           AliasAnalysis *aa) {
   assert(DefMI && "Missing instruction");
-  scannedRemattable_ = true;
+  ScannedRemattable = true;
   if (!TII.isTriviallyReMaterializable(DefMI, aa))
     return false;
-  remattable_.insert(VNI);
+  Remattable.insert(VNI);
   return true;
 }
 
 void LiveRangeEdit::scanRemattable(AliasAnalysis *aa) {
-  for (LiveInterval::vni_iterator I = parent_.vni_begin(),
-       E = parent_.vni_end(); I != E; ++I) {
+  for (LiveInterval::vni_iterator I = getParent().vni_begin(),
+       E = getParent().vni_end(); I != E; ++I) {
     VNInfo *VNI = *I;
     if (VNI->isUnused())
       continue;
@@ -64,13 +64,13 @@ void LiveRangeEdit::scanRemattable(AliasAnalysis *aa) {
       continue;
     checkRematerializable(VNI, DefMI, aa);
   }
-  scannedRemattable_ = true;
+  ScannedRemattable = true;
 }
 
 bool LiveRangeEdit::anyRematerializable(AliasAnalysis *aa) {
-  if (!scannedRemattable_)
+  if (!ScannedRemattable)
     scanRemattable(aa);
-  return !remattable_.empty();
+  return !Remattable.empty();
 }
 
 /// allUsesAvailableAt - Return true if all registers used by OrigMI at
@@ -82,12 +82,16 @@ bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI,
   UseIdx = UseIdx.getRegSlot(true);
   for (unsigned i = 0, e = OrigMI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = OrigMI->getOperand(i);
-    if (!MO.isReg() || !MO.getReg() || MO.isDef())
-      continue;
-    // Reserved registers are OK.
- if (MO.isUndef() || !LIS.hasInterval(MO.getReg())) + if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) continue; + // We can't remat physreg uses, unless it is a constant. + if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { + if (MRI.isConstantPhysReg(MO.getReg(), VRM->getMachineFunction())) + continue; + return false; + } + LiveInterval &li = LIS.getInterval(MO.getReg()); const VNInfo *OVNI = li.getVNInfoAt(OrigIdx); if (!OVNI) @@ -101,10 +105,10 @@ bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI, bool LiveRangeEdit::canRematerializeAt(Remat &RM, SlotIndex UseIdx, bool cheapAsAMove) { - assert(scannedRemattable_ && "Call anyRematerializable first"); + assert(ScannedRemattable && "Call anyRematerializable first"); // Use scanRemattable info. - if (!remattable_.count(RM.ParentVNI)) + if (!Remattable.count(RM.ParentVNI)) return false; // No defining instruction provided. @@ -136,13 +140,13 @@ SlotIndex LiveRangeEdit::rematerializeAt(MachineBasicBlock &MBB, bool Late) { assert(RM.OrigMI && "Invalid remat"); TII.reMaterialize(MBB, MI, DestReg, 0, RM.OrigMI, tri); - rematted_.insert(RM.ParentVNI); + Rematted.insert(RM.ParentVNI); return LIS.getSlotIndexes()->insertMachineInstrInMaps(--MI, Late) .getRegSlot(); } void LiveRangeEdit::eraseVirtReg(unsigned Reg) { - if (delegate_ && delegate_->LRE_CanEraseVirtReg(Reg)) + if (TheDelegate && TheDelegate->LRE_CanEraseVirtReg(Reg)) LIS.removeInterval(Reg); } @@ -173,6 +177,19 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI, if (!DefMI || !UseMI) return false; + // Since we're moving the DefMI load, make sure we're not extending any live + // ranges. + if (!allUsesAvailableAt(DefMI, + LIS.getInstructionIndex(DefMI), + LIS.getInstructionIndex(UseMI))) + return false; + + // We also need to make sure it is safe to move the load. + // Assume there are stores between DefMI and UseMI. + bool SawStore = true; + if (!DefMI->isSafeToMove(&TII, 0, SawStore)) + return false; + DEBUG(dbgs() << "Try to fold single def: " << *DefMI << " into single use: " << *UseMI); @@ -220,6 +237,9 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, DEBUG(dbgs() << "Deleting dead def " << Idx << '\t' << *MI); + // Collect virtual registers to be erased after MI is gone. + SmallVector<unsigned, 8> RegsToErase; + // Check for live intervals that may shrink for (MachineInstr::mop_iterator MOI = MI->operands_begin(), MOE = MI->operands_end(); MOI != MOE; ++MOI) { @@ -242,22 +262,30 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, // Remove defined value. if (MOI->isDef()) { if (VNInfo *VNI = LI.getVNInfoAt(Idx)) { - if (delegate_) - delegate_->LRE_WillShrinkVirtReg(LI.reg); + if (TheDelegate) + TheDelegate->LRE_WillShrinkVirtReg(LI.reg); LI.removeValNo(VNI); - if (LI.empty()) { - ToShrink.remove(&LI); - eraseVirtReg(Reg); - } + if (LI.empty()) + RegsToErase.push_back(Reg); } } } - if (delegate_) - delegate_->LRE_WillEraseInstruction(MI); + if (TheDelegate) + TheDelegate->LRE_WillEraseInstruction(MI); LIS.RemoveMachineInstrFromMaps(MI); MI->eraseFromParent(); ++NumDCEDeleted; + + // Erase any virtregs that are now empty and unused. There may be <undef> + // uses around. Keep the empty live range in that case. 
+ for (unsigned i = 0, e = RegsToErase.size(); i != e; ++i) { + unsigned Reg = RegsToErase[i]; + if (LIS.hasInterval(Reg) && MRI.reg_nodbg_empty(Reg)) { + ToShrink.remove(&LIS.getInterval(Reg)); + eraseVirtReg(Reg); + } + } } if (ToShrink.empty()) @@ -268,8 +296,8 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, ToShrink.pop_back(); if (foldAsLoad(LI, Dead)) continue; - if (delegate_) - delegate_->LRE_WillShrinkVirtReg(LI->reg); + if (TheDelegate) + TheDelegate->LRE_WillShrinkVirtReg(LI->reg); if (!LIS.shrinkToUses(LI, &Dead)) continue; @@ -304,10 +332,14 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, // interval must contain all the split products, and LI doesn't. if (IsOriginal) VRM->setIsSplitFromReg(Dups.back()->reg, 0); - if (delegate_) - delegate_->LRE_DidCloneVirtReg(Dups.back()->reg, LI->reg); + if (TheDelegate) + TheDelegate->LRE_DidCloneVirtReg(Dups.back()->reg, LI->reg); } ConEQ.Distribute(&Dups[0], MRI); + DEBUG({ + for (unsigned i = 0; i != NumComp; ++i) + dbgs() << '\t' << *Dups[i] << '\n'; + }); } } diff --git a/lib/CodeGen/LiveRegMatrix.cpp b/lib/CodeGen/LiveRegMatrix.cpp new file mode 100644 index 0000000..cdb1776 --- /dev/null +++ b/lib/CodeGen/LiveRegMatrix.cpp @@ -0,0 +1,152 @@ +//===-- LiveRegMatrix.cpp - Track register interference -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the LiveRegMatrix analysis pass. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "regalloc" +#include "LiveRegMatrix.h" +#include "VirtRegMap.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +STATISTIC(NumAssigned , "Number of registers assigned"); +STATISTIC(NumUnassigned , "Number of registers unassigned"); + +char LiveRegMatrix::ID = 0; +INITIALIZE_PASS_BEGIN(LiveRegMatrix, "liveregmatrix", + "Live Register Matrix", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_END(LiveRegMatrix, "liveregmatrix", + "Live Register Matrix", false, false) + +LiveRegMatrix::LiveRegMatrix() : MachineFunctionPass(ID), + UserTag(0), RegMaskTag(0), RegMaskVirtReg(0) {} + +void LiveRegMatrix::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequiredTransitive<LiveIntervals>(); + AU.addRequiredTransitive<VirtRegMap>(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +bool LiveRegMatrix::runOnMachineFunction(MachineFunction &MF) { + TRI = MF.getTarget().getRegisterInfo(); + MRI = &MF.getRegInfo(); + LIS = &getAnalysis<LiveIntervals>(); + VRM = &getAnalysis<VirtRegMap>(); + + unsigned NumRegUnits = TRI->getNumRegUnits(); + if (NumRegUnits != Matrix.size()) + Queries.reset(new LiveIntervalUnion::Query[NumRegUnits]); + Matrix.init(LIUAlloc, NumRegUnits); + + // Make sure no stale queries get reused. 
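The "no stale queries" comment above is backed by the UserTag scheme: invalidateVirtRegs() just bumps a counter, and every cached query is stamped with the counter value it was computed under. Invalidation is O(1) because no cache entries are touched; stale entries are detected lazily on the next lookup. A standalone sketch of tag-based cache invalidation (toy Matrix/CachedQuery names, not the LLVM classes):

#include <cstdio>

// Cached answer, stamped with the tag current when it was computed.
struct CachedQuery {
  unsigned Tag;
  bool Interference;
};

struct Matrix {
  unsigned UserTag; // bumped whenever the underlying data changes
  CachedQuery Cache;

  Matrix() : UserTag(0) { Cache.Tag = ~0u; }

  void invalidate() { ++UserTag; } // O(1): no cache entries are touched

  bool check() {
    if (Cache.Tag != UserTag) {   // stale or never computed: recompute
      Cache.Interference = false; // stand-in for the real computation
      Cache.Tag = UserTag;
      std::printf("recomputed\n");
    }
    return Cache.Interference;
  }
};

int main() {
  Matrix M;
  M.check();      // recomputes
  M.check();      // cache hit
  M.invalidate(); // live ranges changed
  M.check();      // recomputes again
}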
+ invalidateVirtRegs(); + return false; +} + +void LiveRegMatrix::releaseMemory() { + for (unsigned i = 0, e = Matrix.size(); i != e; ++i) { + Matrix[i].clear(); + Queries[i].clear(); + } +} + +void LiveRegMatrix::assign(LiveInterval &VirtReg, unsigned PhysReg) { + DEBUG(dbgs() << "assigning " << PrintReg(VirtReg.reg, TRI) + << " to " << PrintReg(PhysReg, TRI) << ':'); + assert(!VRM->hasPhys(VirtReg.reg) && "Duplicate VirtReg assignment"); + VRM->assignVirt2Phys(VirtReg.reg, PhysReg); + MRI->setPhysRegUsed(PhysReg); + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + DEBUG(dbgs() << ' ' << PrintRegUnit(*Units, TRI)); + Matrix[*Units].unify(VirtReg); + } + ++NumAssigned; + DEBUG(dbgs() << '\n'); +} + +void LiveRegMatrix::unassign(LiveInterval &VirtReg) { + unsigned PhysReg = VRM->getPhys(VirtReg.reg); + DEBUG(dbgs() << "unassigning " << PrintReg(VirtReg.reg, TRI) + << " from " << PrintReg(PhysReg, TRI) << ':'); + VRM->clearVirt(VirtReg.reg); + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + DEBUG(dbgs() << ' ' << PrintRegUnit(*Units, TRI)); + Matrix[*Units].extract(VirtReg); + } + ++NumUnassigned; + DEBUG(dbgs() << '\n'); +} + +bool LiveRegMatrix::checkRegMaskInterference(LiveInterval &VirtReg, + unsigned PhysReg) { + // Check if the cached information is valid. + // The same BitVector can be reused for all PhysRegs. + // We could cache multiple VirtRegs if it becomes necessary. + if (RegMaskVirtReg != VirtReg.reg || RegMaskTag != UserTag) { + RegMaskVirtReg = VirtReg.reg; + RegMaskTag = UserTag; + RegMaskUsable.clear(); + LIS->checkRegMaskInterference(VirtReg, RegMaskUsable); + } + + // The BitVector is indexed by PhysReg, not register unit. + // Regmask interference is more fine grained than regunits. + // For example, a Win64 call can clobber %ymm8 yet preserve %xmm8. + return !RegMaskUsable.empty() && (!PhysReg || !RegMaskUsable.test(PhysReg)); +} + +bool LiveRegMatrix::checkRegUnitInterference(LiveInterval &VirtReg, + unsigned PhysReg) { + if (VirtReg.empty()) + return false; + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) + if (VirtReg.overlaps(LIS->getRegUnit(*Units))) + return true; + return false; +} + +LiveIntervalUnion::Query &LiveRegMatrix::query(LiveInterval &VirtReg, + unsigned RegUnit) { + LiveIntervalUnion::Query &Q = Queries[RegUnit]; + Q.init(UserTag, &VirtReg, &Matrix[RegUnit]); + return Q; +} + +LiveRegMatrix::InterferenceKind +LiveRegMatrix::checkInterference(LiveInterval &VirtReg, unsigned PhysReg) { + if (VirtReg.empty()) + return IK_Free; + + // Regmask interference is the fastest check. + if (checkRegMaskInterference(VirtReg, PhysReg)) + return IK_RegMask; + + // Check for fixed interference. + if (checkRegUnitInterference(VirtReg, PhysReg)) + return IK_RegUnit; + + // Check the matrix for virtual register interference. + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) + if (query(VirtReg, *Units).checkInterference()) + return IK_VirtReg; + + return IK_Free; +} diff --git a/lib/CodeGen/LiveRegMatrix.h b/lib/CodeGen/LiveRegMatrix.h new file mode 100644 index 0000000..b3e2d7f --- /dev/null +++ b/lib/CodeGen/LiveRegMatrix.h @@ -0,0 +1,148 @@ +//===-- LiveRegMatrix.h - Track register interference ---------*- C++ -*---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+// The LiveRegMatrix analysis pass keeps track of virtual register interference
+// along two dimensions: Slot indexes and register units. The matrix is used by
+// register allocators to ensure that no interfering virtual registers get
+// assigned to overlapping physical registers.
+//
+// Register units are defined in MCRegisterInfo.h; they represent the smallest
+// unit of interference when dealing with overlapping physical registers. The
+// LiveRegMatrix is represented as a LiveIntervalUnion per register unit. When
+// a virtual register is assigned to a physical register, the live range for
+// the virtual register is inserted into the LiveIntervalUnion for each regunit
+// in the physreg.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_LIVEREGMATRIX_H
+#define LLVM_CODEGEN_LIVEREGMATRIX_H
+
+#include "LiveIntervalUnion.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+namespace llvm {
+
+class LiveInterval;
+class LiveIntervalAnalysis;
+class MachineRegisterInfo;
+class TargetRegisterInfo;
+class VirtRegMap;
+
+class LiveRegMatrix : public MachineFunctionPass {
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+ LiveIntervals *LIS;
+ VirtRegMap *VRM;
+
+ // UserTag changes whenever virtual registers have been modified.
+ unsigned UserTag;
+
+ // The matrix is represented as a LiveIntervalUnion per register unit.
+ LiveIntervalUnion::Allocator LIUAlloc;
+ LiveIntervalUnion::Array Matrix;
+
+ // Cached queries per register unit.
+ OwningArrayPtr<LiveIntervalUnion::Query> Queries;
+
+ // Cached register mask interference info.
+ unsigned RegMaskTag;
+ unsigned RegMaskVirtReg;
+ BitVector RegMaskUsable;
+
+ // MachineFunctionPass boilerplate.
+ virtual void getAnalysisUsage(AnalysisUsage&) const;
+ virtual bool runOnMachineFunction(MachineFunction&);
+ virtual void releaseMemory();
+public:
+ static char ID;
+ LiveRegMatrix();
+
+ //===--------------------------------------------------------------------===//
+ // High-level interface.
+ //===--------------------------------------------------------------------===//
+ //
+ // Check for interference before assigning virtual registers to physical
+ // registers.
+ //
+
+ /// Invalidate cached interference queries after modifying virtual register
+ /// live ranges. Interference checks may return stale information unless
+ /// caches are invalidated.
+ void invalidateVirtRegs() { ++UserTag; }
+
+ enum InterferenceKind {
+ /// No interference, go ahead and assign.
+ IK_Free = 0,
+
+ /// Virtual register interference. There are interfering virtual registers
+ /// assigned to PhysReg or its aliases. This interference could be resolved
+ /// by unassigning those other virtual registers.
+ IK_VirtReg,
+
+ /// Register unit interference. A fixed live range is in the way, typically
+ /// argument registers for a call. This can't be resolved by unassigning
+ /// other virtual registers.
+ IK_RegUnit,
+
+ /// RegMask interference. The live range is crossing an instruction with a
+ /// regmask operand that doesn't preserve PhysReg. This typically means
+ /// VirtReg is live across a call, and PhysReg isn't call-preserved.
+ IK_RegMask
+ };
+
+ /// Check for interference before assigning VirtReg to PhysReg.
+ /// If this function returns IK_Free, it is legal to assign(VirtReg, PhysReg). 
+ /// When there is more than one kind of interference, the InterferenceKind
+ /// with the highest enum value is returned.
+ InterferenceKind checkInterference(LiveInterval &VirtReg, unsigned PhysReg);
+
+ /// Assign VirtReg to PhysReg.
+ /// This will mark VirtReg's live range as occupied in the LiveRegMatrix and
+ /// update VirtRegMap. The live range is expected to be available in PhysReg.
+ void assign(LiveInterval &VirtReg, unsigned PhysReg);
+
+ /// Unassign VirtReg from its PhysReg.
+ /// Assuming that VirtReg was previously assigned to a PhysReg, this undoes
+ /// the assignment and updates VirtRegMap accordingly.
+ void unassign(LiveInterval &VirtReg);
+
+ //===--------------------------------------------------------------------===//
+ // Low-level interface.
+ //===--------------------------------------------------------------------===//
+ //
+ // Provide access to the underlying LiveIntervalUnions.
+ //
+
+ /// Check for regmask interference only.
+ /// Return true if VirtReg crosses a regmask operand that clobbers PhysReg.
+ /// If PhysReg is null, check if VirtReg crosses any regmask operands.
+ bool checkRegMaskInterference(LiveInterval &VirtReg, unsigned PhysReg = 0);
+
+ /// Check for regunit interference only.
+ /// Return true if VirtReg overlaps a fixed assignment of one of PhysReg's
+ /// register units.
+ bool checkRegUnitInterference(LiveInterval &VirtReg, unsigned PhysReg);
+
+ /// Query a line of the assigned virtual register matrix directly.
+ /// Use MCRegUnitIterator to enumerate all regunits in the desired PhysReg.
+ /// This returns a reference to an internal Query data structure that is only
+ /// valid until the next query() call.
+ LiveIntervalUnion::Query &query(LiveInterval &VirtReg, unsigned RegUnit);
+
+ /// Directly access the live interval unions per regunit.
+ /// This returns an array indexed by the regunit number. 
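Taken together, the interface above implies a simple allocator protocol: call checkInterference() and only assign() on IK_Free; IK_VirtReg can be resolved by evicting (unassign) the interfering virtual registers, while IK_RegUnit and IK_RegMask force a different physreg. A standalone sketch of that decision loop, with a toy checkInterference and made-up register numbers, not the real pass:

#include <cstdio>

enum InterferenceKind { IK_Free = 0, IK_VirtReg, IK_RegUnit, IK_RegMask };

// Toy stand-in: physreg 7 is blocked by a fixed range, physreg 3 by
// another virtual register, physreg 12 is free.
static InterferenceKind checkInterference(unsigned PhysReg) {
  if (PhysReg == 7) return IK_RegUnit;
  if (PhysReg == 3) return IK_VirtReg;
  return IK_Free;
}

int main() {
  const unsigned Order[] = {7, 3, 12};
  for (unsigned i = 0; i != 3; ++i) {
    switch (checkInterference(Order[i])) {
    case IK_Free:    // legal to assign(VirtReg, PhysReg)
      std::printf("assign to %u\n", Order[i]);
      return 0;
    case IK_VirtReg: // resolvable: unassign() the interfering vregs
      std::printf("%u busy, eviction possible\n", Order[i]);
      break;
    default:         // IK_RegUnit / IK_RegMask: pick another physreg
      std::printf("%u blocked\n", Order[i]);
      break;
    }
  }
  return 1;
}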
+ LiveIntervalUnion *getLiveUnions() { return &Matrix[0]; } +}; + +} // end namespace llvm + +#endif // LLVM_CODEGEN_LIVEREGMATRIX_H diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp index 5a0d97d..348ed3a 100644 --- a/lib/CodeGen/LiveVariables.cpp +++ b/lib/CodeGen/LiveVariables.cpp @@ -192,8 +192,8 @@ MachineInstr *LiveVariables::FindLastPartialDef(unsigned Reg, unsigned LastDefReg = 0; unsigned LastDefDist = 0; MachineInstr *LastDef = NULL; - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + unsigned SubReg = *SubRegs; MachineInstr *Def = PhysRegDef[SubReg]; if (!Def) continue; @@ -216,9 +216,8 @@ MachineInstr *LiveVariables::FindLastPartialDef(unsigned Reg, unsigned DefReg = MO.getReg(); if (TRI->isSubRegister(Reg, DefReg)) { PartDefRegs.insert(DefReg); - for (const uint16_t *SubRegs = TRI->getSubRegisters(DefReg); - unsigned SubReg = *SubRegs; ++SubRegs) - PartDefRegs.insert(SubReg); + for (MCSubRegIterator SubRegs(DefReg, TRI); SubRegs.isValid(); ++SubRegs) + PartDefRegs.insert(*SubRegs); } } return LastDef; @@ -247,8 +246,8 @@ void LiveVariables::HandlePhysRegUse(unsigned Reg, MachineInstr *MI) { true/*IsImp*/)); PhysRegDef[Reg] = LastPartialDef; SmallSet<unsigned, 8> Processed; - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + unsigned SubReg = *SubRegs; if (Processed.count(SubReg)) continue; if (PartDefRegs.count(SubReg)) @@ -259,7 +258,7 @@ void LiveVariables::HandlePhysRegUse(unsigned Reg, MachineInstr *MI) { false/*IsDef*/, true/*IsImp*/)); PhysRegDef[SubReg] = LastPartialDef; - for (const uint16_t *SS = TRI->getSubRegisters(SubReg); *SS; ++SS) + for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS) Processed.insert(*SS); } } @@ -271,9 +270,8 @@ void LiveVariables::HandlePhysRegUse(unsigned Reg, MachineInstr *MI) { // Remember this use. PhysRegUse[Reg] = MI; - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) - PhysRegUse[SubReg] = MI; + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + PhysRegUse[*SubRegs] = MI; } /// FindLastRefOrPartRef - Return the last reference or partial reference of @@ -287,8 +285,8 @@ MachineInstr *LiveVariables::FindLastRefOrPartRef(unsigned Reg) { MachineInstr *LastRefOrPartRef = LastUse ? LastUse : LastDef; unsigned LastRefOrPartRefDist = DistanceMap[LastRefOrPartRef]; unsigned LastPartDefDist = 0; - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + unsigned SubReg = *SubRegs; MachineInstr *Def = PhysRegDef[SubReg]; if (Def && Def != LastDef) { // There was a def of this sub-register in between. This is a partial @@ -336,8 +334,8 @@ bool LiveVariables::HandlePhysRegKill(unsigned Reg, MachineInstr *MI) { MachineInstr *LastPartDef = 0; unsigned LastPartDefDist = 0; SmallSet<unsigned, 8> PartUses; - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + unsigned SubReg = *SubRegs; MachineInstr *Def = PhysRegDef[SubReg]; if (Def && Def != LastDef) { // There was a def of this sub-register in between. 
This is a partial @@ -351,7 +349,7 @@ bool LiveVariables::HandlePhysRegKill(unsigned Reg, MachineInstr *MI) { } if (MachineInstr *Use = PhysRegUse[SubReg]) { PartUses.insert(SubReg); - for (const uint16_t *SS = TRI->getSubRegisters(SubReg); *SS; ++SS) + for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS) PartUses.insert(*SS); unsigned Dist = DistanceMap[Use]; if (Dist > LastRefOrPartRefDist) { @@ -367,8 +365,8 @@ bool LiveVariables::HandlePhysRegKill(unsigned Reg, MachineInstr *MI) { // EAX<dead> = op AL<imp-def> // That is, EAX def is dead but AL def extends pass it. PhysRegDef[Reg]->addRegisterDead(Reg, TRI, true); - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + unsigned SubReg = *SubRegs; if (!PartUses.count(SubReg)) continue; bool NeedDef = true; @@ -388,11 +386,10 @@ bool LiveVariables::HandlePhysRegKill(unsigned Reg, MachineInstr *MI) { else { LastRefOrPartRef->addRegisterKilled(SubReg, TRI, true); PhysRegUse[SubReg] = LastRefOrPartRef; - for (const uint16_t *SSRegs = TRI->getSubRegisters(SubReg); - unsigned SSReg = *SSRegs; ++SSRegs) - PhysRegUse[SSReg] = LastRefOrPartRef; + for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS) + PhysRegUse[*SS] = LastRefOrPartRef; } - for (const uint16_t *SS = TRI->getSubRegisters(SubReg); *SS; ++SS) + for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS) PartUses.erase(*SS); } } else if (LastRefOrPartRef == PhysRegDef[Reg] && LastRefOrPartRef != MI) { @@ -434,7 +431,7 @@ void LiveVariables::HandleRegMask(const MachineOperand &MO) { // Kill the largest clobbered super-register. // This avoids needless implicit operands. unsigned Super = Reg; - for (const uint16_t *SR = TRI->getSuperRegisters(Reg); *SR; ++SR) + for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR) if ((PhysRegDef[*SR] || PhysRegUse[*SR]) && MO.clobbersPhysReg(*SR)) Super = *SR; HandlePhysRegKill(Super, 0); @@ -447,11 +444,11 @@ void LiveVariables::HandlePhysRegDef(unsigned Reg, MachineInstr *MI, SmallSet<unsigned, 32> Live; if (PhysRegDef[Reg] || PhysRegUse[Reg]) { Live.insert(Reg); - for (const uint16_t *SS = TRI->getSubRegisters(Reg); *SS; ++SS) - Live.insert(*SS); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + Live.insert(*SubRegs); } else { - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + unsigned SubReg = *SubRegs; // If a register isn't itself defined, but all parts that make up of it // are defined, then consider it also defined. // e.g. @@ -462,7 +459,7 @@ void LiveVariables::HandlePhysRegDef(unsigned Reg, MachineInstr *MI, continue; if (PhysRegDef[SubReg] || PhysRegUse[SubReg]) { Live.insert(SubReg); - for (const uint16_t *SS = TRI->getSubRegisters(SubReg); *SS; ++SS) + for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS) Live.insert(*SS); } } @@ -472,8 +469,8 @@ void LiveVariables::HandlePhysRegDef(unsigned Reg, MachineInstr *MI, // is referenced. HandlePhysRegKill(Reg, MI); // Only some of the sub-registers are used. - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + unsigned SubReg = *SubRegs; if (!Live.count(SubReg)) // Skip if this sub-register isn't defined. 
continue; @@ -491,8 +488,8 @@ void LiveVariables::UpdatePhysRegDefs(MachineInstr *MI, Defs.pop_back(); PhysRegDef[Reg] = MI; PhysRegUse[Reg] = NULL; - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + unsigned SubReg = *SubRegs; PhysRegDef[SubReg] = MI; PhysRegUse[SubReg] = NULL; } @@ -576,7 +573,8 @@ bool LiveVariables::runOnMachineFunction(MachineFunction &mf) { unsigned MOReg = MO.getReg(); if (MO.isUse()) { MO.setIsKill(false); - UseRegs.push_back(MOReg); + if (MO.readsReg()) + UseRegs.push_back(MOReg); } else /*MO.isDef()*/ { MO.setIsDead(false); DefRegs.push_back(MOReg); @@ -732,8 +730,9 @@ void LiveVariables::analyzePHINodes(const MachineFunction& Fn) { for (MachineBasicBlock::const_iterator BBI = I->begin(), BBE = I->end(); BBI != BBE && BBI->isPHI(); ++BBI) for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2) - PHIVarInfo[BBI->getOperand(i + 1).getMBB()->getNumber()] - .push_back(BBI->getOperand(i).getReg()); + if (BBI->getOperand(i).readsReg()) + PHIVarInfo[BBI->getOperand(i + 1).getMBB()->getNumber()] + .push_back(BBI->getOperand(i).getReg()); } bool LiveVariables::VarInfo::isLiveIn(const MachineBasicBlock &MBB, diff --git a/lib/CodeGen/LocalStackSlotAllocation.cpp b/lib/CodeGen/LocalStackSlotAllocation.cpp index 238bf52..fbc9e20 100644 --- a/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -314,7 +314,8 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) { // No previously defined register was in range, so create a // new one. int64_t InstrOffset = TRI->getFrameIndexInstrOffset(MI, idx); - const TargetRegisterClass *RC = TRI->getPointerRegClass(); + const MachineFunction *MF = MI->getParent()->getParent(); + const TargetRegisterClass *RC = TRI->getPointerRegClass(*MF); BaseReg = Fn.getRegInfo().createVirtualRegister(RC); DEBUG(dbgs() << " Materializing base register " << BaseReg << diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 1abb8f2..ecc1e95 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -271,11 +271,9 @@ void MachineBasicBlock::print(raw_ostream &OS, SlotIndexes *Indexes) const { } if (isLandingPad()) { OS << Comma << "EH LANDING PAD"; Comma = ", "; } if (hasAddressTaken()) { OS << Comma << "ADDRESS TAKEN"; Comma = ", "; } - if (Alignment) { + if (Alignment) OS << Comma << "Align " << Alignment << " (" << (1u << Alignment) << " bytes)"; - Comma = ", "; - } OS << '\n'; @@ -596,6 +594,11 @@ bool MachineBasicBlock::canFallThrough() { MachineBasicBlock * MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { + // Splitting the critical edge to a landing pad block is non-trivial. Don't do + // it in this generic function. + if (Succ->isLandingPad()) + return NULL; + MachineFunction *MF = getParent(); DebugLoc dl; // FIXME: this is nowhere @@ -670,7 +673,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { // Inherit live-ins from the successor for (MachineBasicBlock::livein_iterator I = Succ->livein_begin(), - E = Succ->livein_end(); I != E; ++I) + E = Succ->livein_end(); I != E; ++I) NMBB->addLiveIn(*I); // Update LiveVariables. 
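Most of the churn in the LiveVariables hunks above is mechanical: zero-terminated uint16_t sub/super-register tables become MCSubRegIterator-style objects with an explicit isValid() test, so callers never see the sentinel and sub-, super-, and alias-set walks all share one loop shape. A standalone sketch contrasting the two loop styles (toy iterator and table, not the MC classes):

#include <cstdio>

// Old style: a 0-terminated array of register numbers.
static const unsigned short SubRegs[] = {10, 11, 12, 0};

// New style: wrap the same table in an iterator with an explicit
// validity test.
class SubRegIterator {
  const unsigned short *Ptr;
public:
  explicit SubRegIterator(const unsigned short *P) : Ptr(P) {}
  bool isValid() const { return *Ptr != 0; }
  void operator++() { ++Ptr; }
  unsigned operator*() const { return *Ptr; }
};

int main() {
  // Old loop: the sentinel test is fused into the for-condition.
  for (const unsigned short *SR = SubRegs; unsigned Reg = *SR; ++SR)
    std::printf("old: %u\n", Reg);

  // New loop: same traversal, uniform shape for all register sets.
  for (SubRegIterator It(SubRegs); It.isValid(); ++It)
    std::printf("new: %u\n", *It);
}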
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index 5ba6851..5a15f92 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -11,7 +11,7 @@ // structure and branch probability estimates. // // The pass strives to preserve the structure of the CFG (that is, retain -// a topological ordering of basic blocks) in the absense of a *strong* signal +// a topological ordering of basic blocks) in the absence of a *strong* signal // to the contrary from probabilities. However, within the CFG structure, it // attempts to choose an ordering which favors placing more likely sequences of // blocks adjacent to each other. @@ -63,17 +63,13 @@ namespace { /// /// This is the datastructure representing a chain of consecutive blocks that /// are profitable to layout together in order to maximize fallthrough -/// probabilities. We also can use a block chain to represent a sequence of -/// basic blocks which have some external (correctness) requirement for -/// sequential layout. +/// probabilities and code locality. We also can use a block chain to represent +/// a sequence of basic blocks which have some external (correctness) +/// requirement for sequential layout. /// -/// Eventually, the block chains will form a directed graph over the function. -/// We provide an SCC-supporting-iterator in order to quicky build and walk the -/// SCCs of block chains within a function. -/// -/// The block chains also have support for calculating and caching probability -/// information related to the chain itself versus other chains. This is used -/// for ranking during the final layout of block chains. +/// Chains can be built around a single basic block and can be merged to grow +/// them. They participate in a block-to-chain mapping, which is updated +/// automatically as chains are merged together. class BlockChain { /// \brief The sequence of blocks belonging to this chain. /// @@ -179,10 +175,11 @@ class MachineBlockPlacement : public MachineFunctionPass { /// \brief Allocator and owner of BlockChain structures. /// - /// We build BlockChains lazily by merging together high probability BB - /// sequences acording to the "Algo2" in the paper mentioned at the top of - /// the file. To reduce malloc traffic, we allocate them using this slab-like - /// allocator, and destroy them after the pass completes. + /// We build BlockChains lazily while processing the loop structure of + /// a function. To reduce malloc traffic, we allocate them using this + /// slab-like allocator, and destroy them after the pass completes. An + /// important guarantee is that this allocator produces stable pointers to + /// the chains. SpecificBumpPtrAllocator<BlockChain> ChainAllocator; /// \brief Function wide BasicBlock to BlockChain mapping. @@ -329,7 +326,7 @@ MachineBasicBlock *MachineBlockPlacement::selectBestSuccessor( // the MBPI analysis, we manually compute probabilities using the edge // weights. This is suboptimal as it means that the somewhat subtle // definition of edge weight semantics is encoded here as well. We should - // improve the MBPI interface to effeciently support query patterns such as + // improve the MBPI interface to efficiently support query patterns such as // this. uint32_t BestWeight = 0; uint32_t WeightScale = 0; @@ -1053,7 +1050,7 @@ namespace { /// /// A separate pass to compute interesting statistics for evaluating block /// placement. 
This is separate from the actual placement pass so that they can -/// be computed in the absense of any placement transformations or when using +/// be computed in the absence of any placement transformations or when using /// alternative placement strategies. class MachineBlockPlacementStats : public MachineFunctionPass { /// \brief A handle to the branch probability pass. diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp index a63688e..9cfe9ab 100644 --- a/lib/CodeGen/MachineCSE.cpp +++ b/lib/CodeGen/MachineCSE.cpp @@ -84,7 +84,7 @@ namespace { bool PerformTrivialCoalescing(MachineInstr *MI, MachineBasicBlock *MBB); bool isPhysDefTriviallyDead(unsigned Reg, MachineBasicBlock::const_iterator I, - MachineBasicBlock::const_iterator E) const ; + MachineBasicBlock::const_iterator E) const; bool hasLivePhysRegDefUses(const MachineInstr *MI, const MachineBasicBlock *MBB, SmallSet<unsigned,8> &PhysRefs, @@ -100,8 +100,7 @@ namespace { void ExitScope(MachineBasicBlock *MBB); bool ProcessBlock(MachineBasicBlock *MBB); void ExitScopeIfDone(MachineDomTreeNode *Node, - DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren, - DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> &ParentMap); + DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren); bool PerformCSE(MachineDomTreeNode *Node); }; } // end anonymous namespace @@ -216,11 +215,10 @@ bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI, if (MO.isDef() && (MO.isDead() || isPhysDefTriviallyDead(Reg, I, MBB->end()))) continue; - PhysRefs.insert(Reg); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + PhysRefs.insert(*AI); if (MO.isDef()) PhysDefs.push_back(Reg); - for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) - PhysRefs.insert(*Alias); } return !PhysRefs.empty(); @@ -437,7 +435,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { // used, then it's not safe to replace it with a common subexpression. // It's also not safe if the instruction uses physical registers. bool CrossMBBPhysDef = false; - SmallSet<unsigned,8> PhysRefs; + SmallSet<unsigned, 8> PhysRefs; SmallVector<unsigned, 2> PhysDefs; if (FoundCSE && hasLivePhysRegDefUses(MI, MBB, PhysRefs, PhysDefs)) { FoundCSE = false; @@ -480,6 +478,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { "Do not CSE physical register defs!"); if (!isProfitableToCSE(NewReg, OldReg, CSMI, MI)) { + DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n"); DoCSE = false; break; } @@ -488,6 +487,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { // within the register class of the new instruction. const TargetRegisterClass *OldRC = MRI->getRegClass(OldReg); if (!MRI->constrainRegClass(NewReg, OldRC)) { + DEBUG(dbgs() << "*** Not the same register class, avoid CSE!\n"); DoCSE = false; break; } @@ -522,7 +522,6 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { ++NumCommutes; Changed = true; } else { - DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n"); VNT.insert(MI, CurrVN++); Exps.push_back(MI); } @@ -537,8 +536,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { /// up the dominator tree to destroy ancestors which are now done. 
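The MachineCSE hunk that follows drops the side ParentMap in favor of MachineDomTreeNode::getIDom(): the dominator tree already carries parent links, so a separate child-to-parent map is redundant state. A standalone sketch of closing scopes by walking those parent links (toy Node type, not MachineDomTreeNode):

#include <cstdio>

struct Node {
  Node *IDom;        // immediate dominator, null at the root
  unsigned OpenKids; // children not yet fully processed
  Node *getIDom() const { return IDom; }
};

// Once a node's subtree is done, walk up and close every ancestor whose
// last open child this was. No side table is needed; the tree itself
// supplies the parent links.
static void exitScopeIfDone(Node *N) {
  if (N->OpenKids)
    return;
  std::printf("closing scope\n");
  while (Node *Parent = N->getIDom()) {
    if (--Parent->OpenKids != 0)
      break;
    std::printf("closing ancestor scope\n");
    N = Parent;
  }
}

int main() {
  Node Root = {0, 1};
  Node Leaf = {&Root, 0};
  exitScopeIfDone(&Leaf); // closes Leaf's scope, then Root's
}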
void MachineCSE::ExitScopeIfDone(MachineDomTreeNode *Node, - DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren, - DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> &ParentMap) { + DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren) { if (OpenChildren[Node]) return; @@ -546,7 +544,7 @@ MachineCSE::ExitScopeIfDone(MachineDomTreeNode *Node, ExitScope(Node->getBlock()); // Now traverse upwards to pop ancestors whose offsprings are all done. - while (MachineDomTreeNode *Parent = ParentMap[Node]) { + while (MachineDomTreeNode *Parent = Node->getIDom()) { unsigned Left = --OpenChildren[Parent]; if (Left != 0) break; @@ -558,7 +556,6 @@ MachineCSE::ExitScopeIfDone(MachineDomTreeNode *Node, bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) { SmallVector<MachineDomTreeNode*, 32> Scopes; SmallVector<MachineDomTreeNode*, 8> WorkList; - DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> ParentMap; DenseMap<MachineDomTreeNode*, unsigned> OpenChildren; CurrVN = 0; @@ -573,7 +570,6 @@ bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) { OpenChildren[Node] = NumChildren; for (unsigned i = 0; i != NumChildren; ++i) { MachineDomTreeNode *Child = Children[i]; - ParentMap[Child] = Node; WorkList.push_back(Child); } } while (!WorkList.empty()); @@ -586,7 +582,7 @@ bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) { EnterScope(MBB); Changed |= ProcessBlock(MBB); // If it's a leaf node, it's done. Traverse upwards to pop ancestors. - ExitScopeIfDone(Node, OpenChildren, ParentMap); + ExitScopeIfDone(Node, OpenChildren); } return Changed; diff --git a/lib/CodeGen/MachineCopyPropagation.cpp b/lib/CodeGen/MachineCopyPropagation.cpp index 9730eaa..bac3aa2 100644 --- a/lib/CodeGen/MachineCopyPropagation.cpp +++ b/lib/CodeGen/MachineCopyPropagation.cpp @@ -62,28 +62,16 @@ void MachineCopyPropagation::SourceNoLongerAvailable(unsigned Reg, SourceMap &SrcMap, DenseMap<unsigned, MachineInstr*> &AvailCopyMap) { - SourceMap::iterator SI = SrcMap.find(Reg); - if (SI != SrcMap.end()) { - const DestList& Defs = SI->second; - for (DestList::const_iterator I = Defs.begin(), E = Defs.end(); - I != E; ++I) { - unsigned MappedDef = *I; - // Source of copy is no longer available for propagation. - if (AvailCopyMap.erase(MappedDef)) { - for (const uint16_t *SR = TRI->getSubRegisters(MappedDef); *SR; ++SR) - AvailCopyMap.erase(*SR); - } - } - } - for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS) { - SI = SrcMap.find(*AS); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { + SourceMap::iterator SI = SrcMap.find(*AI); if (SI != SrcMap.end()) { const DestList& Defs = SI->second; for (DestList::const_iterator I = Defs.begin(), E = Defs.end(); I != E; ++I) { unsigned MappedDef = *I; + // Source of copy is no longer available for propagation. if (AvailCopyMap.erase(MappedDef)) { - for (const uint16_t *SR = TRI->getSubRegisters(MappedDef); *SR; ++SR) + for (MCSubRegIterator SR(MappedDef, TRI); SR.isValid(); ++SR) AvailCopyMap.erase(*SR); } } @@ -188,11 +176,8 @@ bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { } // If Src is defined by a previous copy, it cannot be eliminated. 
- CI = CopyMap.find(Src); - if (CI != CopyMap.end()) - MaybeDeadCopies.remove(CI->second); - for (const uint16_t *AS = TRI->getAliasSet(Src); *AS; ++AS) { - CI = CopyMap.find(*AS); + for (MCRegAliasIterator AI(Src, TRI, true); AI.isValid(); ++AI) { + CI = CopyMap.find(*AI); if (CI != CopyMap.end()) MaybeDeadCopies.remove(CI->second); } @@ -211,13 +196,13 @@ bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { // Remember Def is defined by the copy. // ... Make sure to clear the def maps of aliases first. - for (const uint16_t *AS = TRI->getAliasSet(Def); *AS; ++AS) { - CopyMap.erase(*AS); - AvailCopyMap.erase(*AS); + for (MCRegAliasIterator AI(Def, TRI, false); AI.isValid(); ++AI) { + CopyMap.erase(*AI); + AvailCopyMap.erase(*AI); } CopyMap[Def] = MI; AvailCopyMap[Def] = MI; - for (const uint16_t *SR = TRI->getSubRegisters(Def); *SR; ++SR) { + for (MCSubRegIterator SR(Def, TRI); SR.isValid(); ++SR) { CopyMap[*SR] = MI; AvailCopyMap[*SR] = MI; } @@ -256,11 +241,8 @@ bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { // If 'Reg' is defined by a copy, the copy is no longer a candidate // for elimination. - DenseMap<unsigned, MachineInstr*>::iterator CI = CopyMap.find(Reg); - if (CI != CopyMap.end()) - MaybeDeadCopies.remove(CI->second); - for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS) { - CI = CopyMap.find(*AS); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { + DenseMap<unsigned, MachineInstr*>::iterator CI = CopyMap.find(*AI); if (CI != CopyMap.end()) MaybeDeadCopies.remove(CI->second); } @@ -296,11 +278,9 @@ bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { unsigned Reg = Defs[i]; // No longer defined by a copy. - CopyMap.erase(Reg); - AvailCopyMap.erase(Reg); - for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS) { - CopyMap.erase(*AS); - AvailCopyMap.erase(*AS); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { + CopyMap.erase(*AI); + AvailCopyMap.erase(*AI); } // If 'Reg' is previously source of a copy, it is no longer available for diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index d8c2f6a..d4aede8 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/DebugInfo.h" #include "llvm/Function.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -26,7 +27,6 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLowering.h" @@ -60,7 +60,7 @@ MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM, MFInfo = 0; FrameInfo = new (Allocator) MachineFrameInfo(*TM.getFrameLowering()); if (Fn->hasFnAttr(Attribute::StackAlignment)) - FrameInfo->setMaxAlignment(Attribute::getStackAlignmentFromAttrs( + FrameInfo->ensureMaxAlignment(Attribute::getStackAlignmentFromAttrs( Fn->getAttributes().getFnAttributes())); ConstantPool = new (Allocator) MachineConstantPool(TM.getTargetData()); Alignment = TM.getTargetLowering()->getMinFunctionAlignment(); @@ -84,9 +84,13 @@ MachineFunction::~MachineFunction() { MFInfo->~MachineFunctionInfo(); Allocator.Deallocate(MFInfo); } - FrameInfo->~MachineFrameInfo(); 
Allocator.Deallocate(FrameInfo); - ConstantPool->~MachineConstantPool(); Allocator.Deallocate(ConstantPool); - + + FrameInfo->~MachineFrameInfo(); + Allocator.Deallocate(FrameInfo); + + ConstantPool->~MachineConstantPool(); + Allocator.Deallocate(ConstantPool); + if (JumpTableInfo) { JumpTableInfo->~MachineJumpTableInfo(); Allocator.Deallocate(JumpTableInfo); @@ -98,7 +102,7 @@ MachineFunction::~MachineFunction() { MachineJumpTableInfo *MachineFunction:: getOrCreateJumpTableInfo(unsigned EntryKind) { if (JumpTableInfo) return JumpTableInfo; - + JumpTableInfo = new (Allocator) MachineJumpTableInfo((MachineJumpTableInfo::JTEntryKind)EntryKind); return JumpTableInfo; @@ -116,12 +120,12 @@ void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) { MBBI = begin(); else MBBI = MBB; - + // Figure out the block number this should have. unsigned BlockNo = 0; if (MBBI != begin()) BlockNo = prior(MBBI)->getNumber()+1; - + for (; MBBI != E; ++MBBI, ++BlockNo) { if (MBBI->getNumber() != (int)BlockNo) { // Remove use of the old number. @@ -130,7 +134,7 @@ void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) { "MBB number mismatch!"); MBBNumbering[MBBI->getNumber()] = 0; } - + // If BlockNo is already taken, set that block's number to -1. if (MBBNumbering[BlockNo]) MBBNumbering[BlockNo]->setNumber(-1); @@ -138,7 +142,7 @@ void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) { MBBNumbering[BlockNo] = MBBI; MBBI->setNumber(BlockNo); } - } + } // Okay, all the blocks are renumbered. If we have compactified the block // numbering, shrink MBBNumbering now. @@ -295,16 +299,16 @@ void MachineFunction::print(raw_ostream &OS, SlotIndexes *Indexes) const { // Print Frame Information FrameInfo->print(*this, OS); - + // Print JumpTable Information if (JumpTableInfo) JumpTableInfo->print(OS); // Print Constant Pool ConstantPool->print(OS); - + const TargetRegisterInfo *TRI = getTarget().getRegisterInfo(); - + if (RegInfo && !RegInfo->livein_empty()) { OS << "Function Live Ins: "; for (MachineRegisterInfo::livein_iterator @@ -324,7 +328,7 @@ void MachineFunction::print(raw_ostream &OS, SlotIndexes *Indexes) const { OS << ' ' << PrintReg(*I, TRI); OS << '\n'; } - + for (const_iterator BB = begin(), E = end(); BB != E; ++BB) { OS << '\n'; BB->print(OS, Indexes); @@ -411,10 +415,9 @@ unsigned MachineFunction::addLiveIn(unsigned PReg, MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx, bool isLinkerPrivate) const { assert(JumpTableInfo && "No jump tables"); - assert(JTI < JumpTableInfo->getJumpTables().size() && "Invalid JTI!"); const MCAsmInfo &MAI = *getTarget().getMCAsmInfo(); - + const char *Prefix = isLinkerPrivate ? MAI.getLinkerPrivateGlobalPrefix() : MAI.getPrivateGlobalPrefix(); SmallString<60> Name; @@ -691,7 +694,7 @@ static bool CanShareConstantPoolEntry(const Constant *A, const Constant *B, else if (B->getType() != IntTy) B = ConstantFoldInstOperands(Instruction::BitCast, IntTy, const_cast<Constant*>(B), TD); - + return A == B; } @@ -714,7 +717,7 @@ unsigned MachineConstantPool::getConstantPoolIndex(const Constant *C, Constants[i].Alignment = Alignment; return i; } - + Constants.push_back(MachineConstantPoolEntry(C, Alignment)); return Constants.size()-1; } @@ -723,7 +726,7 @@ unsigned MachineConstantPool::getConstantPoolIndex(MachineConstantPoolValue *V, unsigned Alignment) { assert(Alignment && "Alignment must be specified!"); if (Alignment > PoolAlignment) PoolAlignment = Alignment; - + // Check to see if we already have this constant. 
// // FIXME, this could be made much more efficient for large constant pools. diff --git a/lib/CodeGen/MachineFunctionPrinterPass.cpp b/lib/CodeGen/MachineFunctionPrinterPass.cpp index 2aaa798..0102ac7 100644 --- a/lib/CodeGen/MachineFunctionPrinterPass.cpp +++ b/lib/CodeGen/MachineFunctionPrinterPass.cpp @@ -14,7 +14,9 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/SlotIndexes.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Debug.h" using namespace llvm; @@ -28,6 +30,7 @@ struct MachineFunctionPrinterPass : public MachineFunctionPass { raw_ostream &OS; const std::string Banner; + MachineFunctionPrinterPass() : MachineFunctionPass(ID), OS(dbgs()) { } MachineFunctionPrinterPass(raw_ostream &os, const std::string &banner) : MachineFunctionPass(ID), OS(os), Banner(banner) {} @@ -40,7 +43,7 @@ struct MachineFunctionPrinterPass : public MachineFunctionPass { bool runOnMachineFunction(MachineFunction &MF) { OS << "# " << Banner << ":\n"; - MF.print(OS); + MF.print(OS, getAnalysisIfAvailable<SlotIndexes>()); return false; } }; @@ -48,6 +51,10 @@ struct MachineFunctionPrinterPass : public MachineFunctionPass { char MachineFunctionPrinterPass::ID = 0; } +char &MachineFunctionPrinterPassID = MachineFunctionPrinterPass::ID; +INITIALIZE_PASS(MachineFunctionPrinterPass, "print-machineinstrs", + "Machine Function Printer", false, false) + namespace llvm { /// Returns a newly-created MachineFunction Printer pass. The /// default banner is empty. diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp index e553a04..8dada05 100644 --- a/lib/CodeGen/MachineInstr.cpp +++ b/lib/CodeGen/MachineInstr.cpp @@ -13,6 +13,7 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/Function.h" #include "llvm/InlineAsm.h" #include "llvm/LLVMContext.h" @@ -33,7 +34,6 @@ #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/LeakDetector.h" @@ -186,7 +186,8 @@ void MachineOperand::ChangeToRegister(unsigned Reg, bool isDef, bool isImp, } /// isIdenticalTo - Return true if this operand is identical to the specified -/// operand. +/// operand. Note that this should stay in sync with the hash_value overload +/// below. bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const { if (getType() != Other.getType() || getTargetFlags() != Other.getTargetFlags()) @@ -227,6 +228,46 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const { llvm_unreachable("Invalid machine operand type"); } +// Note: this must stay exactly in sync with isIdenticalTo above. 
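The sync note above (and its twin added to isIdenticalTo) pins down a classic invariant: an equality predicate and its hash function must consume exactly the same fields, otherwise equal keys can hash to different buckets and hash-table lookups silently fail. A standalone sketch of the pairing, using std::hash and a boost-style combine step rather than LLVM's hash_combine:

#include <cassert>
#include <cstddef>
#include <functional>

struct Operand {
  int Kind;
  unsigned Reg;
  // Equality consumes exactly {Kind, Reg}...
  bool operator==(const Operand &O) const {
    return Kind == O.Kind && Reg == O.Reg;
  }
};

// ...so the hash must consume exactly {Kind, Reg} as well. Adding or
// dropping a field on one side but not the other breaks lookups.
static std::size_t hash_value(const Operand &O) {
  std::size_t H = std::hash<int>()(O.Kind);
  H ^= std::hash<unsigned>()(O.Reg) + 0x9e3779b9u + (H << 6) + (H >> 2);
  return H;
}

int main() {
  Operand A = {1, 5}, B = {1, 5};
  assert(A == B && hash_value(A) == hash_value(B));
}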
+hash_code llvm::hash_value(const MachineOperand &MO) { + switch (MO.getType()) { + case MachineOperand::MO_Register: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getReg(), + MO.getSubReg(), MO.isDef()); + case MachineOperand::MO_Immediate: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getImm()); + case MachineOperand::MO_CImmediate: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getCImm()); + case MachineOperand::MO_FPImmediate: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getFPImm()); + case MachineOperand::MO_MachineBasicBlock: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMBB()); + case MachineOperand::MO_FrameIndex: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex()); + case MachineOperand::MO_ConstantPoolIndex: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex(), + MO.getOffset()); + case MachineOperand::MO_JumpTableIndex: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex()); + case MachineOperand::MO_ExternalSymbol: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getOffset(), + MO.getSymbolName()); + case MachineOperand::MO_GlobalAddress: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getGlobal(), + MO.getOffset()); + case MachineOperand::MO_BlockAddress: + return hash_combine(MO.getType(), MO.getTargetFlags(), + MO.getBlockAddress()); + case MachineOperand::MO_RegisterMask: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getRegMask()); + case MachineOperand::MO_Metadata: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMetadata()); + case MachineOperand::MO_MCSymbol: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMCSymbol()); + } + llvm_unreachable("Invalid machine operand type"); +} + /// print - Print the specified machine operand. /// void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const { @@ -255,12 +296,16 @@ void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const { OS << "imp-"; OS << "def"; NeedComma = true; + // <def,read-undef> only makes sense when getSubReg() is set. + // Don't clutter the output otherwise. + if (isUndef() && getSubReg()) + OS << ",read-undef"; } else if (isImplicit()) { OS << "imp-use"; NeedComma = true; } - if (isKill() || isDead() || isUndef() || isInternalRead()) { + if (isKill() || isDead() || (isUndef() && isUse()) || isInternalRead()) { if (NeedComma) OS << ','; NeedComma = false; if (isKill()) { @@ -271,7 +316,7 @@ void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const { OS << "dead"; NeedComma = true; } - if (isUndef()) { + if (isUndef() && isUse()) { if (NeedComma) OS << ','; OS << "undef"; NeedComma = true; @@ -656,7 +701,9 @@ void MachineInstr::addOperand(const MachineOperand &Op) { // OpNo now points as the desired insertion point. Unless this is a variadic // instruction, only implicit regs are allowed beyond MCID->getNumOperands(). - assert((isImpReg || MCID->isVariadic() || OpNo < MCID->getNumOperands()) && + // RegMask operands go between the explicit and implicit operands. + assert((isImpReg || Op.isRegMask() || MCID->isVariadic() || + OpNo < MCID->getNumOperands()) && "Trying to add an operand to a machine instr that is already done!"); // All operands from OpNo have been removed from RegInfo. 
If the Operands @@ -868,7 +915,8 @@ void MachineInstr::eraseFromParent() { MBB->erase(MI); } } - getParent()->erase(this); + // Erase the individual instruction, which may itself be inside a bundle. + getParent()->erase_instr(this); } @@ -938,9 +986,13 @@ const TargetRegisterClass* MachineInstr::getRegClassConstraint(unsigned OpIdx, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) const { + assert(getParent() && "Can't have an MBB reference here!"); + assert(getParent()->getParent() && "Can't have an MF reference here!"); + const MachineFunction &MF = *getParent()->getParent(); + // Most opcodes have fixed constraints in their MCInstrDesc. if (!isInlineAsm()) - return TII->getRegClass(getDesc(), OpIdx, TRI); + return TII->getRegClass(getDesc(), OpIdx, TRI, MF); if (!getOperand(OpIdx).isReg()) return NULL; @@ -962,7 +1014,7 @@ MachineInstr::getRegClassConstraint(unsigned OpIdx, // Assume that all registers in a memory operand are pointers. if (InlineAsm::getKind(Flag) == InlineAsm::Kind_Mem) - return TRI->getPointerRegClass(); + return TRI->getPointerRegClass(MF); return NULL; } @@ -1530,12 +1582,14 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const { const MachineRegisterInfo &MRI = MF->getRegInfo(); if (MRI.use_empty(Reg) && !MRI.isLiveOut(Reg)) { bool HasAliasLive = false; - for (const uint16_t *Alias = TM->getRegisterInfo()->getAliasSet(Reg); - unsigned AliasReg = *Alias; ++Alias) + for (MCRegAliasIterator AI(Reg, TM->getRegisterInfo(), true); + AI.isValid(); ++AI) { + unsigned AliasReg = *AI; if (!MRI.use_empty(AliasReg) || MRI.isLiveOut(AliasReg)) { HasAliasLive = true; break; } + } if (!HasAliasLive) { OmittedAnyCallClobbers = true; continue; @@ -1667,7 +1721,8 @@ bool MachineInstr::addRegisterKilled(unsigned IncomingReg, const TargetRegisterInfo *RegInfo, bool AddIfNotFound) { bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(IncomingReg); - bool hasAliases = isPhysReg && RegInfo->getAliasSet(IncomingReg); + bool hasAliases = isPhysReg && + MCRegAliasIterator(IncomingReg, RegInfo, false).isValid(); bool Found = false; SmallVector<unsigned,4> DeadOps; for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { @@ -1739,7 +1794,8 @@ bool MachineInstr::addRegisterDead(unsigned IncomingReg, const TargetRegisterInfo *RegInfo, bool AddIfNotFound) { bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(IncomingReg); - bool hasAliases = isPhysReg && RegInfo->getAliasSet(IncomingReg); + bool hasAliases = isPhysReg && + MCRegAliasIterator(IncomingReg, RegInfo, false).isValid(); bool Found = false; SmallVector<unsigned,4> DeadOps; for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { @@ -1758,9 +1814,7 @@ bool MachineInstr::addRegisterDead(unsigned IncomingReg, // There exists a super-register that's marked dead. if (RegInfo->isSuperRegister(IncomingReg, Reg)) return true; - if (RegInfo->getSubRegisters(IncomingReg) && - RegInfo->getSuperRegisters(Reg) && - RegInfo->isSubRegister(IncomingReg, Reg)) + if (RegInfo->isSubRegister(IncomingReg, Reg)) DeadOps.push_back(i); } } @@ -1841,52 +1895,16 @@ void MachineInstr::setPhysRegsDeadExcept(ArrayRef<unsigned> UsedRegs, unsigned MachineInstrExpressionTrait::getHashValue(const MachineInstr* const &MI) { // Build up a buffer of hash code components. - // - // FIXME: This is a total hack. 
We should have a hash_value overload for - // MachineOperand, but currently that doesn't work because there are many - // different ideas of "equality" and thus different sets of information that - // contribute to the hash code. This one happens to want to take a specific - // subset. And it's still not clear that this routine uses the *correct* - // subset of information when computing the hash code. The goal is to use the - // same inputs for the hash code here that MachineInstr::isIdenticalTo uses to - // test for equality when passed the 'IgnoreVRegDefs' filter flag. It would - // be very useful to factor the selection of relevant inputs out of the two - // functions and into a common routine, but it's not clear how that can be - // done. SmallVector<size_t, 8> HashComponents; HashComponents.reserve(MI->getNumOperands() + 1); HashComponents.push_back(MI->getOpcode()); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); - switch (MO.getType()) { - default: break; - case MachineOperand::MO_Register: - if (MO.isDef() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) - continue; // Skip virtual register defs. - HashComponents.push_back(hash_combine(MO.getType(), MO.getReg())); - break; - case MachineOperand::MO_Immediate: - HashComponents.push_back(hash_combine(MO.getType(), MO.getImm())); - break; - case MachineOperand::MO_FrameIndex: - case MachineOperand::MO_ConstantPoolIndex: - case MachineOperand::MO_JumpTableIndex: - HashComponents.push_back(hash_combine(MO.getType(), MO.getIndex())); - break; - case MachineOperand::MO_MachineBasicBlock: - HashComponents.push_back(hash_combine(MO.getType(), MO.getMBB())); - break; - case MachineOperand::MO_GlobalAddress: - HashComponents.push_back(hash_combine(MO.getType(), MO.getGlobal())); - break; - case MachineOperand::MO_BlockAddress: - HashComponents.push_back(hash_combine(MO.getType(), - MO.getBlockAddress())); - break; - case MachineOperand::MO_MCSymbol: - HashComponents.push_back(hash_combine(MO.getType(), MO.getMCSymbol())); - break; - } + if (MO.isReg() && MO.isDef() && + TargetRegisterInfo::isVirtualRegister(MO.getReg())) + continue; // Skip virtual register defs. + + HashComponents.push_back(hash_value(MO)); } return hash_combine_range(HashComponents.begin(), HashComponents.end()); } diff --git a/lib/CodeGen/MachineInstrBundle.cpp b/lib/CodeGen/MachineInstrBundle.cpp index 73489a7..b7de7bf 100644 --- a/lib/CodeGen/MachineInstrBundle.cpp +++ b/lib/CodeGen/MachineInstrBundle.cpp @@ -169,8 +169,8 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, } if (!MO.isDead()) { - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + unsigned SubReg = *SubRegs; if (LocalDefSet.insert(SubReg)) LocalDefs.push_back(SubReg); } diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp index 8c562cc..efec481 100644 --- a/lib/CodeGen/MachineLICM.cpp +++ b/lib/CodeGen/MachineLICM.cpp @@ -445,8 +445,8 @@ void MachineLICM::ProcessMI(MachineInstr *MI, } if (MO.isImplicit()) { - for (const uint16_t *AS = TRI->getOverlaps(Reg); *AS; ++AS) - PhysRegClobbers.set(*AS); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + PhysRegClobbers.set(*AI); if (!MO.isDead()) // Non-dead implicit def? This cannot be hoisted. 
RuledOut = true; @@ -465,7 +465,7 @@ void MachineLICM::ProcessMI(MachineInstr *MI, // If we have already seen another instruction that defines the same // register, then this is not safe. Two defs is indicated by setting a // PhysRegClobbers bit. - for (const uint16_t *AS = TRI->getOverlaps(Reg); *AS; ++AS) { + for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) { if (PhysRegDefs.test(*AS)) PhysRegClobbers.set(*AS); if (PhysRegClobbers.test(*AS)) @@ -517,8 +517,8 @@ void MachineLICM::HoistRegionPostRA() { for (MachineBasicBlock::livein_iterator I = BB->livein_begin(), E = BB->livein_end(); I != E; ++I) { unsigned Reg = *I; - for (const uint16_t *AS = TRI->getOverlaps(Reg); *AS; ++AS) - PhysRegDefs.set(*AS); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + PhysRegDefs.set(*AI); } SpeculationState = SpeculateUnknown; @@ -540,8 +540,8 @@ void MachineLICM::HoistRegionPostRA() { unsigned Reg = MO.getReg(); if (!Reg) continue; - for (const uint16_t *AS = TRI->getOverlaps(Reg); *AS; ++AS) - TermRegs.set(*AS); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + TermRegs.set(*AI); } } @@ -1260,11 +1260,11 @@ MachineInstr *MachineLICM::ExtractHoistableLoad(MachineInstr *MI) { if (NewOpc == 0) return 0; const MCInstrDesc &MID = TII->get(NewOpc); if (MID.getNumDefs() != 1) return 0; - const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex, TRI); + MachineFunction &MF = *MI->getParent()->getParent(); + const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex, TRI, MF); // Ok, we're unfolding. Create a temporary register and do the unfold. unsigned Reg = MRI->createVirtualRegister(RC); - MachineFunction &MF = *MI->getParent()->getParent(); SmallVector<MachineInstr *, 2> NewMIs; bool Success = TII->unfoldMemoryOperand(MF, MI, Reg, diff --git a/lib/CodeGen/MachineLoopInfo.cpp b/lib/CodeGen/MachineLoopInfo.cpp index 189cb2b..9f3829e 100644 --- a/lib/CodeGen/MachineLoopInfo.cpp +++ b/lib/CodeGen/MachineLoopInfo.cpp @@ -9,7 +9,7 @@ // // This file defines the MachineLoopInfo class that is used to identify natural // loops and determine the loop depth of various nodes of the CFG. Note that -// the loops identified may actually be several natural loops that share the +// the loops identified may actually be several natural loops that share the // same header node... not just a single natural loop. // //===----------------------------------------------------------------------===// @@ -17,17 +17,13 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/Analysis/LoopInfoImpl.h" #include "llvm/Support/Debug.h" using namespace llvm; -namespace llvm { -#define MLB class LoopBase<MachineBasicBlock, MachineLoop> -TEMPLATE_INSTANTIATION(MLB); -#undef MLB -#define MLIB class LoopInfoBase<MachineBasicBlock, MachineLoop> -TEMPLATE_INSTANTIATION(MLIB); -#undef MLIB -} +// Explicitly instantiate methods in LoopInfoImpl.h for MI-level Loops. 
+template class llvm::LoopBase<MachineBasicBlock, MachineLoop>; +template class llvm::LoopInfoBase<MachineBasicBlock, MachineLoop>; char MachineLoopInfo::ID = 0; INITIALIZE_PASS_BEGIN(MachineLoopInfo, "machine-loops", @@ -40,7 +36,7 @@ char &llvm::MachineLoopInfoID = MachineLoopInfo::ID; bool MachineLoopInfo::runOnMachineFunction(MachineFunction &) { releaseMemory(); - LI.Calculate(getAnalysis<MachineDominatorTree>().getBase()); // Update + LI.Analyze(getAnalysis<MachineDominatorTree>().getBase()); return false; } diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp index 7ea1517..82e1235 100644 --- a/lib/CodeGen/MachineRegisterInfo.cpp +++ b/lib/CodeGen/MachineRegisterInfo.cpp @@ -162,9 +162,22 @@ void MachineRegisterInfo::replaceRegWith(unsigned FromReg, unsigned ToReg) { MachineInstr *MachineRegisterInfo::getVRegDef(unsigned Reg) const { // Since we are in SSA form, we can use the first definition. def_iterator I = def_begin(Reg); + assert((I.atEnd() || llvm::next(I) == def_end()) && + "getVRegDef assumes a single definition or no definition"); return !I.atEnd() ? &*I : 0; } +/// getUniqueVRegDef - Return the unique machine instr that defines the +/// specified virtual register or null if none is found. If there are +/// multiple definitions or no definition, return null. +MachineInstr *MachineRegisterInfo::getUniqueVRegDef(unsigned Reg) const { + if (def_empty(Reg)) return 0; + def_iterator I = def_begin(Reg); + if (llvm::next(I) != def_end()) + return 0; + return &*I; +} + bool MachineRegisterInfo::hasOneUse(unsigned RegNo) const { use_iterator UI = use_begin(RegNo); if (UI == use_end()) @@ -268,15 +281,15 @@ bool MachineRegisterInfo::isConstantPhysReg(unsigned PhysReg, assert(TargetRegisterInfo::isPhysicalRegister(PhysReg)); // Check if any overlapping register is modified. - for (const uint16_t *R = TRI->getOverlaps(PhysReg); *R; ++R) - if (!def_empty(*R)) + for (MCRegAliasIterator AI(PhysReg, TRI, true); AI.isValid(); ++AI) + if (!def_empty(*AI)) return false; // Check if any overlapping register is allocatable so it may be used later. if (AllocatableRegs.empty()) AllocatableRegs = TRI->getAllocatableSet(MF); - for (const uint16_t *R = TRI->getOverlaps(PhysReg); *R; ++R) - if (AllocatableRegs.test(*R)) + for (MCRegAliasIterator AI(PhysReg, TRI, true); AI.isValid(); ++AI) + if (AllocatableRegs.test(*AI)) return false; return true; } diff --git a/lib/CodeGen/MachineSSAUpdater.cpp b/lib/CodeGen/MachineSSAUpdater.cpp index 070a557..acb1ee6 100644 --- a/lib/CodeGen/MachineSSAUpdater.cpp +++ b/lib/CodeGen/MachineSSAUpdater.cpp @@ -241,30 +241,6 @@ void MachineSSAUpdater::ReplaceRegWith(unsigned OldReg, unsigned NewReg) { I->second = NewReg; } -/// MachinePHIiter - Iterator for PHI operands. This is used for the -/// PHI_iterator in the SSAUpdaterImpl template. 
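In the MachineLoopInfo hunk above, the TEMPLATE_INSTANTIATION macro gives way to plain explicit instantiation of the method bodies that LoopInfoImpl.h defines out of line. A minimal standalone sketch of that pattern (hypothetical Calc template, everything in one file for brevity):

// Header part: class template with a method declared but not defined inline.
template <typename T> struct Calc {
  T twice(T x);
};

// "Impl" header part: out-of-line definitions, included only where the
// template is explicitly instantiated.
template <typename T> T Calc<T>::twice(T x) { return x + x; }

// Exactly one .cpp does this; every other translation unit links against
// the emitted Calc<int>::twice without ever seeing the definition.
template struct Calc<int>;

int main() { return Calc<int>().twice(2) - 4; }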
-namespace { - class MachinePHIiter { - private: - MachineInstr *PHI; - unsigned idx; - - public: - explicit MachinePHIiter(MachineInstr *P) // begin iterator - : PHI(P), idx(1) {} - MachinePHIiter(MachineInstr *P, bool) // end iterator - : PHI(P), idx(PHI->getNumOperands()) {} - - MachinePHIiter &operator++() { idx += 2; return *this; } - bool operator==(const MachinePHIiter& x) const { return idx == x.idx; } - bool operator!=(const MachinePHIiter& x) const { return !operator==(x); } - unsigned getIncomingValue() { return PHI->getOperand(idx).getReg(); } - MachineBasicBlock *getIncomingBlock() { - return PHI->getOperand(idx+1).getMBB(); - } - }; -} - /// SSAUpdaterTraits<MachineSSAUpdater> - Traits for the SSAUpdaterImpl /// template, specialized for MachineSSAUpdater. namespace llvm { @@ -279,7 +255,26 @@ public: static BlkSucc_iterator BlkSucc_begin(BlkT *BB) { return BB->succ_begin(); } static BlkSucc_iterator BlkSucc_end(BlkT *BB) { return BB->succ_end(); } - typedef MachinePHIiter PHI_iterator; + /// Iterator for PHI operands. + class PHI_iterator { + private: + MachineInstr *PHI; + unsigned idx; + + public: + explicit PHI_iterator(MachineInstr *P) // begin iterator + : PHI(P), idx(1) {} + PHI_iterator(MachineInstr *P, bool) // end iterator + : PHI(P), idx(PHI->getNumOperands()) {} + + PHI_iterator &operator++() { idx += 2; return *this; } + bool operator==(const PHI_iterator& x) const { return idx == x.idx; } + bool operator!=(const PHI_iterator& x) const { return !operator==(x); } + unsigned getIncomingValue() { return PHI->getOperand(idx).getReg(); } + MachineBasicBlock *getIncomingBlock() { + return PHI->getOperand(idx+1).getMBB(); + } + }; static inline PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); } static inline PHI_iterator PHI_end(PhiT *PHI) { return PHI_iterator(PHI, true); diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 1d3241b..a1dc948 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -17,9 +17,13 @@ #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" -#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/ScheduleHazardRecognizer.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/MC/MCInstrItineraries.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -50,6 +54,15 @@ static bool ViewMISchedDAGs = false; // Machine Instruction Scheduling Pass and Registry //===----------------------------------------------------------------------===// +MachineSchedContext::MachineSchedContext(): + MF(0), MLI(0), MDT(0), PassConfig(0), AA(0), LIS(0) { + RegClassInfo = new RegisterClassInfo(); +} + +MachineSchedContext::~MachineSchedContext() { + delete RegClassInfo; +} + namespace { /// MachineScheduler runs after coalescing and before register allocation. class MachineScheduler : public MachineSchedContext, @@ -122,6 +135,29 @@ DefaultSchedRegistry("default", "Use the target's default scheduler choice.", /// default scheduler if the target does not set a default. static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C); + +/// Decrement this iterator until reaching the top or a non-debug instr. 
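+/// (Used when the bottom-of-region boundary moves, so that CurrentBottom
+/// never comes to rest on a DBG_VALUE; nextIfDebug below plays the same role
+/// for CurrentTop.)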
+static MachineBasicBlock::iterator
+priorNonDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Beg) {
+  assert(I != Beg && "reached the top of the region, cannot decrement");
+  while (--I != Beg) {
+    if (!I->isDebugValue())
+      break;
+  }
+  return I;
+}
+
+/// If this iterator is a debug value, increment until reaching the End or a
+/// non-debug instruction.
+static MachineBasicBlock::iterator
+nextIfDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator End) {
+  for(; I != End; ++I) {
+    if (!I->isDebugValue())
+      break;
+  }
+  return I;
+}
+
 /// Top-level MachineScheduler pass driver.
 ///
 /// Visit blocks in function order. Divide each block into scheduling regions
@@ -139,6 +175,8 @@ static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C);
 /// design would be to split blocks at scheduling boundaries, but LLVM has a
 /// general bias against block splitting purely for implementation simplicity.
 bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
+  DEBUG(dbgs() << "Before MISsched:\n"; mf.print(dbgs()));
+
   // Initialize the context of the pass.
   MF = &mf;
   MLI = &getAnalysis<MachineLoopInfo>();
@@ -149,6 +187,8 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
   LIS = &getAnalysis<LiveIntervals>();
   const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
 
+  RegClassInfo->runOnMachineFunction(*MF);
+
   // Select the scheduler, or set the default.
   MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt;
   if (Ctor == useDefaultMachineSched) {
@@ -163,13 +203,16 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
   OwningPtr<ScheduleDAGInstrs> Scheduler(Ctor(this));
 
   // Visit all machine basic blocks.
+  //
+  // TODO: Visit blocks in global postorder or postorder within the bottom-up
+  // loop tree. Then we can optionally compute global RegPressure.
   for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end();
        MBB != MBBEnd; ++MBB) {
 
     Scheduler->startBlock(MBB);
 
     // Break the block into scheduling regions [I, RegionEnd), and schedule each
-    // region as soon as it is discovered. RegionEnd points the the scheduling
+    // region as soon as it is discovered. RegionEnd points to the scheduling
     // boundary at the bottom of the region. The DAG does not include RegionEnd,
     // but the region does (i.e. the next RegionEnd is above the previous
     // RegionBegin). If the current block has no terminator then RegionEnd ==
@@ -181,6 +224,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
     unsigned RemainingCount = MBB->size();
     for(MachineBasicBlock::iterator RegionEnd = MBB->end();
         RegionEnd != MBB->begin(); RegionEnd = Scheduler->begin()) {
+      // Avoid decrementing RegionEnd for blocks with no terminator.
       if (RegionEnd != MBB->end()
           || TII->isSchedulingBoundary(llvm::prior(RegionEnd), MBB, *MF)) {
@@ -207,7 +251,8 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
         Scheduler->exitRegion();
         continue;
       }
-      DEBUG(dbgs() << "MachineScheduling " << MF->getFunction()->getName()
+      DEBUG(dbgs() << "********** MI Scheduling **********\n");
+      DEBUG(dbgs() << MF->getFunction()->getName()
             << ":BB#" << MBB->getNumber() << "\n  From: " << *I << "    To: ";
             if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
             else dbgs() << "End";
@@ -260,6 +305,9 @@ public:
   /// be scheduled at the bottom.
   virtual SUnit *pickNode(bool &IsTopNode) = 0;
 
+  /// Notify MachineSchedStrategy that ScheduleDAGMI has scheduled a node.
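+  /// (Called once the DAG has moved the instruction and updated its pressure
+  /// trackers; a strategy can use this hook to advance its cycle and hazard
+  /// state, as ConvergingScheduler::schedNode does below.)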
+  virtual void schedNode(SUnit *SU, bool IsTopNode) = 0;
+
   /// When all predecessor dependencies have been resolved, free this node for
   /// top-down scheduling.
   virtual void releaseTopNode(SUnit *SU) = 0;
@@ -279,22 +327,45 @@ namespace {
 /// machine instructions while updating LiveIntervals.
 class ScheduleDAGMI : public ScheduleDAGInstrs {
   AliasAnalysis *AA;
+  RegisterClassInfo *RegClassInfo;
   MachineSchedStrategy *SchedImpl;
 
+  MachineBasicBlock::iterator LiveRegionEnd;
+
+  /// Register pressure in this region computed by buildSchedGraph.
+  IntervalPressure RegPressure;
+  RegPressureTracker RPTracker;
+
+  /// List of pressure sets that exceed the target's pressure limit before
+  /// scheduling, listed in increasing set ID order. Each pressure set is paired
+  /// with its max pressure in the currently scheduled regions.
+  std::vector<PressureElement> RegionCriticalPSets;
+
   /// The top of the unscheduled zone.
   MachineBasicBlock::iterator CurrentTop;
+  IntervalPressure TopPressure;
+  RegPressureTracker TopRPTracker;
 
   /// The bottom of the unscheduled zone.
   MachineBasicBlock::iterator CurrentBottom;
+  IntervalPressure BotPressure;
+  RegPressureTracker BotRPTracker;
 
+#ifndef NDEBUG
   /// The number of instructions scheduled so far. Used to cut off the
   /// scheduler at the point determined by misched-cutoff.
   unsigned NumInstrsScheduled;
+#endif
 
 public:
   ScheduleDAGMI(MachineSchedContext *C, MachineSchedStrategy *S):
     ScheduleDAGInstrs(*C->MF, *C->MLI, *C->MDT, /*IsPostRA=*/false, C->LIS),
-    AA(C->AA), SchedImpl(S), CurrentTop(), CurrentBottom(),
-    NumInstrsScheduled(0) {}
+    AA(C->AA), RegClassInfo(C->RegClassInfo), SchedImpl(S),
+    RPTracker(RegPressure), CurrentTop(), TopRPTracker(TopPressure),
+    CurrentBottom(), BotRPTracker(BotPressure) {
+#ifndef NDEBUG
+    NumInstrsScheduled = 0;
+#endif
+  }
 
   ~ScheduleDAGMI() {
     delete SchedImpl;
@@ -303,22 +374,68 @@ public:
   MachineBasicBlock::iterator top() const { return CurrentTop; }
   MachineBasicBlock::iterator bottom() const { return CurrentBottom; }
 
-  /// Implement ScheduleDAGInstrs interface.
+  /// Implement the ScheduleDAGInstrs interface for handling the next scheduling
+  /// region. This covers all instructions in a block, while schedule() may only
+  /// cover a subset.
+  void enterRegion(MachineBasicBlock *bb,
+                   MachineBasicBlock::iterator begin,
+                   MachineBasicBlock::iterator end,
+                   unsigned endcount);
+
+  /// Implement the ScheduleDAGInstrs interface for scheduling a sequence of
+  /// reorderable instructions.
   void schedule();
 
+  /// Get current register pressure for the top scheduled instructions.
+  const IntervalPressure &getTopPressure() const { return TopPressure; }
+  const RegPressureTracker &getTopRPTracker() const { return TopRPTracker; }
+
+  /// Get current register pressure for the bottom scheduled instructions.
+  const IntervalPressure &getBotPressure() const { return BotPressure; }
+  const RegPressureTracker &getBotRPTracker() const { return BotRPTracker; }
+
+  /// Get register pressure for the entire scheduling region before scheduling.
+  const IntervalPressure &getRegPressure() const { return RegPressure; }
+
+  const std::vector<PressureElement> &getRegionCriticalPSets() const {
+    return RegionCriticalPSets;
+  }
+
+  /// getIssueWidth - Return the max instructions per scheduling group.
+  unsigned getIssueWidth() const {
+    return (InstrItins && InstrItins->SchedModel)
+      ? InstrItins->SchedModel->IssueWidth : 1;
+  }
+
+  /// getNumMicroOps - Return the number of issue slots required for this MI.
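+  /// (For example, with an issue width of 2, two single-uop instructions can
+  /// share a cycle, while a 3-uop instruction overflows it and ends the
+  /// group; illustrative numbers, both quantities are target-defined.)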
+  unsigned getNumMicroOps(MachineInstr *MI) const {
+    if (!InstrItins) return 1;
+    int UOps = InstrItins->getNumMicroOps(MI->getDesc().getSchedClass());
+    return (UOps >= 0) ? UOps : TII->getNumMicroOps(InstrItins, MI);
+  }
+
 protected:
+  void initRegPressure();
+  void updateScheduledPressure(std::vector<unsigned> NewMaxPressure);
+
   void moveInstruction(MachineInstr *MI, MachineBasicBlock::iterator InsertPos);
   bool checkSchedLimit();
+  void releaseRoots();
+
   void releaseSucc(SUnit *SU, SDep *SuccEdge);
   void releaseSuccessors(SUnit *SU);
   void releasePred(SUnit *SU, SDep *PredEdge);
   void releasePredecessors(SUnit *SU);
+
+  void placeDebugValues();
 };
 } // namespace
 
 /// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When
 /// NumPredsLeft reaches zero, release the successor node.
+///
+/// FIXME: Adjust SuccSU height based on MinLatency.
 void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
   SUnit *SuccSU = SuccEdge->getSUnit();
@@ -345,6 +462,8 @@ void ScheduleDAGMI::releaseSuccessors(SUnit *SU) {
 
 /// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When
 /// NumSuccsLeft reaches zero, release the predecessor node.
+///
+/// FIXME: Adjust PredSU height based on MinLatency.
 void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {
   SUnit *PredSU = PredEdge->getSUnit();
@@ -371,12 +490,17 @@ void ScheduleDAGMI::releasePredecessors(SUnit *SU) {
 
 void ScheduleDAGMI::moveInstruction(MachineInstr *MI,
                                     MachineBasicBlock::iterator InsertPos) {
-  // Fix RegionBegin if the first instruction moves down.
+  // Advance RegionBegin if the first instruction moves down.
   if (&*RegionBegin == MI)
-    RegionBegin = llvm::next(RegionBegin);
+    ++RegionBegin;
+
+  // Update the instruction stream.
   BB->splice(InsertPos, BB, MI);
+
+  // Update LiveIntervals
   LIS->handleMove(MI);
-  // Fix RegionBegin if another instruction moves above the first instruction.
+
+  // Recede RegionBegin if an instruction moves above the first.
   if (RegionBegin == InsertPos)
     RegionBegin = MI;
 }
@@ -392,12 +516,114 @@ bool ScheduleDAGMI::checkSchedLimit() {
   return true;
 }
 
+/// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
+/// crossing a scheduling boundary. [begin, end) includes all instructions in
+/// the region, including the boundary itself and single-instruction regions
+/// that don't get scheduled.
+void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb,
+                                MachineBasicBlock::iterator begin,
+                                MachineBasicBlock::iterator end,
+                                unsigned endcount)
+{
+  ScheduleDAGInstrs::enterRegion(bb, begin, end, endcount);
+
+  // For convenience remember the end of the liveness region.
+  LiveRegionEnd =
+    (RegionEnd == bb->end()) ? RegionEnd : llvm::next(RegionEnd);
+}
+
+// Set up the register pressure trackers for the top and bottom scheduled
+// regions.
+void ScheduleDAGMI::initRegPressure() {
+  TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin);
+  BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd);
+
+  // Close the RPTracker to finalize live ins.
+  RPTracker.closeRegion();
+
+  DEBUG(RPTracker.getPressure().dump(TRI));
+
+  // Initialize the live ins and live outs.
+  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
+  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
+
+  // Close one end of the tracker so we can call
+  // getMaxUpward/DownwardPressureDelta before advancing across any
+  // instructions. This converts currently live regs into live ins/outs.
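+  // (A register live across the whole region is therefore counted as a
+  // live-in by the top tracker and as a live-out by the bottom tracker.)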
+  TopRPTracker.closeTop();
+  BotRPTracker.closeBottom();
+
+  // Account for liveness generated by the region boundary.
+  if (LiveRegionEnd != RegionEnd)
+    BotRPTracker.recede();
+
+  assert(BotRPTracker.getPos() == RegionEnd && "Can't find the region bottom");
+
+  // Cache the list of excess pressure sets in this region. This will also track
+  // the max pressure in the scheduled code for these sets.
+  RegionCriticalPSets.clear();
+  std::vector<unsigned> RegionPressure = RPTracker.getPressure().MaxSetPressure;
+  for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) {
+    unsigned Limit = TRI->getRegPressureSetLimit(i);
+    if (RegionPressure[i] > Limit)
+      RegionCriticalPSets.push_back(PressureElement(i, 0));
+  }
+  DEBUG(dbgs() << "Excess PSets: ";
+        for (unsigned i = 0, e = RegionCriticalPSets.size(); i != e; ++i)
+          dbgs() << TRI->getRegPressureSetName(
+            RegionCriticalPSets[i].PSetID) << " ";
+        dbgs() << "\n");
+}
+
+// FIXME: When the pressure tracker deals in pressure differences then we won't
+// iterate over all RegionCriticalPSets[i].
+void ScheduleDAGMI::
+updateScheduledPressure(std::vector<unsigned> NewMaxPressure) {
+  for (unsigned i = 0, e = RegionCriticalPSets.size(); i < e; ++i) {
+    unsigned ID = RegionCriticalPSets[i].PSetID;
+    int &MaxUnits = RegionCriticalPSets[i].UnitIncrease;
+    if ((int)NewMaxPressure[ID] > MaxUnits)
+      MaxUnits = NewMaxPressure[ID];
+  }
+}
+
+// Release all DAG roots for scheduling.
+void ScheduleDAGMI::releaseRoots() {
+  SmallVector<SUnit*, 16> BotRoots;
+
+  for (std::vector<SUnit>::iterator
+         I = SUnits.begin(), E = SUnits.end(); I != E; ++I) {
+    // A SUnit is ready to top schedule if it has no predecessors.
+    if (I->Preds.empty())
+      SchedImpl->releaseTopNode(&(*I));
+    // A SUnit is ready to bottom schedule if it has no successors.
+    if (I->Succs.empty())
+      BotRoots.push_back(&(*I));
+  }
+  // Release bottom roots in reverse order so the higher priority nodes appear
+  // first. This is more natural and slightly more efficient.
+  for (SmallVectorImpl<SUnit*>::const_reverse_iterator
+         I = BotRoots.rbegin(), E = BotRoots.rend(); I != E; ++I)
+    SchedImpl->releaseBottomNode(*I);
+}
+
 /// schedule - Called back from MachineScheduler::runOnMachineFunction
-/// after setting up the current scheduling region.
+/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
+/// only includes instructions that have DAG nodes, not scheduling boundaries.
 void ScheduleDAGMI::schedule() {
-  buildSchedGraph(AA);
+  // Initialize the register pressure tracker used by buildSchedGraph.
+  RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd);
+
+  // Account for liveness generated by the region boundary.
+  if (LiveRegionEnd != RegionEnd)
+    RPTracker.recede();
+
+  // Build the DAG, and compute current register pressure.
+  buildSchedGraph(AA, &RPTracker);
+
+  // Initialize top/bottom trackers after computing region pressure.
+  initRegPressure();
 
-  DEBUG(dbgs() << "********** MI Scheduling **********\n");
   DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
           SUnits[su].dumpAll(this));
@@ -410,22 +636,12 @@ void ScheduleDAGMI::schedule() {
   releasePredecessors(&ExitSU);
 
   // Release all DAG roots for scheduling.
-  for (std::vector<SUnit>::iterator I = SUnits.begin(), E = SUnits.end();
-       I != E; ++I) {
-    // A SUnit is ready to top schedule if it has no predecessors.
-    if (I->Preds.empty())
-      SchedImpl->releaseTopNode(&(*I));
-    // A SUnit is ready to bottom schedule if it has no successors.
-    if (I->Succs.empty())
-      SchedImpl->releaseBottomNode(&(*I));
-  }
+  releaseRoots();
 
-  CurrentTop = RegionBegin;
+  CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
   CurrentBottom = RegionEnd;
   bool IsTopNode = false;
   while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) {
-    DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom")
-          << " Scheduling Instruction:\n"; SU->dump(this));
     if (!checkSchedLimit())
       break;
@@ -435,28 +651,69 @@ void ScheduleDAGMI::schedule() {
     if (IsTopNode) {
       assert(SU->isTopReady() && "node still has unscheduled dependencies");
       if (&*CurrentTop == MI)
-        ++CurrentTop;
-      else
+        CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
+      else {
         moveInstruction(MI, CurrentTop);
+        TopRPTracker.setPos(MI);
+      }
+
+      // Update top scheduled pressure.
+      TopRPTracker.advance();
+      assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
+      updateScheduledPressure(TopRPTracker.getPressure().MaxSetPressure);
+
       // Release dependent instructions for scheduling.
       releaseSuccessors(SU);
     }
     else {
       assert(SU->isBottomReady() && "node still has unscheduled dependencies");
-      if (&*llvm::prior(CurrentBottom) == MI)
-        --CurrentBottom;
+      MachineBasicBlock::iterator priorII =
+        priorNonDebug(CurrentBottom, CurrentTop);
+      if (&*priorII == MI)
+        CurrentBottom = priorII;
       else {
-        if (&*CurrentTop == MI)
-          CurrentTop = llvm::next(CurrentTop);
+        if (&*CurrentTop == MI) {
+          CurrentTop = nextIfDebug(++CurrentTop, priorII);
+          TopRPTracker.setPos(CurrentTop);
+        }
         moveInstruction(MI, CurrentBottom);
         CurrentBottom = MI;
       }
+      // Update bottom scheduled pressure.
+      BotRPTracker.recede();
+      assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
+      updateScheduledPressure(BotRPTracker.getPressure().MaxSetPressure);
+
       // Release dependent instructions for scheduling.
       releasePredecessors(SU);
     }
     SU->isScheduled = true;
+    SchedImpl->schedNode(SU, IsTopNode);
   }
   assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+  placeDebugValues();
+}
+
+/// Reinsert any remaining debug_values, just like the PostRA scheduler.
+void ScheduleDAGMI::placeDebugValues() {
+  // If first instruction was a DBG_VALUE then put it back.
+  if (FirstDbgValue) {
+    BB->splice(RegionBegin, BB, FirstDbgValue);
+    RegionBegin = FirstDbgValue;
+  }
+
+  for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator
+         DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
+    std::pair<MachineInstr *, MachineInstr *> P = *prior(DI);
+    MachineInstr *DbgValue = P.first;
+    MachineBasicBlock::iterator OrigPrevMI = P.second;
+    BB->splice(++OrigPrevMI, BB, DbgValue);
+    if (OrigPrevMI == llvm::prior(RegionEnd))
+      RegionEnd = DbgValue;
+  }
+  DbgValues.clear();
+  FirstDbgValue = NULL;
 }
 
 //===----------------------------------------------------------------------===//
@@ -464,56 +721,603 @@ void ScheduleDAGMI::schedule() {
 //===----------------------------------------------------------------------===//
 
 namespace {
+/// ReadyQueue encapsulates a vector of "ready" SUnits with basic convenience
+/// methods for pushing and removing nodes. ReadyQueues are uniquely identified
+/// by an ID. SUnit::NodeQueueId is a mask of the ReadyQueues the SUnit is in.
+class ReadyQueue {
+  unsigned ID;
+  std::string Name;
+  std::vector<SUnit*> Queue;
+
+public:
+  ReadyQueue(unsigned id, const Twine &name): ID(id), Name(name.str()) {}
+
+  unsigned getID() const { return ID; }
+
+  StringRef getName() const { return Name; }
+
+  // SU is in this queue if its NodeQueueId is a superset of this ID.
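+  // (For example, with TopQID == 1 and BotQID == 2 the pending queues get IDs
+  // 4 and 8, shifted left by LogMaxQID, so NodeQueueId == 9 marks an SUnit
+  // that sits in TopQ.A and BotQ.P at the same time.)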
+  bool isInQueue(SUnit *SU) const { return (SU->NodeQueueId & ID); }
+
+  bool empty() const { return Queue.empty(); }
+
+  unsigned size() const { return Queue.size(); }
+
+  typedef std::vector<SUnit*>::iterator iterator;
+
+  iterator begin() { return Queue.begin(); }
+
+  iterator end() { return Queue.end(); }
+
+  iterator find(SUnit *SU) {
+    return std::find(Queue.begin(), Queue.end(), SU);
+  }
+
+  void push(SUnit *SU) {
+    Queue.push_back(SU);
+    SU->NodeQueueId |= ID;
+  }
+
+  void remove(iterator I) {
+    (*I)->NodeQueueId &= ~ID;
+    *I = Queue.back();
+    Queue.pop_back();
+  }
+
+  void dump() {
+    dbgs() << Name << ": ";
+    for (unsigned i = 0, e = Queue.size(); i < e; ++i)
+      dbgs() << Queue[i]->NodeNum << " ";
+    dbgs() << "\n";
+  }
+};
+
 /// ConvergingScheduler shrinks the unscheduled zone using heuristics to balance
 /// the schedule.
 class ConvergingScheduler : public MachineSchedStrategy {
+
+  /// Store the state used by ConvergingScheduler heuristics, required for the
+  /// lifetime of one invocation of pickNode().
+  struct SchedCandidate {
+    // The best SUnit candidate.
+    SUnit *SU;
+
+    // Register pressure values for the best candidate.
+    RegPressureDelta RPDelta;
+
+    SchedCandidate(): SU(NULL) {}
+  };
+  /// Represent the type of SchedCandidate found within a single queue.
+  enum CandResult {
+    NoCand, NodeOrder, SingleExcess, SingleCritical, SingleMax, MultiPressure };
+
+  /// Each scheduling boundary is associated with ready queues. It tracks the
+  /// current cycle in whichever direction it has moved, and maintains the state
+  /// of "hazards" and other interlocks at the current cycle.
+  struct SchedBoundary {
+    ScheduleDAGMI *DAG;
+
+    ReadyQueue Available;
+    ReadyQueue Pending;
+    bool CheckPending;
+
+    ScheduleHazardRecognizer *HazardRec;
+
+    unsigned CurrCycle;
+    unsigned IssueCount;
+
+    /// MinReadyCycle - Cycle of the soonest available instruction.
+    unsigned MinReadyCycle;
+
+    // Remember the greatest min operand latency.
+    unsigned MaxMinLatency;
+
+    /// Pending queues extend the ready queues with the same ID and the
+    /// PendingFlag set.
+    SchedBoundary(unsigned ID, const Twine &Name):
+      DAG(0), Available(ID, Name+".A"),
+      Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P"),
+      CheckPending(false), HazardRec(0), CurrCycle(0), IssueCount(0),
+      MinReadyCycle(UINT_MAX), MaxMinLatency(0) {}
+
+    ~SchedBoundary() { delete HazardRec; }
+
+    bool isTop() const {
+      return Available.getID() == ConvergingScheduler::TopQID;
+    }
+
+    bool checkHazard(SUnit *SU);
+
+    void releaseNode(SUnit *SU, unsigned ReadyCycle);
+
+    void bumpCycle();
+
+    void bumpNode(SUnit *SU);
+
+    void releasePending();
+
+    void removeReady(SUnit *SU);
+
+    SUnit *pickOnlyChoice();
+  };
+
   ScheduleDAGMI *DAG;
+  const TargetRegisterInfo *TRI;
 
-  unsigned NumTopReady;
-  unsigned NumBottomReady;
+  // State of the top and bottom scheduled instruction boundaries.
+  SchedBoundary Top;
+  SchedBoundary Bot;
 
 public:
-  virtual void initialize(ScheduleDAGMI *dag) {
-    DAG = dag;
+  /// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both)
+  enum {
+    TopQID = 1,
+    BotQID = 2,
+    LogMaxQID = 2
+  };
+
+  ConvergingScheduler():
+    DAG(0), TRI(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
+
+  virtual void initialize(ScheduleDAGMI *dag);
+
+  virtual SUnit *pickNode(bool &IsTopNode);
+
+  virtual void schedNode(SUnit *SU, bool IsTopNode);
+
+  virtual void releaseTopNode(SUnit *SU);
+
+  virtual void releaseBottomNode(SUnit *SU);
+
+protected:
+  SUnit *pickNodeBidrectional(bool &IsTopNode);
 
-    assert((!ForceTopDown || !ForceBottomUp) &&
-           "-misched-topdown incompatible with -misched-bottomup");
+  CandResult pickNodeFromQueue(ReadyQueue &Q,
+                               const RegPressureTracker &RPTracker,
+                               SchedCandidate &Candidate);
+#ifndef NDEBUG
+  void traceCandidate(const char *Label, const ReadyQueue &Q, SUnit *SU,
+                      PressureElement P = PressureElement());
+#endif
+};
+} // namespace
+
+void ConvergingScheduler::initialize(ScheduleDAGMI *dag) {
+  DAG = dag;
+  TRI = DAG->TRI;
+  Top.DAG = dag;
+  Bot.DAG = dag;
+
+  // Initialize the HazardRecognizers.
+  const TargetMachine &TM = DAG->MF.getTarget();
+  const InstrItineraryData *Itin = TM.getInstrItineraryData();
+  Top.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+  Bot.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+
+  assert((!ForceTopDown || !ForceBottomUp) &&
+         "-misched-topdown incompatible with -misched-bottomup");
+}
+
+void ConvergingScheduler::releaseTopNode(SUnit *SU) {
+  if (SU->isScheduled)
+    return;
+
+  for (SUnit::succ_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle;
+    unsigned Latency =
+      DAG->computeOperandLatency(I->getSUnit(), SU, *I, /*FindMin=*/true);
+#ifndef NDEBUG
+    Top.MaxMinLatency = std::max(Latency, Top.MaxMinLatency);
+#endif
+    if (SU->TopReadyCycle < PredReadyCycle + Latency)
+      SU->TopReadyCycle = PredReadyCycle + Latency;
   }
+  Top.releaseNode(SU, SU->TopReadyCycle);
+}
 
-  virtual SUnit *pickNode(bool &IsTopNode) {
-    if (DAG->top() == DAG->bottom())
-      return NULL;
+void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
+  if (SU->isScheduled)
+    return;
 
-    // As an initial placeholder heuristic, schedule in the direction that has
-    // the fewest choices.
-    SUnit *SU;
-    if (ForceTopDown || (!ForceBottomUp && NumTopReady <= NumBottomReady)) {
-      SU = DAG->getSUnit(DAG->top());
-      IsTopNode = true;
+  assert(SU->getInstr() && "Scheduled SUnit must have instr");
+
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
+    unsigned Latency =
+      DAG->computeOperandLatency(SU, I->getSUnit(), *I, /*FindMin=*/true);
+#ifndef NDEBUG
+    Bot.MaxMinLatency = std::max(Latency, Bot.MaxMinLatency);
+#endif
+    if (SU->BotReadyCycle < SuccReadyCycle + Latency)
+      SU->BotReadyCycle = SuccReadyCycle + Latency;
+  }
+  Bot.releaseNode(SU, SU->BotReadyCycle);
+}
+
+/// Does this SU have a hazard within the current instruction group.
+///
+/// The scheduler supports two modes of hazard recognition. The first is the
+/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
+/// supports highly complicated in-order reservation tables
+/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic.
+///
+/// The second is a streamlined mechanism that checks for hazards based on
+/// simple counters that the scheduler itself maintains. It explicitly checks
+/// for instruction dispatch limitations, including the number of micro-ops that
+/// can dispatch per cycle.
+///
+/// TODO: Also check whether the SU must start a new group.
+bool ConvergingScheduler::SchedBoundary::checkHazard(SUnit *SU) {
+  if (HazardRec->isEnabled())
+    return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
+
+  if (IssueCount + DAG->getNumMicroOps(SU->getInstr()) > DAG->getIssueWidth())
+    return true;
+
+  return false;
+}
+
+void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU,
+                                                     unsigned ReadyCycle) {
+  if (ReadyCycle < MinReadyCycle)
+    MinReadyCycle = ReadyCycle;
+
+  // Check for interlocks first. For the purpose of other heuristics, an
+  // instruction that cannot issue appears as if it's not in the ReadyQueue.
+  if (ReadyCycle > CurrCycle || checkHazard(SU))
+    Pending.push(SU);
+  else
+    Available.push(SU);
+}
+
+/// Move the boundary of scheduled code by one cycle.
+void ConvergingScheduler::SchedBoundary::bumpCycle() {
+  unsigned Width = DAG->getIssueWidth();
+  IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width;
+
+  assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
+  unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle);
+
+  if (!HazardRec->isEnabled()) {
+    // Bypass HazardRec virtual calls.
+    CurrCycle = NextCycle;
+  }
+  else {
+    // Bypass getHazardType calls in case of long latency.
+    for (; CurrCycle != NextCycle; ++CurrCycle) {
+      if (isTop())
+        HazardRec->AdvanceCycle();
+      else
+        HazardRec->RecedeCycle();
+    }
+  }
+  CheckPending = true;
+
+  DEBUG(dbgs() << "*** " << Available.getName() << " cycle "
+        << CurrCycle << '\n');
+}
+
+/// Move the boundary of scheduled code by one SUnit.
+void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU) {
+  // Update the reservation table.
+  if (HazardRec->isEnabled()) {
+    if (!isTop() && SU->isCall) {
+      // Calls are scheduled with their preceding instructions. For bottom-up
+      // scheduling, clear the pipeline state before emitting.
+      HazardRec->Reset();
+    }
+    HazardRec->EmitInstruction(SU);
+  }
+  // Check the instruction group dispatch limit.
+  // TODO: Check if this SU must end a dispatch group.
+  IssueCount += DAG->getNumMicroOps(SU->getInstr());
+  if (IssueCount >= DAG->getIssueWidth()) {
+    DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n');
+    bumpCycle();
+  }
+}
+
+/// Release pending ready nodes into the available queue. This makes them
+/// visible to heuristics.
+void ConvergingScheduler::SchedBoundary::releasePending() {
+  // If the available queue is empty, it is safe to reset MinReadyCycle.
+  if (Available.empty())
+    MinReadyCycle = UINT_MAX;
+
+  // Check to see if any of the pending instructions are ready to issue. If
+  // so, add them to the available queue.
+  for (unsigned i = 0, e = Pending.size(); i != e; ++i) {
+    SUnit *SU = *(Pending.begin()+i);
+    unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
+
+    if (ReadyCycle < MinReadyCycle)
+      MinReadyCycle = ReadyCycle;
+
+    if (ReadyCycle > CurrCycle)
+      continue;
+
+    if (checkHazard(SU))
+      continue;
+
+    Available.push(SU);
+    Pending.remove(Pending.begin()+i);
+    --i; --e;
+  }
+  CheckPending = false;
+}
+
+/// Remove SU from the ready set for this boundary.
+void ConvergingScheduler::SchedBoundary::removeReady(SUnit *SU) {
+  if (Available.isInQueue(SU))
+    Available.remove(Available.find(SU));
+  else {
+    assert(Pending.isInQueue(SU) && "bad ready count");
+    Pending.remove(Pending.find(SU));
+  }
+}
+
+/// If this queue only has one ready candidate, return it. As a side effect,
+/// advance the cycle until at least one node is ready. If multiple instructions
+/// are ready, return NULL.
+SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() {
+  if (CheckPending)
+    releasePending();
+
+  for (unsigned i = 0; Available.empty(); ++i) {
+    assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) &&
+           "permanent hazard"); (void)i;
+    bumpCycle();
+    releasePending();
+  }
+  if (Available.size() == 1)
+    return *Available.begin();
+  return NULL;
+}
+
+#ifndef NDEBUG
+void ConvergingScheduler::traceCandidate(const char *Label, const ReadyQueue &Q,
+                                         SUnit *SU, PressureElement P) {
+  dbgs() << Label << " " << Q.getName() << " ";
+  if (P.isValid())
+    dbgs() << TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease
+           << " ";
+  else
+    dbgs() << " ";
+  SU->dump(DAG);
+}
+#endif
+
+/// pickNodeFromQueue helper that returns true if the LHS reg pressure effect is
+/// more desirable than RHS from a scheduling standpoint.
+static bool compareRPDelta(const RegPressureDelta &LHS,
+                           const RegPressureDelta &RHS) {
+  // Compare each component of pressure in decreasing order of importance
+  // without checking if any are valid. Invalid PressureElements are assumed to
+  // have UnitIncrease==0, so are neutral.
+
+  // Avoid exceeding the target's limit.
+  if (LHS.Excess.UnitIncrease != RHS.Excess.UnitIncrease)
+    return LHS.Excess.UnitIncrease < RHS.Excess.UnitIncrease;
+
+  // Avoid increasing the max critical pressure in the scheduled region.
+  if (LHS.CriticalMax.UnitIncrease != RHS.CriticalMax.UnitIncrease)
+    return LHS.CriticalMax.UnitIncrease < RHS.CriticalMax.UnitIncrease;
+
+  // Avoid increasing the max pressure of the entire region.
+  if (LHS.CurrentMax.UnitIncrease != RHS.CurrentMax.UnitIncrease)
+    return LHS.CurrentMax.UnitIncrease < RHS.CurrentMax.UnitIncrease;
+
+  return false;
+}
+
+/// Pick the best candidate from the top queue.
+///
+/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
+/// DAG building. To adjust for the current scheduling location we need to
+/// maintain the number of vreg uses remaining to be top-scheduled.
+ConvergingScheduler::CandResult ConvergingScheduler::
+pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
+                  SchedCandidate &Candidate) {
+  DEBUG(Q.dump());
+
+  // getMaxPressureDelta temporarily modifies the tracker.
+  RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
+
+  // BestSU remains NULL if no top candidates beat the best existing candidate.
+  CandResult FoundCandidate = NoCand;
+  for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
+    RegPressureDelta RPDelta;
+    TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta,
+                                    DAG->getRegionCriticalPSets(),
+                                    DAG->getRegPressure().MaxSetPressure);
+
+    // Initialize the candidate if needed.
+ if (!Candidate.SU) { + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + FoundCandidate = NodeOrder; + continue; + } + // Avoid exceeding the target's limit. + if (RPDelta.Excess.UnitIncrease < Candidate.RPDelta.Excess.UnitIncrease) { + DEBUG(traceCandidate("ECAND", Q, *I, RPDelta.Excess)); + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + FoundCandidate = SingleExcess; + continue; + } + if (RPDelta.Excess.UnitIncrease > Candidate.RPDelta.Excess.UnitIncrease) + continue; + if (FoundCandidate == SingleExcess) + FoundCandidate = MultiPressure; + + // Avoid increasing the max critical pressure in the scheduled region. + if (RPDelta.CriticalMax.UnitIncrease + < Candidate.RPDelta.CriticalMax.UnitIncrease) { + DEBUG(traceCandidate("PCAND", Q, *I, RPDelta.CriticalMax)); + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + FoundCandidate = SingleCritical; + continue; + } + if (RPDelta.CriticalMax.UnitIncrease + > Candidate.RPDelta.CriticalMax.UnitIncrease) + continue; + if (FoundCandidate == SingleCritical) + FoundCandidate = MultiPressure; + + // Avoid increasing the max pressure of the entire region. + if (RPDelta.CurrentMax.UnitIncrease + < Candidate.RPDelta.CurrentMax.UnitIncrease) { + DEBUG(traceCandidate("MCAND", Q, *I, RPDelta.CurrentMax)); + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + FoundCandidate = SingleMax; + continue; } - if (SU->isBottomReady()) { - assert(NumBottomReady > 0 && "bad ready count"); - --NumBottomReady; + if (RPDelta.CurrentMax.UnitIncrease + > Candidate.RPDelta.CurrentMax.UnitIncrease) + continue; + if (FoundCandidate == SingleMax) + FoundCandidate = MultiPressure; + + // Fall through to original instruction order. + // Only consider node order if Candidate was chosen from this Q. + if (FoundCandidate == NoCand) + continue; + + if ((Q.getID() == TopQID && (*I)->NodeNum < Candidate.SU->NodeNum) + || (Q.getID() == BotQID && (*I)->NodeNum > Candidate.SU->NodeNum)) { + DEBUG(traceCandidate("NCAND", Q, *I)); + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + FoundCandidate = NodeOrder; } + } + return FoundCandidate; +} + +/// Pick the best candidate node from either the top or bottom queue. +SUnit *ConvergingScheduler::pickNodeBidrectional(bool &IsTopNode) { + // Schedule as far as possible in the direction of no choice. This is most + // efficient, but also provides the best heuristics for CriticalPSets. + if (SUnit *SU = Bot.pickOnlyChoice()) { + IsTopNode = false; return SU; } + if (SUnit *SU = Top.pickOnlyChoice()) { + IsTopNode = true; + return SU; + } + SchedCandidate BotCand; + // Prefer bottom scheduling when heuristics are silent. + CandResult BotResult = pickNodeFromQueue(Bot.Available, + DAG->getBotRPTracker(), BotCand); + assert(BotResult != NoCand && "failed to find the first candidate"); + + // If either Q has a single candidate that provides the least increase in + // Excess pressure, we can immediately schedule from that Q. + // + // RegionCriticalPSets summarizes the pressure within the scheduled region and + // affects picking from either Q. If scheduling in one direction must + // increase pressure for one of the excess PSets, then schedule in that + // direction first to provide more freedom in the other direction. + if (BotResult == SingleExcess || BotResult == SingleCritical) { + IsTopNode = false; + return BotCand.SU; + } + // Check if the top Q has a better candidate. 
+  SchedCandidate TopCand;
+  CandResult TopResult = pickNodeFromQueue(Top.Available,
+                                           DAG->getTopRPTracker(), TopCand);
+  assert(TopResult != NoCand && "failed to find the first candidate");
+
+  if (TopResult == SingleExcess || TopResult == SingleCritical) {
+    IsTopNode = true;
+    return TopCand.SU;
+  }
+  // If either Q has a single candidate that minimizes pressure above the
+  // original region's pressure pick it.
+  if (BotResult == SingleMax) {
+    IsTopNode = false;
+    return BotCand.SU;
+  }
+  if (TopResult == SingleMax) {
+    IsTopNode = true;
+    return TopCand.SU;
+  }
+  // Check for a salient pressure difference and pick the best from either side.
+  if (compareRPDelta(TopCand.RPDelta, BotCand.RPDelta)) {
+    IsTopNode = true;
+    return TopCand.SU;
+  }
+  // Otherwise prefer the bottom candidate in node order.
+  IsTopNode = false;
+  return BotCand.SU;
+}
 
-  virtual void releaseTopNode(SUnit *SU) {
-    ++NumTopReady;
+/// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
+SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) {
+  if (DAG->top() == DAG->bottom()) {
+    assert(Top.Available.empty() && Top.Pending.empty() &&
+           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
+    return NULL;
   }
-  virtual void releaseBottomNode(SUnit *SU) {
-    ++NumBottomReady;
+  SUnit *SU;
+  if (ForceTopDown) {
+    SU = Top.pickOnlyChoice();
+    if (!SU) {
+      SchedCandidate TopCand;
+      CandResult TopResult =
+        pickNodeFromQueue(Top.Available, DAG->getTopRPTracker(), TopCand);
+      assert(TopResult != NoCand && "failed to find the first candidate");
+      (void)TopResult;
+      SU = TopCand.SU;
+    }
+    IsTopNode = true;
   }
-};
-} // namespace
+  else if (ForceBottomUp) {
+    SU = Bot.pickOnlyChoice();
+    if (!SU) {
+      SchedCandidate BotCand;
+      CandResult BotResult =
+        pickNodeFromQueue(Bot.Available, DAG->getBotRPTracker(), BotCand);
+      assert(BotResult != NoCand && "failed to find the first candidate");
+      (void)BotResult;
+      SU = BotCand.SU;
+    }
+    IsTopNode = false;
+  }
+  else {
+    SU = pickNodeBidrectional(IsTopNode);
+  }
+  if (SU->isTopReady())
+    Top.removeReady(SU);
+  if (SU->isBottomReady())
+    Bot.removeReady(SU);
+
+  DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom")
+        << " Scheduling Instruction in cycle "
+        << (IsTopNode ? Top.CurrCycle : Bot.CurrCycle) << '\n';
+        SU->dump(DAG));
+  return SU;
+}
+
+/// Update the scheduler's state after scheduling a node. This is the same node
+/// that was just returned by pickNode(). However, ScheduleDAGMI needs to update
+/// its state based on the current cycle before MachineSchedStrategy does.
+void ConvergingScheduler::schedNode(SUnit *SU, bool IsTopNode) {
+  if (IsTopNode) {
+    SU->TopReadyCycle = Top.CurrCycle;
+    Top.bumpNode(SU);
+  }
+  else {
+    SU->BotReadyCycle = Bot.CurrCycle;
+    Bot.bumpNode(SU);
+  }
+}
 
 /// Create the standard converging machine scheduler. This will be used as the
 /// default scheduler if the target does not set a default.
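The five callbacks above are the entire MachineSchedStrategy surface, so a
policy can be swapped in without touching ScheduleDAGMI. A minimal sketch
against that interface and the registry shown earlier in this file
(FIFOScheduler, createFIFOSched, and "fifo-example" are illustrative names,
not part of this patch; the usual CodeGen headers plus <deque> are assumed):

  #include <deque>

  namespace {
  // Illustrative only: schedule strictly top-down in ready order.
  class FIFOScheduler : public MachineSchedStrategy {
    std::deque<SUnit*> ReadyQ;
  public:
    virtual void initialize(ScheduleDAGMI *dag) { ReadyQ.clear(); }

    // Returning NULL stops ScheduleDAGMI's scheduling loop.
    virtual SUnit *pickNode(bool &IsTopNode) {
      IsTopNode = true;
      if (ReadyQ.empty()) return NULL;
      SUnit *SU = ReadyQ.front();
      ReadyQ.pop_front();
      return SU;
    }

    // No cycle or hazard bookkeeping in this sketch.
    virtual void schedNode(SUnit *SU, bool IsTopNode) {}

    virtual void releaseTopNode(SUnit *SU) { ReadyQ.push_back(SU); }
    // Ignore bottom releases; this policy never schedules bottom-up, and
    // every node is eventually released top-down anyway.
    virtual void releaseBottomNode(SUnit *SU) {}
  };
  } // namespace

  static ScheduleDAGInstrs *createFIFOSched(MachineSchedContext *C) {
    return new ScheduleDAGMI(C, new FIFOScheduler());
  }
  static MachineSchedRegistry
  FIFOSchedRegistry("fifo-example", "Illustrative FIFO scheduler.",
                    createFIFOSched);

Registering it through MachineSchedRegistry would make it selectable the same
way "default" selects useDefaultMachineSched above (via the -misched option
that MachineSchedOpt parses).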
@@ -592,6 +1396,8 @@ public: return SU; } + virtual void schedNode(SUnit *SU, bool IsTopNode) {} + virtual void releaseTopNode(SUnit *SU) { TopQ.push(SU); } diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index 74ba94d..d8dece6 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -89,8 +89,8 @@ namespace { void addRegWithSubRegs(RegVector &RV, unsigned Reg) { RV.push_back(Reg); if (TargetRegisterInfo::isPhysicalRegister(Reg)) - for (const uint16_t *R = TRI->getSubRegisters(Reg); *R; R++) - RV.push_back(*R); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + RV.push_back(*SubRegs); } struct BBInfo { @@ -191,9 +191,11 @@ namespace { void visitMachineFunctionBefore(); void visitMachineBasicBlockBefore(const MachineBasicBlock *MBB); + void visitMachineBundleBefore(const MachineInstr *MI); void visitMachineInstrBefore(const MachineInstr *MI); void visitMachineOperand(const MachineOperand *MO, unsigned MONum); void visitMachineInstrAfter(const MachineInstr *MI); + void visitMachineBundleAfter(const MachineInstr *MI); void visitMachineBasicBlockAfter(const MachineBasicBlock *MBB); void visitMachineFunctionAfter(); @@ -288,6 +290,8 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { for (MachineFunction::const_iterator MFI = MF.begin(), MFE = MF.end(); MFI!=MFE; ++MFI) { visitMachineBasicBlockBefore(MFI); + // Keep track of the current bundle header. + const MachineInstr *CurBundle = 0; for (MachineBasicBlock::const_instr_iterator MBBI = MFI->instr_begin(), MBBE = MFI->instr_end(); MBBI != MBBE; ++MBBI) { if (MBBI->getParent() != MFI) { @@ -295,15 +299,21 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { *OS << "Instruction: " << *MBBI; continue; } - // Skip BUNDLE instruction for now. FIXME: We should add code to verify - // the BUNDLE's specifically. - if (MBBI->isBundle()) - continue; + // Is this a bundle header? 
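+      // (An instruction not marked inside-bundle is itself a header, so a
+      // normal unbundled instruction forms a one-instruction bundle.)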
+ if (!MBBI->isInsideBundle()) { + if (CurBundle) + visitMachineBundleAfter(CurBundle); + CurBundle = MBBI; + visitMachineBundleBefore(CurBundle); + } else if (!CurBundle) + report("No bundle header", MBBI); visitMachineInstrBefore(MBBI); for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I) visitMachineOperand(&MBBI->getOperand(I), I); visitMachineInstrAfter(MBBI); } + if (CurBundle) + visitMachineBundleAfter(CurBundle); visitMachineBasicBlockAfter(MFI); } visitMachineFunctionAfter(); @@ -384,10 +394,10 @@ void MachineVerifier::visitMachineFunctionBefore() { // A sub-register of a reserved register is also reserved for (int Reg = regsReserved.find_first(); Reg>=0; Reg = regsReserved.find_next(Reg)) { - for (const uint16_t *Sub = TRI->getSubRegisters(Reg); *Sub; ++Sub) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { // FIXME: This should probably be: - // assert(regsReserved.test(*Sub) && "Non-reserved sub-register"); - regsReserved.set(*Sub); + // assert(regsReserved.test(*SubRegs) && "Non-reserved sub-register"); + regsReserved.set(*SubRegs); } } @@ -466,8 +476,8 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { report("MBB exits via unconditional fall-through but its successor " "differs from its CFG successor!", MBB); } - if (!MBB->empty() && MBB->back().isBarrier() && - !TII->isPredicated(&MBB->back())) { + if (!MBB->empty() && getBundleStart(&MBB->back())->isBarrier() && + !TII->isPredicated(getBundleStart(&MBB->back()))) { report("MBB exits via unconditional fall-through but ends with a " "barrier instruction!", MBB); } @@ -487,10 +497,10 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { if (MBB->empty()) { report("MBB exits via unconditional branch but doesn't contain " "any instructions!", MBB); - } else if (!MBB->back().isBarrier()) { + } else if (!getBundleStart(&MBB->back())->isBarrier()) { report("MBB exits via unconditional branch but doesn't end with a " "barrier instruction!", MBB); - } else if (!MBB->back().isTerminator()) { + } else if (!getBundleStart(&MBB->back())->isTerminator()) { report("MBB exits via unconditional branch but the branch isn't a " "terminator instruction!", MBB); } @@ -510,10 +520,10 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { if (MBB->empty()) { report("MBB exits via conditional branch/fall-through but doesn't " "contain any instructions!", MBB); - } else if (MBB->back().isBarrier()) { + } else if (getBundleStart(&MBB->back())->isBarrier()) { report("MBB exits via conditional branch/fall-through but ends with a " "barrier instruction!", MBB); - } else if (!MBB->back().isTerminator()) { + } else if (!getBundleStart(&MBB->back())->isTerminator()) { report("MBB exits via conditional branch/fall-through but the branch " "isn't a terminator instruction!", MBB); } @@ -530,10 +540,10 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { if (MBB->empty()) { report("MBB exits via conditional branch/branch but doesn't " "contain any instructions!", MBB); - } else if (!MBB->back().isBarrier()) { + } else if (!getBundleStart(&MBB->back())->isBarrier()) { report("MBB exits via conditional branch/branch but doesn't end with a " "barrier instruction!", MBB); - } else if (!MBB->back().isTerminator()) { + } else if (!getBundleStart(&MBB->back())->isTerminator()) { report("MBB exits via conditional branch/branch but the branch " "isn't a terminator instruction!", MBB); } @@ -554,8 +564,8 @@ 
MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { continue; } regsLive.insert(*I); - for (const uint16_t *R = TRI->getSubRegisters(*I); *R; R++) - regsLive.insert(*R); + for (MCSubRegIterator SubRegs(*I, TRI); SubRegs.isValid(); ++SubRegs) + regsLive.insert(*SubRegs); } regsLiveInButUnused = regsLive; @@ -564,8 +574,8 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { BitVector PR = MFI->getPristineRegs(MBB); for (int I = PR.find_first(); I>0; I = PR.find_next(I)) { regsLive.insert(I); - for (const uint16_t *R = TRI->getSubRegisters(I); *R; R++) - regsLive.insert(*R); + for (MCSubRegIterator SubRegs(I, TRI); SubRegs.isValid(); ++SubRegs) + regsLive.insert(*SubRegs); } regsKilled.clear(); @@ -575,6 +585,30 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { lastIndex = Indexes->getMBBStartIdx(MBB); } +// This function gets called for all bundle headers, including normal +// stand-alone unbundled instructions. +void MachineVerifier::visitMachineBundleBefore(const MachineInstr *MI) { + if (Indexes && Indexes->hasIndex(MI)) { + SlotIndex idx = Indexes->getInstructionIndex(MI); + if (!(idx > lastIndex)) { + report("Instruction index out of order", MI); + *OS << "Last instruction was at " << lastIndex << '\n'; + } + lastIndex = idx; + } + + // Ensure non-terminators don't follow terminators. + // Ignore predicated terminators formed by if conversion. + // FIXME: If conversion shouldn't need to violate this rule. + if (MI->isTerminator() && !TII->isPredicated(MI)) { + if (!FirstTerminator) + FirstTerminator = MI; + } else if (FirstTerminator) { + report("Non-terminator instruction after the first terminator", MI); + *OS << "First terminator was:\t" << *FirstTerminator; + } +} + void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) { const MCInstrDesc &MCID = MI->getDesc(); if (MI->getNumOperands() < MCID.getNumOperands()) { @@ -608,17 +642,6 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) { } } - // Ensure non-terminators don't follow terminators. - // Ignore predicated terminators formed by if conversion. - // FIXME: If conversion shouldn't need to violate this rule. 
- if (MI->isTerminator() && !TII->isPredicated(MI)) { - if (!FirstTerminator) - FirstTerminator = MI; - } else if (FirstTerminator) { - report("Non-terminator instruction after the first terminator", MI); - *OS << "First terminator was:\t" << *FirstTerminator; - } - StringRef ErrorInfo; if (!TII->verifyInstruction(MI, ErrorInfo)) report(ErrorInfo.data(), MI); @@ -634,7 +657,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { if (MONum < MCID.getNumDefs()) { if (!MO->isReg()) report("Explicit definition must be a register", MO, MONum); - else if (!MO->isDef()) + else if (!MO->isDef() && !MCOI.isOptionalDef()) report("Explicit definition marked as use", MO, MONum); else if (MO->isImplicit()) report("Explicit definition marked as implicit", MO, MONum); @@ -672,7 +695,8 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { report("Illegal subregister index for physical register", MO, MONum); return; } - if (const TargetRegisterClass *DRC = TII->getRegClass(MCID,MONum,TRI)) { + if (const TargetRegisterClass *DRC = + TII->getRegClass(MCID, MONum, TRI, *MF)) { if (!DRC->contains(Reg)) { report("Illegal physical register for instruction", MO, MONum); *OS << TRI->getName(Reg) << " is not a " @@ -698,7 +722,8 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { return; } } - if (const TargetRegisterClass *DRC = TII->getRegClass(MCID,MONum,TRI)) { + if (const TargetRegisterClass *DRC = + TII->getRegClass(MCID, MONum, TRI, *MF)) { if (SubIdx) { const TargetRegisterClass *SuperRC = TRI->getLargestLegalSuperClass(RC); @@ -812,6 +837,8 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { // Reserved registers may be used even when 'dead'. if (!isReserved(Reg)) report("Using an undefined physical register", MO, MONum); + } else if (MRI->def_empty(Reg)) { + report("Reading virtual register without a def", MO, MONum); } else { BBInfo &MInfo = MBBInfoMap[MI->getParent()]; // We don't know which virtual registers are live in, so only complain @@ -841,12 +868,13 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { // Check LiveInts for a live range, but only for virtual registers. if (LiveInts && TargetRegisterInfo::isVirtualRegister(Reg) && !LiveInts->isNotInMIMap(MI)) { - SlotIndex DefIdx = LiveInts->getInstructionIndex(MI).getRegSlot(); + SlotIndex DefIdx = LiveInts->getInstructionIndex(MI); + DefIdx = DefIdx.getRegSlot(MO->isEarlyClobber()); if (LiveInts->hasInterval(Reg)) { const LiveInterval &LI = LiveInts->getInterval(Reg); if (const VNInfo *VNI = LI.getVNInfoAt(DefIdx)) { assert(VNI && "NULL valno is not allowed"); - if (VNI->def != DefIdx && !MO->isEarlyClobber()) { + if (VNI->def != DefIdx) { report("Inconsistent valno->def", MO, MONum); *OS << "Valno " << VNI->id << " is not defined at " << DefIdx << " in " << LI << '\n'; @@ -863,6 +891,13 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { } void MachineVerifier::visitMachineInstrAfter(const MachineInstr *MI) { +} + +// This function gets called after visiting all instructions in a bundle. The +// argument points to the bundle header. +// Normal stand-alone instructions are also considered 'bundles', and this +// function is called for all of them. 
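+// Liveness bookkeeping is per bundle: the kills and defs recorded while
+// visiting each bundled instruction are applied to regsLive here at the
+// bundle boundary rather than after every individual instruction.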
+void MachineVerifier::visitMachineBundleAfter(const MachineInstr *MI) { BBInfo &MInfo = MBBInfoMap[MI->getParent()]; set_union(MInfo.regsKilled, regsKilled); set_subtract(regsLive, regsKilled); regsKilled.clear(); @@ -876,15 +911,6 @@ void MachineVerifier::visitMachineInstrAfter(const MachineInstr *MI) { } set_subtract(regsLive, regsDead); regsDead.clear(); set_union(regsLive, regsDefined); regsDefined.clear(); - - if (Indexes && Indexes->hasIndex(MI)) { - SlotIndex idx = Indexes->getInstructionIndex(MI); - if (!(idx > lastIndex)) { - report("Instruction index out of order", MI); - *OS << "Last instruction was at " << lastIndex << '\n'; - } - lastIndex = idx; - } } void @@ -1025,7 +1051,21 @@ void MachineVerifier::visitMachineFunctionAfter() { // Now check liveness info if available calcRegsRequired(); - if (MRI->isSSA() && !MF->empty()) { + // Check for killed virtual registers that should be live out. + for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end(); + MFI != MFE; ++MFI) { + BBInfo &MInfo = MBBInfoMap[MFI]; + for (RegSet::iterator + I = MInfo.vregsRequired.begin(), E = MInfo.vregsRequired.end(); I != E; + ++I) + if (MInfo.regsKilled.count(*I)) { + report("Virtual register killed in block, but needed live out.", MFI); + *OS << "Virtual register " << PrintReg(*I) + << " is used after the block.\n"; + } + } + + if (!MF->empty()) { BBInfo &MInfo = MBBInfoMap[&MF->front()]; for (RegSet::iterator I = MInfo.vregsRequired.begin(), E = MInfo.vregsRequired.end(); I != E; @@ -1069,20 +1109,21 @@ void MachineVerifier::verifyLiveVariables() { void MachineVerifier::verifyLiveIntervals() { assert(LiveInts && "Don't call verifyLiveIntervals without LiveInts"); - for (LiveIntervals::const_iterator LVI = LiveInts->begin(), - LVE = LiveInts->end(); LVI != LVE; ++LVI) { - const LiveInterval &LI = *LVI->second; + for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(i); // Spilling and splitting may leave unused registers around. Skip them. - if (MRI->use_empty(LI.reg)) + if (MRI->reg_nodbg_empty(Reg)) continue; - // Physical registers have much weirdness going on, mostly from coalescing. - // We should probably fix it, but for now just ignore them. - if (TargetRegisterInfo::isPhysicalRegister(LI.reg)) + if (!LiveInts->hasInterval(Reg)) { + report("Missing live interval for virtual register", MF); + *OS << PrintReg(Reg, TRI) << " still has defs or uses\n"; continue; + } - assert(LVI->first == LI.reg && "Invalid reg to interval mapping"); + const LiveInterval &LI = LiveInts->getInterval(Reg); + assert(Reg == LI.reg && "Invalid reg to interval mapping"); for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end(); I!=E; ++I) { @@ -1307,15 +1348,18 @@ void MachineVerifier::verifyLiveIntervals() { ++MFI; continue; } + + // Is VNI a PHI-def in the current block? + bool IsPHI = VNI->isPHIDef() && + VNI->def == LiveInts->getMBBStartIdx(MFI); + // Check that VNI is live-out of all predecessors. for (MachineBasicBlock::const_pred_iterator PI = MFI->pred_begin(), PE = MFI->pred_end(); PI != PE; ++PI) { SlotIndex PEnd = LiveInts->getMBBEndIdx(*PI); const VNInfo *PVNI = LI.getVNInfoBefore(PEnd); - if (VNI->isPHIDef() && VNI->def == LiveInts->getMBBStartIdx(MFI)) - continue; - + // All predecessors must have a live-out value. 
if (!PVNI) { report("Register not marked live out of predecessor", *PI); *OS << "Valno #" << VNI->id << " live into BB#" << MFI->getNumber() @@ -1324,12 +1368,14 @@ void MachineVerifier::verifyLiveIntervals() { continue; } - if (PVNI != VNI) { + // Only PHI-defs can take different predecessor values. + if (!IsPHI && PVNI != VNI) { report("Different value live out of predecessor", *PI); *OS << "Valno #" << PVNI->id << " live out of BB#" << (*PI)->getNumber() << '@' << PEnd << "\nValno #" << VNI->id << " live into BB#" << MFI->getNumber() - << '@' << LiveInts->getMBBStartIdx(MFI) << " in " << LI << '\n'; + << '@' << LiveInts->getMBBStartIdx(MFI) << " in " + << PrintReg(Reg) << ": " << LI << '\n'; } } if (&*MFI == EndMBB) @@ -1357,4 +1403,3 @@ void MachineVerifier::verifyLiveIntervals() { } } } - diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp index 0ed4c34..e6e23da 100644 --- a/lib/CodeGen/PHIElimination.cpp +++ b/lib/CodeGen/PHIElimination.cpp @@ -171,23 +171,30 @@ bool PHIElimination::EliminatePHINodes(MachineFunction &MF, return true; } +/// isImplicitlyDefined - Return true if all defs of VirtReg are implicit-defs. +/// This includes registers with no defs. +static bool isImplicitlyDefined(unsigned VirtReg, + const MachineRegisterInfo *MRI) { + for (MachineRegisterInfo::def_iterator DI = MRI->def_begin(VirtReg), + DE = MRI->def_end(); DI != DE; ++DI) + if (!DI->isImplicitDef()) + return false; + return true; +} + /// isSourceDefinedByImplicitDef - Return true if all sources of the phi node /// are implicit_def's. static bool isSourceDefinedByImplicitDef(const MachineInstr *MPhi, const MachineRegisterInfo *MRI) { - for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2) { - unsigned SrcReg = MPhi->getOperand(i).getReg(); - const MachineInstr *DefMI = MRI->getVRegDef(SrcReg); - if (!DefMI || !DefMI->isImplicitDef()) + for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2) + if (!isImplicitlyDefined(MPhi->getOperand(i).getReg(), MRI)) return false; - } return true; } - /// LowerAtomicPHINode - Lower the PHI node at the top of the specified block, -/// under the assuption that it needs to be lowered in a way that supports +/// under the assumption that it needs to be lowered in a way that supports /// atomic execution of PHIs. This lowering method is always correct all of the /// time. /// @@ -287,7 +294,8 @@ void PHIElimination::LowerAtomicPHINode( for (int i = NumSrcs - 1; i >= 0; --i) { unsigned SrcReg = MPhi->getOperand(i*2+1).getReg(); unsigned SrcSubReg = MPhi->getOperand(i*2+1).getSubReg(); - + bool SrcUndef = MPhi->getOperand(i*2+1).isUndef() || + isImplicitlyDefined(SrcReg, MRI); assert(TargetRegisterInfo::isVirtualRegister(SrcReg) && "Machine PHI Operands must all be virtual registers!"); @@ -295,14 +303,6 @@ void PHIElimination::LowerAtomicPHINode( // path the PHI. MachineBasicBlock &opBlock = *MPhi->getOperand(i*2+2).getMBB(); - // If source is defined by an implicit def, there is no need to insert a - // copy. - MachineInstr *DefMI = MRI->getVRegDef(SrcReg); - if (DefMI->isImplicitDef()) { - ImpDefs.insert(DefMI); - continue; - } - // Check to make sure we haven't already emitted the copy for this block. // This can happen because PHI nodes may have multiple entries for the same // basic block. @@ -315,12 +315,27 @@ void PHIElimination::LowerAtomicPHINode( findPHICopyInsertPoint(&opBlock, &MBB, SrcReg); // Insert the copy. 
- if (!reusedIncoming && IncomingReg) - BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(), - TII->get(TargetOpcode::COPY), IncomingReg).addReg(SrcReg, 0, SrcSubReg); + if (!reusedIncoming && IncomingReg) { + if (SrcUndef) { + // The source register is undefined, so there is no need for a real + // COPY, but we still need to ensure joint dominance by defs. + // Insert an IMPLICIT_DEF instruction. + BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(), + TII->get(TargetOpcode::IMPLICIT_DEF), IncomingReg); + + // Clean up the old implicit-def, if there even was one. + if (MachineInstr *DefMI = MRI->getVRegDef(SrcReg)) + if (DefMI->isImplicitDef()) + ImpDefs.insert(DefMI); + } else { + BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(), + TII->get(TargetOpcode::COPY), IncomingReg) + .addReg(SrcReg, 0, SrcSubReg); + } + } // Now update live variable information if we have it. Otherwise we're done - if (!LV) continue; + if (SrcUndef || !LV) continue; // We want to be able to insert a kill of the register if this PHI (aka, the // copy we just inserted) is the last use of the source value. Live @@ -340,39 +355,35 @@ void PHIElimination::LowerAtomicPHINode( // add a kill marker in this block saying that it kills the incoming value! if (!ValueIsUsed && !LV->isLiveOut(SrcReg, opBlock)) { // In our final twist, we have to decide which instruction kills the - // register. In most cases this is the copy, however, the first - // terminator instruction at the end of the block may also use the value. - // In this case, we should mark *it* as being the killing block, not the - // copy. - MachineBasicBlock::iterator KillInst; - MachineBasicBlock::iterator Term = opBlock.getFirstTerminator(); - if (Term != opBlock.end() && Term->readsRegister(SrcReg)) { - KillInst = Term; - - // Check that no other terminators use values. -#ifndef NDEBUG - for (MachineBasicBlock::iterator TI = llvm::next(Term); - TI != opBlock.end(); ++TI) { - if (TI->isDebugValue()) - continue; - assert(!TI->readsRegister(SrcReg) && - "Terminator instructions cannot use virtual registers unless" - "they are the first terminator in a block!"); - } -#endif - } else if (reusedIncoming || !IncomingReg) { - // We may have to rewind a bit if we didn't insert a copy this time. - KillInst = Term; - while (KillInst != opBlock.begin()) { - --KillInst; - if (KillInst->isDebugValue()) - continue; - if (KillInst->readsRegister(SrcReg)) - break; + // register. In most cases this is the copy, however, terminator + // instructions at the end of the block may also use the value. In this + // case, we should mark the last such terminator as being the killing + // block, not the copy. + MachineBasicBlock::iterator KillInst = opBlock.end(); + MachineBasicBlock::iterator FirstTerm = opBlock.getFirstTerminator(); + for (MachineBasicBlock::iterator Term = FirstTerm; + Term != opBlock.end(); ++Term) { + if (Term->readsRegister(SrcReg)) + KillInst = Term; + } + + if (KillInst == opBlock.end()) { + // No terminator uses the register. + + if (reusedIncoming || !IncomingReg) { + // We may have to rewind a bit if we didn't insert a copy this time. + KillInst = FirstTerm; + while (KillInst != opBlock.begin()) { + --KillInst; + if (KillInst->isDebugValue()) + continue; + if (KillInst->readsRegister(SrcReg)) + break; + } + } else { + // We just inserted this copy. + KillInst = prior(InsertPos); } - } else { - // We just inserted this copy. 
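[Editorial sketch] The rewritten kill search above scans the whole terminator range and keeps the *last* terminator that reads the source register, where the old code only examined the first terminator. A standalone model of that scan, with an invented Inst type rather than MachineBasicBlock iterators:

    #include <iostream>
    #include <vector>

    struct Inst {
      const char *Name;
      bool ReadsSrcReg; // whether this instruction reads the register
    };

    // Return the index of the last terminator reading the register, or -1.
    // Mirrors the loop that walks [FirstTerm, end()) and keeps updating
    // KillInst on every reader it finds.
    static int findKillTerminator(const std::vector<Inst> &Terms) {
      int Kill = -1;
      for (int i = 0, e = (int)Terms.size(); i != e; ++i)
        if (Terms[i].ReadsSrcReg)
          Kill = i; // keep the *last* reader
      return Kill;
    }

    int main() {
      std::vector<Inst> Terms = {{"brcond", true}, {"br", true}};
      std::cout << "kill terminator index: " << findKillTerminator(Terms)
                << '\n'; // prints 1: the final branch is the last reader
    }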
- KillInst = prior(InsertPos); } assert(KillInst->readsRegister(SrcReg) && "Cannot find kill instruction"); @@ -412,28 +423,71 @@ bool PHIElimination::SplitPHIEdges(MachineFunction &MF, if (MBB.empty() || !MBB.front().isPHI() || MBB.isLandingPad()) return false; // Quick exit for basic blocks without PHIs. + const MachineLoop *CurLoop = MLI ? MLI->getLoopFor(&MBB) : 0; + bool IsLoopHeader = CurLoop && &MBB == CurLoop->getHeader(); + bool Changed = false; for (MachineBasicBlock::iterator BBI = MBB.begin(), BBE = MBB.end(); BBI != BBE && BBI->isPHI(); ++BBI) { for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2) { unsigned Reg = BBI->getOperand(i).getReg(); MachineBasicBlock *PreMBB = BBI->getOperand(i+1).getMBB(); - // We break edges when registers are live out from the predecessor block - // (not considering PHI nodes). If the register is live in to this block - // anyway, we would gain nothing from splitting. + // Is there a critical edge from PreMBB to MBB? + if (PreMBB->succ_size() == 1) + continue; + // Avoid splitting backedges of loops. It would introduce small // out-of-line blocks into the loop which is very bad for code placement. - if (PreMBB != &MBB && - !LV.isLiveIn(Reg, MBB) && LV.isLiveOut(Reg, *PreMBB)) { - if (!MLI || - !(MLI->getLoopFor(PreMBB) == MLI->getLoopFor(&MBB) && - MLI->isLoopHeader(&MBB))) { - if (PreMBB->SplitCriticalEdge(&MBB, this)) { - Changed = true; - ++NumCriticalEdgesSplit; - } - } + if (PreMBB == &MBB) + continue; + const MachineLoop *PreLoop = MLI ? MLI->getLoopFor(PreMBB) : 0; + if (IsLoopHeader && PreLoop == CurLoop) + continue; + + // LV doesn't consider a phi use live-out, so isLiveOut only returns true + // when the source register is live-out for some other reason than a phi + // use. That means the copy we will insert in PreMBB won't be a kill, and + // there is a risk it may not be coalesced away. + // + // If the copy would be a kill, there is no need to split the edge. + if (!LV.isLiveOut(Reg, *PreMBB)) + continue; + + DEBUG(dbgs() << PrintReg(Reg) << " live-out before critical edge BB#" + << PreMBB->getNumber() << " -> BB#" << MBB.getNumber() + << ": " << *BBI); + + // If Reg is not live-in to MBB, it means it must be live-in to some + // other PreMBB successor, and we can avoid the interference by splitting + // the edge. + // + // If Reg *is* live-in to MBB, the interference is inevitable and a copy + // is likely to be left after coalescing. If we are looking at a loop + // exiting edge, split it so we won't insert code in the loop, otherwise + // don't bother. + bool ShouldSplit = !LV.isLiveIn(Reg, MBB); + + // Check for a loop exiting edge. + if (!ShouldSplit && CurLoop != PreLoop) { + DEBUG({ + dbgs() << "Split wouldn't help, maybe avoid loop copies?\n"; + if (PreLoop) dbgs() << "PreLoop: " << *PreLoop; + if (CurLoop) dbgs() << "CurLoop: " << *CurLoop; + }); + // This edge could be entering a loop, exiting a loop, or it could be + // both: Jumping directly from one loop to the header of a sibling + // loop. + // Split unless this edge is entering CurLoop from an outer loop.
+ ShouldSplit = PreLoop && !PreLoop->contains(CurLoop); + } + if (!ShouldSplit) + continue; + if (!PreMBB->SplitCriticalEdge(&MBB, this)) { + DEBUG(dbgs() << "Failed to split critical edge.\n"); + continue; } + Changed = true; + ++NumCriticalEdgesSplit; } } return Changed; diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp index 13d1bbc..69d6d00 100644 --- a/lib/CodeGen/Passes.cpp +++ b/lib/CodeGen/Passes.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/Assembly/PrintModulePass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -48,6 +49,8 @@ static cl::opt<bool> DisableSSC("disable-ssc", cl::Hidden, cl::desc("Disable Stack Slot Coloring")); static cl::opt<bool> DisableMachineDCE("disable-machine-dce", cl::Hidden, cl::desc("Disable Machine Dead Code Elimination")); +static cl::opt<bool> EnableEarlyIfConversion("enable-early-ifcvt", cl::Hidden, + cl::desc("Enable Early If-conversion")); static cl::opt<bool> DisableMachineLICM("disable-machine-licm", cl::Hidden, cl::desc("Disable Machine LICM")); static cl::opt<bool> DisableMachineCSE("disable-machine-cse", cl::Hidden, @@ -80,15 +83,19 @@ static cl::opt<bool> PrintGCInfo("print-gc", cl::Hidden, static cl::opt<bool> VerifyMachineCode("verify-machineinstrs", cl::Hidden, cl::desc("Verify generated machine code"), cl::init(getenv("LLVM_VERIFY_MACHINEINSTRS")!=NULL)); +static cl::opt<std::string> +PrintMachineInstrs("print-machineinstrs", cl::ValueOptional, + cl::desc("Print machine instrs"), + cl::value_desc("pass-name"), cl::init("option-unspecified")); /// Allow standard passes to be disabled by command line options. This supports /// simple binary flags that either suppress the pass or do nothing. /// i.e. -disable-mypass=false has no effect. /// These should be converted to boolOrDefault in order to use applyOverride. -static AnalysisID applyDisable(AnalysisID ID, bool Override) { if (Override) - return &NoPassID; - return ID; + return 0; + return PassID; } /// Allow Pass selection to be overriden by command line options. This supports @@ -101,13 +108,13 @@ static AnalysisID applyOverride(AnalysisID TargetID, cl::boolOrDefault Override, case cl::BOU_UNSET: return TargetID; case cl::BOU_TRUE: - if (TargetID != &NoPassID) + if (TargetID) return TargetID; - if (StandardID == &NoPassID) + if (StandardID == 0) report_fatal_error("Target cannot enable pass"); return StandardID; case cl::BOU_FALSE: - return &NoPassID; + return 0; } llvm_unreachable("Invalid command line option state"); } @@ -149,6 +156,9 @@ static AnalysisID overridePass(AnalysisID StandardID, AnalysisID TargetID) { if (StandardID == &DeadMachineInstructionElimID) return applyDisable(TargetID, DisableMachineDCE); + if (StandardID == &EarlyIfConverterID) + return applyDisable(TargetID, !EnableEarlyIfConversion); + if (StandardID == &MachineLICMID) return applyDisable(TargetID, DisableMachineLICM); @@ -178,9 +188,6 @@ INITIALIZE_PASS(TargetPassConfig, "targetpassconfig", "Target Pass Configuration", false, false) char TargetPassConfig::ID = 0; -static char NoPassIDAnchor = 0; -char &llvm::NoPassID = NoPassIDAnchor; - // Pseudo Pass IDs.
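[Editorial sketch] With the NoPassID sentinel removed above, "no pass" is represented by a null AnalysisID throughout the disable/override logic. A standalone model of that convention follows; the types and the anchor variable are invented, and the real applyOverride additionally calls report_fatal_error when a target tries to enable a pass that was never in the standard pipeline, which this sketch elides.

    #include <cassert>

    // Pass identities are opaque pointers; null now means "no pass".
    typedef const void *AnalysisID;
    enum BoolOrDefault { BOU_UNSET, BOU_TRUE, BOU_FALSE };

    static AnalysisID applyDisable(AnalysisID PassID, bool Disable) {
      return Disable ? nullptr : PassID; // disabled passes become null IDs
    }

    static AnalysisID applyOverride(AnalysisID TargetID, BoolOrDefault Override,
                                    AnalysisID StandardID) {
      switch (Override) {
      case BOU_UNSET: return TargetID;
      case BOU_TRUE:  return TargetID ? TargetID : StandardID;
      case BOU_FALSE: return nullptr;
      }
      return nullptr;
    }

    int main() {
      static char PassAnchor; // LLVM-style anchor: the address is the identity
      AnalysisID ID = &PassAnchor;
      assert(applyDisable(ID, true) == nullptr);
      assert(applyOverride(nullptr, BOU_TRUE, ID) == ID); // re-enable standard
      assert(applyOverride(ID, BOU_FALSE, ID) == nullptr);
    }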
char TargetPassConfig::EarlyTailDuplicateID = 0; char TargetPassConfig::PostRAMachineLICMID = 0; @@ -193,9 +200,13 @@ public: // that are part of a standard pass pipeline without overridding the entire // pipeline. This mechanism allows target options to inherit a standard pass's // user interface. For example, a target may disable a standard pass by - // default by substituting NoPass, and the user may still enable that standard - // pass with an explicit command line option. + // default by substituting a pass ID of zero, and the user may still enable + // that standard pass with an explicit command line option. DenseMap<AnalysisID,AnalysisID> TargetPasses; + + /// Store the pairs of <AnalysisID, AnalysisID> of which the second pass + /// is inserted after each instance of the first one. + SmallVector<std::pair<AnalysisID, AnalysisID>, 4> InsertedPasses; }; } // namespace llvm @@ -207,7 +218,8 @@ TargetPassConfig::~TargetPassConfig() { // Out of line constructor provides default values for pass options and // registers all common codegen passes. TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) - : ImmutablePass(ID), TM(tm), PM(pm), Impl(0), Initialized(false), + : ImmutablePass(ID), PM(&pm), StartAfter(0), StopAfter(0), + Started(true), Stopped(false), TM(tm), Impl(0), Initialized(false), DisableVerify(false), EnableTailMerge(true) { @@ -218,11 +230,22 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) initializeCodeGen(*PassRegistry::getPassRegistry()); // Substitute Pseudo Pass IDs for real ones. - substitutePass(EarlyTailDuplicateID, TailDuplicateID); - substitutePass(PostRAMachineLICMID, MachineLICMID); + substitutePass(&EarlyTailDuplicateID, &TailDuplicateID); + substitutePass(&PostRAMachineLICMID, &MachineLICMID); + + // Disable early if-conversion. Targets that are ready can enable it. + disablePass(&EarlyIfConverterID); // Temporarily disable experimental passes. - substitutePass(MachineSchedulerID, NoPassID); + substitutePass(&MachineSchedulerID, 0); +} + +/// Insert InsertedPassID pass after TargetPassID. +void TargetPassConfig::insertPass(AnalysisID TargetPassID, + AnalysisID InsertedPassID) { + assert(TargetPassID != InsertedPassID && "Insert a pass after itself!"); + std::pair<AnalysisID, AnalysisID> P(TargetPassID, InsertedPassID); + Impl->InsertedPasses.push_back(P); } /// createPassConfig - Create a pass configuration object to be used by @@ -234,7 +257,7 @@ TargetPassConfig *LLVMTargetMachine::createPassConfig(PassManagerBase &PM) { } TargetPassConfig::TargetPassConfig() - : ImmutablePass(ID), PM(*(PassManagerBase*)0) { + : ImmutablePass(ID), PM(0) { llvm_unreachable("TargetPassConfig should not be constructed on-the-fly"); } @@ -244,8 +267,9 @@ void TargetPassConfig::setOpt(bool &Opt, bool Val) { Opt = Val; } -void TargetPassConfig::substitutePass(char &StandardID, char &TargetID) { - Impl->TargetPasses[&StandardID] = &TargetID; +void TargetPassConfig::substitutePass(AnalysisID StandardID, + AnalysisID TargetID) { + Impl->TargetPasses[StandardID] = TargetID; } AnalysisID TargetPassConfig::getPassSubstitution(AnalysisID ID) const { @@ -256,29 +280,62 @@ AnalysisID TargetPassConfig::getPassSubstitution(AnalysisID ID) const { return I->second; } -/// Add a CodeGen pass at this point in the pipeline after checking for target -/// and command line overrides. -AnalysisID TargetPassConfig::addPass(char &ID) { +/// Add a pass to the PassManager if that pass is supposed to be run. 
If the +/// Started/Stopped flags indicate either that the compilation should start at +/// a later pass or that it should stop after an earlier pass, then do not add +/// the pass. Finally, compare the current pass against the StartAfter +/// and StopAfter options and change the Started/Stopped flags accordingly. +void TargetPassConfig::addPass(Pass *P) { assert(!Initialized && "PassConfig is immutable"); - AnalysisID TargetID = getPassSubstitution(&ID); - AnalysisID FinalID = overridePass(&ID, TargetID); - if (FinalID == &NoPassID) + // Cache the Pass ID here in case the pass manager finds this pass is + // redundant with ones already scheduled / available, and deletes it. + // Fundamentally, once we add the pass to the manager, we no longer own it + // and shouldn't reference it. + AnalysisID PassID = P->getPassID(); + + if (Started && !Stopped) + PM->add(P); + if (StopAfter == PassID) + Stopped = true; + if (StartAfter == PassID) + Started = true; + if (Stopped && !Started) + report_fatal_error("Cannot stop compilation after pass that is not run"); +} + +/// Add a CodeGen pass at this point in the pipeline after checking for target +/// and command line overrides. +AnalysisID TargetPassConfig::addPass(AnalysisID PassID) { + AnalysisID TargetID = getPassSubstitution(PassID); + AnalysisID FinalID = overridePass(PassID, TargetID); + if (FinalID == 0) return FinalID; Pass *P = Pass::createPass(FinalID); if (!P) llvm_unreachable("Pass ID not registered"); - PM.add(P); + addPass(P); + // Add the passes after the pass P if there is any. + for (SmallVector<std::pair<AnalysisID, AnalysisID>, 4>::iterator + I = Impl->InsertedPasses.begin(), E = Impl->InsertedPasses.end(); + I != E; ++I) { + if ((*I).first == PassID) { + assert((*I).second && "Illegal Pass ID!"); + Pass *NP = Pass::createPass((*I).second); + assert(NP && "Pass ID not registered"); + addPass(NP); + } + } return FinalID; } -void TargetPassConfig::printAndVerify(const char *Banner) const { +void TargetPassConfig::printAndVerify(const char *Banner) { if (TM->shouldPrintMachineCode()) - PM.add(createMachineFunctionPrinterPass(dbgs(), Banner)); + addPass(createMachineFunctionPrinterPass(dbgs(), Banner)); if (VerifyMachineCode) - PM.add(createMachineVerifierPass(Banner)); + addPass(createMachineVerifierPass(Banner)); } /// Add common target configurable passes that perform LLVM IR to IR transforms @@ -288,46 +345,73 @@ void TargetPassConfig::addIRPasses() { // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that // BasicAliasAnalysis wins if they disagree. This is intended to help // support "obvious" type-punning idioms. - PM.add(createTypeBasedAliasAnalysisPass()); - PM.add(createBasicAliasAnalysisPass()); + addPass(createTypeBasedAliasAnalysisPass()); + addPass(createBasicAliasAnalysisPass()); // Before running any passes, run the verifier to determine if the input // coming from the front-end and/or optimizer is valid. if (!DisableVerify) - PM.add(createVerifierPass()); + addPass(createVerifierPass()); // Run loop strength reduction before anything else. 
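[Editorial sketch] The Started/Stopped gating in the new addPass() overload above defines a window of the pipeline that actually runs: passes are skipped until StartAfter has been seen, and skipped again once StopAfter has run. A self-contained C++ model of that window, with string IDs standing in for AnalysisIDs (an invented simplification; the real code also diagnoses a stop that precedes the start):

    #include <iostream>
    #include <string>
    #include <vector>

    struct PipelineGate {
      std::string StartAfter, StopAfter;
      bool Started, Stopped;
      std::vector<std::string> Scheduled;

      // With no StartAfter given, the pipeline starts immediately.
      PipelineGate(std::string Start, std::string Stop)
          : StartAfter(Start), StopAfter(Stop),
            Started(Start.empty()), Stopped(false) {}

      void addPass(const std::string &ID) {
        if (Started && !Stopped)
          Scheduled.push_back(ID); // only passes inside the window run
        if (ID == StopAfter)
          Stopped = true;          // StopAfter itself still ran
        if (ID == StartAfter)
          Started = true;          // everything after this point runs
      }
    };

    int main() {
      PipelineGate G("isel", "regalloc");
      for (const char *P : {"verify", "isel", "coalesce", "regalloc", "emit"})
        G.addPass(P);
      for (const std::string &P : G.Scheduled)
        std::cout << P << '\n'; // coalesce, regalloc
    }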
if (getOptLevel() != CodeGenOpt::None && !DisableLSR) { - PM.add(createLoopStrengthReducePass(getTargetLowering())); + addPass(createLoopStrengthReducePass(getTargetLowering())); if (PrintLSR) - PM.add(createPrintFunctionPass("\n\n*** Code after LSR ***\n", &dbgs())); + addPass(createPrintFunctionPass("\n\n*** Code after LSR ***\n", &dbgs())); } - PM.add(createGCLoweringPass()); + addPass(createGCLoweringPass()); // Make sure that no unreachable blocks are instruction selected. - PM.add(createUnreachableBlockEliminationPass()); + addPass(createUnreachableBlockEliminationPass()); +} + +/// Turn exception handling constructs into something the code generators can +/// handle. +void TargetPassConfig::addPassesToHandleExceptions() { + switch (TM->getMCAsmInfo()->getExceptionHandlingType()) { + case ExceptionHandling::SjLj: + // SjLj piggy-backs on dwarf for this bit. The cleanups done apply to both + // Dwarf EH prepare needs to be run after SjLj prepare. Otherwise, + // catch info can get misplaced when a selector ends up more than one block + // removed from the parent invoke(s). This could happen when a landing + // pad is shared by multiple invokes and is also a target of a normal + // edge from elsewhere. + addPass(createSjLjEHPreparePass(TM->getTargetLowering())); + // FALLTHROUGH + case ExceptionHandling::DwarfCFI: + case ExceptionHandling::ARM: + case ExceptionHandling::Win64: + addPass(createDwarfEHPass(TM)); + break; + case ExceptionHandling::None: + addPass(createLowerInvokePass(TM->getTargetLowering())); + + // The lower invoke pass may create unreachable code. Remove it. + addPass(createUnreachableBlockEliminationPass()); + break; + } } /// Add common passes that perform LLVM IR to IR transforms in preparation for /// instruction selection. void TargetPassConfig::addISelPrepare() { if (getOptLevel() != CodeGenOpt::None && !DisableCGP) - PM.add(createCodeGenPreparePass(getTargetLowering())); + addPass(createCodeGenPreparePass(getTargetLowering())); - PM.add(createStackProtectorPass(getTargetLowering())); + addPass(createStackProtectorPass(getTargetLowering())); addPreISel(); if (PrintISelInput) - PM.add(createPrintFunctionPass("\n\n" - "*** Final LLVM Code input to ISel ***\n", - &dbgs())); + addPass(createPrintFunctionPass("\n\n" + "*** Final LLVM Code input to ISel ***\n", + &dbgs())); // All passes which modify the LLVM IR are now complete; run the verifier // to ensure that the IR is valid. if (!DisableVerify) - PM.add(createVerifierPass()); + addPass(createVerifierPass()); } /// Add the complete set of target-independent postISel code generator passes. @@ -349,11 +433,26 @@ void TargetPassConfig::addISelPrepare() { /// TODO: We could use a single addPre/Post(ID) hook to allow pass injection /// before/after any target-independent pass. But it's currently overkill. void TargetPassConfig::addMachinePasses() { + // Insert a machine instr printer pass after the specified pass. + // If -print-machineinstrs specified, print machineinstrs after all passes. 
+ if (StringRef(PrintMachineInstrs.getValue()).equals("")) + TM->Options.PrintMachineCode = true; + else if (!StringRef(PrintMachineInstrs.getValue()) + .equals("option-unspecified")) { + const PassRegistry *PR = PassRegistry::getPassRegistry(); + const PassInfo *TPI = PR->getPassInfo(PrintMachineInstrs.getValue()); + const PassInfo *IPI = PR->getPassInfo(StringRef("print-machineinstrs")); + assert (TPI && IPI && "Pass ID not registered!"); + const char *TID = (char *)(TPI->getTypeInfo()); + const char *IID = (char *)(IPI->getTypeInfo()); + insertPass(TID, IID); + } + // Print the instruction selected machine code... printAndVerify("After Instruction Selection"); // Expand pseudo-instructions emitted by ISel. - addPass(ExpandISelPseudosID); + addPass(&ExpandISelPseudosID); // Add passes that optimize machine instructions in SSA form. if (getOptLevel() != CodeGenOpt::None) { @@ -362,7 +461,7 @@ void TargetPassConfig::addMachinePasses() { else { // If the target requests it, assign local variables to stack slots relative // to one another and simplify frame index references where possible. - addPass(LocalStackSlotAllocationID); + addPass(&LocalStackSlotAllocationID); } // Run pre-ra passes. @@ -381,7 +480,7 @@ void TargetPassConfig::addMachinePasses() { printAndVerify("After PostRegAlloc passes"); // Insert prolog/epilog code. Eliminate abstract frame index references... - addPass(PrologEpilogCodeInserterID); + addPass(&PrologEpilogCodeInserterID); printAndVerify("After PrologEpilogCodeInserter"); /// Add passes that optimize machine instructions after register allocation. @@ -389,7 +488,7 @@ void TargetPassConfig::addMachinePasses() { addMachineLateOptimization(); // Expand pseudo instructions before second scheduling pass. - addPass(ExpandPostRAPseudosID); + addPass(&ExpandPostRAPseudosID); printAndVerify("After ExpandPostRAPseudos"); // Run pre-sched2 passes. @@ -398,14 +497,14 @@ void TargetPassConfig::addMachinePasses() { // Second pass scheduler. if (getOptLevel() != CodeGenOpt::None) { - addPass(PostRASchedulerID); + addPass(&PostRASchedulerID); printAndVerify("After PostRAScheduler"); } // GC - addPass(GCMachineCodeAnalysisID); + addPass(&GCMachineCodeAnalysisID); if (PrintGCInfo) - PM.add(createGCInfoPrinter(dbgs())); + addPass(createGCInfoPrinter(dbgs())); // Basic block placement. if (getOptLevel() != CodeGenOpt::None) @@ -418,30 +517,31 @@ void TargetPassConfig::addMachinePasses() { /// Add passes that optimize machine instructions in SSA form. void TargetPassConfig::addMachineSSAOptimization() { // Pre-ra tail duplication. - if (addPass(EarlyTailDuplicateID) != &NoPassID) + if (addPass(&EarlyTailDuplicateID)) printAndVerify("After Pre-RegAlloc TailDuplicate"); // Optimize PHIs before DCE: removing dead PHI cycles may make more // instructions dead. - addPass(OptimizePHIsID); + addPass(&OptimizePHIsID); // If the target requests it, assign local variables to stack slots relative // to one another and simplify frame index references where possible. - addPass(LocalStackSlotAllocationID); + addPass(&LocalStackSlotAllocationID); // With optimization, dead code should already be eliminated. However // there is one known exception: lowered code for arguments that are only // used by tail calls, where the tail calls reuse the incoming stack // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll). 
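[Editorial sketch] The -print-machineinstrs plumbing above reduces to the InsertedPasses pair list: whenever the target pass is scheduled, every pass paired with it is scheduled immediately afterwards. A standalone model with strings standing in for AnalysisIDs (invented; the real code resolves names through the PassRegistry first):

    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    typedef std::string AnalysisID;

    static std::vector<std::pair<AnalysisID, AnalysisID>> InsertedPasses;

    static void insertPass(AnalysisID Target, AnalysisID Inserted) {
      InsertedPasses.push_back({Target, Inserted});
    }

    static void addPass(const AnalysisID &ID, std::vector<AnalysisID> &PM) {
      PM.push_back(ID);
      for (auto &P : InsertedPasses)
        if (P.first == ID)
          PM.push_back(P.second); // e.g. the machineinstr printer
    }

    int main() {
      insertPass("phi-node-elimination", "print-machineinstrs");
      std::vector<AnalysisID> PM;
      for (const char *P : {"isel", "phi-node-elimination", "regalloc"})
        addPass(P, PM);
      for (auto &P : PM)
        std::cout << P << '\n'; // printer lands right after PHI elimination
    }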
- addPass(DeadMachineInstructionElimID); + addPass(&DeadMachineInstructionElimID); printAndVerify("After codegen DCE pass"); - addPass(MachineLICMID); - addPass(MachineCSEID); - addPass(MachineSinkingID); + addPass(&EarlyIfConverterID); + addPass(&MachineLICMID); + addPass(&MachineCSEID); + addPass(&MachineSinkingID); printAndVerify("After Machine LICM, CSE and Sinking passes"); - addPass(PeepholeOptimizerID); + addPass(&PeepholeOptimizerID); printAndVerify("After codegen peephole optimization pass"); } @@ -519,10 +619,10 @@ FunctionPass *TargetPassConfig::createRegAllocPass(bool Optimized) { /// Add the minimum set of target-independent passes that are required for /// register allocation. No coalescing or scheduling. void TargetPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { - addPass(PHIEliminationID); - addPass(TwoAddressInstructionPassID); + addPass(&PHIEliminationID); + addPass(&TwoAddressInstructionPassID); - PM.add(RegAllocPass); + addPass(RegAllocPass); printAndVerify("After Register Allocation"); } @@ -530,42 +630,46 @@ void TargetPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { /// optimized register allocation, including coalescing, machine instruction /// scheduling, and register allocation itself. void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { + addPass(&ProcessImplicitDefsID); + // LiveVariables currently requires pure SSA form. // // FIXME: Once TwoAddressInstruction pass no longer uses kill flags, // LiveVariables can be removed completely, and LiveIntervals can be directly // computed. (We still either need to regenerate kill flags after regalloc, or // preferably fix the scavenger to not depend on them). - addPass(LiveVariablesID); + addPass(&LiveVariablesID); // Add passes that move from transformed SSA into conventional SSA. This is a // "copy coalescing" problem. // if (!EnableStrongPHIElim) { // Edge splitting is smarter with machine loop info. - addPass(MachineLoopInfoID); - addPass(PHIEliminationID); + addPass(&MachineLoopInfoID); + addPass(&PHIEliminationID); } - addPass(TwoAddressInstructionPassID); - - // FIXME: Either remove this pass completely, or fix it so that it works on - // SSA form. We could modify LiveIntervals to be independent of this pass, But - // it would be even better to simply eliminate *all* IMPLICIT_DEFs before - // leaving SSA. - addPass(ProcessImplicitDefsID); + addPass(&TwoAddressInstructionPassID); if (EnableStrongPHIElim) - addPass(StrongPHIEliminationID); + addPass(&StrongPHIEliminationID); - addPass(RegisterCoalescerID); + addPass(&RegisterCoalescerID); // PreRA instruction scheduling. - if (addPass(MachineSchedulerID) != &NoPassID) + if (addPass(&MachineSchedulerID)) printAndVerify("After Machine Scheduling"); // Add the selected register allocation pass. - PM.add(RegAllocPass); - printAndVerify("After Register Allocation"); + addPass(RegAllocPass); + printAndVerify("After Register Allocation, before rewriter"); + + // Allow targets to change the register assignments before rewriting. + if (addPreRewrite()) + printAndVerify("After pre-rewrite passes"); + + // Finally rewrite virtual registers. 
+ addPass(&VirtRegRewriterID); + printAndVerify("After Virtual Register Rewriter"); // FinalizeRegAlloc is convenient until MachineInstrBundles is more mature, // but eventually, all users of it should probably be moved to addPostRA and @@ -579,12 +683,12 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { // // FIXME: Re-enable coloring with register when it's capable of adding // kill markers. - addPass(StackSlotColoringID); + addPass(&StackSlotColoringID); // Run post-ra machine LICM to hoist reloads / remats. // // FIXME: can this move into MachineLateOptimization? - addPass(PostRAMachineLICMID); + addPass(&PostRAMachineLICMID); printAndVerify("After StackSlotColoring and postra Machine LICM"); } @@ -596,33 +700,33 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { /// Add passes that optimize machine instructions after register allocation. void TargetPassConfig::addMachineLateOptimization() { // Branch folding must be run after regalloc and prolog/epilog insertion. - if (addPass(BranchFolderPassID) != &NoPassID) + if (addPass(&BranchFolderPassID)) printAndVerify("After BranchFolding"); // Tail duplication. - if (addPass(TailDuplicateID) != &NoPassID) + if (addPass(&TailDuplicateID)) printAndVerify("After TailDuplicate"); // Copy propagation. - if (addPass(MachineCopyPropagationID) != &NoPassID) + if (addPass(&MachineCopyPropagationID)) printAndVerify("After copy propagation pass"); } /// Add standard basic block placement passes. void TargetPassConfig::addBlockPlacement() { - AnalysisID ID = &NoPassID; + AnalysisID PassID = 0; if (!DisableBlockPlacement) { // MachineBlockPlacement is a new pass which subsumes the functionality of // CodPlacementOpt. The old code placement pass can be restored by // disabling block placement, but eventually it will be removed. - ID = addPass(MachineBlockPlacementID); + PassID = addPass(&MachineBlockPlacementID); } else { - ID = addPass(CodePlacementOptID); + PassID = addPass(&CodePlacementOptID); } - if (ID != &NoPassID) { + if (PassID) { // Run a separate pass to collect block placement statistics. if (EnableBlockPlacementStats) - addPass(MachineBlockPlacementStatsID); + addPass(&MachineBlockPlacementStatsID); printAndVerify("After machine block placement."); } diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp index 9c5c029..91c33c4 100644 --- a/lib/CodeGen/PeepholeOptimizer.cpp +++ b/lib/CodeGen/PeepholeOptimizer.cpp @@ -31,6 +31,15 @@ // same flag that the "cmp" instruction sets and that "bz" uses, then we can // eliminate the "cmp" instruction. // +// Another instance, in this code: +// +// sub r1, r3 | sub r1, imm +// cmp r3, r1 or cmp r1, r3 | cmp r1, imm +// bge L1 +// +// If the branch instruction can use flag from "sub", then we can replace +// "sub" with "subs" and eliminate the "cmp" instruction. 
+// // - Optimize Bitcast pairs: // // v1 = bitcast v0 @@ -95,14 +104,14 @@ namespace { } private: - bool OptimizeBitcastInstr(MachineInstr *MI, MachineBasicBlock *MBB); - bool OptimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB); - bool OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, + bool optimizeBitcastInstr(MachineInstr *MI, MachineBasicBlock *MBB); + bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB); + bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, SmallPtrSet<MachineInstr*, 8> &LocalMIs); bool isMoveImmediate(MachineInstr *MI, SmallSet<unsigned, 4> &ImmDefRegs, DenseMap<unsigned, MachineInstr*> &ImmDefMIs); - bool FoldImmediate(MachineInstr *MI, MachineBasicBlock *MBB, + bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB, SmallSet<unsigned, 4> &ImmDefRegs, DenseMap<unsigned, MachineInstr*> &ImmDefMIs); }; @@ -116,7 +125,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(PeepholeOptimizer, "peephole-opts", "Peephole Optimizations", false, false) -/// OptimizeExtInstr - If instruction is a copy-like instruction, i.e. it reads +/// optimizeExtInstr - If instruction is a copy-like instruction, i.e. it reads /// a single register and writes a single register and it does not modify the /// source, and if the source value is preserved as a sub-register of the /// result, then replace all reachable uses of the source with the subreg of the @@ -126,7 +135,7 @@ INITIALIZE_PASS_END(PeepholeOptimizer, "peephole-opts", /// the code. Since this code does not currently share EXTRACTs, just ignore all /// debug uses. bool PeepholeOptimizer:: -OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, +optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, SmallPtrSet<MachineInstr*, 8> &LocalMIs) { unsigned SrcReg, DstReg, SubIdx; if (!TII->isCoalescableExtInstr(*MI, SrcReg, DstReg, SubIdx)) @@ -136,16 +145,30 @@ OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, TargetRegisterInfo::isPhysicalRegister(SrcReg)) return false; - MachineRegisterInfo::use_nodbg_iterator UI = MRI->use_nodbg_begin(SrcReg); - if (++UI == MRI->use_nodbg_end()) + if (MRI->hasOneNonDBGUse(SrcReg)) // No other uses. return false; + // Ensure DstReg can get a register class that actually supports + // sub-registers. Don't change the class until we commit. + const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg); + DstRC = TM->getRegisterInfo()->getSubClassWithSubReg(DstRC, SubIdx); + if (!DstRC) + return false; + + // The ext instr may be operating on a sub-register of SrcReg as well. + // PPC::EXTSW is a 32 -> 64-bit sign extension, but it reads a 64-bit + // register. + // If UseSrcSubIdx is Set, SubIdx also applies to SrcReg, and only uses of + // SrcReg:SubIdx should be replaced. + bool UseSrcSubIdx = TM->getRegisterInfo()-> + getSubClassWithSubReg(MRI->getRegClass(SrcReg), SubIdx) != 0; + // The source has other uses. See if we can replace the other uses with use of // the result of the extension. 
SmallPtrSet<MachineBasicBlock*, 4> ReachedBBs; - UI = MRI->use_nodbg_begin(DstReg); - for (MachineRegisterInfo::use_nodbg_iterator UE = MRI->use_nodbg_end(); + for (MachineRegisterInfo::use_nodbg_iterator + UI = MRI->use_nodbg_begin(DstReg), UE = MRI->use_nodbg_end(); UI != UE; ++UI) ReachedBBs.insert(UI->getParent()); @@ -156,8 +179,8 @@ OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, SmallVector<MachineOperand*, 8> ExtendedUses; bool ExtendLife = true; - UI = MRI->use_nodbg_begin(SrcReg); - for (MachineRegisterInfo::use_nodbg_iterator UE = MRI->use_nodbg_end(); + for (MachineRegisterInfo::use_nodbg_iterator + UI = MRI->use_nodbg_begin(SrcReg), UE = MRI->use_nodbg_end(); UI != UE; ++UI) { MachineOperand &UseMO = UI.getOperand(); MachineInstr *UseMI = &*UI; @@ -169,6 +192,10 @@ OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, continue; } + // Only accept uses of SrcReg:SubIdx. + if (UseSrcSubIdx && UseMO.getSubReg() != SubIdx) + continue; + // It's an error to translate this: // // %reg1025 = <sext> %reg1024 @@ -223,9 +250,9 @@ OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, // Look for PHI uses of the extended result, we don't want to extend the // liveness of a PHI input. It breaks all kinds of assumptions down // stream. A PHI use is expected to be the kill of its source values. - UI = MRI->use_nodbg_begin(DstReg); for (MachineRegisterInfo::use_nodbg_iterator - UE = MRI->use_nodbg_end(); UI != UE; ++UI) + UI = MRI->use_nodbg_begin(DstReg), UE = MRI->use_nodbg_end(); + UI != UE; ++UI) if (UI->isPHI()) PHIBBs.insert(UI->getParent()); @@ -238,14 +265,20 @@ OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, continue; // About to add uses of DstReg, clear DstReg's kill flags. - if (!Changed) + if (!Changed) { MRI->clearKillFlags(DstReg); + MRI->constrainRegClass(DstReg, DstRC); + } unsigned NewVR = MRI->createVirtualRegister(RC); - BuildMI(*UseMBB, UseMI, UseMI->getDebugLoc(), - TII->get(TargetOpcode::COPY), NewVR) + MachineInstr *Copy = BuildMI(*UseMBB, UseMI, UseMI->getDebugLoc(), + TII->get(TargetOpcode::COPY), NewVR) .addReg(DstReg, 0, SubIdx); - + // SubIdx applies to both SrcReg and DstReg when UseSrcSubIdx is set. + if (UseSrcSubIdx) { + Copy->getOperand(0).setSubReg(SubIdx); + Copy->getOperand(0).setIsUndef(); + } UseMO->setReg(NewVR); ++NumReuse; Changed = true; @@ -255,7 +288,7 @@ OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, return Changed; } -/// OptimizeBitcastInstr - If the instruction is a bitcast instruction A that +/// optimizeBitcastInstr - If the instruction is a bitcast instruction A that /// cannot be optimized away during isel (e.g. ARM::VMOVSR, which bitcast /// a value cross register classes), and the source is defined by another /// bitcast instruction B. And if the register class of source of B matches @@ -265,7 +298,7 @@ OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, /// %vreg3<def> = VMOVRS %vreg0 /// Replace all uses of vreg3 with vreg1. 
-bool PeepholeOptimizer::OptimizeBitcastInstr(MachineInstr *MI, +bool PeepholeOptimizer::optimizeBitcastInstr(MachineInstr *MI, MachineBasicBlock *MBB) { unsigned NumDefs = MI->getDesc().getNumDefs(); unsigned NumSrcs = MI->getDesc().getNumOperands() - NumDefs; @@ -327,22 +360,23 @@ bool PeepholeOptimizer::OptimizeBitcastInstr(MachineInstr *MI, return true; } -/// OptimizeCmpInstr - If the instruction is a compare and the previous +/// optimizeCmpInstr - If the instruction is a compare and the previous /// instruction it's comparing against all ready sets (or could be modified to /// set) the same flag as the compare, then we can remove the comparison and use /// the flag from the previous instruction. -bool PeepholeOptimizer::OptimizeCmpInstr(MachineInstr *MI, +bool PeepholeOptimizer::optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB) { // If this instruction is a comparison against zero and isn't comparing a // physical register, we can try to optimize it. - unsigned SrcReg; + unsigned SrcReg, SrcReg2; int CmpMask, CmpValue; - if (!TII->AnalyzeCompare(MI, SrcReg, CmpMask, CmpValue) || - TargetRegisterInfo::isPhysicalRegister(SrcReg)) + if (!TII->analyzeCompare(MI, SrcReg, SrcReg2, CmpMask, CmpValue) || + TargetRegisterInfo::isPhysicalRegister(SrcReg) || + (SrcReg2 != 0 && TargetRegisterInfo::isPhysicalRegister(SrcReg2))) return false; // Attempt to optimize the comparison instruction. - if (TII->OptimizeCompareInstr(MI, SrcReg, CmpMask, CmpValue, MRI)) { + if (TII->optimizeCompareInstr(MI, SrcReg, SrcReg2, CmpMask, CmpValue, MRI)) { ++NumCmps; return true; } @@ -368,10 +402,10 @@ bool PeepholeOptimizer::isMoveImmediate(MachineInstr *MI, return false; } -/// FoldImmediate - Try folding register operands that are defined by move +/// foldImmediate - Try folding register operands that are defined by move /// immediate instructions, i.e. a trivial constant folding optimization, if /// and only if the def and use are in the same BB. -bool PeepholeOptimizer::FoldImmediate(MachineInstr *MI, MachineBasicBlock *MBB, +bool PeepholeOptimizer::foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB, SmallSet<unsigned, 4> &ImmDefRegs, DenseMap<unsigned, MachineInstr*> &ImmDefMIs) { for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) { @@ -430,7 +464,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { } if (MI->isBitcast()) { - if (OptimizeBitcastInstr(MI, MBB)) { + if (optimizeBitcastInstr(MI, MBB)) { // MI is deleted. LocalMIs.erase(MI); Changed = true; @@ -438,7 +472,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { continue; } } else if (MI->isCompare()) { - if (OptimizeCmpInstr(MI, MBB)) { + if (optimizeCmpInstr(MI, MBB)) { // MI is deleted. 
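[Editorial sketch] The widened guard above reflects analyzeCompare now reporting two source registers, so register-register compares can be optimized too; the peephole still bails out when either source is physical. A standalone model of just that guard, using an invented register-numbering convention (high bit marks virtual registers), not LLVM's TargetRegisterInfo:

    #include <cassert>

    static const unsigned FirstVirtualReg = 1u << 31; // invented convention

    static bool isPhysicalRegister(unsigned Reg) {
      return Reg != 0 && Reg < FirstVirtualReg;
    }

    // True when the compare may be handed to the optimization: neither the
    // first source nor a present second source is a physical register.
    static bool compareIsCandidate(unsigned SrcReg, unsigned SrcReg2) {
      if (isPhysicalRegister(SrcReg))
        return false;
      if (SrcReg2 != 0 && isPhysicalRegister(SrcReg2))
        return false; // reg-reg compare against a physreg: leave it alone
      return true;
    }

    int main() {
      unsigned V0 = FirstVirtualReg + 0, V1 = FirstVirtualReg + 1;
      assert(compareIsCandidate(V0, 0));  // cmp vreg, imm
      assert(compareIsCandidate(V0, V1)); // cmp vreg, vreg
      assert(!compareIsCandidate(V0, 5)); // second operand is physical
    }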
LocalMIs.erase(MI); Changed = true; @@ -450,9 +484,9 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { if (isMoveImmediate(MI, ImmDefRegs, ImmDefMIs)) { SeenMoveImm = true; } else { - Changed |= OptimizeExtInstr(MI, MBB, LocalMIs); + Changed |= optimizeExtInstr(MI, MBB, LocalMIs); if (SeenMoveImm) - Changed |= FoldImmediate(MI, MBB, ImmDefRegs, ImmDefMIs); + Changed |= foldImmediate(MI, MBB, ImmDefRegs, ImmDefMIs); } First = false; diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp index 24d3e5a..7449ff5 100644 --- a/lib/CodeGen/PostRASchedulerList.cpp +++ b/lib/CodeGen/PostRASchedulerList.cpp @@ -22,7 +22,6 @@ #include "AntiDepBreaker.h" #include "AggressiveAntiDepBreaker.h" #include "CriticalAntiDepBreaker.h" -#include "RegisterClassInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/LatencyPriorityQueue.h" #include "llvm/CodeGen/SchedulerRegistry.h" @@ -31,6 +30,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -78,7 +78,6 @@ AntiDepBreaker::~AntiDepBreaker() { } namespace { class PostRAScheduler : public MachineFunctionPass { - AliasAnalysis *AA; const TargetInstrInfo *TII; RegisterClassInfo RegClassInfo; @@ -206,6 +205,10 @@ SchedulePostRATDList::SchedulePostRATDList( const InstrItineraryData *InstrItins = TM.getInstrItineraryData(); HazardRec = TM.getInstrInfo()->CreateTargetPostRAHazardRecognizer(InstrItins, this); + + assert((AntiDepMode == TargetSubtargetInfo::ANTIDEP_NONE || + MRI.tracksLiveness()) && + "Live-ins must be accurate for anti-dependency breaking"); AntiDepBreak = ((AntiDepMode == TargetSubtargetInfo::ANTIDEP_ALL) ? (AntiDepBreaker *)new AggressiveAntiDepBreaker(MF, RCI, CriticalPathRCs) : @@ -423,9 +426,8 @@ void SchedulePostRATDList::StartBlockForKills(MachineBasicBlock *BB) { unsigned Reg = *I; LiveRegs.set(Reg); // Repeat, for all subregs. - for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) - LiveRegs.set(*Subreg); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + LiveRegs.set(*SubRegs); } } else { @@ -437,9 +439,8 @@ void SchedulePostRATDList::StartBlockForKills(MachineBasicBlock *BB) { unsigned Reg = *I; LiveRegs.set(Reg); // Repeat, for all subregs. - for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) - LiveRegs.set(*Subreg); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + LiveRegs.set(*SubRegs); } } } @@ -464,10 +465,9 @@ bool SchedulePostRATDList::ToggleKillFlag(MachineInstr *MI, MO.setIsKill(false); bool AllDead = true; const unsigned SuperReg = MO.getReg(); - for (const uint16_t *Subreg = TRI->getSubRegisters(SuperReg); - *Subreg; ++Subreg) { - if (LiveRegs.test(*Subreg)) { - MI->addOperand(MachineOperand::CreateReg(*Subreg, + for (MCSubRegIterator SubRegs(SuperReg, TRI); SubRegs.isValid(); ++SubRegs) { + if (LiveRegs.test(*SubRegs)) { + MI->addOperand(MachineOperand::CreateReg(*SubRegs, true /*IsDef*/, true /*IsImp*/, false /*IsKill*/, @@ -517,9 +517,8 @@ void SchedulePostRATDList::FixupKills(MachineBasicBlock *MBB) { LiveRegs.reset(Reg); // Repeat for all subregs. 
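[Editorial sketch] The PostRAScheduler hunks that follow replace null-terminated getSubRegisters() arrays with MCSubRegIterator-style loops; the shape of that iteration is shown below in a self-contained form. The iterator and the sub-register table are invented stand-ins, not the MC layer's real data:

    #include <bitset>
    #include <iostream>

    // Minimal iterator over a 0-terminated sub-register list.
    struct SubRegIterator {
      const unsigned *P;
      explicit SubRegIterator(const unsigned *List) : P(List) {}
      bool isValid() const { return *P != 0; }
      void operator++() { ++P; }
      unsigned operator*() const { return *P; }
    };

    int main() {
      std::bitset<64> LiveRegs;
      unsigned Reg = 10;
      static const unsigned SubRegsOf10[] = {11, 12, 0}; // e.g. two halves

      LiveRegs.set(Reg);
      // Repeat for all sub-registers, as in StartBlockForKills().
      for (SubRegIterator SubRegs(SubRegsOf10); SubRegs.isValid(); ++SubRegs)
        LiveRegs.set(*SubRegs);

      std::cout << LiveRegs.count() << " live registers\n"; // prints 3
    }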
- for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) - LiveRegs.reset(*Subreg); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + LiveRegs.reset(*SubRegs); } // Examine all used registers and set/clear kill flag. When a @@ -536,9 +535,8 @@ void SchedulePostRATDList::FixupKills(MachineBasicBlock *MBB) { if (!killedRegs.test(Reg)) { kill = true; // A register is not killed if any subregs are live... - for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) { - if (LiveRegs.test(*Subreg)) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + if (LiveRegs.test(*SubRegs)) { kill = false; break; } @@ -570,9 +568,8 @@ void SchedulePostRATDList::FixupKills(MachineBasicBlock *MBB) { LiveRegs.set(Reg); - for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) - LiveRegs.set(*Subreg); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + LiveRegs.set(*SubRegs); } } } diff --git a/lib/CodeGen/ProcessImplicitDefs.cpp b/lib/CodeGen/ProcessImplicitDefs.cpp index 1ad3479..34d075c 100644 --- a/lib/CodeGen/ProcessImplicitDefs.cpp +++ b/lib/CodeGen/ProcessImplicitDefs.cpp @@ -9,297 +9,163 @@ #define DEBUG_TYPE "processimplicitdefs" -#include "llvm/CodeGen/ProcessImplicitDefs.h" - -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetRegisterInfo.h" - using namespace llvm; +namespace { +/// Process IMPLICIT_DEF instructions and make sure there is one implicit_def +/// for each use. Add isUndef marker to implicit_def defs and their uses. 
+class ProcessImplicitDefs : public MachineFunctionPass { + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineRegisterInfo *MRI; + + SmallSetVector<MachineInstr*, 16> WorkList; + + void processImplicitDef(MachineInstr *MI); + bool canTurnIntoImplicitDef(MachineInstr *MI); + +public: + static char ID; + + ProcessImplicitDefs() : MachineFunctionPass(ID) { + initializeProcessImplicitDefsPass(*PassRegistry::getPassRegistry()); + } + + virtual void getAnalysisUsage(AnalysisUsage &au) const; + + virtual bool runOnMachineFunction(MachineFunction &fn); +}; +} // end anonymous namespace + char ProcessImplicitDefs::ID = 0; char &llvm::ProcessImplicitDefsID = ProcessImplicitDefs::ID; INITIALIZE_PASS_BEGIN(ProcessImplicitDefs, "processimpdefs", "Process Implicit Definitions", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveVariables) INITIALIZE_PASS_END(ProcessImplicitDefs, "processimpdefs", "Process Implicit Definitions", false, false) void ProcessImplicitDefs::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addPreserved<AliasAnalysis>(); - AU.addPreserved<LiveVariables>(); - AU.addPreservedID(MachineLoopInfoID); - AU.addPreservedID(MachineDominatorsID); - AU.addPreservedID(TwoAddressInstructionPassID); - AU.addPreservedID(PHIEliminationID); MachineFunctionPass::getAnalysisUsage(AU); } -bool -ProcessImplicitDefs::CanTurnIntoImplicitDef(MachineInstr *MI, - unsigned Reg, unsigned OpIdx, - SmallSet<unsigned, 8> &ImpDefRegs) { - switch(OpIdx) { - case 1: - return MI->isCopy() && (!MI->getOperand(0).readsReg() || - ImpDefRegs.count(MI->getOperand(0).getReg())); - case 2: - return MI->isSubregToReg() && (!MI->getOperand(0).readsReg() || - ImpDefRegs.count(MI->getOperand(0).getReg())); - default: return false; - } -} - -static bool isUndefCopy(MachineInstr *MI, unsigned Reg, - SmallSet<unsigned, 8> &ImpDefRegs) { - if (MI->isCopy()) { - MachineOperand &MO0 = MI->getOperand(0); - MachineOperand &MO1 = MI->getOperand(1); - if (MO1.getReg() != Reg) - return false; - if (!MO0.readsReg() || ImpDefRegs.count(MO0.getReg())) - return true; +bool ProcessImplicitDefs::canTurnIntoImplicitDef(MachineInstr *MI) { + if (!MI->isCopyLike() && + !MI->isInsertSubreg() && + !MI->isRegSequence() && + !MI->isPHI()) return false; - } - return false; + for (MIOperands MO(MI); MO.isValid(); ++MO) + if (MO->isReg() && MO->isUse() && MO->readsReg()) + return false; + return true; } -/// processImplicitDefs - Process IMPLICIT_DEF instructions and make sure -/// there is one implicit_def for each use. Add isUndef marker to -/// implicit_def defs and their uses. 
-bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &fn) { - - DEBUG(dbgs() << "********** PROCESS IMPLICIT DEFS **********\n" - << "********** Function: " - << ((Value*)fn.getFunction())->getName() << '\n'); - - bool Changed = false; - - TII = fn.getTarget().getInstrInfo(); - TRI = fn.getTarget().getRegisterInfo(); - MRI = &fn.getRegInfo(); - LV = getAnalysisIfAvailable<LiveVariables>(); - - SmallSet<unsigned, 8> ImpDefRegs; - SmallVector<MachineInstr*, 8> ImpDefMIs; - SmallVector<MachineInstr*, 4> RUses; - SmallPtrSet<MachineBasicBlock*,16> Visited; - SmallPtrSet<MachineInstr*, 8> ModInsts; - - MachineBasicBlock *Entry = fn.begin(); - for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*,16> > - DFI = df_ext_begin(Entry, Visited), E = df_ext_end(Entry, Visited); - DFI != E; ++DFI) { - MachineBasicBlock *MBB = *DFI; - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); - I != E; ) { - MachineInstr *MI = &*I; - ++I; - if (MI->isImplicitDef()) { - ImpDefMIs.push_back(MI); - // Is this a sub-register read-modify-write? - if (MI->getOperand(0).readsReg()) - continue; - unsigned Reg = MI->getOperand(0).getReg(); - ImpDefRegs.insert(Reg); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { - for (const uint16_t *SS = TRI->getSubRegisters(Reg); *SS; ++SS) - ImpDefRegs.insert(*SS); - } +void ProcessImplicitDefs::processImplicitDef(MachineInstr *MI) { + DEBUG(dbgs() << "Processing " << *MI); + unsigned Reg = MI->getOperand(0).getReg(); + + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + // For virtual registers, mark all uses as <undef>, and convert users to + // implicit-def when possible. + for (MachineRegisterInfo::use_nodbg_iterator UI = + MRI->use_nodbg_begin(Reg), + UE = MRI->use_nodbg_end(); UI != UE; ++UI) { + MachineOperand &MO = UI.getOperand(); + MO.setIsUndef(); + MachineInstr *UserMI = MO.getParent(); + if (!canTurnIntoImplicitDef(UserMI)) continue; - } - - // Eliminate %reg1032:sub<def> = COPY undef. - if (MI->isCopy() && MI->getOperand(0).readsReg()) { - MachineOperand &MO = MI->getOperand(1); - if (MO.isUndef() || ImpDefRegs.count(MO.getReg())) { - if (LV && MO.isKill()) { - LiveVariables::VarInfo& vi = LV->getVarInfo(MO.getReg()); - vi.removeKill(MI); - } - unsigned Reg = MI->getOperand(0).getReg(); - MI->eraseFromParent(); - Changed = true; - - // A REG_SEQUENCE may have been expanded into partial definitions. - // If this was the last one, mark Reg as implicitly defined. - if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI->def_empty(Reg)) - ImpDefRegs.insert(Reg); - continue; - } - } - - bool ChangedToImpDef = false; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand& MO = MI->getOperand(i); - if (!MO.isReg() || !MO.readsReg()) - continue; - unsigned Reg = MO.getReg(); - if (!Reg) - continue; - if (!ImpDefRegs.count(Reg)) - continue; - // Use is a copy, just turn it into an implicit_def. - if (CanTurnIntoImplicitDef(MI, Reg, i, ImpDefRegs)) { - bool isKill = MO.isKill(); - MI->setDesc(TII->get(TargetOpcode::IMPLICIT_DEF)); - for (int j = MI->getNumOperands() - 1, ee = 0; j > ee; --j) - MI->RemoveOperand(j); - if (isKill) { - ImpDefRegs.erase(Reg); - if (LV) { - LiveVariables::VarInfo& vi = LV->getVarInfo(Reg); - vi.removeKill(MI); - } - } - ChangedToImpDef = true; - Changed = true; - break; - } - - Changed = true; - MO.setIsUndef(); - // This is a partial register redef of an implicit def. - // Make sure the whole register is defined by the instruction.
- if (MO.isDef()) { - MI->addRegisterDefined(Reg); - continue; - } - if (MO.isKill() || MI->isRegTiedToDefOperand(i)) { - // Make sure other reads of Reg are also marked <undef>. - for (unsigned j = i+1; j != e; ++j) { - MachineOperand &MOJ = MI->getOperand(j); - if (MOJ.isReg() && MOJ.getReg() == Reg && MOJ.readsReg()) - MOJ.setIsUndef(); - } - ImpDefRegs.erase(Reg); - } - } - - if (ChangedToImpDef) { - // Backtrack to process this new implicit_def. - --I; - } else { - for (unsigned i = 0; i != MI->getNumOperands(); ++i) { - MachineOperand& MO = MI->getOperand(i); - if (!MO.isReg() || !MO.isDef()) - continue; - ImpDefRegs.erase(MO.getReg()); - } - } + DEBUG(dbgs() << "Converting to IMPLICIT_DEF: " << *UserMI); + UserMI->setDesc(TII->get(TargetOpcode::IMPLICIT_DEF)); + WorkList.insert(UserMI); } + MI->eraseFromParent(); + return; + } - // Any outstanding liveout implicit_def's? - for (unsigned i = 0, e = ImpDefMIs.size(); i != e; ++i) { - MachineInstr *MI = ImpDefMIs[i]; - unsigned Reg = MI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg) || - !ImpDefRegs.count(Reg)) { - // Delete all "local" implicit_def's. That include those which define - // physical registers since they cannot be liveout. - MI->eraseFromParent(); - Changed = true; + // This is a physreg implicit-def. + // Look for the first instruction to use or define an alias. + MachineBasicBlock::instr_iterator UserMI = MI; + MachineBasicBlock::instr_iterator UserE = MI->getParent()->instr_end(); + bool Found = false; + for (++UserMI; UserMI != UserE; ++UserMI) { + for (MIOperands MO(UserMI); MO.isValid(); ++MO) { + if (!MO->isReg()) continue; - } - - // If there are multiple defs of the same register and at least one - // is not an implicit_def, do not insert implicit_def's before the - // uses. - bool Skip = false; - SmallVector<MachineInstr*, 4> DeadImpDefs; - for (MachineRegisterInfo::def_iterator DI = MRI->def_begin(Reg), - DE = MRI->def_end(); DI != DE; ++DI) { - MachineInstr *DeadImpDef = &*DI; - if (!DeadImpDef->isImplicitDef()) { - Skip = true; - break; - } - DeadImpDefs.push_back(DeadImpDef); - } - if (Skip) + unsigned UserReg = MO->getReg(); + if (!TargetRegisterInfo::isPhysicalRegister(UserReg) || + !TRI->regsOverlap(Reg, UserReg)) continue; + // UserMI uses or redefines Reg. Set <undef> flags on all uses. + Found = true; + if (MO->isUse()) + MO->setIsUndef(); + } + if (Found) + break; + } - // The only implicit_def which we want to keep are those that are live - // out of its block. - for (unsigned j = 0, ee = DeadImpDefs.size(); j != ee; ++j) - DeadImpDefs[j]->eraseFromParent(); - Changed = true; - - // Process each use instruction once. - for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(Reg), - UE = MRI->use_end(); UI != UE; ++UI) { - if (UI.getOperand().isUndef()) - continue; - MachineInstr *RMI = &*UI; - if (ModInsts.insert(RMI)) - RUses.push_back(RMI); - } + // If we found the using MI, we can erase the IMPLICIT_DEF. + if (Found) { + DEBUG(dbgs() << "Physreg user: " << *UserMI); + MI->eraseFromParent(); + return; + } - for (unsigned i = 0, e = RUses.size(); i != e; ++i) { - MachineInstr *RMI = RUses[i]; + // Using instr wasn't found, it could be in another block. + // Leave the physreg IMPLICIT_DEF, but trim any extra operands. + for (unsigned i = MI->getNumOperands() - 1; i; --i) + MI->RemoveOperand(i); + DEBUG(dbgs() << "Keeping physreg: " << *MI); +} - // Turn a copy use into an implicit_def. 
- if (isUndefCopy(RMI, Reg, ImpDefRegs)) { - RMI->setDesc(TII->get(TargetOpcode::IMPLICIT_DEF)); +/// processImplicitDefs - Process IMPLICIT_DEF instructions and turn them into +/// <undef> operands. +bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &MF) { - bool isKill = false; - SmallVector<unsigned, 4> Ops; - for (unsigned j = 0, ee = RMI->getNumOperands(); j != ee; ++j) { - MachineOperand &RRMO = RMI->getOperand(j); - if (RRMO.isReg() && RRMO.getReg() == Reg) { - Ops.push_back(j); - if (RRMO.isKill()) - isKill = true; - } - } - // Leave the other operands along. - for (unsigned j = 0, ee = Ops.size(); j != ee; ++j) { - unsigned OpIdx = Ops[j]; - RMI->RemoveOperand(OpIdx-j); - } + DEBUG(dbgs() << "********** PROCESS IMPLICIT DEFS **********\n" + << "********** Function: " + << ((Value*)MF.getFunction())->getName() << '\n'); - // Update LiveVariables varinfo if the instruction is a kill. - if (LV && isKill) { - LiveVariables::VarInfo& vi = LV->getVarInfo(Reg); - vi.removeKill(RMI); - } - continue; - } + bool Changed = false; - // Replace Reg with a new vreg that's marked implicit. - const TargetRegisterClass* RC = MRI->getRegClass(Reg); - unsigned NewVReg = MRI->createVirtualRegister(RC); - bool isKill = true; - for (unsigned j = 0, ee = RMI->getNumOperands(); j != ee; ++j) { - MachineOperand &RRMO = RMI->getOperand(j); - if (RRMO.isReg() && RRMO.getReg() == Reg) { - RRMO.setReg(NewVReg); - RRMO.setIsUndef(); - if (isKill) { - // Only the first operand of NewVReg is marked kill. - RRMO.setIsKill(); - isKill = false; - } - } - } - } - RUses.clear(); - ModInsts.clear(); - } - ImpDefRegs.clear(); - ImpDefMIs.clear(); + TII = MF.getTarget().getInstrInfo(); + TRI = MF.getTarget().getRegisterInfo(); + MRI = &MF.getRegInfo(); + assert(MRI->isSSA() && "ProcessImplicitDefs only works on SSA form."); + assert(WorkList.empty() && "Inconsistent worklist state"); + + for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end(); + MFI != MFE; ++MFI) { + // Scan the basic block for implicit defs. + for (MachineBasicBlock::instr_iterator MBBI = MFI->instr_begin(), + MBBE = MFI->instr_end(); MBBI != MBBE; ++MBBI) + if (MBBI->isImplicitDef()) + WorkList.insert(MBBI); + + if (WorkList.empty()) + continue; + + DEBUG(dbgs() << "BB#" << MFI->getNumber() << " has " << WorkList.size() + << " implicit defs.\n"); + Changed = true; + + // Drain the WorkList to recursively process any new implicit defs. + do processImplicitDef(WorkList.pop_back_val()); + while (!WorkList.empty()); } - return Changed; } - diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index 458915e..c791ffb 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ b/lib/CodeGen/PrologEpilogInserter.cpp @@ -302,7 +302,7 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) { const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo(); MachineBasicBlock::iterator I; - if (! ShrinkWrapThisFunction) { + if (!ShrinkWrapThisFunction) { // Spill using target interface. 
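[Editorial sketch] The rewritten ProcessImplicitDefs above is a worklist algorithm: each block's IMPLICIT_DEFs seed the list, and any user converted to IMPLICIT_DEF goes back on the list so it is processed in turn. A self-contained C++ model of that drain loop, with an invented Instr type in place of MachineInstr:

    #include <iostream>
    #include <set>
    #include <vector>

    struct Instr {
      int Id;
      std::vector<Instr *> Users; // instructions reading this def
      bool ConvertibleToImpDef;   // e.g. a COPY with no other readable uses
    };

    static void processImplicitDef(Instr *MI, std::set<Instr *> &WorkList) {
      for (Instr *User : MI->Users)
        if (User->ConvertibleToImpDef) {
          std::cout << "converting user " << User->Id << " to IMPLICIT_DEF\n";
          WorkList.insert(User); // new implicit-def is reprocessed recursively
        }
    }

    int main() {
      Instr Copy{2, {}, true};
      Instr ImpDef{1, {&Copy}, false};
      std::set<Instr *> WorkList = {&ImpDef};
      // Drain the worklist, picking up new implicit-defs as they appear.
      while (!WorkList.empty()) {
        Instr *MI = *WorkList.begin();
        WorkList.erase(WorkList.begin());
        processImplicitDef(MI, WorkList);
      }
    }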
I = EntryBlock->begin(); if (!TFI->spillCalleeSavedRegisters(*EntryBlock, I, CSI, TRI)) { diff --git a/lib/CodeGen/RegAllocBase.cpp b/lib/CodeGen/RegAllocBase.cpp index b00eceb..993dbc7 100644 --- a/lib/CodeGen/RegAllocBase.cpp +++ b/lib/CodeGen/RegAllocBase.cpp @@ -14,6 +14,7 @@ #define DEBUG_TYPE "regalloc" #include "RegAllocBase.h" +#include "LiveRegMatrix.h" #include "Spiller.h" #include "VirtRegMap.h" #include "llvm/ADT/Statistic.h" @@ -34,8 +35,6 @@ using namespace llvm; -STATISTIC(NumAssigned , "Number of registers assigned"); -STATISTIC(NumUnassigned , "Number of registers unassigned"); STATISTIC(NumNewQueued , "Number of new live ranges queued"); // Temporary verification option until we can put verification inside @@ -47,85 +46,20 @@ VerifyRegAlloc("verify-regalloc", cl::location(RegAllocBase::VerifyEnabled), const char *RegAllocBase::TimerGroupName = "Register Allocation"; bool RegAllocBase::VerifyEnabled = false; -#ifndef NDEBUG -// Verify each LiveIntervalUnion. -void RegAllocBase::verify() { - LiveVirtRegBitSet VisitedVRegs; - OwningArrayPtr<LiveVirtRegBitSet> - unionVRegs(new LiveVirtRegBitSet[PhysReg2LiveUnion.numRegs()]); - - // Verify disjoint unions. - for (unsigned PhysReg = 0; PhysReg < PhysReg2LiveUnion.numRegs(); ++PhysReg) { - DEBUG(PhysReg2LiveUnion[PhysReg].print(dbgs(), TRI)); - LiveVirtRegBitSet &VRegs = unionVRegs[PhysReg]; - PhysReg2LiveUnion[PhysReg].verify(VRegs); - // Union + intersection test could be done efficiently in one pass, but - // don't add a method to SparseBitVector unless we really need it. - assert(!VisitedVRegs.intersects(VRegs) && "vreg in multiple unions"); - VisitedVRegs |= VRegs; - } - - // Verify vreg coverage. - for (LiveIntervals::iterator liItr = LIS->begin(), liEnd = LIS->end(); - liItr != liEnd; ++liItr) { - unsigned reg = liItr->first; - if (TargetRegisterInfo::isPhysicalRegister(reg)) continue; - if (!VRM->hasPhys(reg)) continue; // spilled? - unsigned PhysReg = VRM->getPhys(reg); - if (!unionVRegs[PhysReg].test(reg)) { - dbgs() << "LiveVirtReg " << reg << " not in union " << - TRI->getName(PhysReg) << "\n"; - llvm_unreachable("unallocated live vreg"); - } - } - // FIXME: I'm not sure how to verify spilled intervals. -} -#endif //!NDEBUG - //===----------------------------------------------------------------------===// // RegAllocBase Implementation //===----------------------------------------------------------------------===// -// Instantiate a LiveIntervalUnion for each physical register. 
-void RegAllocBase::LiveUnionArray::init(LiveIntervalUnion::Allocator &allocator, - unsigned NRegs) { - NumRegs = NRegs; - Array = - static_cast<LiveIntervalUnion*>(malloc(sizeof(LiveIntervalUnion)*NRegs)); - for (unsigned r = 0; r != NRegs; ++r) - new(Array + r) LiveIntervalUnion(r, allocator); -} - -void RegAllocBase::init(VirtRegMap &vrm, LiveIntervals &lis) { - NamedRegionTimer T("Initialize", TimerGroupName, TimePassesIsEnabled); +void RegAllocBase::init(VirtRegMap &vrm, + LiveIntervals &lis, + LiveRegMatrix &mat) { TRI = &vrm.getTargetRegInfo(); MRI = &vrm.getRegInfo(); VRM = &vrm; LIS = &lis; + Matrix = &mat; MRI->freezeReservedRegs(vrm.getMachineFunction()); RegClassInfo.runOnMachineFunction(vrm.getMachineFunction()); - - const unsigned NumRegs = TRI->getNumRegs(); - if (NumRegs != PhysReg2LiveUnion.numRegs()) { - PhysReg2LiveUnion.init(UnionAllocator, NumRegs); - // Cache an interferece query for each physical reg - Queries.reset(new LiveIntervalUnion::Query[PhysReg2LiveUnion.numRegs()]); - } -} - -void RegAllocBase::LiveUnionArray::clear() { - if (!Array) - return; - for (unsigned r = 0; r != NumRegs; ++r) - Array[r].~LiveIntervalUnion(); - free(Array); - NumRegs = 0; - Array = 0; -} - -void RegAllocBase::releaseMemory() { - for (unsigned r = 0, e = PhysReg2LiveUnion.numRegs(); r != e; ++r) - PhysReg2LiveUnion[r].clear(); } // Visit all the live registers. If they are already assigned to a physical @@ -133,35 +67,14 @@ void RegAllocBase::releaseMemory() { // them on the priority queue for later assignment. void RegAllocBase::seedLiveRegs() { NamedRegionTimer T("Seed Live Regs", TimerGroupName, TimePassesIsEnabled); - for (LiveIntervals::iterator I = LIS->begin(), E = LIS->end(); I != E; ++I) { - unsigned RegNum = I->first; - LiveInterval &VirtReg = *I->second; - if (TargetRegisterInfo::isPhysicalRegister(RegNum)) - PhysReg2LiveUnion[RegNum].unify(VirtReg); - else - enqueue(&VirtReg); + for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + if (MRI->reg_nodbg_empty(Reg)) + continue; + enqueue(&LIS->getInterval(Reg)); } } -void RegAllocBase::assign(LiveInterval &VirtReg, unsigned PhysReg) { - DEBUG(dbgs() << "assigning " << PrintReg(VirtReg.reg, TRI) - << " to " << PrintReg(PhysReg, TRI) << '\n'); - assert(!VRM->hasPhys(VirtReg.reg) && "Duplicate VirtReg assignment"); - VRM->assignVirt2Phys(VirtReg.reg, PhysReg); - MRI->setPhysRegUsed(PhysReg); - PhysReg2LiveUnion[PhysReg].unify(VirtReg); - ++NumAssigned; -} - -void RegAllocBase::unassign(LiveInterval &VirtReg, unsigned PhysReg) { - DEBUG(dbgs() << "unassigning " << PrintReg(VirtReg.reg, TRI) - << " from " << PrintReg(PhysReg, TRI) << '\n'); - assert(VRM->getPhys(VirtReg.reg) == PhysReg && "Inconsistent unassign"); - PhysReg2LiveUnion[PhysReg].extract(VirtReg); - VRM->clearVirt(VirtReg.reg); - ++NumUnassigned; -} - // Top-level driver to manage the queue of unassigned VirtRegs and call the // selectOrSplit implementation. void RegAllocBase::allocatePhysRegs() { @@ -179,14 +92,14 @@ void RegAllocBase::allocatePhysRegs() { } // Invalidate all interference queries, live ranges could have changed. - invalidateVirtRegs(); + Matrix->invalidateVirtRegs(); // selectOrSplit requests the allocator to return an available physical // register if possible and populate a list of new live intervals that // result from splitting. 
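
The new seedLiveRegs() loop above enumerates virtual registers densely via index2VirtReg() instead of walking the LiveIntervals map. That works because virtual register numbers are kept disjoint from physical ones, typically by stamping a high bit, so index and register number round-trip in O(1). A toy version of the mapping (the exact encoding in TargetRegisterInfo may differ; this constant is illustrative):

#include <cassert>

const unsigned VirtRegFlag = 1u << 31;

unsigned index2VirtReg(unsigned Index) { return Index | VirtRegFlag; }
unsigned virtReg2Index(unsigned Reg) { return Reg & ~VirtRegFlag; }
bool isVirtualRegister(unsigned Reg) { return (Reg & VirtRegFlag) != 0; }

int main() {
  for (unsigned i = 0; i != 8; ++i) {
    unsigned Reg = index2VirtReg(i);
    assert(isVirtualRegister(Reg) && virtReg2Index(Reg) == i);
  }
}
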
DEBUG(dbgs() << "\nselectOrSplit " << MRI->getRegClass(VirtReg->reg)->getName() - << ':' << *VirtReg << '\n'); + << ':' << PrintReg(VirtReg->reg) << ' ' << *VirtReg << '\n'); typedef SmallVector<LiveInterval*, 4> VirtRegVec; VirtRegVec SplitVRegs; unsigned AvailablePhysReg = selectOrSplit(*VirtReg, SplitVRegs); @@ -211,7 +124,7 @@ void RegAllocBase::allocatePhysRegs() { } if (AvailablePhysReg) - assign(*VirtReg, AvailablePhysReg); + Matrix->assign(*VirtReg, AvailablePhysReg); for (VirtRegVec::iterator I = SplitVRegs.begin(), E = SplitVRegs.end(); I != E; ++I) { @@ -230,51 +143,3 @@ void RegAllocBase::allocatePhysRegs() { } } } - -// Check if this live virtual register interferes with a physical register. If -// not, then check for interference on each register that aliases with the -// physical register. Return the interfering register. -unsigned RegAllocBase::checkPhysRegInterference(LiveInterval &VirtReg, - unsigned PhysReg) { - for (const uint16_t *AliasI = TRI->getOverlaps(PhysReg); *AliasI; ++AliasI) - if (query(VirtReg, *AliasI).checkInterference()) - return *AliasI; - return 0; -} - -// Add newly allocated physical registers to the MBB live in sets. -void RegAllocBase::addMBBLiveIns(MachineFunction *MF) { - NamedRegionTimer T("MBB Live Ins", TimerGroupName, TimePassesIsEnabled); - SlotIndexes *Indexes = LIS->getSlotIndexes(); - if (MF->size() <= 1) - return; - - LiveIntervalUnion::SegmentIter SI; - for (unsigned PhysReg = 0; PhysReg < PhysReg2LiveUnion.numRegs(); ++PhysReg) { - LiveIntervalUnion &LiveUnion = PhysReg2LiveUnion[PhysReg]; - if (LiveUnion.empty()) - continue; - DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " live-in:"); - MachineFunction::iterator MBB = llvm::next(MF->begin()); - MachineFunction::iterator MFE = MF->end(); - SlotIndex Start, Stop; - tie(Start, Stop) = Indexes->getMBBRange(MBB); - SI.setMap(LiveUnion.getMap()); - SI.find(Start); - while (SI.valid()) { - if (SI.start() <= Start) { - if (!MBB->isLiveIn(PhysReg)) - MBB->addLiveIn(PhysReg); - DEBUG(dbgs() << "\tBB#" << MBB->getNumber() << ':' - << PrintReg(SI.value()->reg, TRI)); - } else if (SI.start() > Stop) - MBB = Indexes->getMBBFromIndex(SI.start().getPrevIndex()); - if (++MBB == MFE) - break; - tie(Start, Stop) = Indexes->getMBBRange(MBB); - SI.advanceTo(Start); - } - DEBUG(dbgs() << '\n'); - } -} - diff --git a/lib/CodeGen/RegAllocBase.h b/lib/CodeGen/RegAllocBase.h index 072fe2b..db0c8e1 100644 --- a/lib/CodeGen/RegAllocBase.h +++ b/lib/CodeGen/RegAllocBase.h @@ -37,9 +37,9 @@ #ifndef LLVM_CODEGEN_REGALLOCBASE #define LLVM_CODEGEN_REGALLOCBASE -#include "llvm/ADT/OwningPtr.h" #include "LiveIntervalUnion.h" -#include "RegisterClassInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/ADT/OwningPtr.h" namespace llvm { @@ -47,6 +47,7 @@ template<typename T> class SmallVectorImpl; class TargetRegisterInfo; class VirtRegMap; class LiveIntervals; +class LiveRegMatrix; class Spiller; /// RegAllocBase provides the register allocation driver and interface that can @@ -56,69 +57,20 @@ class Spiller; /// live range splitting. They must also override enqueue/dequeue to provide an /// assignment order. class RegAllocBase { - LiveIntervalUnion::Allocator UnionAllocator; - - // Cache tag for PhysReg2LiveUnion entries. Increment whenever virtual - // registers may have changed. - unsigned UserTag; - - // Array of LiveIntervalUnions indexed by physical register. 
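
The shape of the allocatePhysRegs() driver above: pop the highest-priority live range, ask selectOrSplit() for a physreg, and requeue whatever splitting produced. A plain-C++ sketch with hypothetical types and stubbed decisions; returning 0 from selectOrSplit means "new ranges were queued instead of an assignment":

#include <vector>

struct LiveRange { unsigned Reg; float Weight; };

// Stubs for illustration: a real allocator consults interference here.
unsigned selectOrSplit(LiveRange &LR, std::vector<LiveRange *> &SplitRanges) {
  (void)SplitRanges;
  return LR.Reg % 2 ? LR.Reg : 0;
}
void assign(LiveRange &, unsigned) {}

void allocatePhysRegs(std::vector<LiveRange *> Queue) {
  while (!Queue.empty()) {
    LiveRange *LR = Queue.back();
    Queue.pop_back();
    std::vector<LiveRange *> SplitRanges;
    if (unsigned PhysReg = selectOrSplit(*LR, SplitRanges))
      assign(*LR, PhysReg);        // direct assignment succeeded
    // Otherwise allocate the split pieces on later iterations.
    Queue.insert(Queue.end(), SplitRanges.begin(), SplitRanges.end());
  }
}
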
- class LiveUnionArray { - unsigned NumRegs; - LiveIntervalUnion *Array; - public: - LiveUnionArray(): NumRegs(0), Array(0) {} - ~LiveUnionArray() { clear(); } - - unsigned numRegs() const { return NumRegs; } - - void init(LiveIntervalUnion::Allocator &, unsigned NRegs); - - void clear(); - - LiveIntervalUnion& operator[](unsigned PhysReg) { - assert(PhysReg < NumRegs && "physReg out of bounds"); - return Array[PhysReg]; - } - }; - - LiveUnionArray PhysReg2LiveUnion; - - // Current queries, one per physreg. They must be reinitialized each time we - // query on a new live virtual register. - OwningArrayPtr<LiveIntervalUnion::Query> Queries; - protected: const TargetRegisterInfo *TRI; MachineRegisterInfo *MRI; VirtRegMap *VRM; LiveIntervals *LIS; + LiveRegMatrix *Matrix; RegisterClassInfo RegClassInfo; - RegAllocBase(): UserTag(0), TRI(0), MRI(0), VRM(0), LIS(0) {} + RegAllocBase(): TRI(0), MRI(0), VRM(0), LIS(0), Matrix(0) {} virtual ~RegAllocBase() {} // A RegAlloc pass should call this before allocatePhysRegs. - void init(VirtRegMap &vrm, LiveIntervals &lis); - - // Get an initialized query to check interferences between lvr and preg. Note - // that Query::init must be called at least once for each physical register - // before querying a new live virtual register. This ties Queries and - // PhysReg2LiveUnion together. - LiveIntervalUnion::Query &query(LiveInterval &VirtReg, unsigned PhysReg) { - Queries[PhysReg].init(UserTag, &VirtReg, &PhysReg2LiveUnion[PhysReg]); - return Queries[PhysReg]; - } - - // Get direct access to the underlying LiveIntervalUnion for PhysReg. - LiveIntervalUnion &getLiveUnion(unsigned PhysReg) { - return PhysReg2LiveUnion[PhysReg]; - } - - // Invalidate all cached information about virtual registers - live ranges may - // have changed. - void invalidateVirtRegs() { ++UserTag; } + void init(VirtRegMap &vrm, LiveIntervals &lis, LiveRegMatrix &mat); // The top-level driver. The output is a VirtRegMap that us updated with // physical register assignments. @@ -140,31 +92,6 @@ protected: virtual unsigned selectOrSplit(LiveInterval &VirtReg, SmallVectorImpl<LiveInterval*> &splitLVRs) = 0; - // A RegAlloc pass should call this when PassManager releases its memory. - virtual void releaseMemory(); - - // Helper for checking interference between a live virtual register and a - // physical register, including all its register aliases. If an interference - // exists, return the interfering register, which may be preg or an alias. - unsigned checkPhysRegInterference(LiveInterval& VirtReg, unsigned PhysReg); - - /// assign - Assign VirtReg to PhysReg. - /// This should not be called from selectOrSplit for the current register. - void assign(LiveInterval &VirtReg, unsigned PhysReg); - - /// unassign - Undo a previous assignment of VirtReg to PhysReg. - /// This can be invoked from selectOrSplit, but be careful to guarantee that - /// allocation is making progress. - void unassign(LiveInterval &VirtReg, unsigned PhysReg); - - /// addMBBLiveIns - Add physreg liveins to basic blocks. - void addMBBLiveIns(MachineFunction *); - -#ifndef NDEBUG - // Verify each LiveIntervalUnion. - void verify(); -#endif - // Use this group name for NamedRegionTimer. 
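
The UserTag member removed above (now owned by LiveRegMatrix) implements O(1) mass invalidation: each cached interference query stores the generation tag it was computed under, and bumping the tag invalidates every entry at once, exactly what invalidateVirtRegs() did with ++UserTag. A minimal standalone version of the idiom (names are hypothetical):

struct TaggedCache {
  unsigned Tag = 1;        // current generation
  unsigned EntryTag = 0;   // generation the cached value was computed under
  int Value = 0;

  int get() {
    if (EntryTag != Tag) { // stale: recompute lazily
      Value = computeValue();
      EntryTag = Tag;
    }
    return Value;
  }
  void invalidateAll() { ++Tag; } // O(1), no per-entry work
  int computeValue() { return 42; }
};
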
static const char *TimerGroupName; diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp index 77ee314..3a03807 100644 --- a/lib/CodeGen/RegAllocBasic.cpp +++ b/lib/CodeGen/RegAllocBasic.cpp @@ -13,11 +13,12 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "regalloc" +#include "AllocationOrder.h" #include "RegAllocBase.h" #include "LiveDebugVariables.h" -#include "RenderMachineFunction.h" #include "Spiller.h" #include "VirtRegMap.h" +#include "LiveRegMatrix.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Function.h" #include "llvm/PassAnalysisSupport.h" @@ -64,10 +65,6 @@ class RABasic : public MachineFunctionPass, public RegAllocBase // context MachineFunction *MF; - // analyses - LiveStacks *LS; - RenderMachineFunction *RMF; - // state std::auto_ptr<Spiller> SpillerInstance; std::priority_queue<LiveInterval*, std::vector<LiveInterval*>, @@ -118,9 +115,6 @@ public: bool spillInterferences(LiveInterval &VirtReg, unsigned PhysReg, SmallVectorImpl<LiveInterval*> &SplitVRegs); - void spillReg(LiveInterval &VirtReg, unsigned PhysReg, - SmallVectorImpl<LiveInterval*> &SplitVRegs); - static char ID; }; @@ -139,7 +133,7 @@ RABasic::RABasic(): MachineFunctionPass(ID) { initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry()); initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry()); initializeVirtRegMapPass(*PassRegistry::getPassRegistry()); - initializeRenderMachineFunctionPass(*PassRegistry::getPassRegistry()); + initializeLiveRegMatrixPass(*PassRegistry::getPassRegistry()); } void RABasic::getAnalysisUsage(AnalysisUsage &AU) const { @@ -147,6 +141,7 @@ void RABasic::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<AliasAnalysis>(); AU.addPreserved<AliasAnalysis>(); AU.addRequired<LiveIntervals>(); + AU.addPreserved<LiveIntervals>(); AU.addPreserved<SlotIndexes>(); AU.addRequired<LiveDebugVariables>(); AU.addPreserved<LiveDebugVariables>(); @@ -159,41 +154,15 @@ void RABasic::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<MachineLoopInfo>(); AU.addRequired<VirtRegMap>(); AU.addPreserved<VirtRegMap>(); - DEBUG(AU.addRequired<RenderMachineFunction>()); + AU.addRequired<LiveRegMatrix>(); + AU.addPreserved<LiveRegMatrix>(); MachineFunctionPass::getAnalysisUsage(AU); } void RABasic::releaseMemory() { SpillerInstance.reset(0); - RegAllocBase::releaseMemory(); } -// Helper for spillInterferences() that spills all interfering vregs currently -// assigned to this physical register. -void RABasic::spillReg(LiveInterval& VirtReg, unsigned PhysReg, - SmallVectorImpl<LiveInterval*> &SplitVRegs) { - LiveIntervalUnion::Query &Q = query(VirtReg, PhysReg); - assert(Q.seenAllInterferences() && "need collectInterferences()"); - const SmallVectorImpl<LiveInterval*> &PendingSpills = Q.interferingVRegs(); - - for (SmallVectorImpl<LiveInterval*>::const_iterator I = PendingSpills.begin(), - E = PendingSpills.end(); I != E; ++I) { - LiveInterval &SpilledVReg = **I; - DEBUG(dbgs() << "extracting from " << - TRI->getName(PhysReg) << " " << SpilledVReg << '\n'); - - // Deallocate the interfering vreg by removing it from the union. - // A LiveInterval instance may not be in a union during modification! - unassign(SpilledVReg, PhysReg); - - // Spill the extracted interval. - LiveRangeEdit LRE(SpilledVReg, SplitVRegs, *MF, *LIS, VRM); - spiller().spill(LRE); - } - // After extracting segments, the query's results are invalid. But keep the - // contents valid until we're done accessing pendingSpills. 
- Q.clear(); -} // Spill or split all live virtual registers currently unified under PhysReg // that interfere with VirtReg. The newly spilled or split live intervals are @@ -202,22 +171,41 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg, SmallVectorImpl<LiveInterval*> &SplitVRegs) { // Record each interference and determine if all are spillable before mutating // either the union or live intervals. - unsigned NumInterferences = 0; + SmallVector<LiveInterval*, 8> Intfs; + // Collect interferences assigned to any alias of the physical register. - for (const uint16_t *asI = TRI->getOverlaps(PhysReg); *asI; ++asI) { - LiveIntervalUnion::Query &QAlias = query(VirtReg, *asI); - NumInterferences += QAlias.collectInterferingVRegs(); - if (QAlias.seenUnspillableVReg()) { + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); + Q.collectInterferingVRegs(); + if (Q.seenUnspillableVReg()) return false; + for (unsigned i = Q.interferingVRegs().size(); i; --i) { + LiveInterval *Intf = Q.interferingVRegs()[i - 1]; + if (!Intf->isSpillable() || Intf->weight > VirtReg.weight) + return false; + Intfs.push_back(Intf); } } DEBUG(dbgs() << "spilling " << TRI->getName(PhysReg) << " interferences with " << VirtReg << "\n"); - assert(NumInterferences > 0 && "expect interference"); + assert(!Intfs.empty() && "expected interference"); // Spill each interfering vreg allocated to PhysReg or an alias. - for (const uint16_t *AliasI = TRI->getOverlaps(PhysReg); *AliasI; ++AliasI) - spillReg(VirtReg, *AliasI, SplitVRegs); + for (unsigned i = 0, e = Intfs.size(); i != e; ++i) { + LiveInterval &Spill = *Intfs[i]; + + // Skip duplicates. + if (!VRM->hasPhys(Spill.reg)) + continue; + + // Deallocate the interfering vreg by removing it from the union. + // A LiveInterval instance may not be in a union during modification! + Matrix->unassign(Spill); + + // Spill the extracted interval. + LiveRangeEdit LRE(&Spill, SplitVRegs, *MF, *LIS, VRM); + spiller().spill(LRE); + } return true; } @@ -235,49 +223,36 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg, // selectOrSplit(). unsigned RABasic::selectOrSplit(LiveInterval &VirtReg, SmallVectorImpl<LiveInterval*> &SplitVRegs) { - // Check for register mask interference. When live ranges cross calls, the - // set of usable registers is reduced to the callee-saved ones. - bool CrossRegMasks = LIS->checkRegMaskInterference(VirtReg, UsableRegs); - // Populate a list of physical register spill candidates. SmallVector<unsigned, 8> PhysRegSpillCands; // Check for an available register in this class. - ArrayRef<unsigned> Order = - RegClassInfo.getOrder(MRI->getRegClass(VirtReg.reg)); - for (ArrayRef<unsigned>::iterator I = Order.begin(), E = Order.end(); I != E; - ++I) { - unsigned PhysReg = *I; - - // If PhysReg is clobbered by a register mask, it isn't useful for - // allocation or spilling. - if (CrossRegMasks && !UsableRegs.test(PhysReg)) - continue; - - // Check interference and as a side effect, intialize queries for this - // VirtReg and its aliases. - unsigned interfReg = checkPhysRegInterference(VirtReg, PhysReg); - if (interfReg == 0) { - // Found an available register. + AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo); + while (unsigned PhysReg = Order.next()) { + // Check for interference in PhysReg + switch (Matrix->checkInterference(VirtReg, PhysReg)) { + case LiveRegMatrix::IK_Free: + // PhysReg is available, allocate it. 
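
The pattern in the new spillInterferences() above: first collect every interfering range across all register units, bailing out on anything unspillable or heavier than VirtReg, and only then start unassigning. Mutation invalidates the queries, and the same range can appear under several units, hence the hasPhys() duplicate check. A standalone sketch with hypothetical types:

#include <vector>

struct Range { bool Spillable; float Weight; bool Assigned; };

bool spillInterferences(float VirtRegWeight,
                        const std::vector<std::vector<Range *>> &PerUnit) {
  std::vector<Range *> Intfs;
  for (const std::vector<Range *> &Unit : PerUnit)
    for (Range *R : Unit) {
      if (!R->Spillable || R->Weight > VirtRegWeight)
        return false;          // give up before touching anything
      Intfs.push_back(R);
    }
  for (Range *R : Intfs) {
    if (!R->Assigned)
      continue;                // duplicate across units, already handled
    R->Assigned = false;       // unassign, then spill it
  }
  return true;
}
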
return PhysReg; - } - LiveIntervalUnion::Query &IntfQ = query(VirtReg, interfReg); - IntfQ.collectInterferingVRegs(1); - LiveInterval *interferingVirtReg = IntfQ.interferingVRegs().front(); - // The current VirtReg must either be spillable, or one of its interferences - // must have less spill weight. - if (interferingVirtReg->weight < VirtReg.weight ) { + case LiveRegMatrix::IK_VirtReg: + // Only virtual registers in the way, we may be able to spill them. PhysRegSpillCands.push_back(PhysReg); + continue; + + default: + // RegMask or RegUnit interference. + continue; } } + // Try to spill another interfering reg with less spill weight. for (SmallVectorImpl<unsigned>::iterator PhysRegI = PhysRegSpillCands.begin(), - PhysRegE = PhysRegSpillCands.end(); PhysRegI != PhysRegE; ++PhysRegI) { - - if (!spillInterferences(VirtReg, *PhysRegI, SplitVRegs)) continue; + PhysRegE = PhysRegSpillCands.end(); PhysRegI != PhysRegE; ++PhysRegI) { + if (!spillInterferences(VirtReg, *PhysRegI, SplitVRegs)) + continue; - assert(checkPhysRegInterference(VirtReg, *PhysRegI) == 0 && + assert(!Matrix->checkInterference(VirtReg, *PhysRegI) && "Interference after spill."); // Tell the caller to allocate to this newly freed physical register. return *PhysRegI; @@ -287,7 +262,7 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg, DEBUG(dbgs() << "spilling: " << VirtReg << '\n'); if (!VirtReg.isSpillable()) return ~0u; - LiveRangeEdit LRE(VirtReg, SplitVRegs, *MF, *LIS, VRM); + LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM); spiller().spill(LRE); // The live virtual register requesting allocation was spilled, so tell @@ -301,53 +276,17 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) { << ((Value*)mf.getFunction())->getName() << '\n'); MF = &mf; - DEBUG(RMF = &getAnalysis<RenderMachineFunction>()); - - RegAllocBase::init(getAnalysis<VirtRegMap>(), getAnalysis<LiveIntervals>()); + RegAllocBase::init(getAnalysis<VirtRegMap>(), + getAnalysis<LiveIntervals>(), + getAnalysis<LiveRegMatrix>()); SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM)); allocatePhysRegs(); - addMBBLiveIns(MF); - // Diagnostic output before rewriting DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << *VRM << "\n"); - // optional HTML output - DEBUG(RMF->renderMachineFunction("After basic register allocation.", VRM)); - - // FIXME: Verification currently must run before VirtRegRewriter. We should - // make the rewriter a separate pass and override verifyAnalysis instead. When - // that happens, verification naturally falls under VerifyMachineCode. -#ifndef NDEBUG - if (VerifyEnabled) { - // Verify accuracy of LiveIntervals. The standard machine code verifier - // ensures that each LiveIntervals covers all uses of the virtual reg. - - // FIXME: MachineVerifier is badly broken when using the standard - // spiller. Always use -spiller=inline with -verify-regalloc. Even with the - // inline spiller, some tests fail to verify because the coalescer does not - // always generate verifiable code. - MF->verify(this, "In RABasic::verify"); - - // Verify that LiveIntervals are partitioned into unions and disjoint within - // the unions. - verify(); - } -#endif // !NDEBUG - - // Run rewriter - VRM->rewrite(LIS->getSlotIndexes()); - - // Write out new DBG_VALUE instructions. - getAnalysis<LiveDebugVariables>().emitDebugValues(VRM); - - // All machine operands and other references to virtual registers have been - // replaced. Remove the virtual registers and release all the transient data. 
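
The switch in the new selectOrSplit() above sorts checkInterference() results into three buckets: IK_Free is taken immediately, IK_VirtReg is remembered as a spill candidate, and anything else (fixed regunit or regmask interference) disqualifies the register. The enum ordering also explains the later "> IK_VirtReg" test in RAGreedy: kinds above IK_VirtReg cannot be evicted. A standalone sketch with stubbed interference:

#include <vector>

enum InterferenceKind { IK_Free, IK_VirtReg, IK_RegUnit, IK_RegMask };

InterferenceKind checkInterference(unsigned PhysReg) {
  return PhysReg % 2 ? IK_VirtReg : IK_RegUnit; // stub for illustration
}

unsigned pickOrCollect(const std::vector<unsigned> &Order,
                       std::vector<unsigned> &SpillCands) {
  for (unsigned PhysReg : Order) {
    switch (checkInterference(PhysReg)) {
    case IK_Free:
      return PhysReg;                // available, allocate it
    case IK_VirtReg:
      SpillCands.push_back(PhysReg); // maybe evictable later
      continue;
    default:
      continue;                      // fixed interference, unusable
    }
  }
  return 0;
}
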
- VRM->clearAllVirt(); - MRI->clearVirtRegs(); releaseMemory(); - return true; } diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp index e09b7f8..8325f20 100644 --- a/lib/CodeGen/RegAllocFast.cpp +++ b/lib/CodeGen/RegAllocFast.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "regalloc" -#include "RegisterClassInfo.h" #include "llvm/BasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -22,6 +21,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" +#include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Support/CommandLine.h" @@ -77,7 +77,7 @@ namespace { explicit LiveReg(unsigned v) : LastUse(0), VirtReg(v), PhysReg(0), LastOpNum(0), Dirty(false) {} - unsigned getSparseSetKey() const { + unsigned getSparseSetIndex() const { return TargetRegisterInfo::virtReg2Index(VirtReg); } }; @@ -354,8 +354,8 @@ void RAFast::usePhysReg(MachineOperand &MO) { } // Maybe a superregister is reserved? - for (const uint16_t *AS = TRI->getAliasSet(PhysReg); - unsigned Alias = *AS; ++AS) { + for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) { + unsigned Alias = *AI; switch (PhysRegState[Alias]) { case regDisabled: break; @@ -408,8 +408,8 @@ void RAFast::definePhysReg(MachineInstr *MI, unsigned PhysReg, // This is a disabled register, disable all aliases. PhysRegState[PhysReg] = NewState; - for (const uint16_t *AS = TRI->getAliasSet(PhysReg); - unsigned Alias = *AS; ++AS) { + for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) { + unsigned Alias = *AI; switch (unsigned VirtReg = PhysRegState[Alias]) { case regDisabled: break; @@ -456,8 +456,8 @@ unsigned RAFast::calcSpillCost(unsigned PhysReg) const { // This is a disabled register, add up cost of aliases. DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " is disabled.\n"); unsigned Cost = 0; - for (const uint16_t *AS = TRI->getAliasSet(PhysReg); - unsigned Alias = *AS; ++AS) { + for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) { + unsigned Alias = *AI; if (UsedInInstr.test(Alias)) return spillImpossible; switch (unsigned VirtReg = PhysRegState[Alias]) { @@ -659,9 +659,10 @@ RAFast::reloadVirtReg(MachineInstr *MI, unsigned OpNum, // Return true if the operand kills its register. bool RAFast::setPhysReg(MachineInstr *MI, unsigned OpNum, unsigned PhysReg) { MachineOperand &MO = MI->getOperand(OpNum); + bool Dead = MO.isDead(); if (!MO.getSubReg()) { MO.setReg(PhysReg); - return MO.isKill() || MO.isDead(); + return MO.isKill() || Dead; } // Handle subregister index. @@ -674,7 +675,13 @@ bool RAFast::setPhysReg(MachineInstr *MI, unsigned OpNum, unsigned PhysReg) { MI->addRegisterKilled(PhysReg, TRI, true); return true; } - return MO.isDead(); + + // A <def,read-undef> of a sub-register requires an implicit def of the full + // register. 
+ if (MO.isDef() && MO.isUndef()) + MI->addRegisterDefined(PhysReg, TRI); + + return Dead; } // Handle special instruction operand like early clobbers and tied ops when @@ -704,13 +711,10 @@ void RAFast::handleThroughOperands(MachineInstr *MI, if (!MO.isReg() || !MO.isDef()) continue; unsigned Reg = MO.getReg(); if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; - UsedInInstr.set(Reg); - if (ThroughRegs.count(PhysRegState[Reg])) - definePhysReg(MI, Reg, regFree); - for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS) { - UsedInInstr.set(*AS); - if (ThroughRegs.count(PhysRegState[*AS])) - definePhysReg(MI, *AS, regFree); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { + UsedInInstr.set(*AI); + if (ThroughRegs.count(PhysRegState[*AI])) + definePhysReg(MI, *AI, regFree); } } @@ -1029,9 +1033,8 @@ void RAFast::AllocateBasicBlock() { if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; // Look for physreg defs and tied uses. if (!MO.isDef() && !MI->isRegTiedToDefOperand(i)) continue; - UsedInInstr.set(Reg); - for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS) - UsedInInstr.set(*AS); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + UsedInInstr.set(*AI); } } diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp index 3f2a617..6ac5428 100644 --- a/lib/CodeGen/RegAllocGreedy.cpp +++ b/lib/CodeGen/RegAllocGreedy.cpp @@ -16,6 +16,7 @@ #include "AllocationOrder.h" #include "InterferenceCache.h" #include "LiveDebugVariables.h" +#include "LiveRegMatrix.h" #include "RegAllocBase.h" #include "Spiller.h" #include "SpillPlacement.h" @@ -73,7 +74,6 @@ class RAGreedy : public MachineFunctionPass, // analyses SlotIndexes *Indexes; - LiveStacks *LS; MachineDominatorTree *DomTree; MachineLoopInfo *Loops; EdgeBundles *Bundles; @@ -168,19 +168,6 @@ class RAGreedy : public MachineFunctionPass, } }; - // Register mask interference. The current VirtReg is checked for register - // mask interference on entry to selectOrSplit(). If there is no - // interference, UsableRegs is left empty. If there is interference, - // UsableRegs has a bit mask of registers that can be used without register - // mask interference. - BitVector UsableRegs; - - /// clobberedByRegMask - Returns true if PhysReg is not directly usable - /// because of register mask clobbers. - bool clobberedByRegMask(unsigned PhysReg) const { - return !UsableRegs.empty() && !UsableRegs.test(PhysReg); - } - // splitting state. 
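
The loops rewritten above replace null-terminated uint16_t alias arrays with an iterator carrying an isValid() predicate, whose boolean constructor argument also lets the register itself be included in the walk. A minimal standalone iterator with the same shape, over a hypothetical data layout:

#include <cstdint>

class AliasIterator {
  const uint16_t *Pos; // walks a 0-terminated alias list
  uint16_t SelfReg;    // the register itself
  bool EmitSelf;       // yield SelfReg first when IncludeSelf is set

public:
  AliasIterator(const uint16_t *List, uint16_t Self, bool IncludeSelf)
      : Pos(List), SelfReg(Self), EmitSelf(IncludeSelf) {}

  bool isValid() const { return EmitSelf || *Pos != 0; }
  uint16_t operator*() const { return EmitSelf ? SelfReg : *Pos; }
  AliasIterator &operator++() {
    if (EmitSelf)
      EmitSelf = false;
    else
      ++Pos;
    return *this;
  }
};

// Usage mirrors the rewritten loops:
//   for (AliasIterator AI(List, Reg, true); AI.isValid(); ++AI)
//     UsedInInstr.set(*AI);
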
std::auto_ptr<SplitAnalysis> SA; std::auto_ptr<SplitEditor> SE; @@ -286,6 +273,8 @@ private: SmallVectorImpl<LiveInterval*>&); unsigned tryBlockSplit(LiveInterval&, AllocationOrder&, SmallVectorImpl<LiveInterval*>&); + unsigned tryInstructionSplit(LiveInterval&, AllocationOrder&, + SmallVectorImpl<LiveInterval*>&); unsigned tryLocalSplit(LiveInterval&, AllocationOrder&, SmallVectorImpl<LiveInterval*>&); unsigned trySplit(LiveInterval&, AllocationOrder&, @@ -327,6 +316,7 @@ RAGreedy::RAGreedy(): MachineFunctionPass(ID) { initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry()); initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry()); initializeVirtRegMapPass(*PassRegistry::getPassRegistry()); + initializeLiveRegMatrixPass(*PassRegistry::getPassRegistry()); initializeEdgeBundlesPass(*PassRegistry::getPassRegistry()); initializeSpillPlacementPass(*PassRegistry::getPassRegistry()); } @@ -336,6 +326,7 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<AliasAnalysis>(); AU.addPreserved<AliasAnalysis>(); AU.addRequired<LiveIntervals>(); + AU.addPreserved<LiveIntervals>(); AU.addRequired<SlotIndexes>(); AU.addPreserved<SlotIndexes>(); AU.addRequired<LiveDebugVariables>(); @@ -349,6 +340,8 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<MachineLoopInfo>(); AU.addRequired<VirtRegMap>(); AU.addPreserved<VirtRegMap>(); + AU.addRequired<LiveRegMatrix>(); + AU.addPreserved<LiveRegMatrix>(); AU.addRequired<EdgeBundles>(); AU.addRequired<SpillPlacement>(); MachineFunctionPass::getAnalysisUsage(AU); @@ -360,8 +353,8 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const { //===----------------------------------------------------------------------===// bool RAGreedy::LRE_CanEraseVirtReg(unsigned VirtReg) { - if (unsigned PhysReg = VRM->getPhys(VirtReg)) { - unassign(LIS->getInterval(VirtReg), PhysReg); + if (VRM->hasPhys(VirtReg)) { + Matrix->unassign(LIS->getInterval(VirtReg)); return true; } // Unassigned virtreg is probably in the priority queue. @@ -370,13 +363,12 @@ bool RAGreedy::LRE_CanEraseVirtReg(unsigned VirtReg) { } void RAGreedy::LRE_WillShrinkVirtReg(unsigned VirtReg) { - unsigned PhysReg = VRM->getPhys(VirtReg); - if (!PhysReg) + if (!VRM->hasPhys(VirtReg)) return; // Register is assigned, put it back on the queue for reassignment. LiveInterval &LI = LIS->getInterval(VirtReg); - unassign(LI, PhysReg); + Matrix->unassign(LI); enqueue(&LI); } @@ -398,7 +390,6 @@ void RAGreedy::releaseMemory() { SpillerInstance.reset(0); ExtraRegInfo.clear(); GlobalCand.clear(); - RegAllocBase::releaseMemory(); } void RAGreedy::enqueue(LiveInterval *LI) { @@ -450,12 +441,9 @@ unsigned RAGreedy::tryAssign(LiveInterval &VirtReg, SmallVectorImpl<LiveInterval*> &NewVRegs) { Order.rewind(); unsigned PhysReg; - while ((PhysReg = Order.next())) { - if (clobberedByRegMask(PhysReg)) - continue; - if (!checkPhysRegInterference(VirtReg, PhysReg)) + while ((PhysReg = Order.next())) + if (!Matrix->checkInterference(VirtReg, PhysReg)) break; - } if (!PhysReg || Order.isHint(PhysReg)) return PhysReg; @@ -464,7 +452,7 @@ unsigned RAGreedy::tryAssign(LiveInterval &VirtReg, // If we missed a simple hint, try to cheaply evict interference from the // preferred register. 
if (unsigned Hint = MRI->getSimpleHint(VirtReg.reg)) - if (Order.isHint(Hint) && !clobberedByRegMask(Hint)) { + if (Order.isHint(Hint)) { DEBUG(dbgs() << "missed hint " << PrintReg(Hint, TRI) << '\n'); EvictionCost MaxCost(1); if (canEvictInterference(VirtReg, Hint, true, MaxCost)) { @@ -527,6 +515,10 @@ bool RAGreedy::shouldEvict(LiveInterval &A, bool IsHint, /// @returns True when interference can be evicted cheaper than MaxCost. bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg, bool IsHint, EvictionCost &MaxCost) { + // It is only possible to evict virtual register interference. + if (Matrix->checkInterference(VirtReg, PhysReg) > LiveRegMatrix::IK_VirtReg) + return false; + // Find VirtReg's cascade number. This will be unassigned if VirtReg was never // involved in an eviction before. If a cascade number was assigned, deny // evicting anything with the same or a newer cascade number. This prevents @@ -539,8 +531,8 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg, Cascade = NextCascade; EvictionCost Cost; - for (const uint16_t *AliasI = TRI->getOverlaps(PhysReg); *AliasI; ++AliasI) { - LiveIntervalUnion::Query &Q = query(VirtReg, *AliasI); + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); // If there is 10 or more interferences, chances are one is heavier. if (Q.collectInterferingVRegs(10) >= 10) return false; @@ -548,15 +540,21 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg, // Check if any interfering live range is heavier than MaxWeight. for (unsigned i = Q.interferingVRegs().size(); i; --i) { LiveInterval *Intf = Q.interferingVRegs()[i - 1]; - if (TargetRegisterInfo::isPhysicalRegister(Intf->reg)) - return false; + assert(TargetRegisterInfo::isVirtualRegister(Intf->reg) && + "Only expecting virtual register interference from query"); // Never evict spill products. They cannot split or spill. if (getStage(*Intf) == RS_Done) return false; // Once a live range becomes small enough, it is urgent that we find a // register for it. This is indicated by an infinite spill weight. These // urgent live ranges get to evict almost anything. - bool Urgent = !VirtReg.isSpillable() && Intf->isSpillable(); + // + // Also allow urgent evictions of unspillable ranges from a strictly + // larger allocation order. + bool Urgent = !VirtReg.isSpillable() && + (Intf->isSpillable() || + RegClassInfo.getNumAllocatableRegs(MRI->getRegClass(VirtReg.reg)) < + RegClassInfo.getNumAllocatableRegs(MRI->getRegClass(Intf->reg))); // Only evict older cascades or live ranges without a cascade. unsigned IntfCascade = ExtraRegInfo[Intf->reg].Cascade; if (Cascade <= IntfCascade) { @@ -597,19 +595,29 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg, DEBUG(dbgs() << "evicting " << PrintReg(PhysReg, TRI) << " interference: Cascade " << Cascade << '\n'); - for (const uint16_t *AliasI = TRI->getOverlaps(PhysReg); *AliasI; ++AliasI) { - LiveIntervalUnion::Query &Q = query(VirtReg, *AliasI); + + // Collect all interfering virtregs first. 
+ SmallVector<LiveInterval*, 8> Intfs; + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); assert(Q.seenAllInterferences() && "Didn't check all interfererences."); - for (unsigned i = 0, e = Q.interferingVRegs().size(); i != e; ++i) { - LiveInterval *Intf = Q.interferingVRegs()[i]; - unassign(*Intf, VRM->getPhys(Intf->reg)); - assert((ExtraRegInfo[Intf->reg].Cascade < Cascade || - VirtReg.isSpillable() < Intf->isSpillable()) && - "Cannot decrease cascade number, illegal eviction"); - ExtraRegInfo[Intf->reg].Cascade = Cascade; - ++NumEvicted; - NewVRegs.push_back(Intf); - } + ArrayRef<LiveInterval*> IVR = Q.interferingVRegs(); + Intfs.append(IVR.begin(), IVR.end()); + } + + // Evict them second. This will invalidate the queries. + for (unsigned i = 0, e = Intfs.size(); i != e; ++i) { + LiveInterval *Intf = Intfs[i]; + // The same VirtReg may be present in multiple RegUnits. Skip duplicates. + if (!VRM->hasPhys(Intf->reg)) + continue; + Matrix->unassign(*Intf); + assert((ExtraRegInfo[Intf->reg].Cascade < Cascade || + VirtReg.isSpillable() < Intf->isSpillable()) && + "Cannot decrease cascade number, illegal eviction"); + ExtraRegInfo[Intf->reg].Cascade = Cascade; + ++NumEvicted; + NewVRegs.push_back(Intf); } } @@ -636,8 +644,6 @@ unsigned RAGreedy::tryEvict(LiveInterval &VirtReg, Order.rewind(); while (unsigned PhysReg = Order.next()) { - if (clobberedByRegMask(PhysReg)) - continue; if (TRI->getCostPerUse(PhysReg) >= CostPerUseLimit) continue; // The first use of a callee-saved register in a function has cost 1. @@ -1183,7 +1189,7 @@ unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order, return 0; // Prepare split editor. - LiveRangeEdit LREdit(VirtReg, NewVRegs, *MF, *LIS, VRM, this); + LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this); SE->reset(LREdit, SplitSpillMode); // Assign all edge bundles to the preferred candidate, or NoCand. @@ -1231,7 +1237,7 @@ unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order, assert(&SA->getParent() == &VirtReg && "Live range wasn't analyzed"); unsigned Reg = VirtReg.reg; bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg)); - LiveRangeEdit LREdit(VirtReg, NewVRegs, *MF, *LIS, VRM, this); + LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this); SE->reset(LREdit, SplitSpillMode); ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks(); for (unsigned i = 0; i != UseBlocks.size(); ++i) { @@ -1265,6 +1271,65 @@ unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order, return 0; } + +//===----------------------------------------------------------------------===// +// Per-Instruction Splitting +//===----------------------------------------------------------------------===// + +/// tryInstructionSplit - Split a live range around individual instructions. +/// This is normally not worthwhile since the spiller is doing essentially the +/// same thing. However, when the live range is in a constrained register +/// class, it may help to insert copies such that parts of the live range can +/// be moved to a larger register class. +/// +/// This is similar to spilling to a larger register class. +unsigned +RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order, + SmallVectorImpl<LiveInterval*> &NewVRegs) { + // There is no point to this if there are no larger sub-classes. 
+ if (!RegClassInfo.isProperSubClass(MRI->getRegClass(VirtReg.reg))) + return 0; + + // Always enable split spill mode, since we're effectively spilling to a + // register. + LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this); + SE->reset(LREdit, SplitEditor::SM_Size); + + ArrayRef<SlotIndex> Uses = SA->getUseSlots(); + if (Uses.size() <= 1) + return 0; + + DEBUG(dbgs() << "Split around " << Uses.size() << " individual instrs.\n"); + + // Split around every non-copy instruction. + for (unsigned i = 0; i != Uses.size(); ++i) { + if (const MachineInstr *MI = Indexes->getInstructionFromIndex(Uses[i])) + if (MI->isFullCopy()) { + DEBUG(dbgs() << " skip:\t" << Uses[i] << '\t' << *MI); + continue; + } + SE->openIntv(); + SlotIndex SegStart = SE->enterIntvBefore(Uses[i]); + SlotIndex SegStop = SE->leaveIntvAfter(Uses[i]); + SE->useIntv(SegStart, SegStop); + } + + if (LREdit.empty()) { + DEBUG(dbgs() << "All uses were copies.\n"); + return 0; + } + + SmallVector<unsigned, 8> IntvMap; + SE->finish(&IntvMap); + DebugVars->splitRegister(VirtReg.reg, LREdit.regs()); + ExtraRegInfo.resize(MRI->getNumVirtRegs()); + + // Assign all new registers to RS_Spill. This was the last chance. + setStage(LREdit.begin(), LREdit.end(), RS_Spill); + return 0; +} + + //===----------------------------------------------------------------------===// // Local Splitting //===----------------------------------------------------------------------===// @@ -1291,9 +1356,9 @@ void RAGreedy::calcGapWeights(unsigned PhysReg, GapWeight.assign(NumGaps, 0.0f); // Add interference from each overlapping register. - for (const uint16_t *AI = TRI->getOverlaps(PhysReg); *AI; ++AI) { - if (!query(const_cast<LiveInterval&>(SA->getParent()), *AI) - .checkInterference()) + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + if (!Matrix->query(const_cast<LiveInterval&>(SA->getParent()), *Units) + .checkInterference()) continue; // We know that VirtReg is a continuous interval from FirstInstr to @@ -1303,7 +1368,8 @@ void RAGreedy::calcGapWeights(unsigned PhysReg, // surrounding the instruction. The exception is interference before // StartIdx and after StopIdx. // - LiveIntervalUnion::SegmentIter IntI = getLiveUnion(*AI).find(StartIdx); + LiveIntervalUnion::SegmentIter IntI = + Matrix->getLiveUnions()[*Units] .find(StartIdx); for (unsigned Gap = 0; IntI.valid() && IntI.start() < StopIdx; ++IntI) { // Skip the gaps before IntI. while (Uses[Gap+1].getBoundaryIndex() < IntI.start()) @@ -1323,6 +1389,30 @@ void RAGreedy::calcGapWeights(unsigned PhysReg, break; } } + + // Add fixed interference. + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + const LiveInterval &LI = LIS->getRegUnit(*Units); + LiveInterval::const_iterator I = LI.find(StartIdx); + LiveInterval::const_iterator E = LI.end(); + + // Same loop as above. Mark any overlapped gaps as HUGE_VALF. + for (unsigned Gap = 0; I != E && I->start < StopIdx; ++I) { + while (Uses[Gap+1].getBoundaryIndex() < I->start) + if (++Gap == NumGaps) + break; + if (Gap == NumGaps) + break; + + for (; Gap != NumGaps; ++Gap) { + GapWeight[Gap] = HUGE_VALF; + if (Uses[Gap+1].getBaseIndex() >= I->end) + break; + } + if (Gap == NumGaps) + break; + } + } } /// tryLocalSplit - Try to split VirtReg into smaller intervals inside its only @@ -1355,7 +1445,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, // If VirtReg is live across any register mask operands, compute a list of // gaps with register masks. 
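
Conceptually, tryInstructionSplit() above opens one tiny interval per non-copy use: each use slot U becomes a segment from just before U to just after it, and everything between segments stays in the spillable remainder, which can then live in a larger register class. A toy version over integer slots (illustrative only; real slots are SlotIndexes and the boundaries come from enterIntvBefore/leaveIntvAfter):

#include <utility>
#include <vector>

std::vector<std::pair<int, int>>
splitAroundUses(const std::vector<int> &Uses,
                const std::vector<bool> &IsCopy) {
  std::vector<std::pair<int, int>> Segments;
  for (size_t i = 0; i != Uses.size(); ++i) {
    if (IsCopy[i])
      continue; // full copies can stay in the constrained class
    Segments.push_back({Uses[i] - 1, Uses[i] + 1}); // enter before, leave after
  }
  return Segments;
}
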
SmallVector<unsigned, 8> RegMaskGaps; - if (!UsableRegs.empty()) { + if (Matrix->checkRegMaskInterference(VirtReg)) { // Get regmask slots for the whole block. ArrayRef<SlotIndex> RMS = LIS->getRegMaskSlotsInBlock(BI.MBB->getNumber()); DEBUG(dbgs() << RMS.size() << " regmasks in block:"); @@ -1417,7 +1507,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, calcGapWeights(PhysReg, GapWeight); // Remove any gaps with regmask clobbers. - if (clobberedByRegMask(PhysReg)) + if (Matrix->checkRegMaskInterference(VirtReg, PhysReg)) for (unsigned i = 0, e = RegMaskGaps.size(); i != e; ++i) GapWeight[RegMaskGaps[i]] = HUGE_VALF; @@ -1512,7 +1602,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, << '-' << Uses[BestAfter] << ", " << BestDiff << ", " << (BestAfter - BestBefore + 1) << " instrs\n"); - LiveRangeEdit LREdit(VirtReg, NewVRegs, *MF, *LIS, VRM, this); + LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this); SE->reset(LREdit); SE->openIntv(); @@ -1561,7 +1651,10 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order, if (LIS->intervalIsInOneMBB(VirtReg)) { NamedRegionTimer T("Local Splitting", TimerGroupName, TimePassesIsEnabled); SA->analyze(&VirtReg); - return tryLocalSplit(VirtReg, Order, NewVRegs); + unsigned PhysReg = tryLocalSplit(VirtReg, Order, NewVRegs); + if (PhysReg || !NewVRegs.empty()) + return PhysReg; + return tryInstructionSplit(VirtReg, Order, NewVRegs); } NamedRegionTimer T("Global Splitting", TimerGroupName, TimePassesIsEnabled); @@ -1574,7 +1667,7 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order, // an assertion when the coalescer is fixed. if (SA->didRepairRange()) { // VirtReg has changed, so all cached queries are invalid. - invalidateVirtRegs(); + Matrix->invalidateVirtRegs(); if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs)) return PhysReg; } @@ -1599,11 +1692,6 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order, unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg, SmallVectorImpl<LiveInterval*> &NewVRegs) { - // Check if VirtReg is live across any calls. - UsableRegs.clear(); - if (LIS->checkRegMaskInterference(VirtReg, UsableRegs)) - DEBUG(dbgs() << "Live across regmasks.\n"); - // First try assigning a free register. AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo); if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs)) @@ -1644,7 +1732,7 @@ unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg, // Finally spill VirtReg itself. 
NamedRegionTimer T("Spiller", TimerGroupName, TimePassesIsEnabled); - LiveRangeEdit LRE(VirtReg, NewVRegs, *MF, *LIS, VRM, this); + LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this); spiller().spill(LRE); setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done); @@ -1665,7 +1753,9 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { if (VerifyEnabled) MF->verify(this, "Before greedy register allocator"); - RegAllocBase::init(getAnalysis<VirtRegMap>(), getAnalysis<LiveIntervals>()); + RegAllocBase::init(getAnalysis<VirtRegMap>(), + getAnalysis<LiveIntervals>(), + getAnalysis<LiveRegMatrix>()); Indexes = &getAnalysis<SlotIndexes>(); DomTree = &getAnalysis<MachineDominatorTree>(); SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM)); @@ -1679,30 +1769,10 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { ExtraRegInfo.clear(); ExtraRegInfo.resize(MRI->getNumVirtRegs()); NextCascade = 1; - IntfCache.init(MF, &getLiveUnion(0), Indexes, LIS, TRI); + IntfCache.init(MF, Matrix->getLiveUnions(), Indexes, LIS, TRI); GlobalCand.resize(32); // This will grow as needed. allocatePhysRegs(); - addMBBLiveIns(MF); - LIS->addKillFlags(); - - // Run rewriter - { - NamedRegionTimer T("Rewriter", TimerGroupName, TimePassesIsEnabled); - VRM->rewrite(Indexes); - } - - // Write out new DBG_VALUE instructions. - { - NamedRegionTimer T("Emit Debug Info", TimerGroupName, TimePassesIsEnabled); - DebugVars->emitDebugValues(VRM); - } - - // All machine operands and other references to virtual registers have been - // replaced. Remove the virtual registers and release all the transient data. - VRM->clearAllVirt(); - MRI->clearVirtRegs(); releaseMemory(); - return true; } diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp index a284614..d0db26b 100644 --- a/lib/CodeGen/RegAllocPBQP.cpp +++ b/lib/CodeGen/RegAllocPBQP.cpp @@ -31,7 +31,6 @@ #define DEBUG_TYPE "regalloc" -#include "RenderMachineFunction.h" #include "Spiller.h" #include "VirtRegMap.h" #include "RegisterCoalescer.h" @@ -98,7 +97,6 @@ public: initializeLiveStacksPass(*PassRegistry::getPassRegistry()); initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry()); initializeVirtRegMapPass(*PassRegistry::getPassRegistry()); - initializeRenderMachineFunctionPass(*PassRegistry::getPassRegistry()); } /// Return the pass name. @@ -134,7 +132,6 @@ private: const TargetInstrInfo *tii; const MachineLoopInfo *loopInfo; MachineRegisterInfo *mri; - RenderMachineFunction *rmf; std::auto_ptr<Spiller> spiller; LiveIntervals *lis; @@ -196,7 +193,7 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilder::build(MachineFunction *mf, const RegSet &vregs) { typedef std::vector<const LiveInterval*> LIVector; - ArrayRef<SlotIndex> regMaskSlots = lis->getRegMaskSlots(); + LiveIntervals *LIS = const_cast<LiveIntervals*>(lis); MachineRegisterInfo *mri = &mf->getRegInfo(); const TargetRegisterInfo *tri = mf->getTarget().getRegisterInfo(); @@ -205,12 +202,11 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilder::build(MachineFunction *mf, RegSet pregs; // Collect the set of preg intervals, record that they're used in the MF. 
- for (LiveIntervals::const_iterator itr = lis->begin(), end = lis->end(); - itr != end; ++itr) { - if (TargetRegisterInfo::isPhysicalRegister(itr->first)) { - pregs.insert(itr->first); - mri->setPhysRegUsed(itr->first); - } + for (unsigned Reg = 1, e = tri->getNumRegs(); Reg != e; ++Reg) { + if (mri->def_empty(Reg)) + continue; + pregs.insert(Reg); + mri->setPhysRegUsed(Reg); } BitVector reservedRegs = tri->getReservedRegs(*mf); @@ -220,7 +216,11 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilder::build(MachineFunction *mf, vregItr != vregEnd; ++vregItr) { unsigned vreg = *vregItr; const TargetRegisterClass *trc = mri->getRegClass(vreg); - const LiveInterval *vregLI = &lis->getInterval(vreg); + LiveInterval *vregLI = &LIS->getInterval(vreg); + + // Record any overlaps with regmask operands. + BitVector regMaskOverlaps(tri->getNumRegs()); + LIS->checkRegMaskInterference(*vregLI, regMaskOverlaps); // Compute an initial allowed set for the current vreg. typedef std::vector<unsigned> VRAllowed; @@ -228,80 +228,26 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilder::build(MachineFunction *mf, ArrayRef<uint16_t> rawOrder = trc->getRawAllocationOrder(*mf); for (unsigned i = 0; i != rawOrder.size(); ++i) { unsigned preg = rawOrder[i]; - if (!reservedRegs.test(preg)) { - vrAllowed.push_back(preg); - } - } - - RegSet overlappingPRegs; - - // Record physical registers whose ranges overlap. - for (RegSet::const_iterator pregItr = pregs.begin(), - pregEnd = pregs.end(); - pregItr != pregEnd; ++pregItr) { - unsigned preg = *pregItr; - const LiveInterval *pregLI = &lis->getInterval(preg); - - if (pregLI->empty()) { + if (reservedRegs.test(preg)) continue; - } - if (vregLI->overlaps(*pregLI)) - overlappingPRegs.insert(preg); - } + // vregLI crosses a regmask operand that clobbers preg. + if (!regMaskOverlaps.empty() && !regMaskOverlaps.test(preg)) + continue; - // Record any overlaps with regmask operands. - BitVector regMaskOverlaps(tri->getNumRegs()); - for (ArrayRef<SlotIndex>::iterator rmItr = regMaskSlots.begin(), - rmEnd = regMaskSlots.end(); - rmItr != rmEnd; ++rmItr) { - SlotIndex rmIdx = *rmItr; - if (vregLI->liveAt(rmIdx)) { - MachineInstr *rmMI = lis->getInstructionFromIndex(rmIdx); - const uint32_t* regMask = 0; - for (MachineInstr::mop_iterator mopItr = rmMI->operands_begin(), - mopEnd = rmMI->operands_end(); - mopItr != mopEnd; ++mopItr) { - if (mopItr->isRegMask()) { - regMask = mopItr->getRegMask(); - break; - } + // vregLI overlaps fixed regunit interference. + bool Interference = false; + for (MCRegUnitIterator Units(preg, tri); Units.isValid(); ++Units) { + if (vregLI->overlaps(LIS->getRegUnit(*Units))) { + Interference = true; + break; } - assert(regMask != 0 && "Couldn't find register mask."); - regMaskOverlaps.setBitsNotInMask(regMask); } - } + if (Interference) + continue; - for (unsigned preg = 0; preg < tri->getNumRegs(); ++preg) { - if (regMaskOverlaps.test(preg)) - overlappingPRegs.insert(preg); - } - - for (RegSet::const_iterator pregItr = overlappingPRegs.begin(), - pregEnd = overlappingPRegs.end(); - pregItr != pregEnd; ++pregItr) { - unsigned preg = *pregItr; - - // Remove the register from the allowed set. - VRAllowed::iterator eraseItr = - std::find(vrAllowed.begin(), vrAllowed.end(), preg); - - if (eraseItr != vrAllowed.end()) { - vrAllowed.erase(eraseItr); - } - - // Also remove any aliases. 
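
The rewritten PBQP loop above builds the allowed set by filtering the raw allocation order up front instead of erasing overlapping registers afterwards: a preg survives only if it is not reserved, not clobbered by a regmask the vreg crosses, and free of fixed interference on all of its register units. The standalone shape of that filter, with stub predicates in place of the real queries:

#include <vector>

// Stub predicates for illustration only.
bool isReserved(unsigned PReg) { return PReg == 0; }
bool clobberedByRegMask(unsigned) { return false; }
bool overlapsFixedRegUnit(unsigned) { return false; }

std::vector<unsigned> buildAllowedSet(const std::vector<unsigned> &RawOrder) {
  std::vector<unsigned> Allowed;
  for (unsigned PReg : RawOrder) {
    if (isReserved(PReg) || clobberedByRegMask(PReg) ||
        overlapsFixedRegUnit(PReg))
      continue; // filtered out up front instead of erased afterwards
    Allowed.push_back(PReg);
  }
  return Allowed;
}
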
- const uint16_t *aliasItr = tri->getAliasSet(preg); - if (aliasItr != 0) { - for (; *aliasItr != 0; ++aliasItr) { - VRAllowed::iterator eraseItr = - std::find(vrAllowed.begin(), vrAllowed.end(), *aliasItr); - - if (eraseItr != vrAllowed.end()) { - vrAllowed.erase(eraseItr); - } - } - } + // preg is usable for this virtual register. + vrAllowed.push_back(preg); } // Construct the node. @@ -379,7 +325,7 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilderWithCoalescing::build( PBQP::Graph &g = p->getGraph(); const TargetMachine &tm = mf->getTarget(); - CoalescerPair cp(*tm.getInstrInfo(), *tm.getRegisterInfo()); + CoalescerPair cp(*tm.getRegisterInfo()); // Scan the machine function and add a coalescing cost whenever CoalescerPair // gives the Ok. @@ -498,21 +444,17 @@ void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const { au.addRequired<MachineLoopInfo>(); au.addPreserved<MachineLoopInfo>(); au.addRequired<VirtRegMap>(); - au.addRequired<RenderMachineFunction>(); MachineFunctionPass::getAnalysisUsage(au); } void RegAllocPBQP::findVRegIntervalsToAlloc() { // Iterate over all live ranges. - for (LiveIntervals::iterator itr = lis->begin(), end = lis->end(); - itr != end; ++itr) { - - // Ignore physical ones. - if (TargetRegisterInfo::isPhysicalRegister(itr->first)) + for (unsigned i = 0, e = mri->getNumVirtRegs(); i != e; ++i) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + if (mri->reg_nodbg_empty(Reg)) continue; - - LiveInterval *li = itr->second; + LiveInterval *li = &lis->getInterval(Reg); // If this live interval is non-empty we will use pbqp to allocate it. // Empty intervals we allocate in a simple post-processing stage in @@ -544,16 +486,17 @@ bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAProblem &problem, if (problem.isPRegOption(vreg, alloc)) { unsigned preg = problem.getPRegForOption(vreg, alloc); - DEBUG(dbgs() << "VREG " << vreg << " -> " << tri->getName(preg) << "\n"); + DEBUG(dbgs() << "VREG " << PrintReg(vreg, tri) << " -> " + << tri->getName(preg) << "\n"); assert(preg != 0 && "Invalid preg selected."); vrm->assignVirt2Phys(vreg, preg); } else if (problem.isSpillOption(vreg, alloc)) { vregsToAlloc.erase(vreg); SmallVector<LiveInterval*, 8> newSpills; - LiveRangeEdit LRE(lis->getInterval(vreg), newSpills, *mf, *lis, vrm); + LiveRangeEdit LRE(&lis->getInterval(vreg), newSpills, *mf, *lis, vrm); spiller->spill(LRE); - DEBUG(dbgs() << "VREG " << vreg << " -> SPILLED (Cost: " + DEBUG(dbgs() << "VREG " << PrintReg(vreg, tri) << " -> SPILLED (Cost: " << LRE.getParent().weight << ", New vregs: "); // Copy any newly inserted live intervals into the list of regs to @@ -561,7 +504,7 @@ bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAProblem &problem, for (LiveRangeEdit::iterator itr = LRE.begin(), end = LRE.end(); itr != end; ++itr) { assert(!(*itr)->empty() && "Empty spill range."); - DEBUG(dbgs() << (*itr)->reg << " "); + DEBUG(dbgs() << PrintReg((*itr)->reg, tri) << " "); vregsToAlloc.insert((*itr)->reg); } @@ -579,9 +522,6 @@ bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAProblem &problem, void RegAllocPBQP::finalizeAlloc() const { - typedef LiveIntervals::iterator LIIterator; - typedef LiveInterval::Ranges::const_iterator LRIterator; - // First allocate registers for the empty intervals. for (RegSet::const_iterator itr = emptyIntervalVRegs.begin(), end = emptyIntervalVRegs.end(); @@ -597,51 +537,6 @@ void RegAllocPBQP::finalizeAlloc() const { vrm->assignVirt2Phys(li->reg, physReg); } - - // Finally iterate over the basic blocks to compute and set the live-in sets. 
- SmallVector<MachineBasicBlock*, 8> liveInMBBs; - MachineBasicBlock *entryMBB = &*mf->begin(); - - for (LIIterator liItr = lis->begin(), liEnd = lis->end(); - liItr != liEnd; ++liItr) { - - const LiveInterval *li = liItr->second; - unsigned reg = 0; - - // Get the physical register for this interval - if (TargetRegisterInfo::isPhysicalRegister(li->reg)) { - reg = li->reg; - } else if (vrm->isAssignedReg(li->reg)) { - reg = vrm->getPhys(li->reg); - } else { - // Ranges which are assigned a stack slot only are ignored. - continue; - } - - if (reg == 0) { - // Filter out zero regs - they're for intervals that were spilled. - continue; - } - - // Iterate over the ranges of the current interval... - for (LRIterator lrItr = li->begin(), lrEnd = li->end(); - lrItr != lrEnd; ++lrItr) { - - // Find the set of basic blocks which this range is live into... - if (lis->findLiveInMBBs(lrItr->start, lrItr->end, liveInMBBs)) { - // And add the physreg for this interval to their live-in sets. - for (unsigned i = 0; i != liveInMBBs.size(); ++i) { - if (liveInMBBs[i] != entryMBB) { - if (!liveInMBBs[i]->isLiveIn(reg)) { - liveInMBBs[i]->addLiveIn(reg); - } - } - } - liveInMBBs.clear(); - } - } - } - } bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { @@ -655,7 +550,6 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { lis = &getAnalysis<LiveIntervals>(); lss = &getAnalysis<LiveStacks>(); loopInfo = &getAnalysis<MachineLoopInfo>(); - rmf = &getAnalysis<RenderMachineFunction>(); vrm = &getAnalysis<VirtRegMap>(); spiller.reset(createInlineSpiller(*this, MF, *vrm)); @@ -719,22 +613,11 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { // Finalise allocation, allocate empty ranges. finalizeAlloc(); - - rmf->renderMachineFunction("After PBQP register allocation.", vrm); - vregsToAlloc.clear(); emptyIntervalVRegs.clear(); DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << *vrm << "\n"); - // Run rewriter - vrm->rewrite(lis->getSlotIndexes()); - - // All machine operands and other references to virtual registers have been - // replaced. Remove the virtual registers. - vrm->clearAllVirt(); - mri->clearVirtRegs(); - return true; } diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp index 17165fa..652bc30 100644 --- a/lib/CodeGen/RegisterClassInfo.cpp +++ b/lib/CodeGen/RegisterClassInfo.cpp @@ -15,8 +15,8 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "regalloc" -#include "RegisterClassInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -50,9 +50,8 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) { CSRNum.clear(); CSRNum.resize(TRI->getNumRegs(), 0); for (unsigned N = 0; unsigned Reg = CSR[N]; ++N) - for (const uint16_t *AS = TRI->getOverlaps(Reg); - unsigned Alias = *AS; ++AS) - CSRNum[Alias] = N + 1; // 0 means no CSR, 1 means CalleeSaved[0], ... + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + CSRNum[*AI] = N + 1; // 0 means no CSR, 1 means CalleeSaved[0], ... 
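
The CSRNum map filled in above packs "no CSR" and "index into CalleeSaved[]" into one small integer by storing index+1 and reserving 0 for none, which is what the comment "0 means no CSR, 1 means CalleeSaved[0]" records and what getLastCalleeSavedAlias() decodes with N-1. A sketch of the encoding (hypothetical container types):

#include <cstdint>
#include <vector>

struct CSRMap {
  std::vector<uint8_t> CSRNum;       // reg -> CSR index + 1, 0 = none
  std::vector<uint16_t> CalleeSaved; // the CSR list itself

  unsigned lastCalleeSavedAlias(unsigned Reg) const {
    if (unsigned N = CSRNum[Reg])
      return CalleeSaved[N - 1];
    return 0; // Reg overlaps no callee-saved register
  }
};
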
Update = true; } CalleeSaved = CSR; diff --git a/lib/CodeGen/RegisterClassInfo.h b/lib/CodeGen/RegisterClassInfo.h deleted file mode 100644 index 400e1f4..0000000 --- a/lib/CodeGen/RegisterClassInfo.h +++ /dev/null @@ -1,132 +0,0 @@ -//===-- RegisterClassInfo.h - Dynamic Register Class Info -*- C++ -*-------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the RegisterClassInfo class which provides dynamic -// information about target register classes. Callee saved and reserved -// registers depends on calling conventions and other dynamic information, so -// some things cannot be determined statically. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CODEGEN_REGISTERCLASSINFO_H -#define LLVM_CODEGEN_REGISTERCLASSINFO_H - -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/OwningPtr.h" -#include "llvm/Target/TargetRegisterInfo.h" - -namespace llvm { - -class RegisterClassInfo { - struct RCInfo { - unsigned Tag; - unsigned NumRegs; - bool ProperSubClass; - OwningArrayPtr<unsigned> Order; - - RCInfo() : Tag(0), NumRegs(0), ProperSubClass(false) {} - operator ArrayRef<unsigned>() const { - return makeArrayRef(Order.get(), NumRegs); - } - }; - - // Brief cached information for each register class. - OwningArrayPtr<RCInfo> RegClass; - - // Tag changes whenever cached information needs to be recomputed. An RCInfo - // entry is valid when its tag matches. - unsigned Tag; - - const MachineFunction *MF; - const TargetRegisterInfo *TRI; - - // Callee saved registers of last MF. Assumed to be valid until the next - // runOnFunction() call. - const uint16_t *CalleeSaved; - - // Map register number to CalleeSaved index + 1; - SmallVector<uint8_t, 4> CSRNum; - - // Reserved registers in the current MF. - BitVector Reserved; - - // Compute all information about RC. - void compute(const TargetRegisterClass *RC) const; - - // Return an up-to-date RCInfo for RC. - const RCInfo &get(const TargetRegisterClass *RC) const { - const RCInfo &RCI = RegClass[RC->getID()]; - if (Tag != RCI.Tag) - compute(RC); - return RCI; - } - -public: - RegisterClassInfo(); - - /// runOnFunction - Prepare to answer questions about MF. This must be called - /// before any other methods are used. - void runOnMachineFunction(const MachineFunction &MF); - - /// getNumAllocatableRegs - Returns the number of actually allocatable - /// registers in RC in the current function. - unsigned getNumAllocatableRegs(const TargetRegisterClass *RC) const { - return get(RC).NumRegs; - } - - /// getOrder - Returns the preferred allocation order for RC. The order - /// contains no reserved registers, and registers that alias callee saved - /// registers come last. - ArrayRef<unsigned> getOrder(const TargetRegisterClass *RC) const { - return get(RC); - } - - /// isProperSubClass - Returns true if RC has a legal super-class with more - /// allocatable registers. - /// - /// Register classes like GR32_NOSP are not proper sub-classes because %esp - /// is not allocatable. Similarly, tGPR is not a proper sub-class in Thumb - /// mode because the GPR super-class is not legal. 
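
getOrder(), documented in the header being deleted above, promises an allocation order with no reserved registers and with CSR-aliasing registers last, so cheap registers are tried first. One way to get that ordering while preserving the target's relative order is a stable partition; a minimal sketch under the assumption that AliasesCSR is indexed by register number (the real computation also drops reserved registers):

#include <algorithm>
#include <vector>

std::vector<unsigned> computeOrder(std::vector<unsigned> Order,
                                   const std::vector<bool> &AliasesCSR) {
  std::stable_partition(Order.begin(), Order.end(),
                        [&](unsigned Reg) { return !AliasesCSR[Reg]; });
  return Order;
}
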
- bool isProperSubClass(const TargetRegisterClass *RC) const { - return get(RC).ProperSubClass; - } - - /// getLastCalleeSavedAlias - Returns the last callee saved register that - /// overlaps PhysReg, or 0 if Reg doesn't overlap a CSR. - unsigned getLastCalleeSavedAlias(unsigned PhysReg) const { - assert(TargetRegisterInfo::isPhysicalRegister(PhysReg)); - if (unsigned N = CSRNum[PhysReg]) - return CalleeSaved[N-1]; - return 0; - } - - /// isReserved - Returns true when PhysReg is a reserved register. - /// - /// Reserved registers may belong to an allocatable register class, but the - /// target has explicitly requested that they are not used. - /// - bool isReserved(unsigned PhysReg) const { - return Reserved.test(PhysReg); - } - - /// isAllocatable - Returns true when PhysReg belongs to an allocatable - /// register class and it hasn't been reserved. - /// - /// Allocatable registers may show up in the allocation order of some virtual - /// register, so a register allocator needs to track its liveness and - /// availability. - bool isAllocatable(unsigned PhysReg) const { - return TRI->isInAllocatableClass(PhysReg) && !isReserved(PhysReg); - } -}; -} // end namespace llvm - -#endif - diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp index 75f88ca..733312f 100644 --- a/lib/CodeGen/RegisterCoalescer.cpp +++ b/lib/CodeGen/RegisterCoalescer.cpp @@ -16,34 +16,35 @@ #define DEBUG_TYPE "regalloc" #include "RegisterCoalescer.h" #include "LiveDebugVariables.h" -#include "RegisterClassInfo.h" #include "VirtRegMap.h" #include "llvm/Pass.h" #include "llvm/Value.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" +#include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/OwningPtr.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegisterInfo.h" #include <algorithm> #include <cmath> using namespace llvm; @@ -53,8 +54,6 @@ STATISTIC(numCrossRCs , "Number of cross class joins performed"); STATISTIC(numCommutes , "Number of instruction commuting performed"); STATISTIC(numExtends , "Number of copies extended"); STATISTIC(NumReMats , "Number of instructions re-materialized"); -STATISTIC(numPeep , "Number of identity moves eliminated after coalescing"); -STATISTIC(numAborts , "Number of times interval joining 
aborted"); STATISTIC(NumInflated , "Number of register classes inflated"); static cl::opt<bool> @@ -63,22 +62,13 @@ EnableJoining("join-liveintervals", cl::init(true)); static cl::opt<bool> -DisableCrossClassJoin("disable-cross-class-join", - cl::desc("Avoid coalescing cross register class copies"), - cl::init(false), cl::Hidden); - -static cl::opt<bool> -EnablePhysicalJoin("join-physregs", - cl::desc("Join physical register copies"), - cl::init(false), cl::Hidden); - -static cl::opt<bool> VerifyCoalescing("verify-coalescing", cl::desc("Verify machine instrs before and after register coalescing"), cl::Hidden); namespace { - class RegisterCoalescer : public MachineFunctionPass { + class RegisterCoalescer : public MachineFunctionPass, + private LiveRangeEdit::Delegate { MachineFunction* MF; MachineRegisterInfo* MRI; const TargetMachine* TM; @@ -90,87 +80,83 @@ namespace { AliasAnalysis *AA; RegisterClassInfo RegClassInfo; - /// JoinedCopies - Keep track of copies eliminated due to coalescing. - /// - SmallPtrSet<MachineInstr*, 32> JoinedCopies; + /// WorkList - Copy instructions yet to be coalesced. + SmallVector<MachineInstr*, 8> WorkList; + + /// ErasedInstrs - Set of instruction pointers that have been erased, and + /// that may be present in WorkList. + SmallPtrSet<MachineInstr*, 8> ErasedInstrs; + + /// Dead instructions that are about to be deleted. + SmallVector<MachineInstr*, 8> DeadDefs; - /// ReMatCopies - Keep track of copies eliminated due to remat. - /// - SmallPtrSet<MachineInstr*, 32> ReMatCopies; + /// Virtual registers to be considered for register class inflation. + SmallVector<unsigned, 8> InflateRegs; - /// ReMatDefs - Keep track of definition instructions which have - /// been remat'ed. - SmallPtrSet<MachineInstr*, 8> ReMatDefs; + /// Recursively eliminate dead defs in DeadDefs. + void eliminateDeadDefs(); - /// joinIntervals - join compatible live intervals - void joinIntervals(); + /// LiveRangeEdit callback. + void LRE_WillEraseInstruction(MachineInstr *MI); - /// CopyCoalesceInMBB - Coalesce copies in the specified MBB, putting - /// copies that cannot yet be coalesced into the "TryAgain" list. - void CopyCoalesceInMBB(MachineBasicBlock *MBB, - std::vector<MachineInstr*> &TryAgain); + /// joinAllIntervals - join compatible live intervals + void joinAllIntervals(); - /// JoinCopy - Attempt to join intervals corresponding to SrcReg/DstReg, + /// copyCoalesceInMBB - Coalesce copies in the specified MBB, putting + /// copies that cannot yet be coalesced into WorkList. + void copyCoalesceInMBB(MachineBasicBlock *MBB); + + /// copyCoalesceWorkList - Try to coalesce all copies in WorkList after + /// position From. Return true if any progress was made. + bool copyCoalesceWorkList(unsigned From = 0); + + /// joinCopy - Attempt to join intervals corresponding to SrcReg/DstReg, /// which are the src/dst of the copy instruction CopyMI. This returns /// true if the copy was successfully coalesced away. If it is not /// currently possible to coalesce this interval, but it may be possible if /// other things get coalesced, then it returns true by reference in /// 'Again'. - bool JoinCopy(MachineInstr *TheCopy, bool &Again); + bool joinCopy(MachineInstr *TheCopy, bool &Again); - /// JoinIntervals - Attempt to join these two intervals. On failure, this + /// joinIntervals - Attempt to join these two intervals. On failure, this /// returns false. The output "SrcInt" will not have been modified, so we /// can use this information below to update aliases. 
- bool JoinIntervals(CoalescerPair &CP); + bool joinIntervals(CoalescerPair &CP); - /// AdjustCopiesBackFrom - We found a non-trivially-coalescable copy. If + /// Attempt joining with a reserved physreg. + bool joinReservedPhysReg(CoalescerPair &CP); + + /// adjustCopiesBackFrom - We found a non-trivially-coalescable copy. If /// the source value number is defined by a copy from the destination reg /// see if we can merge these two destination reg valno# into a single /// value number, eliminating a copy. - bool AdjustCopiesBackFrom(const CoalescerPair &CP, MachineInstr *CopyMI); + bool adjustCopiesBackFrom(const CoalescerPair &CP, MachineInstr *CopyMI); - /// HasOtherReachingDefs - Return true if there are definitions of IntB + /// hasOtherReachingDefs - Return true if there are definitions of IntB /// other than BValNo val# that can reach uses of AValno val# of IntA. - bool HasOtherReachingDefs(LiveInterval &IntA, LiveInterval &IntB, + bool hasOtherReachingDefs(LiveInterval &IntA, LiveInterval &IntB, VNInfo *AValNo, VNInfo *BValNo); - /// RemoveCopyByCommutingDef - We found a non-trivially-coalescable copy. + /// removeCopyByCommutingDef - We found a non-trivially-coalescable copy. /// If the source value number is defined by a commutable instruction and /// its other operand is coalesced to the copy dest register, see if we /// can transform the copy into a noop by commuting the definition. - bool RemoveCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI); + bool removeCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI); - /// ReMaterializeTrivialDef - If the source of a copy is defined by a + /// reMaterializeTrivialDef - If the source of a copy is defined by a /// trivial computation, replace the copy by rematerialize the definition. - /// If PreserveSrcInt is true, make sure SrcInt is valid after the call. - bool ReMaterializeTrivialDef(LiveInterval &SrcInt, bool PreserveSrcInt, - unsigned DstReg, MachineInstr *CopyMI); - - /// shouldJoinPhys - Return true if a physreg copy should be joined. - bool shouldJoinPhys(CoalescerPair &CP); - - /// isWinToJoinCrossClass - Return true if it's profitable to coalesce - /// two virtual registers from different register classes. - bool isWinToJoinCrossClass(unsigned SrcReg, - unsigned DstReg, - const TargetRegisterClass *SrcRC, - const TargetRegisterClass *DstRC, - const TargetRegisterClass *NewRC); - - /// UpdateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and + bool reMaterializeTrivialDef(LiveInterval &SrcInt, unsigned DstReg, + MachineInstr *CopyMI); + + /// canJoinPhys - Return true if a physreg copy should be joined. + bool canJoinPhys(CoalescerPair &CP); + + /// updateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and /// update the subregister number if it is not zero. If DstReg is a /// physical register and the existing subregister number of the def / use /// being updated is not zero, make sure to set it to the correct physical /// subregister. - void UpdateRegDefsUses(const CoalescerPair &CP); - - /// RemoveDeadDef - If a def of a live interval is now determined dead, - /// remove the val# it defines. If the live interval becomes empty, remove - /// it as well. - bool RemoveDeadDef(LiveInterval &li, MachineInstr *DefMI); - - /// markAsJoined - Remember that CopyMI has already been joined. - void markAsJoined(MachineInstr *CopyMI); + void updateRegDefsUses(unsigned SrcReg, unsigned DstReg, unsigned SubIdx); /// eliminateUndefCopy - Handle copies of undef values. 
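// [Editor's gloss] A copy of an <undef> value reads a register that has no
// live definition at that point (the use operand carries an <undef> flag);
// there is nothing to join, so joinCopy() below simply erases the
// instruction once eliminateUndefCopy() confirms the case.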
bool eliminateUndefCopy(MachineInstr *CopyMI, const CoalescerPair &CP); @@ -233,7 +219,8 @@ static bool isMoveInstr(const TargetRegisterInfo &tri, const MachineInstr *MI, } bool CoalescerPair::setRegisters(const MachineInstr *MI) { - SrcReg = DstReg = SubIdx = 0; + SrcReg = DstReg = 0; + SrcIdx = DstIdx = 0; NewRC = 0; Flipped = CrossClass = false; @@ -271,39 +258,44 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) { } } else { // Both registers are virtual. + const TargetRegisterClass *SrcRC = MRI.getRegClass(Src); + const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); // Both registers have subreg indices. if (SrcSub && DstSub) { - // For now we only handle the case of identical indices in commensurate - // registers: Dreg:ssub_1 + Dreg:ssub_1 -> Dreg - // FIXME: Handle Qreg:ssub_3 + Dreg:ssub_1 as QReg:dsub_1 + Dreg. - if (SrcSub != DstSub) + // Copies between different sub-registers are never coalescable. + if (Src == Dst && SrcSub != DstSub) return false; - const TargetRegisterClass *SrcRC = MRI.getRegClass(Src); - const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); - if (!TRI.getCommonSubClass(DstRC, SrcRC)) + + NewRC = TRI.getCommonSuperRegClass(SrcRC, SrcSub, DstRC, DstSub, + SrcIdx, DstIdx); + if (!NewRC) return false; - SrcSub = DstSub = 0; + } else if (DstSub) { + // SrcReg will be merged with a sub-register of DstReg. + SrcIdx = DstSub; + NewRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub); + } else if (SrcSub) { + // DstReg will be merged with a sub-register of SrcReg. + DstIdx = SrcSub; + NewRC = TRI.getMatchingSuperRegClass(SrcRC, DstRC, SrcSub); + } else { + // This is a straight copy without sub-registers. + NewRC = TRI.getCommonSubClass(DstRC, SrcRC); } - // There can be no SrcSub. - if (SrcSub) { + // The combined constraint may be impossible to satisfy. + if (!NewRC) + return false; + + // Prefer SrcReg to be a sub-register of DstReg. + // FIXME: Coalescer should support subregs symmetrically. + if (DstIdx && !SrcIdx) { std::swap(Src, Dst); - DstSub = SrcSub; - SrcSub = 0; - assert(!Flipped && "Unexpected flip"); - Flipped = true; + std::swap(SrcIdx, DstIdx); + Flipped = !Flipped; } - // Find the new register class. - const TargetRegisterClass *SrcRC = MRI.getRegClass(Src); - const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); - if (DstSub) - NewRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub); - else - NewRC = TRI.getCommonSubClass(DstRC, SrcRC); - if (!NewRC) - return false; CrossClass = NewRC != DstRC || NewRC != SrcRC; } // Check our invariants @@ -312,14 +304,14 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) { "Cannot have a physical SubIdx"); SrcReg = Src; DstReg = Dst; - SubIdx = DstSub; return true; } bool CoalescerPair::flip() { - if (SubIdx || TargetRegisterInfo::isPhysicalRegister(DstReg)) + if (TargetRegisterInfo::isPhysicalRegister(DstReg)) return false; std::swap(SrcReg, DstReg); + std::swap(SrcIdx, DstIdx); Flipped = !Flipped; return true; } @@ -343,7 +335,7 @@ bool CoalescerPair::isCoalescable(const MachineInstr *MI) const { if (TargetRegisterInfo::isPhysicalRegister(DstReg)) { if (!TargetRegisterInfo::isPhysicalRegister(Dst)) return false; - assert(!SubIdx && "Inconsistent CoalescerPair state."); + assert(!DstIdx && !SrcIdx && "Inconsistent CoalescerPair state."); // DstSub could be set for a physreg from INSERT_SUBREG. 
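// [Editor's note] TRI.getSubReg maps (physreg, sub-index) to the concrete
// aliasing physreg; on ARM, for instance, getSubReg(ARM::Q0, ARM::dsub_1)
// is ARM::D1. That is why a physreg DstSub can simply be folded into Dst: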
if (DstSub) Dst = TRI.getSubReg(Dst, DstSub); @@ -357,7 +349,7 @@ bool CoalescerPair::isCoalescable(const MachineInstr *MI) const { if (DstReg != Dst) return false; // Registers match, do the subregisters line up? - return compose(TRI, SubIdx, SrcSub) == DstSub; + return compose(TRI, SrcIdx, SrcSub) == compose(TRI, DstIdx, DstSub); } } @@ -375,19 +367,18 @@ void RegisterCoalescer::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -void RegisterCoalescer::markAsJoined(MachineInstr *CopyMI) { - /// Joined copies are not deleted immediately, but kept in JoinedCopies. - JoinedCopies.insert(CopyMI); +void RegisterCoalescer::eliminateDeadDefs() { + SmallVector<LiveInterval*, 8> NewRegs; + LiveRangeEdit(0, NewRegs, *MF, *LIS, 0, this).eliminateDeadDefs(DeadDefs); +} - /// Mark all register operands of CopyMI as <undef> so they won't affect dead - /// code elimination. - for (MachineInstr::mop_iterator I = CopyMI->operands_begin(), - E = CopyMI->operands_end(); I != E; ++I) - if (I->isReg()) - I->setIsUndef(true); +// Callback from eliminateDeadDefs(). +void RegisterCoalescer::LRE_WillEraseInstruction(MachineInstr *MI) { + // MI may be in WorkList. Make sure we don't visit it. + ErasedInstrs.insert(MI); } -/// AdjustCopiesBackFrom - We found a non-trivially-coalescable copy with IntA +/// adjustCopiesBackFrom - We found a non-trivially-coalescable copy with IntA /// being the source and IntB being the dest, thus this defines a value number /// in IntB. If the source value number (in IntA) is defined by a copy from B, /// see if we can merge these two pieces of B into a single value number, @@ -402,12 +393,10 @@ void RegisterCoalescer::markAsJoined(MachineInstr *CopyMI) { /// /// This returns true if an interval was modified. /// -bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP, - MachineInstr *CopyMI) { - // Bail if there is no dst interval - can happen when merging physical subreg - // operations. - if (!LIS->hasInterval(CP.getDstReg())) - return false; +bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP, + MachineInstr *CopyMI) { + assert(!CP.isPartial() && "This doesn't work for partial copies."); + assert(!CP.isPhys() && "This doesn't work for physreg copies."); LiveInterval &IntA = LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg()); @@ -457,24 +446,7 @@ bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP, // IntB, we can merge them. if (ValLR+1 != BLR) return false; - // If a live interval is a physical register, conservatively check if any - // of its aliases is overlapping the live interval of the virtual register. - // If so, do not coalesce. - if (TargetRegisterInfo::isPhysicalRegister(IntB.reg)) { - for (const uint16_t *AS = TRI->getAliasSet(IntB.reg); *AS; ++AS) - if (LIS->hasInterval(*AS) && IntA.overlaps(LIS->getInterval(*AS))) { - DEBUG({ - dbgs() << "\t\tInterfere with alias "; - LIS->getInterval(*AS).print(dbgs(), TRI); - }); - return false; - } - } - - DEBUG({ - dbgs() << "Extending: "; - IntB.print(dbgs(), TRI); - }); + DEBUG(dbgs() << "Extending: " << PrintReg(IntB.reg, TRI)); SlotIndex FillerStart = ValLR->end, FillerEnd = BLR->start; // We are about to delete CopyMI, so need to remove it as the 'instruction @@ -487,19 +459,6 @@ bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP, // two value numbers. 
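// [Editor's illustration; instruction placement is schematic]
//
//   A3 = B0
//   ...              <- filler range [ValLR->end, BLR->start)
//   B1 = A3          <- CopyMI, defines BLR
//
// The gap between the end of B0's live range and the start of B1's is
// exactly [FillerStart, FillerEnd); adding it to IntB lets the two pieces
// of B share a single value number: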
IntB.addRange(LiveRange(FillerStart, FillerEnd, BValNo)); - // If the IntB live range is assigned to a physical register, and if that - // physreg has sub-registers, update their live intervals as well. - if (TargetRegisterInfo::isPhysicalRegister(IntB.reg)) { - for (const uint16_t *SR = TRI->getSubRegisters(IntB.reg); *SR; ++SR) { - if (!LIS->hasInterval(*SR)) - continue; - LiveInterval &SRLI = LIS->getInterval(*SR); - SRLI.addRange(LiveRange(FillerStart, FillerEnd, - SRLI.getNextValue(FillerStart, - LIS->getVNInfoAllocator()))); - } - } - // Okay, merge "B1" into the same value number as "B0". if (BValNo != ValLR->valno) { // If B1 is killed by a PHI, then the merged live range must also be killed @@ -509,11 +468,7 @@ bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP, if (HasPHIKill) ValLR->valno->setHasPHIKill(true); } - DEBUG({ - dbgs() << " result = "; - IntB.print(dbgs(), TRI); - dbgs() << "\n"; - }); + DEBUG(dbgs() << " result = " << IntB << '\n'); // If the source instruction was killing the source register before the // merge, unset the isKill marker given the live range has been extended. @@ -525,8 +480,7 @@ bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP, // Rewrite the copy. If the copy instruction was killing the destination // register before the merge, find the last use and trim the live range. That // will also add the isKill marker. - CopyMI->substituteRegister(IntA.reg, IntB.reg, CP.getSubIdx(), - *TRI); + CopyMI->substituteRegister(IntA.reg, IntB.reg, 0, *TRI); if (ALR->end == CopyIdx) LIS->shrinkToUses(&IntA); @@ -534,12 +488,12 @@ bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP, return true; } -/// HasOtherReachingDefs - Return true if there are definitions of IntB +/// hasOtherReachingDefs - Return true if there are definitions of IntB /// other than BValNo val# that can reach uses of AValno val# of IntA. -bool RegisterCoalescer::HasOtherReachingDefs(LiveInterval &IntA, - LiveInterval &IntB, - VNInfo *AValNo, - VNInfo *BValNo) { +bool RegisterCoalescer::hasOtherReachingDefs(LiveInterval &IntA, + LiveInterval &IntB, + VNInfo *AValNo, + VNInfo *BValNo) { for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end(); AI != AE; ++AI) { if (AI->valno != AValNo) continue; @@ -559,7 +513,7 @@ bool RegisterCoalescer::HasOtherReachingDefs(LiveInterval &IntA, return false; } -/// RemoveCopyByCommutingDef - We found a non-trivially-coalescable copy with +/// removeCopyByCommutingDef - We found a non-trivially-coalescable copy with /// IntA being the source and IntB being the dest, thus this defines a value /// number in IntB. If the source value number (in IntA) is defined by a /// commutable instruction and its other operand is coalesced to the copy dest @@ -582,18 +536,9 @@ bool RegisterCoalescer::HasOtherReachingDefs(LiveInterval &IntA, /// /// This returns true if an interval was modified. /// -bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP, - MachineInstr *CopyMI) { - // FIXME: For now, only eliminate the copy by commuting its def when the - // source register is a virtual register. We want to guard against cases - // where the copy is a back edge copy and commuting the def lengthen the - // live interval of the source register to the entire loop. - if (CP.isPhys() && CP.isFlipped()) - return false; - - // Bail if there is no dst interval. 
- if (!LIS->hasInterval(CP.getDstReg())) - return false; +bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, + MachineInstr *CopyMI) { + assert (!CP.isPhys()); SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot(); @@ -647,17 +592,9 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP, // Make sure there are no other definitions of IntB that would reach the // uses which the new definition can reach. - if (HasOtherReachingDefs(IntA, IntB, AValNo, BValNo)) + if (hasOtherReachingDefs(IntA, IntB, AValNo, BValNo)) return false; - // Abort if the aliases of IntB.reg have values that are not simply the - // clobbers from the superreg. - if (TargetRegisterInfo::isPhysicalRegister(IntB.reg)) - for (const uint16_t *AS = TRI->getAliasSet(IntB.reg); *AS; ++AS) - if (LIS->hasInterval(*AS) && - HasOtherReachingDefs(IntA, LIS->getInterval(*AS), AValNo, 0)) - return false; - // If some of the uses of IntA.reg is already coalesced away, return false. // It's not possible to determine whether it's safe to perform the coalescing. for (MachineRegisterInfo::use_nodbg_iterator UI = @@ -666,13 +603,14 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP, MachineInstr *UseMI = &*UI; SlotIndex UseIdx = LIS->getInstructionIndex(UseMI); LiveInterval::iterator ULR = IntA.FindLiveRangeContaining(UseIdx); - if (ULR == IntA.end()) + if (ULR == IntA.end() || ULR->valno != AValNo) continue; - if (ULR->valno == AValNo && JoinedCopies.count(UseMI)) + // If this use is tied to a def, we can't rewrite the register. + if (UseMI->isRegTiedToDefOperand(UI.getOperandNo())) return false; } - DEBUG(dbgs() << "\tRemoveCopyByCommutingDef: " << AValNo->def << '\t' + DEBUG(dbgs() << "\tremoveCopyByCommutingDef: " << AValNo->def << '\t' << *DefMI); // At this point we have decided that it is legal to do this @@ -709,8 +647,6 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP, MachineOperand &UseMO = UI.getOperand(); MachineInstr *UseMI = &*UI; ++UI; - if (JoinedCopies.count(UseMI)) - continue; if (UseMI->isDebugValue()) { // FIXME These don't have an instruction index. Not clear we have enough // info to decide whether to do this replacement or not. For now do it. @@ -742,7 +678,9 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP, DEBUG(dbgs() << "\t\tnoop: " << DefIdx << '\t' << *UseMI); assert(DVNI->def == DefIdx); BValNo = IntB.MergeValueNumberInto(BValNo, DVNI); - markAsJoined(UseMI); + ErasedInstrs.insert(UseMI); + LIS->RemoveMachineInstrFromMaps(UseMI); + UseMI->eraseFromParent(); } // Extend BValNo by merging in IntA live ranges of AValNo. Val# definition @@ -762,12 +700,11 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP, return true; } -/// ReMaterializeTrivialDef - If the source of a copy is defined by a trivial +/// reMaterializeTrivialDef - If the source of a copy is defined by a trivial /// computation, replace the copy by rematerialize the definition. 
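// [Editor's sketch; the opcode and vreg numbers are illustrative only]
//
//   %vreg1<def> = MOV32ri 42        ; trivially rematerializable def
//   ...
//   %vreg2<def> = COPY %vreg1       ; CopyMI
//
// becomes
//
//   %vreg1<def> = MOV32ri 42
//   ...
//   %vreg2<def> = MOV32ri 42        ; fresh def, copy erased
//
// after which SrcInt is shrunk; if %vreg1's original def just went dead it
// lands in DeadDefs and eliminateDeadDefs() cleans it up.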
-bool RegisterCoalescer::ReMaterializeTrivialDef(LiveInterval &SrcInt, - bool preserveSrcInt, - unsigned DstReg, - MachineInstr *CopyMI) { +bool RegisterCoalescer::reMaterializeTrivialDef(LiveInterval &SrcInt, + unsigned DstReg, + MachineInstr *CopyMI) { SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot(true); LiveInterval::iterator SrcLR = SrcInt.FindLiveRangeContaining(CopyIdx); assert(SrcLR != SrcInt.end() && "Live range not found!"); @@ -792,7 +729,7 @@ bool RegisterCoalescer::ReMaterializeTrivialDef(LiveInterval &SrcInt, // Make sure the copy destination register class fits the instruction // definition register class. The mismatch can happen as a result of earlier // extract_subreg, insert_subreg, subreg_to_reg coalescing. - const TargetRegisterClass *RC = TII->getRegClass(MCID, 0, TRI); + const TargetRegisterClass *RC = TII->getRegClass(MCID, 0, TRI, *MF); if (TargetRegisterInfo::isVirtualRegister(DstReg)) { if (MRI->getRegClass(DstReg) != RC) return false; @@ -838,23 +775,21 @@ bool RegisterCoalescer::ReMaterializeTrivialDef(LiveInterval &SrcInt, SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI); for (unsigned i = 0, e = NewMIImplDefs.size(); i != e; ++i) { - unsigned reg = NewMIImplDefs[i]; - LiveInterval &li = LIS->getInterval(reg); - VNInfo *DeadDefVN = li.getNextValue(NewMIIdx.getRegSlot(), - LIS->getVNInfoAllocator()); - LiveRange lr(NewMIIdx.getRegSlot(), NewMIIdx.getDeadSlot(), DeadDefVN); - li.addRange(lr); + unsigned Reg = NewMIImplDefs[i]; + for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) + if (LiveInterval *LI = LIS->getCachedRegUnit(*Units)) + LI->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator()); } CopyMI->eraseFromParent(); - ReMatCopies.insert(CopyMI); - ReMatDefs.insert(DefMI); + ErasedInstrs.insert(CopyMI); DEBUG(dbgs() << "Remat: " << *NewMI); ++NumReMats; // The source interval can become smaller because we removed a use. - if (preserveSrcInt) - LIS->shrinkToUses(&SrcInt); + LIS->shrinkToUses(&SrcInt, &DeadDefs); + if (!DeadDefs.empty()) + eliminateDeadDefs(); return true; } @@ -902,51 +837,40 @@ bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI, return true; } -/// UpdateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and +/// updateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and /// update the subregister number if it is not zero. If DstReg is a /// physical register and the existing subregister number of the def / use /// being updated is not zero, make sure to set it to the correct physical /// subregister. -void -RegisterCoalescer::UpdateRegDefsUses(const CoalescerPair &CP) { - bool DstIsPhys = CP.isPhys(); - unsigned SrcReg = CP.getSrcReg(); - unsigned DstReg = CP.getDstReg(); - unsigned SubIdx = CP.getSubIdx(); +void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg, + unsigned DstReg, + unsigned SubIdx) { + bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg); + LiveInterval *DstInt = DstIsPhys ? 0 : &LIS->getInterval(DstReg); // Update LiveDebugVariables. LDV->renameRegister(SrcReg, DstReg, SubIdx); for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(SrcReg); MachineInstr *UseMI = I.skipInstruction();) { - // A PhysReg copy that won't be coalesced can perhaps be rematerialized - // instead. 
- if (DstIsPhys) { - if (UseMI->isFullCopy() && - UseMI->getOperand(1).getReg() == SrcReg && - UseMI->getOperand(0).getReg() != SrcReg && - UseMI->getOperand(0).getReg() != DstReg && - !JoinedCopies.count(UseMI) && - ReMaterializeTrivialDef(LIS->getInterval(SrcReg), false, - UseMI->getOperand(0).getReg(), UseMI)) - continue; - } - SmallVector<unsigned,8> Ops; bool Reads, Writes; tie(Reads, Writes) = UseMI->readsWritesVirtualRegister(SrcReg, &Ops); + // If SrcReg wasn't read, it may still be the case that DstReg is live-in + // because SrcReg is a sub-register. + if (DstInt && !Reads && SubIdx) + Reads = DstInt->liveAt(LIS->getInstructionIndex(UseMI)); + // Replace SrcReg with DstReg in all UseMI operands. for (unsigned i = 0, e = Ops.size(); i != e; ++i) { MachineOperand &MO = UseMI->getOperand(Ops[i]); - // Make sure we don't create read-modify-write defs accidentally. We - // assume here that a SrcReg def cannot be joined into a live DstReg. If - // RegisterCoalescer starts tracking partially live registers, we will - // need to check the actual LiveInterval to determine if DstReg is live - // here. - if (SubIdx && !Reads) - MO.setIsUndef(); + // Adjust <undef> flags in case of sub-register joins. We don't want to + // turn a full def into a read-modify-write sub-register def and vice + // versa. + if (SubIdx && MO.isDef()) + MO.setIsUndef(!Reads); if (DstIsPhys) MO.substPhysReg(DstReg, *TRI); @@ -954,10 +878,6 @@ RegisterCoalescer::UpdateRegDefsUses(const CoalescerPair &CP) { MO.substVirtReg(DstReg, SubIdx, *TRI); } - // This instruction is a copy that will be removed. - if (JoinedCopies.count(UseMI)) - continue; - DEBUG({ dbgs() << "\t\tupdated: "; if (!UseMI->isDebugValue()) @@ -967,210 +887,107 @@ RegisterCoalescer::UpdateRegDefsUses(const CoalescerPair &CP) { } } -/// removeIntervalIfEmpty - Check if the live interval of a physical register -/// is empty, if so remove it and also remove the empty intervals of its -/// sub-registers. Return true if live interval is removed. -static bool removeIntervalIfEmpty(LiveInterval &li, LiveIntervals *LIS, - const TargetRegisterInfo *TRI) { - if (li.empty()) { - if (TargetRegisterInfo::isPhysicalRegister(li.reg)) - for (const uint16_t* SR = TRI->getSubRegisters(li.reg); *SR; ++SR) { - if (!LIS->hasInterval(*SR)) - continue; - LiveInterval &sli = LIS->getInterval(*SR); - if (sli.empty()) - LIS->removeInterval(*SR); - } - LIS->removeInterval(li.reg); - return true; - } - return false; -} - -/// RemoveDeadDef - If a def of a live interval is now determined dead, remove -/// the val# it defines. If the live interval becomes empty, remove it as well. -bool RegisterCoalescer::RemoveDeadDef(LiveInterval &li, - MachineInstr *DefMI) { - SlotIndex DefIdx = LIS->getInstructionIndex(DefMI).getRegSlot(); - LiveInterval::iterator MLR = li.FindLiveRangeContaining(DefIdx); - if (DefIdx != MLR->valno->def) - return false; - li.removeValNo(MLR->valno); - return removeIntervalIfEmpty(li, LIS, TRI); -} - -/// shouldJoinPhys - Return true if a copy involving a physreg should be joined. -/// We need to be careful about coalescing a source physical register with a -/// virtual register. Once the coalescing is done, it cannot be broken and these -/// are not spillable! If the destination interval uses are far away, think -/// twice about coalescing them! 
-bool RegisterCoalescer::shouldJoinPhys(CoalescerPair &CP) { - bool Allocatable = LIS->isAllocatable(CP.getDstReg()); - LiveInterval &JoinVInt = LIS->getInterval(CP.getSrcReg()); - +/// canJoinPhys - Return true if a copy involving a physreg should be joined. +bool RegisterCoalescer::canJoinPhys(CoalescerPair &CP) { /// Always join simple intervals that are defined by a single copy from a /// reserved register. This doesn't increase register pressure, so it is /// always beneficial. - if (!Allocatable && CP.isFlipped() && JoinVInt.containsOneValue()) - return true; - - if (!EnablePhysicalJoin) { - DEBUG(dbgs() << "\tPhysreg joins disabled.\n"); + if (!RegClassInfo.isReserved(CP.getDstReg())) { + DEBUG(dbgs() << "\tCan only merge into reserved registers.\n"); return false; } - // Only coalesce to allocatable physreg, we don't want to risk modifying - // reserved registers. - if (!Allocatable) { - DEBUG(dbgs() << "\tRegister is an unallocatable physreg.\n"); - return false; // Not coalescable. - } - - // Don't join with physregs that have a ridiculous number of live - // ranges. The data structure performance is really bad when that - // happens. - if (LIS->hasInterval(CP.getDstReg()) && - LIS->getInterval(CP.getDstReg()).ranges.size() > 1000) { - ++numAborts; - DEBUG(dbgs() - << "\tPhysical register live interval too complicated, abort!\n"); - return false; - } - - // FIXME: Why are we skipping this test for partial copies? - // CodeGen/X86/phys_subreg_coalesce-3.ll needs it. - if (!CP.isPartial()) { - const TargetRegisterClass *RC = MRI->getRegClass(CP.getSrcReg()); - unsigned Threshold = RegClassInfo.getNumAllocatableRegs(RC) * 2; - unsigned Length = LIS->getApproximateInstructionCount(JoinVInt); - if (Length > Threshold) { - ++numAborts; - DEBUG(dbgs() << "\tMay tie down a physical register, abort!\n"); - return false; - } - } - return true; -} - -/// isWinToJoinCrossClass - Return true if it's profitable to coalesce -/// two virtual registers from different register classes. -bool -RegisterCoalescer::isWinToJoinCrossClass(unsigned SrcReg, - unsigned DstReg, - const TargetRegisterClass *SrcRC, - const TargetRegisterClass *DstRC, - const TargetRegisterClass *NewRC) { - unsigned NewRCCount = RegClassInfo.getNumAllocatableRegs(NewRC); - // This heuristics is good enough in practice, but it's obviously not *right*. - // 4 is a magic number that works well enough for x86, ARM, etc. It filter - // out all but the most restrictive register classes. - if (NewRCCount > 4 || - // Early exit if the function is fairly small, coalesce aggressively if - // that's the case. For really special register classes with 3 or - // fewer registers, be a bit more careful. - (LIS->getFuncInstructionCount() / NewRCCount) < 8) - return true; - LiveInterval &SrcInt = LIS->getInterval(SrcReg); - LiveInterval &DstInt = LIS->getInterval(DstReg); - unsigned SrcSize = LIS->getApproximateInstructionCount(SrcInt); - unsigned DstSize = LIS->getApproximateInstructionCount(DstInt); - - // Coalesce aggressively if the intervals are small compared to the number of - // registers in the new class. The number 4 is fairly arbitrary, chosen to be - // less aggressive than the 8 used for the whole function size. - const unsigned ThresSize = 4 * NewRCCount; - if (SrcSize <= ThresSize && DstSize <= ThresSize) + LiveInterval &JoinVInt = LIS->getInterval(CP.getSrcReg()); + if (CP.isFlipped() && JoinVInt.containsOneValue()) return true; - // Estimate *register use density*. If it doubles or more, abort. 
- unsigned SrcUses = std::distance(MRI->use_nodbg_begin(SrcReg), - MRI->use_nodbg_end()); - unsigned DstUses = std::distance(MRI->use_nodbg_begin(DstReg), - MRI->use_nodbg_end()); - unsigned NewUses = SrcUses + DstUses; - unsigned NewSize = SrcSize + DstSize; - if (SrcRC != NewRC && SrcSize > ThresSize) { - unsigned SrcRCCount = RegClassInfo.getNumAllocatableRegs(SrcRC); - if (NewUses*SrcSize*SrcRCCount > 2*SrcUses*NewSize*NewRCCount) - return false; - } - if (DstRC != NewRC && DstSize > ThresSize) { - unsigned DstRCCount = RegClassInfo.getNumAllocatableRegs(DstRC); - if (NewUses*DstSize*DstRCCount > 2*DstUses*NewSize*NewRCCount) - return false; - } - return true; + DEBUG(dbgs() << "\tCannot join defs into reserved register.\n"); + return false; } - -/// JoinCopy - Attempt to join intervals corresponding to SrcReg/DstReg, +/// joinCopy - Attempt to join intervals corresponding to SrcReg/DstReg, /// which are the src/dst of the copy instruction CopyMI. This returns true /// if the copy was successfully coalesced away. If it is not currently /// possible to coalesce this interval, but it may be possible if other /// things get coalesced, then it returns true by reference in 'Again'. -bool RegisterCoalescer::JoinCopy(MachineInstr *CopyMI, bool &Again) { +bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { Again = false; - if (JoinedCopies.count(CopyMI) || ReMatCopies.count(CopyMI)) - return false; // Already done. - DEBUG(dbgs() << LIS->getInstructionIndex(CopyMI) << '\t' << *CopyMI); - CoalescerPair CP(*TII, *TRI); + CoalescerPair CP(*TRI); if (!CP.setRegisters(CopyMI)) { DEBUG(dbgs() << "\tNot coalescable.\n"); return false; } - // If they are already joined we continue. - if (CP.getSrcReg() == CP.getDstReg()) { - markAsJoined(CopyMI); - DEBUG(dbgs() << "\tCopy already coalesced.\n"); - return false; // Not coalescable. + // Dead code elimination. This really should be handled by MachineDCE, but + // sometimes dead copies slip through, and we can't generate invalid live + // ranges. + if (!CP.isPhys() && CopyMI->allDefsAreDead()) { + DEBUG(dbgs() << "\tCopy is dead.\n"); + DeadDefs.push_back(CopyMI); + eliminateDeadDefs(); + return true; } // Eliminate undefs. if (!CP.isPhys() && eliminateUndefCopy(CopyMI, CP)) { - markAsJoined(CopyMI); DEBUG(dbgs() << "\tEliminated copy of <undef> value.\n"); + LIS->RemoveMachineInstrFromMaps(CopyMI); + CopyMI->eraseFromParent(); return false; // Not coalescable. } - DEBUG(dbgs() << "\tConsidering merging " << PrintReg(CP.getSrcReg(), TRI) - << " with " << PrintReg(CP.getDstReg(), TRI, CP.getSubIdx()) - << "\n"); + // Coalesced copies are normally removed immediately, but transformations + // like removeCopyByCommutingDef() can inadvertently create identity copies. + // When that happens, just join the values and remove the copy. + if (CP.getSrcReg() == CP.getDstReg()) { + LiveInterval &LI = LIS->getInterval(CP.getSrcReg()); + DEBUG(dbgs() << "\tCopy already coalesced: " << LI << '\n'); + LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(CopyMI)); + if (VNInfo *DefVNI = LRQ.valueDefined()) { + VNInfo *ReadVNI = LRQ.valueIn(); + assert(ReadVNI && "No value before copy and no <undef> flag."); + assert(ReadVNI != DefVNI && "Cannot read and define the same value."); + LI.MergeValueNumberInto(DefVNI, ReadVNI); + DEBUG(dbgs() << "\tMerged values: " << LI << '\n'); + } + LIS->RemoveMachineInstrFromMaps(CopyMI); + CopyMI->eraseFromParent(); + return true; + } // Enforce policies. 
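// [Editor's note] At this point the cheap cases are done: fully dead copies
// went through eliminateDeadDefs(), copies of <undef> values were erased,
// and identity copies (SrcReg == DstReg, e.g. left behind by
// removeCopyByCommutingDef) had their two value numbers merged. The policy
// below is what remains: physreg joins are only attempted into reserved
// registers; everything else falls through to the full interval join.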
if (CP.isPhys()) { - if (!shouldJoinPhys(CP)) { + DEBUG(dbgs() << "\tConsidering merging " << PrintReg(CP.getSrcReg(), TRI) + << " with " << PrintReg(CP.getDstReg(), TRI, CP.getSrcIdx()) + << '\n'); + if (!canJoinPhys(CP)) { // Before giving up coalescing, if definition of source is defined by // trivial computation, try rematerializing it. if (!CP.isFlipped() && - ReMaterializeTrivialDef(LIS->getInterval(CP.getSrcReg()), true, + reMaterializeTrivialDef(LIS->getInterval(CP.getSrcReg()), CP.getDstReg(), CopyMI)) return true; return false; } } else { - // Avoid constraining virtual register regclass too much. - if (CP.isCrossClass()) { - DEBUG(dbgs() << "\tCross-class to " << CP.getNewRC()->getName() << ".\n"); - if (DisableCrossClassJoin) { - DEBUG(dbgs() << "\tCross-class joins disabled.\n"); - return false; - } - if (!isWinToJoinCrossClass(CP.getSrcReg(), CP.getDstReg(), - MRI->getRegClass(CP.getSrcReg()), - MRI->getRegClass(CP.getDstReg()), - CP.getNewRC())) { - DEBUG(dbgs() << "\tAvoid coalescing to constrained register class.\n"); - Again = true; // May be possible to coalesce later. - return false; - } - } + DEBUG({ + dbgs() << "\tConsidering merging to " << CP.getNewRC()->getName() + << " with "; + if (CP.getDstIdx() && CP.getSrcIdx()) + dbgs() << PrintReg(CP.getDstReg()) << " in " + << TRI->getSubRegIndexName(CP.getDstIdx()) << " and " + << PrintReg(CP.getSrcReg()) << " in " + << TRI->getSubRegIndexName(CP.getSrcIdx()) << '\n'; + else + dbgs() << PrintReg(CP.getSrcReg(), TRI) << " in " + << PrintReg(CP.getDstReg(), TRI, CP.getSrcIdx()) << '\n'; + }); // When possible, let DstReg be the larger interval. - if (!CP.getSubIdx() && LIS->getInterval(CP.getSrcReg()).ranges.size() > + if (!CP.isPartial() && LIS->getInterval(CP.getSrcReg()).ranges.size() > LIS->getInterval(CP.getDstReg()).ranges.size()) CP.flip(); } @@ -1179,21 +996,22 @@ bool RegisterCoalescer::JoinCopy(MachineInstr *CopyMI, bool &Again) { // Otherwise, if one of the intervals being joined is a physreg, this method // always canonicalizes DstInt to be it. The output "SrcInt" will not have // been modified, so we can use this information below to update aliases. - if (!JoinIntervals(CP)) { + if (!joinIntervals(CP)) { // Coalescing failed. // If definition of source is defined by trivial computation, try // rematerializing it. if (!CP.isFlipped() && - ReMaterializeTrivialDef(LIS->getInterval(CP.getSrcReg()), true, + reMaterializeTrivialDef(LIS->getInterval(CP.getSrcReg()), CP.getDstReg(), CopyMI)) return true; // If we can eliminate the copy without merging the live ranges, do so now. - if (!CP.isPartial()) { - if (AdjustCopiesBackFrom(CP, CopyMI) || - RemoveCopyByCommutingDef(CP, CopyMI)) { - markAsJoined(CopyMI); + if (!CP.isPartial() && !CP.isPhys()) { + if (adjustCopiesBackFrom(CP, CopyMI) || + removeCopyByCommutingDef(CP, CopyMI)) { + LIS->RemoveMachineInstrFromMaps(CopyMI); + CopyMI->eraseFromParent(); DEBUG(dbgs() << "\tTrivial!\n"); return true; } @@ -1212,29 +1030,21 @@ bool RegisterCoalescer::JoinCopy(MachineInstr *CopyMI, bool &Again) { MRI->setRegClass(CP.getDstReg(), CP.getNewRC()); } - // Remember to delete the copy instruction. - markAsJoined(CopyMI); + // Removing sub-register copies can ease the register class constraints. + // Make sure we attempt to inflate the register class of DstReg. + if (!CP.isPhys() && RegClassInfo.isProperSubClass(CP.getNewRC())) + InflateRegs.push_back(CP.getDstReg()); - UpdateRegDefsUses(CP); + // CopyMI has been erased by joinIntervals at this point. 
Remove it from + // ErasedInstrs since copyCoalesceWorkList() won't add a successful join back + // to the work list. This keeps ErasedInstrs from growing needlessly. + ErasedInstrs.erase(CopyMI); - // If we have extended the live range of a physical register, make sure we - // update live-in lists as well. - if (CP.isPhys()) { - SmallVector<MachineBasicBlock*, 16> BlockSeq; - // JoinIntervals invalidates the VNInfos in SrcInt, but we only need the - // ranges for this, and they are preserved. - LiveInterval &SrcInt = LIS->getInterval(CP.getSrcReg()); - for (LiveInterval::const_iterator I = SrcInt.begin(), E = SrcInt.end(); - I != E; ++I ) { - LIS->findLiveInMBBs(I->start, I->end, BlockSeq); - for (unsigned idx = 0, size = BlockSeq.size(); idx != size; ++idx) { - MachineBasicBlock &block = *BlockSeq[idx]; - if (!block.isLiveIn(CP.getDstReg())) - block.addLiveIn(CP.getDstReg()); - } - BlockSeq.clear(); - } - } + // Rewrite all SrcReg operands to DstReg. + // Also update DstReg operands to include DstIdx if it is set. + if (CP.getDstIdx()) + updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx()); + updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx()); // SrcReg is guaranteed to be the register whose live interval that is // being merged. @@ -1244,16 +1054,51 @@ bool RegisterCoalescer::JoinCopy(MachineInstr *CopyMI, bool &Again) { TRI->UpdateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *MF); DEBUG({ - LiveInterval &DstInt = LIS->getInterval(CP.getDstReg()); - dbgs() << "\tJoined. Result = "; - DstInt.print(dbgs(), TRI); - dbgs() << "\n"; + dbgs() << "\tJoined. Result = " << PrintReg(CP.getDstReg(), TRI); + if (!CP.isPhys()) + dbgs() << LIS->getInterval(CP.getDstReg()); + dbgs() << '\n'; }); ++numJoins; return true; } +/// Attempt joining with a reserved physreg. +bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) { + assert(CP.isPhys() && "Must be a physreg copy"); + assert(RegClassInfo.isReserved(CP.getDstReg()) && "Not a reserved register"); + LiveInterval &RHS = LIS->getInterval(CP.getSrcReg()); + DEBUG(dbgs() << "\t\tRHS = " << PrintReg(CP.getSrcReg()) << ' ' << RHS + << '\n'); + + assert(CP.isFlipped() && RHS.containsOneValue() && + "Invalid join with reserved register"); + + // Optimization for reserved registers like ESP. We can only merge with a + // reserved physreg if RHS has a single value that is a copy of CP.DstReg(). + // The live range of the reserved register will look like a set of dead defs + // - we don't properly track the live range of reserved registers. + + // Deny any overlapping intervals. This depends on all the reserved + // register live ranges to look like dead defs. + for (MCRegUnitIterator UI(CP.getDstReg(), TRI); UI.isValid(); ++UI) + if (RHS.overlaps(LIS->getRegUnit(*UI))) { + DEBUG(dbgs() << "\t\tInterference: " << PrintRegUnit(*UI, TRI) << '\n'); + return false; + } + + // Skip any value computations, we are not adding new values to the + // reserved register. Also skip merging the live ranges, the reserved + // register live range doesn't need to be accurate as long as all the + // defs are there. + + // We don't track kills for reserved registers. + MRI->clearKillFlags(CP.getSrcReg()); + + return true; +} + /// ComputeUltimateVN - Assuming we are going to join two live intervals, /// compute what the resultant value numbers for each value in the input two /// ranges will be. 
This is complicated by copies between the two which can @@ -1320,144 +1165,70 @@ static bool RegistersDefinedFromSameValue(LiveIntervals &li, const TargetRegisterInfo &tri, CoalescerPair &CP, VNInfo *VNI, - LiveRange *LR, + VNInfo *OtherVNI, SmallVector<MachineInstr*, 8> &DupCopies) { // FIXME: This is very conservative. For example, we don't handle // physical registers. MachineInstr *MI = li.getInstructionFromIndex(VNI->def); - if (!MI || !MI->isFullCopy() || CP.isPartial() || CP.isPhys()) + if (!MI || CP.isPartial() || CP.isPhys()) return false; - unsigned Dst = MI->getOperand(0).getReg(); - unsigned Src = MI->getOperand(1).getReg(); - - if (!TargetRegisterInfo::isVirtualRegister(Src) || - !TargetRegisterInfo::isVirtualRegister(Dst)) + unsigned A = CP.getDstReg(); + if (!TargetRegisterInfo::isVirtualRegister(A)) return false; - unsigned A = CP.getDstReg(); unsigned B = CP.getSrcReg(); - - if (B == Dst) - std::swap(A, B); - assert(Dst == A); - - VNInfo *Other = LR->valno; - const MachineInstr *OtherMI = li.getInstructionFromIndex(Other->def); - - if (!OtherMI || !OtherMI->isFullCopy()) + if (!TargetRegisterInfo::isVirtualRegister(B)) return false; - unsigned OtherDst = OtherMI->getOperand(0).getReg(); - unsigned OtherSrc = OtherMI->getOperand(1).getReg(); - - if (!TargetRegisterInfo::isVirtualRegister(OtherSrc) || - !TargetRegisterInfo::isVirtualRegister(OtherDst)) + MachineInstr *OtherMI = li.getInstructionFromIndex(OtherVNI->def); + if (!OtherMI) return false; - assert(OtherDst == B); - - if (Src != OtherSrc) - return false; + if (MI->isImplicitDef()) { + DupCopies.push_back(MI); + return true; + } else { + if (!MI->isFullCopy()) + return false; + unsigned Src = MI->getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Src)) + return false; + if (!OtherMI->isFullCopy()) + return false; + unsigned OtherSrc = OtherMI->getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(OtherSrc)) + return false; - // If the copies use two different value numbers of X, we cannot merge - // A and B. - LiveInterval &SrcInt = li.getInterval(Src); - // getVNInfoBefore returns NULL for undef copies. In this case, the - // optimization is still safe. - if (SrcInt.getVNInfoBefore(Other->def) != SrcInt.getVNInfoBefore(VNI->def)) - return false; + if (Src != OtherSrc) + return false; - DupCopies.push_back(MI); + // If the copies use two different value numbers of X, we cannot merge + // A and B. + LiveInterval &SrcInt = li.getInterval(Src); + // getVNInfoBefore returns NULL for undef copies. In this case, the + // optimization is still safe. + if (SrcInt.getVNInfoBefore(OtherVNI->def) != + SrcInt.getVNInfoBefore(VNI->def)) + return false; - return true; + DupCopies.push_back(MI); + return true; + } } -/// JoinIntervals - Attempt to join these two intervals. On failure, this +/// joinIntervals - Attempt to join these two intervals. On failure, this /// returns false. -bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) { - LiveInterval &RHS = LIS->getInterval(CP.getSrcReg()); - DEBUG({ dbgs() << "\t\tRHS = "; RHS.print(dbgs(), TRI); dbgs() << "\n"; }); - - // If a live interval is a physical register, check for interference with any - // aliases. The interference check implemented here is a bit more conservative - // than the full interference check below. We allow overlapping live ranges - // only when one is a copy of the other. - if (CP.isPhys()) { - // Optimization for reserved registers like ESP.
- // We can only merge with a reserved physreg if RHS has a single value that - // is a copy of CP.DstReg(). The live range of the reserved register will - // look like a set of dead defs - we don't properly track the live range of - // reserved registers. - if (RegClassInfo.isReserved(CP.getDstReg())) { - assert(CP.isFlipped() && RHS.containsOneValue() && - "Invalid join with reserved register"); - // Deny any overlapping intervals. This depends on all the reserved - // register live ranges to look like dead defs. - for (const uint16_t *AS = TRI->getOverlaps(CP.getDstReg()); *AS; ++AS) { - if (!LIS->hasInterval(*AS)) { - // Make sure at least DstReg itself exists before attempting a join. - if (*AS == CP.getDstReg()) - LIS->getOrCreateInterval(CP.getDstReg()); - continue; - } - if (RHS.overlaps(LIS->getInterval(*AS))) { - DEBUG(dbgs() << "\t\tInterference: " << PrintReg(*AS, TRI) << '\n'); - return false; - } - } - // Skip any value computations, we are not adding new values to the - // reserved register. Also skip merging the live ranges, the reserved - // register live range doesn't need to be accurate as long as all the - // defs are there. - return true; - } - - // Check if a register mask clobbers DstReg. - BitVector UsableRegs; - if (LIS->checkRegMaskInterference(RHS, UsableRegs) && - !UsableRegs.test(CP.getDstReg())) { - DEBUG(dbgs() << "\t\tRegister mask interference.\n"); - return false; - } +bool RegisterCoalescer::joinIntervals(CoalescerPair &CP) { + // Handle physreg joins separately. + if (CP.isPhys()) + return joinReservedPhysReg(CP); - for (const uint16_t *AS = TRI->getAliasSet(CP.getDstReg()); *AS; ++AS){ - if (!LIS->hasInterval(*AS)) - continue; - const LiveInterval &LHS = LIS->getInterval(*AS); - LiveInterval::const_iterator LI = LHS.begin(); - for (LiveInterval::const_iterator RI = RHS.begin(), RE = RHS.end(); - RI != RE; ++RI) { - LI = std::lower_bound(LI, LHS.end(), RI->start); - // Does LHS have an overlapping live range starting before RI? - if ((LI != LHS.begin() && LI[-1].end > RI->start) && - (RI->start != RI->valno->def || - !CP.isCoalescable(LIS->getInstructionFromIndex(RI->start)))) { - DEBUG({ - dbgs() << "\t\tInterference from alias: "; - LHS.print(dbgs(), TRI); - dbgs() << "\n\t\tOverlap at " << RI->start << " and no copy.\n"; - }); - return false; - } - - // Check that LHS ranges beginning in this range are copies. - for (; LI != LHS.end() && LI->start < RI->end; ++LI) { - if (LI->start != LI->valno->def || - !CP.isCoalescable(LIS->getInstructionFromIndex(LI->start))) { - DEBUG({ - dbgs() << "\t\tInterference from alias: "; - LHS.print(dbgs(), TRI); - dbgs() << "\n\t\tDef at " << LI->start << " is not a copy.\n"; - }); - return false; - } - } - } - } - } + LiveInterval &RHS = LIS->getInterval(CP.getSrcReg()); + DEBUG(dbgs() << "\t\tRHS = " << PrintReg(CP.getSrcReg()) << ' ' << RHS + << '\n'); // Compute the final value assignment, assuming that the live ranges can be // coalesced. @@ -1468,9 +1239,11 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) { SmallVector<VNInfo*, 16> NewVNInfo; SmallVector<MachineInstr*, 8> DupCopies; + SmallVector<MachineInstr*, 8> DeadCopies; LiveInterval &LHS = LIS->getOrCreateInterval(CP.getDstReg()); - DEBUG({ dbgs() << "\t\tLHS = "; LHS.print(dbgs(), TRI); dbgs() << "\n"; }); + DEBUG(dbgs() << "\t\tLHS = " << PrintReg(CP.getDstReg(), TRI) << ' ' << LHS + << '\n'); // Loop over the value numbers of the LHS, seeing if any are defined from // the RHS. 
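[Editor's sketch] The surrounding hunks map every value number to its final assignment and then walk both range lists, rejecting the join if two overlapping ranges land on different final values. A self-contained C++ model of that test (plain containers, not the LLVM API; all names are the editor's):

  #include <cstdio>
  #include <vector>

  struct Range { int Start, End, ValNo; };   // half-open [Start, End)

  // FinalLHS[v] / FinalRHS[v]: value number v's assignment in the joined
  // interval (the LHSValNoAssignments / RHSValNoAssignments idea).
  static bool joinable(const std::vector<Range> &LHS,
                       const std::vector<Range> &RHS,
                       const std::vector<int> &FinalLHS,
                       const std::vector<int> &FinalRHS) {
    for (unsigned i = 0; i != LHS.size(); ++i)
      for (unsigned j = 0; j != RHS.size(); ++j) {
        const Range &I = LHS[i], &J = RHS[j];
        if (I.End > J.Start && J.End > I.Start &&     // ranges overlap...
            FinalLHS[I.ValNo] != FinalRHS[J.ValNo])   // ...on distinct values
          return false;                               // cannot coalesce
      }
    return true;
  }

  int main() {
    // LHS's only value was proven to be a copy of RHS's, so both map to
    // final value 0 and the overlap on [10,20) is harmless.
    Range A = { 0, 20, 0 };    // aggregate init, C++03-compatible
    Range B = { 10, 30, 0 };
    std::vector<Range> LHS(1, A), RHS(1, B);
    std::vector<int> FinalLHS(1, 0), FinalRHS(1, 0);
    std::printf("joinable: %d\n", joinable(LHS, RHS, FinalLHS, FinalRHS));
    return 0;
  }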
@@ -1481,21 +1254,24 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) { continue; MachineInstr *MI = LIS->getInstructionFromIndex(VNI->def); assert(MI && "Missing def"); - if (!MI->isCopyLike()) // Src not defined by a copy? + if (!MI->isCopyLike() && !MI->isImplicitDef()) // Src not defined by a copy? continue; // Figure out the value # from the RHS. - LiveRange *lr = RHS.getLiveRangeContaining(VNI->def.getPrevSlot()); + VNInfo *OtherVNI = RHS.getVNInfoBefore(VNI->def); // The copy could be to an aliased physreg. - if (!lr) continue; + if (!OtherVNI) + continue; // DstReg is known to be a register in the LHS interval. If the src is // from the RHS interval, we can use its value #. - if (!CP.isCoalescable(MI) && - !RegistersDefinedFromSameValue(*LIS, *TRI, CP, VNI, lr, DupCopies)) + if (CP.isCoalescable(MI)) + DeadCopies.push_back(MI); + else if (!RegistersDefinedFromSameValue(*LIS, *TRI, CP, VNI, OtherVNI, + DupCopies)) continue; - LHSValsDefinedFromRHS[VNI] = lr->valno; + LHSValsDefinedFromRHS[VNI] = OtherVNI; } // Loop over the value numbers of the RHS, seeing if any are defined from @@ -1507,21 +1283,24 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) { continue; MachineInstr *MI = LIS->getInstructionFromIndex(VNI->def); assert(MI && "Missing def"); - if (!MI->isCopyLike()) // Src not defined by a copy? + if (!MI->isCopyLike() && !MI->isImplicitDef()) // Src not defined by a copy? continue; // Figure out the value # from the LHS. - LiveRange *lr = LHS.getLiveRangeContaining(VNI->def.getPrevSlot()); + VNInfo *OtherVNI = LHS.getVNInfoBefore(VNI->def); // The copy could be to an aliased physreg. - if (!lr) continue; + if (!OtherVNI) + continue; // DstReg is known to be a register in the RHS interval. If the src is // from the LHS interval, we can use its value #. - if (!CP.isCoalescable(MI) && - !RegistersDefinedFromSameValue(*LIS, *TRI, CP, VNI, lr, DupCopies)) + if (CP.isCoalescable(MI)) + DeadCopies.push_back(MI); + else if (!RegistersDefinedFromSameValue(*LIS, *TRI, CP, VNI, OtherVNI, + DupCopies)) continue; - RHSValsDefinedFromLHS[VNI] = lr->valno; + RHSValsDefinedFromLHS[VNI] = OtherVNI; } LHSValNoAssignments.resize(LHS.getNumValNums(), -1); @@ -1563,6 +1342,10 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) { LiveInterval::const_iterator J = RHS.begin(); LiveInterval::const_iterator JE = RHS.end(); + // Collect interval end points that will no longer be kills. + SmallVector<MachineInstr*, 8> LHSOldKills; + SmallVector<MachineInstr*, 8> RHSOldKills; + // Skip ahead until the first place of potential sharing. if (I != IE && J != JE) { if (I->start < J->start) { @@ -1576,20 +1359,21 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) { while (I != IE && J != JE) { // Determine if these two live ranges overlap. - bool Overlaps; - if (I->start < J->start) { - Overlaps = I->end > J->start; - } else { - Overlaps = J->end > I->start; - } - // If so, check value # info to determine if they are really different. - if (Overlaps) { + if (I->end > J->start && J->end > I->start) { // If the live range overlap will map to the same value number in the // result liverange, we can still coalesce them. If not, we can't. if (LHSValNoAssignments[I->valno->id] != RHSValNoAssignments[J->valno->id]) return false; + + // Extended live ranges should no longer be killed. 
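// [Editor's note] A kill flag marks the last use of a register inside its
// live range; once the join extends a range past that use the flag is
// stale. The old end points are collected here, and their kill flags are
// cleared (via clearRegisterKills) only after the join is known to succeed.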
+ if (!I->end.isBlock() && I->end < J->end) + if (MachineInstr *MI = LIS->getInstructionFromIndex(I->end)) + LHSOldKills.push_back(MI); + if (!J->end.isBlock() && J->end < I->end) + if (MachineInstr *MI = LIS->getInstructionFromIndex(J->end)) + RHSOldKills.push_back(MI); } if (I->end < J->end) @@ -1616,29 +1400,48 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) { NewVNInfo[RHSValID]->setHasPHIKill(true); } + // Clear kill flags where live ranges are extended. + while (!LHSOldKills.empty()) + LHSOldKills.pop_back_val()->clearRegisterKills(LHS.reg, TRI); + while (!RHSOldKills.empty()) + RHSOldKills.pop_back_val()->clearRegisterKills(RHS.reg, TRI); + if (LHSValNoAssignments.empty()) LHSValNoAssignments.push_back(-1); if (RHSValNoAssignments.empty()) RHSValNoAssignments.push_back(-1); + // Now erase all the redundant copies. + for (unsigned i = 0, e = DeadCopies.size(); i != e; ++i) { + MachineInstr *MI = DeadCopies[i]; + if (!ErasedInstrs.insert(MI)) + continue; + DEBUG(dbgs() << "\t\terased:\t" << LIS->getInstructionIndex(MI) + << '\t' << *MI); + LIS->RemoveMachineInstrFromMaps(MI); + MI->eraseFromParent(); + } + SmallVector<unsigned, 8> SourceRegisters; for (SmallVector<MachineInstr*, 8>::iterator I = DupCopies.begin(), E = DupCopies.end(); I != E; ++I) { MachineInstr *MI = *I; + if (!ErasedInstrs.insert(MI)) + continue; - // We have pretended that the assignment to B in + // If MI is a copy, then we have pretended that the assignment to B in // A = X // B = X // was actually a copy from A. Now that we decided to coalesce A and B, // transform the code into // A = X - // X = X - // and mark the X as coalesced to keep the illusion. - unsigned Src = MI->getOperand(1).getReg(); - SourceRegisters.push_back(Src); - MI->getOperand(0).substVirtReg(Src, 0, *TRI); - - markAsJoined(MI); + // In the case of the implicit_def, we just have to remove it. + if (!MI->isImplicitDef()) { + unsigned Src = MI->getOperand(1).getReg(); + SourceRegisters.push_back(Src); + } + LIS->RemoveMachineInstrFromMaps(MI); + MI->eraseFromParent(); } // If B = X was the last use of X in a liverange, we have to shrink it now @@ -1678,73 +1481,58 @@ namespace { }; } -void RegisterCoalescer::CopyCoalesceInMBB(MachineBasicBlock *MBB, - std::vector<MachineInstr*> &TryAgain) { - DEBUG(dbgs() << MBB->getName() << ":\n"); - - SmallVector<MachineInstr*, 8> VirtCopies; - SmallVector<MachineInstr*, 8> PhysCopies; - SmallVector<MachineInstr*, 8> ImpDefCopies; - for (MachineBasicBlock::iterator MII = MBB->begin(), E = MBB->end(); - MII != E;) { - MachineInstr *Inst = MII++; - - // If this isn't a copy nor a extract_subreg, we can't join intervals. - unsigned SrcReg, DstReg; - if (Inst->isCopy()) { - DstReg = Inst->getOperand(0).getReg(); - SrcReg = Inst->getOperand(1).getReg(); - } else if (Inst->isSubregToReg()) { - DstReg = Inst->getOperand(0).getReg(); - SrcReg = Inst->getOperand(2).getReg(); - } else +// Try joining WorkList copies starting from index From. +// Null out any successful joins. 
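// [Editor's note] WorkList holds raw MachineInstr pointers, and joinCopy()
// may erase an instruction that is still queued (for instance a dead copy
// swept up by eliminateDeadDefs()); the freed node can even be recycled for
// a new instruction. ErasedInstrs.erase(WorkList[i]) below both detects the
// stale pointer and drops it from the set in one step.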
+bool RegisterCoalescer::copyCoalesceWorkList(unsigned From) { + assert(From <= WorkList.size() && "Out of range"); + bool Progress = false; + for (unsigned i = From, e = WorkList.size(); i != e; ++i) { + if (!WorkList[i]) continue; - - bool SrcIsPhys = TargetRegisterInfo::isPhysicalRegister(SrcReg); - bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg); - if (LIS->hasInterval(SrcReg) && LIS->getInterval(SrcReg).empty()) - ImpDefCopies.push_back(Inst); - else if (SrcIsPhys || DstIsPhys) - PhysCopies.push_back(Inst); - else - VirtCopies.push_back(Inst); - } - - // Try coalescing implicit copies and insert_subreg <undef> first, - // followed by copies to / from physical registers, then finally copies - // from virtual registers to virtual registers. - for (unsigned i = 0, e = ImpDefCopies.size(); i != e; ++i) { - MachineInstr *TheCopy = ImpDefCopies[i]; - bool Again = false; - if (!JoinCopy(TheCopy, Again)) - if (Again) - TryAgain.push_back(TheCopy); - } - for (unsigned i = 0, e = PhysCopies.size(); i != e; ++i) { - MachineInstr *TheCopy = PhysCopies[i]; - bool Again = false; - if (!JoinCopy(TheCopy, Again)) - if (Again) - TryAgain.push_back(TheCopy); - } - for (unsigned i = 0, e = VirtCopies.size(); i != e; ++i) { - MachineInstr *TheCopy = VirtCopies[i]; + // Skip instruction pointers that have already been erased, for example by + // dead code elimination. + if (ErasedInstrs.erase(WorkList[i])) { + WorkList[i] = 0; + continue; + } bool Again = false; - if (!JoinCopy(TheCopy, Again)) - if (Again) - TryAgain.push_back(TheCopy); + bool Success = joinCopy(WorkList[i], Again); + Progress |= Success; + if (Success || !Again) + WorkList[i] = 0; } + return Progress; } -void RegisterCoalescer::joinIntervals() { +void +RegisterCoalescer::copyCoalesceInMBB(MachineBasicBlock *MBB) { + DEBUG(dbgs() << MBB->getName() << ":\n"); + + // Collect all copy-like instructions in MBB. Don't start coalescing anything + // yet, it might invalidate the iterator. + const unsigned PrevSize = WorkList.size(); + for (MachineBasicBlock::iterator MII = MBB->begin(), E = MBB->end(); + MII != E; ++MII) + if (MII->isCopyLike()) + WorkList.push_back(MII); + + // Try coalescing the collected copies immediately, and remove the nulls. + // This prevents the WorkList from getting too large since most copies are + // joinable on the first attempt. + if (copyCoalesceWorkList(PrevSize)) + WorkList.erase(std::remove(WorkList.begin() + PrevSize, WorkList.end(), + (MachineInstr*)0), WorkList.end()); +} + +void RegisterCoalescer::joinAllIntervals() { DEBUG(dbgs() << "********** JOINING INTERVALS ***********\n"); + assert(WorkList.empty() && "Old data still around."); - std::vector<MachineInstr*> TryAgainList; if (Loops->empty()) { // If there are no loops in the function, join intervals in function order. for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) - CopyCoalesceInMBB(I, TryAgainList); + copyCoalesceInMBB(I); } else { // Otherwise, join intervals in inner loops before other intervals. // Unfortunately we can't just iterate over loop hierarchy here because @@ -1763,34 +1551,20 @@ void RegisterCoalescer::joinIntervals() { // Finally, join intervals in loop nest order. for (unsigned i = 0, e = MBBs.size(); i != e; ++i) - CopyCoalesceInMBB(MBBs[i].second, TryAgainList); + copyCoalesceInMBB(MBBs[i].second); } // Joining intervals can allow other intervals to be joined. Iteratively join // until we make no progress. 
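A simplified model of the worklist discipline implemented above, which replaces the deleted try-again loop shown next: process every live entry, null out the ones that are finished, and compact afterwards with the standard erase/remove idiom. Here int* stands in for MachineInstr* and the function name is illustrative.

#include <algorithm>
#include <vector>

static bool processWorkList(std::vector<int*> &WorkList) {
  bool Progress = false;
  for (unsigned i = 0, e = WorkList.size(); i != e; ++i) {
    if (!WorkList[i])
      continue;          // slot already retired
    Progress = true;     // assume every remaining entry succeeds
    WorkList[i] = 0;     // null out instead of erasing mid-iteration
  }
  WorkList.erase(std::remove(WorkList.begin(), WorkList.end(), (int*)0),
                 WorkList.end());
  return Progress;
}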
- bool ProgressMade = true; - while (ProgressMade) { - ProgressMade = false; - - for (unsigned i = 0, e = TryAgainList.size(); i != e; ++i) { - MachineInstr *&TheCopy = TryAgainList[i]; - if (!TheCopy) - continue; - - bool Again = false; - bool Success = JoinCopy(TheCopy, Again); - if (Success || !Again) { - TheCopy= 0; // Mark this one as done. - ProgressMade = true; - } - } - } + while (copyCoalesceWorkList()) + /* empty */ ; } void RegisterCoalescer::releaseMemory() { - JoinedCopies.clear(); - ReMatCopies.clear(); - ReMatDefs.clear(); + ErasedInstrs.clear(); + WorkList.clear(); + DeadDefs.clear(); + InflateRegs.clear(); } bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) { @@ -1814,138 +1588,11 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) { RegClassInfo.runOnMachineFunction(fn); // Join (coalesce) intervals if requested. - if (EnableJoining) { - joinIntervals(); - DEBUG({ - dbgs() << "********** INTERVALS POST JOINING **********\n"; - for (LiveIntervals::iterator I = LIS->begin(), E = LIS->end(); - I != E; ++I){ - I->second->print(dbgs(), TRI); - dbgs() << "\n"; - } - }); - } - - // Perform a final pass over the instructions and compute spill weights - // and remove identity moves. - SmallVector<unsigned, 4> DeadDefs, InflateRegs; - for (MachineFunction::iterator mbbi = MF->begin(), mbbe = MF->end(); - mbbi != mbbe; ++mbbi) { - MachineBasicBlock* mbb = mbbi; - for (MachineBasicBlock::iterator mii = mbb->begin(), mie = mbb->end(); - mii != mie; ) { - MachineInstr *MI = mii; - if (JoinedCopies.count(MI)) { - // Delete all coalesced copies. - bool DoDelete = true; - assert(MI->isCopyLike() && "Unrecognized copy instruction"); - unsigned SrcReg = MI->getOperand(MI->isSubregToReg() ? 2 : 1).getReg(); - unsigned DstReg = MI->getOperand(0).getReg(); - - // Collect candidates for register class inflation. - if (TargetRegisterInfo::isVirtualRegister(SrcReg) && - RegClassInfo.isProperSubClass(MRI->getRegClass(SrcReg))) - InflateRegs.push_back(SrcReg); - if (TargetRegisterInfo::isVirtualRegister(DstReg) && - RegClassInfo.isProperSubClass(MRI->getRegClass(DstReg))) - InflateRegs.push_back(DstReg); - - if (TargetRegisterInfo::isPhysicalRegister(SrcReg) && - MI->getNumOperands() > 2) - // Do not delete extract_subreg, insert_subreg of physical - // registers unless the definition is dead. e.g. - // %DO<def> = INSERT_SUBREG %D0<undef>, %S0<kill>, 1 - // or else the scavenger may complain. LowerSubregs will - // delete them later. - DoDelete = false; - - if (MI->allDefsAreDead()) { - if (TargetRegisterInfo::isVirtualRegister(SrcReg) && - LIS->hasInterval(SrcReg)) - LIS->shrinkToUses(&LIS->getInterval(SrcReg)); - DoDelete = true; - } - if (!DoDelete) { - // We need the instruction to adjust liveness, so make it a KILL. - if (MI->isSubregToReg()) { - MI->RemoveOperand(3); - MI->RemoveOperand(1); - } - MI->setDesc(TII->get(TargetOpcode::KILL)); - mii = llvm::next(mii); - } else { - LIS->RemoveMachineInstrFromMaps(MI); - mii = mbbi->erase(mii); - ++numPeep; - } - continue; - } - - // Now check if this is a remat'ed def instruction which is now dead. - if (ReMatDefs.count(MI)) { - bool isDead = true; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg()) - continue; - unsigned Reg = MO.getReg(); - if (!Reg) - continue; - DeadDefs.push_back(Reg); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { - // Remat may also enable register class inflation. 
- if (RegClassInfo.isProperSubClass(MRI->getRegClass(Reg))) - InflateRegs.push_back(Reg); - } - if (MO.isDead()) - continue; - if (TargetRegisterInfo::isPhysicalRegister(Reg) || - !MRI->use_nodbg_empty(Reg)) { - isDead = false; - break; - } - } - if (isDead) { - while (!DeadDefs.empty()) { - unsigned DeadDef = DeadDefs.back(); - DeadDefs.pop_back(); - RemoveDeadDef(LIS->getInterval(DeadDef), MI); - } - LIS->RemoveMachineInstrFromMaps(mii); - mii = mbbi->erase(mii); - continue; - } else - DeadDefs.clear(); - } - - ++mii; - - // Check for now unnecessary kill flags. - if (LIS->isNotInMIMap(MI)) continue; - SlotIndex DefIdx = LIS->getInstructionIndex(MI).getRegSlot(); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg() || !MO.isKill()) continue; - unsigned reg = MO.getReg(); - if (!reg || !LIS->hasInterval(reg)) continue; - if (!LIS->getInterval(reg).killedAt(DefIdx)) { - MO.setIsKill(false); - continue; - } - // When leaving a kill flag on a physreg, check if any subregs should - // remain alive. - if (!TargetRegisterInfo::isPhysicalRegister(reg)) - continue; - for (const uint16_t *SR = TRI->getSubRegisters(reg); - unsigned S = *SR; ++SR) - if (LIS->hasInterval(S) && LIS->getInterval(S).liveAt(DefIdx)) - MI->addRegisterDefined(S, TRI); - } - } - } + if (EnableJoining) + joinAllIntervals(); // After deleting a lot of copies, register classes may be less constrained. - // Removing sub-register opreands may alow GR32_ABCD -> GR32 and DPR_VFP2 -> + // Removing sub-register operands may allow GR32_ABCD -> GR32 and DPR_VFP2 -> // DPR inflation. array_pod_sort(InflateRegs.begin(), InflateRegs.end()); InflateRegs.erase(std::unique(InflateRegs.begin(), InflateRegs.end()), diff --git a/lib/CodeGen/RegisterCoalescer.h b/lib/CodeGen/RegisterCoalescer.h index 310b933..8a6df98 100644 --- a/lib/CodeGen/RegisterCoalescer.h +++ b/lib/CodeGen/RegisterCoalescer.h @@ -26,7 +26,6 @@ namespace llvm { /// two registers can be coalesced, CoalescerPair can determine if a copy /// instruction would become an identity copy after coalescing. class CoalescerPair { - const TargetInstrInfo &TII; const TargetRegisterInfo &TRI; /// DstReg - The register that will be left after coalescing. It can be a @@ -36,10 +35,13 @@ namespace llvm { /// SrcReg - the virtual register that will be coalesced into dstReg. unsigned SrcReg; - /// subReg_ - The subregister index of srcReg in DstReg. It is possible the - /// coalesce SrcReg into a subreg of the larger DstReg when DstReg is a - /// virtual register. - unsigned SubIdx; + /// DstIdx - The sub-register index of the old DstReg in the new coalesced + /// register. + unsigned DstIdx; + + /// SrcIdx - The sub-register index of the old SrcReg in the new coalesced + /// register. + unsigned SrcIdx; /// Partial - True when the original copy was a partial subregister copy. bool Partial; @@ -52,12 +54,13 @@ namespace llvm { bool Flipped; /// NewRC - The register class of the coalesced register, or NULL if DstReg - /// is a physreg. + /// is a physreg. This register class may be a super-register of both + /// SrcReg and DstReg. 
const TargetRegisterClass *NewRC; public: - CoalescerPair(const TargetInstrInfo &tii, const TargetRegisterInfo &tri) - : TII(tii), TRI(tri), DstReg(0), SrcReg(0), SubIdx(0), + CoalescerPair(const TargetRegisterInfo &tri) + : TRI(tri), DstReg(0), SrcReg(0), DstIdx(0), SrcIdx(0), Partial(false), CrossClass(false), Flipped(false), NewRC(0) {} /// setRegisters - set registers to match the copy instruction MI. Return @@ -94,9 +97,13 @@ namespace llvm { /// getSrcReg - Return the virtual register that will be coalesced away. unsigned getSrcReg() const { return SrcReg; } - /// getSubIdx - Return the subregister index in DstReg that SrcReg will be - /// coalesced into, or 0. - unsigned getSubIdx() const { return SubIdx; } + /// getDstIdx - Return the subregister index that DstReg will be coalesced + /// into, or 0. + unsigned getDstIdx() const { return DstIdx; } + + /// getSrcIdx - Return the subregister index that SrcReg will be coalesced + /// into, or 0. + unsigned getSrcIdx() const { return SrcIdx; } /// getNewRC - Return the register class of the coalesced register. const TargetRegisterClass *getNewRC() const { return NewRC; } diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp new file mode 100644 index 0000000..43448c8 --- /dev/null +++ b/lib/CodeGen/RegisterPressure.cpp @@ -0,0 +1,841 @@ +//===-- RegisterPressure.cpp - Dynamic Register Pressure ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the RegisterPressure class which can be used to track +// MachineInstr level register pressure. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/RegisterPressure.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +/// Increase register pressure for each set impacted by this register class. +static void increaseSetPressure(std::vector<unsigned> &CurrSetPressure, + std::vector<unsigned> &MaxSetPressure, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) { + unsigned Weight = TRI->getRegClassWeight(RC).RegWeight; + for (const int *PSet = TRI->getRegClassPressureSets(RC); + *PSet != -1; ++PSet) { + CurrSetPressure[*PSet] += Weight; + if (&CurrSetPressure != &MaxSetPressure + && CurrSetPressure[*PSet] > MaxSetPressure[*PSet]) { + MaxSetPressure[*PSet] = CurrSetPressure[*PSet]; + } + } +} + +/// Decrease register pressure for each set impacted by this register class. +static void decreaseSetPressure(std::vector<unsigned> &CurrSetPressure, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) { + unsigned Weight = TRI->getRegClassWeight(RC).RegWeight; + for (const int *PSet = TRI->getRegClassPressureSets(RC); + *PSet != -1; ++PSet) { + assert(CurrSetPressure[*PSet] >= Weight && "register pressure underflow"); + CurrSetPressure[*PSet] -= Weight; + } +} + +/// Directly increase pressure only within this RegisterPressure result. 
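A simplified model of the two set-pressure helpers above, assuming the -1-terminated pressure-set lists that TargetRegisterInfo::getRegClassPressureSets returns and vectors sized to the number of pressure sets; the increase/decrease members that follow apply the same helpers directly to the stored maximum.

#include <vector>

static void bumpSets(std::vector<unsigned> &Curr, std::vector<unsigned> &Max,
                     const int *PSets, unsigned Weight) {
  for (; *PSets != -1; ++PSets) {
    Curr[*PSets] += Weight;
    if (Curr[*PSets] > Max[*PSets])
      Max[*PSets] = Curr[*PSets]; // maintain the high water mark
  }
}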
+void RegisterPressure::increase(const TargetRegisterClass *RC,
+                                const TargetRegisterInfo *TRI) {
+  increaseSetPressure(MaxSetPressure, MaxSetPressure, RC, TRI);
+}
+
+/// Directly decrease pressure only within this RegisterPressure result.
+void RegisterPressure::decrease(const TargetRegisterClass *RC,
+                                const TargetRegisterInfo *TRI) {
+  decreaseSetPressure(MaxSetPressure, RC, TRI);
+}
+
+void RegisterPressure::dump(const TargetRegisterInfo *TRI) {
+  dbgs() << "Live In: ";
+  for (unsigned i = 0, e = LiveInRegs.size(); i < e; ++i)
+    dbgs() << PrintReg(LiveInRegs[i], TRI) << " ";
+  dbgs() << '\n';
+  dbgs() << "Live Out: ";
+  for (unsigned i = 0, e = LiveOutRegs.size(); i < e; ++i)
+    dbgs() << PrintReg(LiveOutRegs[i], TRI) << " ";
+  dbgs() << '\n';
+  for (unsigned i = 0, e = MaxSetPressure.size(); i < e; ++i) {
+    if (MaxSetPressure[i] != 0)
+      dbgs() << TRI->getRegPressureSetName(i) << "=" << MaxSetPressure[i]
+             << '\n';
+  }
+}
+
+/// Increase the current pressure as impacted by these physical registers and
+/// bump the high water mark if needed.
+void RegPressureTracker::increasePhysRegPressure(ArrayRef<unsigned> Regs) {
+  for (unsigned I = 0, E = Regs.size(); I != E; ++I)
+    increaseSetPressure(CurrSetPressure, P.MaxSetPressure,
+                        TRI->getMinimalPhysRegClass(Regs[I]), TRI);
+}
+
+/// Simply decrease the current pressure as impacted by these physical
+/// registers.
+void RegPressureTracker::decreasePhysRegPressure(ArrayRef<unsigned> Regs) {
+  for (unsigned I = 0, E = Regs.size(); I != E; ++I)
+    decreaseSetPressure(CurrSetPressure, TRI->getMinimalPhysRegClass(Regs[I]),
+                        TRI);
+}
+
+/// Increase the current pressure as impacted by these virtual registers and
+/// bump the high water mark if needed.
+void RegPressureTracker::increaseVirtRegPressure(ArrayRef<unsigned> Regs) {
+  for (unsigned I = 0, E = Regs.size(); I != E; ++I)
+    increaseSetPressure(CurrSetPressure, P.MaxSetPressure,
+                        MRI->getRegClass(Regs[I]), TRI);
+}
+
+/// Simply decrease the current pressure as impacted by these virtual registers.
+void RegPressureTracker::decreaseVirtRegPressure(ArrayRef<unsigned> Regs) {
+  for (unsigned I = 0, E = Regs.size(); I != E; ++I)
+    decreaseSetPressure(CurrSetPressure, MRI->getRegClass(Regs[I]), TRI);
+}
+
+/// Clear the result so it can be used for another round of pressure tracking.
+void IntervalPressure::reset() {
+  TopIdx = BottomIdx = SlotIndex();
+  MaxSetPressure.clear();
+  LiveInRegs.clear();
+  LiveOutRegs.clear();
+}
+
+/// Clear the result so it can be used for another round of pressure tracking.
+void RegionPressure::reset() {
+  TopPos = BottomPos = MachineBasicBlock::const_iterator();
+  MaxSetPressure.clear();
+  LiveInRegs.clear();
+  LiveOutRegs.clear();
+}
+
+/// If the current top is greater than the next index, open it.
+/// We happen to need the SlotIndex for the next top for pressure update.
+void IntervalPressure::openTop(SlotIndex NextTop) {
+  if (TopIdx <= NextTop)
+    return;
+  TopIdx = SlotIndex();
+  LiveInRegs.clear();
+}
+
+/// If the current top is the previous instruction (before receding), open it.
+void RegionPressure::openTop(MachineBasicBlock::const_iterator PrevTop) {
+  if (TopPos != PrevTop)
+    return;
+  TopPos = MachineBasicBlock::const_iterator();
+  LiveInRegs.clear();
+}
+
+/// If the current bottom is not greater than the previous index, open it.
+void IntervalPressure::openBottom(SlotIndex PrevBottom) { + if (BottomIdx > PrevBottom) + return; + BottomIdx = SlotIndex(); + LiveInRegs.clear(); +} + +/// If the current bottom is the previous instr (before advancing), open it. +void RegionPressure::openBottom(MachineBasicBlock::const_iterator PrevBottom) { + if (BottomPos != PrevBottom) + return; + BottomPos = MachineBasicBlock::const_iterator(); + LiveInRegs.clear(); +} + +/// Setup the RegPressureTracker. +/// +/// TODO: Add support for pressure without LiveIntervals. +void RegPressureTracker::init(const MachineFunction *mf, + const RegisterClassInfo *rci, + const LiveIntervals *lis, + const MachineBasicBlock *mbb, + MachineBasicBlock::const_iterator pos) +{ + MF = mf; + TRI = MF->getTarget().getRegisterInfo(); + RCI = rci; + MRI = &MF->getRegInfo(); + MBB = mbb; + + if (RequireIntervals) { + assert(lis && "IntervalPressure requires LiveIntervals"); + LIS = lis; + } + + CurrPos = pos; + while (CurrPos != MBB->end() && CurrPos->isDebugValue()) + ++CurrPos; + + CurrSetPressure.assign(TRI->getNumRegPressureSets(), 0); + + if (RequireIntervals) + static_cast<IntervalPressure&>(P).reset(); + else + static_cast<RegionPressure&>(P).reset(); + P.MaxSetPressure = CurrSetPressure; + + LivePhysRegs.clear(); + LivePhysRegs.setUniverse(TRI->getNumRegs()); + LiveVirtRegs.clear(); + LiveVirtRegs.setUniverse(MRI->getNumVirtRegs()); +} + +/// Does this pressure result have a valid top position and live ins. +bool RegPressureTracker::isTopClosed() const { + if (RequireIntervals) + return static_cast<IntervalPressure&>(P).TopIdx.isValid(); + return (static_cast<RegionPressure&>(P).TopPos == + MachineBasicBlock::const_iterator()); +} + +/// Does this pressure result have a valid bottom position and live outs. +bool RegPressureTracker::isBottomClosed() const { + if (RequireIntervals) + return static_cast<IntervalPressure&>(P).BottomIdx.isValid(); + return (static_cast<RegionPressure&>(P).BottomPos == + MachineBasicBlock::const_iterator()); +} + +/// Set the boundary for the top of the region and summarize live ins. +void RegPressureTracker::closeTop() { + if (RequireIntervals) + static_cast<IntervalPressure&>(P).TopIdx = + LIS->getInstructionIndex(CurrPos).getRegSlot(); + else + static_cast<RegionPressure&>(P).TopPos = CurrPos; + + assert(P.LiveInRegs.empty() && "inconsistent max pressure result"); + P.LiveInRegs.reserve(LivePhysRegs.size() + LiveVirtRegs.size()); + P.LiveInRegs.append(LivePhysRegs.begin(), LivePhysRegs.end()); + for (SparseSet<unsigned>::const_iterator I = + LiveVirtRegs.begin(), E = LiveVirtRegs.end(); I != E; ++I) + P.LiveInRegs.push_back(*I); + std::sort(P.LiveInRegs.begin(), P.LiveInRegs.end()); + P.LiveInRegs.erase(std::unique(P.LiveInRegs.begin(), P.LiveInRegs.end()), + P.LiveInRegs.end()); +} + +/// Set the boundary for the bottom of the region and summarize live outs. 
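closeTop above, and closeBottom just below, merge the physical and virtual live registers into one vector and then deduplicate it with the standard sort/unique/erase idiom; a standalone sketch:

#include <algorithm>
#include <vector>

static void sortAndDedup(std::vector<unsigned> &Regs) {
  std::sort(Regs.begin(), Regs.end());
  Regs.erase(std::unique(Regs.begin(), Regs.end()), Regs.end());
}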
+void RegPressureTracker::closeBottom() {
+  if (RequireIntervals) {
+    if (CurrPos == MBB->end())
+      static_cast<IntervalPressure&>(P).BottomIdx = LIS->getMBBEndIdx(MBB);
+    else
+      static_cast<IntervalPressure&>(P).BottomIdx =
+        LIS->getInstructionIndex(CurrPos).getRegSlot();
+  } else
+    static_cast<RegionPressure&>(P).BottomPos = CurrPos;
+
+  assert(P.LiveOutRegs.empty() && "inconsistent max pressure result");
+  P.LiveOutRegs.reserve(LivePhysRegs.size() + LiveVirtRegs.size());
+  P.LiveOutRegs.append(LivePhysRegs.begin(), LivePhysRegs.end());
+  for (SparseSet<unsigned>::const_iterator I =
+         LiveVirtRegs.begin(), E = LiveVirtRegs.end(); I != E; ++I)
+    P.LiveOutRegs.push_back(*I);
+  std::sort(P.LiveOutRegs.begin(), P.LiveOutRegs.end());
+  P.LiveOutRegs.erase(std::unique(P.LiveOutRegs.begin(), P.LiveOutRegs.end()),
+                      P.LiveOutRegs.end());
+}
+
+/// Finalize the region boundaries and record live ins and live outs.
+void RegPressureTracker::closeRegion() {
+  if (!isTopClosed() && !isBottomClosed()) {
+    assert(LivePhysRegs.empty() && LiveVirtRegs.empty() &&
+           "no region boundary");
+    return;
+  }
+  if (!isBottomClosed())
+    closeBottom();
+  else if (!isTopClosed())
+    closeTop();
+  // If both top and bottom are closed, do nothing.
+}
+
+/// Return true if Reg aliases a register in Regs SparseSet.
+static bool hasRegAlias(unsigned Reg, SparseSet<unsigned> &Regs,
+                        const TargetRegisterInfo *TRI) {
+  assert(!TargetRegisterInfo::isVirtualRegister(Reg) && "only for physregs");
+  for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+    if (Regs.count(*AI))
+      return true;
+  return false;
+}
+
+/// Return an iterator to the register in the unsorted Regs SmallVector that
+/// aliases Reg, or Regs.end() if there is none. This is only valid for
+/// physical registers.
+static SmallVectorImpl<unsigned>::iterator
+findRegAlias(unsigned Reg, SmallVectorImpl<unsigned> &Regs,
+             const TargetRegisterInfo *TRI) {
+  for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+    SmallVectorImpl<unsigned>::iterator I =
+      std::find(Regs.begin(), Regs.end(), *AI);
+    if (I != Regs.end())
+      return I;
+  }
+  return Regs.end();
+}
+
+/// Find Reg in the Regs SmallVector, returning an iterator to the matching
+/// entry or Regs.end(). For virtual registers, do a linear search. For
+/// physical registers, also check for aliases.
+static SmallVectorImpl<unsigned>::iterator
+findReg(unsigned Reg, bool isVReg, SmallVectorImpl<unsigned> &Regs,
+        const TargetRegisterInfo *TRI) {
+  if (isVReg)
+    return std::find(Regs.begin(), Regs.end(), Reg);
+  return findRegAlias(Reg, Regs, TRI);
+}
+
+/// Collect this instruction's unique uses and defs into SmallVectors for
+/// processing defs and uses in order.
+template<bool isVReg>
+struct RegisterOperands {
+  SmallVector<unsigned, 8> Uses;
+  SmallVector<unsigned, 8> Defs;
+  SmallVector<unsigned, 8> DeadDefs;
+
+  /// Push this operand's register onto the correct vector.
+  void collect(const MachineOperand &MO, const TargetRegisterInfo *TRI) {
+    if (MO.readsReg()) {
+      if (findReg(MO.getReg(), isVReg, Uses, TRI) == Uses.end())
+        Uses.push_back(MO.getReg());
+    }
+    if (MO.isDef()) {
+      if (MO.isDead()) {
+        if (findReg(MO.getReg(), isVReg, DeadDefs, TRI) == DeadDefs.end())
+          DeadDefs.push_back(MO.getReg());
+      }
+      else {
+        if (findReg(MO.getReg(), isVReg, Defs, TRI) == Defs.end())
+          Defs.push_back(MO.getReg());
+      }
+    }
+  }
+};
+typedef RegisterOperands<false> PhysRegOperands;
+typedef RegisterOperands<true> VirtRegOperands;
+
+/// Collect physical and virtual register operands.
+static void collectOperands(const MachineInstr *MI, + PhysRegOperands &PhysRegOpers, + VirtRegOperands &VirtRegOpers, + const TargetRegisterInfo *TRI, + const RegisterClassInfo *RCI) { + for(ConstMIBundleOperands OperI(MI); OperI.isValid(); ++OperI) { + const MachineOperand &MO = *OperI; + if (!MO.isReg() || !MO.getReg()) + continue; + + if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) + VirtRegOpers.collect(MO, TRI); + else if (RCI->isAllocatable(MO.getReg())) + PhysRegOpers.collect(MO, TRI); + } + // Remove redundant physreg dead defs. + for (unsigned i = PhysRegOpers.DeadDefs.size(); i > 0; --i) { + unsigned Reg = PhysRegOpers.DeadDefs[i-1]; + if (findRegAlias(Reg, PhysRegOpers.Defs, TRI) != PhysRegOpers.Defs.end()) + PhysRegOpers.DeadDefs.erase(&PhysRegOpers.DeadDefs[i-1]); + } +} + +/// Force liveness of registers. +void RegPressureTracker::addLiveRegs(ArrayRef<unsigned> Regs) { + for (unsigned i = 0, e = Regs.size(); i != e; ++i) { + if (TargetRegisterInfo::isVirtualRegister(Regs[i])) { + if (LiveVirtRegs.insert(Regs[i]).second) + increaseVirtRegPressure(Regs[i]); + } + else { + if (!hasRegAlias(Regs[i], LivePhysRegs, TRI)) { + LivePhysRegs.insert(Regs[i]); + increasePhysRegPressure(Regs[i]); + } + } + } +} + +/// Add PhysReg to the live in set and increase max pressure. +void RegPressureTracker::discoverPhysLiveIn(unsigned Reg) { + assert(!LivePhysRegs.count(Reg) && "avoid bumping max pressure twice"); + if (findRegAlias(Reg, P.LiveInRegs, TRI) != P.LiveInRegs.end()) + return; + + // At live in discovery, unconditionally increase the high water mark. + P.LiveInRegs.push_back(Reg); + P.increase(TRI->getMinimalPhysRegClass(Reg), TRI); +} + +/// Add PhysReg to the live out set and increase max pressure. +void RegPressureTracker::discoverPhysLiveOut(unsigned Reg) { + assert(!LivePhysRegs.count(Reg) && "avoid bumping max pressure twice"); + if (findRegAlias(Reg, P.LiveOutRegs, TRI) != P.LiveOutRegs.end()) + return; + + // At live out discovery, unconditionally increase the high water mark. + P.LiveOutRegs.push_back(Reg); + P.increase(TRI->getMinimalPhysRegClass(Reg), TRI); +} + +/// Add VirtReg to the live in set and increase max pressure. +void RegPressureTracker::discoverVirtLiveIn(unsigned Reg) { + assert(!LiveVirtRegs.count(Reg) && "avoid bumping max pressure twice"); + if (std::find(P.LiveInRegs.begin(), P.LiveInRegs.end(), Reg) != + P.LiveInRegs.end()) + return; + + // At live in discovery, unconditionally increase the high water mark. + P.LiveInRegs.push_back(Reg); + P.increase(MRI->getRegClass(Reg), TRI); +} + +/// Add VirtReg to the live out set and increase max pressure. +void RegPressureTracker::discoverVirtLiveOut(unsigned Reg) { + assert(!LiveVirtRegs.count(Reg) && "avoid bumping max pressure twice"); + if (std::find(P.LiveOutRegs.begin(), P.LiveOutRegs.end(), Reg) != + P.LiveOutRegs.end()) + return; + + // At live out discovery, unconditionally increase the high water mark. + P.LiveOutRegs.push_back(Reg); + P.increase(MRI->getRegClass(Reg), TRI); +} + +/// Recede across the previous instruction. +bool RegPressureTracker::recede() { + // Check for the top of the analyzable region. + if (CurrPos == MBB->begin()) { + closeRegion(); + return false; + } + if (!isBottomClosed()) + closeBottom(); + + // Open the top of the region using block iterators. + if (!RequireIntervals && isTopClosed()) + static_cast<RegionPressure&>(P).openTop(CurrPos); + + // Find the previous instruction. 
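The do/while immediately below first backs up one position and then keeps receding over debug values. The same shape over a plain container, as a hedged sketch where the predicate stands in for isDebugValue():

#include <vector>

static std::vector<int>::iterator
recedeSkipping(std::vector<int> &V, std::vector<int>::iterator It,
               bool (*Skip)(int)) {
  // The caller has already ruled out It == V.begin(), mirroring the
  // CurrPos == MBB->begin() check above.
  do
    --It;
  while (It != V.begin() && Skip(*It));
  return It; // may still satisfy Skip() if only skippable entries remain
}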
+ do + --CurrPos; + while (CurrPos != MBB->begin() && CurrPos->isDebugValue()); + + if (CurrPos->isDebugValue()) { + closeRegion(); + return false; + } + SlotIndex SlotIdx; + if (RequireIntervals) + SlotIdx = LIS->getInstructionIndex(CurrPos).getRegSlot(); + + // Open the top of the region using slot indexes. + if (RequireIntervals && isTopClosed()) + static_cast<IntervalPressure&>(P).openTop(SlotIdx); + + PhysRegOperands PhysRegOpers; + VirtRegOperands VirtRegOpers; + collectOperands(CurrPos, PhysRegOpers, VirtRegOpers, TRI, RCI); + + // Boost pressure for all dead defs together. + increasePhysRegPressure(PhysRegOpers.DeadDefs); + increaseVirtRegPressure(VirtRegOpers.DeadDefs); + decreasePhysRegPressure(PhysRegOpers.DeadDefs); + decreaseVirtRegPressure(VirtRegOpers.DeadDefs); + + // Kill liveness at live defs. + // TODO: consider earlyclobbers? + for (unsigned i = 0, e = PhysRegOpers.Defs.size(); i < e; ++i) { + unsigned Reg = PhysRegOpers.Defs[i]; + if (LivePhysRegs.erase(Reg)) + decreasePhysRegPressure(Reg); + else + discoverPhysLiveOut(Reg); + } + for (unsigned i = 0, e = VirtRegOpers.Defs.size(); i < e; ++i) { + unsigned Reg = VirtRegOpers.Defs[i]; + if (LiveVirtRegs.erase(Reg)) + decreaseVirtRegPressure(Reg); + else + discoverVirtLiveOut(Reg); + } + + // Generate liveness for uses. + for (unsigned i = 0, e = PhysRegOpers.Uses.size(); i < e; ++i) { + unsigned Reg = PhysRegOpers.Uses[i]; + if (!hasRegAlias(Reg, LivePhysRegs, TRI)) { + increasePhysRegPressure(Reg); + LivePhysRegs.insert(Reg); + } + } + for (unsigned i = 0, e = VirtRegOpers.Uses.size(); i < e; ++i) { + unsigned Reg = VirtRegOpers.Uses[i]; + if (!LiveVirtRegs.count(Reg)) { + // Adjust liveouts if LiveIntervals are available. + if (RequireIntervals) { + const LiveInterval *LI = &LIS->getInterval(Reg); + if (!LI->killedAt(SlotIdx)) + discoverVirtLiveOut(Reg); + } + increaseVirtRegPressure(Reg); + LiveVirtRegs.insert(Reg); + } + } + return true; +} + +/// Advance across the current instruction. +bool RegPressureTracker::advance() { + // Check for the bottom of the analyzable region. + if (CurrPos == MBB->end()) { + closeRegion(); + return false; + } + if (!isTopClosed()) + closeTop(); + + SlotIndex SlotIdx; + if (RequireIntervals) + SlotIdx = LIS->getInstructionIndex(CurrPos).getRegSlot(); + + // Open the bottom of the region using slot indexes. + if (isBottomClosed()) { + if (RequireIntervals) + static_cast<IntervalPressure&>(P).openBottom(SlotIdx); + else + static_cast<RegionPressure&>(P).openBottom(CurrPos); + } + + PhysRegOperands PhysRegOpers; + VirtRegOperands VirtRegOpers; + collectOperands(CurrPos, PhysRegOpers, VirtRegOpers, TRI, RCI); + + // Kill liveness at last uses. + for (unsigned i = 0, e = PhysRegOpers.Uses.size(); i < e; ++i) { + unsigned Reg = PhysRegOpers.Uses[i]; + if (!hasRegAlias(Reg, LivePhysRegs, TRI)) + discoverPhysLiveIn(Reg); + else { + // Allocatable physregs are always single-use before regalloc. + decreasePhysRegPressure(Reg); + LivePhysRegs.erase(Reg); + } + } + for (unsigned i = 0, e = VirtRegOpers.Uses.size(); i < e; ++i) { + unsigned Reg = VirtRegOpers.Uses[i]; + if (RequireIntervals) { + const LiveInterval *LI = &LIS->getInterval(Reg); + if (LI->killedAt(SlotIdx)) { + if (LiveVirtRegs.erase(Reg)) + decreaseVirtRegPressure(Reg); + else + discoverVirtLiveIn(Reg); + } + } + else if (!LiveVirtRegs.count(Reg)) { + discoverVirtLiveIn(Reg); + increaseVirtRegPressure(Reg); + } + } + + // Generate liveness for defs. 
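One idiom worth spelling out before the def loops that follow: the paired increase/decrease over DeadDefs, used in recede() above and again near the end of advance(), looks like a no-op but deliberately records a transient spike in the high water mark only. In miniature:

static void spikeMaxOnly(unsigned &Curr, unsigned &Max, unsigned Weight) {
  Curr += Weight;
  if (Curr > Max)
    Max = Curr;   // the transient spike is captured here
  Curr -= Weight; // current pressure is rolled back unchanged
}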
+ for (unsigned i = 0, e = PhysRegOpers.Defs.size(); i < e; ++i) { + unsigned Reg = PhysRegOpers.Defs[i]; + if (!hasRegAlias(Reg, LivePhysRegs, TRI)) { + increasePhysRegPressure(Reg); + LivePhysRegs.insert(Reg); + } + } + for (unsigned i = 0, e = VirtRegOpers.Defs.size(); i < e; ++i) { + unsigned Reg = VirtRegOpers.Defs[i]; + if (LiveVirtRegs.insert(Reg).second) + increaseVirtRegPressure(Reg); + } + + // Boost pressure for all dead defs together. + increasePhysRegPressure(PhysRegOpers.DeadDefs); + increaseVirtRegPressure(VirtRegOpers.DeadDefs); + decreasePhysRegPressure(PhysRegOpers.DeadDefs); + decreaseVirtRegPressure(VirtRegOpers.DeadDefs); + + // Find the next instruction. + do + ++CurrPos; + while (CurrPos != MBB->end() && CurrPos->isDebugValue()); + return true; +} + +/// Find the max change in excess pressure across all sets. +static void computeExcessPressureDelta(ArrayRef<unsigned> OldPressureVec, + ArrayRef<unsigned> NewPressureVec, + RegPressureDelta &Delta, + const TargetRegisterInfo *TRI) { + int ExcessUnits = 0; + unsigned PSetID = ~0U; + for (unsigned i = 0, e = OldPressureVec.size(); i < e; ++i) { + unsigned POld = OldPressureVec[i]; + unsigned PNew = NewPressureVec[i]; + int PDiff = (int)PNew - (int)POld; + if (!PDiff) // No change in this set in the common case. + continue; + // Only consider change beyond the limit. + unsigned Limit = TRI->getRegPressureSetLimit(i); + if (Limit > POld) { + if (Limit > PNew) + PDiff = 0; // Under the limit + else + PDiff = PNew - Limit; // Just exceeded limit. + } + else if (Limit > PNew) + PDiff = Limit - POld; // Just obeyed limit. + + if (std::abs(PDiff) > std::abs(ExcessUnits)) { + ExcessUnits = PDiff; + PSetID = i; + } + } + Delta.Excess.PSetID = PSetID; + Delta.Excess.UnitIncrease = ExcessUnits; +} + +/// Find the max change in max pressure that either surpasses a critical PSet +/// limit or exceeds the current MaxPressureLimit. +/// +/// FIXME: comparing each element of the old and new MaxPressure vectors here is +/// silly. It's done now to demonstrate the concept but will go away with a +/// RegPressureTracker API change to work with pressure differences. +static void computeMaxPressureDelta(ArrayRef<unsigned> OldMaxPressureVec, + ArrayRef<unsigned> NewMaxPressureVec, + ArrayRef<PressureElement> CriticalPSets, + ArrayRef<unsigned> MaxPressureLimit, + RegPressureDelta &Delta) { + Delta.CriticalMax = PressureElement(); + Delta.CurrentMax = PressureElement(); + + unsigned CritIdx = 0, CritEnd = CriticalPSets.size(); + for (unsigned i = 0, e = OldMaxPressureVec.size(); i < e; ++i) { + unsigned POld = OldMaxPressureVec[i]; + unsigned PNew = NewMaxPressureVec[i]; + if (PNew == POld) // No change in this set in the common case. + continue; + + while (CritIdx != CritEnd && CriticalPSets[CritIdx].PSetID < i) + ++CritIdx; + + if (CritIdx != CritEnd && CriticalPSets[CritIdx].PSetID == i) { + int PDiff = (int)PNew - (int)CriticalPSets[CritIdx].UnitIncrease; + if (PDiff > Delta.CriticalMax.UnitIncrease) { + Delta.CriticalMax.PSetID = i; + Delta.CriticalMax.UnitIncrease = PDiff; + } + } + + // Find the greatest increase above MaxPressureLimit. + // (Ignores negative MDiff). + int MDiff = (int)PNew - (int)MaxPressureLimit[i]; + if (MDiff > Delta.CurrentMax.UnitIncrease) { + Delta.CurrentMax.PSetID = i; + Delta.CurrentMax.UnitIncrease = PNew; + } + } +} + +/// Record the upward impact of a single instruction on current register +/// pressure. Unlike the advance/recede pressure tracking interface, this does +/// not discover live in/outs. 
+///
+/// This is intended for speculative queries. It leaves pressure inconsistent
+/// with the current position, so must be restored by the caller.
+void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
+  // Account for register pressure similar to RegPressureTracker::recede().
+  PhysRegOperands PhysRegOpers;
+  VirtRegOperands VirtRegOpers;
+  collectOperands(MI, PhysRegOpers, VirtRegOpers, TRI, RCI);
+
+  // Boost max pressure for all dead defs together. Since CurrSetPressure is
+  // restored by the matching decreases below, only the MaxSetPressure high
+  // water mark records the dead defs.
+  increasePhysRegPressure(PhysRegOpers.DeadDefs);
+  increaseVirtRegPressure(VirtRegOpers.DeadDefs);
+  decreasePhysRegPressure(PhysRegOpers.DeadDefs);
+  decreaseVirtRegPressure(VirtRegOpers.DeadDefs);
+
+  // Kill liveness at live defs.
+  decreasePhysRegPressure(PhysRegOpers.Defs);
+  decreaseVirtRegPressure(VirtRegOpers.Defs);
+
+  // Generate liveness for uses.
+  for (unsigned i = 0, e = PhysRegOpers.Uses.size(); i < e; ++i) {
+    unsigned Reg = PhysRegOpers.Uses[i];
+    if (!hasRegAlias(Reg, LivePhysRegs, TRI))
+      increasePhysRegPressure(Reg);
+  }
+  for (unsigned i = 0, e = VirtRegOpers.Uses.size(); i < e; ++i) {
+    unsigned Reg = VirtRegOpers.Uses[i];
+    if (!LiveVirtRegs.count(Reg))
+      increaseVirtRegPressure(Reg);
+  }
+}
+
+/// Consider the pressure increase caused by traversing this instruction
+/// bottom-up. Find the pressure set with the most change beyond its pressure
+/// limit based on the tracker's current pressure, and return the change in
+/// number of register units of that pressure set introduced by this
+/// instruction.
+///
+/// This assumes that the current LiveOut set is sufficient.
+///
+/// FIXME: This is expensive for an on-the-fly query. We need to cache the
+/// result per-SUnit with enough information to adjust for the current
+/// scheduling position. But this works as a proof of concept.
+void RegPressureTracker::
+getMaxUpwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta,
+                          ArrayRef<PressureElement> CriticalPSets,
+                          ArrayRef<unsigned> MaxPressureLimit) {
+  // Snapshot Pressure.
+  // FIXME: The snapshot heap space should persist. But I'm planning to
+  // summarize the pressure effect so we don't need to snapshot at all.
+  std::vector<unsigned> SavedPressure = CurrSetPressure;
+  std::vector<unsigned> SavedMaxPressure = P.MaxSetPressure;
+
+  bumpUpwardPressure(MI);
+
+  computeExcessPressureDelta(SavedPressure, CurrSetPressure, Delta, TRI);
+  computeMaxPressureDelta(SavedMaxPressure, P.MaxSetPressure, CriticalPSets,
+                          MaxPressureLimit, Delta);
+  assert(Delta.CriticalMax.UnitIncrease >= 0 &&
+         Delta.CurrentMax.UnitIncrease >= 0 && "cannot decrease max pressure");
+
+  // Restore the tracker's state.
+  P.MaxSetPressure.swap(SavedMaxPressure);
+  CurrSetPressure.swap(SavedPressure);
+}
+
+/// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx).
+static bool findUseBetween(unsigned Reg,
+                           SlotIndex PriorUseIdx, SlotIndex NextUseIdx,
+                           const MachineRegisterInfo *MRI,
+                           const LiveIntervals *LIS) {
+  for (MachineRegisterInfo::use_nodbg_iterator
+         UI = MRI->use_nodbg_begin(Reg), UE = MRI->use_nodbg_end();
+       UI != UE; UI.skipInstruction()) {
+    const MachineInstr* MI = &*UI;
+    SlotIndex InstSlot = LIS->getInstructionIndex(MI).getRegSlot();
+    if (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx)
+      return true;
+  }
+  return false;
+}
+
+/// Record the downward impact of a single instruction on current register
+/// pressure. Unlike the advance/recede pressure tracking interface, this does
+/// not discover live in/outs.
+/// +/// This is intended for speculative queries. It leaves pressure inconsistent +/// with the current position, so must be restored by the caller. +void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) { + // Account for register pressure similar to RegPressureTracker::recede(). + PhysRegOperands PhysRegOpers; + VirtRegOperands VirtRegOpers; + collectOperands(MI, PhysRegOpers, VirtRegOpers, TRI, RCI); + + // Kill liveness at last uses. Assume allocatable physregs are single-use + // rather than checking LiveIntervals. + decreasePhysRegPressure(PhysRegOpers.Uses); + if (RequireIntervals) { + SlotIndex SlotIdx = LIS->getInstructionIndex(MI).getRegSlot(); + for (unsigned i = 0, e = VirtRegOpers.Uses.size(); i < e; ++i) { + unsigned Reg = VirtRegOpers.Uses[i]; + const LiveInterval *LI = &LIS->getInterval(Reg); + // FIXME: allow the caller to pass in the list of vreg uses that remain to + // be bottom-scheduled to avoid searching uses at each query. + SlotIndex CurrIdx = LIS->getInstructionIndex(CurrPos).getRegSlot(); + if (LI->killedAt(SlotIdx) + && !findUseBetween(Reg, CurrIdx, SlotIdx, MRI, LIS)) { + decreaseVirtRegPressure(Reg); + } + } + } + + // Generate liveness for defs. + increasePhysRegPressure(PhysRegOpers.Defs); + increaseVirtRegPressure(VirtRegOpers.Defs); + + // Boost pressure for all dead defs together. + increasePhysRegPressure(PhysRegOpers.DeadDefs); + increaseVirtRegPressure(VirtRegOpers.DeadDefs); + decreasePhysRegPressure(PhysRegOpers.DeadDefs); + decreaseVirtRegPressure(VirtRegOpers.DeadDefs); +} + +/// Consider the pressure increase caused by traversing this instruction +/// top-down. Find the register class with the most change in its pressure limit +/// based on the tracker's current pressure, and return the number of excess +/// register units of that pressure set introduced by this instruction. +/// +/// This assumes that the current LiveIn set is sufficient. +void RegPressureTracker:: +getMaxDownwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta, + ArrayRef<PressureElement> CriticalPSets, + ArrayRef<unsigned> MaxPressureLimit) { + // Snapshot Pressure. + std::vector<unsigned> SavedPressure = CurrSetPressure; + std::vector<unsigned> SavedMaxPressure = P.MaxSetPressure; + + bumpDownwardPressure(MI); + + computeExcessPressureDelta(SavedPressure, CurrSetPressure, Delta, TRI); + computeMaxPressureDelta(SavedMaxPressure, P.MaxSetPressure, CriticalPSets, + MaxPressureLimit, Delta); + assert(Delta.CriticalMax.UnitIncrease >= 0 && + Delta.CurrentMax.UnitIncrease >= 0 && "cannot decrease max pressure"); + + // Restore the tracker's state. + P.MaxSetPressure.swap(SavedMaxPressure); + CurrSetPressure.swap(SavedPressure); +} + +/// Get the pressure of each PSet after traversing this instruction bottom-up. +void RegPressureTracker:: +getUpwardPressure(const MachineInstr *MI, + std::vector<unsigned> &PressureResult, + std::vector<unsigned> &MaxPressureResult) { + // Snapshot pressure. + PressureResult = CurrSetPressure; + MaxPressureResult = P.MaxSetPressure; + + bumpUpwardPressure(MI); + + // Current pressure becomes the result. Restore current pressure. + P.MaxSetPressure.swap(MaxPressureResult); + CurrSetPressure.swap(PressureResult); +} + +/// Get the pressure of each PSet after traversing this instruction top-down. +void RegPressureTracker:: +getDownwardPressure(const MachineInstr *MI, + std::vector<unsigned> &PressureResult, + std::vector<unsigned> &MaxPressureResult) { + // Snapshot pressure. 
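The snapshot/bump/swap pattern completed just below, and shared by the pressure query methods above, in miniature: copy the state, mutate it speculatively, then swap so the caller receives the mutated values while the tracker gets its original state back without another copy. A hedged sketch:

#include <vector>

static std::vector<unsigned> queryAfterBump(std::vector<unsigned> &State) {
  std::vector<unsigned> Result = State; // snapshot (assumes State non-empty)
  State[0] += 1;                        // stand-in for bumpDownwardPressure()
  Result.swap(State);                   // Result: bumped, State: restored
  return Result;
}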
+ PressureResult = CurrSetPressure; + MaxPressureResult = P.MaxSetPressure; + + bumpDownwardPressure(MI); + + // Current pressure becomes the result. Restore current pressure. + P.MaxSetPressure.swap(MaxPressureResult); + CurrSetPressure.swap(PressureResult); +} diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp index 03bd82e..d673794 100644 --- a/lib/CodeGen/RegisterScavenging.cpp +++ b/lib/CodeGen/RegisterScavenging.cpp @@ -37,16 +37,13 @@ using namespace llvm; void RegScavenger::setUsed(unsigned Reg) { RegsAvailable.reset(Reg); - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) - RegsAvailable.reset(SubReg); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + RegsAvailable.reset(*SubRegs); } bool RegScavenger::isAliasUsed(unsigned Reg) const { - if (isUsed(Reg)) - return true; - for (const uint16_t *R = TRI->getAliasSet(Reg); *R; ++R) - if (isUsed(*R)) + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + if (isUsed(*AI)) return true; return false; } @@ -114,8 +111,8 @@ void RegScavenger::enterBasicBlock(MachineBasicBlock *mbb) { void RegScavenger::addRegWithSubRegs(BitVector &BV, unsigned Reg) { BV.set(Reg); - for (const uint16_t *R = TRI->getSubRegisters(Reg); *R; R++) - BV.set(*R); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + BV.set(*SubRegs); } void RegScavenger::forward() { @@ -195,9 +192,8 @@ void RegScavenger::forward() { // Ideally we would like a way to model this, but leaving the // insert_subreg around causes both correctness and performance issues. bool SubUsed = false; - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) - if (isUsed(SubReg)) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + if (isUsed(*SubRegs)) { SubUsed = true; break; } @@ -296,9 +292,8 @@ unsigned RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI, isVirtKillInsn = true; continue; } - Candidates.reset(MO.getReg()); - for (const uint16_t *R = TRI->getAliasSet(MO.getReg()); *R; R++) - Candidates.reset(*R); + for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); ++AI) + Candidates.reset(*AI); } // If we're not in a virtual reg's live range, this is a valid // restore point. diff --git a/lib/CodeGen/RenderMachineFunction.cpp b/lib/CodeGen/RenderMachineFunction.cpp deleted file mode 100644 index 6020908..0000000 --- a/lib/CodeGen/RenderMachineFunction.cpp +++ /dev/null @@ -1,1013 +0,0 @@ -//===-- llvm/CodeGen/RenderMachineFunction.cpp - MF->HTML -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "rendermf" - -#include "RenderMachineFunction.h" - -#include "VirtRegMap.h" - -#include "llvm/Function.h" -#include "llvm/Module.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" - -#include <sstream> - -using namespace llvm; - -char RenderMachineFunction::ID = 0; -INITIALIZE_PASS_BEGIN(RenderMachineFunction, "rendermf", - "Render machine functions (and related info) to HTML pages", - false, false) -INITIALIZE_PASS_DEPENDENCY(SlotIndexes) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_END(RenderMachineFunction, "rendermf", - "Render machine functions (and related info) to HTML pages", - false, false) - -static cl::opt<std::string> -outputFileSuffix("rmf-file-suffix", - cl::desc("Appended to function name to get output file name " - "(default: \".html\")"), - cl::init(".html"), cl::Hidden); - -static cl::opt<std::string> -machineFuncsToRender("rmf-funcs", - cl::desc("Comma separated list of functions to render" - ", or \"*\"."), - cl::init(""), cl::Hidden); - -static cl::opt<std::string> -pressureClasses("rmf-classes", - cl::desc("Register classes to render pressure for."), - cl::init(""), cl::Hidden); - -static cl::opt<std::string> -showIntervals("rmf-intervals", - cl::desc("Live intervals to show alongside code."), - cl::init(""), cl::Hidden); - -static cl::opt<bool> -filterEmpty("rmf-filter-empty-intervals", - cl::desc("Don't display empty intervals."), - cl::init(true), cl::Hidden); - -static cl::opt<bool> -showEmptyIndexes("rmf-empty-indexes", - cl::desc("Render indexes not associated with instructions or " - "MBB starts."), - cl::init(false), cl::Hidden); - -static cl::opt<bool> -useFancyVerticals("rmf-fancy-verts", - cl::desc("Use SVG for vertical text."), - cl::init(true), cl::Hidden); - -static cl::opt<bool> -prettyHTML("rmf-pretty-html", - cl::desc("Pretty print HTML. 
For debugging the renderer only.."), - cl::init(false), cl::Hidden); - - -namespace llvm { - - bool MFRenderingOptions::renderingOptionsProcessed; - std::set<std::string> MFRenderingOptions::mfNamesToRender; - bool MFRenderingOptions::renderAllMFs = false; - - std::set<std::string> MFRenderingOptions::classNamesToRender; - bool MFRenderingOptions::renderAllClasses = false; - - std::set<std::pair<unsigned, unsigned> > - MFRenderingOptions::intervalNumsToRender; - unsigned MFRenderingOptions::intervalTypesToRender = ExplicitOnly; - - template <typename OutputItr> - void MFRenderingOptions::splitComaSeperatedList(const std::string &s, - OutputItr outItr) { - std::string::const_iterator curPos = s.begin(); - std::string::const_iterator nextComa = std::find(curPos, s.end(), ','); - while (nextComa != s.end()) { - std::string elem; - std::copy(curPos, nextComa, std::back_inserter(elem)); - *outItr = elem; - ++outItr; - curPos = llvm::next(nextComa); - nextComa = std::find(curPos, s.end(), ','); - } - - if (curPos != s.end()) { - std::string elem; - std::copy(curPos, s.end(), std::back_inserter(elem)); - *outItr = elem; - ++outItr; - } - } - - void MFRenderingOptions::processOptions() { - if (!renderingOptionsProcessed) { - processFuncNames(); - processRegClassNames(); - processIntervalNumbers(); - renderingOptionsProcessed = true; - } - } - - void MFRenderingOptions::processFuncNames() { - if (machineFuncsToRender == "*") { - renderAllMFs = true; - } else { - splitComaSeperatedList(machineFuncsToRender, - std::inserter(mfNamesToRender, - mfNamesToRender.begin())); - } - } - - void MFRenderingOptions::processRegClassNames() { - if (pressureClasses == "*") { - renderAllClasses = true; - } else { - splitComaSeperatedList(pressureClasses, - std::inserter(classNamesToRender, - classNamesToRender.begin())); - } - } - - void MFRenderingOptions::processIntervalNumbers() { - std::set<std::string> intervalRanges; - splitComaSeperatedList(showIntervals, - std::inserter(intervalRanges, - intervalRanges.begin())); - std::for_each(intervalRanges.begin(), intervalRanges.end(), - processIntervalRange); - } - - void MFRenderingOptions::processIntervalRange( - const std::string &intervalRangeStr) { - if (intervalRangeStr == "*") { - intervalTypesToRender |= All; - } else if (intervalRangeStr == "virt-nospills*") { - intervalTypesToRender |= VirtNoSpills; - } else if (intervalRangeStr == "spills*") { - intervalTypesToRender |= VirtSpills; - } else if (intervalRangeStr == "virt*") { - intervalTypesToRender |= AllVirt; - } else if (intervalRangeStr == "phys*") { - intervalTypesToRender |= AllPhys; - } else { - std::istringstream iss(intervalRangeStr); - unsigned reg1, reg2; - if ((iss >> reg1 >> std::ws)) { - if (iss.eof()) { - intervalNumsToRender.insert(std::make_pair(reg1, reg1 + 1)); - } else { - char c; - iss >> c; - if (c == '-' && (iss >> reg2)) { - intervalNumsToRender.insert(std::make_pair(reg1, reg2 + 1)); - } else { - dbgs() << "Warning: Invalid interval range \"" - << intervalRangeStr << "\" in -rmf-intervals. Skipping.\n"; - } - } - } else { - dbgs() << "Warning: Invalid interval number \"" - << intervalRangeStr << "\" in -rmf-intervals. 
Skipping.\n"; - } - } - } - - void MFRenderingOptions::setup(MachineFunction *mf, - const TargetRegisterInfo *tri, - LiveIntervals *lis, - const RenderMachineFunction *rmf) { - this->mf = mf; - this->tri = tri; - this->lis = lis; - this->rmf = rmf; - - clear(); - } - - void MFRenderingOptions::clear() { - regClassesTranslatedToCurrentFunction = false; - regClassSet.clear(); - - intervalsTranslatedToCurrentFunction = false; - intervalSet.clear(); - } - - void MFRenderingOptions::resetRenderSpecificOptions() { - intervalSet.clear(); - intervalsTranslatedToCurrentFunction = false; - } - - bool MFRenderingOptions::shouldRenderCurrentMachineFunction() const { - processOptions(); - - return (renderAllMFs || - mfNamesToRender.find(mf->getFunction()->getName()) != - mfNamesToRender.end()); - } - - const MFRenderingOptions::RegClassSet& MFRenderingOptions::regClasses() const{ - translateRegClassNamesToCurrentFunction(); - return regClassSet; - } - - const MFRenderingOptions::IntervalSet& MFRenderingOptions::intervals() const { - translateIntervalNumbersToCurrentFunction(); - return intervalSet; - } - - bool MFRenderingOptions::renderEmptyIndexes() const { - return showEmptyIndexes; - } - - bool MFRenderingOptions::fancyVerticals() const { - return useFancyVerticals; - } - - void MFRenderingOptions::translateRegClassNamesToCurrentFunction() const { - if (!regClassesTranslatedToCurrentFunction) { - processOptions(); - for (TargetRegisterInfo::regclass_iterator rcItr = tri->regclass_begin(), - rcEnd = tri->regclass_end(); - rcItr != rcEnd; ++rcItr) { - const TargetRegisterClass *trc = *rcItr; - if (renderAllClasses || - classNamesToRender.find(trc->getName()) != - classNamesToRender.end()) { - regClassSet.insert(trc); - } - } - regClassesTranslatedToCurrentFunction = true; - } - } - - void MFRenderingOptions::translateIntervalNumbersToCurrentFunction() const { - if (!intervalsTranslatedToCurrentFunction) { - processOptions(); - - // If we're not just doing explicit then do a copy over all matching - // types. - if (intervalTypesToRender != ExplicitOnly) { - for (LiveIntervals::iterator liItr = lis->begin(), liEnd = lis->end(); - liItr != liEnd; ++liItr) { - LiveInterval *li = liItr->second; - - if (filterEmpty && li->empty()) - continue; - - if ((TargetRegisterInfo::isPhysicalRegister(li->reg) && - (intervalTypesToRender & AllPhys))) { - intervalSet.insert(li); - } else if (TargetRegisterInfo::isVirtualRegister(li->reg)) { - if (((intervalTypesToRender & VirtNoSpills) && !rmf->isSpill(li)) || - ((intervalTypesToRender & VirtSpills) && rmf->isSpill(li))) { - intervalSet.insert(li); - } - } - } - } - - // If we need to process the explicit list... 
- if (intervalTypesToRender != All) { - for (std::set<std::pair<unsigned, unsigned> >::const_iterator - regRangeItr = intervalNumsToRender.begin(), - regRangeEnd = intervalNumsToRender.end(); - regRangeItr != regRangeEnd; ++regRangeItr) { - const std::pair<unsigned, unsigned> &range = *regRangeItr; - for (unsigned reg = range.first; reg != range.second; ++reg) { - if (lis->hasInterval(reg)) { - intervalSet.insert(&lis->getInterval(reg)); - } - } - } - } - - intervalsTranslatedToCurrentFunction = true; - } - } - - // ---------- TargetRegisterExtraInformation implementation ---------- - - TargetRegisterExtraInfo::TargetRegisterExtraInfo() - : mapsPopulated(false) { - } - - void TargetRegisterExtraInfo::setup(MachineFunction *mf, - MachineRegisterInfo *mri, - const TargetRegisterInfo *tri, - LiveIntervals *lis) { - this->mf = mf; - this->mri = mri; - this->tri = tri; - this->lis = lis; - } - - void TargetRegisterExtraInfo::reset() { - if (!mapsPopulated) { - initWorst(); - //initBounds(); - initCapacity(); - mapsPopulated = true; - } - - resetPressureAndLiveStates(); - } - - void TargetRegisterExtraInfo::clear() { - prWorst.clear(); - vrWorst.clear(); - capacityMap.clear(); - pressureMap.clear(); - //liveStatesMap.clear(); - mapsPopulated = false; - } - - void TargetRegisterExtraInfo::initWorst() { - assert(!mapsPopulated && prWorst.empty() && vrWorst.empty() && - "Worst map already initialised?"); - - // Start with the physical registers. - for (unsigned preg = 1; preg < tri->getNumRegs(); ++preg) { - WorstMapLine &pregLine = prWorst[preg]; - - for (TargetRegisterInfo::regclass_iterator rcItr = tri->regclass_begin(), - rcEnd = tri->regclass_end(); - rcItr != rcEnd; ++rcItr) { - const TargetRegisterClass *trc = *rcItr; - - unsigned numOverlaps = 0; - for (TargetRegisterClass::iterator rItr = trc->begin(), - rEnd = trc->end(); - rItr != rEnd; ++rItr) { - unsigned trcPReg = *rItr; - if (tri->regsOverlap(preg, trcPReg)) - ++numOverlaps; - } - - pregLine[trc] = numOverlaps; - } - } - - // Now the register classes. 
- for (TargetRegisterInfo::regclass_iterator rc1Itr = tri->regclass_begin(), - rcEnd = tri->regclass_end(); - rc1Itr != rcEnd; ++rc1Itr) { - const TargetRegisterClass *trc1 = *rc1Itr; - WorstMapLine &classLine = vrWorst[trc1]; - - for (TargetRegisterInfo::regclass_iterator rc2Itr = tri->regclass_begin(); - rc2Itr != rcEnd; ++rc2Itr) { - const TargetRegisterClass *trc2 = *rc2Itr; - - unsigned worst = 0; - - for (TargetRegisterClass::iterator trc1Itr = trc1->begin(), - trc1End = trc1->end(); - trc1Itr != trc1End; ++trc1Itr) { - unsigned trc1Reg = *trc1Itr; - unsigned trc1RegWorst = 0; - - for (TargetRegisterClass::iterator trc2Itr = trc2->begin(), - trc2End = trc2->end(); - trc2Itr != trc2End; ++trc2Itr) { - unsigned trc2Reg = *trc2Itr; - if (tri->regsOverlap(trc1Reg, trc2Reg)) - ++trc1RegWorst; - } - if (trc1RegWorst > worst) { - worst = trc1RegWorst; - } - } - - if (worst != 0) { - classLine[trc2] = worst; - } - } - } - } - - unsigned TargetRegisterExtraInfo::getWorst( - unsigned reg, - const TargetRegisterClass *trc) const { - const WorstMapLine *wml = 0; - if (TargetRegisterInfo::isPhysicalRegister(reg)) { - PRWorstMap::const_iterator prwItr = prWorst.find(reg); - assert(prwItr != prWorst.end() && "Missing prWorst entry."); - wml = &prwItr->second; - } else { - const TargetRegisterClass *regTRC = mri->getRegClass(reg); - VRWorstMap::const_iterator vrwItr = vrWorst.find(regTRC); - assert(vrwItr != vrWorst.end() && "Missing vrWorst entry."); - wml = &vrwItr->second; - } - - WorstMapLine::const_iterator wmlItr = wml->find(trc); - if (wmlItr == wml->end()) - return 0; - - return wmlItr->second; - } - - void TargetRegisterExtraInfo::initCapacity() { - assert(!mapsPopulated && capacityMap.empty() && - "Capacity map already initialised?"); - - for (TargetRegisterInfo::regclass_iterator rcItr = tri->regclass_begin(), - rcEnd = tri->regclass_end(); - rcItr != rcEnd; ++rcItr) { - const TargetRegisterClass *trc = *rcItr; - unsigned capacity = trc->getRawAllocationOrder(*mf).size(); - - if (capacity != 0) - capacityMap[trc] = capacity; - } - } - - unsigned TargetRegisterExtraInfo::getCapacity( - const TargetRegisterClass *trc) const { - CapacityMap::const_iterator cmItr = capacityMap.find(trc); - assert(cmItr != capacityMap.end() && - "vreg with unallocable register class"); - return cmItr->second; - } - - void TargetRegisterExtraInfo::resetPressureAndLiveStates() { - pressureMap.clear(); - //liveStatesMap.clear(); - - // Iterate over all slots. - - - // Iterate over all live intervals. - for (LiveIntervals::iterator liItr = lis->begin(), - liEnd = lis->end(); - liItr != liEnd; ++liItr) { - LiveInterval *li = liItr->second; - - if (TargetRegisterInfo::isPhysicalRegister(li->reg)) - continue; - - // For all ranges in the current interal. - for (LiveInterval::iterator lrItr = li->begin(), - lrEnd = li->end(); - lrItr != lrEnd; ++lrItr) { - LiveRange *lr = &*lrItr; - - // For all slots in the current range. - for (SlotIndex i = lr->start; i != lr->end; i = i.getNextSlot()) { - - // Record increased pressure at index for all overlapping classes. 
- for (TargetRegisterInfo::regclass_iterator - rcItr = tri->regclass_begin(), - rcEnd = tri->regclass_end(); - rcItr != rcEnd; ++rcItr) { - const TargetRegisterClass *trc = *rcItr; - - if (trc->getRawAllocationOrder(*mf).empty()) - continue; - - unsigned worstAtI = getWorst(li->reg, trc); - - if (worstAtI != 0) { - pressureMap[i][trc] += worstAtI; - } - } - } - } - } - } - - unsigned TargetRegisterExtraInfo::getPressureAtSlot( - const TargetRegisterClass *trc, - SlotIndex i) const { - PressureMap::const_iterator pmItr = pressureMap.find(i); - if (pmItr == pressureMap.end()) - return 0; - const PressureMapLine &pmLine = pmItr->second; - PressureMapLine::const_iterator pmlItr = pmLine.find(trc); - if (pmlItr == pmLine.end()) - return 0; - return pmlItr->second; - } - - bool TargetRegisterExtraInfo::classOverCapacityAtSlot( - const TargetRegisterClass *trc, - SlotIndex i) const { - return (getPressureAtSlot(trc, i) > getCapacity(trc)); - } - - // ---------- MachineFunctionRenderer implementation ---------- - - void RenderMachineFunction::Spacer::print(raw_ostream &os) const { - if (!prettyHTML) - return; - for (unsigned i = 0; i < ns; ++i) { - os << " "; - } - } - - RenderMachineFunction::Spacer RenderMachineFunction::s(unsigned ns) const { - return Spacer(ns); - } - - raw_ostream& operator<<(raw_ostream &os, const RenderMachineFunction::Spacer &s) { - s.print(os); - return os; - } - - template <typename Iterator> - std::string RenderMachineFunction::escapeChars(Iterator sBegin, Iterator sEnd) const { - std::string r; - - for (Iterator sItr = sBegin; sItr != sEnd; ++sItr) { - char c = *sItr; - - switch (c) { - case '<': r.append("<"); break; - case '>': r.append(">"); break; - case '&': r.append("&"); break; - case ' ': r.append(" "); break; - case '\"': r.append("""); break; - default: r.push_back(c); break; - } - } - - return r; - } - - RenderMachineFunction::LiveState - RenderMachineFunction::getLiveStateAt(const LiveInterval *li, - SlotIndex i) const { - const MachineInstr *mi = sis->getInstructionFromIndex(i); - - // For uses/defs recorded use/def indexes override current liveness and - // instruction operands (Only for the interval which records the indexes). - // FIXME: This is all wrong, uses and defs share the same slots. - if (i.isEarlyClobber() || i.isRegister()) { - UseDefs::const_iterator udItr = useDefs.find(li); - if (udItr != useDefs.end()) { - const SlotSet &slotSet = udItr->second; - if (slotSet.count(i)) { - if (i.isEarlyClobber()) { - return Used; - } - // else - return Defined; - } - } - } - - // If the slot is a load/store, or there's no info in the use/def set then - // use liveness and instruction operand info. - if (li->liveAt(i)) { - - if (mi == 0) { - if (vrm == 0 || - (vrm->getStackSlot(li->reg) == VirtRegMap::NO_STACK_SLOT)) { - return AliveReg; - } else { - return AliveStack; - } - } else { - if (i.isRegister() && mi->definesRegister(li->reg, tri)) { - return Defined; - } else if (i.isEarlyClobber() && mi->readsRegister(li->reg)) { - return Used; - } else { - if (vrm == 0 || - (vrm->getStackSlot(li->reg) == VirtRegMap::NO_STACK_SLOT)) { - return AliveReg; - } else { - return AliveStack; - } - } - } - } - return Dead; - } - - RenderMachineFunction::PressureState - RenderMachineFunction::getPressureStateAt(const TargetRegisterClass *trc, - SlotIndex i) const { - if (trei.getPressureAtSlot(trc, i) == 0) { - return Zero; - } else if (trei.classOverCapacityAtSlot(trc, i)){ - return High; - } - return Low; - } - - /// \brief Render a machine instruction. 
-  /// \brief Render a machine instruction.
-  void RenderMachineFunction::renderMachineInstr(raw_ostream &os,
-                                                 const MachineInstr *mi) const {
-    std::string s;
-    raw_string_ostream oss(s);
-    oss << *mi;
-
-    os << escapeChars(oss.str());
-  }
-
-  template <typename T>
-  void RenderMachineFunction::renderVertical(const Spacer &indent,
-                                             raw_ostream &os,
-                                             const T &t) const {
-    if (ro.fancyVerticals()) {
-      os << indent << "<object\n"
-         << indent + s(2) << "class=\"obj\"\n"
-         << indent + s(2) << "type=\"image/svg+xml\"\n"
-         << indent + s(2) << "width=\"14px\"\n"
-         << indent + s(2) << "height=\"55px\"\n"
-         << indent + s(2) << "data=\"data:image/svg+xml,\n"
-         << indent + s(4) << "<svg xmlns='http://www.w3.org/2000/svg'>\n"
-         << indent + s(6) << "<text x='-55' y='10' "
-                             "font-family='Courier' font-size='12' "
-                             "transform='rotate(-90)' "
-                             "text-rendering='optimizeSpeed' "
-                             "fill='#000'>" << t << "</text>\n"
-         << indent + s(4) << "</svg>\">\n"
-         << indent << "</object>\n";
-    } else {
-      std::ostringstream oss;
-      oss << t;
-      std::string tStr(oss.str());
-
-      os << indent;
-      for (std::string::iterator tStrItr = tStr.begin(), tStrEnd = tStr.end();
-           tStrItr != tStrEnd; ++tStrItr) {
-        os << *tStrItr << "<br/>";
-      }
-      os << "\n";
-    }
-  }
-
-  void RenderMachineFunction::insertCSS(const Spacer &indent,
-                                        raw_ostream &os) const {
-    os << indent << "<style type=\"text/css\">\n"
-       << indent + s(2) << "body { font-color: black; }\n"
-       << indent + s(2) << "table.code td { font-family: monospace; "
-                           "border-width: 0px; border-style: solid; "
-                           "border-bottom: 1px solid #dddddd; white-space: nowrap; }\n"
-       << indent + s(2) << "table.code td.p-z { background-color: #000000; }\n"
-       << indent + s(2) << "table.code td.p-l { background-color: #00ff00; }\n"
-       << indent + s(2) << "table.code td.p-h { background-color: #ff0000; }\n"
-       << indent + s(2) << "table.code td.l-n { background-color: #ffffff; }\n"
-       << indent + s(2) << "table.code td.l-d { background-color: #ff0000; }\n"
-       << indent + s(2) << "table.code td.l-u { background-color: #ffff00; }\n"
-       << indent + s(2) << "table.code td.l-r { background-color: #000000; }\n"
-       << indent + s(2) << "table.code td.l-s { background-color: #770000; }\n"
-       << indent + s(2) << "table.code th { border-width: 0px; "
-                           "border-style: solid; }\n"
-       << indent << "</style>\n";
-  }
-
-  void RenderMachineFunction::renderFunctionSummary(
-                                    const Spacer &indent, raw_ostream &os,
-                                    const char * const renderContextStr) const {
-    os << indent << "<h1>Function: " << mf->getFunction()->getName()
-       << "</h1>\n"
-       << indent << "<h2>Rendering context: " << renderContextStr << "</h2>\n";
-  }
-
-
-  void RenderMachineFunction::renderPressureTableLegend(
-                                                      const Spacer &indent,
-                                                      raw_ostream &os) const {
-    os << indent << "<h2>Rendering Pressure Legend:</h2>\n"
-       << indent << "<table class=\"code\">\n"
-       << indent + s(2) << "<tr>\n"
-       << indent + s(4) << "<th>Pressure</th><th>Description</th>"
-                           "<th>Appearance</th>\n"
-       << indent + s(2) << "</tr>\n"
-       << indent + s(2) << "<tr>\n"
-       << indent + s(4) << "<td>No Pressure</td>"
-                           "<td>No physical registers of this class requested.</td>"
-                           "<td class=\"p-z\">&nbsp;</td>\n"
-       << indent + s(2) << "</tr>\n"
-       << indent + s(2) << "<tr>\n"
-       << indent + s(4) << "<td>Low Pressure</td>"
-                           "<td>Sufficient physical registers to meet demand.</td>"
-                           "<td class=\"p-l\">&nbsp;</td>\n"
-       << indent + s(2) << "</tr>\n"
-       << indent + s(2) << "<tr>\n"
-       << indent + s(4) << "<td>High Pressure</td>"
-                           "<td>Potentially insufficient physical registers to meet demand.</td>"
-                           "<td class=\"p-h\">&nbsp;</td>\n"
s(2) << "</tr>\n" - << indent << "</table>\n"; - } - - template <typename CellType> - void RenderMachineFunction::renderCellsWithRLE( - const Spacer &indent, raw_ostream &os, - const std::pair<CellType, unsigned> &rleAccumulator, - const std::map<CellType, std::string> &cellTypeStrs) const { - - if (rleAccumulator.second == 0) - return; - - typename std::map<CellType, std::string>::const_iterator ctsItr = - cellTypeStrs.find(rleAccumulator.first); - - assert(ctsItr != cellTypeStrs.end() && "No string for given cell type."); - - os << indent + s(4) << "<td class=\"" << ctsItr->second << "\""; - if (rleAccumulator.second > 1) - os << " colspan=" << rleAccumulator.second; - os << "></td>\n"; - } - - - void RenderMachineFunction::renderCodeTablePlusPI(const Spacer &indent, - raw_ostream &os) const { - - std::map<LiveState, std::string> lsStrs; - lsStrs[Dead] = "l-n"; - lsStrs[Defined] = "l-d"; - lsStrs[Used] = "l-u"; - lsStrs[AliveReg] = "l-r"; - lsStrs[AliveStack] = "l-s"; - - std::map<PressureState, std::string> psStrs; - psStrs[Zero] = "p-z"; - psStrs[Low] = "p-l"; - psStrs[High] = "p-h"; - - // Open the table... - - os << indent << "<table cellpadding=0 cellspacing=0 class=\"code\">\n" - << indent + s(2) << "<tr>\n"; - - // Render the header row... - - os << indent + s(4) << "<th>index</th>\n" - << indent + s(4) << "<th>instr</th>\n"; - - // Render class names if necessary... - if (!ro.regClasses().empty()) { - for (MFRenderingOptions::RegClassSet::const_iterator - rcItr = ro.regClasses().begin(), - rcEnd = ro.regClasses().end(); - rcItr != rcEnd; ++rcItr) { - const TargetRegisterClass *trc = *rcItr; - os << indent + s(4) << "<th>\n"; - renderVertical(indent + s(6), os, trc->getName()); - os << indent + s(4) << "</th>\n"; - } - } - - // FIXME: Is there a nicer way to insert space between columns in HTML? - if (!ro.regClasses().empty() && !ro.intervals().empty()) - os << indent + s(4) << "<th> </th>\n"; - - // Render interval numbers if necessary... - if (!ro.intervals().empty()) { - for (MFRenderingOptions::IntervalSet::const_iterator - liItr = ro.intervals().begin(), - liEnd = ro.intervals().end(); - liItr != liEnd; ++liItr) { - - const LiveInterval *li = *liItr; - os << indent + s(4) << "<th>\n"; - renderVertical(indent + s(6), os, li->reg); - os << indent + s(4) << "</th>\n"; - } - } - - os << indent + s(2) << "</tr>\n"; - - // End header row, start with the data rows... - - MachineInstr *mi = 0; - - // Data rows: - for (SlotIndex i = sis->getZeroIndex(); i != sis->getLastIndex(); - i = i.getNextSlot()) { - - // Render the slot column. - os << indent + s(2) << "<tr height=6ex>\n"; - - // Render the code column. - if (i.isBlock()) { - MachineBasicBlock *mbb = sis->getMBBFromIndex(i); - mi = sis->getInstructionFromIndex(i); - - if (i == sis->getMBBStartIdx(mbb) || mi != 0 || - ro.renderEmptyIndexes()) { - os << indent + s(4) << "<td rowspan=4>" << i << " </td>\n" - << indent + s(4) << "<td rowspan=4>\n"; - - if (i == sis->getMBBStartIdx(mbb)) { - os << indent + s(6) << "BB#" << mbb->getNumber() << ": \n"; - } else if (mi != 0) { - os << indent + s(6) << " "; - renderMachineInstr(os, mi); - } else { - // Empty interval - leave blank. - } - os << indent + s(4) << "</td>\n"; - } else { - i = i.getDeadSlot(); // <- Will be incremented to the next index. - continue; - } - } - - // Render the class columns. 
-      // Render the class columns.
-      if (!ro.regClasses().empty()) {
-        std::pair<PressureState, unsigned> psRLEAccumulator(Zero, 0);
-        for (MFRenderingOptions::RegClassSet::const_iterator
-               rcItr = ro.regClasses().begin(),
-               rcEnd = ro.regClasses().end();
-             rcItr != rcEnd; ++rcItr) {
-          const TargetRegisterClass *trc = *rcItr;
-          PressureState newPressure = getPressureStateAt(trc, i);
-
-          if (newPressure == psRLEAccumulator.first) {
-            ++psRLEAccumulator.second;
-          } else {
-            renderCellsWithRLE(indent + s(4), os, psRLEAccumulator, psStrs);
-            psRLEAccumulator.first = newPressure;
-            psRLEAccumulator.second = 1;
-          }
-        }
-        renderCellsWithRLE(indent + s(4), os, psRLEAccumulator, psStrs);
-      }
-
-      // FIXME: Is there a nicer way to insert space between columns in HTML?
-      if (!ro.regClasses().empty() && !ro.intervals().empty())
-        os << indent + s(4) << "<td width=2em></td>\n";
-
-      if (!ro.intervals().empty()) {
-        std::pair<LiveState, unsigned> lsRLEAccumulator(Dead, 0);
-        for (MFRenderingOptions::IntervalSet::const_iterator
-               liItr = ro.intervals().begin(),
-               liEnd = ro.intervals().end();
-             liItr != liEnd; ++liItr) {
-          const LiveInterval *li = *liItr;
-          LiveState newLiveness = getLiveStateAt(li, i);
-
-          if (newLiveness == lsRLEAccumulator.first) {
-            ++lsRLEAccumulator.second;
-          } else {
-            renderCellsWithRLE(indent + s(4), os, lsRLEAccumulator, lsStrs);
-            lsRLEAccumulator.first = newLiveness;
-            lsRLEAccumulator.second = 1;
-          }
-        }
-        renderCellsWithRLE(indent + s(4), os, lsRLEAccumulator, lsStrs);
-      }
-      os << indent + s(2) << "</tr>\n";
-    }
-
-    os << indent << "</table>\n";
-
-    if (!ro.regClasses().empty())
-      renderPressureTableLegend(indent, os);
-  }
-
-  void RenderMachineFunction::renderFunctionPage(
-                                    raw_ostream &os,
-                                    const char * const renderContextStr) const {
-    os << "<html>\n"
-       << s(2) << "<head>\n"
-       << s(4) << "<title>" << fqn << "</title>\n";
-
-    insertCSS(s(4), os);
-
-    os << s(2) << "</head>\n"
-       << s(2) << "<body>\n";
-
-    renderFunctionSummary(s(4), os, renderContextStr);
-
-    os << s(4) << "<br/><br/><br/>\n";
-
-    //renderLiveIntervalInfoTable(" ", os);
-
-    os << s(4) << "<br/><br/><br/>\n";
-
-    renderCodeTablePlusPI(s(4), os);
-
-    os << s(2) << "</body>\n"
-       << "</html>\n";
-  }
-
-  void RenderMachineFunction::getAnalysisUsage(AnalysisUsage &au) const {
-    au.addRequired<SlotIndexes>();
-    au.addRequired<LiveIntervals>();
-    au.setPreservesAll();
-    MachineFunctionPass::getAnalysisUsage(au);
-  }
-
-  bool RenderMachineFunction::runOnMachineFunction(MachineFunction &fn) {
-
-    mf = &fn;
-    mri = &mf->getRegInfo();
-    tri = mf->getTarget().getRegisterInfo();
-    lis = &getAnalysis<LiveIntervals>();
-    sis = &getAnalysis<SlotIndexes>();
-
-    trei.setup(mf, mri, tri, lis);
-    ro.setup(mf, tri, lis, this);
-    spillIntervals.clear();
-    spillFor.clear();
-    useDefs.clear();
-
-    fqn = mf->getFunction()->getParent()->getModuleIdentifier() + "."
+ - mf->getFunction()->getName().str(); - - return false; - } - - void RenderMachineFunction::releaseMemory() { - trei.clear(); - ro.clear(); - spillIntervals.clear(); - spillFor.clear(); - useDefs.clear(); - } - - void RenderMachineFunction::rememberUseDefs(const LiveInterval *li) { - - if (!ro.shouldRenderCurrentMachineFunction()) - return; - - for (MachineRegisterInfo::reg_iterator rItr = mri->reg_begin(li->reg), - rEnd = mri->reg_end(); - rItr != rEnd; ++rItr) { - const MachineInstr *mi = &*rItr; - if (mi->readsRegister(li->reg)) { - useDefs[li].insert(lis->getInstructionIndex(mi).getRegSlot(true)); - } - if (mi->definesRegister(li->reg)) { - useDefs[li].insert(lis->getInstructionIndex(mi).getRegSlot()); - } - } - } - - void RenderMachineFunction::rememberSpills( - const LiveInterval *li, - const std::vector<LiveInterval*> &spills) { - - if (!ro.shouldRenderCurrentMachineFunction()) - return; - - for (std::vector<LiveInterval*>::const_iterator siItr = spills.begin(), - siEnd = spills.end(); - siItr != siEnd; ++siItr) { - const LiveInterval *spill = *siItr; - spillIntervals[li].insert(spill); - spillFor[spill] = li; - } - } - - bool RenderMachineFunction::isSpill(const LiveInterval *li) const { - SpillForMap::const_iterator sfItr = spillFor.find(li); - if (sfItr == spillFor.end()) - return false; - return true; - } - - void RenderMachineFunction::renderMachineFunction( - const char *renderContextStr, - const VirtRegMap *vrm, - const char *renderSuffix) { - if (!ro.shouldRenderCurrentMachineFunction()) - return; - - this->vrm = vrm; - trei.reset(); - - std::string rpFileName(mf->getFunction()->getName().str() + - (renderSuffix ? renderSuffix : "") + - outputFileSuffix); - - std::string errMsg; - raw_fd_ostream outFile(rpFileName.c_str(), errMsg, raw_fd_ostream::F_Binary); - - renderFunctionPage(outFile, renderContextStr); - - ro.resetRenderSpecificOptions(); - } - - std::string RenderMachineFunction::escapeChars(const std::string &s) const { - return escapeChars(s.begin(), s.end()); - } - -} diff --git a/lib/CodeGen/RenderMachineFunction.h b/lib/CodeGen/RenderMachineFunction.h deleted file mode 100644 index 8571992..0000000 --- a/lib/CodeGen/RenderMachineFunction.h +++ /dev/null @@ -1,338 +0,0 @@ -//===-- llvm/CodeGen/RenderMachineFunction.h - MF->HTML -*- C++ -*---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CODEGEN_RENDERMACHINEFUNCTION_H -#define LLVM_CODEGEN_RENDERMACHINEFUNCTION_H - -#include "llvm/CodeGen/LiveInterval.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/SlotIndexes.h" -#include "llvm/Target/TargetRegisterInfo.h" - -#include <algorithm> -#include <map> -#include <set> -#include <string> - -namespace llvm { - - class LiveInterval; - class LiveIntervals; - class MachineInstr; - class MachineRegisterInfo; - class RenderMachineFunction; - class TargetRegisterClass; - class TargetRegisterInfo; - class VirtRegMap; - class raw_ostream; - - /// \brief Helper class to process rendering options. Tries to be as lazy as - /// possible. 
- class MFRenderingOptions { - public: - - struct RegClassComp { - bool operator()(const TargetRegisterClass *trc1, - const TargetRegisterClass *trc2) const { - std::string trc1Name(trc1->getName()), trc2Name(trc2->getName()); - return std::lexicographical_compare(trc1Name.begin(), trc1Name.end(), - trc2Name.begin(), trc2Name.end()); - } - }; - - typedef std::set<const TargetRegisterClass*, RegClassComp> RegClassSet; - - struct IntervalComp { - bool operator()(const LiveInterval *li1, const LiveInterval *li2) const { - return li1->reg < li2->reg; - } - }; - - typedef std::set<const LiveInterval*, IntervalComp> IntervalSet; - - /// Initialise the rendering options. - void setup(MachineFunction *mf, const TargetRegisterInfo *tri, - LiveIntervals *lis, const RenderMachineFunction *rmf); - - /// Clear translations of options to the current function. - void clear(); - - /// Reset any options computed for this specific rendering. - void resetRenderSpecificOptions(); - - /// Should we render the current function. - bool shouldRenderCurrentMachineFunction() const; - - /// Return the set of register classes to render pressure for. - const RegClassSet& regClasses() const; - - /// Return the set of live intervals to render liveness for. - const IntervalSet& intervals() const; - - /// Render indexes which are not associated with instructions / MBB starts. - bool renderEmptyIndexes() const; - - /// Return whether or not to render using SVG for fancy vertical text. - bool fancyVerticals() const; - - private: - - static bool renderingOptionsProcessed; - static std::set<std::string> mfNamesToRender; - static bool renderAllMFs; - - static std::set<std::string> classNamesToRender; - static bool renderAllClasses; - - - static std::set<std::pair<unsigned, unsigned> > intervalNumsToRender; - typedef enum { ExplicitOnly = 0, - AllPhys = 1, - VirtNoSpills = 2, - VirtSpills = 4, - AllVirt = 6, - All = 7 } - IntervalTypesToRender; - static unsigned intervalTypesToRender; - - template <typename OutputItr> - static void splitComaSeperatedList(const std::string &s, OutputItr outItr); - - static void processOptions(); - - static void processFuncNames(); - static void processRegClassNames(); - static void processIntervalNumbers(); - - static void processIntervalRange(const std::string &intervalRangeStr); - - MachineFunction *mf; - const TargetRegisterInfo *tri; - LiveIntervals *lis; - const RenderMachineFunction *rmf; - - mutable bool regClassesTranslatedToCurrentFunction; - mutable RegClassSet regClassSet; - - mutable bool intervalsTranslatedToCurrentFunction; - mutable IntervalSet intervalSet; - - void translateRegClassNamesToCurrentFunction() const; - - void translateIntervalNumbersToCurrentFunction() const; - }; - - /// \brief Provide extra information about the physical and virtual registers - /// in the function being compiled. - class TargetRegisterExtraInfo { - public: - TargetRegisterExtraInfo(); - - /// \brief Set up TargetRegisterExtraInfo with pointers to necessary - /// sources of information. - void setup(MachineFunction *mf, MachineRegisterInfo *mri, - const TargetRegisterInfo *tri, LiveIntervals *lis); - - /// \brief Recompute tables for changed function. - void reset(); - - /// \brief Free all tables in TargetRegisterExtraInfo. - void clear(); - - /// \brief Maximum number of registers from trc which alias reg. - unsigned getWorst(unsigned reg, const TargetRegisterClass *trc) const; - - /// \brief Returns the number of allocable registers in trc. 
-      unsigned getCapacity(const TargetRegisterClass *trc) const;
-
-      /// \brief Return the number of registers of class trc that may be
-      /// needed at slot i.
-      unsigned getPressureAtSlot(const TargetRegisterClass *trc,
-                                 SlotIndex i) const;
-
-      /// \brief Return true if the number of registers of type trc that may be
-      /// needed at slot i is greater than the capacity of trc.
-      bool classOverCapacityAtSlot(const TargetRegisterClass *trc,
-                                   SlotIndex i) const;
-
-    private:
-
-      MachineFunction *mf;
-      MachineRegisterInfo *mri;
-      const TargetRegisterInfo *tri;
-      LiveIntervals *lis;
-
-      typedef std::map<const TargetRegisterClass*, unsigned> WorstMapLine;
-      typedef std::map<const TargetRegisterClass*, WorstMapLine> VRWorstMap;
-      VRWorstMap vrWorst;
-
-      typedef std::map<unsigned, WorstMapLine> PRWorstMap;
-      PRWorstMap prWorst;
-
-      typedef std::map<const TargetRegisterClass*, unsigned> CapacityMap;
-      CapacityMap capacityMap;
-
-      typedef std::map<const TargetRegisterClass*, unsigned> PressureMapLine;
-      typedef std::map<SlotIndex, PressureMapLine> PressureMap;
-      PressureMap pressureMap;
-
-      bool mapsPopulated;
-
-      /// \brief Initialise the 'worst' table.
-      void initWorst();
-
-      /// \brief Initialise the 'capacity' table.
-      void initCapacity();
-
-      /// \brief Initialise/Reset the 'pressure' and live states tables.
-      void resetPressureAndLiveStates();
-    };
-
-    /// \brief Render MachineFunction objects and related information to an
-    /// HTML page.
-    class RenderMachineFunction : public MachineFunctionPass {
-    public:
-      static char ID;
-
-      RenderMachineFunction() : MachineFunctionPass(ID) {
-        initializeRenderMachineFunctionPass(*PassRegistry::getPassRegistry());
-      }
-
-      virtual void getAnalysisUsage(AnalysisUsage &au) const;
-
-      virtual bool runOnMachineFunction(MachineFunction &fn);
-
-      virtual void releaseMemory();
-
-      void rememberUseDefs(const LiveInterval *li);
-
-      void rememberSpills(const LiveInterval *li,
-                          const std::vector<LiveInterval*> &spills);
-
-      bool isSpill(const LiveInterval *li) const;
-
-      /// \brief Render this machine function to HTML.
-      ///
-      /// @param renderContextStr This parameter will be included in the top of
-      ///                         the html file to explain where (in the
-      ///                         codegen pipeline) this function was rendered
-      ///                         from. Set it to something like
-      ///                         "Pre-register-allocation".
-      /// @param vrm              If non-null the VRM will be queried to determine
-      ///                         whether a virtual register was allocated to a
-      ///                         physical register or spilled.
-      /// @param renderSuffix     This string will be appended to the function
-      ///                         name (before the output file suffix) to enable
-      ///                         multiple renderings from the same function.
-      void renderMachineFunction(const char *renderContextStr,
-                                 const VirtRegMap *vrm = 0,
-                                 const char *renderSuffix = 0);
-
-    private:
-      class Spacer;
-      friend raw_ostream& operator<<(raw_ostream &os, const Spacer &s);
-
-      std::string fqn;
-
-      MachineFunction *mf;
-      MachineRegisterInfo *mri;
-      const TargetRegisterInfo *tri;
-      LiveIntervals *lis;
-      SlotIndexes *sis;
-      const VirtRegMap *vrm;
-
-      TargetRegisterExtraInfo trei;
-      MFRenderingOptions ro;
-
-
-
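// A self-contained illustration of the comparator-ordered sets that
// MFRenderingOptions (declared earlier in this header) uses via RegClassComp
// and IntervalComp: ordering the rendered register classes by name keeps
// column order deterministic from run to run. This is only a sketch; the
// class names are made up and std::string stands in for TargetRegisterClass.
#include <iostream>
#include <set>
#include <string>

struct NameComp {
  bool operator()(const std::string *a, const std::string *b) const {
    return *a < *b; // lexicographic, like RegClassComp's name comparison
  }
};

int main() {
  std::string gpr = "GPR", fpr = "FPR", qpr = "QPR";
  // Insertion order is irrelevant; iteration order is always by name.
  std::set<const std::string *, NameComp> classes = {&qpr, &gpr, &fpr};
  for (const std::string *c : classes)
    std::cout << *c << "\n"; // FPR, GPR, QPR - a stable render order
}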
-      // Utilities.
-      typedef enum { Dead, Defined, Used, AliveReg, AliveStack } LiveState;
-      LiveState getLiveStateAt(const LiveInterval *li, SlotIndex i) const;
-
-      typedef enum { Zero, Low, High } PressureState;
-      PressureState getPressureStateAt(const TargetRegisterClass *trc,
-                                       SlotIndex i) const;
-
-      typedef std::map<const LiveInterval*, std::set<const LiveInterval*> >
-        SpillIntervals;
-      SpillIntervals spillIntervals;
-
-      typedef std::map<const LiveInterval*, const LiveInterval*> SpillForMap;
-      SpillForMap spillFor;
-
-      typedef std::set<SlotIndex> SlotSet;
-      typedef std::map<const LiveInterval*, SlotSet> UseDefs;
-      UseDefs useDefs;
-
-      // ---------- Rendering methods ----------
-
-      /// For inserting spaces when pretty printing.
-      class Spacer {
-      public:
-        explicit Spacer(unsigned numSpaces) : ns(numSpaces) {}
-        Spacer operator+(const Spacer &o) const { return Spacer(ns + o.ns); }
-        void print(raw_ostream &os) const;
-      private:
-        unsigned ns;
-      };
-
-      Spacer s(unsigned ns) const;
-
-      template <typename Iterator>
-      std::string escapeChars(Iterator sBegin, Iterator sEnd) const;
-
-      /// \brief Render a machine instruction.
-      void renderMachineInstr(raw_ostream &os,
-                              const MachineInstr *mi) const;
-
-      /// \brief Render vertical text.
-      template <typename T>
-      void renderVertical(const Spacer &indent,
-                          raw_ostream &os,
-                          const T &t) const;
-
-      /// \brief Insert CSS layout info.
-      void insertCSS(const Spacer &indent,
-                     raw_ostream &os) const;
-
-      /// \brief Render a brief summary of the function (including rendering
-      /// context).
-      void renderFunctionSummary(const Spacer &indent,
-                                 raw_ostream &os,
-                                 const char * const renderContextStr) const;
-
-      /// \brief Render a legend for the pressure table.
-      void renderPressureTableLegend(const Spacer &indent,
-                                     raw_ostream &os) const;
-
-      /// \brief Render a consecutive set of HTML cells of the same class using
-      /// the colspan attribute for run-length encoding.
-      template <typename CellType>
-      void renderCellsWithRLE(
-                     const Spacer &indent, raw_ostream &os,
-                     const std::pair<CellType, unsigned> &rleAccumulator,
-                     const std::map<CellType, std::string> &cellTypeStrs) const;
-
-      /// \brief Render code listing, potentially with register pressure
-      /// and live intervals shown alongside.
-      void renderCodeTablePlusPI(const Spacer &indent,
-                                 raw_ostream &os) const;
-
-      /// \brief Render the HTML page representing the MachineFunction.
-      void renderFunctionPage(raw_ostream &os,
-                              const char * const renderContextStr) const;
-
-      std::string escapeChars(const std::string &s) const;
-    };
-}
-
-#endif /* LLVM_CODEGEN_RENDERMACHINEFUNCTION_H */
diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp
index 8fd6426..752f8e4 100644
--- a/lib/CodeGen/ScheduleDAG.cpp
+++ b/lib/CodeGen/ScheduleDAG.cpp
@@ -64,10 +64,27 @@ const MCInstrDesc *ScheduleDAG::getNodeDesc(const SDNode *Node) const {
 /// specified node.
 bool SUnit::addPred(const SDep &D) {
   // If this node already has this dependence, don't add a redundant one.
-  for (SmallVector<SDep, 4>::const_iterator I = Preds.begin(), E = Preds.end();
-       I != E; ++I)
-    if (*I == D)
+  for (SmallVector<SDep, 4>::iterator I = Preds.begin(), E = Preds.end();
+       I != E; ++I) {
+    if (I->overlaps(D)) {
+      // Extend the latency if needed. Equivalent to removePred(I) + addPred(D).
+      if (I->getLatency() < D.getLatency()) {
+        SUnit *PredSU = I->getSUnit();
+        // Find the corresponding successor in N.
+        SDep ForwardD = *I;
+        ForwardD.setSUnit(this);
+        for (SmallVector<SDep, 4>::iterator II = PredSU->Succs.begin(),
+               EE = PredSU->Succs.end(); II != EE; ++II) {
+          if (*II == ForwardD) {
+            II->setLatency(D.getLatency());
+            break;
+          }
+        }
+        I->setLatency(D.getLatency());
+      }
       return false;
+    }
+  }
 
   // Now add a corresponding succ to N.
   SDep P = D;
   P.setSUnit(this);
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index d46eb89..110f478 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -21,17 +21,24 @@
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/RegisterPressure.h"
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
 #include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
 using namespace llvm;
 
+static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
+    cl::ZeroOrMore, cl::init(false),
+    cl::desc("Enable use of AA during MI DAG construction"));
+
 ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
                                      const MachineLoopInfo &mli,
                                      const MachineDominatorTree &mdt,
@@ -40,7 +47,7 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
   : ScheduleDAG(mf), MLI(mli), MDT(mdt), MFI(mf.getFrameInfo()),
     InstrItins(mf.getTarget().getInstrItineraryData()), LIS(lis),
     IsPostRA(IsPostRAFlag), UnitLatencies(false), CanHandleTerminators(false),
-    LoopRegs(MLI, MDT), FirstDbgValue(0) {
+    LoopRegs(MDT), FirstDbgValue(0) {
   assert((IsPostRA || LIS) && "PreRA scheduling requires LiveIntervals");
   DbgValues.clear();
   assert(!(IsPostRA && MRI.getNumVirtRegs()) &&
@@ -126,7 +133,8 @@ static const Value *getUnderlyingObjectForInstr(const MachineInstr *MI,
   return 0;
 }
 
-void ScheduleDAGInstrs::startBlock(MachineBasicBlock *BB) {
+void ScheduleDAGInstrs::startBlock(MachineBasicBlock *bb) {
+  BB = bb;
   LoopRegs.Deps.clear();
   if (MachineLoop *ML = MLI.getLoopFor(BB))
     if (BB == ML->getLoopLatch())
@@ -134,7 +142,8 @@ void ScheduleDAGInstrs::startBlock(MachineBasicBlock *BB) {
 }
 
 void ScheduleDAGInstrs::finishBlock() {
-  // Nothing to do.
+  // Subclasses should no longer refer to the old block.
+  BB = 0;
 }
 
 /// Initialize the map with the number of registers.
@@ -159,7 +168,7 @@ void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
                                     MachineBasicBlock::iterator begin,
                                     MachineBasicBlock::iterator end,
                                     unsigned endcount) {
-  BB = bb;
+  assert(bb == BB && "startBlock should set BB");
   RegionBegin = begin;
   RegionEnd = end;
   EndIndex = endcount;
@@ -232,7 +241,8 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU,
   unsigned SpecialAddressLatency = ST.getSpecialAddressLatency();
   unsigned DataLatency = SU->Latency;
 
-  for (const uint16_t *Alias = TRI->getOverlaps(MO.getReg()); *Alias; ++Alias) {
+  for (MCRegAliasIterator Alias(MO.getReg(), TRI, true);
+       Alias.isValid(); ++Alias) {
     if (!Uses.contains(*Alias))
       continue;
     std::vector<SUnit*> &UseList = Uses[*Alias];
@@ -261,10 +271,12 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU,
       // Adjust the dependence latency using operand def/use
       // information (if any), and then allow the target to
      // perform its own adjustments.
- const SDep& dep = SDep(SU, SDep::Data, LDataLatency, *Alias); + SDep dep(SU, SDep::Data, LDataLatency, *Alias); if (!UnitLatencies) { - computeOperandLatency(SU, UseSU, const_cast<SDep &>(dep)); - ST.adjustSchedDependency(SU, UseSU, const_cast<SDep &>(dep)); + unsigned Latency = computeOperandLatency(SU, UseSU, dep); + dep.setLatency(Latency); + + ST.adjustSchedDependency(SU, UseSU, dep); } UseSU->addPred(dep); } @@ -285,7 +297,8 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { // TODO: Using a latency of 1 here for output dependencies assumes // there's no cost for reusing registers. SDep::Kind Kind = MO.isUse() ? SDep::Anti : SDep::Output; - for (const uint16_t *Alias = TRI->getOverlaps(MO.getReg()); *Alias; ++Alias) { + for (MCRegAliasIterator Alias(MO.getReg(), TRI, true); + Alias.isValid(); ++Alias) { if (!Defs.contains(*Alias)) continue; std::vector<SUnit *> &DefList = Defs[*Alias]; @@ -400,8 +413,10 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { // SSA defs do not have output/anti dependencies. // The current operand is a def, so we have at least one. - if (llvm::next(MRI.def_begin(Reg)) == MRI.def_end()) - return; + // + // FIXME: This optimization is disabled pending PR13112. + //if (llvm::next(MRI.def_begin(Reg)) == MRI.def_end()) + // return; // Add output dependence to the next nearest def of this vreg. // @@ -410,7 +425,7 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { // uses. We're conservative for now until we have a way to guarantee the uses // are not eliminated sometime during scheduling. The output dependence edge // is also useful if output latency exceeds def-use latency. - VReg2SUnitMap::iterator DefI = findVRegDef(Reg); + VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg); if (DefI == VRegDefs.end()) VRegDefs.insert(VReg2SUnit(Reg, SU)); else { @@ -436,10 +451,11 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { // Lookup this operand's reaching definition. assert(LIS && "vreg dependencies requires LiveIntervals"); - SlotIndex UseIdx = LIS->getInstructionIndex(MI).getRegSlot(); - LiveInterval *LI = &LIS->getInterval(Reg); - VNInfo *VNI = LI->getVNInfoBefore(UseIdx); + LiveRangeQuery LRQ(LIS->getInterval(Reg), LIS->getInstructionIndex(MI)); + VNInfo *VNI = LRQ.valueIn(); + // VNI will be valid because MachineOperand::readsReg() is checked by caller. + assert(VNI && "No value to read by operand"); MachineInstr *Def = LIS->getInstructionFromIndex(VNI->def); // Phis and other noninstructions (after coalescing) have a NULL Def. if (Def) { @@ -449,11 +465,13 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { // Create a data dependence. // // TODO: Handle "special" address latencies cleanly. - const SDep &dep = SDep(DefSU, SDep::Data, DefSU->Latency, Reg); + SDep dep(DefSU, SDep::Data, DefSU->Latency, Reg); if (!UnitLatencies) { // Adjust the dependence latency using operand def/use information, then // allow the target to perform its own adjustments. - computeOperandLatency(DefSU, SU, const_cast<SDep &>(dep)); + unsigned Latency = computeOperandLatency(DefSU, SU, const_cast<SDep &>(dep)); + dep.setLatency(Latency); + const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>(); ST.adjustSchedDependency(DefSU, SU, const_cast<SDep &>(dep)); } @@ -462,11 +480,217 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { } // Add antidependence to the following def of the vreg it uses. 
-  VReg2SUnitMap::iterator DefI = findVRegDef(Reg);
+  VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg);
   if (DefI != VRegDefs.end() && DefI->SU != SU)
     DefI->SU->addPred(SDep(SU, SDep::Anti, 0, Reg));
 }
 
+/// Return true if MI is an instruction we are unable to reason about
+/// (like a call or something with unmodeled side effects).
+static inline bool isGlobalMemoryObject(AliasAnalysis *AA, MachineInstr *MI) {
+  if (MI->isCall() || MI->hasUnmodeledSideEffects() ||
+      (MI->hasVolatileMemoryRef() &&
+       (!MI->mayLoad() || !MI->isInvariantLoad(AA))))
+    return true;
+  return false;
+}
+
+// This MI might have either incomplete info, or be known to be unsafe
+// to deal with (i.e. a volatile object).
+static inline bool isUnsafeMemoryObject(MachineInstr *MI,
+                                        const MachineFrameInfo *MFI) {
+  if (!MI || MI->memoperands_empty())
+    return true;
+  // We purposefully do not check for hasOneMemOperand() here
+  // in the hope of triggering an assert downstream in order to
+  // finish the implementation.
+  if ((*MI->memoperands_begin())->isVolatile() ||
+      MI->hasUnmodeledSideEffects())
+    return true;
+
+  const Value *V = (*MI->memoperands_begin())->getValue();
+  if (!V)
+    return true;
+
+  V = getUnderlyingObject(V);
+  if (const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V)) {
+    // Similarly to getUnderlyingObjectForInstr:
+    // For now, ignore PseudoSourceValues which may alias LLVM IR values
+    // because the code that uses this function has no way to cope with
+    // such aliases.
+    if (PSV->isAliased(MFI))
+      return true;
+  }
+  // Does this pointer refer to a distinct and identifiable object?
+  if (!isIdentifiedObject(V))
+    return true;
+
+  return false;
+}
+
+/// This returns true if the two MIs need a chain edge between them.
+/// If these are not even memory operations, we still may need
+/// chain deps between them. The question really is: could
+/// these two MIs be reordered during scheduling from a memory dependency
+/// point of view?
+static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI,
+                             MachineInstr *MIa,
+                             MachineInstr *MIb) {
+  // Cover a trivial case: no edge is needed from a node to itself.
+  if (MIa == MIb)
+    return false;
+
+  if (isUnsafeMemoryObject(MIa, MFI) || isUnsafeMemoryObject(MIb, MFI))
+    return true;
+
+  // If we are dealing with two "normal" loads, we do not need an edge
+  // between them; they could be reordered.
+  if (!MIa->mayStore() && !MIb->mayStore())
+    return false;
+
+  // To this point analysis is generic. From here on we do need AA.
+  if (!AA)
+    return true;
+
+  MachineMemOperand *MMOa = *MIa->memoperands_begin();
+  MachineMemOperand *MMOb = *MIb->memoperands_begin();
+
+  // FIXME: Need to handle multiple memory operands to support all targets.
+  if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand())
+    llvm_unreachable("Multiple memory operands.");
+
+  // The following interface to AA is fashioned after DAGCombiner::isAlias
+  // and operates with MachineMemOperand offset with some important
+  // assumptions:
+  //   - LLVM fundamentally assumes flat address spaces.
+  //   - MachineOperand offset can *only* result from legalization and
+  //     cannot affect queries other than the trivial case of overlap
+  //     checking.
+  //   - These offsets never wrap and never step outside
+  //     of allocated objects.
+  //   - There should never be any negative offsets here.
+  //
+  // FIXME: Modify API to hide this math from "user"
+  // FIXME: Even before we go to AA we can reason locally about some
+  // memory objects. It can save compile time, and possibly catch some
+  // corner cases not currently covered.
+
+  assert ((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset");
+  assert ((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset");
+
+  int64_t MinOffset = std::min(MMOa->getOffset(), MMOb->getOffset());
+  int64_t Overlapa = MMOa->getSize() + MMOa->getOffset() - MinOffset;
+  int64_t Overlapb = MMOb->getSize() + MMOb->getOffset() - MinOffset;
+
+  AliasAnalysis::AliasResult AAResult = AA->alias(
+      AliasAnalysis::Location(MMOa->getValue(), Overlapa,
+                              MMOa->getTBAAInfo()),
+      AliasAnalysis::Location(MMOb->getValue(), Overlapb,
+                              MMOb->getTBAAInfo()));
+
+  return (AAResult != AliasAnalysis::NoAlias);
+}
+
+/// This recursive function iterates over chain deps of SUb looking for
+/// the "latest" node that needs a chain edge to SUa.
+static unsigned
+iterateChainSucc(AliasAnalysis *AA, const MachineFrameInfo *MFI,
+                 SUnit *SUa, SUnit *SUb, SUnit *ExitSU, unsigned *Depth,
+                 SmallPtrSet<const SUnit*, 16> &Visited) {
+  if (!SUa || !SUb || SUb == ExitSU)
+    return *Depth;
+
+  // Remember visited nodes.
+  if (!Visited.insert(SUb))
+    return *Depth;
+  // If there is _some_ dependency already in place, do not
+  // descend any further.
+  // TODO: Need to make sure that if that dependency got eliminated or ignored
+  // for any reason in the future, we would not violate DAG topology.
+  // Currently it does not happen, but makes an implicit assumption about
+  // future implementation.
+  //
+  // Independently, if we encounter a node that is some sort of global
+  // object (like a call) we already have a full set of dependencies to it
+  // and we can stop descending.
+  if (SUa->isSucc(SUb) ||
+      isGlobalMemoryObject(AA, SUb->getInstr()))
+    return *Depth;
+
+  // If we do need an edge, or we have exceeded the depth budget,
+  // add that edge to the predecessors chain of SUb,
+  // and stop descending.
+  if (*Depth > 200 ||
+      MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr())) {
+    SUb->addPred(SDep(SUa, SDep::Order, /*Latency=*/0, /*Reg=*/0,
+                      /*isNormalMemory=*/true));
+    return *Depth;
+  }
+  // Track current depth.
+  (*Depth)++;
+  // Iterate over chain dependencies only.
+  for (SUnit::const_succ_iterator I = SUb->Succs.begin(), E = SUb->Succs.end();
+       I != E; ++I)
+    if (I->isCtrl())
+      iterateChainSucc (AA, MFI, SUa, I->getSUnit(), ExitSU, Depth, Visited);
+  return *Depth;
+}
+
+/// This function assumes that "downward" from SU there exists a
+/// tail/leaf of the already constructed DAG. It iterates downward and
+/// checks whether SU may alias any node dominated
+/// by it.
+static void adjustChainDeps(AliasAnalysis *AA, const MachineFrameInfo *MFI,
+                            SUnit *SU, SUnit *ExitSU, std::set<SUnit *> &CheckList,
+                            unsigned LatencyToLoad) {
+  if (!SU)
+    return;
+
+  SmallPtrSet<const SUnit*, 16> Visited;
+  unsigned Depth = 0;
+
+  for (std::set<SUnit *>::iterator I = CheckList.begin(), IE = CheckList.end();
+       I != IE; ++I) {
+    if (SU == *I)
+      continue;
+    if (MIsNeedChainEdge(AA, MFI, SU->getInstr(), (*I)->getInstr())) {
+      unsigned Latency = ((*I)->getInstr()->mayLoad()) ? LatencyToLoad : 0;
+      (*I)->addPred(SDep(SU, SDep::Order, Latency, /*Reg=*/0,
+                         /*isNormalMemory=*/true));
+    }
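// A worked, self-contained example of the offset/size overlap arithmetic that
// MIsNeedChainEdge feeds to the alias query above. The two "memory operands"
// are hypothetical: a 4-byte access at offset 8 and an 8-byte access at
// offset 4 from the same underlying object.
#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  int64_t OffsetA = 8, SizeA = 4; // bytes [8, 12)
  int64_t OffsetB = 4, SizeB = 8; // bytes [4, 12)

  // Mirror the computation above: extend each access size so that both
  // locations are measured from the smaller of the two offsets.
  int64_t MinOffset = std::min(OffsetA, OffsetB);
  int64_t Overlapa = SizeA + OffsetA - MinOffset; // 4 + 8 - 4 = 8
  int64_t Overlapb = SizeB + OffsetB - MinOffset; // 8 + 4 - 4 = 8

  // Both queries now describe ranges that start at MinOffset, so a NoAlias
  // answer from the alias analysis also covers the original, offset accesses.
  std::cout << "MinOffset=" << MinOffset << " Overlapa=" << Overlapa
            << " Overlapb=" << Overlapb << "\n";
}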
+    // Now go through all the chain successors and iterate from them.
+    // Keep track of visited nodes.
+    for (SUnit::const_succ_iterator J = (*I)->Succs.begin(),
+         JE = (*I)->Succs.end(); J != JE; ++J)
+      if (J->isCtrl())
+        iterateChainSucc (AA, MFI, SU, J->getSUnit(),
+                          ExitSU, &Depth, Visited);
+  }
+}
+
+/// Check whether two objects need a chain edge; if so, add it,
+/// otherwise remember the rejected SU.
+static inline
+void addChainDependency (AliasAnalysis *AA, const MachineFrameInfo *MFI,
+                         SUnit *SUa, SUnit *SUb,
+                         std::set<SUnit *> &RejectList,
+                         unsigned TrueMemOrderLatency = 0,
+                         bool isNormalMemory = false) {
+  // If this is a false dependency,
+  // do not add the edge, but remember the rejected node.
+  if (!EnableAASchedMI ||
+      MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr()))
+    SUb->addPred(SDep(SUa, SDep::Order, TrueMemOrderLatency, /*Reg=*/0,
+                      isNormalMemory));
+  else {
+    // Duplicate entries should be ignored.
+    RejectList.insert(SUb);
+    DEBUG(dbgs() << "\tReject chain dep between SU("
+          << SUa->NodeNum << ") and SU("
+          << SUb->NodeNum << ")\n");
+  }
+}
+
 /// Create an SUnit for each real instruction, numbered in top-down topological
 /// order. The instruction order A < B implies that no edge exists from B to A.
 ///
@@ -502,7 +726,11 @@ void ScheduleDAGInstrs::initSUnits() {
   }
 }
 
-void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
+/// If RPTracker is non-null, compute register pressure as a side effect. The
+/// DAG builder is an efficient place to do it because it already visits
+/// operands.
+void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
+                                        RegPressureTracker *RPTracker) {
   // Create an SUnit for each real instruction.
   initSUnits();
 
@@ -518,6 +746,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
   // that are known not to alias
   std::map<const Value *, SUnit *> AliasMemDefs, NonAliasMemDefs;
   std::map<const Value *, std::vector<SUnit *> > AliasMemUses, NonAliasMemUses;
+  std::set<SUnit*> RejectMemNodes;
 
   // Remove any stale debug info; sometimes BuildSchedGraph is called again
   // without emitting the info from the previous call.
@@ -553,6 +782,10 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
       PrevMI = MI;
       continue;
     }
+    if (RPTracker) {
+      RPTracker->recede();
+      assert(RPTracker->getPos() == prior(MII) && "RPTracker can't find MI");
+    }
 
     assert((!MI->isTerminator() || CanHandleTerminators) && !MI->isLabel() &&
            "Cannot schedule terminators or labels!");
@@ -587,11 +820,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
     // after stack slots are lowered to actual addresses.
     // TODO: Use an AliasAnalysis and do real alias-analysis queries, and
     // produce more precise dependence information.
-#define STORE_LOAD_LATENCY 1
-    unsigned TrueMemOrderLatency = 0;
-    if (MI->isCall() || MI->hasUnmodeledSideEffects() ||
-        (MI->hasVolatileMemoryRef() &&
-         (!MI->mayLoad() || !MI->isInvariantLoad(AA)))) {
+    unsigned TrueMemOrderLatency = MI->mayStore() ? 1 : 0;
+    if (isGlobalMemoryObject(AA, MI)) {
       // Be conservative with these and add dependencies on all memory
       // references, even those that are known to not alias.
       for (std::map<const Value *, SUnit *>::iterator I =
             NonAliasMemDefs.begin(), E = NonAliasMemDefs.end(); I != E; ++I) {
         I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
       }
       for (std::map<const Value *, std::vector<SUnit *> >::iterator I =
            NonAliasMemUses.begin(), E = NonAliasMemUses.end(); I != E; ++I) {
         for (unsigned i = 0, e = I->second.size(); i != e; ++i)
           I->second[i]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency));
       }
-      NonAliasMemDefs.clear();
-      NonAliasMemUses.clear();
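// A compact, self-contained sketch of the reject-list pattern used by
// addChainDependency above: an edge is added only when the (here, mocked)
// alias check says two nodes may conflict; otherwise the node is remembered
// so that a later barrier-like node can still be wired to it by
// adjustChainDeps. Node numbers and the conflict rule are made up.
#include <iostream>
#include <set>
#include <utility>
#include <vector>

using Edge = std::pair<int, int>; // (from, to)

static bool mayConflict(int A, int B) {
  // Stand-in for MIsNeedChainEdge: pretend only same-parity nodes conflict.
  return (A % 2) == (B % 2);
}

int main() {
  std::vector<Edge> edges;
  std::set<int> rejected; // duplicate inserts are ignored, as with std::set
  int SUa = 4;
  for (int SUb : {1, 2, 3}) {
    if (mayConflict(SUa, SUb))
      edges.push_back({SUa, SUb}); // a chain edge is required
    else
      rejected.insert(SUb); // revisit these when a barrier is seen
  }
  std::cout << edges.size() << " edge(s), " << rejected.size()
            << " rejected node(s)\n"; // 1 edge (4->2), 2 rejected
}
// Remembering rejections instead of dropping them is what keeps the graph
// safe once a barrier clears the def/use maps, as the code below does.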
       // Add SU to the barrier chain.
       if (BarrierChain)
         BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
       BarrierChain = SU;
+      // This is a barrier event that acts as a pivotal node in the DAG,
+      // so it is safe to clear the list of exposed nodes.
+      adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
+                      TrueMemOrderLatency);
+      RejectMemNodes.clear();
+      NonAliasMemDefs.clear();
+      NonAliasMemUses.clear();
 
       // fall-through
     new_alias_chain:
       // Chain all possibly aliasing memory references through SU.
-      if (AliasChain)
-        AliasChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+      if (AliasChain) {
+        unsigned ChainLatency = 0;
+        if (AliasChain->getInstr()->mayLoad())
+          ChainLatency = TrueMemOrderLatency;
+        addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes,
+                           ChainLatency);
+      }
       AliasChain = SU;
       for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
-        PendingLoads[k]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency));
+        addChainDependency(AA, MFI, SU, PendingLoads[k], RejectMemNodes,
+                           TrueMemOrderLatency);
       for (std::map<const Value *, SUnit *>::iterator I = AliasMemDefs.begin(),
-           E = AliasMemDefs.end(); I != E; ++I) {
-        I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
-      }
+           E = AliasMemDefs.end(); I != E; ++I)
+        addChainDependency(AA, MFI, SU, I->second, RejectMemNodes);
       for (std::map<const Value *, std::vector<SUnit *> >::iterator I =
            AliasMemUses.begin(), E = AliasMemUses.end(); I != E; ++I) {
         for (unsigned i = 0, e = I->second.size(); i != e; ++i)
-          I->second[i]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency));
+          addChainDependency(AA, MFI, SU, I->second[i], RejectMemNodes,
+                             TrueMemOrderLatency);
       }
+      adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
+                      TrueMemOrderLatency);
       PendingLoads.clear();
       AliasMemDefs.clear();
       AliasMemUses.clear();
     } else if (MI->mayStore()) {
       bool MayAlias = true;
-      TrueMemOrderLatency = STORE_LOAD_LATENCY;
       if (const Value *V = getUnderlyingObjectForInstr(MI, MFI, MayAlias)) {
         // A store to a specific PseudoSourceValue. Add precise dependencies.
         // Record the def in MemDefs, first adding a dep if there is
@@ -642,8 +884,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
         std::map<const Value *, SUnit *>::iterator IE =
           ((MayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
         if (I != IE) {
-          I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0, /*Reg=*/0,
-                                  /*isNormalMemory=*/true));
+          addChainDependency(AA, MFI, SU, I->second, RejectMemNodes,
+                             0, true);
           I->second = SU;
         } else {
           if (MayAlias)
@@ -658,20 +900,28 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
           ((MayAlias) ? AliasMemUses.end() : NonAliasMemUses.end());
         if (J != JE) {
           for (unsigned i = 0, e = J->second.size(); i != e; ++i)
-            J->second[i]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency,
-                                       /*Reg=*/0, /*isNormalMemory=*/true));
+            addChainDependency(AA, MFI, SU, J->second[i], RejectMemNodes,
+                               TrueMemOrderLatency, true);
           J->second.clear();
         }
         if (MayAlias) {
           // Add dependencies from all the PendingLoads, i.e. loads
           // with no underlying object.
           for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
-            PendingLoads[k]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency));
+            addChainDependency(AA, MFI, SU, PendingLoads[k], RejectMemNodes,
+                               TrueMemOrderLatency);
           // Add dependence on alias chain, if needed.
           if (AliasChain)
-            AliasChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+            addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes);
+          // But we should also check dependent instructions for the
+          // SU in question.
+          adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
+                          TrueMemOrderLatency);
         }
         // Add dependence on barrier chain, if needed.
+        // There is no point in checking aliasing on a barrier event. Even if
+        // SU and the barrier _could_ be reordered, they should not. In
+        // addition, we have lost all RejectMemNodes below the barrier.
         if (BarrierChain)
           BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
       } else {
@@ -688,7 +938,6 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
                                  /*isArtificial=*/true));
     } else if (MI->mayLoad()) {
       bool MayAlias = true;
-      TrueMemOrderLatency = 0;
       if (MI->isInvariantLoad(AA)) {
         // Invariant load, no chain dependencies needed!
       } else {
@@ -700,8 +949,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
           std::map<const Value *, SUnit *>::iterator IE =
             ((MayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
           if (I != IE)
-            I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0, /*Reg=*/0,
-                                    /*isNormalMemory=*/true));
+            addChainDependency(AA, MFI, SU, I->second, RejectMemNodes, 0, true);
           if (MayAlias)
             AliasMemUses[V].push_back(SU);
           else
@@ -711,15 +959,16 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
           // potentially aliasing stores.
           for (std::map<const Value *, SUnit *>::iterator I =
                  AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I)
-            I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+            addChainDependency(AA, MFI, SU, I->second, RejectMemNodes);
 
           PendingLoads.push_back(SU);
           MayAlias = true;
         }
-
+        if (MayAlias)
+          adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes, /*Latency=*/0);
         // Add dependencies on alias and barrier chains, if needed.
         if (MayAlias && AliasChain)
-          AliasChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+          addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes);
         if (BarrierChain)
           BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
       }
@@ -735,8 +984,9 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
 }
 
 void ScheduleDAGInstrs::computeLatency(SUnit *SU) {
-  // Compute the latency for the node.
-  if (!InstrItins || InstrItins->isEmpty()) {
+  // Compute the latency for the node. We only provide a default for missing
+  // itineraries. Empty itineraries still have latency properties.
+  if (!InstrItins) {
     SU->Latency = 1;
 
     // Simplistic target-independent heuristic: assume that loads take
@@ -748,63 +998,15 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
   }
 }
 
-void ScheduleDAGInstrs::computeOperandLatency(SUnit *Def, SUnit *Use,
-                                              SDep& dep) const {
-  if (!InstrItins || InstrItins->isEmpty())
-    return;
-
+unsigned ScheduleDAGInstrs::computeOperandLatency(SUnit *Def, SUnit *Use,
+                                                  const SDep& dep,
+                                                  bool FindMin) const {
   // For a data dependency with a known register...
   if ((dep.getKind() != SDep::Data) || (dep.getReg() == 0))
-    return;
-
-  const unsigned Reg = dep.getReg();
-
-  // ... find the definition of the register in the defining
-  // instruction
-  MachineInstr *DefMI = Def->getInstr();
-  int DefIdx = DefMI->findRegisterDefOperandIdx(Reg);
-  if (DefIdx != -1) {
-    const MachineOperand &MO = DefMI->getOperand(DefIdx);
-    if (MO.isReg() && MO.isImplicit() &&
-        DefIdx >= (int)DefMI->getDesc().getNumOperands()) {
-      // This is an implicit def, getOperandLatency() won't return the correct
-      // latency. e.g.
-      //   %D6<def>, %D7<def> = VLD1q16 %R2<kill>, 0, ..., %Q3<imp-def>
-      //   %Q1<def> = VMULv8i16 %Q1<kill>, %Q3<kill>, ...
-      // What we want is to compute latency between def of %D6/%D7 and use of
-      // %Q3 instead.
-      unsigned Op2 = DefMI->findRegisterDefOperandIdx(Reg, false, true, TRI);
-      if (DefMI->getOperand(Op2).isReg())
-        DefIdx = Op2;
-    }
-    MachineInstr *UseMI = Use->getInstr();
-    // For all uses of the register, calculate the maximum latency.
-    int Latency = -1;
-    if (UseMI) {
-      for (unsigned i = 0, e = UseMI->getNumOperands(); i != e; ++i) {
-        const MachineOperand &MO = UseMI->getOperand(i);
-        if (!MO.isReg() || !MO.isUse())
-          continue;
-        unsigned MOReg = MO.getReg();
-        if (MOReg != Reg)
-          continue;
-
-        int UseCycle = TII->getOperandLatency(InstrItins, DefMI, DefIdx,
-                                              UseMI, i);
-        Latency = std::max(Latency, UseCycle);
-      }
-    } else {
-      // If UseMI is null, then it must be a scheduling barrier.
-      if (!InstrItins || InstrItins->isEmpty())
-        return;
-      unsigned DefClass = DefMI->getDesc().getSchedClass();
-      Latency = InstrItins->getOperandCycle(DefClass, DefIdx);
-    }
+    return 1;
 
-    // If we found a latency, then replace the existing dependence latency.
-    if (Latency >= 0)
-      dep.setLatency(Latency);
-  }
+  return TII->computeOperandLatency(InstrItins, TRI, Def->getInstr(),
+                                    Use->getInstr(), dep.getReg(), FindMin);
 }
 
 void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
diff --git a/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
index 3d22035..e675366 100644
--- a/lib/CodeGen/ScoreboardHazardRecognizer.cpp
+++ b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
@@ -39,13 +39,11 @@ ScoreboardHazardRecognizer(const InstrItineraryData *II,
   DebugType = ParentDebugType;
 #endif
 
-  // Determine the maximum depth of any itinerary. This determines the
-  // depth of the scoreboard. We always make the scoreboard at least 1
-  // cycle deep to avoid dealing with the boundary condition.
+  // Determine the maximum depth of any itinerary. This determines the depth of
+  // the scoreboard. We always make the scoreboard at least 1 cycle deep to
+  // avoid dealing with the boundary condition.
   unsigned ScoreboardDepth = 1;
   if (ItinData && !ItinData->isEmpty()) {
-    IssueWidth = ItinData->IssueWidth;
-
     for (unsigned idx = 0; ; ++idx) {
       if (ItinData->isEndMarker(idx))
         break;
@@ -63,16 +61,26 @@ ScoreboardHazardRecognizer(const InstrItineraryData *II,
       // Find the next power-of-2 >= ItinDepth
       while (ItinDepth > ScoreboardDepth) {
         ScoreboardDepth *= 2;
+        // Don't set MaxLookAhead until we find at least one nonzero stage.
+        // This way, an itinerary with no stages has MaxLookAhead==0, which
+        // completely bypasses the scoreboard hazard logic.
+        MaxLookAhead = ScoreboardDepth;
       }
     }
-    MaxLookAhead = ScoreboardDepth;
   }
 
   ReservedScoreboard.reset(ScoreboardDepth);
   RequiredScoreboard.reset(ScoreboardDepth);
 
-  DEBUG(dbgs() << "Using scoreboard hazard recognizer: Depth = "
-        << ScoreboardDepth << '\n');
+  // If MaxLookAhead is not set above, then we are not enabled.
+  if (!isEnabled())
+    DEBUG(dbgs() << "Disabled scoreboard hazard recognizer\n");
+  else {
+    // A nonempty itinerary must have a SchedModel.
+ IssueWidth = ItinData->SchedModel->IssueWidth; + DEBUG(dbgs() << "Using scoreboard hazard recognizer: Depth = " + << ScoreboardDepth << '\n'); + } } void ScoreboardHazardRecognizer::Reset() { @@ -151,7 +159,7 @@ ScoreboardHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { } if (!freeUnits) { - DEBUG(dbgs() << "*** Hazard in cycle " << (cycle + i) << ", "); + DEBUG(dbgs() << "*** Hazard in cycle +" << StageCycle << ", "); DEBUG(dbgs() << "SU(" << SU->NodeNum << "): "); DEBUG(DAG->dumpNode(SU)); return Hazard; diff --git a/lib/CodeGen/SelectionDAG/CMakeLists.txt b/lib/CodeGen/SelectionDAG/CMakeLists.txt index a6bdc3b..75e8167 100644 --- a/lib/CodeGen/SelectionDAG/CMakeLists.txt +++ b/lib/CodeGen/SelectionDAG/CMakeLists.txt @@ -23,3 +23,5 @@ add_llvm_library(LLVMSelectionDAG TargetLowering.cpp TargetSelectionDAGInfo.cpp ) + +add_dependencies(LLVMSelectionDAG intrinsics_gen) diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 0914c66..747bc44 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -215,6 +215,7 @@ namespace { SDValue visitFADD(SDNode *N); SDValue visitFSUB(SDNode *N); SDValue visitFMUL(SDNode *N); + SDValue visitFMA(SDNode *N); SDValue visitFDIV(SDNode *N); SDValue visitFREM(SDNode *N); SDValue visitFCOPYSIGN(SDNode *N); @@ -328,15 +329,12 @@ namespace { class WorkListRemover : public SelectionDAG::DAGUpdateListener { DAGCombiner &DC; public: - explicit WorkListRemover(DAGCombiner &dc) : DC(dc) {} + explicit WorkListRemover(DAGCombiner &dc) + : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {} virtual void NodeDeleted(SDNode *N, SDNode *E) { DC.removeFromWorkList(N); } - - virtual void NodeUpdated(SDNode *N) { - // Ignore updates. - } }; } @@ -619,8 +617,7 @@ SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, N->getValueType(i) == To[i].getValueType()) && "Cannot combine value to value of different type!")); WorkListRemover DeadNodes(*this); - DAG.ReplaceAllUsesWith(N, To, &DeadNodes); - + DAG.ReplaceAllUsesWith(N, To); if (AddTo) { // Push the new nodes and any users onto the worklist for (unsigned i = 0, e = NumTo; i != e; ++i) { @@ -650,7 +647,7 @@ CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { // Replace all uses. If any nodes become isomorphic to other nodes and // are deleted, make sure to remove them from our worklist. WorkListRemover DeadNodes(*this); - DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New, &DeadNodes); + DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New); // Push the new node and any (possibly new) users onto the worklist. 
AddToWorkList(TLO.New.getNode()); @@ -707,9 +704,8 @@ void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) { Trunc.getNode()->dump(&DAG); dbgs() << '\n'); WorkListRemover DeadNodes(*this); - DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc, &DeadNodes); - DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1), - &DeadNodes); + DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc); + DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1)); removeFromWorkList(Load); DAG.DeleteNode(Load); AddToWorkList(Trunc.getNode()); @@ -961,8 +957,8 @@ bool DAGCombiner::PromoteLoad(SDValue Op) { Result.getNode()->dump(&DAG); dbgs() << '\n'); WorkListRemover DeadNodes(*this); - DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result, &DeadNodes); - DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1), &DeadNodes); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1)); removeFromWorkList(N); DAG.DeleteNode(N); AddToWorkList(Result.getNode()); @@ -1047,12 +1043,12 @@ void DAGCombiner::Run(CombineLevel AtLevel) { DAG.TransferDbgValues(SDValue(N, 0), RV); WorkListRemover DeadNodes(*this); if (N->getNumValues() == RV.getNode()->getNumValues()) - DAG.ReplaceAllUsesWith(N, RV.getNode(), &DeadNodes); + DAG.ReplaceAllUsesWith(N, RV.getNode()); else { assert(N->getValueType(0) == RV.getValueType() && N->getNumValues() == 1 && "Type mismatch"); SDValue OpV = RV; - DAG.ReplaceAllUsesWith(N, &OpV, &DeadNodes); + DAG.ReplaceAllUsesWith(N, &OpV); } // Push the new node and any users onto the worklist @@ -1131,6 +1127,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::FADD: return visitFADD(N); case ISD::FSUB: return visitFSUB(N); case ISD::FMUL: return visitFMUL(N); + case ISD::FMA: return visitFMA(N); case ISD::FDIV: return visitFDIV(N); case ISD::FREM: return visitFREM(N); case ISD::FCOPYSIGN: return visitFCOPYSIGN(N); @@ -1325,10 +1322,12 @@ SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) { // Replacing results may cause a different MERGE_VALUES to suddenly // be CSE'd with N, and carry its uses with it. Iterate until no // uses remain, to ensure that the node can be safely deleted. + // First add the users of this node to the work list so that they + // can be tried again once they have new operands. + AddUsersToWorkList(N); do { for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) - DAG.ReplaceAllUsesOfValueWith(SDValue(N, i), N->getOperand(i), - &DeadNodes); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, i), N->getOperand(i)); } while (!N->use_empty()); removeFromWorkList(N); DAG.DeleteNode(N); @@ -1640,7 +1639,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { if (N1.getOpcode() == ISD::ADD && N0C && N1C1) { SDValue NewC = DAG.getConstant((N0C->getAPIntValue() - N1C1->getAPIntValue()), VT); return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, NewC, - N1.getOperand(0)); + N1.getOperand(0)); } // fold ((A+(B+or-C))-B) -> A+or-C if (N0.getOpcode() == ISD::ADD && @@ -2341,7 +2340,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper // on scalars. 
if ((N0.getOpcode() == ISD::BITCAST || N0.getOpcode() == ISD::SCALAR_TO_VECTOR) - && Level == AfterLegalizeVectorOps) { + && Level == AfterLegalizeTypes) { SDValue In0 = N0.getOperand(0); SDValue In1 = N1.getOperand(0); EVT In0Ty = In0.getValueType(); @@ -2528,7 +2527,14 @@ SDValue DAGCombiner::visitAND(SDNode *N) { Load->getOffset(), Load->getMemoryVT(), Load->getMemOperand()); // Replace uses of the EXTLOAD with the new ZEXTLOAD. - CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1)); + if (Load->getNumValues() == 3) { + // PRE/POST_INC loads have 3 values. + SDValue To[] = { NewLoad.getValue(0), NewLoad.getValue(1), + NewLoad.getValue(2) }; + CombineTo(Load, To, 3, true); + } else { + CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1)); + } } // Fold the AND away, taking care not to fold to the old load node if we @@ -2710,6 +2716,34 @@ SDValue DAGCombiner::visitAND(SDNode *N) { } } + if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL && + VT.getSizeInBits() <= 64) { + if (ConstantSDNode *ADDI = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { + APInt ADDC = ADDI->getAPIntValue(); + if (!TLI.isLegalAddImmediate(ADDC.getSExtValue())) { + // Look for (and (add x, c1), (lshr y, c2)). If C1 wasn't a legal + // immediate for an add, but it is legal if its top c2 bits are set, + // transform the ADD so the immediate doesn't need to be materialized + // in a register. + if (ConstantSDNode *SRLI = dyn_cast<ConstantSDNode>(N1.getOperand(1))) { + APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(), + SRLI->getZExtValue()); + if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) { + ADDC |= Mask; + if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) { + SDValue NewAdd = + DAG.getNode(ISD::ADD, N0.getDebugLoc(), VT, + N0.getOperand(0), DAG.getConstant(ADDC, VT)); + CombineTo(N0.getNode(), NewAdd); + return SDValue(N, 0); // Return N so it doesn't get rechecked! + } + } + } + } + } + } + + return SDValue(); } @@ -4526,8 +4560,10 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue Op = N0.getOperand(0); if (Op.getValueType().bitsLT(VT)) { Op = DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, Op); + AddToWorkList(Op.getNode()); } else if (Op.getValueType().bitsGT(VT)) { Op = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, Op); + AddToWorkList(Op.getNode()); } return DAG.getZeroExtendInReg(Op, N->getDebugLoc(), N0.getValueType().getScalarType()); @@ -5012,6 +5048,10 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); EVT PtrType = N0.getOperand(1).getValueType(); + if (PtrType == MVT::Untyped || PtrType.isExtended()) + // It's not possible to generate a constant of extended or untyped type. + return SDValue(); + // For big endian targets, we need to adjust the offset to the pointer to // load the correct bytes. if (TLI.isBigEndian()) { @@ -5041,8 +5081,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { // Replace the old load's chain with the new load's chain. WorkListRemover DeadNodes(*this); - DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1), - &DeadNodes); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); // Shift the result left, if we've swallowed a left shift. SDValue Result = Load; @@ -5225,7 +5264,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { SDValue EltNo = N0->getOperand(1); if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) { int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); - + EVT IndexTy = N0->getOperand(1).getValueType(); int Index = isLE ? 
(Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1)); SDValue V = DAG.getNode(ISD::BITCAST, N->getDebugLoc(), @@ -5233,7 +5272,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, N->getDebugLoc(), TrTy, V, - DAG.getConstant(Index, MVT::i32)); + DAG.getConstant(Index, IndexTy)); } } @@ -5607,7 +5646,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { if (FoldedVOp.getNode()) return FoldedVOp; } - // fold (fadd c1, c2) -> (fadd c1, c2) + // fold (fadd c1, c2) -> c1 + c2 if (N0CFP && N1CFP && VT != MVT::ppcf128) return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, N1); // canonicalize constant to RHS @@ -5636,6 +5675,26 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0.getOperand(1), N1)); + // FADD -> FMA combines: + if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast || + DAG.getTarget().Options.UnsafeFPMath) && + DAG.getTarget().getTargetLowering()->isFMAFasterThanMulAndAdd(VT) && + TLI.isOperationLegal(ISD::FMA, VT)) { + + // fold (fadd (fmul x, y), z) -> (fma x, y, z) + if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse()) { + return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT, + N0.getOperand(0), N0.getOperand(1), N1); + } + + // fold (fadd x, (fmul y, z)) -> (fma x, y, z) + // Note: Commutes FADD operands. + if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse()) { + return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT, + N1.getOperand(0), N1.getOperand(1), N0); + } + } + return SDValue(); } @@ -5673,9 +5732,13 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { GetNegatedExpression(N1, DAG, LegalOperations)); // If 'unsafe math' is enabled, fold + // (fsub x, x) -> 0.0 & // (fsub x, (fadd x, y)) -> (fneg y) & // (fsub x, (fadd y, x)) -> (fneg y) if (DAG.getTarget().Options.UnsafeFPMath) { + if (N0 == N1) + return DAG.getConstantFP(0.0f, VT); + if (N1.getOpcode() == ISD::FADD) { SDValue N10 = N1->getOperand(0); SDValue N11 = N1->getOperand(1); @@ -5689,6 +5752,29 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { } } + // FSUB -> FMA combines: + if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast || + DAG.getTarget().Options.UnsafeFPMath) && + DAG.getTarget().getTargetLowering()->isFMAFasterThanMulAndAdd(VT) && + TLI.isOperationLegal(ISD::FMA, VT)) { + + // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) + if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse()) { + return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT, + N0.getOperand(0), N0.getOperand(1), + DAG.getNode(ISD::FNEG, N1->getDebugLoc(), VT, N1)); + } + + // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) + // Note: Commutes FSUB operands. 
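These FADD/FSUB -> FMA folds are gated behind fast or unsafe FP math because a fused multiply-add rounds once where the separate multiply and add round twice, so the two forms can differ in the last bits. A self-contained demo of that difference using std::fma:

#include <cmath>
#include <cstdio>

int main() {
  double x = 1.0 + 0x1p-27, y = 1.0 - 0x1p-27, z = -1.0;
  double separate = x * y + z;         // product rounded, then sum rounded
  double fused    = std::fma(x, y, z); // single rounding of x*y + z
  // x*y is exactly 1 - 2^-54, which rounds to 1.0, so 'separate' is 0,
  // while the fused form keeps the exact residual.
  std::printf("%a\n%a\n", separate, fused); // 0x0p+0 vs -0x1p-54
}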
+ if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse()) {
+ return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT,
+ DAG.getNode(ISD::FNEG, N1->getDebugLoc(), VT,
+ N1.getOperand(0)),
+ N1.getOperand(1), N0);
+ }
+ }
+
 return SDValue();
 }
 
@@ -5720,6 +5806,9 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
 if (DAG.getTarget().Options.UnsafeFPMath &&
 ISD::isBuildVectorAllZeros(N1.getNode()))
 return N1;
+ // fold (fmul A, 1.0) -> A
+ if (N1CFP && N1CFP->isExactlyValue(1.0))
+ return N0;
 // fold (fmul X, 2.0) -> (fadd X, X)
 if (N1CFP && N1CFP->isExactlyValue(+2.0))
 return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, N0);
@@ -5753,6 +5842,26 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
 return SDValue();
 }
 
+SDValue DAGCombiner::visitFMA(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+ EVT VT = N->getValueType(0);
+
+ if (N0CFP && N0CFP->isExactlyValue(1.0))
+ return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N1, N2);
+ if (N1CFP && N1CFP->isExactlyValue(1.0))
+ return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, N2);
+
+ // Canonicalize (fma c, x, y) -> (fma x, c, y)
+ if (N0CFP && !N1CFP)
+ return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT, N1, N0, N2);
+
+ return SDValue();
+}
+
 SDValue DAGCombiner::visitFDIV(SDNode *N) {
 SDValue N0 = N->getOperand(0);
 SDValue N1 = N->getOperand(1);
@@ -5893,6 +6002,38 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
 return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), VT, N0);
 }
 
+ // The next optimizations are desirable only if SELECT_CC can be lowered.
+ // Check against MVT::Other for SELECT_CC, which is a workaround for targets
+ // having to say they don't support SELECT_CC on every type the DAG knows
+ // about, since there is no way to mark an opcode illegal at all value types
+ // (See also visitSELECT)
+ if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, MVT::Other)) {
+ // fold (sint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0, cc)
+ if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 &&
+ !VT.isVector() &&
+ (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) {
+ SDValue Ops[] =
+ { N0.getOperand(0), N0.getOperand(1),
+ DAG.getConstantFP(-1.0, VT), DAG.getConstantFP(0.0, VT),
+ N0.getOperand(2) };
+ return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT, Ops, 5);
+ }
+
+ // fold (sint_to_fp (zext (setcc x, y, cc))) ->
+ // (select_cc x, y, 1.0, 0.0, cc)
+ if (N0.getOpcode() == ISD::ZERO_EXTEND &&
+ N0.getOperand(0).getOpcode() == ISD::SETCC && !VT.isVector() &&
+ (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) {
+ SDValue Ops[] =
+ { N0.getOperand(0).getOperand(0), N0.getOperand(0).getOperand(1),
+ DAG.getConstantFP(1.0, VT), DAG.getConstantFP(0.0, VT),
+ N0.getOperand(0).getOperand(2) };
+ return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT, Ops, 5);
+ }
+ }
+
 return SDValue();
 }
 
@@ -5918,6 +6059,25 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
 return DAG.getNode(ISD::SINT_TO_FP, N->getDebugLoc(), VT, N0);
 }
 
+ // The next optimizations are desirable only if SELECT_CC can be lowered.
+ // Check against MVT::Other for SELECT_CC, which is a workaround for targets
+ // having to say they don't support SELECT_CC on every type the DAG knows
+ // about, since there is no way to mark an opcode illegal at all value types
+ // (See also visitSELECT)
+ if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, MVT::Other)) {
+ // fold (uint_to_fp (setcc x, y, cc)) -> (select_cc x, y, 1.0, 0.0, cc)
+
+ if (N0.getOpcode() == ISD::SETCC && !VT.isVector() &&
+ (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) {
+ SDValue Ops[] =
+ { N0.getOperand(0), N0.getOperand(1),
+ DAG.getConstantFP(1.0, VT), DAG.getConstantFP(0.0, VT),
+ N0.getOperand(2) };
+ return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT, Ops, 5);
+ }
+ }
+
 return SDValue();
 }
 
@@ -6185,7 +6345,7 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
 }
 // Replace the uses of SRL with SETCC
 WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(N1, SetCC, &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(N1, SetCC);
 removeFromWorkList(N1.getNode());
 DAG.DeleteNode(N1.getNode());
 return SDValue(N, 0); // Return N so it doesn't get rechecked!
@@ -6214,7 +6374,7 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
 Tmp.getNode()->dump(&DAG);
 dbgs() << '\n');
 WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(N1, Tmp, &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(N1, Tmp);
 removeFromWorkList(TheXor);
 DAG.DeleteNode(TheXor);
 return DAG.getNode(ISD::BRCOND, N->getDebugLoc(),
@@ -6240,7 +6400,7 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
 Equal ? ISD::SETEQ : ISD::SETNE);
 // Replace the uses of XOR with SETCC
 WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(N1, SetCC, &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(N1, SetCC);
 removeFromWorkList(N1.getNode());
 DAG.DeleteNode(N1.getNode());
 return DAG.getNode(ISD::BRCOND, N->getDebugLoc(),
@@ -6431,21 +6591,17 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
 dbgs() << '\n');
 WorkListRemover DeadNodes(*this);
 if (isLoad) {
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0),
- &DeadNodes);
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
 } else {
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
 }
 
 // Finally, since the node is now dead, remove it from the graph.
 DAG.DeleteNode(N);
 
 // Replace the uses of Ptr with uses of the updated base value.
- DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0));
 removeFromWorkList(Ptr.getNode());
 DAG.DeleteNode(Ptr.getNode());
 
@@ -6559,13 +6715,10 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
 dbgs() << '\n');
 WorkListRemover DeadNodes(*this);
 if (isLoad) {
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0),
- &DeadNodes);
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
 } else {
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
 }
 
 // Finally, since the node is now dead, remove it from the graph.
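Why the chain moves from result 1 to result 2 in the load case above: a plain load yields the pair (value, chain), while a pre/post-indexed load grows a third result for the written-back base pointer. A schematic of the index mapping, with hypothetical enum names:

// A plain load has results (value, chain); an indexed load adds a result
// for the updated base. Index maps matching the replacements above:
enum PlainLoadResult   { PL_Value = 0, PL_Chain = 1 };
enum IndexedLoadResult { IL_Value = 0, IL_NewBase = 1, IL_Chain = 2 };
// Old value 0 -> new value 0; old chain 1 -> new value 2; and every use of
// the separately incremented pointer is redirected to new value 1.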
@@ -6573,8 +6726,7 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
 // Replace the uses of Use with uses of the updated base value.
 DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0),
- Result.getValue(isLoad ? 1 : 0),
- &DeadNodes);
+ Result.getValue(isLoad ? 1 : 0));
 removeFromWorkList(Op);
 DAG.DeleteNode(Op);
 return true;
@@ -6609,7 +6761,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
 Chain.getNode()->dump(&DAG);
 dbgs() << "\n");
 WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain, &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
 
 if (N->use_empty()) {
 removeFromWorkList(N);
@@ -6629,11 +6781,10 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
 Undef.getNode()->dump(&DAG);
 dbgs() << " and 2 other values\n");
 WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef, &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef);
 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1),
- DAG.getUNDEF(N->getValueType(1)),
- &DeadNodes);
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain, &DeadNodes);
+ DAG.getUNDEF(N->getValueType(1)));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain);
 removeFromWorkList(N);
 DAG.DeleteNode(N);
 return SDValue(N, 0); // Return N so it doesn't get rechecked!
@@ -6955,8 +7106,7 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
 AddToWorkList(NewLD.getNode());
 AddToWorkList(NewVal.getNode());
 WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
 ++OpsNarrowed;
 return NewST;
 }
@@ -7013,8 +7163,7 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
 AddToWorkList(NewLD.getNode());
 AddToWorkList(NewST.getNode());
 WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1));
 ++LdStFP2Int;
 return NewST;
 }
@@ -7058,7 +7207,8 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
 SDValue Tmp;
 switch (CFP->getValueType(0).getSimpleVT().SimpleTy) {
 default: llvm_unreachable("Unknown FP type");
- case MVT::f80: // We don't do this for these yet.
+ case MVT::f16: // We don't do this for these yet.
+ case MVT::f80:
 case MVT::f128:
 case MVT::ppcf128:
 break;
@@ -7323,8 +7473,9 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
 OrigElt -= NumElem;
 }
 
+ EVT IndexTy = N->getOperand(1).getValueType();
 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, N->getDebugLoc(), NVT,
- InVec, DAG.getConstant(OrigElt, MVT::i32));
+ InVec, DAG.getConstant(OrigElt, IndexTy));
 }
 
 // Perform only after legalization to ensure build_vector / vector_shuffle
@@ -7472,7 +7623,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
 WorkListRemover DeadNodes(*this);
 SDValue From[] = { SDValue(N, 0), SDValue(LN0,1) };
 SDValue To[] = { Load, Chain };
- DAG.ReplaceAllUsesOfValuesWith(From, To, 2, &DeadNodes);
+ DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
 // Since we're explicitly calling ReplaceAllUses, add the new node to the
 // worklist explicitly as well.
 AddToWorkList(Load.getNode());
@@ -7489,6 +7640,11 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
 unsigned NumInScalars = N->getNumOperands();
 DebugLoc dl = N->getDebugLoc();
 EVT VT = N->getValueType(0);
+
+ // A vector built entirely of undefs is undef.
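The check that follows relies on the new ISD::allOperandsUndef helper, which replaces the hand-rolled AllUndef flag removed later in this hunk. A toy analog of what such a scan does, with stand-in types rather than LLVM's:

#include <vector>

struct Val { bool undef; };

// One shared scan instead of each combine tracking its own AllUndef flag.
// Whether zero operands should count as "all undef" is a judgment call;
// here an empty node is conservatively not undef.
static bool allOperandsUndef(const std::vector<Val> &ops) {
  if (ops.empty())
    return false;
  for (const Val &v : ops)
    if (!v.undef)
      return false;
  return true;  // e.g. a build_vector of all undefs folds to undef
}

int main() { return allOperandsUndef({{true}, {true}}) ? 0 : 1; }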
+ if (ISD::allOperandsUndef(N)) + return DAG.getUNDEF(VT); + // Check to see if this is a BUILD_VECTOR of a bunch of values // which come from any_extend or zero_extend nodes. If so, we can create // a new BUILD_VECTOR using bit-casts which may enable other BUILD_VECTOR @@ -7496,12 +7652,11 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { // using shuffles. EVT SourceType = MVT::Other; bool AllAnyExt = true; - bool AllUndef = true; + for (unsigned i = 0; i != NumInScalars; ++i) { SDValue In = N->getOperand(i); // Ignore undef inputs. if (In.getOpcode() == ISD::UNDEF) continue; - AllUndef = false; bool AnyExt = In.getOpcode() == ISD::ANY_EXTEND; bool ZeroExt = In.getOpcode() == ISD::ZERO_EXTEND; @@ -7529,9 +7684,6 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { AllAnyExt &= AnyExt; } - if (AllUndef) - return DAG.getUNDEF(VT); - // In order to have valid types, all of the inputs must be extended from the // same source type and all of the inputs must be any or zero extend. // Scalar sizes must be a power of two. @@ -7707,6 +7859,10 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { if (N->getNumOperands() == 1) return N->getOperand(0); + // Check if all of the operands are undefs. + if (ISD::allOperandsUndef(N)) + return DAG.getUNDEF(N->getValueType(0)); + return SDValue(); } diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index 0c1ac69..e5ea6e6 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -40,6 +40,7 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "isel" +#include "llvm/DebugInfo.h" #include "llvm/Function.h" #include "llvm/GlobalVariable.h" #include "llvm/Instructions.h" @@ -51,7 +52,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Analysis/Loads.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetInstrInfo.h" @@ -484,7 +484,7 @@ bool FastISel::SelectGetElementPtr(const User *I) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) { if (CI->isZero()) continue; // N = N + Offset - TotalOffs += + TotalOffs += TD.getTypeAllocSize(Ty)*cast<ConstantInt>(CI)->getSExtValue(); if (TotalOffs >= MaxOffs) { N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT); @@ -573,7 +573,10 @@ bool FastISel::SelectCall(const User *I) { // At -O0 we don't care about the lifetime intrinsics. case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: + // The donothing intrinsic does, well, nothing. + case Intrinsic::donothing: return true; + case Intrinsic::dbg_declare: { const DbgDeclareInst *DI = cast<DbgDeclareInst>(Call); if (!DIVariable(DI->getVariable()).Verify() || @@ -642,7 +645,7 @@ bool FastISel::SelectCall(const User *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) .addCImm(CI).addImm(DI->getOffset()) .addMetadata(DI->getVariable()); - else + else BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) .addImm(CI->getZExtValue()).addImm(DI->getOffset()) .addMetadata(DI->getVariable()); @@ -792,7 +795,7 @@ FastISel::SelectInstruction(const Instruction *I) { DL = DebugLoc(); return true; } - // Remove dead code. However, ignore call instructions since we've flushed + // Remove dead code. However, ignore call instructions since we've flushed // the local value map and recomputed the insert point. 
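Returning to the SelectGetElementPtr hunk above: constant GEP indices are folded into one running byte offset, which is flushed into a single add whenever it crosses MaxOffs. A scalar sketch of that accumulation, with made-up element sizes, indices, and threshold:

#include <cstdint>
#include <cstdio>

int main() {
  const int64_t MaxOffs = 2048;           // hypothetical flush threshold
  int64_t ElemSize[] = {16, 4, 1};        // alloc size at each indexed level
  int64_t Index[]    = {3, 7, 9};         // constant GEP indices
  int64_t Addr = 0x1000, TotalOffs = 0;
  for (int i = 0; i < 3; ++i) {
    TotalOffs += ElemSize[i] * Index[i];  // N = N + Offset, folded
    if (TotalOffs >= MaxOffs) {           // emit one add, reset accumulator
      Addr += TotalOffs;
      TotalOffs = 0;
    }
  }
  if (TotalOffs) Addr += TotalOffs;       // final flush
  std::printf("%#llx\n", (unsigned long long)Addr); // 0x1055
}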
if (!isa<CallInst>(I)) { recomputeInsertPt(); @@ -1306,6 +1309,30 @@ unsigned FastISel::FastEmitInst_rri(unsigned MachineInstOpcode, return ResultReg; } +unsigned FastISel::FastEmitInst_rrii(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill, + uint64_t Imm1, uint64_t Imm2) { + unsigned ResultReg = createResultReg(RC); + const MCInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addReg(Op1, Op1IsKill * RegState::Kill) + .addImm(Imm1).addImm(Imm2); + else { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addReg(Op1, Op1IsKill * RegState::Kill) + .addImm(Imm1).addImm(Imm2); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(II.ImplicitDefs[0]); + } + return ResultReg; +} + unsigned FastISel::FastEmitInst_i(unsigned MachineInstOpcode, const TargetRegisterClass *RC, uint64_t Imm) { @@ -1345,6 +1372,8 @@ unsigned FastISel::FastEmitInst_extractsubreg(MVT RetVT, unsigned ResultReg = createResultReg(TLI.getRegClassFor(RetVT)); assert(TargetRegisterInfo::isVirtualRegister(Op0) && "Cannot yet extract from physregs"); + const TargetRegisterClass *RC = MRI.getRegClass(Op0); + MRI.constrainRegClass(Op0, TRI.getSubClassWithSubReg(RC, Idx)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), ResultReg) .addReg(Op0, getKillRegState(Op0IsKill), Idx); diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index 8dde919..3e18ea7 100644 --- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -15,13 +15,13 @@ #define DEBUG_TYPE "function-lowering-info" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" #include "llvm/Module.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFrameInfo.h" diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 1467d88..936c126 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -48,16 +48,31 @@ unsigned InstrEmitter::CountResults(SDNode *Node) { return N; } -/// CountOperands - The inputs to target nodes have any actual inputs first, +/// countOperands - The inputs to target nodes have any actual inputs first, /// followed by an optional chain operand, then an optional glue operand. /// Compute the number of actual operands that will go into the resulting /// MachineInstr. -unsigned InstrEmitter::CountOperands(SDNode *Node) { +/// +/// Also count physreg RegisterSDNode and RegisterMaskSDNode operands preceding +/// the chain and glue. These operands may be implicit on the machine instr. +static unsigned countOperands(SDNode *Node, unsigned &NumImpUses) { unsigned N = Node->getNumOperands(); while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) --N; if (N && Node->getOperand(N - 1).getValueType() == MVT::Other) --N; // Ignore chain if it exists. + + // Count RegisterSDNode and RegisterMaskSDNode operands for NumImpUses. 
+ for (unsigned I = N; I; --I) { + if (isa<RegisterMaskSDNode>(Node->getOperand(I - 1))) + continue; + if (RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Node->getOperand(I - 1))) + if (TargetRegisterInfo::isPhysicalRegister(RN->getReg())) + continue; + NumImpUses = N - I; + break; + } + return N; } @@ -114,8 +129,10 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, if (User->isMachineOpcode()) { const MCInstrDesc &II = TII->get(User->getMachineOpcode()); const TargetRegisterClass *RC = 0; - if (i+II.getNumDefs() < II.getNumOperands()) - RC = TII->getRegClass(II, i+II.getNumDefs(), TRI); + if (i+II.getNumDefs() < II.getNumOperands()) { + RC = TRI->getAllocatableClass( + TII->getRegClass(II, i+II.getNumDefs(), TRI, *MF)); + } if (!UseRC) UseRC = RC; else if (RC) { @@ -196,7 +213,8 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node, MachineInstr *MI, // is a vreg in the same register class, use the CopyToReg'd destination // register instead of creating a new vreg. unsigned VRBase = 0; - const TargetRegisterClass *RC = TII->getRegClass(II, i, TRI); + const TargetRegisterClass *RC = + TRI->getAllocatableClass(TII->getRegClass(II, i, TRI, *MF)); if (II.OpInfo[i].isOptionalDef()) { // Optional def must be a physical register. unsigned NumResults = CountResults(Node); @@ -293,7 +311,7 @@ InstrEmitter::AddRegisterOperand(MachineInstr *MI, SDValue Op, if (II) { const TargetRegisterClass *DstRC = 0; if (IIOpNum < II->getNumOperands()) - DstRC = TII->getRegClass(*II, IIOpNum, TRI); + DstRC = TRI->getAllocatableClass(TII->getRegClass(*II,IIOpNum,TRI,*MF)); assert((DstRC || (MI->isVariadic() && IIOpNum >= MCID.getNumOperands())) && "Don't have operand info for this instruction!"); if (DstRC && !MRI->constrainRegClass(VReg, DstRC, MinRCSize)) { @@ -334,8 +352,7 @@ InstrEmitter::AddRegisterOperand(MachineInstr *MI, SDValue Op, /// AddOperand - Add the specified operand to the specified machine instr. II /// specifies the instruction information for the node, and IIOpNum is the -/// operand number (in the II) that we are adding. IIOpNum and II are used for -/// assertions only. +/// operand number (in the II) that we are adding. void InstrEmitter::AddOperand(MachineInstr *MI, SDValue Op, unsigned IIOpNum, const MCInstrDesc *II, @@ -350,7 +367,11 @@ void InstrEmitter::AddOperand(MachineInstr *MI, SDValue Op, const ConstantFP *CFP = F->getConstantFPValue(); MI->addOperand(MachineOperand::CreateFPImm(CFP)); } else if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(Op)) { - MI->addOperand(MachineOperand::CreateReg(R->getReg(), false)); + // Turn additional physreg operands into implicit uses on non-variadic + // instructions. This is used by call and return instructions passing + // arguments in registers. 
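Before the Imp computation below, a compact toy of the two cooperating pieces in this file: counting the trailing physical-register and regmask operands (countOperands) and then marking any operand beyond the instruction description's expected count as implicit (AddOperand). The types are stand-ins, not the real SDNode operands or MCInstrDesc:

#include <cstddef>
#include <vector>

struct Operand { bool isPhysReg; bool isRegMask; bool implicit = false; };

// How many operands at the tail are physregs/regmasks (NumImpUses).
static size_t countTrailingImpUses(const std::vector<Operand> &ops) {
  size_t i = ops.size();
  while (i && (ops[i - 1].isPhysReg || ops[i - 1].isRegMask))
    --i;
  return ops.size() - i;
}

// Extra physreg operands on a non-variadic instruction (e.g. argument
// registers on a call) become implicit uses on the machine instr.
static void markImplicit(std::vector<Operand> &ops, size_t numDescOperands,
                         bool isVariadic) {
  for (size_t i = 0; i < ops.size(); ++i)
    if (i >= numDescOperands && !isVariadic && ops[i].isPhysReg)
      ops[i].implicit = true;
}

int main() {
  std::vector<Operand> ops = {{false, false}, {true, false}, {true, false}};
  size_t imp = countTrailingImpUses(ops);          // 2 trailing physregs
  markImplicit(ops, /*numDescOperands=*/1, /*isVariadic=*/false);
  return (int)imp - 2;                             // exits 0
}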
+ bool Imp = II && (IIOpNum >= II->getNumOperands() && !II->isVariadic()); + MI->addOperand(MachineOperand::CreateReg(R->getReg(), false, Imp)); } else if (RegisterMaskSDNode *RM = dyn_cast<RegisterMaskSDNode>(Op)) { MI->addOperand(MachineOperand::CreateRegMask(RM->getRegMask())); } else if (GlobalAddressSDNode *TGA = dyn_cast<GlobalAddressSDNode>(Op)) { @@ -458,7 +479,8 @@ void InstrEmitter::EmitSubregNode(SDNode *Node, unsigned SrcReg, DstReg, DefSubIdx; if (DefMI && TII->isCoalescableExtInstr(*DefMI, SrcReg, DstReg, DefSubIdx) && - SubIdx == DefSubIdx) { + SubIdx == DefSubIdx && + TRC == MRI->getRegClass(SrcReg)) { // Optimize these: // r1025 = s/zext r1024, 4 // r1026 = extract_subreg r1025, 4 @@ -467,6 +489,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node, VRBase = MRI->createVirtualRegister(TRC); BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY), VRBase).addReg(SrcReg); + MRI->clearKillFlags(SrcReg); } else { // VReg may not support a SubIdx sub-register, and we may need to // constrain its register class or issue a COPY to a compatible register @@ -548,7 +571,8 @@ InstrEmitter::EmitCopyToRegClassNode(SDNode *Node, // Create the new VReg in the destination class and emit a copy. unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); - const TargetRegisterClass *DstRC = TRI->getRegClass(DstRCIdx); + const TargetRegisterClass *DstRC = + TRI->getAllocatableClass(TRI->getRegClass(DstRCIdx)); unsigned NewVReg = MRI->createVirtualRegister(DstRC); BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg); @@ -566,7 +590,7 @@ void InstrEmitter::EmitRegSequence(SDNode *Node, bool IsClone, bool IsCloned) { unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue(); const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx); - unsigned NewVReg = MRI->createVirtualRegister(RC); + unsigned NewVReg = MRI->createVirtualRegister(TRI->getAllocatableClass(RC)); MachineInstr *MI = BuildMI(*MF, Node->getDebugLoc(), TII->get(TargetOpcode::REG_SEQUENCE), NewVReg); unsigned NumOps = Node->getNumOperands(); @@ -691,7 +715,8 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, const MCInstrDesc &II = TII->get(Opc); unsigned NumResults = CountResults(Node); - unsigned NodeOperands = CountOperands(Node); + unsigned NumImpUses = 0; + unsigned NodeOperands = countOperands(Node, NumImpUses); bool HasPhysRegOuts = NumResults > II.getNumDefs() && II.getImplicitDefs()!=0; #ifndef NDEBUG unsigned NumMIOperands = NodeOperands + NumResults; @@ -700,7 +725,8 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, "Too few operands for a variadic node!"); else assert(NumMIOperands >= II.getNumOperands() && - NumMIOperands <= II.getNumOperands()+II.getNumImplicitDefs() && + NumMIOperands <= II.getNumOperands() + II.getNumImplicitDefs() + + NumImpUses && "#operands for dag node doesn't match .td file!"); #endif diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.h b/lib/CodeGen/SelectionDAG/InstrEmitter.h index c081f38..9eddee9 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.h +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.h @@ -105,12 +105,6 @@ public: /// (which do not go into the machine instrs.) static unsigned CountResults(SDNode *Node); - /// CountOperands - The inputs to target nodes have any actual inputs first, - /// followed by an optional chain operand, then flag operands. Compute - /// the number of actual operands that will go into the resulting - /// MachineInstr. 
- static unsigned CountOperands(SDNode *Node); - /// EmitDbgValue - Generate machine instruction for a dbg_value node. /// MachineInstr *EmitDbgValue(SDDbgValue *SD, diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index a96a997..b0776af 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -11,7 +11,11 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Analysis/DebugInfo.h" +#include "llvm/CallingConv.h" +#include "llvm/Constants.h" +#include "llvm/DebugInfo.h" +#include "llvm/DerivedTypes.h" +#include "llvm/LLVMContext.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" @@ -20,10 +24,6 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/CallingConv.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/LLVMContext.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -70,6 +70,9 @@ private: SDValue OptimizeFloatStore(StoreSDNode *ST); + void LegalizeLoadOps(SDNode *Node); + void LegalizeStoreOps(SDNode *Node); + /// PerformInsertVectorEltInMemory - Some target cannot handle a variable /// insertion index for the INSERT_VECTOR_ELT instruction. In this case, it /// is necessary to spill the vector being inserted into to memory, perform @@ -150,21 +153,21 @@ public: // Node replacement helpers void ReplacedNode(SDNode *N) { if (N->use_empty()) { - DAG.RemoveDeadNode(N, this); + DAG.RemoveDeadNode(N); } else { ForgetNode(N); } } void ReplaceNode(SDNode *Old, SDNode *New) { - DAG.ReplaceAllUsesWith(Old, New, this); + DAG.ReplaceAllUsesWith(Old, New); ReplacedNode(Old); } void ReplaceNode(SDValue Old, SDValue New) { - DAG.ReplaceAllUsesWith(Old, New, this); + DAG.ReplaceAllUsesWith(Old, New); ReplacedNode(Old.getNode()); } void ReplaceNode(SDNode *Old, const SDValue *New) { - DAG.ReplaceAllUsesWith(Old, New, this); + DAG.ReplaceAllUsesWith(Old, New); ReplacedNode(Old); } }; @@ -203,7 +206,8 @@ SelectionDAGLegalize::ShuffleWithNarrowerEltType(EVT NVT, EVT VT, DebugLoc dl, } SelectionDAGLegalize::SelectionDAGLegalize(SelectionDAG &dag) - : TM(dag.getTarget()), TLI(dag.getTargetLoweringInfo()), + : SelectionDAG::DAGUpdateListener(dag), + TM(dag.getTarget()), TLI(dag.getTargetLoweringInfo()), DAG(dag) { } @@ -638,9 +642,8 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { // probably means that we need to integrate dag combiner and legalizer // together. // We generally can't do this one for long doubles. - SDValue Tmp1 = ST->getChain(); - SDValue Tmp2 = ST->getBasePtr(); - SDValue Tmp3; + SDValue Chain = ST->getChain(); + SDValue Ptr = ST->getBasePtr(); unsigned Alignment = ST->getAlignment(); bool isVolatile = ST->isVolatile(); bool isNonTemporal = ST->isNonTemporal(); @@ -648,19 +651,19 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(ST->getValue())) { if (CFP->getValueType(0) == MVT::f32 && TLI.isTypeLegal(MVT::i32)) { - Tmp3 = DAG.getConstant(CFP->getValueAPF(). + SDValue Con = DAG.getConstant(CFP->getValueAPF(). 
bitcastToAPInt().zextOrTrunc(32), MVT::i32); - return DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), + return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(), isVolatile, isNonTemporal, Alignment); } if (CFP->getValueType(0) == MVT::f64) { // If this target supports 64-bit registers, do a single 64-bit store. if (TLI.isTypeLegal(MVT::i64)) { - Tmp3 = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). + SDValue Con = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). zextOrTrunc(64), MVT::i64); - return DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), + return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(), isVolatile, isNonTemporal, Alignment); } @@ -673,11 +676,11 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { SDValue Hi = DAG.getConstant(IntVal.lshr(32).trunc(32), MVT::i32); if (TLI.isBigEndian()) std::swap(Lo, Hi); - Lo = DAG.getStore(Tmp1, dl, Lo, Tmp2, ST->getPointerInfo(), isVolatile, + Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(), isVolatile, isNonTemporal, Alignment); - Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getIntPtrConstant(4)); - Hi = DAG.getStore(Tmp1, dl, Hi, Tmp2, + Hi = DAG.getStore(Chain, dl, Hi, Ptr, ST->getPointerInfo().getWithOffset(4), isVolatile, isNonTemporal, MinAlign(Alignment, 4U)); @@ -688,14 +691,448 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { return SDValue(0, 0); } +void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { + StoreSDNode *ST = cast<StoreSDNode>(Node); + SDValue Chain = ST->getChain(); + SDValue Ptr = ST->getBasePtr(); + DebugLoc dl = Node->getDebugLoc(); + + unsigned Alignment = ST->getAlignment(); + bool isVolatile = ST->isVolatile(); + bool isNonTemporal = ST->isNonTemporal(); + + if (!ST->isTruncatingStore()) { + if (SDNode *OptStore = OptimizeFloatStore(ST).getNode()) { + ReplaceNode(ST, OptStore); + return; + } + + { + SDValue Value = ST->getValue(); + EVT VT = Value.getValueType(); + switch (TLI.getOperationAction(ISD::STORE, VT)) { + default: llvm_unreachable("This action is not supported yet!"); + case TargetLowering::Legal: + // If this is an unaligned store and the target doesn't support it, + // expand it. + if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) { + Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty); + if (ST->getAlignment() < ABIAlignment) + ExpandUnalignedStore(cast<StoreSDNode>(Node), + DAG, TLI, this); + } + break; + case TargetLowering::Custom: { + SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG); + if (Res.getNode()) + ReplaceNode(SDValue(Node, 0), Res); + return; + } + case TargetLowering::Promote: { + assert(VT.isVector() && "Unknown legal promote case!"); + Value = DAG.getNode(ISD::BITCAST, dl, + TLI.getTypeToPromoteTo(ISD::STORE, VT), Value); + SDValue Result = + DAG.getStore(Chain, dl, Value, Ptr, + ST->getPointerInfo(), isVolatile, + isNonTemporal, Alignment); + ReplaceNode(SDValue(Node, 0), Result); + break; + } + } + return; + } + } else { + SDValue Value = ST->getValue(); + + EVT StVT = ST->getMemoryVT(); + unsigned StWidth = StVT.getSizeInBits(); + + if (StWidth != StVT.getStoreSizeInBits()) { + // Promote to a byte-sized store with upper bits zero if not + // storing an integral number of bytes. 
For example, promote + // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1) + EVT NVT = EVT::getIntegerVT(*DAG.getContext(), + StVT.getStoreSizeInBits()); + Value = DAG.getZeroExtendInReg(Value, dl, StVT); + SDValue Result = + DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), + NVT, isVolatile, isNonTemporal, Alignment); + ReplaceNode(SDValue(Node, 0), Result); + } else if (StWidth & (StWidth - 1)) { + // If not storing a power-of-2 number of bits, expand as two stores. + assert(!StVT.isVector() && "Unsupported truncstore!"); + unsigned RoundWidth = 1 << Log2_32(StWidth); + assert(RoundWidth < StWidth); + unsigned ExtraWidth = StWidth - RoundWidth; + assert(ExtraWidth < RoundWidth); + assert(!(RoundWidth % 8) && !(ExtraWidth % 8) && + "Store size not an integral number of bytes!"); + EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth); + EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth); + SDValue Lo, Hi; + unsigned IncrementSize; + + if (TLI.isLittleEndian()) { + // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 X, TRUNCSTORE@+2:i8 (srl X, 16) + // Store the bottom RoundWidth bits. + Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), + RoundVT, + isVolatile, isNonTemporal, Alignment); + + // Store the remaining ExtraWidth bits. + IncrementSize = RoundWidth / 8; + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, + DAG.getIntPtrConstant(IncrementSize)); + Hi = DAG.getNode(ISD::SRL, dl, Value.getValueType(), Value, + DAG.getConstant(RoundWidth, + TLI.getShiftAmountTy(Value.getValueType()))); + Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, + ST->getPointerInfo().getWithOffset(IncrementSize), + ExtraVT, isVolatile, isNonTemporal, + MinAlign(Alignment, IncrementSize)); + } else { + // Big endian - avoid unaligned stores. + // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X + // Store the top RoundWidth bits. + Hi = DAG.getNode(ISD::SRL, dl, Value.getValueType(), Value, + DAG.getConstant(ExtraWidth, + TLI.getShiftAmountTy(Value.getValueType()))); + Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo(), + RoundVT, isVolatile, isNonTemporal, Alignment); + + // Store the remaining ExtraWidth bits. + IncrementSize = RoundWidth / 8; + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, + DAG.getIntPtrConstant(IncrementSize)); + Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, + ST->getPointerInfo().getWithOffset(IncrementSize), + ExtraVT, isVolatile, isNonTemporal, + MinAlign(Alignment, IncrementSize)); + } + + // The order of the stores doesn't matter. + SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); + ReplaceNode(SDValue(Node, 0), Result); + } else { + switch (TLI.getTruncStoreAction(ST->getValue().getValueType(), StVT)) { + default: llvm_unreachable("This action is not supported yet!"); + case TargetLowering::Legal: + // If this is an unaligned store and the target doesn't support it, + // expand it. 
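The guard that follows recurs throughout the store and load paths: expansion is needed only when the target rejects unaligned memory accesses and the access's known alignment is below the type's ABI alignment. As a standalone predicate, where the inputs stand in for the TLI and TargetData queries:

#include <cstdio>

static bool needsExpansion(bool targetAllowsUnaligned,
                           unsigned accessAlign, unsigned abiAlign) {
  if (targetAllowsUnaligned)
    return false;                 // hardware copes; leave the access alone
  return accessAlign < abiAlign;  // else split into smaller aligned ops
}

int main() {
  // e.g. an i32 with 4-byte ABI alignment, known only 2-byte aligned,
  // on a strict-alignment target:
  std::printf("%d\n", needsExpansion(false, 2, 4)); // 1: expand
  std::printf("%d\n", needsExpansion(true, 2, 4));  // 0: legal as-is
}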
+ if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) { + Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty); + if (ST->getAlignment() < ABIAlignment) + ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this); + } + break; + case TargetLowering::Custom: { + SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG); + if (Res.getNode()) + ReplaceNode(SDValue(Node, 0), Res); + return; + } + case TargetLowering::Expand: + assert(!StVT.isVector() && + "Vector Stores are handled in LegalizeVectorOps"); + + // TRUNCSTORE:i16 i32 -> STORE i16 + assert(TLI.isTypeLegal(StVT) && + "Do not know how to expand this store!"); + Value = DAG.getNode(ISD::TRUNCATE, dl, StVT, Value); + SDValue Result = + DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), + isVolatile, isNonTemporal, Alignment); + ReplaceNode(SDValue(Node, 0), Result); + break; + } + } + } +} + +void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { + LoadSDNode *LD = cast<LoadSDNode>(Node); + SDValue Chain = LD->getChain(); // The chain. + SDValue Ptr = LD->getBasePtr(); // The base pointer. + SDValue Value; // The value returned by the load op. + DebugLoc dl = Node->getDebugLoc(); + + ISD::LoadExtType ExtType = LD->getExtensionType(); + if (ExtType == ISD::NON_EXTLOAD) { + EVT VT = Node->getValueType(0); + SDValue RVal = SDValue(Node, 0); + SDValue RChain = SDValue(Node, 1); + + switch (TLI.getOperationAction(Node->getOpcode(), VT)) { + default: llvm_unreachable("This action is not supported yet!"); + case TargetLowering::Legal: + // If this is an unaligned load and the target doesn't support it, + // expand it. + if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) { + Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment = + TLI.getTargetData()->getABITypeAlignment(Ty); + if (LD->getAlignment() < ABIAlignment){ + ExpandUnalignedLoad(cast<LoadSDNode>(Node), + DAG, TLI, RVal, RChain); + } + } + break; + case TargetLowering::Custom: { + SDValue Res = TLI.LowerOperation(RVal, DAG); + if (Res.getNode()) { + RVal = Res; + RChain = Res.getValue(1); + } + break; + } + case TargetLowering::Promote: { + // Only promote a load of vector type to another. + assert(VT.isVector() && "Cannot promote this load!"); + // Change base type to a different vector type. + EVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT); + + SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(), + LD->isVolatile(), LD->isNonTemporal(), + LD->isInvariant(), LD->getAlignment()); + RVal = DAG.getNode(ISD::BITCAST, dl, VT, Res); + RChain = Res.getValue(1); + break; + } + } + if (RChain.getNode() != Node) { + assert(RVal.getNode() != Node && "Load must be completely replaced"); + DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), RVal); + DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), RChain); + ReplacedNode(Node); + } + return; + } + + EVT SrcVT = LD->getMemoryVT(); + unsigned SrcWidth = SrcVT.getSizeInBits(); + unsigned Alignment = LD->getAlignment(); + bool isVolatile = LD->isVolatile(); + bool isNonTemporal = LD->isNonTemporal(); + + if (SrcWidth != SrcVT.getStoreSizeInBits() && + // Some targets pretend to have an i1 loading operation, and actually + // load an i8. This trick is correct for ZEXTLOAD because the top 7 + // bits are guaranteed to be zero; it helps the optimizers understand + // that these bits are zero. It is also useful for EXTLOAD, since it + // tells the optimizers that those bits are undefined. 
It would be + // nice to have an effective generic way of getting these benefits... + // Until such a way is found, don't insist on promoting i1 here. + (SrcVT != MVT::i1 || + TLI.getLoadExtAction(ExtType, MVT::i1) == TargetLowering::Promote)) { + // Promote to a byte-sized load if not loading an integral number of + // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24. + unsigned NewWidth = SrcVT.getStoreSizeInBits(); + EVT NVT = EVT::getIntegerVT(*DAG.getContext(), NewWidth); + SDValue Ch; + + // The extra bits are guaranteed to be zero, since we stored them that + // way. A zext load from NVT thus automatically gives zext from SrcVT. + + ISD::LoadExtType NewExtType = + ExtType == ISD::ZEXTLOAD ? ISD::ZEXTLOAD : ISD::EXTLOAD; + + SDValue Result = + DAG.getExtLoad(NewExtType, dl, Node->getValueType(0), + Chain, Ptr, LD->getPointerInfo(), + NVT, isVolatile, isNonTemporal, Alignment); + + Ch = Result.getValue(1); // The chain. + + if (ExtType == ISD::SEXTLOAD) + // Having the top bits zero doesn't help when sign extending. + Result = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, + Result.getValueType(), + Result, DAG.getValueType(SrcVT)); + else if (ExtType == ISD::ZEXTLOAD || NVT == Result.getValueType()) + // All the top bits are guaranteed to be zero - inform the optimizers. + Result = DAG.getNode(ISD::AssertZext, dl, + Result.getValueType(), Result, + DAG.getValueType(SrcVT)); + + Value = Result; + Chain = Ch; + } else if (SrcWidth & (SrcWidth - 1)) { + // If not loading a power-of-2 number of bits, expand as two loads. + assert(!SrcVT.isVector() && "Unsupported extload!"); + unsigned RoundWidth = 1 << Log2_32(SrcWidth); + assert(RoundWidth < SrcWidth); + unsigned ExtraWidth = SrcWidth - RoundWidth; + assert(ExtraWidth < RoundWidth); + assert(!(RoundWidth % 8) && !(ExtraWidth % 8) && + "Load size not an integral number of bytes!"); + EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth); + EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth); + SDValue Lo, Hi, Ch; + unsigned IncrementSize; + + if (TLI.isLittleEndian()) { + // EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16) + // Load the bottom RoundWidth bits. + Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), + Chain, Ptr, + LD->getPointerInfo(), RoundVT, isVolatile, + isNonTemporal, Alignment); + + // Load the remaining ExtraWidth bits. + IncrementSize = RoundWidth / 8; + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, + DAG.getIntPtrConstant(IncrementSize)); + Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr, + LD->getPointerInfo().getWithOffset(IncrementSize), + ExtraVT, isVolatile, isNonTemporal, + MinAlign(Alignment, IncrementSize)); + + // Build a factor node to remember that this load is independent of + // the other one. + Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), + Hi.getValue(1)); + + // Move the top bits to the right place. + Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi, + DAG.getConstant(RoundWidth, + TLI.getShiftAmountTy(Hi.getValueType()))); + + // Join the hi and lo parts. + Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi); + } else { + // Big endian - avoid unaligned loads. + // EXTLOAD:i24 -> (shl EXTLOAD:i16, 8) | ZEXTLOAD@+2:i8 + // Load the top RoundWidth bits. + Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr, + LD->getPointerInfo(), RoundVT, isVolatile, + isNonTemporal, Alignment); + + // Load the remaining ExtraWidth bits. 
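The little-endian branch above is the classic i24 split: a zero-extending i16 load of the low bits plus an i8 extload at offset 2, shifted into place and OR'd together. A host-side demo of the same arithmetic (assumes a little-endian host, mirroring the isLittleEndian() path):

#include <cstdint>
#include <cstdio>
#include <cstring>

// EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16)
static uint32_t loadI24LE(const uint8_t *p) {
  uint16_t lo; uint8_t hi;
  std::memcpy(&lo, p, 2);       // RoundWidth = 16: bottom bits, zext
  std::memcpy(&hi, p + 2, 1);   // ExtraWidth = 8, at offset RoundWidth/8
  return (uint32_t)lo | ((uint32_t)hi << 16); // join the hi and lo parts
}

int main() {
  const uint8_t bytes[3] = {0x56, 0x34, 0x12}; // i24 value 0x123456, LE
  std::printf("%#x\n", loadI24LE(bytes));      // prints 0x123456
}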
+ IncrementSize = RoundWidth / 8; + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, + DAG.getIntPtrConstant(IncrementSize)); + Lo = DAG.getExtLoad(ISD::ZEXTLOAD, + dl, Node->getValueType(0), Chain, Ptr, + LD->getPointerInfo().getWithOffset(IncrementSize), + ExtraVT, isVolatile, isNonTemporal, + MinAlign(Alignment, IncrementSize)); + + // Build a factor node to remember that this load is independent of + // the other one. + Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), + Hi.getValue(1)); + + // Move the top bits to the right place. + Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi, + DAG.getConstant(ExtraWidth, + TLI.getShiftAmountTy(Hi.getValueType()))); + + // Join the hi and lo parts. + Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi); + } + + Chain = Ch; + } else { + bool isCustom = false; + switch (TLI.getLoadExtAction(ExtType, SrcVT)) { + default: llvm_unreachable("This action is not supported yet!"); + case TargetLowering::Custom: + isCustom = true; + // FALLTHROUGH + case TargetLowering::Legal: { + Value = SDValue(Node, 0); + Chain = SDValue(Node, 1); + + if (isCustom) { + SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG); + if (Res.getNode()) { + Value = Res; + Chain = Res.getValue(1); + } + } else { + // If this is an unaligned load and the target doesn't support it, + // expand it. + if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) { + Type *Ty = + LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment = + TLI.getTargetData()->getABITypeAlignment(Ty); + if (LD->getAlignment() < ABIAlignment){ + ExpandUnalignedLoad(cast<LoadSDNode>(Node), + DAG, TLI, Value, Chain); + } + } + } + break; + } + case TargetLowering::Expand: + if (!TLI.isLoadExtLegal(ISD::EXTLOAD, SrcVT) && TLI.isTypeLegal(SrcVT)) { + SDValue Load = DAG.getLoad(SrcVT, dl, Chain, Ptr, + LD->getPointerInfo(), + LD->isVolatile(), LD->isNonTemporal(), + LD->isInvariant(), LD->getAlignment()); + unsigned ExtendOp; + switch (ExtType) { + case ISD::EXTLOAD: + ExtendOp = (SrcVT.isFloatingPoint() ? + ISD::FP_EXTEND : ISD::ANY_EXTEND); + break; + case ISD::SEXTLOAD: ExtendOp = ISD::SIGN_EXTEND; break; + case ISD::ZEXTLOAD: ExtendOp = ISD::ZERO_EXTEND; break; + default: llvm_unreachable("Unexpected extend load type!"); + } + Value = DAG.getNode(ExtendOp, dl, Node->getValueType(0), Load); + Chain = Load.getValue(1); + break; + } + + assert(!SrcVT.isVector() && + "Vector Loads are handled in LegalizeVectorOps"); + + // FIXME: This does not work for vectors on most targets. Sign- and + // zero-extend operations are currently folded into extending loads, + // whether they are legal or not, and then we end up here without any + // support for legalizing them. + assert(ExtType != ISD::EXTLOAD && + "EXTLOAD should always be supported!"); + // Turn the unsupported load into an EXTLOAD followed by an explicit + // zero/sign extend inreg. + SDValue Result = DAG.getExtLoad(ISD::EXTLOAD, dl, Node->getValueType(0), + Chain, Ptr, LD->getPointerInfo(), SrcVT, + LD->isVolatile(), LD->isNonTemporal(), + LD->getAlignment()); + SDValue ValRes; + if (ExtType == ISD::SEXTLOAD) + ValRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, + Result.getValueType(), + Result, DAG.getValueType(SrcVT)); + else + ValRes = DAG.getZeroExtendInReg(Result, dl, SrcVT.getScalarType()); + Value = ValRes; + Chain = Result.getValue(1); + break; + } + } + + // Since loads produce two values, make sure to remember that we legalized + // both of them. 
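The bookkeeping note above matters because a load node stays alive while either of its results has users; the replacement code that follows must rewire the chain as well as the value before the old node can go away. A toy illustration with a hypothetical two-result node:

#include <cassert>

struct TwoResultNode {
  int uses[2] = {0, 0};  // [0] = value users, [1] = chain users
  bool dead() const { return !uses[0] && !uses[1]; }
};

int main() {
  TwoResultNode Old, New;
  Old.uses[0] = 3;                            // users of the loaded value
  Old.uses[1] = 1;                            // users of the chain
  New.uses[0] += Old.uses[0]; Old.uses[0] = 0; // replace value result only
  assert(!Old.dead());                        // chain users keep Old alive
  New.uses[1] += Old.uses[1]; Old.uses[1] = 0; // chain must move too
  assert(Old.dead());                         // now safe to delete
}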
+ if (Chain.getNode() != Node) { + assert(Value.getNode() != Node && "Load must be completely replaced"); + DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Value); + DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Chain); + ReplacedNode(Node); + } +} + /// LegalizeOp - Return a legal replacement for the given operation, with /// all legal operands. void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { if (Node->getOpcode() == ISD::TargetConstant) // Allow illegal target nodes. return; - DebugLoc dl = Node->getDebugLoc(); - for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) assert(TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) == TargetLowering::TypeLegal && @@ -708,9 +1145,6 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { Node->getOperand(i).getOpcode() == ISD::TargetConstant) && "Unexpected illegal type!"); - SDValue Tmp1, Tmp2, Tmp3, Tmp4; - bool isCustom = false; - // Figure out the correct action; the way to query this varies by opcode TargetLowering::LegalizeAction Action = TargetLowering::Legal; bool SimpleFinishLegalizing = true; @@ -816,9 +1250,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { } if (SimpleFinishLegalizing) { - SmallVector<SDValue, 8> Ops; - for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) - Ops.push_back(Node->getOperand(i)); + SDNode *NewNode = Node; switch (Node->getOpcode()) { default: break; case ISD::SHL: @@ -828,11 +1260,14 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { case ISD::ROTR: // Legalizing shifts/rotates requires adjusting the shift amount // to the appropriate width. - if (!Ops[1].getValueType().isVector()) { - SDValue SAO = DAG.getShiftAmountOperand(Ops[0].getValueType(), Ops[1]); + if (!Node->getOperand(1).getValueType().isVector()) { + SDValue SAO = + DAG.getShiftAmountOperand(Node->getOperand(0).getValueType(), + Node->getOperand(1)); HandleSDNode Handle(SAO); LegalizeOp(SAO.getNode()); - Ops[1] = Handle.getValue(); + NewNode = DAG.UpdateNodeOperands(Node, Node->getOperand(0), + Handle.getValue()); } break; case ISD::SRL_PARTS: @@ -840,18 +1275,21 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { case ISD::SHL_PARTS: // Legalizing shifts/rotates requires adjusting the shift amount // to the appropriate width. - if (!Ops[2].getValueType().isVector()) { - SDValue SAO = DAG.getShiftAmountOperand(Ops[0].getValueType(), Ops[2]); + if (!Node->getOperand(2).getValueType().isVector()) { + SDValue SAO = + DAG.getShiftAmountOperand(Node->getOperand(0).getValueType(), + Node->getOperand(2)); HandleSDNode Handle(SAO); LegalizeOp(SAO.getNode()); - Ops[2] = Handle.getValue(); + NewNode = DAG.UpdateNodeOperands(Node, Node->getOperand(0), + Node->getOperand(1), + Handle.getValue()); } break; } - SDNode *NewNode = DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size()); if (NewNode != Node) { - DAG.ReplaceAllUsesWith(Node, NewNode, this); + DAG.ReplaceAllUsesWith(Node, NewNode); for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) DAG.TransferDbgValues(SDValue(Node, i), SDValue(NewNode, i)); ReplacedNode(Node); @@ -860,27 +1298,27 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { switch (Action) { case TargetLowering::Legal: return; - case TargetLowering::Custom: + case TargetLowering::Custom: { // FIXME: The handling for custom lowering with multiple results is // a complete mess. 
- Tmp1 = TLI.LowerOperation(SDValue(Node, 0), DAG); - if (Tmp1.getNode()) { + SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG); + if (Res.getNode()) { SmallVector<SDValue, 8> ResultVals; for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) { if (e == 1) - ResultVals.push_back(Tmp1); + ResultVals.push_back(Res); else - ResultVals.push_back(Tmp1.getValue(i)); + ResultVals.push_back(Res.getValue(i)); } - if (Tmp1.getNode() != Node || Tmp1.getResNo() != 0) { - DAG.ReplaceAllUsesWith(Node, ResultVals.data(), this); + if (Res.getNode() != Node || Res.getResNo() != 0) { + DAG.ReplaceAllUsesWith(Node, ResultVals.data()); for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) DAG.TransferDbgValues(SDValue(Node, i), ResultVals[i]); ReplacedNode(Node); } return; } - + } // FALL THROUGH case TargetLowering::Expand: ExpandNode(Node); @@ -904,428 +1342,10 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { case ISD::CALLSEQ_END: break; case ISD::LOAD: { - LoadSDNode *LD = cast<LoadSDNode>(Node); - Tmp1 = LD->getChain(); // Legalize the chain. - Tmp2 = LD->getBasePtr(); // Legalize the base pointer. - - ISD::LoadExtType ExtType = LD->getExtensionType(); - if (ExtType == ISD::NON_EXTLOAD) { - EVT VT = Node->getValueType(0); - Tmp3 = SDValue(Node, 0); - Tmp4 = SDValue(Node, 1); - - switch (TLI.getOperationAction(Node->getOpcode(), VT)) { - default: llvm_unreachable("This action is not supported yet!"); - case TargetLowering::Legal: - // If this is an unaligned load and the target doesn't support it, - // expand it. - if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) { - Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = TLI.getTargetData()->getABITypeAlignment(Ty); - if (LD->getAlignment() < ABIAlignment){ - ExpandUnalignedLoad(cast<LoadSDNode>(Node), - DAG, TLI, Tmp3, Tmp4); - } - } - break; - case TargetLowering::Custom: - Tmp1 = TLI.LowerOperation(Tmp3, DAG); - if (Tmp1.getNode()) { - Tmp3 = Tmp1; - Tmp4 = Tmp1.getValue(1); - } - break; - case TargetLowering::Promote: { - // Only promote a load of vector type to another. - assert(VT.isVector() && "Cannot promote this load!"); - // Change base type to a different vector type. - EVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT); - - Tmp1 = DAG.getLoad(NVT, dl, Tmp1, Tmp2, LD->getPointerInfo(), - LD->isVolatile(), LD->isNonTemporal(), - LD->isInvariant(), LD->getAlignment()); - Tmp3 = DAG.getNode(ISD::BITCAST, dl, VT, Tmp1); - Tmp4 = Tmp1.getValue(1); - break; - } - } - if (Tmp4.getNode() != Node) { - assert(Tmp3.getNode() != Node && "Load must be completely replaced"); - DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Tmp3); - DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Tmp4); - ReplacedNode(Node); - } - return; - } - - EVT SrcVT = LD->getMemoryVT(); - unsigned SrcWidth = SrcVT.getSizeInBits(); - unsigned Alignment = LD->getAlignment(); - bool isVolatile = LD->isVolatile(); - bool isNonTemporal = LD->isNonTemporal(); - - if (SrcWidth != SrcVT.getStoreSizeInBits() && - // Some targets pretend to have an i1 loading operation, and actually - // load an i8. This trick is correct for ZEXTLOAD because the top 7 - // bits are guaranteed to be zero; it helps the optimizers understand - // that these bits are zero. It is also useful for EXTLOAD, since it - // tells the optimizers that those bits are undefined. It would be - // nice to have an effective generic way of getting these benefits... - // Until such a way is found, don't insist on promoting i1 here. 
- (SrcVT != MVT::i1 || - TLI.getLoadExtAction(ExtType, MVT::i1) == TargetLowering::Promote)) { - // Promote to a byte-sized load if not loading an integral number of - // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24. - unsigned NewWidth = SrcVT.getStoreSizeInBits(); - EVT NVT = EVT::getIntegerVT(*DAG.getContext(), NewWidth); - SDValue Ch; - - // The extra bits are guaranteed to be zero, since we stored them that - // way. A zext load from NVT thus automatically gives zext from SrcVT. - - ISD::LoadExtType NewExtType = - ExtType == ISD::ZEXTLOAD ? ISD::ZEXTLOAD : ISD::EXTLOAD; - - SDValue Result = - DAG.getExtLoad(NewExtType, dl, Node->getValueType(0), - Tmp1, Tmp2, LD->getPointerInfo(), - NVT, isVolatile, isNonTemporal, Alignment); - - Ch = Result.getValue(1); // The chain. - - if (ExtType == ISD::SEXTLOAD) - // Having the top bits zero doesn't help when sign extending. - Result = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, - Result.getValueType(), - Result, DAG.getValueType(SrcVT)); - else if (ExtType == ISD::ZEXTLOAD || NVT == Result.getValueType()) - // All the top bits are guaranteed to be zero - inform the optimizers. - Result = DAG.getNode(ISD::AssertZext, dl, - Result.getValueType(), Result, - DAG.getValueType(SrcVT)); - - Tmp1 = Result; - Tmp2 = Ch; - } else if (SrcWidth & (SrcWidth - 1)) { - // If not loading a power-of-2 number of bits, expand as two loads. - assert(!SrcVT.isVector() && "Unsupported extload!"); - unsigned RoundWidth = 1 << Log2_32(SrcWidth); - assert(RoundWidth < SrcWidth); - unsigned ExtraWidth = SrcWidth - RoundWidth; - assert(ExtraWidth < RoundWidth); - assert(!(RoundWidth % 8) && !(ExtraWidth % 8) && - "Load size not an integral number of bytes!"); - EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth); - EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth); - SDValue Lo, Hi, Ch; - unsigned IncrementSize; - - if (TLI.isLittleEndian()) { - // EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16) - // Load the bottom RoundWidth bits. - Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), - Tmp1, Tmp2, - LD->getPointerInfo(), RoundVT, isVolatile, - isNonTemporal, Alignment); - - // Load the remaining ExtraWidth bits. - IncrementSize = RoundWidth / 8; - Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, - DAG.getIntPtrConstant(IncrementSize)); - Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Tmp1, Tmp2, - LD->getPointerInfo().getWithOffset(IncrementSize), - ExtraVT, isVolatile, isNonTemporal, - MinAlign(Alignment, IncrementSize)); - - // Build a factor node to remember that this load is independent of - // the other one. - Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), - Hi.getValue(1)); - - // Move the top bits to the right place. - Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi, - DAG.getConstant(RoundWidth, - TLI.getShiftAmountTy(Hi.getValueType()))); - - // Join the hi and lo parts. - Tmp1 = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi); - } else { - // Big endian - avoid unaligned loads. - // EXTLOAD:i24 -> (shl EXTLOAD:i16, 8) | ZEXTLOAD@+2:i8 - // Load the top RoundWidth bits. - Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Tmp1, Tmp2, - LD->getPointerInfo(), RoundVT, isVolatile, - isNonTemporal, Alignment); - - // Load the remaining ExtraWidth bits. 
- IncrementSize = RoundWidth / 8; - Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, - DAG.getIntPtrConstant(IncrementSize)); - Lo = DAG.getExtLoad(ISD::ZEXTLOAD, - dl, Node->getValueType(0), Tmp1, Tmp2, - LD->getPointerInfo().getWithOffset(IncrementSize), - ExtraVT, isVolatile, isNonTemporal, - MinAlign(Alignment, IncrementSize)); - - // Build a factor node to remember that this load is independent of - // the other one. - Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), - Hi.getValue(1)); - - // Move the top bits to the right place. - Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi, - DAG.getConstant(ExtraWidth, - TLI.getShiftAmountTy(Hi.getValueType()))); - - // Join the hi and lo parts. - Tmp1 = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi); - } - - Tmp2 = Ch; - } else { - switch (TLI.getLoadExtAction(ExtType, SrcVT)) { - default: llvm_unreachable("This action is not supported yet!"); - case TargetLowering::Custom: - isCustom = true; - // FALLTHROUGH - case TargetLowering::Legal: - Tmp1 = SDValue(Node, 0); - Tmp2 = SDValue(Node, 1); - - if (isCustom) { - Tmp3 = TLI.LowerOperation(SDValue(Node, 0), DAG); - if (Tmp3.getNode()) { - Tmp1 = Tmp3; - Tmp2 = Tmp3.getValue(1); - } - } else { - // If this is an unaligned load and the target doesn't support it, - // expand it. - if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) { - Type *Ty = - LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = - TLI.getTargetData()->getABITypeAlignment(Ty); - if (LD->getAlignment() < ABIAlignment){ - ExpandUnalignedLoad(cast<LoadSDNode>(Node), - DAG, TLI, Tmp1, Tmp2); - } - } - } - break; - case TargetLowering::Expand: - if (!TLI.isLoadExtLegal(ISD::EXTLOAD, SrcVT) && TLI.isTypeLegal(SrcVT)) { - SDValue Load = DAG.getLoad(SrcVT, dl, Tmp1, Tmp2, - LD->getPointerInfo(), - LD->isVolatile(), LD->isNonTemporal(), - LD->isInvariant(), LD->getAlignment()); - unsigned ExtendOp; - switch (ExtType) { - case ISD::EXTLOAD: - ExtendOp = (SrcVT.isFloatingPoint() ? - ISD::FP_EXTEND : ISD::ANY_EXTEND); - break; - case ISD::SEXTLOAD: ExtendOp = ISD::SIGN_EXTEND; break; - case ISD::ZEXTLOAD: ExtendOp = ISD::ZERO_EXTEND; break; - default: llvm_unreachable("Unexpected extend load type!"); - } - Tmp1 = DAG.getNode(ExtendOp, dl, Node->getValueType(0), Load); - Tmp2 = Load.getValue(1); - break; - } - - assert(!SrcVT.isVector() && - "Vector Loads are handled in LegalizeVectorOps"); - - // FIXME: This does not work for vectors on most targets. Sign- and - // zero-extend operations are currently folded into extending loads, - // whether they are legal or not, and then we end up here without any - // support for legalizing them. - assert(ExtType != ISD::EXTLOAD && - "EXTLOAD should always be supported!"); - // Turn the unsupported load into an EXTLOAD followed by an explicit - // zero/sign extend inreg. - SDValue Result = DAG.getExtLoad(ISD::EXTLOAD, dl, Node->getValueType(0), - Tmp1, Tmp2, LD->getPointerInfo(), SrcVT, - LD->isVolatile(), LD->isNonTemporal(), - LD->getAlignment()); - SDValue ValRes; - if (ExtType == ISD::SEXTLOAD) - ValRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, - Result.getValueType(), - Result, DAG.getValueType(SrcVT)); - else - ValRes = DAG.getZeroExtendInReg(Result, dl, SrcVT.getScalarType()); - Tmp1 = ValRes; - Tmp2 = Result.getValue(1); - break; - } - } - - // Since loads produce two values, make sure to remember that we legalized - // both of them. 
- if (Tmp2.getNode() != Node) { - assert(Tmp1.getNode() != Node && "Load must be completely replaced"); - DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Tmp1); - DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Tmp2); - ReplacedNode(Node); - } - break; + return LegalizeLoadOps(Node); } case ISD::STORE: { - StoreSDNode *ST = cast<StoreSDNode>(Node); - Tmp1 = ST->getChain(); - Tmp2 = ST->getBasePtr(); - unsigned Alignment = ST->getAlignment(); - bool isVolatile = ST->isVolatile(); - bool isNonTemporal = ST->isNonTemporal(); - - if (!ST->isTruncatingStore()) { - if (SDNode *OptStore = OptimizeFloatStore(ST).getNode()) { - ReplaceNode(ST, OptStore); - break; - } - - { - Tmp3 = ST->getValue(); - EVT VT = Tmp3.getValueType(); - switch (TLI.getOperationAction(ISD::STORE, VT)) { - default: llvm_unreachable("This action is not supported yet!"); - case TargetLowering::Legal: - // If this is an unaligned store and the target doesn't support it, - // expand it. - if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) { - Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty); - if (ST->getAlignment() < ABIAlignment) - ExpandUnalignedStore(cast<StoreSDNode>(Node), - DAG, TLI, this); - } - break; - case TargetLowering::Custom: - Tmp1 = TLI.LowerOperation(SDValue(Node, 0), DAG); - if (Tmp1.getNode()) - ReplaceNode(SDValue(Node, 0), Tmp1); - break; - case TargetLowering::Promote: { - assert(VT.isVector() && "Unknown legal promote case!"); - Tmp3 = DAG.getNode(ISD::BITCAST, dl, - TLI.getTypeToPromoteTo(ISD::STORE, VT), Tmp3); - SDValue Result = - DAG.getStore(Tmp1, dl, Tmp3, Tmp2, - ST->getPointerInfo(), isVolatile, - isNonTemporal, Alignment); - ReplaceNode(SDValue(Node, 0), Result); - break; - } - } - break; - } - } else { - Tmp3 = ST->getValue(); - - EVT StVT = ST->getMemoryVT(); - unsigned StWidth = StVT.getSizeInBits(); - - if (StWidth != StVT.getStoreSizeInBits()) { - // Promote to a byte-sized store with upper bits zero if not - // storing an integral number of bytes. For example, promote - // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1) - EVT NVT = EVT::getIntegerVT(*DAG.getContext(), - StVT.getStoreSizeInBits()); - Tmp3 = DAG.getZeroExtendInReg(Tmp3, dl, StVT); - SDValue Result = - DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), - NVT, isVolatile, isNonTemporal, Alignment); - ReplaceNode(SDValue(Node, 0), Result); - } else if (StWidth & (StWidth - 1)) { - // If not storing a power-of-2 number of bits, expand as two stores. - assert(!StVT.isVector() && "Unsupported truncstore!"); - unsigned RoundWidth = 1 << Log2_32(StWidth); - assert(RoundWidth < StWidth); - unsigned ExtraWidth = StWidth - RoundWidth; - assert(ExtraWidth < RoundWidth); - assert(!(RoundWidth % 8) && !(ExtraWidth % 8) && - "Store size not an integral number of bytes!"); - EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth); - EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth); - SDValue Lo, Hi; - unsigned IncrementSize; - - if (TLI.isLittleEndian()) { - // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 X, TRUNCSTORE@+2:i8 (srl X, 16) - // Store the bottom RoundWidth bits. - Lo = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), - RoundVT, - isVolatile, isNonTemporal, Alignment); - - // Store the remaining ExtraWidth bits. 
- IncrementSize = RoundWidth / 8; - Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, - DAG.getIntPtrConstant(IncrementSize)); - Hi = DAG.getNode(ISD::SRL, dl, Tmp3.getValueType(), Tmp3, - DAG.getConstant(RoundWidth, - TLI.getShiftAmountTy(Tmp3.getValueType()))); - Hi = DAG.getTruncStore(Tmp1, dl, Hi, Tmp2, - ST->getPointerInfo().getWithOffset(IncrementSize), - ExtraVT, isVolatile, isNonTemporal, - MinAlign(Alignment, IncrementSize)); - } else { - // Big endian - avoid unaligned stores. - // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X - // Store the top RoundWidth bits. - Hi = DAG.getNode(ISD::SRL, dl, Tmp3.getValueType(), Tmp3, - DAG.getConstant(ExtraWidth, - TLI.getShiftAmountTy(Tmp3.getValueType()))); - Hi = DAG.getTruncStore(Tmp1, dl, Hi, Tmp2, ST->getPointerInfo(), - RoundVT, isVolatile, isNonTemporal, Alignment); - - // Store the remaining ExtraWidth bits. - IncrementSize = RoundWidth / 8; - Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, - DAG.getIntPtrConstant(IncrementSize)); - Lo = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, - ST->getPointerInfo().getWithOffset(IncrementSize), - ExtraVT, isVolatile, isNonTemporal, - MinAlign(Alignment, IncrementSize)); - } - - // The order of the stores doesn't matter. - SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); - ReplaceNode(SDValue(Node, 0), Result); - } else { - switch (TLI.getTruncStoreAction(ST->getValue().getValueType(), StVT)) { - default: llvm_unreachable("This action is not supported yet!"); - case TargetLowering::Legal: - // If this is an unaligned store and the target doesn't support it, - // expand it. - if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) { - Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty); - if (ST->getAlignment() < ABIAlignment) - ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this); - } - break; - case TargetLowering::Custom: - ReplaceNode(SDValue(Node, 0), - TLI.LowerOperation(SDValue(Node, 0), DAG)); - break; - case TargetLowering::Expand: - assert(!StVT.isVector() && - "Vector Stores are handled in LegalizeVectorOps"); - - // TRUNCSTORE:i16 i32 -> STORE i16 - assert(TLI.isTypeLegal(StVT) && "Do not know how to expand this store!"); - Tmp3 = DAG.getNode(ISD::TRUNCATE, dl, StVT, Tmp3); - SDValue Result = - DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), - isVolatile, isNonTemporal, Alignment); - ReplaceNode(SDValue(Node, 0), Result); - break; - } - } - } - break; + return LegalizeStoreOps(Node); } } } @@ -1795,11 +1815,13 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, if (isTailCall) InChain = TCChain; - std::pair<SDValue, SDValue> CallInfo = - TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false, + TargetLowering:: + CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false, 0, TLI.getLibcallCallingConv(LC), isTailCall, /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, Callee, Args, DAG, Node->getDebugLoc()); + std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); + if (!CallInfo.second.getNode()) // It's a tailcall, return the chain (which is the DAG root). 
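The truncating-store and extending-load expansions moved above into LegalizeStoreOps/LegalizeLoadOps split a non-power-of-two-width access into a power-of-two part plus a remainder, exactly as their comments describe (EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16)). A standalone C++ sketch of the little-endian load case, with plain integer loads standing in for the SelectionDAG nodes; it assumes a little-endian host so that the bare uint16_t load behaves like ZEXTLOAD:i16:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <cstdio>

// Load a 24-bit little-endian value: fetch the bottom RoundWidth (16) bits,
// fetch the remaining ExtraWidth (8) bits at offset RoundWidth/8, then shift
// the high part into place and OR the two halves together.
static uint32_t loadI24LE(const uint8_t *P) {
  uint16_t Lo;                                 // RoundVT piece (ZEXTLOAD:i16)
  uint8_t Hi;                                  // ExtraVT piece (EXTLOAD@+2:i8)
  std::memcpy(&Lo, P, sizeof(Lo));
  std::memcpy(&Hi, P + 2, sizeof(Hi));
  return uint32_t(Lo) | (uint32_t(Hi) << 16);  // join the hi and lo parts
}

int main() {
  const uint8_t Buf[3] = {0x56, 0x34, 0x12};   // 0x123456, stored little-endian
  assert(loadI24LE(Buf) == 0x123456);
  std::printf("0x%06x\n", loadI24LE(Buf));
  return 0;
}

The big-endian variant in the code above mirrors this: it loads the top RoundWidth bits first and shifts by ExtraWidth instead of RoundWidth, so neither half is an unaligned access.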
@@ -1828,11 +1850,13 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, TLI.getPointerTy()); Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); - std::pair<SDValue,SDValue> CallInfo = - TLI.LowerCallTo(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false, - false, 0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false, + TargetLowering:: + CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false, + false, 0, TLI.getLibcallCallingConv(LC), + /*isTailCall=*/false, /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, Callee, Args, DAG, dl); + std::pair<SDValue,SDValue> CallInfo = TLI.LowerCallTo(CLI); return CallInfo.first; } @@ -1860,11 +1884,12 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC, TLI.getPointerTy()); Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext()); - std::pair<SDValue, SDValue> CallInfo = - TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false, + TargetLowering:: + CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false, 0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false, /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, Callee, Args, DAG, Node->getDebugLoc()); + std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); return CallInfo; } @@ -1919,9 +1944,11 @@ static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned, return TLI.getLibcallName(LC) != 0; } -/// UseDivRem - Only issue divrem libcall if both quotient and remainder are +/// useDivRem - Only issue divrem libcall if both quotient and remainder are /// needed. -static bool UseDivRem(SDNode *Node, bool isSigned, bool isDIV) { +static bool useDivRem(SDNode *Node, bool isSigned, bool isDIV) { + // The other use might have been replaced with a divrem already. + unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM; unsigned OtherOpcode = 0; if (isSigned) OtherOpcode = isDIV ? ISD::SREM : ISD::SDIV; @@ -1935,7 +1962,7 @@ static bool UseDivRem(SDNode *Node, bool isSigned, bool isDIV) { SDNode *User = *UI; if (User == Node) continue; - if (User->getOpcode() == OtherOpcode && + if ((User->getOpcode() == OtherOpcode || User->getOpcode() == DivRemOpc) && User->getOperand(0) == Op0 && User->getOperand(1) == Op1) return true; @@ -1992,11 +2019,12 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, TLI.getPointerTy()); DebugLoc dl = Node->getDebugLoc(); - std::pair<SDValue, SDValue> CallInfo = - TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false, + TargetLowering:: + CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false, 0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false, /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, Callee, Args, DAG, dl); + std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); // Remainder is loaded back from the stack frame. SDValue Rem = DAG.getLoad(RetVT, dl, CallInfo.second, FIPtr, @@ -2570,14 +2598,17 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { // If the target didn't lower this, lower it to '__sync_synchronize()' call // FIXME: handle "fence singlethread" more efficiently. 
TargetLowering::ArgListTy Args; - std::pair<SDValue, SDValue> CallResult = - TLI.LowerCallTo(Node->getOperand(0), Type::getVoidTy(*DAG.getContext()), + TargetLowering:: + CallLoweringInfo CLI(Node->getOperand(0), + Type::getVoidTy(*DAG.getContext()), false, false, false, false, 0, CallingConv::C, /*isTailCall=*/false, /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, DAG.getExternalSymbol("__sync_synchronize", TLI.getPointerTy()), Args, DAG, dl); + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + Results.push_back(CallResult.second); break; } @@ -2647,13 +2678,16 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { case ISD::TRAP: { // If this operation is not supported, lower it to 'abort()' call TargetLowering::ArgListTy Args; - std::pair<SDValue, SDValue> CallResult = - TLI.LowerCallTo(Node->getOperand(0), Type::getVoidTy(*DAG.getContext()), + TargetLowering:: + CallLoweringInfo CLI(Node->getOperand(0), + Type::getVoidTy(*DAG.getContext()), false, false, false, false, 0, CallingConv::C, /*isTailCall=*/false, /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, DAG.getExternalSymbol("abort", TLI.getPointerTy()), Args, DAG, dl); + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + Results.push_back(CallResult.second); break; } @@ -3059,7 +3093,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { "Don't know how to expand this subtraction!"); Tmp1 = DAG.getNode(ISD::XOR, dl, VT, Node->getOperand(1), DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT)); - Tmp1 = DAG.getNode(ISD::ADD, dl, VT, Tmp2, DAG.getConstant(1, VT)); + Tmp1 = DAG.getNode(ISD::ADD, dl, VT, Tmp1, DAG.getConstant(1, VT)); Results.push_back(DAG.getNode(ISD::ADD, dl, VT, Node->getOperand(0), Tmp1)); break; } @@ -3074,7 +3108,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Tmp3 = Node->getOperand(1); if (TLI.isOperationLegalOrCustom(DivRemOpc, VT) || (isDivRemLibcallAvailable(Node, isSigned, TLI) && - UseDivRem(Node, isSigned, false))) { + useDivRem(Node, isSigned, false))) { Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Tmp2, Tmp3).getValue(1); } else if (TLI.isOperationLegalOrCustom(DivOpc, VT)) { // X % Y -> X-X/Y*Y @@ -3102,7 +3136,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { SDVTList VTs = DAG.getVTList(VT, VT); if (TLI.isOperationLegalOrCustom(DivRemOpc, VT) || (isDivRemLibcallAvailable(Node, isSigned, TLI) && - UseDivRem(Node, isSigned, true))) + useDivRem(Node, isSigned, true))) Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Node->getOperand(0), Node->getOperand(1)); else if (isSigned) diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 95ddb1e..e8e968a 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -588,18 +588,14 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) { unsigned NumElts = InVT.getVectorNumElements(); assert(NumElts == NVT.getVectorNumElements() && "Dst and Src must have the same number of elements"); - EVT EltVT = InVT.getScalarType(); assert(isPowerOf2_32(NumElts) && "Promoted vector type must be a power of two"); - EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts/2); + SDValue EOp1, EOp2; + GetSplitVector(InOp, EOp1, EOp2); + EVT HalfNVT = EVT::getVectorVT(*DAG.getContext(), NVT.getScalarType(), NumElts/2); - - SDValue EOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, InOp, - DAG.getIntPtrConstant(0)); - SDValue EOp2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, 
HalfVT, InOp, - DAG.getIntPtrConstant(NumElts/2)); EOp1 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp1); EOp2 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp2); @@ -2273,9 +2269,9 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N, // A divide for UMULO will be faster than a function call. Select to // make sure we aren't using 0. SDValue isZero = DAG.getSetCC(dl, TLI.getSetCCResultType(VT), - RHS, DAG.getConstant(0, VT), ISD::SETNE); + RHS, DAG.getConstant(0, VT), ISD::SETNE); SDValue NotZero = DAG.getNode(ISD::SELECT, dl, VT, isZero, - DAG.getConstant(1, VT), RHS); + DAG.getConstant(1, VT), RHS); SDValue DIV = DAG.getNode(ISD::UDIV, DL, LHS.getValueType(), MUL, NotZero); SDValue Overflow; Overflow = DAG.getSetCC(DL, N->getValueType(1), DIV, LHS, ISD::SETNE); @@ -2296,8 +2292,8 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N, SDValue Temp = DAG.CreateStackTemporary(PtrVT); // Temporary for the overflow value, default it to zero. SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, - DAG.getConstant(0, PtrVT), Temp, - MachinePointerInfo(), false, false, 0); + DAG.getConstant(0, PtrVT), Temp, + MachinePointerInfo(), false, false, 0); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; @@ -2319,16 +2315,17 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N, Args.push_back(Entry); SDValue Func = DAG.getExternalSymbol(TLI.getLibcallName(LC), PtrVT); - std::pair<SDValue, SDValue> CallInfo = - TLI.LowerCallTo(Chain, RetTy, true, false, false, false, - 0, TLI.getLibcallCallingConv(LC), - /*isTailCall=*/false, - /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, - Func, Args, DAG, dl); + TargetLowering:: + CallLoweringInfo CLI(Chain, RetTy, true, false, false, false, + 0, TLI.getLibcallCallingConv(LC), + /*isTailCall=*/false, + /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, + Func, Args, DAG, dl); + std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); SplitInteger(CallInfo.first, Lo, Hi); SDValue Temp2 = DAG.getLoad(PtrVT, dl, CallInfo.second, Temp, - MachinePointerInfo(), false, false, false, 0); + MachinePointerInfo(), false, false, false, 0); SDValue Ofl = DAG.getSetCC(dl, N->getValueType(1), Temp2, DAG.getConstant(0, PtrVT), ISD::SETNE); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index 439aa4d..39337ff 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -628,7 +628,8 @@ namespace { public: explicit NodeUpdateListener(DAGTypeLegalizer &dtl, SmallSetVector<SDNode*, 16> &nta) - : DTL(dtl), NodesToAnalyze(nta) {} + : SelectionDAG::DAGUpdateListener(dtl.getDAG()), + DTL(dtl), NodesToAnalyze(nta) {} virtual void NodeDeleted(SDNode *N, SDNode *E) { assert(N->getNodeId() != DAGTypeLegalizer::ReadyToProcess && @@ -680,7 +681,7 @@ void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) { SmallSetVector<SDNode*, 16> NodesToAnalyze; NodeUpdateListener NUL(*this, NodesToAnalyze); do { - DAG.ReplaceAllUsesOfValueWith(From, To, &NUL); + DAG.ReplaceAllUsesOfValueWith(From, To); // The old node may still be present in a map like ExpandedIntegers or // PromotedIntegers. Inform maps about the replacement. 
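The hunks above drop the DAGUpdateListener parameter from ReplaceAllUsesOfValueWith: listeners now hand the DAG to the DAGUpdateListener base class, which (as the SelectionDAG.cpp changes further down show) chains them through an UpdateListeners list and notifies every registered listener on each deletion or update. A minimal sketch of that self-registering pattern, using hypothetical Listener/Graph types rather than the real LLVM classes:

#include <cassert>
#include <cstdio>

struct Graph;

// Listeners register themselves in their constructor and unregister in their
// destructor, so mutation routines no longer take a listener argument.
struct Listener {
  Graph &G;
  Listener *Next;                    // next registered listener in the chain
  explicit Listener(Graph &Gr);
  virtual ~Listener();
  virtual void NodeDeleted(int N) { (void)N; }  // default no-op callback
};

struct Graph {
  Listener *Listeners = nullptr;     // head of the intrusive listener list
  void deleteNode(int N) {
    for (Listener *L = Listeners; L; L = L->Next)  // notify every listener
      L->NodeDeleted(N);
  }
  ~Graph() { assert(!Listeners && "dangling registered listeners"); }
};

Listener::Listener(Graph &Gr) : G(Gr), Next(Gr.Listeners) {
  Gr.Listeners = this;               // push onto the head of the chain
}
Listener::~Listener() {
  assert(G.Listeners == this && "listeners must unregister in LIFO order");
  G.Listeners = Next;
}

struct PrintingListener : Listener {
  explicit PrintingListener(Graph &Gr) : Listener(Gr) {}
  void NodeDeleted(int N) override { std::printf("deleted node %d\n", N); }
};

int main() {
  Graph G;
  {
    PrintingListener PL(G);          // registers itself, like NodeUpdateListener
    G.deleteNode(42);                // PL is notified
  }                                  // destructor unregisters PL
  G.deleteNode(7);                   // no listeners left; nothing printed
  return 0;
}

Stack-scoped registration of this kind is what makes the "Dangling registered DAGUpdateListeners" assert added to the SelectionDAG destructor below safe to enforce.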
@@ -709,7 +710,7 @@ void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) { SDValue NewVal(M, i); if (M->getNodeId() == Processed) RemapValue(NewVal); - DAG.ReplaceAllUsesOfValueWith(OldVal, NewVal, &NUL); + DAG.ReplaceAllUsesOfValueWith(OldVal, NewVal); // OldVal may be a target of the ReplacedValues map which was marked // NewNode to force reanalysis because it was updated. Ensure that // anything that ReplacedValues mapped to OldVal will now be mapped @@ -950,7 +951,7 @@ SDValue DAGTypeLegalizer::DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo) { for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) if (i != ResNo) ReplaceValueWith(SDValue(N, i), SDValue(N->getOperand(i))); - return SDValue(N, ResNo); + return SDValue(N->getOperand(ResNo)); } /// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type @@ -1054,12 +1055,14 @@ SDValue DAGTypeLegalizer::MakeLibCall(RTLIB::Libcall LC, EVT RetVT, TLI.getPointerTy()); Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); - std::pair<SDValue,SDValue> CallInfo = - TLI.LowerCallTo(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false, + TargetLowering:: + CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false, false, 0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false, /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, Callee, Args, DAG, dl); + std::pair<SDValue,SDValue> CallInfo = TLI.LowerCallTo(CLI); + return CallInfo.first; } @@ -1086,11 +1089,12 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC, TLI.getPointerTy()); Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext()); - std::pair<SDValue, SDValue> CallInfo = - TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false, + TargetLowering:: + CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false, 0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false, /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, Callee, Args, DAG, Node->getDebugLoc()); + std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); return CallInfo; } diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index e866445..94fc976 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -135,6 +135,8 @@ public: ReplacedValues[SDValue(Old, i)] = SDValue(New, i); } + SelectionDAG &getDAG() const { return DAG; } + private: SDNode *AnalyzeNewNode(SDNode *N); void AnalyzeNewValue(SDValue &Val); @@ -151,7 +153,7 @@ private: /// DisintegrateMERGE_VALUES - Replace each result of the given MERGE_VALUES /// node with the corresponding input operand, except for the result 'ResNo', - /// which is returned. + /// for which the corresponding input operand is returned. 
SDValue DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo); SDValue GetVectorElementPointer(SDValue VecPtr, EVT EltVT, SDValue Index); @@ -509,10 +511,12 @@ private: void ScalarizeVectorResult(SDNode *N, unsigned OpNo); SDValue ScalarizeVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo); SDValue ScalarizeVecRes_BinOp(SDNode *N); + SDValue ScalarizeVecRes_TernaryOp(SDNode *N); SDValue ScalarizeVecRes_UnaryOp(SDNode *N); SDValue ScalarizeVecRes_InregOp(SDNode *N); SDValue ScalarizeVecRes_BITCAST(SDNode *N); + SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N); SDValue ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N); SDValue ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N); SDValue ScalarizeVecRes_FP_ROUND(SDNode *N); @@ -553,6 +557,7 @@ private: // Vector Result Splitting: <128 x ty> -> 2 x <64 x ty>. void SplitVectorResult(SDNode *N, unsigned OpNo); void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index a8ff7c6..06f6bd6 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -168,6 +168,7 @@ void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue OldVec = N->getOperand(0); unsigned OldElts = OldVec.getValueType().getVectorNumElements(); + EVT OldEltVT = OldVec.getValueType().getVectorElementType(); DebugLoc dl = N->getDebugLoc(); // Convert to a vector of the expanded element type, for example @@ -175,6 +176,15 @@ void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo, EVT OldVT = N->getValueType(0); EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldVT); + if (OldVT != OldEltVT) { + // The result of EXTRACT_VECTOR_ELT may be larger than the element type of + // the input vector. If so, extend the elements of the input vector to the + // same bitwidth as the result before expanding. + assert(OldEltVT.bitsLT(OldVT) && "Result type smaller than element type!"); + EVT NVecVT = EVT::getVectorVT(*DAG.getContext(), OldVT, OldElts); + OldVec = DAG.getNode(ISD::ANY_EXTEND, dl, NVecVT, N->getOperand(0)); + } + SDValue NewVec = DAG.getNode(ISD::BITCAST, dl, EVT::getVectorVT(*DAG.getContext(), NewVT, 2*OldElts), diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 9fe4480..704f99b 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -71,6 +71,9 @@ class VectorLegalizer { // operands to a different type and bitcasting the result back to the // original type. SDValue PromoteVectorOp(SDValue Op); + // Implements [SU]INT_TO_FP vector promotion; this is a [zs]ext of the input + // operand to the next size up. 
+ SDValue PromoteVectorOpINT_TO_FP(SDValue Op); public: bool Run(); @@ -231,9 +234,19 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { switch (TLI.getOperationAction(Node->getOpcode(), QueryType)) { case TargetLowering::Promote: - // "Promote" the operation by bitcasting - Result = PromoteVectorOp(Op); - Changed = true; + switch (Op.getOpcode()) { + default: + // "Promote" the operation by bitcasting + Result = PromoteVectorOp(Op); + Changed = true; + break; + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: + // "Promote" the operation by extending the operand. + Result = PromoteVectorOpINT_TO_FP(Op); + Changed = true; + break; + } break; case TargetLowering::Legal: break; case TargetLowering::Custom: { @@ -293,6 +306,44 @@ SDValue VectorLegalizer::PromoteVectorOp(SDValue Op) { return DAG.getNode(ISD::BITCAST, dl, VT, Op); } +SDValue VectorLegalizer::PromoteVectorOpINT_TO_FP(SDValue Op) { + // INT_TO_FP operations may require the input operand be promoted even + // when the type is otherwise legal. + EVT VT = Op.getOperand(0).getValueType(); + assert(Op.getNode()->getNumValues() == 1 && + "Can't promote a vector with multiple results!"); + + // Normal getTypeToPromoteTo() doesn't work here, as that will promote + // by widening the vector w/ the same element width and twice the number + // of elements. We want the other way around, the same number of elements, + // each twice the width. + // + // Increase the bitwidth of the element to the next pow-of-two + // (which is greater than 8 bits). + unsigned NumElts = VT.getVectorNumElements(); + EVT EltVT = VT.getVectorElementType(); + EltVT = EVT::getIntegerVT(*DAG.getContext(), 2 * EltVT.getSizeInBits()); + assert(EltVT.isSimple() && "Promoting to a non-simple vector type!"); + + // Build a new vector type and check if it is legal. + MVT NVT = MVT::getVectorVT(EltVT.getSimpleVT(), NumElts); + + DebugLoc dl = Op.getDebugLoc(); + SmallVector<SDValue, 4> Operands(Op.getNumOperands()); + + unsigned Opc = Op.getOpcode() == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : + ISD::SIGN_EXTEND; + for (unsigned j = 0; j != Op.getNumOperands(); ++j) { + if (Op.getOperand(j).getValueType().isVector()) + Operands[j] = DAG.getNode(Opc, dl, NVT, Op.getOperand(j)); + else + Operands[j] = Op.getOperand(j); + } + + return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), &Operands[0], + Operands.size()); +} + SDValue VectorLegalizer::ExpandLoad(SDValue Op) { DebugLoc dl = Op.getDebugLoc(); diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 5f23f01..4709202 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -48,7 +48,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::MERGE_VALUES: R = ScalarizeVecRes_MERGE_VALUES(N, ResNo);break; case ISD::BITCAST: R = ScalarizeVecRes_BITCAST(N); break; - case ISD::BUILD_VECTOR: R = N->getOperand(0); break; + case ISD::BUILD_VECTOR: R = ScalarizeVecRes_BUILD_VECTOR(N); break; case ISD::CONVERT_RNDSAT: R = ScalarizeVecRes_CONVERT_RNDSAT(N); break; case ISD::EXTRACT_SUBVECTOR: R = ScalarizeVecRes_EXTRACT_SUBVECTOR(N); break; case ISD::FP_ROUND: R = ScalarizeVecRes_FP_ROUND(N); break; @@ -115,6 +115,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::SRL: R = ScalarizeVecRes_BinOp(N); break; + case ISD::FMA: + R = ScalarizeVecRes_TernaryOp(N); + break; } // If R is null, the sub-method took care of registering the result. 
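PromoteVectorOpINT_TO_FP, declared and defined in the LegalizeVectorOps.cpp hunks above, widens each integer element to twice its width before converting: zero-extension for UINT_TO_FP and sign-extension for SINT_TO_FP, since a bitcast-style promotion would not preserve the numeric value of the operands. A scalar C++ model of the idea (illustrative only, standing in for a v4i16 -> v4i32 -> v4f32 promotion):

#include <cstdint>
#include <cstdio>

// Signed case: ISD::SIGN_EXTEND each element to 2x width, then convert.
static void sintToFP(const int16_t In[4], float Out[4]) {
  for (int i = 0; i != 4; ++i) {
    int32_t Wide = In[i];               // sign-extend i16 -> i32
    Out[i] = static_cast<float>(Wide);  // SINT_TO_FP at the wider type
  }
}

// Unsigned case: ISD::ZERO_EXTEND each element to 2x width, then convert.
static void uintToFP(const uint16_t In[4], float Out[4]) {
  for (int i = 0; i != 4; ++i) {
    uint32_t Wide = In[i];              // zero-extend i16 -> i32
    Out[i] = static_cast<float>(Wide);  // UINT_TO_FP at the wider type
  }
}

int main() {
  const int16_t S[4] = {-1, 2, -3, 4};
  const uint16_t U[4] = {0xFFFF, 2, 3, 4};
  float FS[4], FU[4];
  sintToFP(S, FS);
  uintToFP(U, FU);
  std::printf("%.0f %.0f\n", FS[0], FU[0]);  // prints: -1 65535
  return 0;
}

The same bit pattern 0xFFFF must become -1.0 when signed and 65535.0 when unsigned, which is why the opcode selects between ZERO_EXTEND and SIGN_EXTEND rather than reusing the generic bitcasting Promote path.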
@@ -129,6 +132,14 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_BinOp(SDNode *N) { LHS.getValueType(), LHS, RHS); } +SDValue DAGTypeLegalizer::ScalarizeVecRes_TernaryOp(SDNode *N) { + SDValue Op0 = GetScalarizedVector(N->getOperand(0)); + SDValue Op1 = GetScalarizedVector(N->getOperand(1)); + SDValue Op2 = GetScalarizedVector(N->getOperand(2)); + return DAG.getNode(N->getOpcode(), N->getDebugLoc(), + Op0.getValueType(), Op0, Op1, Op2); +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo) { SDValue Op = DisintegrateMERGE_VALUES(N, ResNo); @@ -141,6 +152,16 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_BITCAST(SDNode *N) { NewVT, N->getOperand(0)); } +SDValue DAGTypeLegalizer::ScalarizeVecRes_BUILD_VECTOR(SDNode *N) { + EVT EltVT = N->getValueType(0).getVectorElementType(); + SDValue InOp = N->getOperand(0); + // The BUILD_VECTOR operands may be of wider element types and + // we may need to truncate them back to the requested return type. + if (EltVT.isInteger()) + return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), EltVT, InOp); + return InOp; +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N) { EVT NewVT = N->getValueType(0).getVectorElementType(); SDValue Op0 = GetScalarizedVector(N->getOperand(0)); @@ -436,7 +457,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { N->dump(&DAG); dbgs() << "\n"); SDValue Lo, Hi; - + // See if the target wants to custom expand this node. if (CustomLowerNode(N, N->getValueType(ResNo), true)) return; @@ -448,7 +469,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { N->dump(&DAG); dbgs() << "\n"; #endif - llvm_unreachable("Do not know how to split the result of this operator!"); + report_fatal_error("Do not know how to split the result of this " + "operator!\n"); case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; case ISD::VSELECT: @@ -529,6 +551,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::FREM: SplitVecRes_BinOp(N, Lo, Hi); break; + case ISD::FMA: + SplitVecRes_TernaryOp(N, Lo, Hi); + break; } // If Lo/Hi is null, the sub-method took care of registering results etc. @@ -548,6 +573,22 @@ void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo, Hi = DAG.getNode(N->getOpcode(), dl, LHSHi.getValueType(), LHSHi, RHSHi); } +void DAGTypeLegalizer::SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDValue Op0Lo, Op0Hi; + GetSplitVector(N->getOperand(0), Op0Lo, Op0Hi); + SDValue Op1Lo, Op1Hi; + GetSplitVector(N->getOperand(1), Op1Lo, Op1Hi); + SDValue Op2Lo, Op2Hi; + GetSplitVector(N->getOperand(2), Op2Lo, Op2Hi); + DebugLoc dl = N->getDebugLoc(); + + Lo = DAG.getNode(N->getOpcode(), dl, Op0Lo.getValueType(), + Op0Lo, Op1Lo, Op2Lo); + Hi = DAG.getNode(N->getOpcode(), dl, Op0Hi.getValueType(), + Op0Hi, Op1Hi, Op2Hi); +} + void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) { // We know the result is a vector. 
The input may be either a vector or a @@ -977,7 +1018,9 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { N->dump(&DAG); dbgs() << "\n"; #endif - llvm_unreachable("Do not know how to split this operator's operand!"); + report_fatal_error("Do not know how to split this operator's " + "operand!\n"); + case ISD::SETCC: Res = SplitVecOp_VSETCC(N); break; case ISD::BITCAST: Res = SplitVecOp_BITCAST(N); break; case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break; @@ -1203,15 +1246,15 @@ SDValue DAGTypeLegalizer::SplitVecOp_FP_ROUND(SDNode *N) { DebugLoc DL = N->getDebugLoc(); GetSplitVector(N->getOperand(0), Lo, Hi); EVT InVT = Lo.getValueType(); - + EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(), InVT.getVectorNumElements()); - + Lo = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Lo, N->getOperand(1)); Hi = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Hi, N->getOperand(1)); - + return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); -} +} @@ -1755,8 +1798,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { if (InputWidened) InOp = GetWidenedVector(InOp); for (unsigned j=0; j < NumInElts; ++j) - Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, - DAG.getIntPtrConstant(j)); + Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, + DAG.getIntPtrConstant(j)); } SDValue UndefVal = DAG.getUNDEF(EltVT); for (; Idx < WidenNumElts; ++Idx) @@ -1816,7 +1859,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) { InOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InWidenVT, InOp, DAG.getIntPtrConstant(0)); return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp, - SatOp, CvtCode); + SatOp, CvtCode); } } @@ -1832,7 +1875,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) { SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp, DAG.getIntPtrConstant(i)); Ops[i] = DAG.getConvertRndSat(WidenVT, dl, ExtVal, DTyOp, STyOp, RndOp, - SatOp, CvtCode); + SatOp, CvtCode); } SDValue UndefVal = DAG.getUNDEF(EltVT); @@ -1936,7 +1979,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) { Cond1 = GetWidenedVector(Cond1); if (Cond1.getValueType() != CondWidenVT) - Cond1 = ModifyToType(Cond1, CondWidenVT); + Cond1 = ModifyToType(Cond1, CondWidenVT); } SDValue InOp1 = GetWidenedVector(N->getOperand(1)); @@ -2202,7 +2245,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) { SDValue CC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, WideSETCC, DAG.getIntPtrConstant(0)); - return PromoteTargetBoolean(CC, N->getValueType(0)); + return PromoteTargetBoolean(CC, N->getValueType(0)); } @@ -2371,10 +2414,8 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVector<SDValue, 16> &LdChain, NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff); NewVTWidth = NewVT.getSizeInBits(); L = DAG.getLoad(NewVT, dl, Chain, BasePtr, - LD->getPointerInfo().getWithOffset(Offset), - isVolatile, - isNonTemporal, isInvariant, - MinAlign(Align, Increment)); + LD->getPointerInfo().getWithOffset(Offset), isVolatile, + isNonTemporal, isInvariant, MinAlign(Align, Increment)); LdChain.push_back(L.getValue(1)); if (L->getValueType(0).isVector()) { SmallVector<SDValue, 16> Loads; @@ -2563,7 +2604,7 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVector<SDValue, 16>& StChain, Offset += Increment; BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getIntPtrConstant(Increment)); - } while (StWidth != 0 && StWidth >= NewVTWidth); + } while 
(StWidth != 0 && StWidth >= NewVTWidth); // Restore index back to be relative to the original widen element type Idx = Idx * NewVTWidth / ValEltWidth; } diff --git a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp index ff0136e..c3794d5 100644 --- a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp +++ b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp @@ -50,7 +50,7 @@ ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS) : const TargetMachine &tm = (*IS->MF).getTarget(); ResourcesModel = tm.getInstrInfo()->CreateTargetScheduleState(&tm,NULL); - // This hard requirment could be relaxed, but for now + // This hard requirement could be relaxed, but for now // do not let it procede. assert (ResourcesModel && "Unimplemented CreateTargetScheduleState."); @@ -318,7 +318,7 @@ void ResourcePriorityQueue::reserveResources(SUnit *SU) { // If packet is now full, reset the state so in the next cycle // we start fresh. - if (Packet.size() >= InstrItins->IssueWidth) { + if (Packet.size() >= InstrItins->SchedModel->IssueWidth) { ResourcesModel->clearResources(); Packet.clear(); } @@ -353,7 +353,7 @@ signed ResourcePriorityQueue::rawRegPressureDelta(SUnit *SU, unsigned RCId) { } /// Estimates change in reg pressure from this SU. -/// It is acheived by trivial tracking of defined +/// It is achieved by trivial tracking of defined /// and used vregs in dependent instructions. /// The RawPressure flag makes this function to ignore /// existing reg file sizes, and report raw def/use diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index 24da432..b7ce48a 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -441,19 +441,14 @@ static bool CheckForLiveRegDef(SUnit *SU, unsigned Reg, SmallVector<unsigned, 4> &LRegs, const TargetRegisterInfo *TRI) { bool Added = false; - if (LiveRegDefs[Reg] && LiveRegDefs[Reg] != SU) { - if (RegAdded.insert(Reg)) { - LRegs.push_back(Reg); - Added = true; - } - } - for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) - if (LiveRegDefs[*Alias] && LiveRegDefs[*Alias] != SU) { - if (RegAdded.insert(*Alias)) { - LRegs.push_back(*Alias); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { + if (LiveRegDefs[*AI] && LiveRegDefs[*AI] != SU) { + if (RegAdded.insert(*AI)) { + LRegs.push_back(*AI); Added = true; } } + } return Added; } diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 2cb5d37..bf0a437 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -266,7 +266,8 @@ static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos, const TargetLowering *TLI, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, - unsigned &RegClass, unsigned &Cost) { + unsigned &RegClass, unsigned &Cost, + const MachineFunction &MF) { EVT VT = RegDefPos.GetValue(); // Special handling for untyped values. 
These values can only come from @@ -285,7 +286,7 @@ static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos, unsigned Idx = RegDefPos.GetIdx(); const MCInstrDesc Desc = TII->get(Opcode); - const TargetRegisterClass *RC = TII->getRegClass(Desc, Idx, TRI); + const TargetRegisterClass *RC = TII->getRegClass(Desc, Idx, TRI, MF); RegClass = RC->getID(); // FIXME: Cost arbitrarily set to 1 because there doesn't seem to be a // better way to determine it. @@ -852,7 +853,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) { } /// After backtracking, the hazard checker needs to be restored to a state -/// corresponding the the current cycle. +/// corresponding the current cycle. void ScheduleDAGRRList::RestoreHazardCheckerBottomUp() { HazardRec->Reset(); @@ -1181,7 +1182,7 @@ static void CheckForLiveRegDef(SUnit *SU, unsigned Reg, SmallSet<unsigned, 4> &RegAdded, SmallVector<unsigned, 4> &LRegs, const TargetRegisterInfo *TRI) { - for (const uint16_t *AliasI = TRI->getOverlaps(Reg); *AliasI; ++AliasI) { + for (MCRegAliasIterator AliasI(Reg, TRI, true); AliasI.isValid(); ++AliasI) { // Check if Ref is live. if (!LiveRegDefs[*AliasI]) continue; @@ -1920,7 +1921,7 @@ bool RegReductionPQBase::HighRegPressure(const SUnit *SU) const { for (ScheduleDAGSDNodes::RegDefIter RegDefPos(PredSU, scheduleDAG); RegDefPos.IsValid(); RegDefPos.Advance()) { unsigned RCId, Cost; - GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost); + GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost, MF); if ((RegPressure[RCId] + Cost) >= RegLimit[RCId]) return true; @@ -2034,7 +2035,7 @@ void RegReductionPQBase::scheduledNode(SUnit *SU) { continue; unsigned RCId, Cost; - GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost); + GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost, MF); RegPressure[RCId] += Cost; break; } @@ -2049,7 +2050,7 @@ void RegReductionPQBase::scheduledNode(SUnit *SU) { if (SkipRegDefs > 0) continue; unsigned RCId, Cost; - GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost); + GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost, MF); if (RegPressure[RCId] < Cost) { // Register pressure tracking is imprecise. This can happen. But we try // hard not to let it happen because it likely results in poor scheduling. @@ -2330,22 +2331,21 @@ static int BUCompareLatency(SUnit *left, SUnit *right, bool checkPref, // and latency. if (!checkPref || (left->SchedulingPref == Sched::ILP || right->SchedulingPref == Sched::ILP)) { - if (DisableSchedCycles) { + // If neither instruction stalls (!LStall && !RStall) and HazardRecognizer + // is enabled, grouping instructions by cycle, then its height is already + // covered so only its depth matters. We also reach this point if both stall + // but have the same height. + if (!SPQ->getHazardRec()->isEnabled()) { if (LHeight != RHeight) return LHeight > RHeight ? 1 : -1; } - else { - // If neither instruction stalls (!LStall && !RStall) then - // its height is already covered so only its depth matters. We also reach - // this if both stall but have the same height. - int LDepth = left->getDepth() - LPenalty; - int RDepth = right->getDepth() - RPenalty; - if (LDepth != RDepth) { - DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum - << ") depth " << LDepth << " vs SU (" << right->NodeNum - << ") depth " << RDepth << "\n"); - return LDepth < RDepth ? 
1 : -1; - } + int LDepth = left->getDepth() - LPenalty; + int RDepth = right->getDepth() - RPenalty; + if (LDepth != RDepth) { + DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum + << ") depth " << LDepth << " vs SU (" << right->NodeNum + << ") depth " << RDepth << "\n"); + return LDepth < RDepth ? 1 : -1; } if (left->Latency != right->Latency) return left->Latency > right->Latency ? 1 : -1; @@ -2363,7 +2363,7 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) { bool RHasPhysReg = right->hasPhysRegDefs; if (LHasPhysReg != RHasPhysReg) { #ifndef NDEBUG - const char *PhysRegMsg[] = {" has no physreg", " defines a physreg"}; + const char *const PhysRegMsg[] = {" has no physreg"," defines a physreg"}; #endif DEBUG(dbgs() << " SU (" << left->NodeNum << ") " << PhysRegMsg[LHasPhysReg] << " SU(" << right->NodeNum << ") " diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 69dd813..748668c 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -131,28 +131,16 @@ static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, } } -static void AddGlue(SDNode *N, SDValue Glue, bool AddGlue, SelectionDAG *DAG) { - SmallVector<EVT, 4> VTs; - SDNode *GlueDestNode = Glue.getNode(); - - // Don't add glue from a node to itself. - if (GlueDestNode == N) return; - - // Don't add glue to something which already has glue. - if (N->getValueType(N->getNumValues() - 1) == MVT::Glue) return; - - for (unsigned I = 0, E = N->getNumValues(); I != E; ++I) - VTs.push_back(N->getValueType(I)); - - if (AddGlue) - VTs.push_back(MVT::Glue); - +// Helper for AddGlue to clone node operands. +static void CloneNodeWithValues(SDNode *N, SelectionDAG *DAG, + SmallVectorImpl<EVT> &VTs, + SDValue ExtraOper = SDValue()) { SmallVector<SDValue, 4> Ops; for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I) Ops.push_back(N->getOperand(I)); - if (GlueDestNode) - Ops.push_back(Glue); + if (ExtraOper.getNode()) + Ops.push_back(ExtraOper); SDVTList VTList = DAG->getVTList(&VTs[0], VTs.size()); MachineSDNode::mmo_iterator Begin = 0, End = 0; @@ -171,6 +159,46 @@ static void AddGlue(SDNode *N, SDValue Glue, bool AddGlue, SelectionDAG *DAG) { MN->setMemRefs(Begin, End); } +static bool AddGlue(SDNode *N, SDValue Glue, bool AddGlue, SelectionDAG *DAG) { + SmallVector<EVT, 4> VTs; + SDNode *GlueDestNode = Glue.getNode(); + + // Don't add glue from a node to itself. + if (GlueDestNode == N) return false; + + // Don't add a glue operand to something that already uses glue. + if (GlueDestNode && + N->getOperand(N->getNumOperands()-1).getValueType() == MVT::Glue) { + return false; + } + // Don't add glue to something that already has a glue value. + if (N->getValueType(N->getNumValues() - 1) == MVT::Glue) return false; + + for (unsigned I = 0, E = N->getNumValues(); I != E; ++I) + VTs.push_back(N->getValueType(I)); + + if (AddGlue) + VTs.push_back(MVT::Glue); + + CloneNodeWithValues(N, DAG, VTs, Glue); + + return true; +} + +// Cleanup after unsuccessful AddGlue. Use the standard method of morphing the +// node even though simply shrinking the value list is sufficient. 
+static void RemoveUnusedGlue(SDNode *N, SelectionDAG *DAG) { + assert((N->getValueType(N->getNumValues() - 1) == MVT::Glue && + !N->hasAnyUseOfValue(N->getNumValues() - 1)) && + "expected an unused glue value"); + + SmallVector<EVT, 4> VTs; + for (unsigned I = 0, E = N->getNumValues()-1; I != E; ++I) + VTs.push_back(N->getValueType(I)); + + CloneNodeWithValues(N, DAG, VTs); +} + /// ClusterNeighboringLoads - Force nearby loads together by "gluing" them. /// This function finds loads of the same base and different offsets. If the /// offsets are not far apart (target specific), it add MVT::Glue inputs and @@ -238,19 +266,23 @@ void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) { // Cluster loads by adding MVT::Glue outputs and inputs. This also // ensure they are scheduled in order of increasing addresses. SDNode *Lead = Loads[0]; - AddGlue(Lead, SDValue(0, 0), true, DAG); - - SDValue InGlue = SDValue(Lead, Lead->getNumValues() - 1); + SDValue InGlue = SDValue(0, 0); + if (AddGlue(Lead, InGlue, true, DAG)) + InGlue = SDValue(Lead, Lead->getNumValues() - 1); for (unsigned I = 1, E = Loads.size(); I != E; ++I) { bool OutGlue = I < E - 1; SDNode *Load = Loads[I]; - AddGlue(Load, InGlue, OutGlue, DAG); + // If AddGlue fails, we could leave an unused glue value. This should not + // cause any problems. + if (AddGlue(Load, InGlue, OutGlue, DAG)) { + if (OutGlue) + InGlue = SDValue(Load, Load->getNumValues() - 1); - if (OutGlue) - InGlue = SDValue(Load, Load->getNumValues() - 1); - - ++LoadsClustered; + ++LoadsClustered; + } + else if (!OutGlue && InGlue.getNode()) + RemoveUnusedGlue(InGlue.getNode(), DAG); } } diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h index 75940ec..5384576 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h @@ -98,12 +98,6 @@ namespace llvm { /// virtual void computeLatency(SUnit *SU); - /// computeOperandLatency - Override dependence edge latency using - /// operand use/def information - /// - virtual void computeOperandLatency(SUnit *Def, SUnit *Use, - SDep& dep) const { } - virtual void computeOperandLatency(SDNode *Def, SDNode *Use, unsigned OpIdx, SDep& dep) const; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 92671d1..b971b69 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -14,16 +14,16 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "SDNodeOrdering.h" #include "SDNodeDbgValue.h" +#include "llvm/CallingConv.h" #include "llvm/Constants.h" -#include "llvm/Analysis/DebugInfo.h" -#include "llvm/Analysis/ValueTracking.h" +#include "llvm/DebugInfo.h" +#include "llvm/DerivedTypes.h" #include "llvm/Function.h" #include "llvm/GlobalAlias.h" #include "llvm/GlobalVariable.h" #include "llvm/Intrinsics.h" -#include "llvm/DerivedTypes.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Assembly/Writer.h" -#include "llvm/CallingConv.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -71,7 +71,9 @@ static const fltSemantics *EVTToAPFloatSemantics(EVT VT) { } } -SelectionDAG::DAGUpdateListener::~DAGUpdateListener() {} +// Default null implementations of the callbacks. 
+void SelectionDAG::DAGUpdateListener::NodeDeleted(SDNode*, SDNode*) {} +void SelectionDAG::DAGUpdateListener::NodeUpdated(SDNode*) {} //===----------------------------------------------------------------------===// // ConstantFPSDNode Class @@ -217,6 +219,22 @@ bool ISD::isScalarToVector(const SDNode *N) { return true; } +/// allOperandsUndef - Return true if the node has at least one operand +/// and all operands of the specified node are ISD::UNDEF. +bool ISD::allOperandsUndef(const SDNode *N) { + // Return false if the node has no operands. + // This is "logically inconsistent" with the definition of "all" but + // is probably the desired behavior. + if (N->getNumOperands() == 0) + return false; + + for (unsigned i = 0, e = N->getNumOperands(); i != e ; ++i) + if (N->getOperand(i).getOpcode() != ISD::UNDEF) + return false; + + return true; +} + /// getSetCCSwappedOperands - Return the operation corresponding to (Y op X) /// when given the operation for (X op Y). ISD::CondCode ISD::getSetCCSwappedOperands(ISD::CondCode Operation) { @@ -544,16 +562,15 @@ void SelectionDAG::RemoveDeadNodes() { /// RemoveDeadNodes - This method deletes the unreachable nodes in the /// given list, and any nodes that become unreachable as a result. -void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes, - DAGUpdateListener *UpdateListener) { +void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes) { // Process the worklist, deleting the nodes and adding their uses to the // worklist. while (!DeadNodes.empty()) { SDNode *N = DeadNodes.pop_back_val(); - if (UpdateListener) - UpdateListener->NodeDeleted(N, 0); + for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next) + DUL->NodeDeleted(N, 0); // Take the node out of the appropriate CSE map. RemoveNodeFromCSEMaps(N); @@ -574,7 +591,7 @@ void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes, } } -void SelectionDAG::RemoveDeadNode(SDNode *N, DAGUpdateListener *UpdateListener){ +void SelectionDAG::RemoveDeadNode(SDNode *N){ SmallVector<SDNode*, 16> DeadNodes(1, N); // Create a dummy node that adds a reference to the root node, preventing @@ -582,7 +599,7 @@ void SelectionDAG::RemoveDeadNode(SDNode *N, DAGUpdateListener *UpdateListener){ // dead node.) HandleSDNode Dummy(getRoot()); - RemoveDeadNodes(DeadNodes, UpdateListener); + RemoveDeadNodes(DeadNodes); } void SelectionDAG::DeleteNode(SDNode *N) { @@ -684,8 +701,7 @@ bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) { /// node. This transfer can potentially trigger recursive merging. /// void -SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N, - DAGUpdateListener *UpdateListener) { +SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N) { // For node types that aren't CSE'd, just act as if no identical node // already exists. if (!doNotCSE(N)) { @@ -694,20 +710,19 @@ SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N, // If there was already an existing matching node, use ReplaceAllUsesWith // to replace the dead one with the existing one. This can cause // recursive merging of other unrelated nodes down the line. - ReplaceAllUsesWith(N, Existing, UpdateListener); + ReplaceAllUsesWith(N, Existing); - // N is now dead. Inform the listener if it exists and delete it. - if (UpdateListener) - UpdateListener->NodeDeleted(N, Existing); + // N is now dead. Inform the listeners and delete it. 
+ for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next) + DUL->NodeDeleted(N, Existing); DeleteNodeNotInCSEMaps(N); return; } } - // If the node doesn't already exist, we updated it. Inform a listener if - // it exists. - if (UpdateListener) - UpdateListener->NodeUpdated(N); + // If the node doesn't already exist, we updated it. Inform listeners. + for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next) + DUL->NodeUpdated(N); } /// FindModifiedNodeSlot - Find a slot for the specified node if its operands @@ -855,7 +870,7 @@ unsigned SelectionDAG::getEVTAlignment(EVT VT) const { SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL) : TM(tm), TLI(*tm.getTargetLowering()), TSI(*tm.getSelectionDAGInfo()), OptLevel(OL), EntryNode(ISD::EntryToken, DebugLoc(), getVTList(MVT::Other)), - Root(getEntryNode()), Ordering(0) { + Root(getEntryNode()), Ordering(0), UpdateListeners(0) { AllNodes.push_back(&EntryNode); Ordering = new SDNodeOrdering(); DbgInfo = new SDDbgInfo(); @@ -867,6 +882,7 @@ void SelectionDAG::init(MachineFunction &mf) { } SelectionDAG::~SelectionDAG() { + assert(!UpdateListeners && "Dangling registered DAGUpdateListeners"); allnodes_clear(); delete Ordering; delete DbgInfo; @@ -1949,6 +1965,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits()); ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero |= (~InMask); + KnownOne &= (~KnownZero); return; } case ISD::FGETSIGN: @@ -2246,8 +2263,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{ } // Handle LOADX separately here. EXTLOAD case will fallthrough. - if (Op.getOpcode() == ISD::LOAD) { - LoadSDNode *LD = cast<LoadSDNode>(Op); + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op)) { unsigned ExtType = LD->getExtensionType(); switch (ExtType) { default: break; @@ -2675,6 +2691,11 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, if (N1 == N2) return N1; break; case ISD::CONCAT_VECTORS: + // Concat of UNDEFs is UNDEF. + if (N1.getOpcode() == ISD::UNDEF && + N2.getOpcode() == ISD::UNDEF) + return getUNDEF(VT); + // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified to // one big BUILD_VECTOR. 
if (N1.getOpcode() == ISD::BUILD_VECTOR && @@ -3708,8 +3729,8 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, DebugLoc dl, SDValue Dst, Entry.Node = Src; Args.push_back(Entry); Entry.Node = Size; Args.push_back(Entry); // FIXME: pass in DebugLoc - std::pair<SDValue,SDValue> CallResult = - TLI.LowerCallTo(Chain, Type::getVoidTy(*getContext()), + TargetLowering:: + CallLoweringInfo CLI(Chain, Type::getVoidTy(*getContext()), false, false, false, false, 0, TLI.getLibcallCallingConv(RTLIB::MEMCPY), /*isTailCall=*/false, @@ -3717,6 +3738,8 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, DebugLoc dl, SDValue Dst, getExternalSymbol(TLI.getLibcallName(RTLIB::MEMCPY), TLI.getPointerTy()), Args, *this, dl); + std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI); + return CallResult.second; } @@ -3761,8 +3784,8 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, DebugLoc dl, SDValue Dst, Entry.Node = Src; Args.push_back(Entry); Entry.Node = Size; Args.push_back(Entry); // FIXME: pass in DebugLoc - std::pair<SDValue,SDValue> CallResult = - TLI.LowerCallTo(Chain, Type::getVoidTy(*getContext()), + TargetLowering:: + CallLoweringInfo CLI(Chain, Type::getVoidTy(*getContext()), false, false, false, false, 0, TLI.getLibcallCallingConv(RTLIB::MEMMOVE), /*isTailCall=*/false, @@ -3770,6 +3793,8 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, DebugLoc dl, SDValue Dst, getExternalSymbol(TLI.getLibcallName(RTLIB::MEMMOVE), TLI.getPointerTy()), Args, *this, dl); + std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI); + return CallResult.second; } @@ -3822,8 +3847,8 @@ SDValue SelectionDAG::getMemset(SDValue Chain, DebugLoc dl, SDValue Dst, Entry.isSExt = false; Args.push_back(Entry); // FIXME: pass in DebugLoc - std::pair<SDValue,SDValue> CallResult = - TLI.LowerCallTo(Chain, Type::getVoidTy(*getContext()), + TargetLowering:: + CallLoweringInfo CLI(Chain, Type::getVoidTy(*getContext()), false, false, false, false, 0, TLI.getLibcallCallingConv(RTLIB::MEMSET), /*isTailCall=*/false, @@ -3831,6 +3856,8 @@ SDValue SelectionDAG::getMemset(SDValue Chain, DebugLoc dl, SDValue Dst, getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET), TLI.getPointerTy()), Args, *this, dl); + std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI); + return CallResult.second; } @@ -4654,13 +4681,7 @@ SDVTList SelectionDAG::getVTList(const EVT *VTs, unsigned NumVTs) { if (I->NumVTs != NumVTs || VTs[0] != I->VTs[0] || VTs[1] != I->VTs[1]) continue; - bool NoMatch = false; - for (unsigned i = 2; i != NumVTs; ++i) - if (VTs[i] != I->VTs[i]) { - NoMatch = true; - break; - } - if (!NoMatch) + if (std::equal(&VTs[2], &VTs[NumVTs], &I->VTs[2])) return *I; } @@ -5237,11 +5258,7 @@ namespace { /// pointed to by a use iterator is deleted, increment the use iterator /// so that it doesn't dangle. /// -/// This class also manages a "downlink" DAGUpdateListener, to forward -/// messages to ReplaceAllUsesWith's callers. -/// class RAUWUpdateListener : public SelectionDAG::DAGUpdateListener { - SelectionDAG::DAGUpdateListener *DownLink; SDNode::use_iterator &UI; SDNode::use_iterator &UE; @@ -5249,21 +5266,13 @@ class RAUWUpdateListener : public SelectionDAG::DAGUpdateListener { // Increment the iterator as needed. while (UI != UE && N == *UI) ++UI; - - // Then forward the message. - if (DownLink) DownLink->NodeDeleted(N, E); - } - - virtual void NodeUpdated(SDNode *N) { - // Just forward the message. 
- if (DownLink) DownLink->NodeUpdated(N); } public: - RAUWUpdateListener(SelectionDAG::DAGUpdateListener *dl, + RAUWUpdateListener(SelectionDAG &d, SDNode::use_iterator &ui, SDNode::use_iterator &ue) - : DownLink(dl), UI(ui), UE(ue) {} + : SelectionDAG::DAGUpdateListener(d), UI(ui), UE(ue) {} }; } @@ -5273,8 +5282,7 @@ public: /// /// This version assumes From has a single result value. /// -void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To, - DAGUpdateListener *UpdateListener) { +void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To) { SDNode *From = FromN.getNode(); assert(From->getNumValues() == 1 && FromN.getResNo() == 0 && "Cannot replace with this method!"); @@ -5288,7 +5296,7 @@ void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To, // is replaced by To, we don't want to replace of all its users with To // too. See PR3018 for more info. SDNode::use_iterator UI = From->use_begin(), UE = From->use_end(); - RAUWUpdateListener Listener(UpdateListener, UI, UE); + RAUWUpdateListener Listener(*this, UI, UE); while (UI != UE) { SDNode *User = *UI; @@ -5307,7 +5315,7 @@ void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To, // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. - AddModifiedNodeToCSEMaps(User, &Listener); + AddModifiedNodeToCSEMaps(User); } // If we just RAUW'd the root, take note. @@ -5321,8 +5329,7 @@ void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To, /// This version assumes that for each value of From, there is a /// corresponding value in To in the same position with the same type. /// -void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To, - DAGUpdateListener *UpdateListener) { +void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To) { #ifndef NDEBUG for (unsigned i = 0, e = From->getNumValues(); i != e; ++i) assert((!From->hasAnyUseOfValue(i) || @@ -5337,7 +5344,7 @@ void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To, // Iterate over just the existing users of From. See the comments in // the ReplaceAllUsesWith above. SDNode::use_iterator UI = From->use_begin(), UE = From->use_end(); - RAUWUpdateListener Listener(UpdateListener, UI, UE); + RAUWUpdateListener Listener(*this, UI, UE); while (UI != UE) { SDNode *User = *UI; @@ -5356,7 +5363,7 @@ void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To, // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. - AddModifiedNodeToCSEMaps(User, &Listener); + AddModifiedNodeToCSEMaps(User); } // If we just RAUW'd the root, take note. @@ -5369,16 +5376,14 @@ void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To, /// /// This version can replace From with any result values. To must match the /// number and types of values returned by From. -void SelectionDAG::ReplaceAllUsesWith(SDNode *From, - const SDValue *To, - DAGUpdateListener *UpdateListener) { +void SelectionDAG::ReplaceAllUsesWith(SDNode *From, const SDValue *To) { if (From->getNumValues() == 1) // Handle the simple case efficiently. - return ReplaceAllUsesWith(SDValue(From, 0), To[0], UpdateListener); + return ReplaceAllUsesWith(SDValue(From, 0), To[0]); // Iterate over just the existing users of From. See the comments in // the ReplaceAllUsesWith above. 
SDNode::use_iterator UI = From->use_begin(), UE = From->use_end(); - RAUWUpdateListener Listener(UpdateListener, UI, UE); + RAUWUpdateListener Listener(*this, UI, UE); while (UI != UE) { SDNode *User = *UI; @@ -5398,7 +5403,7 @@ void SelectionDAG::ReplaceAllUsesWith(SDNode *From, // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. - AddModifiedNodeToCSEMaps(User, &Listener); + AddModifiedNodeToCSEMaps(User); } // If we just RAUW'd the root, take note. @@ -5409,14 +5414,13 @@ void SelectionDAG::ReplaceAllUsesWith(SDNode *From, /// ReplaceAllUsesOfValueWith - Replace any uses of From with To, leaving /// uses of other values produced by From.getNode() alone. The Deleted /// vector is handled the same way as for ReplaceAllUsesWith. -void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To, - DAGUpdateListener *UpdateListener){ +void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To){ // Handle the really simple, really trivial case efficiently. if (From == To) return; // Handle the simple, trivial, case efficiently. if (From.getNode()->getNumValues() == 1) { - ReplaceAllUsesWith(From, To, UpdateListener); + ReplaceAllUsesWith(From, To); return; } @@ -5424,7 +5428,7 @@ void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To, // the ReplaceAllUsesWith above. SDNode::use_iterator UI = From.getNode()->use_begin(), UE = From.getNode()->use_end(); - RAUWUpdateListener Listener(UpdateListener, UI, UE); + RAUWUpdateListener Listener(*this, UI, UE); while (UI != UE) { SDNode *User = *UI; bool UserRemovedFromCSEMaps = false; @@ -5460,7 +5464,7 @@ void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To, // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. - AddModifiedNodeToCSEMaps(User, &Listener); + AddModifiedNodeToCSEMaps(User); } // If we just RAUW'd the root, take note. @@ -5489,11 +5493,10 @@ namespace { /// handled the same way as for ReplaceAllUsesWith. void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From, const SDValue *To, - unsigned Num, - DAGUpdateListener *UpdateListener){ + unsigned Num){ // Handle the simple, trivial case efficiently. if (Num == 1) - return ReplaceAllUsesOfValueWith(*From, *To, UpdateListener); + return ReplaceAllUsesOfValueWith(*From, *To); // Read up all the uses and make records of them. This helps // processing new uses that are introduced during the @@ -5538,7 +5541,7 @@ void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From, // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. - AddModifiedNodeToCSEMaps(User, UpdateListener); + AddModifiedNodeToCSEMaps(User); } } @@ -5579,7 +5582,7 @@ unsigned SelectionDAG::AssignTopologicalOrder() { } } - // Visit all the nodes. As we iterate, moves nodes into sorted order, + // Visit all the nodes. As we iterate, move nodes into sorted order, // such that by the time the end is reached all nodes will be sorted. 
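The ReplaceAllUsesWith hunks above stop threading a DAGUpdateListener pointer through every call; listeners such as RAUWUpdateListener now hand the SelectionDAG to the base-class constructor and stay registered for exactly as long as they live. A minimal sketch of that self-registering listener pattern, using simplified Graph and Node stand-ins rather than the real LLVM classes:

struct Node;

struct Graph {
  struct Listener {
    Graph &G;
    Listener *Next; // intrusive chain owned by the graph
    explicit Listener(Graph &g) : G(g), Next(g.Listeners) {
      g.Listeners = this; // register on construction
    }
    virtual ~Listener() { G.Listeners = Next; } // unregister (LIFO) on destruction
    virtual void NodeDeleted(Node *) {}
  };

  Listener *Listeners = nullptr;

  void notifyDeleted(Node *N) {
    // The graph walks the whole chain itself, so no call site has to
    // forward notifications to a "downlink" listener anymore.
    for (Listener *L = Listeners; L; L = L->Next)
      L->NodeDeleted(N);
  }
};

Because a listener is live for exactly its scope, the downlink-forwarding code deleted above becomes unnecessary: nesting two listeners simply pushes both onto the chain.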
for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ++I) { SDNode *N = I; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 94cb958..8cbe818 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Constants.h" #include "llvm/CallingConv.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" #include "llvm/GlobalVariable.h" @@ -42,7 +43,6 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" @@ -51,6 +51,7 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/IntegersSubsetMapping.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -843,7 +844,7 @@ void SelectionDAGBuilder::clear() { } /// clearDanglingDebugInfo - Clear the dangling debug information -/// map. This function is seperated from the clear so that debug +/// map. This function is separated from the clear so that debug /// information that is dangling in a basic block can be properly /// resolved in a different basic block. This allows the /// SelectionDAG to resolve dangling debug information attached @@ -941,7 +942,7 @@ void SelectionDAGBuilder::visit(unsigned Opcode, const User &I) { default: llvm_unreachable("Unknown instruction type encountered!"); // Build the switch statement using the Instruction.def file. #define HANDLE_INST(NUM, OPCODE, CLASS) \ - case Instruction::OPCODE: visit##OPCODE((CLASS&)I); break; + case Instruction::OPCODE: visit##OPCODE((const CLASS&)I); break; #include "llvm/Instruction.def" } @@ -1578,17 +1579,18 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB, } else Cond = DAG.getSetCC(dl, MVT::i1, CondLHS, getValue(CB.CmpRHS), CB.CC); } else { - assert(CB.CC == ISD::SETLE && "Can handle only LE ranges now"); + assert(CB.CC == ISD::SETCC_INVALID && + "Condition is undefined for to-the-range belonging check."); const APInt& Low = cast<ConstantInt>(CB.CmpLHS)->getValue(); const APInt& High = cast<ConstantInt>(CB.CmpRHS)->getValue(); SDValue CmpOp = getValue(CB.CmpMHS); EVT VT = CmpOp.getValueType(); - - if (cast<ConstantInt>(CB.CmpLHS)->isMinValue(true)) { + + if (cast<ConstantInt>(CB.CmpLHS)->isMinValue(false)) { Cond = DAG.getSetCC(dl, MVT::i1, CmpOp, DAG.getConstant(High, VT), - ISD::SETLE); + ISD::SETULE); } else { SDValue SUB = DAG.getNode(ISD::SUB, dl, VT, CmpOp, DAG.getConstant(Low, VT)); @@ -1826,9 +1828,13 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) { MachineBasicBlock *LandingPad = FuncInfo.MBBMap[I.getSuccessor(1)]; const Value *Callee(I.getCalledValue()); + const Function *Fn = dyn_cast<Function>(Callee); if (isa<InlineAsm>(Callee)) visitInlineAsm(&I); - else + else if (Fn && Fn->isIntrinsic()) { + assert(Fn->getIntrinsicID() == Intrinsic::donothing); + // Ignore invokes to @llvm.donothing: jump directly to the next BB. 
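The visitSwitchCase change above (SETLE to SETULE, plus the SUB rebase when the low bound is not the minimum value) relies on a standard trick: membership in [Lo, Hi] can be tested with one unsigned comparison after subtracting Lo, because values below Lo wrap around to something larger than Hi - Lo. A self-contained sketch with hypothetical values:

#include <cassert>
#include <cstdint>

// x in [lo, hi]  <=>  (x - lo) <= (hi - lo) when compared as unsigned.
bool inRange(uint32_t X, uint32_t Lo, uint32_t Hi) {
  return X - Lo <= Hi - Lo; // one subtract, one unsigned compare
}

int main() {
  assert(inRange(5, 3, 9));
  assert(!inRange(2, 3, 9));  // 2 - 3 wraps to 0xFFFFFFFF, which is > 6
  assert(!inRange(10, 3, 9)); // 10 - 3 == 7, which is > 6
}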
+ } else LowerCallTo(&I, getValue(Callee), false, LandingPad); // If the value of the invoke is used outside of its defining block, make it @@ -1901,8 +1907,6 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR, const Value* SV, MachineBasicBlock *Default, MachineBasicBlock *SwitchBB) { - Case& BackCase = *(CR.Range.second-1); - // Size is the number of Cases represented by this range. size_t Size = CR.Range.second - CR.Range.first; if (Size > 3) @@ -1970,11 +1974,28 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR, } } + // Order cases by weight so the most likely case will be checked first. + BranchProbabilityInfo *BPI = FuncInfo.BPI; + if (BPI) { + for (CaseItr I = CR.Range.first, IE = CR.Range.second; I != IE; ++I) { + uint32_t IWeight = BPI->getEdgeWeight(SwitchBB->getBasicBlock(), + I->BB->getBasicBlock()); + for (CaseItr J = CR.Range.first; J < I; ++J) { + uint32_t JWeight = BPI->getEdgeWeight(SwitchBB->getBasicBlock(), + J->BB->getBasicBlock()); + if (IWeight > JWeight) + std::swap(*I, *J); + } + } + } // Rearrange the case blocks so that the last one falls through if possible. - if (NextBlock && Default != NextBlock && BackCase.BB != NextBlock) { + Case &BackCase = *(CR.Range.second-1); + if (Size > 1 && + NextBlock && Default != NextBlock && BackCase.BB != NextBlock) { // The last case block won't fall through into 'NextBlock' if we emit the // branches in this order. See if rearranging a case value would help. - for (CaseItr I = CR.Range.first, E = CR.Range.second-1; I != E; ++I) { + // We start at the bottom as it's the case with the least weight. + for (Case *I = &*(CR.Range.second-2), *E = &*CR.Range.first-1; I != E; --I){ if (I->BB == NextBlock) { std::swap(*I, BackCase); break; @@ -2006,7 +2027,7 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR, CC = ISD::SETEQ; LHS = SV; RHS = I->High; MHS = NULL; } else { - CC = ISD::SETLE; + CC = ISD::SETCC_INVALID; LHS = I->Low; MHS = SV; RHS = I->High; } @@ -2031,14 +2052,14 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR, } static inline bool areJTsAllowed(const TargetLowering &TLI) { - return !TLI.getTargetMachine().Options.DisableJumpTables && + return TLI.supportJumpTables() && (TLI.isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) || TLI.isOperationLegalOrCustom(ISD::BRIND, MVT::Other)); } static APInt ComputeRange(const APInt &First, const APInt &Last) { uint32_t BitWidth = std::max(Last.getBitWidth(), First.getBitWidth()) + 1; - APInt LastExt = Last.sext(BitWidth), FirstExt = First.sext(BitWidth); + APInt LastExt = Last.zext(BitWidth), FirstExt = First.zext(BitWidth); return (LastExt - FirstExt + 1ULL); } @@ -2104,7 +2125,7 @@ bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec &CR, const APInt &Low = cast<ConstantInt>(I->Low)->getValue(); const APInt &High = cast<ConstantInt>(I->High)->getValue(); - if (Low.sle(TEI) && TEI.sle(High)) { + if (Low.ule(TEI) && TEI.ule(High)) { DestBBs.push_back(I->BB); if (TEI==High) ++I; @@ -2261,7 +2282,7 @@ bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR, // Create a CaseBlock record representing a conditional branch to // the LHS node if the value being switched on SV is less than C. // Otherwise, branch to LHS. 
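The ordering loop added to handleSmallSwitchRange above walks the (at most three) cases and swaps heavier ones toward the front, so the most probable case is compared first at run time. A standalone sketch of the same heaviest-first ordering, assuming an illustrative CaseInfo in place of the real Case type:

#include <algorithm>
#include <cstdint>
#include <vector>

struct CaseInfo {
  int Value;       // illustrative case value
  uint32_t Weight; // edge weight from BranchProbabilityInfo
};

// Heaviest first, so the hottest case costs a single compare.
void orderByWeight(std::vector<CaseInfo> &Cases) {
  std::stable_sort(Cases.begin(), Cases.end(),
                   [](const CaseInfo &A, const CaseInfo &B) {
                     return A.Weight > B.Weight;
                   });
}

For the tiny ranges this path handles, the quadratic swap loop in the patch and a sort are interchangeable; the fall-through rearrangement that follows it then starts from the lightest case at the bottom.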
- CaseBlock CB(ISD::SETLT, SV, C, NULL, TrueBB, FalseBB, CR.CaseBB);
+ CaseBlock CB(ISD::SETULT, SV, C, NULL, TrueBB, FalseBB, CR.CaseBB);

 if (CR.CaseBB == SwitchBB)
 visitSwitchCase(CB, SwitchBB);
@@ -2333,7 +2354,7 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
 // Optimize the case where all the case values fit in a
 // word without having to subtract minValue. In this case,
 // we can optimize away the subtraction.
- if (minValue.isNonNegative() && maxValue.slt(IntPtrBits)) {
+ if (maxValue.ult(IntPtrBits)) {
 cmpRange = maxValue;
 } else {
 lowBound = minValue;
@@ -2407,57 +2428,46 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
 /// Clusterify - Transform simple list of Cases into list of CaseRange's
 size_t SelectionDAGBuilder::Clusterify(CaseVector& Cases, const SwitchInst& SI) {
- size_t numCmps = 0;
+
+ /// Use a shorter form of declaration, and also
+ /// show that we want to use CRSBuilder as Clusterifier.
+ typedef IntegersSubsetMapping<MachineBasicBlock> Clusterifier;
+
+ Clusterifier TheClusterifier;

- BranchProbabilityInfo *BPI = FuncInfo.BPI;
 // Start with "simple" cases
 for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end();
 i != e; ++i) {
 const BasicBlock *SuccBB = i.getCaseSuccessor();
 MachineBasicBlock *SMBB = FuncInfo.MBBMap[SuccBB];
- uint32_t ExtraWeight = BPI ? BPI->getEdgeWeight(SI.getParent(), SuccBB) : 0;
-
- Cases.push_back(Case(i.getCaseValue(), i.getCaseValue(),
- SMBB, ExtraWeight));
- }
- std::sort(Cases.begin(), Cases.end(), CaseCmp());
-
- // Merge case into clusters
- if (Cases.size() >= 2)
- // Must recompute end() each iteration because it may be
- // invalidated by erase if we hold on to it
- for (CaseItr I = Cases.begin(), J = llvm::next(Cases.begin());
- J != Cases.end(); ) {
- const APInt& nextValue = cast<ConstantInt>(J->Low)->getValue();
- const APInt& currentValue = cast<ConstantInt>(I->High)->getValue();
- MachineBasicBlock* nextBB = J->BB;
- MachineBasicBlock* currentBB = I->BB;
-
- // If the two neighboring cases go to the same destination, merge them
- // into a single case.
- if ((nextValue - currentValue == 1) && (currentBB == nextBB)) {
- I->High = J->High;
- J = Cases.erase(J);
-
- if (BranchProbabilityInfo *BPI = FuncInfo.BPI) {
- uint32_t CurWeight = currentBB->getBasicBlock() ?
- BPI->getEdgeWeight(SI.getParent(), currentBB->getBasicBlock()) : 16;
- uint32_t NextWeight = nextBB->getBasicBlock() ?
- BPI->getEdgeWeight(SI.getParent(), nextBB->getBasicBlock()) : 16;
-
- BPI->setEdgeWeight(SI.getParent(), currentBB->getBasicBlock(),
- CurWeight + NextWeight);
- }
- } else {
- I = J++;
- }
+ TheClusterifier.add(i.getCaseValueEx(), SMBB);
+ }
+
+ TheClusterifier.optimize();
+
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
+ size_t numCmps = 0;
+ for (Clusterifier::RangeIterator i = TheClusterifier.begin(),
+ e = TheClusterifier.end(); i != e; ++i, ++numCmps) {
+ Clusterifier::Cluster &C = *i;
+ unsigned W = 0;
+ if (BPI) {
+ W = BPI->getEdgeWeight(SI.getParent(), C.second->getBasicBlock());
+ if (!W)
+ W = 16;
+ W *= C.first.Weight;
+ BPI->setEdgeWeight(SI.getParent(), C.second->getBasicBlock(), W);
 }
- for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) {
- if (I->Low != I->High)
- // A range counts double, since it requires two compares.
- ++numCmps;
+ // FIXME: Currently works with ConstantInt based numbers.
+ // Changing it to APInt based is pretty heavy for this commit.
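Both the removed merge loop and the IntegersSubsetMapping that replaces it in Clusterify implement the same idea: cases that are adjacent in value and branch to the same block collapse into one range, so a single range test replaces a run of equality tests. A minimal sketch of that merge over a pre-sorted, disjoint case list, with an illustrative Cluster type:

#include <vector>

struct Cluster {
  long Low, High; // inclusive case-value range
  int DestBB;     // illustrative id of the destination block
};

// Input must be sorted by Low with disjoint ranges (a switch guarantees
// disjointness; sorting is done beforehand).
std::vector<Cluster> clusterify(const std::vector<Cluster> &Sorted) {
  std::vector<Cluster> Out;
  for (const Cluster &C : Sorted) {
    if (!Out.empty() && Out.back().DestBB == C.DestBB &&
        Out.back().High + 1 == C.Low)
      Out.back().High = C.High; // extend the previous cluster
    else
      Out.push_back(C);
  }
  return Out;
}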
+ Cases.push_back(Case(C.first.getLow().toConstantInt(), + C.first.getHigh().toConstantInt(), C.second, W)); + + if (C.first.getLow() != C.first.getHigh()) + // A range counts double, since it requires two compares. + ++numCmps; } return numCmps; @@ -2804,7 +2814,7 @@ void SelectionDAGBuilder::visitExtractElement(const User &I) { } // Utility for visitShuffleVector - Return true if every element in Mask, -// begining from position Pos and ending in Pos+Size, falls within the +// beginning from position Pos and ending in Pos+Size, falls within the // specified sequential range [L, L+Pos). or is undef. static bool isSequentialInRange(const SmallVectorImpl<int> &Mask, unsigned Pos, unsigned Size, int Low) { @@ -4914,6 +4924,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::pow: visitPow(I); return 0; + case Intrinsic::fabs: + setValue(&I, DAG.getNode(ISD::FABS, dl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)))); + return 0; case Intrinsic::fma: setValue(&I, DAG.getNode(ISD::FMA, dl, getValue(I.getArgOperand(0)).getValueType(), @@ -4921,6 +4936,29 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { getValue(I.getArgOperand(1)), getValue(I.getArgOperand(2)))); return 0; + case Intrinsic::fmuladd: { + EVT VT = TLI.getValueType(I.getType()); + if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict && + TLI.isOperationLegal(ISD::FMA, VT) && + TLI.isFMAFasterThanMulAndAdd(VT)){ + setValue(&I, DAG.getNode(ISD::FMA, dl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)), + getValue(I.getArgOperand(2)))); + } else { + SDValue Mul = DAG.getNode(ISD::FMUL, dl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1))); + SDValue Add = DAG.getNode(ISD::FADD, dl, + getValue(I.getArgOperand(0)).getValueType(), + Mul, + getValue(I.getArgOperand(2))); + setValue(&I, Add); + } + return 0; + } case Intrinsic::convert_to_fp16: setValue(&I, DAG.getNode(ISD::FP32_TO_FP16, dl, MVT::i16, getValue(I.getArgOperand(0)))); @@ -5050,7 +5088,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::gcroot: if (GFI) { - const Value *Alloca = I.getArgOperand(0); + const Value *Alloca = I.getArgOperand(0)->stripPointerCasts(); const Constant *TypeMap = cast<Constant>(I.getArgOperand(1)); FrameIndexSDNode *FI = cast<FrameIndexSDNode>(getValue(Alloca).getNode()); @@ -5077,16 +5115,21 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return 0; } TargetLowering::ArgListTy Args; - std::pair<SDValue, SDValue> Result = - TLI.LowerCallTo(getRoot(), I.getType(), + TargetLowering:: + CallLoweringInfo CLI(getRoot(), I.getType(), false, false, false, false, 0, CallingConv::C, /*isTailCall=*/false, /*doesNotRet=*/false, /*isReturnValueUsed=*/true, DAG.getExternalSymbol(TrapFuncName.data(), TLI.getPointerTy()), Args, DAG, getCurDebugLoc()); + std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI); DAG.setRoot(Result.second); return 0; } + case Intrinsic::debugtrap: { + DAG.setRoot(DAG.getNode(ISD::DEBUGTRAP, dl,MVT::Other, getRoot())); + return 0; + } case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: case Intrinsic::usub_with_overflow: @@ -5139,6 +5182,9 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::lifetime_end: // Discard region information. 
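The new Intrinsic::fmuladd case above chooses between one fused ISD::FMA node and an explicit FMUL followed by FADD, based on the fp-contraction policy and whether the target's FMA beats the separate pair. A scalar sketch of the same decision; AllowContract and FmaIsFast are illustrative stand-ins for TM.Options.AllowFPOpFusion and isFMAFasterThanMulAndAdd:

#include <cmath>

double lowerFMulAdd(double A, double B, double C,
                    bool AllowContract, bool FmaIsFast) {
  if (AllowContract && FmaIsFast)
    return std::fma(A, B, C); // single rounding, like ISD::FMA
  return A * B + C;           // two roundings: FMUL, then FADD
}

The observable difference is rounding: the fused form rounds once, the fallback rounds after the multiply and again after the add, which is why contraction stays opt-in rather than being applied unconditionally.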
return 0; + case Intrinsic::donothing: + // ignore + return 0; } } @@ -5157,14 +5203,13 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, // Check whether the function can return without sret-demotion. SmallVector<ISD::OutputArg, 4> Outs; - SmallVector<uint64_t, 4> Offsets; GetReturnInfo(RetTy, CS.getAttributes().getRetAttributes(), - Outs, TLI, &Offsets); + Outs, TLI); bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(), - DAG.getMachineFunction(), - FTy->isVarArg(), Outs, - FTy->getContext()); + DAG.getMachineFunction(), + FTy->isVarArg(), Outs, + FTy->getContext()); SDValue DemoteStackSlot; int DemoteStackIdx = -100; @@ -5247,16 +5292,10 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, if (isTailCall && TM.Options.EnableFastISel) isTailCall = false; - std::pair<SDValue,SDValue> Result = - TLI.LowerCallTo(getRoot(), RetTy, - CS.paramHasAttr(0, Attribute::SExt), - CS.paramHasAttr(0, Attribute::ZExt), FTy->isVarArg(), - CS.paramHasAttr(0, Attribute::InReg), FTy->getNumParams(), - CS.getCallingConv(), - isTailCall, - CS.doesNotReturn(), - !CS.getInstruction()->use_empty(), - Callee, Args, DAG, getCurDebugLoc()); + TargetLowering:: + CallLoweringInfo CLI(getRoot(), RetTy, FTy, isTailCall, Callee, Args, DAG, + getCurDebugLoc(), CS); + std::pair<SDValue,SDValue> Result = TLI.LowerCallTo(CLI); assert((isTailCall || Result.second.getNode()) && "Non-null chain expected with non-tail call!"); assert((Result.second.getNode() || !Result.first.getNode()) && @@ -5272,7 +5311,13 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, ComputeValueVTs(TLI, PtrRetTy, PVTs); assert(PVTs.size() == 1 && "Pointers should fit in one register"); EVT PtrVT = PVTs[0]; - unsigned NumValues = Outs.size(); + + SmallVector<EVT, 4> RetTys; + SmallVector<uint64_t, 4> Offsets; + RetTy = FTy->getReturnType(); + ComputeValueVTs(TLI, RetTy, RetTys, &Offsets); + + unsigned NumValues = RetTys.size(); SmallVector<SDValue, 4> Values(NumValues); SmallVector<SDValue, 4> Chains(NumValues); @@ -5280,8 +5325,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, SDValue Add = DAG.getNode(ISD::ADD, getCurDebugLoc(), PtrVT, DemoteStackSlot, DAG.getConstant(Offsets[i], PtrVT)); - SDValue L = DAG.getLoad(Outs[i].VT, getCurDebugLoc(), Result.second, - Add, + SDValue L = DAG.getLoad(RetTys[i], getCurDebugLoc(), Result.second, Add, MachinePointerInfo::getFixedStack(DemoteStackIdx, Offsets[i]), false, false, false, 1); Values[i] = L; @@ -5292,30 +5336,10 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, MVT::Other, &Chains[0], NumValues); PendingLoads.push_back(Chain); - // Collect the legal value parts into potentially illegal values - // that correspond to the original function's return values. 
- SmallVector<EVT, 4> RetTys; - RetTy = FTy->getReturnType(); - ComputeValueVTs(TLI, RetTy, RetTys); - ISD::NodeType AssertOp = ISD::DELETED_NODE; - SmallVector<SDValue, 4> ReturnValues; - unsigned CurReg = 0; - for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { - EVT VT = RetTys[I]; - EVT RegisterVT = TLI.getRegisterType(RetTy->getContext(), VT); - unsigned NumRegs = TLI.getNumRegisters(RetTy->getContext(), VT); - - SDValue ReturnValue = - getCopyFromParts(DAG, getCurDebugLoc(), &Values[CurReg], NumRegs, - RegisterVT, VT, AssertOp); - ReturnValues.push_back(ReturnValue); - CurReg += NumRegs; - } - setValue(CS.getInstruction(), DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(), DAG.getVTList(&RetTys[0], RetTys.size()), - &ReturnValues[0], ReturnValues.size())); + &Values[0], Values.size())); } // Assign order to nodes here. If the call does not produce a result, it won't @@ -5952,11 +5976,11 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { SDISelAsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput]; if (OpInfo.ConstraintVT != Input.ConstraintVT) { - std::pair<unsigned, const TargetRegisterClass*> MatchRC = - TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode, + std::pair<unsigned, const TargetRegisterClass*> MatchRC = + TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode, OpInfo.ConstraintVT); - std::pair<unsigned, const TargetRegisterClass*> InputRC = - TLI.getRegForInlineAsmConstraint(Input.ConstraintCode, + std::pair<unsigned, const TargetRegisterClass*> InputRC = + TLI.getRegForInlineAsmConstraint(Input.ConstraintCode, Input.ConstraintVT); if ((OpInfo.ConstraintVT.isInteger() != Input.ConstraintVT.isInteger()) || @@ -6225,8 +6249,15 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { assert((OpInfo.ConstraintType == TargetLowering::C_RegisterClass || OpInfo.ConstraintType == TargetLowering::C_Register) && "Unknown constraint type!"); - assert(!OpInfo.isIndirect && - "Don't know how to handle indirect register inputs yet!"); + + // TODO: Support this. + if (OpInfo.isIndirect) { + LLVMContext &Ctx = *DAG.getContext(); + Ctx.emitError(CS.getInstruction(), + "Don't know how to handle indirect register inputs yet " + "for constraint '" + Twine(OpInfo.ConstraintCode) + "'"); + break; + } // Copy the input into the appropriate registers. if (OpInfo.AssignedRegs.Regs.empty()) { @@ -6369,24 +6400,18 @@ void SelectionDAGBuilder::visitVACopy(const CallInst &I) { /// FIXME: When all targets are /// migrated to using LowerCall, this hook should be integrated into SDISel. std::pair<SDValue, SDValue> -TargetLowering::LowerCallTo(SDValue Chain, Type *RetTy, - bool RetSExt, bool RetZExt, bool isVarArg, - bool isInreg, unsigned NumFixedArgs, - CallingConv::ID CallConv, bool isTailCall, - bool doesNotRet, bool isReturnValueUsed, - SDValue Callee, - ArgListTy &Args, SelectionDAG &DAG, - DebugLoc dl) const { +TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // Handle all of the outgoing arguments. 
- SmallVector<ISD::OutputArg, 32> Outs; - SmallVector<SDValue, 32> OutVals; + CLI.Outs.clear(); + CLI.OutVals.clear(); + ArgListTy &Args = CLI.Args; for (unsigned i = 0, e = Args.size(); i != e; ++i) { SmallVector<EVT, 4> ValueVTs; ComputeValueVTs(*this, Args[i].Ty, ValueVTs); for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues; ++Value) { EVT VT = ValueVTs[Value]; - Type *ArgTy = VT.getTypeForEVT(RetTy->getContext()); + Type *ArgTy = VT.getTypeForEVT(CLI.RetTy->getContext()); SDValue Op = SDValue(Args[i].Node.getNode(), Args[i].Node.getResNo() + Value); ISD::ArgFlagsTy Flags; @@ -6419,8 +6444,8 @@ TargetLowering::LowerCallTo(SDValue Chain, Type *RetTy, Flags.setNest(); Flags.setOrigAlign(OriginalAlignment); - EVT PartVT = getRegisterType(RetTy->getContext(), VT); - unsigned NumParts = getNumRegisters(RetTy->getContext(), VT); + EVT PartVT = getRegisterType(CLI.RetTy->getContext(), VT); + unsigned NumParts = getNumRegisters(CLI.RetTy->getContext(), VT); SmallVector<SDValue, 4> Parts(NumParts); ISD::NodeType ExtendKind = ISD::ANY_EXTEND; @@ -6429,89 +6454,88 @@ TargetLowering::LowerCallTo(SDValue Chain, Type *RetTy, else if (Args[i].isZExt) ExtendKind = ISD::ZERO_EXTEND; - getCopyToParts(DAG, dl, Op, &Parts[0], NumParts, + getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts, PartVT, ExtendKind); for (unsigned j = 0; j != NumParts; ++j) { // if it isn't first piece, alignment must be 1 ISD::OutputArg MyFlags(Flags, Parts[j].getValueType(), - i < NumFixedArgs); + i < CLI.NumFixedArgs); if (NumParts > 1 && j == 0) MyFlags.Flags.setSplit(); else if (j != 0) MyFlags.Flags.setOrigAlign(1); - Outs.push_back(MyFlags); - OutVals.push_back(Parts[j]); + CLI.Outs.push_back(MyFlags); + CLI.OutVals.push_back(Parts[j]); } } } // Handle the incoming return values from the call. - SmallVector<ISD::InputArg, 32> Ins; + CLI.Ins.clear(); SmallVector<EVT, 4> RetTys; - ComputeValueVTs(*this, RetTy, RetTys); + ComputeValueVTs(*this, CLI.RetTy, RetTys); for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { EVT VT = RetTys[I]; - EVT RegisterVT = getRegisterType(RetTy->getContext(), VT); - unsigned NumRegs = getNumRegisters(RetTy->getContext(), VT); + EVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT); + unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT); for (unsigned i = 0; i != NumRegs; ++i) { ISD::InputArg MyFlags; MyFlags.VT = RegisterVT.getSimpleVT(); - MyFlags.Used = isReturnValueUsed; - if (RetSExt) + MyFlags.Used = CLI.IsReturnValueUsed; + if (CLI.RetSExt) MyFlags.Flags.setSExt(); - if (RetZExt) + if (CLI.RetZExt) MyFlags.Flags.setZExt(); - if (isInreg) + if (CLI.IsInReg) MyFlags.Flags.setInReg(); - Ins.push_back(MyFlags); + CLI.Ins.push_back(MyFlags); } } SmallVector<SDValue, 4> InVals; - Chain = LowerCall(Chain, Callee, CallConv, isVarArg, doesNotRet, isTailCall, - Outs, OutVals, Ins, dl, DAG, InVals); + CLI.Chain = LowerCall(CLI, InVals); // Verify that the target's LowerCall behaved as expected. 
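The LowerCallTo conversion above is a parameter-object refactor: roughly a dozen positional flags, plus the Outs/OutVals/Ins scratch vectors, move into a single CallLoweringInfo that both the generic code and each target's LowerCall read from. A generic sketch of the shape, with illustrative field names rather than the LLVM definition:

#include <string>
#include <vector>

struct CallInfo {
  std::string Callee;
  bool IsVarArg = false;
  bool IsTailCall = false;
  bool RetSExt = false;
  unsigned NumFixedArgs = 0;
  std::vector<int> Outs; // scratch state the lowering helper fills in
};

int lowerCall(CallInfo &CLI) {
  CLI.Outs.clear(); // the helper owns its scratch vectors via the struct
  return CLI.IsTailCall ? 0 : static_cast<int>(CLI.NumFixedArgs);
}

int main() {
  CallInfo CLI;          // call sites name the fields they care about
  CLI.Callee = "memcpy"; // instead of lining up ten positional booleans
  CLI.IsTailCall = false;
  return lowerCall(CLI);
}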
- assert(Chain.getNode() && Chain.getValueType() == MVT::Other && + assert(CLI.Chain.getNode() && CLI.Chain.getValueType() == MVT::Other && "LowerCall didn't return a valid chain!"); - assert((!isTailCall || InVals.empty()) && + assert((!CLI.IsTailCall || InVals.empty()) && "LowerCall emitted a return value for a tail call!"); - assert((isTailCall || InVals.size() == Ins.size()) && + assert((CLI.IsTailCall || InVals.size() == CLI.Ins.size()) && "LowerCall didn't emit the correct number of values!"); // For a tail call, the return value is merely live-out and there aren't // any nodes in the DAG representing it. Return a special value to // indicate that a tail call has been emitted and no more Instructions // should be processed in the current block. - if (isTailCall) { - DAG.setRoot(Chain); + if (CLI.IsTailCall) { + CLI.DAG.setRoot(CLI.Chain); return std::make_pair(SDValue(), SDValue()); } - DEBUG(for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + DEBUG(for (unsigned i = 0, e = CLI.Ins.size(); i != e; ++i) { assert(InVals[i].getNode() && "LowerCall emitted a null value!"); - assert(EVT(Ins[i].VT) == InVals[i].getValueType() && + assert(EVT(CLI.Ins[i].VT) == InVals[i].getValueType() && "LowerCall emitted a value with the wrong type!"); }); // Collect the legal value parts into potentially illegal values // that correspond to the original function's return values. ISD::NodeType AssertOp = ISD::DELETED_NODE; - if (RetSExt) + if (CLI.RetSExt) AssertOp = ISD::AssertSext; - else if (RetZExt) + else if (CLI.RetZExt) AssertOp = ISD::AssertZext; SmallVector<SDValue, 4> ReturnValues; unsigned CurReg = 0; for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { EVT VT = RetTys[I]; - EVT RegisterVT = getRegisterType(RetTy->getContext(), VT); - unsigned NumRegs = getNumRegisters(RetTy->getContext(), VT); + EVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT); + unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT); - ReturnValues.push_back(getCopyFromParts(DAG, dl, &InVals[CurReg], + ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg], NumRegs, RegisterVT, VT, AssertOp)); CurReg += NumRegs; @@ -6521,12 +6545,12 @@ TargetLowering::LowerCallTo(SDValue Chain, Type *RetTy, // such a node, so we just return a null return value in that case. In // that case, nothing will actually look at the value. if (ReturnValues.empty()) - return std::make_pair(SDValue(), Chain); + return std::make_pair(SDValue(), CLI.Chain); - SDValue Res = DAG.getNode(ISD::MERGE_VALUES, dl, - DAG.getVTList(&RetTys[0], RetTys.size()), + SDValue Res = CLI.DAG.getNode(ISD::MERGE_VALUES, CLI.DL, + CLI.DAG.getVTList(&RetTys[0], RetTys.size()), &ReturnValues[0], ReturnValues.size()); - return std::make_pair(Res, Chain); + return std::make_pair(Res, CLI.Chain); } void TargetLowering::LowerOperationWrapper(SDNode *N, @@ -6746,7 +6770,7 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) { // Note down frame index. 
if (FrameIndexSDNode *FI = - dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode())) + dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode())) FuncInfo->setArgumentFrameIndex(I, FI->getIndex()); SDValue Res = DAG.getMergeValues(&ArgValues[0], NumValues, diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 8393b41..d0fde6f 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -180,17 +180,6 @@ private: typedef std::vector<CaseRec> CaseRecVector; - /// The comparison function for sorting the switch case values in the vector. - /// WARNING: Case ranges should be disjoint! - struct CaseCmp { - bool operator()(const Case &C1, const Case &C2) { - assert(isa<ConstantInt>(C1.Low) && isa<ConstantInt>(C2.High)); - const ConstantInt* CI1 = cast<const ConstantInt>(C1.Low); - const ConstantInt* CI2 = cast<const ConstantInt>(C2.High); - return CI1->getValue().slt(CI2->getValue()); - } - }; - struct CaseBitsCmp { bool operator()(const CaseBits &C1, const CaseBits &C2) { return C1.Bits > C2.Bits; @@ -351,7 +340,7 @@ public: void clear(); /// clearDanglingDebugInfo - Clear the dangling debug information - /// map. This function is seperated from the clear so that debug + /// map. This function is separated from the clear so that debug /// information that is dangling in a basic block can be properly /// resolved in a different basic block. This allows the /// SelectionDAG to resolve dangling debug information attached diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index f981afb..9fc225f 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "ScheduleDAGSDNodes.h" +#include "llvm/DebugInfo.h" #include "llvm/Function.h" #include "llvm/Intrinsics.h" #include "llvm/Assembly/Writer.h" @@ -19,7 +20,6 @@ #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetMachine.h" @@ -265,6 +265,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::STACKSAVE: return "stacksave"; case ISD::STACKRESTORE: return "stackrestore"; case ISD::TRAP: return "trap"; + case ISD::DEBUGTRAP: return "debugtrap"; // Bit manipulation case ISD::BSWAP: return "bswap"; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 605509b..287c679 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -14,12 +14,8 @@ #define DEBUG_TYPE "isel" #include "ScheduleDAGSDNodes.h" #include "SelectionDAGBuilder.h" -#include "llvm/CodeGen/FunctionLoweringInfo.h" -#include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/BranchProbabilityInfo.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/Function.h" #include "llvm/InlineAsm.h" #include "llvm/Instructions.h" @@ -27,7 +23,10 @@ #include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" #include "llvm/Module.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include 
"llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GCStrategy.h" #include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -38,6 +37,7 @@ #include "llvm/CodeGen/ScheduleHazardRecognizer.h" #include "llvm/CodeGen/SchedulerRegistry.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetInstrInfo.h" @@ -263,8 +263,6 @@ void TargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, // SelectionDAGISel code //===----------------------------------------------------------------------===// -void SelectionDAGISel::ISelUpdater::anchor() { } - SelectionDAGISel::SelectionDAGISel(const TargetMachine &tm, CodeGenOpt::Level OL) : MachineFunctionPass(ID), TM(tm), TLI(*tm.getTargetLowering()), @@ -451,9 +449,9 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { } } } - done:; } + done: // Determine if there is a call to setjmp in the machine function. MF->setExposesReturnsTwice(Fn.callsFunctionThatReturnsTwice()); @@ -468,8 +466,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { // If To is also scheduled to be replaced, find what its ultimate // replacement is. for (;;) { - DenseMap<unsigned, unsigned>::iterator J = - FuncInfo->RegFixups.find(To); + DenseMap<unsigned, unsigned>::iterator J = FuncInfo->RegFixups.find(To); if (J == E) break; To = J->second; } @@ -703,6 +700,25 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { CurDAG->clear(); } +namespace { +/// ISelUpdater - helper class to handle updates of the instruction selection +/// graph. +class ISelUpdater : public SelectionDAG::DAGUpdateListener { + SelectionDAG::allnodes_iterator &ISelPosition; +public: + ISelUpdater(SelectionDAG &DAG, SelectionDAG::allnodes_iterator &isp) + : SelectionDAG::DAGUpdateListener(DAG), ISelPosition(isp) {} + + /// NodeDeleted - Handle nodes deleted from the graph. If the node being + /// deleted is the current ISelPosition node, update ISelPosition. + /// + virtual void NodeDeleted(SDNode *N, SDNode *E) { + if (ISelPosition == SelectionDAG::allnodes_iterator(N)) + ++ISelPosition; + } +}; +} // end anonymous namespace + void SelectionDAGISel::DoInstructionSelection() { DEBUG(errs() << "===== Instruction selection begins: BB#" << FuncInfo->MBB->getNumber() @@ -719,9 +735,13 @@ void SelectionDAGISel::DoInstructionSelection() { // a reference to the root node, preventing it from being deleted, // and tracking any changes of the root. HandleSDNode Dummy(CurDAG->getRoot()); - ISelPosition = SelectionDAG::allnodes_iterator(CurDAG->getRoot().getNode()); + SelectionDAG::allnodes_iterator ISelPosition (CurDAG->getRoot().getNode()); ++ISelPosition; + // Make sure that ISelPosition gets properly updated when nodes are deleted + // in calls made from this function. + ISelUpdater ISU(*CurDAG, ISelPosition); + // The AllNodes list is now topological-sorted. Visit the // nodes by starting at the end of the list (the root of the // graph) and preceding back toward the beginning (the entry @@ -748,10 +768,8 @@ void SelectionDAGISel::DoInstructionSelection() { // If after the replacement this node is not used any more, // remove this dead node. - if (Node->use_empty()) { // Don't delete EntryToken, etc. 
- ISelUpdater ISU(ISelPosition); - CurDAG->RemoveDeadNode(Node, &ISU); - } + if (Node->use_empty()) // Don't delete EntryToken, etc. + CurDAG->RemoveDeadNode(Node); } CurDAG->setRoot(Dummy.getValue()); @@ -1680,8 +1698,6 @@ UpdateChainsAndGlue(SDNode *NodeToMatch, SDValue InputChain, bool isMorphNodeTo) { SmallVector<SDNode*, 4> NowDeadNodes; - ISelUpdater ISU(ISelPosition); - // Now that all the normal results are replaced, we replace the chain and // glue results if present. if (!ChainNodesMatched.empty()) { @@ -1705,7 +1721,7 @@ UpdateChainsAndGlue(SDNode *NodeToMatch, SDValue InputChain, if (ChainVal.getValueType() == MVT::Glue) ChainVal = ChainVal.getValue(ChainVal->getNumValues()-2); assert(ChainVal.getValueType() == MVT::Other && "Not a chain?"); - CurDAG->ReplaceAllUsesOfValueWith(ChainVal, InputChain, &ISU); + CurDAG->ReplaceAllUsesOfValueWith(ChainVal, InputChain); // If the node became dead and we haven't already seen it, delete it. if (ChainNode->use_empty() && @@ -1728,7 +1744,7 @@ UpdateChainsAndGlue(SDNode *NodeToMatch, SDValue InputChain, assert(FRN->getValueType(FRN->getNumValues()-1) == MVT::Glue && "Doesn't have a glue result"); CurDAG->ReplaceAllUsesOfValueWith(SDValue(FRN, FRN->getNumValues()-1), - InputGlue, &ISU); + InputGlue); // If the node became dead and we haven't already seen it, delete it. if (FRN->use_empty() && @@ -1738,7 +1754,7 @@ UpdateChainsAndGlue(SDNode *NodeToMatch, SDValue InputChain, } if (!NowDeadNodes.empty()) - CurDAG->RemoveDeadNodes(NowDeadNodes, &ISU); + CurDAG->RemoveDeadNodes(NowDeadNodes); DEBUG(errs() << "ISEL: Match complete!\n"); } @@ -1759,7 +1775,7 @@ enum ChainResult { /// The walk we do here is guaranteed to be small because we quickly get down to /// already selected nodes "below" us. static ChainResult -WalkChainUsers(SDNode *ChainedNode, +WalkChainUsers(const SDNode *ChainedNode, SmallVectorImpl<SDNode*> &ChainedNodesInPattern, SmallVectorImpl<SDNode*> &InteriorChainedNodes) { ChainResult Result = CR_Simple; @@ -1992,14 +2008,14 @@ CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex, /// CheckPatternPredicate - Implements OP_CheckPatternPredicate. LLVM_ATTRIBUTE_ALWAYS_INLINE static bool CheckPatternPredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex, - SelectionDAGISel &SDISel) { + const SelectionDAGISel &SDISel) { return SDISel.CheckPatternPredicate(MatcherTable[MatcherIndex++]); } /// CheckNodePredicate - Implements OP_CheckNodePredicate. 
LLVM_ATTRIBUTE_ALWAYS_INLINE static bool CheckNodePredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex, - SelectionDAGISel &SDISel, SDNode *N) { + const SelectionDAGISel &SDISel, SDNode *N) { return SDISel.CheckNodePredicate(N, MatcherTable[MatcherIndex++]); } @@ -2062,7 +2078,7 @@ CheckInteger(const unsigned char *MatcherTable, unsigned &MatcherIndex, LLVM_ATTRIBUTE_ALWAYS_INLINE static bool CheckAndImm(const unsigned char *MatcherTable, unsigned &MatcherIndex, - SDValue N, SelectionDAGISel &SDISel) { + SDValue N, const SelectionDAGISel &SDISel) { int64_t Val = MatcherTable[MatcherIndex++]; if (Val & 128) Val = GetVBR(Val, MatcherTable, MatcherIndex); @@ -2075,7 +2091,7 @@ CheckAndImm(const unsigned char *MatcherTable, unsigned &MatcherIndex, LLVM_ATTRIBUTE_ALWAYS_INLINE static bool CheckOrImm(const unsigned char *MatcherTable, unsigned &MatcherIndex, - SDValue N, SelectionDAGISel &SDISel) { + SDValue N, const SelectionDAGISel &SDISel) { int64_t Val = MatcherTable[MatcherIndex++]; if (Val & 128) Val = GetVBR(Val, MatcherTable, MatcherIndex); @@ -2094,7 +2110,8 @@ CheckOrImm(const unsigned char *MatcherTable, unsigned &MatcherIndex, /// MatcherIndex to continue with. static unsigned IsPredicateKnownToFail(const unsigned char *Table, unsigned Index, SDValue N, - bool &Result, SelectionDAGISel &SDISel, + bool &Result, + const SelectionDAGISel &SDISel, SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) { switch (Table[Index++]) { default: @@ -2759,9 +2776,14 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable, (SDNode*) 0)); } - } else { + } else if (NodeToMatch->getOpcode() != ISD::DELETED_NODE) { Res = MorphNode(NodeToMatch, TargetOpc, VTList, Ops.data(), Ops.size(), EmitNodeInfo); + } else { + // NodeToMatch was eliminated by CSE when the target changed the DAG. + // We will visit the equivalent node later. 
+ DEBUG(dbgs() << "Node was eliminated by CSE\n"); + return 0; } // If the node had chain/glue results, update our notion of the current @@ -2959,6 +2981,7 @@ void SelectionDAGISel::CannotYetSelect(SDNode *N) { N->getOpcode() != ISD::INTRINSIC_WO_CHAIN && N->getOpcode() != ISD::INTRINSIC_VOID) { N->printrFull(Msg, CurDAG); + Msg << "\nIn function: " << MF->getFunction()->getName(); } else { bool HasInputChain = N->getOperand(0).getValueType() == MVT::Other; unsigned iid = diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp index 6cde05a..173ffac 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp @@ -13,13 +13,13 @@ #include "ScheduleDAGSDNodes.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/Function.h" #include "llvm/Assembly/Writer.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Support/Debug.h" diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index e341e15..dff9b2c 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -25,6 +25,7 @@ #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" @@ -32,13 +33,6 @@ #include <cctype> using namespace llvm; -/// We are in the process of implementing a new TypeLegalization action -/// - the promotion of vector elements. This feature is disabled by default -/// and only enabled using this flag. -static cl::opt<bool> -AllowPromoteIntElem("promote-elements", cl::Hidden, cl::init(true), - cl::desc("Allow promotion of integer vector element types")); - /// InitLibcallNames - Set default libcall names. /// static void InitLibcallNames(const char **Names) { @@ -521,8 +515,7 @@ static void InitCmpLibcallCCs(ISD::CondCode *CCs) { /// NOTE: The constructor takes ownership of TLOF. TargetLowering::TargetLowering(const TargetMachine &tm, const TargetLoweringObjectFile *tlof) - : TM(tm), TD(TM.getTargetData()), TLOF(*tlof), - mayPromoteElements(AllowPromoteIntElem) { + : TM(tm), TD(TM.getTargetData()), TLOF(*tlof) { // All operations default to being supported. 
memset(OpActions, 0, sizeof(OpActions)); memset(LoadExtActions, 0, sizeof(LoadExtActions)); @@ -604,6 +597,7 @@ TargetLowering::TargetLowering(const TargetMachine &tm, IntDivIsCheap = false; Pow2DivIsCheap = false; JumpIsExpensive = false; + predictableSelectIsExpensive = false; StackPointerRegisterToSaveRestore = 0; ExceptionPointerRegister = 0; ExceptionSelectorRegister = 0; @@ -618,6 +612,7 @@ TargetLowering::TargetLowering(const TargetMachine &tm, MinStackArgumentAlignment = 1; ShouldFoldAtomicFences = false; InsertFencesForAtomic = false; + SupportJumpTables = true; InitLibcallNames(LibcallRoutineNames); InitCmpLibcallCCs(CmpLibcallCCs); @@ -708,42 +703,34 @@ bool TargetLowering::isLegalRC(const TargetRegisterClass *RC) const { return false; } -/// hasLegalSuperRegRegClasses - Return true if the specified register class -/// has one or more super-reg register classes that are legal. -bool -TargetLowering::hasLegalSuperRegRegClasses(const TargetRegisterClass *RC) const{ - if (*RC->superregclasses_begin() == 0) - return false; - for (TargetRegisterInfo::regclass_iterator I = RC->superregclasses_begin(), - E = RC->superregclasses_end(); I != E; ++I) { - const TargetRegisterClass *RRC = *I; - if (isLegalRC(RRC)) - return true; - } - return false; -} - /// findRepresentativeClass - Return the largest legal super-reg register class /// of the register class for the specified type and its associated "cost". std::pair<const TargetRegisterClass*, uint8_t> TargetLowering::findRepresentativeClass(EVT VT) const { + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); const TargetRegisterClass *RC = RegClassForVT[VT.getSimpleVT().SimpleTy]; if (!RC) return std::make_pair(RC, 0); + + // Compute the set of all super-register classes. + BitVector SuperRegRC(TRI->getNumRegClasses()); + for (SuperRegClassIterator RCI(RC, TRI); RCI.isValid(); ++RCI) + SuperRegRC.setBitsInMask(RCI.getMask()); + + // Find the first legal register class with the largest spill size. const TargetRegisterClass *BestRC = RC; - for (TargetRegisterInfo::regclass_iterator I = RC->superregclasses_begin(), - E = RC->superregclasses_end(); I != E; ++I) { - const TargetRegisterClass *RRC = *I; - if (RRC->isASubClass() || !isLegalRC(RRC)) + for (int i = SuperRegRC.find_first(); i >= 0; i = SuperRegRC.find_next(i)) { + const TargetRegisterClass *SuperRC = TRI->getRegClass(i); + // We want the largest possible spill size. + if (SuperRC->getSize() <= BestRC->getSize()) + continue; + if (!isLegalRC(SuperRC)) continue; - if (!hasLegalSuperRegRegClasses(RRC)) - return std::make_pair(RRC, 1); - BestRC = RRC; + BestRC = SuperRC; } return std::make_pair(BestRC, 1); } - /// computeRegisterProperties - Once all of the register classes are added, /// this allows us to compute derived properties we expose. void TargetLowering::computeRegisterProperties() { @@ -835,11 +822,8 @@ void TargetLowering::computeRegisterProperties() { unsigned NElts = VT.getVectorNumElements(); if (NElts != 1) { bool IsLegalWiderType = false; - // If we allow the promotion of vector elements using a flag, - // then return TypePromoteInteger on vector elements. // First try to promote the elements of integer vectors. If no legal // promotion was found, fallback to the widen-vector method. 
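The findRepresentativeClass rewrite above first collects every super-register class into a BitVector via the register-class masks, then scans the set bits for the largest class that is still legal. A small sketch of that collect-then-scan shape, using plain vectors in place of TargetRegisterInfo:

#include <cstddef>
#include <vector>

// Pick the largest legal candidate; Start is the fallback when nothing
// beats it. Candidate/SpillSize/Legal stand in for the super-class bit
// set, getSize(), and isLegalRC().
std::size_t pickLargestLegal(const std::vector<bool> &Candidate,
                             const std::vector<unsigned> &SpillSize,
                             const std::vector<bool> &Legal,
                             std::size_t Start) {
  std::size_t Best = Start;
  for (std::size_t i = 0; i < Candidate.size(); ++i) {
    // Want a strictly larger spill size, and the class must be legal.
    if (!Candidate[i] || SpillSize[i] <= SpillSize[Best] || !Legal[i])
      continue;
    Best = i;
  }
  return Best;
}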
- if (mayPromoteElements) for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) { EVT SVT = (MVT::SimpleValueType)nVT; // Promote vectors of integers to vectors with the same number @@ -940,9 +924,12 @@ unsigned TargetLowering::getVectorTypeBreakdown(LLVMContext &Context, EVT VT, unsigned NumElts = VT.getVectorNumElements(); // If there is a wider vector type with the same element type as this one, - // we should widen to that legal vector type. This handles things like - // <2 x float> -> <4 x float>. - if (NumElts != 1 && getTypeAction(Context, VT) == TypeWidenVector) { + // or a promoted vector type that has the same number of elements which + // are wider, then we should convert to that legal vector type. + // This handles things like <2 x float> -> <4 x float> and + // <4 x i1> -> <4 x i32>. + LegalizeTypeAction TA = getTypeAction(Context, VT); + if (NumElts != 1 && (TA == TypeWidenVector || TA == TypePromoteInteger)) { RegisterVT = getTypeToTransformTo(Context, VT); if (isTypeLegal(RegisterVT)) { IntermediateVT = RegisterVT; @@ -1000,13 +987,11 @@ unsigned TargetLowering::getVectorTypeBreakdown(LLVMContext &Context, EVT VT, /// TODO: Move this out of TargetLowering.cpp. void llvm::GetReturnInfo(Type* ReturnType, Attributes attr, SmallVectorImpl<ISD::OutputArg> &Outs, - const TargetLowering &TLI, - SmallVectorImpl<uint64_t> *Offsets) { + const TargetLowering &TLI) { SmallVector<EVT, 4> ValueVTs; ComputeValueVTs(TLI, ReturnType, ValueVTs); unsigned NumValues = ValueVTs.size(); if (NumValues == 0) return; - unsigned Offset = 0; for (unsigned j = 0, f = NumValues; j != f; ++j) { EVT VT = ValueVTs[j]; @@ -1029,8 +1014,6 @@ void llvm::GetReturnInfo(Type* ReturnType, Attributes attr, unsigned NumParts = TLI.getNumRegisters(ReturnType->getContext(), VT); EVT PartVT = TLI.getRegisterType(ReturnType->getContext(), VT); - unsigned PartSize = TLI.getTargetData()->getTypeAllocSize( - PartVT.getTypeForEVT(ReturnType->getContext())); // 'inreg' on function refers to return value ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy(); @@ -1045,10 +1028,6 @@ void llvm::GetReturnInfo(Type* ReturnType, Attributes attr, for (unsigned i = 0; i < NumParts; ++i) { Outs.push_back(ISD::OutputArg(Flags, PartVT, /*isFixed=*/true)); - if (Offsets) { - Offsets->push_back(Offset); - Offset += PartSize; - } } } } @@ -2019,7 +1998,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } } - // Make sure we're not loosing bits from the constant. + // Make sure we're not losing bits from the constant. if (MinBits < C1.getBitWidth() && MinBits > C1.getActiveBits()) { EVT MinVT = EVT::getIntegerVT(*DAG.getContext(), MinBits); if (isTypeDesirableForOp(ISD::SETCC, MinVT)) { @@ -2343,6 +2322,55 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } } } + + if (C1.getMinSignedBits() <= 64 && + !isLegalICmpImmediate(C1.getSExtValue())) { + // (X & -256) == 256 -> (X >> 8) == 1 + if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && + N0.getOpcode() == ISD::AND && N0.hasOneUse()) { + if (ConstantSDNode *AndRHS = + dyn_cast<ConstantSDNode>(N0.getOperand(1))) { + const APInt &AndRHSC = AndRHS->getAPIntValue(); + if ((-AndRHSC).isPowerOf2() && (AndRHSC & C1) == C1) { + unsigned ShiftBits = AndRHSC.countTrailingZeros(); + EVT ShiftTy = DCI.isBeforeLegalize() ? 
+ getPointerTy() : getShiftAmountTy(N0.getValueType()); + EVT CmpTy = N0.getValueType(); + SDValue Shift = DAG.getNode(ISD::SRL, dl, CmpTy, N0.getOperand(0), + DAG.getConstant(ShiftBits, ShiftTy)); + SDValue CmpRHS = DAG.getConstant(C1.lshr(ShiftBits), CmpTy); + return DAG.getSetCC(dl, VT, Shift, CmpRHS, Cond); + } + } + } else if (Cond == ISD::SETULT || Cond == ISD::SETUGE || + Cond == ISD::SETULE || Cond == ISD::SETUGT) { + bool AdjOne = (Cond == ISD::SETULE || Cond == ISD::SETUGT); + // X < 0x100000000 -> (X >> 32) < 1 + // X >= 0x100000000 -> (X >> 32) >= 1 + // X <= 0x0ffffffff -> (X >> 32) < 1 + // X > 0x0ffffffff -> (X >> 32) >= 1 + unsigned ShiftBits; + APInt NewC = C1; + ISD::CondCode NewCond = Cond; + if (AdjOne) { + ShiftBits = C1.countTrailingOnes(); + NewC = NewC + 1; + NewCond = (Cond == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; + } else { + ShiftBits = C1.countTrailingZeros(); + } + NewC = NewC.lshr(ShiftBits); + if (ShiftBits && isLegalICmpImmediate(NewC.getSExtValue())) { + EVT ShiftTy = DCI.isBeforeLegalize() ? + getPointerTy() : getShiftAmountTy(N0.getValueType()); + EVT CmpTy = N0.getValueType(); + SDValue Shift = DAG.getNode(ISD::SRL, dl, CmpTy, N0, + DAG.getConstant(ShiftBits, ShiftTy)); + SDValue CmpRHS = DAG.getConstant(NewC, CmpTy); + return DAG.getSetCC(dl, VT, Shift, CmpRHS, NewCond); + } + } + } } if (isa<ConstantFPSDNode>(N0.getNode())) { @@ -2411,21 +2439,28 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } if (N0 == N1) { + // The sext(setcc()) => setcc() optimization relies on the appropriate + // constant being emitted. + uint64_t EqVal; + switch (getBooleanContents(N0.getValueType().isVector())) { + case UndefinedBooleanContent: + case ZeroOrOneBooleanContent: + EqVal = ISD::isTrueWhenEqual(Cond); + break; + case ZeroOrNegativeOneBooleanContent: + EqVal = ISD::isTrueWhenEqual(Cond) ? -1 : 0; + break; + } + // We can always fold X == X for integer setcc's. if (N0.getValueType().isInteger()) { - switch (getBooleanContents(N0.getValueType().isVector())) { - case UndefinedBooleanContent: - case ZeroOrOneBooleanContent: - return DAG.getConstant(ISD::isTrueWhenEqual(Cond), VT); - case ZeroOrNegativeOneBooleanContent: - return DAG.getConstant(ISD::isTrueWhenEqual(Cond) ? -1 : 0, VT); - } + return DAG.getConstant(EqVal, VT); } unsigned UOF = ISD::getUnorderedFlavor(Cond); if (UOF == 2) // FP operators that are undefined on NaNs. - return DAG.getConstant(ISD::isTrueWhenEqual(Cond), VT); + return DAG.getConstant(EqVal, VT); if (UOF == unsigned(ISD::isTrueWhenEqual(Cond))) - return DAG.getConstant(UOF, VT); + return DAG.getConstant(EqVal, VT); // Otherwise, we can't fold it. However, we can simplify it to SETUO/SETO // if it is not already. ISD::CondCode NewCond = UOF == 0 ? 
ISD::SETO : ISD::SETUO; @@ -2998,10 +3033,12 @@ TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints( AsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput]; if (OpInfo.ConstraintVT != Input.ConstraintVT) { - std::pair<unsigned, const TargetRegisterClass*> MatchRC = - getRegForInlineAsmConstraint(OpInfo.ConstraintCode, OpInfo.ConstraintVT); - std::pair<unsigned, const TargetRegisterClass*> InputRC = - getRegForInlineAsmConstraint(Input.ConstraintCode, Input.ConstraintVT); + std::pair<unsigned, const TargetRegisterClass*> MatchRC = + getRegForInlineAsmConstraint(OpInfo.ConstraintCode, + OpInfo.ConstraintVT); + std::pair<unsigned, const TargetRegisterClass*> InputRC = + getRegForInlineAsmConstraint(Input.ConstraintCode, + Input.ConstraintVT); if ((OpInfo.ConstraintVT.isInteger() != Input.ConstraintVT.isInteger()) || (MatchRC.second != InputRC.second)) { diff --git a/lib/CodeGen/ShadowStackGC.cpp b/lib/CodeGen/ShadowStackGC.cpp index 0016047..8a6b120 100644 --- a/lib/CodeGen/ShadowStackGC.cpp +++ b/lib/CodeGen/ShadowStackGC.cpp @@ -26,13 +26,13 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "shadowstackgc" -#include "llvm/CodeGen/GCs.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/CodeGen/GCStrategy.h" +#include "llvm/IRBuilder.h" #include "llvm/IntrinsicInst.h" #include "llvm/Module.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/CodeGen/GCStrategy.h" +#include "llvm/CodeGen/GCs.h" #include "llvm/Support/CallSite.h" -#include "llvm/Support/IRBuilder.h" using namespace llvm; diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp index 9a86f32..980bd74 100644 --- a/lib/CodeGen/SjLjEHPrepare.cpp +++ b/lib/CodeGen/SjLjEHPrepare.cpp @@ -13,28 +13,28 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "sjljehprepare" -#include "llvm/Transforms/Scalar.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" +#include "llvm/IRBuilder.h" #include "llvm/Instructions.h" #include "llvm/Intrinsics.h" #include "llvm/LLVMContext.h" #include "llvm/Module.h" #include "llvm/Pass.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/IRBuilder.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include <set> using namespace llvm; diff --git a/lib/CodeGen/SlotIndexes.cpp b/lib/CodeGen/SlotIndexes.cpp index 26cf259..c8c3fb3 100644 --- a/lib/CodeGen/SlotIndexes.cpp +++ b/lib/CodeGen/SlotIndexes.cpp @@ -62,7 +62,6 @@ bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) { assert(mi2iMap.empty() && "MachineInstr -> Index mapping non-empty at initial numbering?"); - functionSize = 0; unsigned index = 0; MBBRanges.resize(mf->getNumBlockIDs()); idx2MBBMap.reserve(mf->size()); @@ -89,8 +88,6 @@ bool 
SlotIndexes::runOnMachineFunction(MachineFunction &fn) { // Save this base index in the maps. mi2iMap.insert(std::make_pair(mi, SlotIndex(&indexList.back(), SlotIndex::Slot_Block))); - - ++functionSize; } // We insert one blank instructions between basic blocks. diff --git a/lib/CodeGen/SpillPlacement.cpp b/lib/CodeGen/SpillPlacement.cpp index 6f33f54..320128a 100644 --- a/lib/CodeGen/SpillPlacement.cpp +++ b/lib/CodeGen/SpillPlacement.cpp @@ -207,6 +207,17 @@ void SpillPlacement::activate(unsigned n) { return; ActiveNodes->set(n); nodes[n].clear(); + + // Very large bundles usually come from big switches, indirect branches, + // landing pads, or loops with many 'continue' statements. It is difficult to + // allocate registers when so many different blocks are involved. + // + // Give a small negative bias to large bundles such that 1/32 of the + // connected blocks need to be interested before we consider expanding the + // region through the bundle. This helps compile time by limiting the number + // of blocks visited and the number of links in the Hopfield network. + if (bundles->getBlocks(n).size() > 100) + nodes[n].Bias = -0.0625f; } diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp index 9959f74..9a751c1 100644 --- a/lib/CodeGen/SplitKit.cpp +++ b/lib/CodeGen/SplitKit.cpp @@ -345,9 +345,11 @@ void SplitEditor::reset(LiveRangeEdit &LRE, ComplementSpillMode SM) { Values.clear(); // Reset the LiveRangeCalc instances needed for this spill mode. - LRCalc[0].reset(&VRM.getMachineFunction()); + LRCalc[0].reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT, + &LIS.getVNInfoAllocator()); if (SpillMode) - LRCalc[1].reset(&VRM.getMachineFunction()); + LRCalc[1].reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT, + &LIS.getVNInfoAllocator()); // We don't need an AliasAnalysis since we will only be performing // cheap-as-a-copy remats anyway. 
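The SpillPlacement change above gives bundles with more than 100 blocks a fixed negative bias: -0.0625f is -1/32, so, per the comment, roughly one in 32 connected blocks has to pull toward the bundle before the Hopfield network lets the region grow through it. A tiny sketch of the heuristic, with an illustrative node type:

struct HopfieldNode {
  float Bias = 0.0f;
};

void activateNode(HopfieldNode &N, unsigned BlocksInBundle) {
  // Large bundles (big switches, indirect branches, landing pads) are
  // expensive to grow through; bias them slightly against expansion.
  if (BlocksInBundle > 100)
    N.Bias = -0.0625f; // -1/32
}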
@@ -924,11 +926,9 @@ bool SplitEditor::transferValues() { DEBUG(dbgs() << '\n'); } - LRCalc[0].calculateValues(LIS.getSlotIndexes(), &MDT, - &LIS.getVNInfoAllocator()); + LRCalc[0].calculateValues(); if (SpillMode) - LRCalc[1].calculateValues(LIS.getSlotIndexes(), &MDT, - &LIS.getVNInfoAllocator()); + LRCalc[1].calculateValues(); return Skipped; } @@ -953,8 +953,7 @@ void SplitEditor::extendPHIKillRanges() { if (Edit->getParent().liveAt(LastUse)) { assert(RegAssign.lookup(LastUse) == RegIdx && "Different register assignment in phi predecessor"); - LRC.extend(LI, End, - LIS.getSlotIndexes(), &MDT, &LIS.getVNInfoAllocator()); + LRC.extend(LI, End); } } } @@ -1004,8 +1003,7 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) { } else Idx = Idx.getRegSlot(true); - getLRCalc(RegIdx).extend(LI, Idx.getNextSlot(), LIS.getSlotIndexes(), - &MDT, &LIS.getVNInfoAllocator()); + getLRCalc(RegIdx).extend(LI, Idx.getNextSlot()); } } diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp index 1e940b1..20da36e 100644 --- a/lib/CodeGen/StackSlotColoring.cpp +++ b/lib/CodeGen/StackSlotColoring.cpp @@ -46,7 +46,6 @@ STATISTIC(NumDead, "Number of trivially dead stack accesses eliminated"); namespace { class StackSlotColoring : public MachineFunctionPass { - bool ColorWithRegs; LiveStacks* LS; MachineFrameInfo *MFI; const TargetInstrInfo *TII; @@ -82,7 +81,7 @@ namespace { public: static char ID; // Pass identification StackSlotColoring() : - MachineFunctionPass(ID), ColorWithRegs(false), NextColor(-1) { + MachineFunctionPass(ID), NextColor(-1) { initializeStackSlotColoringPass(*PassRegistry::getPassRegistry()); } diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp index 8ebfbca..a813fa6 100644 --- a/lib/CodeGen/TailDuplication.cpp +++ b/lib/CodeGen/TailDuplication.cpp @@ -20,12 +20,15 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineSSAUpdater.h" +#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" @@ -57,8 +60,10 @@ namespace { /// TailDuplicatePass - Perform tail duplication. class TailDuplicatePass : public MachineFunctionPass { const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; MachineModuleInfo *MMI; MachineRegisterInfo *MRI; + OwningPtr<RegScavenger> RS; bool PreRegAlloc; // SSAUpdateVRs - A list of virtual registers for which to update SSA form. 
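Aside: the SplitKit hunks above change LiveRangeCalc so that its context (slot indexes, dominator tree, VNInfo allocator) is captured once in reset() instead of being threaded through every calculateValues()/extend() call. A toy sketch of that reset-once pattern, under assumed names (not the real LLVM classes):

  #include <cassert>

  struct RangeCalcLike {
    const void *Indexes, *DomTree, *Alloc;
    RangeCalcLike() : Indexes(0), DomTree(0), Alloc(0) {}
    void reset(const void *Idx, const void *DT, const void *A) {
      Indexes = Idx; DomTree = DT; Alloc = A;  // configure once per function
    }
    void calculateValues() {                   // then call with no arguments
      assert(Indexes && DomTree && Alloc && "reset() not called");
    }
  };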
@@ -124,9 +129,13 @@ INITIALIZE_PASS(TailDuplicatePass, "tailduplication", "Tail Duplication", bool TailDuplicatePass::runOnMachineFunction(MachineFunction &MF) { TII = MF.getTarget().getInstrInfo(); + TRI = MF.getTarget().getRegisterInfo(); MRI = &MF.getRegInfo(); MMI = getAnalysisIfAvailable<MachineModuleInfo>(); PreRegAlloc = MRI->isSSA(); + RS.reset(); + if (MRI->tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF)) + RS.reset(new RegScavenger()); bool MadeChange = false; while (TailDuplicateBlocks(MF)) @@ -272,8 +281,8 @@ TailDuplicatePass::TailDuplicateAndUpdate(MachineBasicBlock *MBB, continue; unsigned Dst = Copy->getOperand(0).getReg(); unsigned Src = Copy->getOperand(1).getReg(); - MachineRegisterInfo::use_iterator UI = MRI->use_begin(Src); - if (++UI == MRI->use_end()) { + if (MRI->hasOneNonDBGUse(Src) && + MRI->constrainRegClass(Src, MRI->getRegClass(Dst))) { // Copy is the only use. Do trivial copy propagation here. MRI->replaceRegWith(Dst, Src); Copy->eraseFromParent(); @@ -429,8 +438,10 @@ void TailDuplicatePass::DuplicateInstruction(MachineInstr *MI, AddSSAUpdateEntry(Reg, NewReg, PredBB); } else { DenseMap<unsigned, unsigned>::iterator VI = LocalVRMap.find(Reg); - if (VI != LocalVRMap.end()) + if (VI != LocalVRMap.end()) { MO.setReg(VI->second); + MRI->constrainRegClass(VI->second, MRI->getRegClass(Reg)); + } } } PredBB->insert(PredBB->instr_end(), NewMI); @@ -775,6 +786,23 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, // Remove PredBB's unconditional branch. TII->RemoveBranch(*PredBB); + if (RS && !TailBB->livein_empty()) { + // Update PredBB livein. + RS->enterBasicBlock(PredBB); + if (!PredBB->empty()) + RS->forward(prior(PredBB->end())); + BitVector RegsLiveAtExit(TRI->getNumRegs()); + RS->getRegsUsed(RegsLiveAtExit, false); + for (MachineBasicBlock::livein_iterator I = TailBB->livein_begin(), + E = TailBB->livein_end(); I != E; ++I) { + if (!RegsLiveAtExit[*I]) + // If a register is previously livein to the tail but it's not live + // at the end of predecessor BB, then it should be added to its + // livein list. + PredBB->addLiveIn(*I); + } + } + // Clone the contents of TailBB into PredBB. DenseMap<unsigned, unsigned> LocalVRMap; SmallVector<std::pair<unsigned,unsigned>, 4> CopyInfos; diff --git a/lib/CodeGen/TargetInstrInfoImpl.cpp b/lib/CodeGen/TargetInstrInfoImpl.cpp index 2beb928..a3d6771 100644 --- a/lib/CodeGen/TargetInstrInfoImpl.cpp +++ b/lib/CodeGen/TargetInstrInfoImpl.cpp @@ -501,6 +501,14 @@ CreateTargetHazardRecognizer(const TargetMachine *TM, return new ScheduleHazardRecognizer(); } +// Default implementation of CreateTargetMIHazardRecognizer. +ScheduleHazardRecognizer *TargetInstrInfoImpl:: +CreateTargetMIHazardRecognizer(const InstrItineraryData *II, + const ScheduleDAG *DAG) const { + return (ScheduleHazardRecognizer *) + new ScoreboardHazardRecognizer(II, DAG, "misched"); +} + // Default implementation of CreateTargetPostRAHazardRecognizer. ScheduleHazardRecognizer *TargetInstrInfoImpl:: CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, @@ -509,6 +517,10 @@ CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, new ScoreboardHazardRecognizer(II, DAG, "post-RA-sched"); } +//===----------------------------------------------------------------------===// +// SelectionDAG latency interface. 
+//===----------------------------------------------------------------------===// + int TargetInstrInfoImpl::getOperandLatency(const InstrItineraryData *ItinData, SDNode *DefNode, unsigned DefIdx, @@ -537,3 +549,199 @@ int TargetInstrInfoImpl::getInstrLatency(const InstrItineraryData *ItinData, return ItinData->getStageLatency(get(N->getMachineOpcode()).getSchedClass()); } +//===----------------------------------------------------------------------===// +// MachineInstr latency interface. +//===----------------------------------------------------------------------===// + +unsigned +TargetInstrInfoImpl::getNumMicroOps(const InstrItineraryData *ItinData, + const MachineInstr *MI) const { + if (!ItinData || ItinData->isEmpty()) + return 1; + + unsigned Class = MI->getDesc().getSchedClass(); + int UOps = ItinData->Itineraries[Class].NumMicroOps; + if (UOps >= 0) + return UOps; + + // The # of u-ops is dynamically determined. The specific target should + // override this function to return the right number. + return 1; +} + +/// Return the default expected latency for a def based on its opcode. +unsigned TargetInstrInfo::defaultDefLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI) const { + if (DefMI->mayLoad()) + return ItinData->SchedModel->LoadLatency; + if (isHighLatencyDef(DefMI->getOpcode())) + return ItinData->SchedModel->HighLatency; + return 1; +} + +unsigned TargetInstrInfoImpl:: +getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost) const { + // Default to one cycle for no itinerary. However, an "empty" itinerary may + // still have a MinLatency property, which getStageLatency checks. + if (!ItinData) + return MI->mayLoad() ? 2 : 1; + + return ItinData->getStageLatency(MI->getDesc().getSchedClass()); +} + +bool TargetInstrInfoImpl::hasLowDefLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, + unsigned DefIdx) const { + if (!ItinData || ItinData->isEmpty()) + return false; + + unsigned DefClass = DefMI->getDesc().getSchedClass(); + int DefCycle = ItinData->getOperandCycle(DefClass, DefIdx); + return (DefCycle != -1 && DefCycle <= 1); +} + +/// Both DefMI and UseMI must be valid. By default, call directly to the +/// itinerary. This may be overridden by the target. +int TargetInstrInfoImpl:: +getOperandLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const { + unsigned DefClass = DefMI->getDesc().getSchedClass(); + unsigned UseClass = UseMI->getDesc().getSchedClass(); + return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx); +} + +/// If we can determine the operand latency from the def only, without itinerary /// lookup, do so. Otherwise return -1. +static int computeDefOperandLatency( + const TargetInstrInfo *TII, const InstrItineraryData *ItinData, + const MachineInstr *DefMI, bool FindMin) { + + // Let the target hook getInstrLatency handle missing itineraries. + if (!ItinData) + return TII->getInstrLatency(ItinData, DefMI); + + // Return a latency based on the itinerary properties and defining instruction + // if possible. Some common subtargets don't require per-operand latency, + // especially for minimum latencies. + if (FindMin) { + // If MinLatency is valid, call getInstrLatency. This uses Stage latency if + // it exists before defaulting to MinLatency.
+ if (ItinData->SchedModel->MinLatency >= 0) + return TII->getInstrLatency(ItinData, DefMI); + + // If MinLatency is invalid, OperandLatency is interpreted as MinLatency. + // For empty itineraries, short-circuit the check and default to one cycle. + if (ItinData->isEmpty()) + return 1; + } + else if (ItinData->isEmpty()) + return TII->defaultDefLatency(ItinData, DefMI); + + // ...operand lookup required + return -1; +} + +/// computeOperandLatency - Compute and return the latency of the given data +/// dependent def and use when the operand indices are already known. +/// +/// FindMin may be set to get the minimum vs. expected latency. +unsigned TargetInstrInfo:: +computeOperandLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx, + bool FindMin) const { + + int DefLatency = computeDefOperandLatency(this, ItinData, DefMI, FindMin); + if (DefLatency >= 0) + return DefLatency; + + assert(ItinData && !ItinData->isEmpty() && "computeDefOperandLatency fail"); + + int OperLatency = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx); + if (OperLatency >= 0) + return OperLatency; + + // No operand latency was found. + unsigned InstrLatency = getInstrLatency(ItinData, DefMI); + + // Expected latency is the max of the stage latency and itinerary props. + if (!FindMin) + InstrLatency = std::max(InstrLatency, defaultDefLatency(ItinData, DefMI)); + return InstrLatency; +} + +/// computeOperandLatency - Compute and return the latency of the given data +/// dependent def and use. DefMI must be a valid def. UseMI may be NULL for an +/// unknown use. Depending on the subtarget's itinerary properties, this may or +/// may not need to call getOperandLatency(). +/// +/// FindMin may be set to get the minimum vs. expected latency. Minimum +/// latency is used for scheduling groups, while expected latency is for +/// instruction cost and critical path. +/// +/// For most subtargets, we don't need DefIdx or UseIdx to compute min latency. +/// DefMI must be a valid definition, but UseMI may be NULL for an unknown use. +unsigned TargetInstrInfo:: +computeOperandLatency(const InstrItineraryData *ItinData, + const TargetRegisterInfo *TRI, + const MachineInstr *DefMI, const MachineInstr *UseMI, + unsigned Reg, bool FindMin) const { + + int DefLatency = computeDefOperandLatency(this, ItinData, DefMI, FindMin); + if (DefLatency >= 0) + return DefLatency; + + assert(ItinData && !ItinData->isEmpty() && "computeDefOperandLatency fail"); + + // Find the definition of the register in the defining instruction. + int DefIdx = DefMI->findRegisterDefOperandIdx(Reg); + if (DefIdx != -1) { + const MachineOperand &MO = DefMI->getOperand(DefIdx); + if (MO.isReg() && MO.isImplicit() && + DefIdx >= (int)DefMI->getDesc().getNumOperands()) { + // This is an implicit def, getOperandLatency() won't return the correct + // latency. e.g. + // %D6<def>, %D7<def> = VLD1q16 %R2<kill>, 0, ..., %Q3<imp-def> + // %Q1<def> = VMULv8i16 %Q1<kill>, %Q3<kill>, ... + // What we want is to compute latency between def of %D6/%D7 and use of + // %Q3 instead. + unsigned Op2 = DefMI->findRegisterDefOperandIdx(Reg, false, true, TRI); + if (DefMI->getOperand(Op2).isReg()) + DefIdx = Op2; + } + // For all uses of the register, calculate the maximum latency + int OperLatency = -1; + + // If UseMI is null, then it must be a scheduling barrier.
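Aside: the decision order implemented by computeDefOperandLatency() and the computeOperandLatency() overloads above (the second continues just below) is easier to see stripped of the LLVM types. A simplified, hypothetical restatement with plain ints, not the TargetInstrInfo interface:

  #include <algorithm>

  // Decision order: (1) no itinerary -> whole-instruction latency;
  // (2) FindMin with a valid MinLatency -> whole-instruction latency;
  // (3) FindMin with an empty itinerary -> one cycle;
  // (4) empty itinerary -> default def latency (load/high-latency/1);
  // (5) otherwise -> per-operand lookup, with the whole-instruction and
  //     default def latencies as a floor for expected (non-min) latency.
  int latencySketch(bool HasItin, bool EmptyItin, bool FindMin,
                    int MinLatency, int InstrLat, int DefaultDefLat,
                    int OperLat /* -1 when the per-operand lookup fails */) {
    if (!HasItin) return InstrLat;
    if (FindMin) {
      if (MinLatency >= 0) return InstrLat;
      if (EmptyItin) return 1;
    } else if (EmptyItin) {
      return DefaultDefLat;
    }
    if (OperLat >= 0) return OperLat;
    return FindMin ? InstrLat : std::max(InstrLat, DefaultDefLat);
  }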
+ if (!UseMI) { + unsigned DefClass = DefMI->getDesc().getSchedClass(); + OperLatency = ItinData->getOperandCycle(DefClass, DefIdx); + } + else { + for (unsigned i = 0, e = UseMI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = UseMI->getOperand(i); + if (!MO.isReg() || !MO.isUse()) + continue; + unsigned MOReg = MO.getReg(); + if (MOReg != Reg) + continue; + + int UseCycle = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, i); + OperLatency = std::max(OperLatency, UseCycle); + } + } + // If we found an operand latency, we're done. + if (OperLatency >= 0) + return OperLatency; + } + // No operand latency was found. + unsigned InstrLatency = getInstrLatency(ItinData, DefMI); + + // Expected latency is the max of the stage latency and itinerary props. + if (!FindMin) + InstrLatency = std::max(InstrLatency, defaultDefLatency(ItinData, DefMI)); + return InstrLatency; +} diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 9925185..2a2fa9e 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -93,8 +93,9 @@ getELFKindForNamedSection(StringRef Name, SectionKind K) { // N.B.: The defaults used in here are no the same ones used in MC. // We follow gcc, MC follows gas. For example, given ".section .eh_frame", // both gas and MC will produce a section with no flags. Given - // section(".eh_frame") gcc will produce - // .section .eh_frame,"a",@progbits + // section(".eh_frame") gcc will produce: + // + // .section .eh_frame,"a",@progbits if (Name.empty() || Name[0] != '.') return K; // Some lame default implementation based on some magic section names. @@ -349,10 +350,17 @@ TargetLoweringObjectFileELF::getStaticCtorSection(unsigned Priority) const { if (Priority == 65535) return StaticCtorSection; - std::string Name = std::string(".ctors.") + utostr(65535 - Priority); - return getContext().getELFSection(Name, ELF::SHT_PROGBITS, - ELF::SHF_ALLOC |ELF::SHF_WRITE, - SectionKind::getDataRel()); + if (UseInitArray) { + std::string Name = std::string(".init_array.") + utostr(Priority); + return getContext().getELFSection(Name, ELF::SHT_INIT_ARRAY, + ELF::SHF_ALLOC | ELF::SHF_WRITE, + SectionKind::getDataRel()); + } else { + std::string Name = std::string(".ctors.") + utostr(65535 - Priority); + return getContext().getELFSection(Name, ELF::SHT_PROGBITS, + ELF::SHF_ALLOC |ELF::SHF_WRITE, + SectionKind::getDataRel()); + } } const MCSection * @@ -362,10 +370,35 @@ TargetLoweringObjectFileELF::getStaticDtorSection(unsigned Priority) const { if (Priority == 65535) return StaticDtorSection; - std::string Name = std::string(".dtors.") + utostr(65535 - Priority); - return getContext().getELFSection(Name, ELF::SHT_PROGBITS, - ELF::SHF_ALLOC |ELF::SHF_WRITE, - SectionKind::getDataRel()); + if (UseInitArray) { + std::string Name = std::string(".fini_array.") + utostr(Priority); + return getContext().getELFSection(Name, ELF::SHT_FINI_ARRAY, + ELF::SHF_ALLOC | ELF::SHF_WRITE, + SectionKind::getDataRel()); + } else { + std::string Name = std::string(".dtors.") + utostr(65535 - Priority); + return getContext().getELFSection(Name, ELF::SHT_PROGBITS, + ELF::SHF_ALLOC |ELF::SHF_WRITE, + SectionKind::getDataRel()); + } +} + +void +TargetLoweringObjectFileELF::InitializeELF(bool UseInitArray_) { + UseInitArray = UseInitArray_; + if (!UseInitArray) + return; + + StaticCtorSection = + getContext().getELFSection(".init_array", ELF::SHT_INIT_ARRAY, + ELF::SHF_WRITE | + ELF::SHF_ALLOC, + 
SectionKind::getDataRel()); + StaticDtorSection = + getContext().getELFSection(".fini_array", ELF::SHT_FINI_ARRAY, + ELF::SHF_WRITE | + ELF::SHF_ALLOC, + SectionKind::getDataRel()); } //===----------------------------------------------------------------------===// @@ -379,7 +412,7 @@ emitModuleFlags(MCStreamer &Streamer, ArrayRef<Module::ModuleFlagEntry> ModuleFlags, Mangler *Mang, const TargetMachine &TM) const { unsigned VersionVal = 0; - unsigned GCFlags = 0; + unsigned ImageInfoFlags = 0; StringRef SectionVal; for (ArrayRef<Module::ModuleFlagEntry>::iterator @@ -396,8 +429,9 @@ emitModuleFlags(MCStreamer &Streamer, if (Key == "Objective-C Image Info Version") VersionVal = cast<ConstantInt>(Val)->getZExtValue(); else if (Key == "Objective-C Garbage Collection" || - Key == "Objective-C GC Only") - GCFlags |= cast<ConstantInt>(Val)->getZExtValue(); + Key == "Objective-C GC Only" || + Key == "Objective-C Is Simulated") + ImageInfoFlags |= cast<ConstantInt>(Val)->getZExtValue(); else if (Key == "Objective-C Image Info Section") SectionVal = cast<MDString>(Val)->getString(); } @@ -424,7 +458,7 @@ emitModuleFlags(MCStreamer &Streamer, Streamer.EmitLabel(getContext(). GetOrCreateSymbol(StringRef("L_OBJC_IMAGE_INFO"))); Streamer.EmitIntValue(VersionVal, 4); - Streamer.EmitIntValue(GCFlags, 4); + Streamer.EmitIntValue(ImageInfoFlags, 4); Streamer.AddBlankLine(); } diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index c30b133..e4c0119 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -102,7 +102,7 @@ namespace { MachineInstr *FindLastUseInMBB(unsigned Reg, MachineBasicBlock *MBB, unsigned Dist); - bool isProfitableToCommute(unsigned regB, unsigned regC, + bool isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC, MachineInstr *MI, MachineBasicBlock *MBB, unsigned Dist); @@ -483,32 +483,6 @@ static bool isTwoAddrUse(MachineInstr &MI, unsigned Reg, unsigned &DstReg) { return false; } -/// findLocalKill - Look for an instruction below MI in the MBB that kills the -/// specified register. Returns null if there are any other Reg use between the -/// instructions. -static -MachineInstr *findLocalKill(unsigned Reg, MachineBasicBlock *MBB, - MachineInstr *MI, MachineRegisterInfo *MRI, - DenseMap<MachineInstr*, unsigned> &DistanceMap) { - MachineInstr *KillMI = 0; - for (MachineRegisterInfo::use_nodbg_iterator - UI = MRI->use_nodbg_begin(Reg), - UE = MRI->use_nodbg_end(); UI != UE; ++UI) { - MachineInstr *UseMI = &*UI; - if (UseMI == MI || UseMI->getParent() != MBB) - continue; - if (DistanceMap.count(UseMI)) - continue; - if (!UI.getOperand().isKill()) - return 0; - if (KillMI) - return 0; // -O0 kill markers cannot be trusted? - KillMI = UseMI; - } - - return KillMI; -} - /// findOnlyInterestingUse - Given a register, if has a single in-basic block /// use, return the use instruction if it's a copy or a two-address use. static @@ -567,7 +541,8 @@ regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) { /// isProfitableToReMat - Return true if it's potentially profitable to commute /// the two-address instruction that's being processed. 
bool -TwoAddressInstructionPass::isProfitableToCommute(unsigned regB, unsigned regC, +TwoAddressInstructionPass::isProfitableToCommute(unsigned regA, unsigned regB, + unsigned regC, MachineInstr *MI, MachineBasicBlock *MBB, unsigned Dist) { if (OptLevel == CodeGenOpt::None) @@ -604,15 +579,15 @@ TwoAddressInstructionPass::isProfitableToCommute(unsigned regB, unsigned regC, // %reg1026<def> = ADD %reg1024, %reg1025 // r0 = MOV %reg1026 // Commute the ADD to hopefully eliminate an otherwise unavoidable copy. - unsigned FromRegB = getMappedReg(regB, SrcRegMap); - unsigned FromRegC = getMappedReg(regC, SrcRegMap); - unsigned ToRegB = getMappedReg(regB, DstRegMap); - unsigned ToRegC = getMappedReg(regC, DstRegMap); - if ((FromRegB && ToRegB && !regsAreCompatible(FromRegB, ToRegB, TRI)) && - ((!FromRegC && !ToRegC) || - regsAreCompatible(FromRegB, ToRegC, TRI) || - regsAreCompatible(FromRegC, ToRegB, TRI))) - return true; + unsigned ToRegA = getMappedReg(regA, DstRegMap); + if (ToRegA) { + unsigned FromRegB = getMappedReg(regB, SrcRegMap); + unsigned FromRegC = getMappedReg(regC, SrcRegMap); + bool BComp = !FromRegB || regsAreCompatible(FromRegB, ToRegA, TRI); + bool CComp = !FromRegC || regsAreCompatible(FromRegC, ToRegA, TRI); + if (BComp != CComp) + return !BComp && CComp; + } // If there is a use of regC between its last def (could be livein) and this // instruction, then bail. @@ -904,14 +879,19 @@ TwoAddressInstructionPass::RescheduleMIBelowKill(MachineBasicBlock *MBB, MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, unsigned Reg) { + // Bail immediately if we don't have LV available. We use it to find kills + // efficiently. + if (!LV) + return false; + MachineInstr *MI = &*mi; DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI); if (DI == DistanceMap.end()) // Must be created from unfolded load. Don't waste time trying this. return false; - MachineInstr *KillMI = findLocalKill(Reg, MBB, mi, MRI, DistanceMap); - if (!KillMI || KillMI->isCopy() || KillMI->isCopyLike()) + MachineInstr *KillMI = LV->getVarInfo(Reg).findKill(MBB); + if (!KillMI || MI == KillMI || KillMI->isCopy() || KillMI->isCopyLike()) // Don't mess with copies, they may be coalesced later. return false; @@ -998,6 +978,12 @@ TwoAddressInstructionPass::RescheduleMIBelowKill(MachineBasicBlock *MBB, ((MO.isKill() && Uses.count(MOReg)) || Kills.count(MOReg))) // Don't want to extend other live ranges and update kills. return false; + if (MOReg == Reg && !MO.isKill()) + // We can't schedule across a use of the register in question. + return false; + // Ensure that if this is the register in question, it's the kill we expect.
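Aside: the rewritten isProfitableToCommute() test above reduces to a single rule: with regA's desired destination register known (ToRegA), commute only when regC's mapped source is compatible with it and regB's is not. A hypothetical standalone restatement:

  // BComp/CComp: regB's/regC's mapped source register is either absent or
  // compatible with regA's destination (ToRegA). Swapping B and C pays off
  // only when the two sides disagree and C is the compatible one.
  bool shouldAggressivelyCommute(bool HaveToRegA, bool BComp, bool CComp) {
    if (!HaveToRegA)
      return false;            // no destination hint; leave it to later checks
    if (BComp != CComp)
      return !BComp && CComp;  // true exactly when C helps and B does not
    return false;
  }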
+ assert((MOReg != Reg || OtherMI == KillMI) && + "Found multiple kills of a register in a basic block"); } } } @@ -1011,20 +997,11 @@ TwoAddressInstructionPass::RescheduleMIBelowKill(MachineBasicBlock *MBB, MBB->splice(KillPos, MBB, From, To); DistanceMap.erase(DI); - if (LV) { - // Update live variables - LV->removeVirtualRegisterKilled(Reg, KillMI); - LV->addVirtualRegisterKilled(Reg, MI); - } else { - for (unsigned i = 0, e = KillMI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = KillMI->getOperand(i); - if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg) - continue; - MO.setIsKill(false); - } - MI->addRegisterKilled(Reg, 0); - } + // Update live variables + LV->removeVirtualRegisterKilled(Reg, KillMI); + LV->addVirtualRegisterKilled(Reg, MI); + DEBUG(dbgs() << "\trescheduled below kill: " << *KillMI); return true; } @@ -1045,7 +1022,7 @@ bool TwoAddressInstructionPass::isDefTooClose(unsigned Reg, unsigned Dist, return true; // Below MI unsigned DefDist = DDI->second; assert(Dist > DefDist && "Visited def already?"); - if (TII->getInstrLatency(InstrItins, DefMI) > (int)(Dist - DefDist)) + if (TII->getInstrLatency(InstrItins, DefMI) > (Dist - DefDist)) return true; } return false; @@ -1060,14 +1037,19 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB, MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, unsigned Reg) { + // Bail immediately if we don't have LV available. We use it to find kills + // efficiently. + if (!LV) + return false; + MachineInstr *MI = &*mi; DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI); if (DI == DistanceMap.end()) // Must be created from unfolded load. Don't waste time trying this. return false; - MachineInstr *KillMI = findLocalKill(Reg, MBB, mi, MRI, DistanceMap); - if (!KillMI || KillMI->isCopy() || KillMI->isCopyLike()) + MachineInstr *KillMI = LV->getVarInfo(Reg).findKill(MBB); + if (!KillMI || MI == KillMI || KillMI->isCopy() || KillMI->isCopyLike()) // Don't mess with copies, they may be coalesced later. return false; @@ -1093,6 +1075,8 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB, continue; if (isDefTooClose(MOReg, DI->second, MI, MBB)) return false; + if (MOReg == Reg && !MO.isKill()) + return false; Uses.insert(MOReg); if (MO.isKill() && MOReg != Reg) Kills.insert(MOReg); @@ -1134,6 +1118,9 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB, if (Kills.count(MOReg)) // Don't want to extend other live ranges and update kills. return false; + if (OtherMI != MI && MOReg == Reg && !MO.isKill()) + // We can't schedule across a use of the register in question. + return false; } else { OtherDefs.push_back(MOReg); } @@ -1164,19 +1151,11 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB, nmi = llvm::prior(InsertPos); // Backtrack so we process the moved instr. 
DistanceMap.erase(DI); - if (LV) { - // Update live variables - LV->removeVirtualRegisterKilled(Reg, KillMI); - LV->addVirtualRegisterKilled(Reg, MI); - } else { - for (unsigned i = 0, e = KillMI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = KillMI->getOperand(i); - if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg) - continue; - MO.setIsKill(false); - } - MI->addRegisterKilled(Reg, 0); - } + // Update live variables + LV->removeVirtualRegisterKilled(Reg, KillMI); + LV->addVirtualRegisterKilled(Reg, MI); + + DEBUG(dbgs() << "\trescheduled kill: " << *KillMI); return true; } @@ -1208,9 +1187,13 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, if (!regBKilled && MI.getOperand(DstIdx).isDead() && DeleteUnusedInstr(mi, nmi, mbbi, Dist)) { ++NumDeletes; - return true; // Done with this instruction. + DEBUG(dbgs() << "\tdeleted unused instruction.\n"); + return true; // Done with this instruction. } + if (TargetRegisterInfo::isVirtualRegister(regA)) + ScanUses(regA, &*mbbi, Processed); + // Check if it is profitable to commute the operands. unsigned SrcOp1, SrcOp2; unsigned regC = 0; @@ -1230,7 +1213,7 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, // If C dies but B does not, swap the B and C operands. // This makes the live ranges of A and C joinable. TryCommute = true; - else if (isProfitableToCommute(regB, regC, &MI, mbbi, Dist)) { + else if (isProfitableToCommute(regA, regB, regC, &MI, mbbi, Dist)) { TryCommute = true; AggressiveCommute = true; } @@ -1252,9 +1235,6 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, return true; } - if (TargetRegisterInfo::isVirtualRegister(regA)) - ScanUses(regA, &*mbbi, Processed); - if (MI.isConvertibleTo3Addr()) { // This instruction is potentially convertible to a true // three-address instruction. Check if it is profitable. @@ -1298,7 +1278,8 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, // Unfold the load. DEBUG(dbgs() << "2addr: UNFOLDING: " << MI); const TargetRegisterClass *RC = - TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI); + TRI->getAllocatableClass( + TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI, MF)); unsigned Reg = MRI->createVirtualRegister(RC); SmallVector<MachineInstr *, 2> NewMIs; if (!TII->unfoldMemoryOperand(MF, &MI, Reg, @@ -1454,31 +1435,50 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { "two address instruction invalid"); unsigned regB = mi->getOperand(SrcIdx).getReg(); + + // Deal with <undef> uses immediately - simply rewrite the src operand. + if (mi->getOperand(SrcIdx).isUndef()) { + unsigned DstReg = mi->getOperand(DstIdx).getReg(); + // Constrain the DstReg register class if required. + if (TargetRegisterInfo::isVirtualRegister(DstReg)) + if (const TargetRegisterClass *RC = TII->getRegClass(MCID, SrcIdx, + TRI, MF)) + MRI->constrainRegClass(DstReg, RC); + mi->getOperand(SrcIdx).setReg(DstReg); + DEBUG(dbgs() << "\t\trewrite undef:\t" << *mi); + continue; + } TiedOperands[regB].push_back(std::make_pair(SrcIdx, DstIdx)); } + // If the instruction has a single pair of tied operands, try some + // transformations that may either eliminate the tied operands or + // improve the opportunities for coalescing away the register copy.
+ if (TiedOperands.size() == 1) { + SmallVector<std::pair<unsigned, unsigned>, 4> &TiedPairs + = TiedOperands.begin()->second; + if (TiedPairs.size() == 1) { + unsigned SrcIdx = TiedPairs[0].first; + unsigned DstIdx = TiedPairs[0].second; + unsigned SrcReg = mi->getOperand(SrcIdx).getReg(); + unsigned DstReg = mi->getOperand(DstIdx).getReg(); + if (SrcReg != DstReg && + TryInstructionTransform(mi, nmi, mbbi, SrcIdx, DstIdx, Dist, + Processed)) { + // The tied operands have been eliminated or shifted further down the + // block to ease elimination. Continue processing with 'nmi'. + TiedOperands.clear(); + mi = nmi; + continue; + } + } + } + // Now iterate over the information collected above. for (TiedOperandMap::iterator OI = TiedOperands.begin(), OE = TiedOperands.end(); OI != OE; ++OI) { SmallVector<std::pair<unsigned, unsigned>, 4> &TiedPairs = OI->second; - // If the instruction has a single pair of tied operands, try some - // transformations that may either eliminate the tied operands or - // improve the opportunities for coalescing away the register copy. - if (TiedOperands.size() == 1 && TiedPairs.size() == 1) { - unsigned SrcIdx = TiedPairs[0].first; - unsigned DstIdx = TiedPairs[0].second; - - // If the registers are already equal, nothing needs to be done. - if (mi->getOperand(SrcIdx).getReg() == - mi->getOperand(DstIdx).getReg()) - break; // Done with this instruction. - - if (TryInstructionTransform(mi, nmi, mbbi, SrcIdx, DstIdx, Dist, - Processed)) - break; // The tied operands have been eliminated. - } - bool IsEarlyClobber = false; bool RemovedKillFlag = false; bool AllUsesCopied = true; @@ -1519,8 +1519,9 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { #endif // Emit a copy or rematerialize the definition. + bool isCopy = false; const TargetRegisterClass *rc = MRI->getRegClass(regB); - MachineInstr *DefMI = MRI->getVRegDef(regB); + MachineInstr *DefMI = MRI->getUniqueVRegDef(regB); // If it's safe and profitable, remat the definition instead of // copying it. if (DefMI && @@ -1535,10 +1536,11 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { } else { BuildMI(*mbbi, mi, mi->getDebugLoc(), TII->get(TargetOpcode::COPY), regA).addReg(regB); + isCopy = true; } - MachineBasicBlock::iterator prevMI = prior(mi); // Update DistanceMap. + MachineBasicBlock::iterator prevMI = prior(mi); DistanceMap.insert(std::make_pair(prevMI, Dist)); DistanceMap[mi] = ++Dist; @@ -1551,7 +1553,17 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { MO.setIsKill(false); RemovedKillFlag = true; } + + // Make sure regA is a legal regclass for the SrcIdx operand. + if (TargetRegisterInfo::isVirtualRegister(regA) && + TargetRegisterInfo::isVirtualRegister(regB)) + MRI->constrainRegClass(regA, MRI->getRegClass(regB)); + MO.setReg(regA); + + if (isCopy) + // Propagate SrcRegMap. + SrcRegMap[regA] = regB; } if (AllUsesCopied) { @@ -1587,27 +1599,32 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { } } - // Schedule the source copy / remat inserted to form two-address - // instruction. FIXME: Does it matter the distance map may not be - // accurate after it's scheduled? - TII->scheduleTwoAddrSource(prior(mi), mi, *TRI); + // We didn't change anything if there was a single tied pair, and that + // pair didn't require copies. + if (AllUsesCopied || TiedPairs.size() > 1) { + MadeChange = true; - MadeChange = true; + // Schedule the source copy / remat inserted to form two-address + // instruction. 
FIXME: Does it matter the distance map may not be + // accurate after it's scheduled? + TII->scheduleTwoAddrSource(prior(mi), mi, *TRI); + } DEBUG(dbgs() << "\t\trewrite to:\t" << *mi); + } - // Rewrite INSERT_SUBREG as COPY now that we no longer need SSA form. - if (mi->isInsertSubreg()) { - // From %reg = INSERT_SUBREG %reg, %subreg, subidx - // To %reg:subidx = COPY %subreg - unsigned SubIdx = mi->getOperand(3).getImm(); - mi->RemoveOperand(3); - assert(mi->getOperand(0).getSubReg() == 0 && "Unexpected subreg idx"); - mi->getOperand(0).setSubReg(SubIdx); - mi->RemoveOperand(1); - mi->setDesc(TII->get(TargetOpcode::COPY)); - DEBUG(dbgs() << "\t\tconvert to:\t" << *mi); - } + // Rewrite INSERT_SUBREG as COPY now that we no longer need SSA form. + if (mi->isInsertSubreg()) { + // From %reg = INSERT_SUBREG %reg, %subreg, subidx + // To %reg:subidx = COPY %subreg + unsigned SubIdx = mi->getOperand(3).getImm(); + mi->RemoveOperand(3); + assert(mi->getOperand(0).getSubReg() == 0 && "Unexpected subreg idx"); + mi->getOperand(0).setSubReg(SubIdx); + mi->getOperand(0).setIsUndef(mi->getOperand(1).isUndef()); + mi->RemoveOperand(1); + mi->setDesc(TII->get(TargetOpcode::COPY)); + DEBUG(dbgs() << "\t\tconvert to:\t" << *mi); } // Clear TiedOperands here instead of at the top of the loop @@ -1694,9 +1711,10 @@ TwoAddressInstructionPass::CoalesceExtSubRegs(SmallVector<unsigned,4> &Srcs, continue; // Check that the instructions are all in the same basic block. - MachineInstr *SrcDefMI = MRI->getVRegDef(SrcReg); - MachineInstr *DstDefMI = MRI->getVRegDef(DstReg); - if (SrcDefMI->getParent() != DstDefMI->getParent()) + MachineInstr *SrcDefMI = MRI->getUniqueVRegDef(SrcReg); + MachineInstr *DstDefMI = MRI->getUniqueVRegDef(DstReg); + if (!SrcDefMI || !DstDefMI || + SrcDefMI->getParent() != DstDefMI->getParent()) continue; // If there are no other uses than copies which feed into @@ -1832,6 +1850,11 @@ bool TwoAddressInstructionPass::EliminateRegSequences() { SmallVector<unsigned, 4> RealSrcs; SmallSet<unsigned, 4> Seen; for (unsigned i = 1, e = MI->getNumOperands(); i < e; i += 2) { + // Nothing needs to be inserted for <undef> operands. 
+ if (MI->getOperand(i).isUndef()) { + MI->getOperand(i).setReg(0); + continue; + } unsigned SrcReg = MI->getOperand(i).getReg(); unsigned SrcSubIdx = MI->getOperand(i).getSubReg(); unsigned SubIdx = MI->getOperand(i+1).getImm(); @@ -1841,7 +1864,7 @@ bool TwoAddressInstructionPass::EliminateRegSequences() { MachineInstr *DefMI = NULL; if (!MI->getOperand(i).getSubReg() && !TargetRegisterInfo::isPhysicalRegister(SrcReg)) { - DefMI = MRI->getVRegDef(SrcReg); + DefMI = MRI->getUniqueVRegDef(SrcReg); } if (DefMI && DefMI->isImplicitDef()) { diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp index 3bab93b..93840f0 100644 --- a/lib/CodeGen/VirtRegMap.cpp +++ b/lib/CodeGen/VirtRegMap.cpp @@ -18,12 +18,14 @@ #define DEBUG_TYPE "regalloc" #include "VirtRegMap.h" +#include "LiveDebugVariables.h" #include "llvm/Function.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -104,11 +106,149 @@ void VirtRegMap::assignVirt2StackSlot(unsigned virtReg, int SS) { Virt2StackSlotMap[virtReg] = SS; } -void VirtRegMap::rewrite(SlotIndexes *Indexes) { +void VirtRegMap::print(raw_ostream &OS, const Module*) const { + OS << "********** REGISTER MAP **********\n"; + for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + if (Virt2PhysMap[Reg] != (unsigned)VirtRegMap::NO_PHYS_REG) { + OS << '[' << PrintReg(Reg, TRI) << " -> " + << PrintReg(Virt2PhysMap[Reg], TRI) << "] " + << MRI->getRegClass(Reg)->getName() << "\n"; + } + } + + for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + if (Virt2StackSlotMap[Reg] != VirtRegMap::NO_STACK_SLOT) { + OS << '[' << PrintReg(Reg, TRI) << " -> fi#" << Virt2StackSlotMap[Reg] + << "] " << MRI->getRegClass(Reg)->getName() << "\n"; + } + } + OS << '\n'; +} + +void VirtRegMap::dump() const { + print(dbgs()); +} + +//===----------------------------------------------------------------------===// +// VirtRegRewriter +//===----------------------------------------------------------------------===// +// +// The VirtRegRewriter is the last of the register allocator passes. +// It rewrites virtual registers to physical registers as specified in the +// VirtRegMap analysis. It also updates live-in information on basic blocks +// according to LiveIntervals. 
+// +namespace { +class VirtRegRewriter : public MachineFunctionPass { + MachineFunction *MF; + const TargetMachine *TM; + const TargetRegisterInfo *TRI; + const TargetInstrInfo *TII; + MachineRegisterInfo *MRI; + SlotIndexes *Indexes; + LiveIntervals *LIS; + VirtRegMap *VRM; + + void rewrite(); + void addMBBLiveIns(); +public: + static char ID; + VirtRegRewriter() : MachineFunctionPass(ID) {} + + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + + virtual bool runOnMachineFunction(MachineFunction&); +}; +} // end anonymous namespace + +char &llvm::VirtRegRewriterID = VirtRegRewriter::ID; + +INITIALIZE_PASS_BEGIN(VirtRegRewriter, "virtregrewriter", + "Virtual Register Rewriter", false, false) +INITIALIZE_PASS_DEPENDENCY(SlotIndexes) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_END(VirtRegRewriter, "virtregrewriter", + "Virtual Register Rewriter", false, false) + +char VirtRegRewriter::ID = 0; + +void VirtRegRewriter::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<LiveIntervals>(); + AU.addRequired<SlotIndexes>(); + AU.addPreserved<SlotIndexes>(); + AU.addRequired<LiveDebugVariables>(); + AU.addRequired<VirtRegMap>(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +bool VirtRegRewriter::runOnMachineFunction(MachineFunction &fn) { + MF = &fn; + TM = &MF->getTarget(); + TRI = TM->getRegisterInfo(); + TII = TM->getInstrInfo(); + MRI = &MF->getRegInfo(); + Indexes = &getAnalysis<SlotIndexes>(); + LIS = &getAnalysis<LiveIntervals>(); + VRM = &getAnalysis<VirtRegMap>(); DEBUG(dbgs() << "********** REWRITE VIRTUAL REGISTERS **********\n" << "********** Function: " << MF->getFunction()->getName() << '\n'); - DEBUG(dump()); + DEBUG(VRM->dump()); + + // Add kill flags while we still have virtual registers. + LIS->addKillFlags(); + + // Live-in lists on basic blocks are required for physregs. + addMBBLiveIns(); + + // Rewrite virtual registers. + rewrite(); + + // Write out new DBG_VALUE instructions. + getAnalysis<LiveDebugVariables>().emitDebugValues(VRM); + + // All machine operands and other references to virtual registers have been + // replaced. Remove the virtual registers and release all the transient data. + VRM->clearAllVirt(); + MRI->clearVirtRegs(); + return true; +} + +// Compute MBB live-in lists from virtual register live ranges and their +// assignments. +void VirtRegRewriter::addMBBLiveIns() { + SmallVector<MachineBasicBlock*, 16> LiveIn; + for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) { + unsigned VirtReg = TargetRegisterInfo::index2VirtReg(Idx); + if (MRI->reg_nodbg_empty(VirtReg)) + continue; + LiveInterval &LI = LIS->getInterval(VirtReg); + if (LI.empty() || LIS->intervalIsInOneMBB(LI)) + continue; + // This is a virtual register that is live across basic blocks. Its + // assigned PhysReg must be marked as live-in to those blocks. + unsigned PhysReg = VRM->getPhys(VirtReg); + assert(PhysReg != VirtRegMap::NO_PHYS_REG && "Unmapped virtual register."); + + // Scan the segments of LI. 
+ for (LiveInterval::const_iterator I = LI.begin(), E = LI.end(); I != E; + ++I) { + if (!Indexes->findLiveInMBBs(I->start, I->end, LiveIn)) + continue; + for (unsigned i = 0, e = LiveIn.size(); i != e; ++i) + if (!LiveIn[i]->isLiveIn(PhysReg)) + LiveIn[i]->addLiveIn(PhysReg); + LiveIn.clear(); + } + } +} + +void VirtRegRewriter::rewrite() { SmallVector<unsigned, 8> SuperDeads; SmallVector<unsigned, 8> SuperDefs; SmallVector<unsigned, 8> SuperKills; @@ -135,8 +275,9 @@ void VirtRegMap::rewrite(SlotIndexes *Indexes) { if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) continue; unsigned VirtReg = MO.getReg(); - unsigned PhysReg = getPhys(VirtReg); - assert(PhysReg != NO_PHYS_REG && "Instruction uses unmapped VirtReg"); + unsigned PhysReg = VRM->getPhys(VirtReg); + assert(PhysReg != VirtRegMap::NO_PHYS_REG && + "Instruction uses unmapped VirtReg"); assert(!Reserved.test(PhysReg) && "Reserved register assignment"); // Preserve semantics of sub-register operands. @@ -207,31 +348,3 @@ void VirtRegMap::rewrite(SlotIndexes *Indexes) { if (!MRI->reg_nodbg_empty(Reg)) MRI->setPhysRegUsed(Reg); } - -void VirtRegMap::print(raw_ostream &OS, const Module* M) const { - const TargetRegisterInfo* TRI = MF->getTarget().getRegisterInfo(); - const MachineRegisterInfo &MRI = MF->getRegInfo(); - - OS << "********** REGISTER MAP **********\n"; - for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); - if (Virt2PhysMap[Reg] != (unsigned)VirtRegMap::NO_PHYS_REG) { - OS << '[' << PrintReg(Reg, TRI) << " -> " - << PrintReg(Virt2PhysMap[Reg], TRI) << "] " - << MRI.getRegClass(Reg)->getName() << "\n"; - } - } - - for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); - if (Virt2StackSlotMap[Reg] != VirtRegMap::NO_STACK_SLOT) { - OS << '[' << PrintReg(Reg, TRI) << " -> fi#" << Virt2StackSlotMap[Reg] - << "] " << MRI.getRegClass(Reg)->getName() << "\n"; - } - } - OS << '\n'; -} - -void VirtRegMap::dump() const { - print(dbgs()); -} diff --git a/lib/CodeGen/VirtRegMap.h b/lib/CodeGen/VirtRegMap.h index 8cac311..c320985 100644 --- a/lib/CodeGen/VirtRegMap.h +++ b/lib/CodeGen/VirtRegMap.h @@ -177,13 +177,6 @@ namespace llvm { /// the specified stack slot void assignVirt2StackSlot(unsigned virtReg, int frameIndex); - /// rewrite - Rewrite all instructions in MF to use only physical registers - /// by mapping all virtual register operands to their assigned physical - /// registers. - /// - /// @param Indexes Optionally remove deleted instructions from indexes. 
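Aside: stripped of the machine-IR plumbing, the rewrite step in VirtRegRewriter::rewrite() above is a single map lookup per register operand. A toy, self-contained version with stand-in types (not the LLVM classes):

  #include <cassert>
  #include <vector>

  // Stand-ins for MachineOperand and VirtRegMap; illustrative only.
  struct Operand { bool IsVirt; unsigned Reg; };
  struct VRMapLike {
    std::vector<unsigned> Virt2Phys;              // 0 means "unmapped"
    unsigned getPhys(unsigned V) const { return Virt2Phys[V]; }
  };

  void rewriteOperands(std::vector<Operand> &Ops, const VRMapLike &VRM) {
    for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
      if (!Ops[i].IsVirt)
        continue;                                 // physical regs stay put
      unsigned Phys = VRM.getPhys(Ops[i].Reg);
      assert(Phys != 0 && "Instruction uses unmapped VirtReg");
      Ops[i].Reg = Phys;                          // rewrite virt -> phys
      Ops[i].IsVirt = false;
    }
  }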
- void rewrite(SlotIndexes *Indexes); - void print(raw_ostream &OS, const Module* M = 0) const; void dump() const; }; diff --git a/lib/DebugInfo/DWARFCompileUnit.cpp b/lib/DebugInfo/DWARFCompileUnit.cpp index 24bf97f..b27d57b 100644 --- a/lib/DebugInfo/DWARFCompileUnit.cpp +++ b/lib/DebugInfo/DWARFCompileUnit.cpp @@ -82,7 +82,7 @@ void DWARFCompileUnit::clear() { Abbrevs = 0; AddrSize = 0; BaseAddr = 0; - DieArray.clear(); + clearDIEs(false); } void DWARFCompileUnit::dump(raw_ostream &OS) { @@ -97,6 +97,13 @@ void DWARFCompileUnit::dump(raw_ostream &OS) { getCompileUnitDIE(false)->dump(OS, this, -1U); } +const char *DWARFCompileUnit::getCompilationDir() { + extractDIEsIfNeeded(true); + if (DieArray.empty()) + return 0; + return DieArray[0].getAttributeValueAsString(this, DW_AT_comp_dir, 0); +} + void DWARFCompileUnit::setDIERelations() { if (DieArray.empty()) return; @@ -201,7 +208,7 @@ size_t DWARFCompileUnit::extractDIEsIfNeeded(bool cu_die_only) { } void DWARFCompileUnit::clearDIEs(bool keep_compile_unit_die) { - if (DieArray.size() > 1) { + if (DieArray.size() > (unsigned)keep_compile_unit_die) { // std::vectors never get any smaller when resized to a smaller size, // or when clear() or erase() are called, the size will report that it // is smaller, but the memory allocated remains intact (call capacity() @@ -227,8 +234,8 @@ DWARFCompileUnit::buildAddressRangeTable(DWARFDebugAranges *debug_aranges, // all compile units to stay loaded when they weren't needed. So we can end // up parsing the DWARF and then throwing them all away to keep memory usage // down. - const bool clear_dies = extractDIEsIfNeeded(false) > 1; - + const bool clear_dies = extractDIEsIfNeeded(false) > 1 && + clear_dies_if_already_not_parsed; DieArray[0].buildAddressRangeTable(this, debug_aranges); // Keep memory down by clearing DIEs if this generate function @@ -236,3 +243,13 @@ DWARFCompileUnit::buildAddressRangeTable(DWARFDebugAranges *debug_aranges, if (clear_dies) clearDIEs(true); } + +const DWARFDebugInfoEntryMinimal* +DWARFCompileUnit::getFunctionDIEForAddress(int64_t address) { + extractDIEsIfNeeded(false); + for (size_t i = 0, n = DieArray.size(); i != n; i++) { + if (DieArray[i].addressRangeContainsAddress(this, address)) + return &DieArray[i]; + } + return 0; +} diff --git a/lib/DebugInfo/DWARFCompileUnit.h b/lib/DebugInfo/DWARFCompileUnit.h index d916729..b34a596 100644 --- a/lib/DebugInfo/DWARFCompileUnit.h +++ b/lib/DebugInfo/DWARFCompileUnit.h @@ -43,7 +43,7 @@ public: const DWARFAbbreviationDeclarationSet *abbrevs); /// extractDIEsIfNeeded - Parses a compile unit and indexes its DIEs if it - /// hasn't already been done. + /// hasn't already been done. Returns the number of DIEs parsed at this call. size_t extractDIEsIfNeeded(bool cu_die_only); void clear(); void dump(raw_ostream &OS); @@ -78,6 +78,8 @@ public: return &DieArray[0]; } + const char *getCompilationDir(); + /// setDIERelations - We read in all of the DIE entries into our flat list /// of DIE entries and now we need to go back through all of them and set the /// parent, sibling and child pointers for quick DIE navigation. @@ -104,6 +106,11 @@ public: void buildAddressRangeTable(DWARFDebugAranges *debug_aranges, bool clear_dies_if_already_not_parsed); + /// getFunctionDIEForAddress - Returns pointer to parsed subprogram DIE, + /// address ranges of which contain the provided address, + /// or NULL if there is no such subprogram. The pointer + /// is valid until DWARFCompileUnit::clear() or clearDIEs() is called. 
+ const DWARFDebugInfoEntryMinimal *getFunctionDIEForAddress(int64_t address); }; } diff --git a/lib/DebugInfo/DWARFContext.cpp b/lib/DebugInfo/DWARFContext.cpp index dccadc4..a4e0d8e 100644 --- a/lib/DebugInfo/DWARFContext.cpp +++ b/lib/DebugInfo/DWARFContext.cpp @@ -8,8 +8,10 @@ //===----------------------------------------------------------------------===// #include "DWARFContext.h" +#include "llvm/ADT/SmallString.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/Format.h" +#include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> using namespace llvm; @@ -140,30 +142,66 @@ DWARFCompileUnit *DWARFContext::getCompileUnitForOffset(uint32_t offset) { return 0; } -DILineInfo DWARFContext::getLineInfoForAddress(uint64_t address) { +DILineInfo DWARFContext::getLineInfoForAddress(uint64_t address, + DILineInfoSpecifier specifier) { // First, get the offset of the compile unit. uint32_t cuOffset = getDebugAranges()->findAddress(address); // Retrieve the compile unit. DWARFCompileUnit *cu = getCompileUnitForOffset(cuOffset); if (!cu) - return DILineInfo("<invalid>", 0, 0); - // Get the line table for this compile unit. - const DWARFDebugLine::LineTable *lineTable = getLineTableForCompileUnit(cu); - if (!lineTable) - return DILineInfo("<invalid>", 0, 0); - // Get the index of the row we're looking for in the line table. - uint64_t hiPC = - cu->getCompileUnitDIE()->getAttributeValueAsUnsigned(cu, DW_AT_high_pc, - -1ULL); - uint32_t rowIndex = lineTable->lookupAddress(address, hiPC); - if (rowIndex == -1U) - return DILineInfo("<invalid>", 0, 0); - - // From here, contruct the DILineInfo. - const DWARFDebugLine::Row &row = lineTable->Rows[rowIndex]; - const std::string &fileName = lineTable->Prologue.FileNames[row.File-1].Name; - - return DILineInfo(fileName.c_str(), row.Line, row.Column); + return DILineInfo(); + SmallString<16> fileName("<invalid>"); + SmallString<16> functionName("<invalid>"); + uint32_t line = 0; + uint32_t column = 0; + if (specifier.needs(DILineInfoSpecifier::FunctionName)) { + const DWARFDebugInfoEntryMinimal *function_die = + cu->getFunctionDIEForAddress(address); + if (function_die) { + if (const char *name = function_die->getSubprogramName(cu)) + functionName = name; + } + } + if (specifier.needs(DILineInfoSpecifier::FileLineInfo)) { + // Get the line table for this compile unit. + const DWARFDebugLine::LineTable *lineTable = getLineTableForCompileUnit(cu); + if (lineTable) { + // Get the index of the row we're looking for in the line table. + uint64_t hiPC = cu->getCompileUnitDIE()->getAttributeValueAsUnsigned( + cu, DW_AT_high_pc, -1ULL); + uint32_t rowIndex = lineTable->lookupAddress(address, hiPC); + if (rowIndex != -1U) { + const DWARFDebugLine::Row &row = lineTable->Rows[rowIndex]; + // Take file/line info from the line table. + const DWARFDebugLine::FileNameEntry &fileNameEntry = + lineTable->Prologue.FileNames[row.File - 1]; + fileName = fileNameEntry.Name; + if (specifier.needs(DILineInfoSpecifier::AbsoluteFilePath) && + sys::path::is_relative(fileName.str())) { + // Append include directory of file (if it is present in line table) + // and compilation directory of compile unit to make path absolute. 
+ const char *includeDir = 0; + if (uint64_t includeDirIndex = fileNameEntry.DirIdx) { + includeDir = lineTable->Prologue + .IncludeDirectories[includeDirIndex - 1]; + } + SmallString<16> absFileName; + if (includeDir == 0 || sys::path::is_relative(includeDir)) { + if (const char *compilationDir = cu->getCompilationDir()) + sys::path::append(absFileName, compilationDir); + } + if (includeDir) { + sys::path::append(absFileName, includeDir); + } + sys::path::append(absFileName, fileName.str()); + fileName = absFileName; + } + line = row.Line; + column = row.Column; + } + } + } + return DILineInfo(fileName, functionName, line, column); } void DWARFContextInMemory::anchor() { } diff --git a/lib/DebugInfo/DWARFContext.h b/lib/DebugInfo/DWARFContext.h index d2e763a..e55a27e 100644 --- a/lib/DebugInfo/DWARFContext.h +++ b/lib/DebugInfo/DWARFContext.h @@ -66,7 +66,8 @@ public: const DWARFDebugLine::LineTable * getLineTableForCompileUnit(DWARFCompileUnit *cu); - virtual DILineInfo getLineInfoForAddress(uint64_t address); + virtual DILineInfo getLineInfoForAddress(uint64_t address, + DILineInfoSpecifier specifier = DILineInfoSpecifier()); bool isLittleEndian() const { return IsLittleEndian; } diff --git a/lib/DebugInfo/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARFDebugAranges.cpp index 1788145..ef470e5 100644 --- a/lib/DebugInfo/DWARFDebugAranges.cpp +++ b/lib/DebugInfo/DWARFDebugAranges.cpp @@ -93,6 +93,7 @@ bool DWARFDebugAranges::generate(DWARFContext *ctx) { cu->buildAddressRangeTable(this, true); } } + sort(true, /* overlap size */ 0); return !isEmpty(); } @@ -221,4 +222,3 @@ bool DWARFDebugAranges::getMaxRange(uint64_t &LoPC, uint64_t &HiPC) const { HiPC = Aranges.back().HiPC(); return true; } - diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARFDebugInfoEntry.cpp index 236db97..429a36c 100644 --- a/lib/DebugInfo/DWARFDebugInfoEntry.cpp +++ b/lib/DebugInfo/DWARFDebugInfoEntry.cpp @@ -440,3 +440,54 @@ DWARFDebugInfoEntryMinimal::buildAddressRangeTable(const DWARFCompileUnit *cu, } } } + +bool +DWARFDebugInfoEntryMinimal::addressRangeContainsAddress( + const DWARFCompileUnit *cu, const uint64_t address) const { + if (!isNULL() && getTag() == DW_TAG_subprogram) { + uint64_t hi_pc = -1ULL; + uint64_t lo_pc = getAttributeValueAsUnsigned(cu, DW_AT_low_pc, -1ULL); + if (lo_pc != -1ULL) + hi_pc = getAttributeValueAsUnsigned(cu, DW_AT_high_pc, -1ULL); + if (hi_pc != -1ULL) { + return (lo_pc <= address && address < hi_pc); + } + } + return false; +} + +const char* +DWARFDebugInfoEntryMinimal::getSubprogramName( + const DWARFCompileUnit *cu) const { + if (isNULL() || getTag() != DW_TAG_subprogram) + return 0; + // Try to get mangled name if possible. + if (const char *name = + getAttributeValueAsString(cu, DW_AT_MIPS_linkage_name, 0)) + return name; + if (const char *name = getAttributeValueAsString(cu, DW_AT_linkage_name, 0)) + return name; + if (const char *name = getAttributeValueAsString(cu, DW_AT_name, 0)) + return name; + // Try to get name from specification DIE. + uint32_t spec_ref = + getAttributeValueAsReference(cu, DW_AT_specification, -1U); + if (spec_ref != -1U) { + DWARFDebugInfoEntryMinimal spec_die; + if (spec_die.extract(cu, &spec_ref)) { + if (const char *name = spec_die.getSubprogramName(cu)) + return name; + } + } + // Try to get name from abstract origin DIE. 
+ uint32_t abs_origin_ref = + getAttributeValueAsReference(cu, DW_AT_abstract_origin, -1U); + if (abs_origin_ref != -1U) { + DWARFDebugInfoEntryMinimal abs_origin_die; + if (abs_origin_die.extract(cu, &abs_origin_ref)) { + if (const char *name = abs_origin_die.getSubprogramName(cu)) + return name; + } + } + return 0; +} diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.h b/lib/DebugInfo/DWARFDebugInfoEntry.h index 37b3bcd..d5d86b9 100644 --- a/lib/DebugInfo/DWARFDebugInfoEntry.h +++ b/lib/DebugInfo/DWARFDebugInfoEntry.h @@ -128,6 +128,15 @@ public: void buildAddressRangeTable(const DWARFCompileUnit *cu, DWARFDebugAranges *debug_aranges) const; + + bool addressRangeContainsAddress(const DWARFCompileUnit *cu, + const uint64_t address) const; + + // If a DIE represents a subprogram, returns its mangled name + // (or short name, if mangled is missing). This name may be fetched + // from specification or abstract origin for this subprogram. + // Returns null if no name is found. + const char* getSubprogramName(const DWARFCompileUnit *cu) const; }; } diff --git a/lib/DebugInfo/DWARFDebugLine.h b/lib/DebugInfo/DWARFDebugLine.h index bc6a70b..a8c0669 100644 --- a/lib/DebugInfo/DWARFDebugLine.h +++ b/lib/DebugInfo/DWARFDebugLine.h @@ -12,7 +12,6 @@ #include "llvm/Support/DataExtractor.h" #include <map> -#include <string> #include <vector> namespace llvm { @@ -22,9 +21,9 @@ class raw_ostream; class DWARFDebugLine { public: struct FileNameEntry { - FileNameEntry() : DirIdx(0), ModTime(0), Length(0) {} + FileNameEntry() : Name(0), DirIdx(0), ModTime(0), Length(0) {} - std::string Name; + const char *Name; uint64_t DirIdx; uint64_t ModTime; uint64_t Length; @@ -56,7 +55,7 @@ public: // The number assigned to the first special opcode. uint8_t OpcodeBase; std::vector<uint8_t> StandardOpcodeLengths; - std::vector<std::string> IncludeDirectories; + std::vector<const char*> IncludeDirectories; std::vector<FileNameEntry> FileNames; // Length of the prologue in bytes. 
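Aside: getSubprogramName() above encodes a fixed preference order when naming a subprogram DIE. A self-contained sketch over a hypothetical DIE struct (the real code resolves DWARF attribute references rather than following pointers):

  struct DIE {
    const char *LinkageName;   // DW_AT_MIPS_linkage_name / DW_AT_linkage_name
    const char *Name;          // DW_AT_name
    const DIE *Specification;  // DW_AT_specification
    const DIE *AbstractOrigin; // DW_AT_abstract_origin
  };

  const char *subprogramName(const DIE *D) {
    if (!D) return 0;
    if (D->LinkageName) return D->LinkageName;    // prefer the mangled name
    if (D->Name) return D->Name;                  // then the short name
    if (const char *N = subprogramName(D->Specification))
      return N;                                   // then the declaration DIE
    return subprogramName(D->AbstractOrigin);     // then the inlined origin
  }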
diff --git a/lib/ExecutionEngine/EventListenerCommon.h b/lib/ExecutionEngine/EventListenerCommon.h index 1c07c94..911d1d6 100644 --- a/lib/ExecutionEngine/EventListenerCommon.h +++ b/lib/ExecutionEngine/EventListenerCommon.h @@ -14,8 +14,8 @@ #ifndef EVENT_LISTENER_COMMON_H #define EVENT_LISTENER_COMMON_H +#include "llvm/DebugInfo.h" #include "llvm/Metadata.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/Path.h" diff --git a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp index 5dfa78f..c11c17e 100644 --- a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp +++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp @@ -16,11 +16,11 @@ #include "llvm/ExecutionEngine/JITEventListener.h" #define DEBUG_TYPE "amplifier-jit-event-listener" +#include "llvm/DebugInfo.h" #include "llvm/Function.h" #include "llvm/Metadata.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/OwningPtr.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/ExecutionEngine/IntelJITEventsWrapper.h" #include "llvm/Support/Debug.h" @@ -138,7 +138,7 @@ void IntelJITEventListener::NotifyFunctionEmitted( // the first instruction that has one if (FunctionMessage.source_file_name == 0) { MDNode *scope = I->Loc.getScope( - Details.MF->getFunction()->getContext()); + Details.MF->getFunction()->getContext()); FunctionMessage.source_file_name = const_cast<char*>( Filenames.getFullPath(scope)); } @@ -152,7 +152,7 @@ void IntelJITEventListener::NotifyFunctionEmitted( } Wrapper.iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, - &FunctionMessage); + &FunctionMessage); MethodIDs[FnStart] = FunctionMessage.method_id; } diff --git a/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt b/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt index 80d2273..9c06fda 100644 --- a/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt +++ b/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt @@ -18,6 +18,6 @@ [common] [component_0] -type = Library +type = OptionalLibrary name = IntelJITEvents parent = ExecutionEngine diff --git a/lib/ExecutionEngine/Interpreter/CMakeLists.txt b/lib/ExecutionEngine/Interpreter/CMakeLists.txt index d331f83..74df8f0 100644 --- a/lib/ExecutionEngine/Interpreter/CMakeLists.txt +++ b/lib/ExecutionEngine/Interpreter/CMakeLists.txt @@ -15,3 +15,5 @@ add_llvm_library(LLVMInterpreter if( LLVM_ENABLE_FFI ) target_link_libraries( LLVMInterpreter ${FFI_LIBRARY_PATH} ) endif() + +add_dependencies(LLVMInterpreter intrinsics_gen) diff --git a/lib/ExecutionEngine/Interpreter/Execution.cpp b/lib/ExecutionEngine/Interpreter/Execution.cpp index af47be9..5202b09 100644 --- a/lib/ExecutionEngine/Interpreter/Execution.cpp +++ b/lib/ExecutionEngine/Interpreter/Execution.cpp @@ -651,11 +651,40 @@ void Interpreter::visitSwitchInst(SwitchInst &I) { // Check to see if any of the cases match... BasicBlock *Dest = 0; for (SwitchInst::CaseIt i = I.case_begin(), e = I.case_end(); i != e; ++i) { - GenericValue CaseVal = getOperandValue(i.getCaseValue(), SF); - if (executeICMP_EQ(CondVal, CaseVal, ElTy).IntVal != 0) { - Dest = cast<BasicBlock>(i.getCaseSuccessor()); - break; + IntegersSubset& Case = i.getCaseValueEx(); + if (Case.isSingleNumber()) { + // FIXME: Currently work with ConstantInt based numbers. 
+ const ConstantInt *CI = Case.getSingleNumber(0).toConstantInt(); + GenericValue Val = getOperandValue(const_cast<ConstantInt*>(CI), SF); + if (executeICMP_EQ(Val, CondVal, ElTy).IntVal != 0) { + Dest = cast<BasicBlock>(i.getCaseSuccessor()); + break; + } } + if (Case.isSingleNumbersOnly()) { + for (unsigned n = 0, en = Case.getNumItems(); n != en; ++n) { + // FIXME: Currently work with ConstantInt based numbers. + const ConstantInt *CI = Case.getSingleNumber(n).toConstantInt(); + GenericValue Val = getOperandValue(const_cast<ConstantInt*>(CI), SF); + if (executeICMP_EQ(Val, CondVal, ElTy).IntVal != 0) { + Dest = cast<BasicBlock>(i.getCaseSuccessor()); + break; + } + } + } else + for (unsigned n = 0, en = Case.getNumItems(); n != en; ++n) { + IntegersSubset::Range r = Case.getItem(n); + // FIXME: Currently work with ConstantInt based numbers. + const ConstantInt *LowCI = r.getLow().toConstantInt(); + const ConstantInt *HighCI = r.getHigh().toConstantInt(); + GenericValue Low = getOperandValue(const_cast<ConstantInt*>(LowCI), SF); + GenericValue High = getOperandValue(const_cast<ConstantInt*>(HighCI), SF); + if (executeICMP_ULE(Low, CondVal, ElTy).IntVal != 0 && + executeICMP_ULE(CondVal, High, ElTy).IntVal != 0) { + Dest = cast<BasicBlock>(i.getCaseSuccessor()); + break; + } + } } if (!Dest) Dest = I.getDefaultDest(); // No cases matched: use default SwitchToNewBasicBlock(Dest, SF); diff --git a/lib/ExecutionEngine/JIT/JITEmitter.cpp b/lib/ExecutionEngine/JIT/JITEmitter.cpp index 504c8bd..ff3a9dc 100644 --- a/lib/ExecutionEngine/JIT/JITEmitter.cpp +++ b/lib/ExecutionEngine/JIT/JITEmitter.cpp @@ -17,9 +17,9 @@ #include "JITDwarfEmitter.h" #include "llvm/ADT/OwningPtr.h" #include "llvm/Constants.h" -#include "llvm/Module.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" -#include "llvm/Analysis/DebugInfo.h" +#include "llvm/Module.h" #include "llvm/CodeGen/JITCodeEmitter.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineCodeInfo.h" @@ -108,13 +108,18 @@ namespace { /// particular GlobalVariable so that we can reuse them if necessary. GlobalToIndirectSymMapTy GlobalToIndirectSymMap; +#ifndef NDEBUG /// Instance of the JIT this ResolverState serves. JIT *TheJIT; +#endif public: JITResolverState(JIT *jit) : FunctionToLazyStubMap(this), - FunctionToCallSitesMap(this), - TheJIT(jit) {} + FunctionToCallSitesMap(this) { +#ifndef NDEBUG + TheJIT = jit; +#endif + } FunctionToLazyStubMapTy& getFunctionToLazyStubMap( const MutexGuard& locked) { diff --git a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp index 2d1775c..7be6ef8 100644 --- a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp +++ b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp @@ -852,7 +852,7 @@ static int jit_noop() { /// for resolving library symbols, not code generated symbols. /// void *DefaultJITMemoryManager::getPointerToNamedFunction(const std::string &Name, - bool AbortOnFailure) { + bool AbortOnFailure) { // Check to see if this is one of the functions we want to intercept. Note, // we cast to intptr_t here to silence a -pedantic warning that complains // about casting a function pointer to a normal pointer. diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp index 44f89cf..84274c0 100644 --- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp +++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp @@ -45,7 +45,7 @@ ExecutionEngine *MCJIT::createJIT(Module *M, // If the target supports JIT code generation, create the JIT. 
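// Ownership note (inferred from the MCJITMemoryManager.h change further
// down, not stated explicitly in the patch): the MCJITMemoryManager
// constructed below takes sole ownership of the caller-supplied
// JITMemoryManager via OwningPtr, so clients must not delete the JMM they
// pass in.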
if (TargetJITInfo *TJ = TM->getJITInfo()) - return new MCJIT(M, TM, *TJ, new MCJITMemoryManager(JMM, M), GVsWithCode); + return new MCJIT(M, TM, *TJ, new MCJITMemoryManager(JMM), GVsWithCode); if (ErrorStr) *ErrorStr = "target does not support JIT code generation"; @@ -217,7 +217,7 @@ GenericValue MCJIT::runFunction(Function *F, } void *MCJIT::getPointerToNamedFunction(const std::string &Name, - bool AbortOnFailure){ + bool AbortOnFailure) { if (!isSymbolSearchingDisabled() && MemMgr) { void *ptr = MemMgr->getPointerToNamedFunction(Name, false); if (ptr) @@ -231,7 +231,7 @@ void *MCJIT::getPointerToNamedFunction(const std::string &Name, if (AbortOnFailure) { report_fatal_error("Program used external function '"+Name+ - "' which could not be resolved!"); + "' which could not be resolved!"); } return 0; } diff --git a/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h b/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h index a68949a..441aaeb 100644 --- a/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h +++ b/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h @@ -22,24 +22,20 @@ namespace llvm { // matching LLVM IR counterparts in the module(s) being compiled. class MCJITMemoryManager : public RTDyldMemoryManager { virtual void anchor(); - JITMemoryManager *JMM; + OwningPtr<JITMemoryManager> JMM; - // FIXME: Multiple modules. - Module *M; public: - MCJITMemoryManager(JITMemoryManager *jmm, Module *m) : - JMM(jmm?jmm:JITMemoryManager::CreateDefaultMemManager()), M(m) {} - // We own the JMM, so make sure to delete it. - ~MCJITMemoryManager() { delete JMM; } + MCJITMemoryManager(JITMemoryManager *jmm) : + JMM(jmm?jmm:JITMemoryManager::CreateDefaultMemManager()) {} uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment, unsigned SectionID) { - return JMM->allocateSpace(Size, Alignment); + return JMM->allocateDataSection(Size, Alignment, SectionID); } uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment, unsigned SectionID) { - return JMM->allocateSpace(Size, Alignment); + return JMM->allocateCodeSection(Size, Alignment, SectionID); } virtual void *getPointerToNamedFunction(const std::string &Name, diff --git a/lib/ExecutionEngine/OProfileJIT/LLVMBuild.txt b/lib/ExecutionEngine/OProfileJIT/LLVMBuild.txt index 4516dfa..e30516e 100644 --- a/lib/ExecutionEngine/OProfileJIT/LLVMBuild.txt +++ b/lib/ExecutionEngine/OProfileJIT/LLVMBuild.txt @@ -18,6 +18,6 @@ [common] [component_0] -type = Library +type = OptionalLibrary name = OProfileJIT parent = ExecutionEngine diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp index e6142e3..6b8e9d1 100644 --- a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp +++ b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp @@ -16,9 +16,9 @@ #include "llvm/ExecutionEngine/JITEventListener.h" #define DEBUG_TYPE "oprofile-jit-event-listener" +#include "llvm/DebugInfo.h" #include "llvm/Function.h" #include "llvm/ADT/OwningPtr.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/ExecutionEngine/OProfileWrapper.h" #include "llvm/Support/Debug.h" diff --git a/lib/ExecutionEngine/RuntimeDyld/ObjectImage.h b/lib/ExecutionEngine/RuntimeDyld/ObjectImage.h index 8206ead..c3e3572 100644 --- a/lib/ExecutionEngine/RuntimeDyld/ObjectImage.h +++ b/lib/ExecutionEngine/RuntimeDyld/ObjectImage.h @@ -48,7 +48,7 @@ public: virtual void updateSymbolAddress(const object::SymbolRef &Sym, uint64_t Addr) {} - // Subclasses can override this 
method to provide JIT debugging support + // Subclasses can override these methods to provide JIT debugging support virtual void registerWithDebugger() {} virtual void deregisterWithDebugger() {} }; diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index 1b1840a..b464040 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -39,7 +39,7 @@ namespace { // Resolve the relocations for all symbols we currently know about. void RuntimeDyldImpl::resolveRelocations() { // First, resolve relocations associated with external symbols. - resolveSymbols(); + resolveExternalSymbols(); // Just iterate over the sections we have and resolve all the relocations // in them. Gross overkill, but it gets the job done. @@ -59,8 +59,8 @@ void RuntimeDyldImpl::mapSectionAddress(void *LocalAddress, llvm_unreachable("Attempting to remap address of unknown section!"); } -// Subclasses can implement this method to create specialized image instances -// The caller owns the the pointer that is returned. +// Subclasses can implement this method to create specialized image instances. +// The caller owns the pointer that is returned. ObjectImage *RuntimeDyldImpl::createObjectImage(const MemoryBuffer *InputBuffer) { ObjectFile *ObjFile = ObjectFile::createObjectFile(const_cast<MemoryBuffer*> (InputBuffer)); @@ -75,11 +75,15 @@ bool RuntimeDyldImpl::loadObject(const MemoryBuffer *InputBuffer) { Arch = (Triple::ArchType)obj->getArch(); - LocalSymbolMap LocalSymbols; // Functions and data symbols from the - // object file. - ObjSectionToIDMap LocalSections; // Used sections from the object file - CommonSymbolMap CommonSymbols; // Common symbols requiring allocation - uint64_t CommonSize = 0; + // Symbols found in this object + StringMap<SymbolLoc> LocalSymbols; + // Used sections from the object file + ObjSectionToIDMap LocalSections; + + // Common symbols requiring allocation, and the total size required to + // allocate all common symbols. 
+ CommonSymbolMap CommonSymbols; + uint64_t CommonSize = 0; error_code err; // Parse symbols @@ -106,28 +110,29 @@ bool RuntimeDyldImpl::loadObject(const MemoryBuffer *InputBuffer) { if (SymType == object::SymbolRef::ST_Function || SymType == object::SymbolRef::ST_Data) { uint64_t FileOffset; - StringRef sData; + StringRef SectionData; section_iterator si = obj->end_sections(); Check(i->getFileOffset(FileOffset)); Check(i->getSection(si)); if (si == obj->end_sections()) continue; - Check(si->getContents(sData)); + Check(si->getContents(SectionData)); const uint8_t* SymPtr = (const uint8_t*)InputBuffer->getBufferStart() + (uintptr_t)FileOffset; - uintptr_t SectOffset = (uintptr_t)(SymPtr - (const uint8_t*)sData.begin()); + uintptr_t SectOffset = (uintptr_t)(SymPtr - + (const uint8_t*)SectionData.begin()); unsigned SectionID = findOrEmitSection(*obj, *si, SymType == object::SymbolRef::ST_Function, LocalSections); - bool isGlobal = flags & SymbolRef::SF_Global; LocalSymbols[Name.data()] = SymbolLoc(SectionID, SectOffset); DEBUG(dbgs() << "\tFileOffset: " << format("%p", (uintptr_t)FileOffset) << " flags: " << flags << " SID: " << SectionID << " Offset: " << format("%p", SectOffset)); + bool isGlobal = flags & SymbolRef::SF_Global; if (isGlobal) - SymbolTable[Name] = SymbolLoc(SectionID, SectOffset); + GlobalSymbolTable[Name] = SymbolLoc(SectionID, SectOffset); } } DEBUG(dbgs() << "\tType: " << SymType << " Name: " << Name << "\n"); @@ -137,7 +142,7 @@ bool RuntimeDyldImpl::loadObject(const MemoryBuffer *InputBuffer) { if (CommonSize != 0) emitCommonSymbols(*obj, CommonSymbols, CommonSize, LocalSymbols); - // Parse and proccess relocations + // Parse and process relocations DEBUG(dbgs() << "Parse relocations:\n"); for (section_iterator si = obj->begin_sections(), se = obj->end_sections(); si != se; si.increment(err)) { @@ -150,7 +155,7 @@ bool RuntimeDyldImpl::loadObject(const MemoryBuffer *InputBuffer) { e = si->end_relocations(); i != e; i.increment(err)) { Check(err); - // If it's first relocation in this section, find its SectionID + // If it's the first relocation in this section, find its SectionID if (isFirstRelocation) { SectionID = findOrEmitSection(*obj, *si, true, LocalSections); DEBUG(dbgs() << "\tSectionID: " << SectionID << "\n"); @@ -177,10 +182,10 @@ bool RuntimeDyldImpl::loadObject(const MemoryBuffer *InputBuffer) { return false; } -unsigned RuntimeDyldImpl::emitCommonSymbols(ObjectImage &Obj, - const CommonSymbolMap &Map, - uint64_t TotalSize, - LocalSymbolMap &LocalSymbols) { +void RuntimeDyldImpl::emitCommonSymbols(ObjectImage &Obj, + const CommonSymbolMap &CommonSymbols, + uint64_t TotalSize, + SymbolTableMap &SymbolTable) { // Allocate memory for the section unsigned SectionID = Sections.size(); uint8_t *Addr = MemMgr->allocateDataSection(TotalSize, sizeof(void*), @@ -197,18 +202,16 @@ unsigned RuntimeDyldImpl::emitCommonSymbols(ObjectImage &Obj, << "\n"); // Assign the address of each symbol - for (CommonSymbolMap::const_iterator it = Map.begin(), itEnd = Map.end(); - it != itEnd; it++) { - uint64_t Size = it->second; + for (CommonSymbolMap::const_iterator it = CommonSymbols.begin(), + itEnd = CommonSymbols.end(); it != itEnd; it++) { StringRef Name; it->first.getName(Name); Obj.updateSymbolAddress(it->first, (uint64_t)Addr); - LocalSymbols[Name.data()] = SymbolLoc(SectionID, Offset); + SymbolTable[Name.data()] = SymbolLoc(SectionID, Offset); + uint64_t Size = it->second; Offset += Size; Addr += Size; } - - return SectionID; } unsigned 
RuntimeDyldImpl::emitSection(ObjectImage &Obj, @@ -274,8 +277,8 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj, } else { // Even if we didn't load the section, we need to record an entry for it - // to handle later processing (and by 'handle' I mean don't do anything - // with these sections). + // to handle later processing (and by 'handle' I mean don't do anything + // with these sections). Allocate = 0; Addr = 0; DEBUG(dbgs() << "emitSection SectionID: " << SectionID @@ -307,28 +310,26 @@ unsigned RuntimeDyldImpl::findOrEmitSection(ObjectImage &Obj, return SectionID; } -void RuntimeDyldImpl::AddRelocation(const RelocationValueRef &Value, - unsigned SectionID, uintptr_t Offset, - uint32_t RelType) { - DEBUG(dbgs() << "AddRelocation SymNamePtr: " << format("%p", Value.SymbolName) - << " SID: " << Value.SectionID - << " Addend: " << format("%p", Value.Addend) - << " Offset: " << format("%p", Offset) - << " RelType: " << format("%x", RelType) - << "\n"); +void RuntimeDyldImpl::addRelocationForSection(const RelocationEntry &RE, + unsigned SectionID) { + Relocations[SectionID].push_back(RE); +} - if (Value.SymbolName == 0) { - Relocations[Value.SectionID].push_back(RelocationEntry( - SectionID, - Offset, - RelType, - Value.Addend)); - } else - SymbolRelocations[Value.SymbolName].push_back(RelocationEntry( - SectionID, - Offset, - RelType, - Value.Addend)); +void RuntimeDyldImpl::addRelocationForSymbol(const RelocationEntry &RE, + StringRef SymbolName) { + // Relocation by symbol. If the symbol is found in the global symbol table, + // create an appropriate section relocation. Otherwise, add it to + // ExternalSymbolRelocations. + SymbolTableMap::const_iterator Loc = + GlobalSymbolTable.find(SymbolName); + if (Loc == GlobalSymbolTable.end()) { + ExternalSymbolRelocations[SymbolName].push_back(RE); + } else { + // Copy the RE since we want to modify its addend. + RelocationEntry RECopy = RE; + RECopy.Addend += Loc->second.second; + Relocations[Loc->second.first].push_back(RECopy); + } } uint8_t *RuntimeDyldImpl::createStubFunction(uint8_t *Addr) { @@ -369,12 +370,12 @@ void RuntimeDyldImpl::resolveRelocationEntry(const RelocationEntry &RE, uint8_t *Target = Sections[RE.SectionID].Address + RE.Offset; DEBUG(dbgs() << "\tSectionID: " << RE.SectionID << " + " << RE.Offset << " (" << format("%p", Target) << ")" - << " Data: " << RE.Data + << " RelType: " << RE.RelType << " Addend: " << RE.Addend << "\n"); resolveRelocation(Target, Sections[RE.SectionID].LoadAddress + RE.Offset, - Value, RE.Data, RE.Addend); + Value, RE.RelType, RE.Addend); } } @@ -385,16 +386,14 @@ void RuntimeDyldImpl::resolveRelocationList(const RelocationList &Relocs, } } -// resolveSymbols - Resolve any relocations to the specified symbols if -// we know where it lives. -void RuntimeDyldImpl::resolveSymbols() { - StringMap<RelocationList>::iterator i = SymbolRelocations.begin(), - e = SymbolRelocations.end(); +void RuntimeDyldImpl::resolveExternalSymbols() { + StringMap<RelocationList>::iterator i = ExternalSymbolRelocations.begin(), + e = ExternalSymbolRelocations.end(); for (; i != e; i++) { StringRef Name = i->first(); RelocationList &Relocs = i->second; - StringMap<SymbolLoc>::const_iterator Loc = SymbolTable.find(Name); - if (Loc == SymbolTable.end()) { + SymbolTableMap::const_iterator Loc = GlobalSymbolTable.find(Name); + if (Loc == GlobalSymbolTable.end()) { // This is an external symbol, try to get it address from // MemoryManager. 
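// Resolution sketch: a name absent from GlobalSymbolTable is assumed to
// live outside the loaded modules, so the memory manager supplies its
// address (e.g. a dlsym-style lookup), and every relocation queued under
// that name in ExternalSymbolRelocations is then patched in one pass by
// resolveRelocationList().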
uint8_t *Addr = (uint8_t*) MemMgr->getPointerToNamedFunction(Name.data(), @@ -404,15 +403,7 @@ << "\n"); resolveRelocationList(Relocs, (uintptr_t)Addr); } else { - // Change the relocation to be section relative rather than symbol - // relative and move it to the resolved relocation list. - DEBUG(dbgs() << "Resolving symbol '" << Name << "'\n"); - for (int i = 0, e = Relocs.size(); i != e; ++i) { - RelocationEntry Entry = Relocs[i]; - Entry.Addend += Loc->second.second; - Relocations[Loc->second.first].push_back(Entry); - } - Relocs.clear(); + report_fatal_error("Expected external symbol"); } } } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index db6da8c..39aed34 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -334,28 +334,31 @@ void RuntimeDyldELF::resolveRelocation(uint8_t *LocalAddress, void RuntimeDyldELF::processRelocationRef(const ObjRelocationInfo &Rel, ObjectImage &Obj, ObjSectionToIDMap &ObjSectionToID, - LocalSymbolMap &Symbols, + const SymbolTableMap &Symbols, StubMap &Stubs) { uint32_t RelType = (uint32_t)(Rel.Type & 0xffffffffL); intptr_t Addend = (intptr_t)Rel.AdditionalInfo; - RelocationValueRef Value; - StringRef TargetName; const SymbolRef &Symbol = Rel.Symbol; + + // Obtain the symbol name which is referenced in the relocation + StringRef TargetName; Symbol.getName(TargetName); DEBUG(dbgs() << "\t\tRelType: " << RelType << " Addend: " << Addend << " TargetName: " << TargetName << "\n"); - // First look the symbol in object file symbols. - LocalSymbolMap::iterator lsi = Symbols.find(TargetName.data()); + RelocationValueRef Value; + // First search for the symbol in the local symbol table + SymbolTableMap::const_iterator lsi = Symbols.find(TargetName.data()); if (lsi != Symbols.end()) { Value.SectionID = lsi->second.first; Value.Addend = lsi->second.second; } else { - // Second look the symbol in global symbol table. - StringMap<SymbolLoc>::iterator gsi = SymbolTable.find(TargetName.data()); - if (gsi != SymbolTable.end()) { + // Search for the symbol in the global symbol table + SymbolTableMap::const_iterator gsi = + GlobalSymbolTable.find(TargetName.data()); + if (gsi != GlobalSymbolTable.end()) { Value.SectionID = gsi->second.first; Value.Addend = gsi->second.second; } else { @@ -366,7 +369,7 @@ void RuntimeDyldELF::processRelocationRef(const ObjRelocationInfo &Rel, // TODO: Now ELF SymbolRef::ST_Debug = STT_SECTION; this is not obvious // and can be changed by other developers. Maybe the best way is to add // a new symbol type ST_Section to SymbolRef and use it.
- section_iterator si = Obj.end_sections(); + section_iterator si(Obj.end_sections()); Symbol.getSection(si); if (si == Obj.end_sections()) llvm_unreachable("Symbol section not found, bad object file format!"); @@ -410,14 +413,24 @@ void RuntimeDyldELF::processRelocationRef(const ObjRelocationInfo &Rel, Stubs[Value] = Section.StubOffset; uint8_t *StubTargetAddr = createStubFunction(Section.Address + Section.StubOffset); - AddRelocation(Value, Rel.SectionID, - StubTargetAddr - Section.Address, ELF::R_ARM_ABS32); + RelocationEntry RE(Rel.SectionID, StubTargetAddr - Section.Address, + ELF::R_ARM_ABS32, Value.Addend); + if (Value.SymbolName) + addRelocationForSymbol(RE, Value.SymbolName); + else + addRelocationForSection(RE, Value.SectionID); + resolveRelocation(Target, (uint64_t)Target, (uint64_t)Section.Address + Section.StubOffset, RelType, 0); Section.StubOffset += getMaxStubSize(); } - } else - AddRelocation(Value, Rel.SectionID, Rel.Offset, RelType); + } else { + RelocationEntry RE(Rel.SectionID, Rel.Offset, RelType, Value.Addend); + if (Value.SymbolName) + addRelocationForSymbol(RE, Value.SymbolName); + else + addRelocationForSection(RE, Value.SectionID); + } } bool RuntimeDyldELF::isCompatibleFormat(const MemoryBuffer *InputBuffer) const { diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h index e7f6fab..e413f78 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h @@ -51,7 +51,8 @@ protected: virtual void processRelocationRef(const ObjRelocationInfo &Rel, ObjectImage &Obj, ObjSectionToIDMap &ObjSectionToID, - LocalSymbolMap &Symbols, StubMap &Stubs); + const SymbolTableMap &Symbols, + StubMap &Stubs); virtual ObjectImage *createObjectImage(const MemoryBuffer *InputBuffer); virtual void handleObjectLoaded(ObjectImage *Obj); diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h index 2dea13f..c38ca69 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h @@ -14,60 +14,83 @@ #ifndef LLVM_RUNTIME_DYLD_IMPL_H #define LLVM_RUNTIME_DYLD_IMPL_H +#include "ObjectImage.h" #include "llvm/ExecutionEngine/RuntimeDyld.h" -#include "llvm/Object/ObjectFile.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/StringMap.h" -#include "llvm/ADT/Twine.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Support/Memory.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/system_error.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Object/ObjectFile.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/ADT/Triple.h" -#include <map> #include "llvm/Support/Format.h" -#include "ObjectImage.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/system_error.h" +#include <map> using namespace llvm; using namespace llvm::object; namespace llvm { +class MemoryBuffer; +class Twine; + + +/// SectionEntry - represents a section emitted into memory by the dynamic +/// linker. class SectionEntry { public: - uint8_t* Address; + /// Address - address in the linker's memory where the section resides. + uint8_t *Address; + + /// Size - section size. size_t Size; - uint64_t LoadAddress; // For each section, the address it will be - // considered to live at for relocations. 
The same - // as the pointer to the above memory block for - // hosted JITs. - uintptr_t StubOffset; // It's used for architecturies with stub - // functions for far relocations like ARM. - uintptr_t ObjAddress; // Section address in object file. It's use for - // calculate MachO relocation addend - SectionEntry(uint8_t* address, size_t size, uintptr_t stubOffset, + + /// LoadAddress - the address of the section in the target process's memory. + /// Used for situations in which JIT-ed code is being executed in the address + /// space of a separate process. If the code executes in the same address + /// space where it was JIT-ed, this just equals Address. + uint64_t LoadAddress; + + /// StubOffset - used for architectures with stub functions for far + /// relocations (like ARM). + uintptr_t StubOffset; + + /// ObjAddress - address of the section in the in-memory object file. Used + /// for calculating relocations in some object formats (like MachO). + uintptr_t ObjAddress; + + SectionEntry(uint8_t *address, size_t size, uintptr_t stubOffset, uintptr_t objAddress) : Address(address), Size(size), LoadAddress((uintptr_t)address), StubOffset(stubOffset), ObjAddress(objAddress) {} }; +/// RelocationEntry - used to represent relocations internally in the dynamic +/// linker. class RelocationEntry { public: - unsigned SectionID; // Section the relocation is contained in. - uintptr_t Offset; // Offset into the section for the relocation. - uint32_t Data; // Relocatino data. Including type of relocation - // and another flags and parameners from - intptr_t Addend; // Addend encoded in the instruction itself, if any, - // plus the offset into the source section for - // the symbol once the relocation is resolvable. - RelocationEntry(unsigned id, uint64_t offset, uint32_t data, int64_t addend) - : SectionID(id), Offset(offset), Data(data), Addend(addend) {} + /// SectionID - the section this relocation points to. + unsigned SectionID; + + /// Offset - offset into the section. + uintptr_t Offset; + + /// RelType - relocation type. + uint32_t RelType; + + /// Addend - the relocation addend encoded in the instruction itself. Also + /// used to make a relocation section relative instead of symbol relative. + intptr_t Addend; + + RelocationEntry(unsigned id, uint64_t offset, uint32_t type, int64_t addend) + : SectionID(id), Offset(offset), RelType(type), Addend(addend) {} }; -// Raw relocation data from object file +/// ObjRelocationInfo - relocation information as read from the object file. +/// Used to pass around data taken from object::RelocationRef, together with +/// the section to which the relocation points (represented by a SectionID). class ObjRelocationInfo { public: unsigned SectionID; @@ -97,7 +120,8 @@ protected: // The MemoryManager to load objects into. RTDyldMemoryManager *MemMgr; - // A list of emmitted sections. + // A list of all sections emitted by the dynamic linker. These sections are + // referenced in the code by means of their index in this list - SectionID. typedef SmallVector<SectionEntry, 64> SectionList; SectionList Sections; @@ -105,11 +129,11 @@ protected: // references it. typedef std::map<SectionRef, unsigned> ObjSectionToIDMap; - // Master symbol table. As modules are loaded and external symbols are - // resolved, their addresses are stored here as a SectionID/Offset pair. + // A global symbol table for symbols from all loaded modules. Maps the + // symbol name to a (SectionID, offset in section) pair. 
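// Illustrative entry (not from the patch): if loading a module placed
// "foo" at offset 0x40 of the section with SectionID 2, then
// GlobalSymbolTable["foo"] == SymbolLoc(2, 0x40), and getSymbolAddress("foo")
// below returns getSectionAddress(2) + 0x40.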
typedef std::pair<unsigned, uintptr_t> SymbolLoc; - StringMap<SymbolLoc> SymbolTable; - typedef DenseMap<const char*, SymbolLoc> LocalSymbolMap; + typedef StringMap<SymbolLoc> SymbolTableMap; + SymbolTableMap GlobalSymbolTable; // Keep a map of common symbols to their sizes typedef std::map<SymbolRef, unsigned> CommonSymbolMap; @@ -121,12 +145,14 @@ protected: // in the relocation list where it's stored. typedef SmallVector<RelocationEntry, 64> RelocationList; // Relocations to sections already loaded. Indexed by SectionID which is the - // source of the address. The target where the address will be writen is + // source of the address. The target where the address will be written is // SectionID/Offset in the relocation itself. DenseMap<unsigned, RelocationList> Relocations; - // Relocations to external symbols that are not yet resolved. - // Indexed by symbol name. - StringMap<RelocationList> SymbolRelocations; + + // Relocations to external symbols that are not yet resolved. Symbols are + // external when they aren't found in the global symbol table of all loaded + // modules. This map is indexed by symbol name. + StringMap<RelocationList> ExternalSymbolRelocations; typedef std::map<RelocationValueRef, uintptr_t> StubMap; @@ -153,16 +179,17 @@ protected: return (uint8_t*)Sections[SectionID].Address; } - /// \brief Emits a section containing common symbols. - /// \return SectionID. - unsigned emitCommonSymbols(ObjectImage &Obj, - const CommonSymbolMap &Map, - uint64_t TotalSize, - LocalSymbolMap &Symbols); + /// \brief Given the common symbols discovered in the object file, emit a + /// new section for them and update the symbol mappings in the object and + /// symbol table. + void emitCommonSymbols(ObjectImage &Obj, + const CommonSymbolMap &CommonSymbols, + uint64_t TotalSize, + SymbolTableMap &SymbolTable); /// \brief Emits section data from the object file to the MemoryManager. /// \param IsCode if it's true then allocateCodeSection() will be - /// used for emmits, else allocateDataSection() will be used. + /// used for emits, else allocateDataSection() will be used. /// \return SectionID. unsigned emitSection(ObjectImage &Obj, const SectionRef &Section, @@ -178,10 +205,12 @@ protected: bool IsCode, ObjSectionToIDMap &LocalSections); - /// \brief If Value.SymbolName is NULL then store relocation to the - /// Relocations, else store it in the SymbolRelocations. - void AddRelocation(const RelocationValueRef &Value, unsigned SectionID, - uintptr_t Offset, uint32_t RelType); + // \brief Add a relocation entry that uses the given section. + void addRelocationForSection(const RelocationEntry &RE, unsigned SectionID); + + // \brief Add a relocation entry that uses the given symbol. This symbol may + // be found in the global symbol table, or it may be external. + void addRelocationForSymbol(const RelocationEntry &RE, StringRef SymbolName); /// \brief Emits long jump instruction to Addr. /// \return Pointer to the memory area for emitting target address. @@ -203,14 +232,16 @@ protected: uint32_t Type, int64_t Addend) = 0; - /// \brief Parses the object file relocation and store it to Relocations - /// or SymbolRelocations. Its depend from object file type. + /// \brief Parses the object file relocation and stores it to Relocations + /// or SymbolRelocations (this depends on the object file type). 
virtual void processRelocationRef(const ObjRelocationInfo &Rel, ObjectImage &Obj, ObjSectionToIDMap &ObjSectionToID, - LocalSymbolMap &Symbols, StubMap &Stubs) = 0; + const SymbolTableMap &Symbols, + StubMap &Stubs) = 0; - void resolveSymbols(); + /// \brief Resolve relocations to external symbols. + void resolveExternalSymbols(); virtual ObjectImage *createObjectImage(const MemoryBuffer *InputBuffer); virtual void handleObjectLoaded(ObjectImage *Obj) { @@ -228,9 +259,9 @@ public: void *getSymbolAddress(StringRef Name) { // FIXME: Just look up as a function for now. Overly simple of course. // Work in progress. - if (SymbolTable.find(Name) == SymbolTable.end()) + if (GlobalSymbolTable.find(Name) == GlobalSymbolTable.end()) return 0; - SymbolLoc Loc = SymbolTable.lookup(Name); + SymbolLoc Loc = GlobalSymbolTable.lookup(Name); return getSectionAddress(Loc.first) + Loc.second; } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp index b7f515d..0e3a9d4 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp @@ -30,7 +30,8 @@ void RuntimeDyldMachO::resolveRelocation(uint8_t *LocalAddress, unsigned MachoType = (Type >> 28) & 0xf; unsigned Size = 1 << ((Type >> 25) & 3); - DEBUG(dbgs() << "resolveRelocation LocalAddress: " << format("%p", LocalAddress) + DEBUG(dbgs() << "resolveRelocation LocalAddress: " + << format("%p", LocalAddress) << " FinalAddress: " << format("%p", FinalAddress) << " Value: " << format("%p", Value) << " Addend: " << Addend @@ -53,12 +54,12 @@ void RuntimeDyldMachO::resolveRelocation(uint8_t *LocalAddress, break; case Triple::x86: resolveI386Relocation(LocalAddress, - FinalAddress, - (uintptr_t)Value, - isPCRel, - Type, - Size, - Addend); + FinalAddress, + (uintptr_t)Value, + isPCRel, + Type, + Size, + Addend); break; case Triple::arm: // Fall through. case Triple::thumb: @@ -73,14 +74,13 @@ void RuntimeDyldMachO::resolveRelocation(uint8_t *LocalAddress, } } -bool RuntimeDyldMachO:: -resolveI386Relocation(uint8_t *LocalAddress, - uint64_t FinalAddress, - uint64_t Value, - bool isPCRel, - unsigned Type, - unsigned Size, - int64_t Addend) { +bool RuntimeDyldMachO::resolveI386Relocation(uint8_t *LocalAddress, + uint64_t FinalAddress, + uint64_t Value, + bool isPCRel, + unsigned Type, + unsigned Size, + int64_t Addend) { if (isPCRel) Value -= FinalAddress + 4; // see resolveX86_64Relocation @@ -102,14 +102,13 @@ resolveI386Relocation(uint8_t *LocalAddress, } } -bool RuntimeDyldMachO:: -resolveX86_64Relocation(uint8_t *LocalAddress, - uint64_t FinalAddress, - uint64_t Value, - bool isPCRel, - unsigned Type, - unsigned Size, - int64_t Addend) { +bool RuntimeDyldMachO::resolveX86_64Relocation(uint8_t *LocalAddress, + uint64_t FinalAddress, + uint64_t Value, + bool isPCRel, + unsigned Type, + unsigned Size, + int64_t Addend) { // If the relocation is PC-relative, the value to be encoded is the // pointer difference. 
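// Worked instance (illustrative): for a 4-byte PC-relative fixup at
// FinalAddress 0x1000 targeting Value 0x2000, the encoded delta is
// 0x2000 - (0x1000 + 4) = 0xffc, i.e. relative to the end of the fixup,
// matching the "Value -= FinalAddress + 4" in the i386 path above.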
if (isPCRel) @@ -144,14 +143,13 @@ resolveX86_64Relocation(uint8_t *LocalAddress, } } -bool RuntimeDyldMachO:: -resolveARMRelocation(uint8_t *LocalAddress, - uint64_t FinalAddress, - uint64_t Value, - bool isPCRel, - unsigned Type, - unsigned Size, - int64_t Addend) { +bool RuntimeDyldMachO::resolveARMRelocation(uint8_t *LocalAddress, + uint64_t FinalAddress, + uint64_t Value, + bool isPCRel, + unsigned Type, + unsigned Size, + int64_t Addend) { // If the relocation is PC-relative, the value to be encoded is the // pointer difference. if (isPCRel) { @@ -207,7 +205,7 @@ resolveARMRelocation(uint8_t *LocalAddress, void RuntimeDyldMachO::processRelocationRef(const ObjRelocationInfo &Rel, ObjectImage &Obj, ObjSectionToIDMap &ObjSectionToID, - LocalSymbolMap &Symbols, + const SymbolTableMap &Symbols, StubMap &Stubs) { uint32_t RelType = (uint32_t) (Rel.Type & 0xffffffffL); @@ -217,18 +215,19 @@ void RuntimeDyldMachO::processRelocationRef(const ObjRelocationInfo &Rel, bool isExtern = (RelType >> 27) & 1; if (isExtern) { + // Obtain the symbol name which is referenced in the relocation StringRef TargetName; const SymbolRef &Symbol = Rel.Symbol; Symbol.getName(TargetName); - // First look the symbol in object file symbols. - LocalSymbolMap::iterator lsi = Symbols.find(TargetName.data()); + // First search for the symbol in the local symbol table + SymbolTableMap::const_iterator lsi = Symbols.find(TargetName.data()); if (lsi != Symbols.end()) { Value.SectionID = lsi->second.first; Value.Addend = lsi->second.second; } else { - // Second look the symbol in global symbol table. - StringMap<SymbolLoc>::iterator gsi = SymbolTable.find(TargetName.data()); - if (gsi != SymbolTable.end()) { + // Search for the symbol in the global symbol table + SymbolTableMap::const_iterator gsi = GlobalSymbolTable.find(TargetName.data()); + if (gsi != GlobalSymbolTable.end()) { Value.SectionID = gsi->second.first; Value.Addend = gsi->second.second; } else @@ -249,8 +248,8 @@ void RuntimeDyldMachO::processRelocationRef(const ObjRelocationInfo &Rel, Value.SectionID = findOrEmitSection(Obj, *si, true, ObjSectionToID); Value.Addend = *(const intptr_t *)Target; if (Value.Addend) { - // The MachO addend is offset from the current section, we need set it - // as offset from destination section + // The MachO addend is an offset from the current section. 
We need it + // to be an offset from the destination section Value.Addend += Section.ObjAddress - Sections[Value.SectionID].ObjAddress; } } @@ -269,19 +268,29 @@ void RuntimeDyldMachO::processRelocationRef(const ObjRelocationInfo &Rel, Stubs[Value] = Section.StubOffset; uint8_t *StubTargetAddr = createStubFunction(Section.Address + Section.StubOffset); - AddRelocation(Value, Rel.SectionID, StubTargetAddr - Section.Address, - macho::RIT_Vanilla); + RelocationEntry RE(Rel.SectionID, StubTargetAddr - Section.Address, + macho::RIT_Vanilla, Value.Addend); + if (Value.SymbolName) + addRelocationForSymbol(RE, Value.SymbolName); + else + addRelocationForSection(RE, Value.SectionID); resolveRelocation(Target, (uint64_t)Target, (uint64_t)Section.Address + Section.StubOffset, RelType, 0); Section.StubOffset += getMaxStubSize(); } - } else - AddRelocation(Value, Rel.SectionID, Rel.Offset, RelType); + } else { + RelocationEntry RE(Rel.SectionID, Rel.Offset, RelType, Value.Addend); + if (Value.SymbolName) + addRelocationForSymbol(RE, Value.SymbolName); + else + addRelocationForSection(RE, Value.SectionID); + } } -bool RuntimeDyldMachO::isCompatibleFormat(const MemoryBuffer *InputBuffer) const { +bool RuntimeDyldMachO::isCompatibleFormat( + const MemoryBuffer *InputBuffer) const { StringRef Magic = InputBuffer->getBuffer().slice(0, 4); if (Magic == "\xFE\xED\xFA\xCE") return true; if (Magic == "\xCE\xFA\xED\xFE") return true; diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h index 418d130..707664c 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h @@ -51,7 +51,8 @@ protected: virtual void processRelocationRef(const ObjRelocationInfo &Rel, ObjectImage &Obj, ObjSectionToIDMap &ObjSectionToID, - LocalSymbolMap &Symbols, StubMap &Stubs); + const SymbolTableMap &Symbols, + StubMap &Stubs); public: virtual void resolveRelocation(uint8_t *LocalAddress, diff --git a/lib/ExecutionEngine/TargetSelect.cpp b/lib/ExecutionEngine/TargetSelect.cpp index 42364f9..7cdd669 100644 --- a/lib/ExecutionEngine/TargetSelect.cpp +++ b/lib/ExecutionEngine/TargetSelect.cpp @@ -26,11 +26,7 @@ using namespace llvm; TargetMachine *EngineBuilder::selectTarget() { - StringRef MArch = ""; - StringRef MCPU = ""; - SmallVector<std::string, 1> MAttrs; - Triple TT(M->getTargetTriple()); - + Triple TT(LLVM_HOSTTRIPLE); return selectTarget(TT, MArch, MCPU, MAttrs); } @@ -56,8 +52,9 @@ TargetMachine *EngineBuilder::selectTarget(const Triple &TargetTriple, } if (!TheTarget) { - *ErrorStr = "No available targets are compatible with this -march, " - "see -version for the available targets.\n"; + if (ErrorStr) + *ErrorStr = "No available targets are compatible with this -march, " + "see -version for the available targets.\n"; return 0; } diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index 765fcc8..afba2e8 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -25,6 +25,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm-c/Linker.h" #include <cctype> using namespace llvm; @@ -595,12 +596,12 @@ void ModuleLinker::computeTypeMapping() { // example. When the source module got loaded into the same LLVMContext, if // it had the same type, it would have been renamed to "%foo.42 = { i32 }". 
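// Usage sketch for the C linker API added at the bottom of this file's
// diff (hypothetical client code, not part of the patch):
//   char *msg = 0;
//   LLVMBool failed = LLVMLinkModules(Dest, Src, LLVMLinkerDestroySource,
//                                     &msg);
//   if (failed)
//     fprintf(stderr, "link failed: %s\n", msg);
//   free(msg); // LLVMLinkModules strdup()s a message when OutMessages != 0.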
std::vector<StructType*> SrcStructTypes; - SrcM->findUsedStructTypes(SrcStructTypes); + SrcM->findUsedStructTypes(SrcStructTypes, true); SmallPtrSet<StructType*, 32> SrcStructTypesSet(SrcStructTypes.begin(), SrcStructTypes.end()); std::vector<StructType*> DstStructTypes; - DstM->findUsedStructTypes(DstStructTypes); + DstM->findUsedStructTypes(DstStructTypes, true); SmallPtrSet<StructType*, 32> DstStructTypesSet(DstStructTypes.begin(), DstStructTypes.end()); @@ -683,7 +684,7 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, GlobalVariable *NG = new GlobalVariable(*DstGV->getParent(), NewType, SrcGV->isConstant(), DstGV->getLinkage(), /*init*/0, /*name*/"", DstGV, - DstGV->isThreadLocal(), + DstGV->getThreadLocalMode(), DstGV->getType()->getAddressSpace()); // Propagate alignment, visibility and section info. @@ -758,7 +759,7 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) { new GlobalVariable(*DstM, TypeMap.get(SGV->getType()->getElementType()), SGV->isConstant(), SGV->getLinkage(), /*init*/0, SGV->getName(), /*insertbefore*/0, - SGV->isThreadLocal(), + SGV->getThreadLocalMode(), SGV->getType()->getAddressSpace()); // Propagate alignment, visibility and section info. copyGVAttributes(NewDGV, SGV); @@ -1335,3 +1336,17 @@ bool Linker::LinkModules(Module *Dest, Module *Src, unsigned Mode, return false; } + +//===----------------------------------------------------------------------===// +// C API. +//===----------------------------------------------------------------------===// + +LLVMBool LLVMLinkModules(LLVMModuleRef Dest, LLVMModuleRef Src, + LLVMLinkerMode Mode, char **OutMessages) { + std::string Messages; + LLVMBool Result = Linker::LinkModules(unwrap(Dest), unwrap(Src), + Mode, OutMessages? &Messages : 0); + if (OutMessages) + *OutMessages = strdup(Messages.c_str()); + return Result; +} diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp index 9fc33b6..7203b9a 100644 --- a/lib/MC/ELFObjectWriter.cpp +++ b/lib/MC/ELFObjectWriter.cpp @@ -627,7 +627,7 @@ void ELFObjectWriter::WriteSymbolTable(MCDataFragment *SymtabF, const MCSymbol *ELFObjectWriter::SymbolToReloc(const MCAssembler &Asm, const MCValue &Target, - const MCFragment &F, + const MCFragment &F, const MCFixup &Fixup, bool IsPCRel) const { const MCSymbol &Symbol = Target.getSymA()->getSymbol(); @@ -1061,11 +1061,19 @@ void ELFObjectWriter::WriteRelocationsFragment(const MCAssembler &Asm, entry.Index += LocalSymbolData.size(); if (is64Bit()) { String64(*F, entry.r_offset); + if (TargetObjectWriter->isN64()) { + String32(*F, entry.Index); - struct ELF::Elf64_Rela ERE64; - ERE64.setSymbolAndType(entry.Index, entry.Type); - String64(*F, ERE64.r_info); - + String8(*F, TargetObjectWriter->getRSsym(entry.Type)); + String8(*F, TargetObjectWriter->getRType3(entry.Type)); + String8(*F, TargetObjectWriter->getRType2(entry.Type)); + String8(*F, TargetObjectWriter->getRType(entry.Type)); + } + else { + struct ELF::Elf64_Rela ERE64; + ERE64.setSymbolAndType(entry.Index, entry.Type); + String64(*F, ERE64.r_info); + } if (hasRelocationAddend()) String64(*F, entry.r_addend); } else { diff --git a/lib/MC/MCAsmBackend.cpp b/lib/MC/MCAsmBackend.cpp index 0b2e4ae..2e447b0 100644 --- a/lib/MC/MCAsmBackend.cpp +++ b/lib/MC/MCAsmBackend.cpp @@ -39,7 +39,7 @@ MCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { { "FK_SecRel_4", 0, 32, 0 }, { "FK_SecRel_8", 0, 64, 0 } }; - + assert((size_t)Kind <= sizeof(Builtins) / sizeof(Builtins[0]) && "Unknown fixup kind"); return Builtins[Kind]; diff --git 
a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp index 8286c1d..8da2e0e 100644 --- a/lib/MC/MCAsmInfo.cpp +++ b/lib/MC/MCAsmInfo.cpp @@ -50,6 +50,7 @@ MCAsmInfo::MCAsmInfo() { AllowNameToStartWithDigit = false; AllowPeriodsInName = true; AllowUTF8 = true; + UseDataRegionDirectives = false; ZeroDirective = "\t.zero\t"; AsciiDirective = "\t.ascii\t"; AscizDirective = "\t.asciz\t"; @@ -57,12 +58,6 @@ MCAsmInfo::MCAsmInfo() { Data16bitsDirective = "\t.short\t"; Data32bitsDirective = "\t.long\t"; Data64bitsDirective = "\t.quad\t"; - DataBegin = "$d."; - CodeBegin = "$a."; - JT8Begin = "$d."; - JT16Begin = "$d."; - JT32Begin = "$d."; - SupportsDataRegions = false; SunStyleELFSectionSwitchSyntax = false; UsesELFSectionDirectiveForBSS = false; AlignDirective = "\t.align\t"; @@ -89,14 +84,10 @@ MCAsmInfo::MCAsmInfo() { SupportsDebugInformation = false; ExceptionsType = ExceptionHandling::None; DwarfUsesInlineInfoSection = false; - DwarfRequiresRelocationForSectionOffset = true; DwarfSectionOffsetDirective = 0; - DwarfUsesLabelOffsetForRanges = true; - DwarfUsesRelocationsForStringPool = true; + DwarfUsesRelocationsAcrossSections = true; DwarfRegNumForCFI = false; HasMicrosoftFastStdCallMangling = false; - - AsmTransCBE = 0; } MCAsmInfo::~MCAsmInfo() { diff --git a/lib/MC/MCAsmInfoCOFF.cpp b/lib/MC/MCAsmInfoCOFF.cpp index 881d992..678e75a 100644 --- a/lib/MC/MCAsmInfoCOFF.cpp +++ b/lib/MC/MCAsmInfoCOFF.cpp @@ -26,7 +26,7 @@ MCAsmInfoCOFF::MCAsmInfoCOFF() { PrivateGlobalPrefix = "L"; // Prefix for private global symbols WeakRefDirective = "\t.weak\t"; LinkOnceDirective = "\t.linkonce discard\n"; - + // Doesn't support visibility: HiddenVisibilityAttr = HiddenDeclarationVisibilityAttr = MCSA_Invalid; ProtectedVisibilityAttr = MCSA_Invalid; @@ -36,8 +36,6 @@ MCAsmInfoCOFF::MCAsmInfoCOFF() { SupportsDebugInformation = true; DwarfSectionOffsetDirective = "\t.secrel32\t"; HasMicrosoftFastStdCallMangling = true; - - SupportsDataRegions = false; } void MCAsmInfoMicrosoft::anchor() { } diff --git a/lib/MC/MCAsmInfoDarwin.cpp b/lib/MC/MCAsmInfoDarwin.cpp index c1e2635..8e0ac23 100644 --- a/lib/MC/MCAsmInfoDarwin.cpp +++ b/lib/MC/MCAsmInfoDarwin.cpp @@ -18,7 +18,7 @@ #include "llvm/MC/MCStreamer.h" using namespace llvm; -void MCAsmInfoDarwin::anchor() { } +void MCAsmInfoDarwin::anchor() { } MCAsmInfoDarwin::MCAsmInfoDarwin() { // Common settings for all Darwin targets. @@ -43,13 +43,6 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() { HasMachoTBSSDirective = true; // Uses .tbss HasStaticCtorDtorReferenceInStaticMode = true; - CodeBegin = "L$start$code$"; - DataBegin = "L$start$data$"; - JT8Begin = "L$start$jt8$"; - JT16Begin = "L$start$jt16$"; - JT32Begin = "L$start$jt32$"; - SupportsDataRegions = true; - // FIXME: Darwin 10 and newer don't need this. LinkerRequiresNonEmptyDwarfLines = true; @@ -61,12 +54,10 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() { // Doesn't support protected visibility. 
ProtectedVisibilityAttr = MCSA_Invalid; - + HasDotTypeDotSizeDirective = false; HasNoDeadStrip = true; HasSymbolResolver = true; - DwarfRequiresRelocationForSectionOffset = false; - DwarfUsesLabelOffsetForRanges = false; - DwarfUsesRelocationsForStringPool = false; + DwarfUsesRelocationsAcrossSections = false; } diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp index 11f0f72..373df4b 100644 --- a/lib/MC/MCAsmStreamer.cpp +++ b/lib/MC/MCAsmStreamer.cpp @@ -138,6 +138,7 @@ public: virtual void EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol); virtual void EmitAssemblerFlag(MCAssemblerFlag Flag); + virtual void EmitDataRegion(MCDataRegionType Kind); virtual void EmitThumbFunc(MCSymbol *Func); virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value); @@ -170,7 +171,7 @@ public: unsigned ByteAlignment); virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0, - unsigned Size = 0, unsigned ByteAlignment = 0); + uint64_t Size = 0, unsigned ByteAlignment = 0); virtual void EmitTBSSSymbol (const MCSection *Section, MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment = 0); @@ -352,6 +353,21 @@ void MCAsmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { EmitEOL(); } +void MCAsmStreamer::EmitDataRegion(MCDataRegionType Kind) { + MCContext &Ctx = getContext(); + const MCAsmInfo &MAI = Ctx.getAsmInfo(); + if (!MAI.doesSupportDataRegionDirectives()) + return; + switch (Kind) { + case MCDR_DataRegion: OS << "\t.data_region"; break; + case MCDR_DataRegionJT8: OS << "\t.data_region jt8"; break; + case MCDR_DataRegionJT16: OS << "\t.data_region jt16"; break; + case MCDR_DataRegionJT32: OS << "\t.data_region jt32"; break; + case MCDR_DataRegionEnd: OS << "\t.end_data_region"; break; + } + EmitEOL(); +} + void MCAsmStreamer::EmitThumbFunc(MCSymbol *Func) { // This needs to emit to a temporary string to get properly quoted // MCSymbols when they have spaces in them. @@ -513,7 +529,7 @@ void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, } void MCAsmStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol, - unsigned Size, unsigned ByteAlignment) { + uint64_t Size, unsigned ByteAlignment) { // Note: a .zerofill directive does not switch sections. OS << ".zerofill "; @@ -826,7 +842,7 @@ void MCAsmStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line, if (IsVerboseAsm) { OS.PadToColumn(MAI.getCommentColumn()); - OS << MAI.getCommentString() << ' ' << FileName << ':' + OS << MAI.getCommentString() << ' ' << FileName << ':' << Line << ':' << Column; } EmitEOL(); @@ -1009,7 +1025,7 @@ void MCAsmStreamer::EmitCFISignalFrame() { if (!UseCFI) return; - OS << "\t.cif_signal_frame"; + OS << "\t.cfi_signal_frame"; EmitEOL(); } diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp index bb67868..0aa0c98 100644 --- a/lib/MC/MCAssembler.cpp +++ b/lib/MC/MCAssembler.cpp @@ -409,7 +409,7 @@ static void WriteFragmentData(const MCAssembler &Asm, const MCAsmLayout &Layout, // See if we are aligning with nops, and if so do that first to try to fill // the Count bytes. Then if that did not fill any bytes or there are any - // bytes left to fill use the the Value and ValueSize to fill the rest. + // bytes left to fill use the Value and ValueSize to fill the rest. // If we are aligning with nops, ask that target to emit the right data. 
if (AF.hasEmitNops()) { if (!Asm.getBackend().writeNopData(Count, OW)) diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp index d3c4fb1..b5b14b9 100644 --- a/lib/MC/MCContext.cpp +++ b/lib/MC/MCContext.cpp @@ -274,11 +274,11 @@ unsigned MCContext::GetDwarfFile(StringRef Directory, StringRef FileName, if (Directory.empty()) { // Separate the directory part from the basename of the FileName. - std::pair<StringRef, StringRef> Slash = FileName.rsplit('/'); - Directory = Slash.second; - if (!Directory.empty()) { - Directory = Slash.first; - FileName = Slash.second; + StringRef tFileName = sys::path::filename(FileName); + if (!tFileName.empty()) { + Directory = sys::path::parent_path(FileName); + if (!Directory.empty()) + FileName = tFileName; } } diff --git a/lib/MC/MCDisassembler/Disassembler.h b/lib/MC/MCDisassembler/Disassembler.h index 880a31a..322abd5 100644 --- a/lib/MC/MCDisassembler/Disassembler.h +++ b/lib/MC/MCDisassembler/Disassembler.h @@ -99,6 +99,14 @@ public: DisAsm.reset(disAsm); IP.reset(iP); } + const std::string &getTripleName() const { return TripleName; } + void *getDisInfo() const { return DisInfo; } + int getTagType() const { return TagType; } + LLVMOpInfoCallback getGetOpInfo() const { return GetOpInfo; } + LLVMSymbolLookupCallback getSymbolLookupCallback() const { + return SymbolLookUp; + } + const Target *getTarget() const { return TheTarget; } const MCDisassembler *getDisAsm() const { return DisAsm.get(); } const MCAsmInfo *getAsmInfo() const { return MAI.get(); } MCInstPrinter *getIP() { return IP.get(); } diff --git a/lib/MC/MCDisassembler/EDDisassembler.cpp b/lib/MC/MCDisassembler/EDDisassembler.cpp index b2672ca..1226f1a 100644 --- a/lib/MC/MCDisassembler/EDDisassembler.cpp +++ b/lib/MC/MCDisassembler/EDDisassembler.cpp @@ -44,7 +44,7 @@ struct TripleMap { const char *String; }; -static struct TripleMap triplemap[] = { +static const struct TripleMap triplemap[] = { { Triple::x86, "i386-unknown-unknown" }, { Triple::x86_64, "x86_64-unknown-unknown" }, { Triple::arm, "arm-unknown-unknown" }, @@ -256,7 +256,7 @@ void EDDisassembler::initMaps(const MCRegisterInfo ®isterInfo) { unsigned registerIndex; for (registerIndex = 0; registerIndex < numRegisters; ++registerIndex) { - const char* registerName = registerInfo.get(registerIndex).Name; + const char* registerName = registerInfo.getName(registerIndex); RegVec.push_back(registerName); RegRMap[registerName] = registerIndex; diff --git a/lib/MC/MCDisassembler/EDMain.cpp b/lib/MC/MCDisassembler/EDMain.cpp index c658717..5c065db 100644 --- a/lib/MC/MCDisassembler/EDMain.cpp +++ b/lib/MC/MCDisassembler/EDMain.cpp @@ -4,7 +4,7 @@ // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // // This file implements the enhanced disassembler's public C API. 
@@ -34,9 +34,9 @@ int EDGetDisassembler(EDDisassemblerRef *disassembler, Syntax = EDDisassembler::kEDAssemblySyntaxARMUAL; break; } - + EDDisassemblerRef ret = EDDisassembler::getDisassembler(triple, Syntax); - + if (!ret) return -1; *disassembler = ret; @@ -70,18 +70,18 @@ unsigned int EDCreateInsts(EDInstRef *insts, uint64_t address, void *arg) { unsigned int index; - + for (index = 0; index < count; ++index) { EDInst *inst = ((EDDisassembler*)disassembler)->createInst(byteReader, address, arg); - + if (!inst) return index; - + insts[index] = inst; address += inst->byteSize(); } - + return count; } @@ -165,14 +165,14 @@ int EDTokenIsRegister(EDTokenRef token) { int EDTokenIsNegativeLiteral(EDTokenRef token) { if (((EDToken*)token)->type() != EDToken::kTokenLiteral) return -1; - + return ((EDToken*)token)->literalSign(); } int EDLiteralTokenAbsoluteValue(uint64_t *value, EDTokenRef token) { if (((EDToken*)token)->type() != EDToken::kTokenLiteral) return -1; - + return ((EDToken*)token)->literalAbsoluteValue(*value); } @@ -180,7 +180,7 @@ int EDRegisterTokenValue(unsigned *registerID, EDTokenRef token) { if (((EDToken*)token)->type() != EDToken::kTokenRegister) return -1; - + return ((EDToken*)token)->registerID(*registerID); } @@ -231,7 +231,7 @@ struct ByteReaderWrapper { EDByteBlock_t byteBlock; }; -static int readerWrapperCallback(uint8_t *byte, +static int readerWrapperCallback(uint8_t *byte, uint64_t address, void *arg) { struct ByteReaderWrapper *wrapper = (struct ByteReaderWrapper *)arg; @@ -245,13 +245,9 @@ unsigned int EDBlockCreateInsts(EDInstRef *insts, uint64_t address) { struct ByteReaderWrapper wrapper; wrapper.byteBlock = byteBlock; - - return EDCreateInsts(insts, - count, - disassembler, - readerWrapperCallback, - address, - (void*)&wrapper); + + return EDCreateInsts(insts, count, disassembler, readerWrapperCallback, + address, (void*)&wrapper); } int EDBlockEvaluateOperand(uint64_t *result, EDOperandRef operand, diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp index 84a34f1..75eaf80 100644 --- a/lib/MC/MCDwarf.cpp +++ b/lib/MC/MCDwarf.cpp @@ -36,7 +36,7 @@ using namespace llvm; // First special line opcode - leave room for the standard opcodes. // Note: If you want to change this, you'll have to update the -// "standard_opcode_lengths" table that is emitted in DwarfFileTable::Emit(). +// "standard_opcode_lengths" table that is emitted in DwarfFileTable::Emit(). #define DWARF2_LINE_OPCODE_BASE 13 // Minimum line offset in a special line info. opcode. This value @@ -105,7 +105,7 @@ void MCLineEntry::Make(MCStreamer *MCOS, const MCSection *Section) { // // This helper routine returns an expression of End - Start + IntVal . -// +// static inline const MCExpr *MakeStartMinusEndExpr(const MCStreamer &MCOS, const MCSymbol &Start, const MCSymbol &End, @@ -198,7 +198,7 @@ static inline void EmitDwarfLineTable(MCStreamer *MCOS, // Set the value of the symbol, as we are at the end of the section. MCOS->EmitLabel(SectionEnd); - // Switch back the the dwarf line section. + // Switch back the dwarf line section. MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfLineSection()); const MCAsmInfo &asmInfo = MCOS->getContext().getAsmInfo(); @@ -310,7 +310,7 @@ const MCSymbol *MCDwarfFileTable::Emit(MCStreamer *MCOS) { if (MCOS->getContext().getAsmInfo().getLinkerRequiresNonEmptyDwarfLines() && MCLineSectionOrder.begin() == MCLineSectionOrder.end()) { // The darwin9 linker has a bug (see PR8715). 
For 32-bit architectures - // it requires: + // it requires: // total_length >= prologue_length + 10 // We are 4 bytes short, since we have total_length = 51 and // prologue_length = 45 @@ -354,7 +354,7 @@ void MCDwarfLineAddr::Encode(int64_t LineDelta, uint64_t AddrDelta, AddrDelta = ScaleAddrDelta(AddrDelta); // A LineDelta of INT64_MAX is a signal that this is actually a - // DW_LNE_end_sequence. We cannot use special opcodes here, since we want the + // DW_LNE_end_sequence. We cannot use special opcodes here, since we want the // end_sequence to emit the matrix entry. if (LineDelta == INT64_MAX) { if (AddrDelta == MAX_SPECIAL_ADDR_DELTA) @@ -552,7 +552,7 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS, const MCSymbol *LineSectionSymbol) { MCContext &context = MCOS->getContext(); - MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfInfoSection()); + MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfInfoSection()); // Create a symbol at the start and end of this section used in here for the // expression to calculate the length in the header. @@ -705,7 +705,7 @@ void MCGenDwarfInfo::Emit(MCStreamer *MCOS, const MCSymbol *LineSectionSymbol) { MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfInfoSection()); MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfAbbrevSection()); MCSymbol *AbbrevSectionSymbol; - if (AsmInfo.doesDwarfRequireRelocationForSectionOffset()) { + if (AsmInfo.doesDwarfUseRelocationsAcrossSections()) { AbbrevSectionSymbol = context.CreateTempSymbol(); MCOS->EmitLabel(AbbrevSectionSymbol); } else { @@ -766,7 +766,7 @@ void MCGenDwarfLabelEntry::Make(MCSymbol *Symbol, MCStreamer *MCOS, MCOS->EmitLabel(Label); // Create an entry for the info and add it to the other entries. - MCGenDwarfLabelEntry *Entry = + MCGenDwarfLabelEntry *Entry = new MCGenDwarfLabelEntry(Name, FileNumber, LineNumber, Label); MCOS->getContext().addMCGenDwarfLabelEntry(Entry); } @@ -1285,7 +1285,7 @@ MCSymbol *FrameEmitterImpl::EmitFDE(MCStreamer &streamer, 0); if (verboseAsm) streamer.AddComment("FDE CIE Offset"); streamer.EmitAbsValue(offset, 4); - } else if (!asmInfo.doesDwarfRequireRelocationForSectionOffset()) { + } else if (!asmInfo.doesDwarfUseRelocationsAcrossSections()) { const MCExpr *offset = MakeStartMinusEndExpr(streamer, *SectionStart, cieStart, 0); streamer.EmitAbsValue(offset, 4); diff --git a/lib/MC/MCELFObjectTargetWriter.cpp b/lib/MC/MCELFObjectTargetWriter.cpp index 171ab4d..6eb6914 100644 --- a/lib/MC/MCELFObjectTargetWriter.cpp +++ b/lib/MC/MCELFObjectTargetWriter.cpp @@ -15,9 +15,11 @@ using namespace llvm; MCELFObjectTargetWriter::MCELFObjectTargetWriter(bool Is64Bit_, uint8_t OSABI_, uint16_t EMachine_, - bool HasRelocationAddend_) + bool HasRelocationAddend_, + bool IsN64_) : OSABI(OSABI_), EMachine(EMachine_), - HasRelocationAddend(HasRelocationAddend_), Is64Bit(Is64Bit_) { + HasRelocationAddend(HasRelocationAddend_), Is64Bit(Is64Bit_), + IsN64(IsN64_){ } /// Default e_flags = 0 diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp index 6c4d0e3..2d342dc 100644 --- a/lib/MC/MCELFStreamer.cpp +++ b/lib/MC/MCELFStreamer.cpp @@ -13,6 +13,8 @@ #include "MCELF.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Twine.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" @@ -89,7 +91,7 @@ public: unsigned ByteAlignment); virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0, - unsigned Size = 0, unsigned ByteAlignment = 0) { +
uint64_t Size = 0, unsigned ByteAlignment = 0) { llvm_unreachable("ELF doesn't support this directive"); } virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp index 7880155..0eb7fcc 100644 --- a/lib/MC/MCExpr.cpp +++ b/lib/MC/MCExpr.cpp @@ -202,6 +202,8 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { case VK_PPC_DARWIN_LO16: return "lo16"; case VK_PPC_GAS_HA16: return "ha"; case VK_PPC_GAS_LO16: return "l"; + case VK_PPC_TPREL16_HA: return "tprel@ha"; + case VK_PPC_TPREL16_LO: return "tprel@l"; case VK_Mips_GPREL: return "GPREL"; case VK_Mips_GOT_CALL: return "GOT_CALL"; case VK_Mips_GOT16: return "GOT16"; @@ -220,6 +222,8 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { case VK_Mips_GOT_DISP: return "GOT_DISP"; case VK_Mips_GOT_PAGE: return "GOT_PAGE"; case VK_Mips_GOT_OFST: return "GOT_OFST"; + case VK_Mips_HIGHER: return "HIGHER"; + case VK_Mips_HIGHEST: return "HIGHEST"; } llvm_unreachable("Invalid variant kind"); } diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp index bc6cf77..b75fe2c 100644 --- a/lib/MC/MCMachOStreamer.cpp +++ b/lib/MC/MCMachOStreamer.cpp @@ -1,4 +1,3 @@ -//===- lib/MC/MCMachOStreamer.cpp - Mach-O Object Output ------------===// // // The LLVM Compiler Infrastructure // @@ -33,6 +32,8 @@ class MCMachOStreamer : public MCObjectStreamer { private: virtual void EmitInstToData(const MCInst &Inst); + void EmitDataRegion(DataRegionData::KindTy Kind); + void EmitDataRegionEnd(); public: MCMachOStreamer(MCContext &Context, MCAsmBackend &MAB, raw_ostream &OS, MCCodeEmitter *Emitter) @@ -46,6 +47,7 @@ public: virtual void EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol); virtual void EmitAssemblerFlag(MCAssemblerFlag Flag); + virtual void EmitDataRegion(MCDataRegionType Kind); virtual void EmitThumbFunc(MCSymbol *Func); virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value); virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute); @@ -72,7 +74,7 @@ public: llvm_unreachable("macho doesn't support this directive"); } virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0, - unsigned Size = 0, unsigned ByteAlignment = 0); + uint64_t Size = 0, unsigned ByteAlignment = 0); virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment = 0); virtual void EmitBytes(StringRef Data, unsigned AddrSpace); @@ -138,6 +140,26 @@ void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) { SD.setFlags(SD.getFlags() & ~SF_ReferenceTypeMask); } +void MCMachOStreamer::EmitDataRegion(DataRegionData::KindTy Kind) { + // Create a temporary label to mark the start of the data region. + MCSymbol *Start = getContext().CreateTempSymbol(); + EmitLabel(Start); + // Record the region for the object writer to use. + DataRegionData Data = { Kind, Start, NULL }; + std::vector<DataRegionData> &Regions = getAssembler().getDataRegions(); + Regions.push_back(Data); +} + +void MCMachOStreamer::EmitDataRegionEnd() { + std::vector<DataRegionData> &Regions = getAssembler().getDataRegions(); + assert(Regions.size() && "Mismatched .end_data_region!"); + DataRegionData &Data = Regions.back(); + assert(Data.End == NULL && "Mismatched .end_data_region!"); + // Create a temporary label to mark the end of the data region. 
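// The Start label recorded by EmitDataRegion above and the End label created
// here become one data-in-code entry in the object file. For reference, a
// minimal sketch of the directive usage that drives these callbacks
// (hypothetical labels, an assumption, not taken from this patch):
//
//   .data_region jt32        <-- EmitDataRegion(MCDR_DataRegionJT32)
//   .long LJTI0_0
//   .long LJTI0_1
//   .end_data_region         <-- EmitDataRegionEnd()
//
// MachObjectWriter turns each (Start, End) pair into an LC_DATA_IN_CODE
// entry further below.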
+ Data.End = getContext().CreateTempSymbol(); + EmitLabel(Data.End); +} + void MCMachOStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { // Let the target do whatever target specific stuff it needs to do. getAssembler().getBackend().handleAssemblerFlag(Flag); @@ -153,6 +175,26 @@ void MCMachOStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { } } +void MCMachOStreamer::EmitDataRegion(MCDataRegionType Kind) { + switch (Kind) { + case MCDR_DataRegion: + EmitDataRegion(DataRegionData::Data); + return; + case MCDR_DataRegionJT8: + EmitDataRegion(DataRegionData::JumpTable8); + return; + case MCDR_DataRegionJT16: + EmitDataRegion(DataRegionData::JumpTable16); + return; + case MCDR_DataRegionJT32: + EmitDataRegion(DataRegionData::JumpTable32); + return; + case MCDR_DataRegionEnd: + EmitDataRegionEnd(); + return; + } +} + void MCMachOStreamer::EmitThumbFunc(MCSymbol *Symbol) { // Remember that the function is a thumb function. Fixup and relocation // values will need adjusted. @@ -284,7 +326,7 @@ void MCMachOStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, } void MCMachOStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol, - unsigned Size, unsigned ByteAlignment) { + uint64_t Size, unsigned ByteAlignment) { MCSectionData &SectData = getAssembler().getOrCreateSectionData(*Section); // The symbol may not be present, which only creates the section. diff --git a/lib/MC/MCNullStreamer.cpp b/lib/MC/MCNullStreamer.cpp index 7ff2d1b..4c17d91 100644 --- a/lib/MC/MCNullStreamer.cpp +++ b/lib/MC/MCNullStreamer.cpp @@ -63,7 +63,7 @@ namespace { virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) {} virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0, - unsigned Size = 0, unsigned ByteAlignment = 0) {} + uint64_t Size = 0, unsigned ByteAlignment = 0) {} virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) {} virtual void EmitBytes(StringRef Data, unsigned AddrSpace) {} @@ -82,7 +82,7 @@ namespace { virtual bool EmitValueToOffset(const MCExpr *Offset, unsigned char Value = 0) { return false; } - + virtual void EmitFileDirective(StringRef Filename) {} virtual bool EmitDwarfFileDirective(unsigned FileNo, StringRef Directory, StringRef Filename) { @@ -99,12 +99,12 @@ namespace { virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) { RecordProcEnd(Frame); } - + /// @} }; } - + MCStreamer *llvm::createNullStreamer(MCContext &Context) { return new MCNullStreamer(Context); } diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp index b22ae33..4e6a1b9 100644 --- a/lib/MC/MCObjectFileInfo.cpp +++ b/lib/MC/MCObjectFileInfo.cpp @@ -169,7 +169,7 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) { Ctx->getMachOSection("__DWARF", "__apple_types", MCSectionMachO::S_ATTR_DEBUG, SectionKind::getMetadata()); - + DwarfAbbrevSection = Ctx->getMachOSection("__DWARF", "__debug_abbrev", MCSectionMachO::S_ATTR_DEBUG, diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 8aef43c..2daad0a 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -45,6 +45,8 @@ FatalAssemblerWarnings("fatal-assembler-warnings", namespace { /// \brief Helper class for tracking macro definitions. 
+typedef std::vector<AsmToken> MacroArgument; + struct Macro { StringRef Name; StringRef Body; @@ -178,9 +180,9 @@ private: bool ParseCppHashLineFilenameComment(const SMLoc &L); bool HandleMacroEntry(StringRef Name, SMLoc NameLoc, const Macro *M); - bool expandMacro(SmallString<256> &Buf, StringRef Body, + bool expandMacro(raw_svector_ostream &OS, StringRef Body, const std::vector<StringRef> &Parameters, - const std::vector<std::vector<AsmToken> > &A, + const std::vector<MacroArgument> &A, const SMLoc &L); void HandleMacroExit(); @@ -204,11 +206,18 @@ private: void EatToEndOfStatement(); + bool ParseMacroArgument(MacroArgument &MA); + bool ParseMacroArguments(const Macro *M, std::vector<MacroArgument> &A); + /// \brief Parse up to the end of statement and a return the contents from the /// current token until the end of the statement; the current token on exit /// will be either the EndOfStatement or EOF. StringRef ParseStringToEndOfStatement(); + /// \brief Parse until the end of a statement or a comma is encountered, + /// return the contents from the current token up to the end or comma. + StringRef ParseStringToComma(); + bool ParseAssignment(StringRef Name, bool allow_redef); bool ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc); @@ -245,6 +254,10 @@ private: bool ParseDirectiveIncbin(); // ".incbin" bool ParseDirectiveIf(SMLoc DirectiveLoc); // ".if" + // ".ifb" or ".ifnb", depending on ExpectBlank. + bool ParseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank); + // ".ifc" or ".ifnc", depending on ExpectEqual. + bool ParseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual); // ".ifdef" or ".ifndef", depending on expect_defined bool ParseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined); bool ParseDirectiveElseIf(SMLoc DirectiveLoc); // ".elseif" @@ -257,6 +270,15 @@ private: const MCExpr *ApplyModifierToExpr(const MCExpr *E, MCSymbolRefExpr::VariantKind Variant); + + // Macro-like directives + Macro *ParseMacroLikeBody(SMLoc DirectiveLoc); + void InstantiateMacroLikeBody(Macro *M, SMLoc DirectiveLoc, + raw_svector_ostream &OS); + bool ParseDirectiveRept(SMLoc DirectiveLoc); // ".rept" + bool ParseDirectiveIrp(SMLoc DirectiveLoc); // ".irp" + bool ParseDirectiveIrpc(SMLoc DirectiveLoc); // ".irpc" + bool ParseDirectiveEndr(SMLoc DirectiveLoc); // ".endr" }; /// \brief Generic implementations of directive handling, etc. which is shared @@ -328,6 +350,7 @@ public: AddDirectiveHandler<&GenericAsmParser::ParseDirectiveMacro>(".macro"); AddDirectiveHandler<&GenericAsmParser::ParseDirectiveEndMacro>(".endm"); AddDirectiveHandler<&GenericAsmParser::ParseDirectiveEndMacro>(".endmacro"); + AddDirectiveHandler<&GenericAsmParser::ParseDirectivePurgeMacro>(".purgem"); AddDirectiveHandler<&GenericAsmParser::ParseDirectiveLEB128>(".sleb128"); AddDirectiveHandler<&GenericAsmParser::ParseDirectiveLEB128>(".uleb128"); @@ -359,6 +382,7 @@ public: bool ParseDirectiveMacrosOnOff(StringRef, SMLoc DirectiveLoc); bool ParseDirectiveMacro(StringRef, SMLoc DirectiveLoc); bool ParseDirectiveEndMacro(StringRef, SMLoc DirectiveLoc); + bool ParseDirectivePurgeMacro(StringRef, SMLoc DirectiveLoc); bool ParseDirectiveLEB128(StringRef, SMLoc); }; @@ -456,7 +480,7 @@ bool AsmParser::EnterIncludeFile(const std::string &Filename) { } /// Process the specified .incbin file by seaching for it in the include paths -/// then just emiting the byte contents of the file to the streamer. This +/// then just emitting the byte contents of the file to the streamer. This /// returns true on failure. 
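// An aside on the conditional-assembly directives declared above: .ifb/.ifnb
// test whether their single operand is blank, and .ifc/.ifnc compare two
// comma-separated strings textually (ParseStringToComma exists to grab the
// first operand up to that comma). Example inputs (an assumption, not from
// this patch):
//
//   .ifb \arg                <-- true when the macro argument is blank
//   .long 0
//   .endif
//
//   .ifc foo, foo            <-- raw string comparison; true here
//   .long 1
//   .endif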
bool AsmParser::ProcessIncbinFile(const std::string &Filename) { std::string IncludedFile; @@ -602,6 +626,18 @@ StringRef AsmParser::ParseStringToEndOfStatement() { return StringRef(Start, End - Start); } +StringRef AsmParser::ParseStringToComma() { + const char *Start = getTok().getLoc().getPointer(); + + while (Lexer.isNot(AsmToken::EndOfStatement) && + Lexer.isNot(AsmToken::Comma) && + Lexer.isNot(AsmToken::Eof)) + Lex(); + + const char *End = getTok().getLoc().getPointer(); + return StringRef(Start, End - Start); +} + /// ParseParenExpr - Parse a paren expression and return it. /// NOTE: This assumes the leading '(' has already been consumed. /// @@ -700,7 +736,7 @@ bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { IDVal == "f" ? 1 : 0); Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, getContext()); - if(IDVal == "b" && Sym->isUndefined()) + if (IDVal == "b" && Sym->isUndefined()) return Error(Loc, "invalid reference to undefined symbol"); EndLoc = Lexer.getLoc(); Lex(); // Eat identifier. @@ -1042,6 +1078,14 @@ bool AsmParser::ParseStatement() { // example. if (IDVal == ".if") return ParseDirectiveIf(IDLoc); + if (IDVal == ".ifb") + return ParseDirectiveIfb(IDLoc, true); + if (IDVal == ".ifnb") + return ParseDirectiveIfb(IDLoc, false); + if (IDVal == ".ifc") + return ParseDirectiveIfc(IDLoc, true); + if (IDVal == ".ifnc") + return ParseDirectiveIfc(IDLoc, false); if (IDVal == ".ifdef") return ParseDirectiveIfdef(IDLoc, true); if (IDVal == ".ifndef" || IDVal == ".ifnotdef") @@ -1123,6 +1167,11 @@ bool AsmParser::ParseStatement() { // Otherwise, we have a normal instruction or directive. if (IDVal[0] == '.' && IDVal != ".") { + + // Target hook for parsing target specific directives. + if (!getTargetParser().ParseDirective(ID)) + return false; + // Assembler features if (IDVal == ".set" || IDVal == ".equ") return ParseDirectiveSet(IDVal, true); @@ -1192,6 +1241,10 @@ bool AsmParser::ParseStatement() { // Symbol attribute directives + if (IDVal == ".extern") { + EatToEndOfStatement(); // .extern is the default, ignore it. + return false; + } if (IDVal == ".globl" || IDVal == ".global") return ParseDirectiveSymbolAttribute(MCSA_Global); if (IDVal == ".indirect_symbol") @@ -1225,22 +1278,27 @@ bool AsmParser::ParseStatement() { if (IDVal == ".incbin") return ParseDirectiveIncbin(); - if (IDVal == ".code16") + if (IDVal == ".code16" || IDVal == ".code16gcc") return TokError(Twine(IDVal) + " not supported yet"); + // Macro-like directives + if (IDVal == ".rept") + return ParseDirectiveRept(IDLoc); + if (IDVal == ".irp") + return ParseDirectiveIrp(IDLoc); + if (IDVal == ".irpc") + return ParseDirectiveIrpc(IDLoc); + if (IDVal == ".endr") + return ParseDirectiveEndr(IDLoc); + // Look up the handler in the handler table. std::pair<MCAsmParserExtension*, DirectiveHandler> Handler = DirectiveMap.lookup(IDVal); if (Handler.first) return (*Handler.second)(Handler.first, IDVal, IDLoc); - // Target hook for parsing target specific directives. 
- if (!getTargetParser().ParseDirective(ID)) - return false; - bool retval = Warning(IDLoc, "ignoring directive for now"); - EatToEndOfStatement(); - return retval; + return Error(IDLoc, "unknown directive"); } CheckForValidSection(); @@ -1339,7 +1397,7 @@ bool AsmParser::ParseCppHashLineFilenameComment(const SMLoc &L) { return false; } -/// DiagHandler - will use the the last parsed cpp hash line filename comment +/// DiagHandler - will use the last parsed cpp hash line filename comment /// for the Filename and LineNo if any in the diagnostic. void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) { const AsmParser *Parser = static_cast<const AsmParser*>(Context); @@ -1393,11 +1451,10 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) { NewDiag.print(0, OS); } -bool AsmParser::expandMacro(SmallString<256> &Buf, StringRef Body, +bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body, const std::vector<StringRef> &Parameters, - const std::vector<std::vector<AsmToken> > &A, + const std::vector<MacroArgument> &A, const SMLoc &L) { - raw_svector_ostream OS(Buf); unsigned NParameters = Parameters.size(); if (NParameters != 0 && NParameters != A.size()) return Error(L, "Wrong number of arguments"); @@ -1449,7 +1506,7 @@ bool AsmParser::expandMacro(SmallString<256> &Buf, StringRef Body, break; // Otherwise substitute with the token values, with spaces eliminated. - for (std::vector<AsmToken>::const_iterator it = A[Index].begin(), + for (MacroArgument::const_iterator it = A[Index].begin(), ie = A[Index].end(); it != ie; ++it) OS << it->getString(); break; @@ -1472,7 +1529,7 @@ bool AsmParser::expandMacro(SmallString<256> &Buf, StringRef Body, if (Index == NParameters) return Error(L, "Parameter not found"); - for (std::vector<AsmToken>::const_iterator it = A[Index].begin(), + for (MacroArgument::const_iterator it = A[Index].begin(), ie = A[Index].end(); it != ie; ++it) OS << it->getString(); @@ -1482,9 +1539,6 @@ bool AsmParser::expandMacro(SmallString<256> &Buf, StringRef Body, Body = Body.substr(Pos); } - // We include the .endmacro in the buffer as our queue to exit the macro - // instantiation. - OS << ".endmacro\n"; return false; } @@ -1494,55 +1548,92 @@ MacroInstantiation::MacroInstantiation(const Macro *M, SMLoc IL, SMLoc EL, { } -bool AsmParser::HandleMacroEntry(StringRef Name, SMLoc NameLoc, - const Macro *M) { - // Arbitrarily limit macro nesting depth, to match 'as'. We can eliminate - // this, although we should protect against infinite loops. - if (ActiveMacros.size() == 20) - return TokError("macros cannot be nested more than 20 levels deep"); - - // Parse the macro instantiation arguments. - std::vector<std::vector<AsmToken> > MacroArguments; - MacroArguments.push_back(std::vector<AsmToken>()); +/// ParseMacroArgument - Extract AsmTokens for a macro argument. 
+/// This is used for both default macro parameter values and the +/// arguments in macro invocations +bool AsmParser::ParseMacroArgument(MacroArgument &MA) { unsigned ParenLevel = 0; + for (;;) { - if (Lexer.is(AsmToken::Eof)) + SMLoc LastTokenLoc; + + if (Lexer.is(AsmToken::Eof) || Lexer.is(AsmToken::Equal)) return TokError("unexpected token in macro instantiation"); + + // HandleMacroEntry relies on not advancing the lexer here + // to be able to fill in the remaining default parameter values if (Lexer.is(AsmToken::EndOfStatement)) break; + if (ParenLevel == 0 && Lexer.is(AsmToken::Comma)) + break; - // If we aren't inside parentheses and this is a comma, start a new token - // list. - if (ParenLevel == 0 && Lexer.is(AsmToken::Comma)) { - MacroArguments.push_back(std::vector<AsmToken>()); - } else { - // Adjust the current parentheses level. - if (Lexer.is(AsmToken::LParen)) - ++ParenLevel; - else if (Lexer.is(AsmToken::RParen) && ParenLevel) - --ParenLevel; - - // Append the token to the current argument list. - MacroArguments.back().push_back(getTok()); - } + // Adjust the current parentheses level. + if (Lexer.is(AsmToken::LParen)) + ++ParenLevel; + else if (Lexer.is(AsmToken::RParen) && ParenLevel) + --ParenLevel; + + // Append the token to the current argument list. + MA.push_back(getTok()); Lex(); } - // If the last argument didn't end up with any tokens, it's not a real - // argument and we should remove it from the list. This happens with either - // a tailing comma or an empty argument list. - if (MacroArguments.back().empty()) - MacroArguments.pop_back(); + if (ParenLevel != 0) + return TokError("unbalanced parenthesises in macro argument"); + return false; +} + +// Parse the macro instantiation arguments. +bool AsmParser::ParseMacroArguments(const Macro *M, + std::vector<MacroArgument> &A) { + const unsigned NParameters = M ? M->Parameters.size() : 0; + + // Parse two kinds of macro invocations: + // - macros defined without any parameters accept an arbitrary number of them + // - macros defined with parameters accept at most that many of them + for (unsigned Parameter = 0; !NParameters || Parameter < NParameters; + ++Parameter) { + MacroArgument MA; + + if (ParseMacroArgument(MA)) + return true; + + if (!MA.empty()) + A.push_back(MA); + if (Lexer.is(AsmToken::EndOfStatement)) + return false; + + if (Lexer.is(AsmToken::Comma)) + Lex(); + } + return TokError("Too many arguments"); +} + +bool AsmParser::HandleMacroEntry(StringRef Name, SMLoc NameLoc, + const Macro *M) { + // Arbitrarily limit macro nesting depth, to match 'as'. We can eliminate + // this, although we should protect against infinite loops. + if (ActiveMacros.size() == 20) + return TokError("macros cannot be nested more than 20 levels deep"); + + std::vector<MacroArgument> MacroArguments; + if (ParseMacroArguments(M, MacroArguments)) + return true; // Macro instantiation is lexical, unfortunately. We construct a new buffer // to hold the macro body with substitutions. SmallString<256> Buf; StringRef Body = M->Body; + raw_svector_ostream OS(Buf); - if (expandMacro(Buf, Body, M->Parameters, MacroArguments, getTok().getLoc())) + if (expandMacro(OS, Body, M->Parameters, MacroArguments, getTok().getLoc())) return true; + // We include the .endmacro in the buffer as our queue to exit the macro + // instantiation. 
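// In other words, macro invocation is textual: the expanded body is re-lexed
// from a fresh "<instantiation>" MemoryBuffer, and the .endmacro appended
// below is the sentinel that pops back to the original buffer. A sketch of
// the round trip (hypothetical macro, an assumption, not from this patch):
//
//   .macro inc reg
//   addl $1, \reg
//   .endm
//
//   inc %eax                 <-- HandleMacroEntry
//
// expands into an <instantiation> buffer containing:
//
//   addl $1, %eax
//   .endmacro                <-- triggers HandleMacroExit when re-lexed
//
// The .rept/.irp/.irpc directives added further below reuse the same
// mechanism with ".endr" as the sentinel (see InstantiateMacroLikeBody).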
+ OS << ".endmacro\n"; + MemoryBuffer *Instantiation = - MemoryBuffer::getMemBufferCopy(Buf.str(), "<instantiation>"); + MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>"); // Create the macro instantiation object and add to the current macro // instantiation stack. @@ -2295,10 +2386,9 @@ bool AsmParser::ParseDirectiveIncbin() { bool AsmParser::ParseDirectiveIf(SMLoc DirectiveLoc) { TheCondStack.push_back(TheCondState); TheCondState.TheCond = AsmCond::IfCond; - if(TheCondState.Ignore) { + if (TheCondState.Ignore) { EatToEndOfStatement(); - } - else { + } else { int64_t ExprValue; if (ParseAbsoluteExpression(ExprValue)) return true; @@ -2315,6 +2405,61 @@ bool AsmParser::ParseDirectiveIf(SMLoc DirectiveLoc) { return false; } +/// ParseDirectiveIfb +/// ::= .ifb string +bool AsmParser::ParseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank) { + TheCondStack.push_back(TheCondState); + TheCondState.TheCond = AsmCond::IfCond; + + if (TheCondState.Ignore) { + EatToEndOfStatement(); + } else { + StringRef Str = ParseStringToEndOfStatement(); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.ifb' directive"); + + Lex(); + + TheCondState.CondMet = ExpectBlank == Str.empty(); + TheCondState.Ignore = !TheCondState.CondMet; + } + + return false; +} + +/// ParseDirectiveIfc +/// ::= .ifc string1, string2 +bool AsmParser::ParseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual) { + TheCondStack.push_back(TheCondState); + TheCondState.TheCond = AsmCond::IfCond; + + if (TheCondState.Ignore) { + EatToEndOfStatement(); + } else { + StringRef Str1 = ParseStringToComma(); + + if (getLexer().isNot(AsmToken::Comma)) + return TokError("unexpected token in '.ifc' directive"); + + Lex(); + + StringRef Str2 = ParseStringToEndOfStatement(); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.ifc' directive"); + + Lex(); + + TheCondState.CondMet = ExpectEqual == (Str1 == Str2); + TheCondState.Ignore = !TheCondState.CondMet; + } + + return false; +} + +/// ParseDirectiveIfdef +/// ::= .ifdef symbol bool AsmParser::ParseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined) { StringRef Name; TheCondStack.push_back(TheCondState); @@ -2853,7 +2998,7 @@ bool GenericAsmParser::ParseDirectiveCFISameValue(StringRef IDVal, /// ParseDirectiveCFIRestore /// ::= .cfi_restore register bool GenericAsmParser::ParseDirectiveCFIRestore(StringRef IDVal, - SMLoc DirectiveLoc) { + SMLoc DirectiveLoc) { int64_t Register = 0; if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc)) return true; @@ -2866,7 +3011,7 @@ bool GenericAsmParser::ParseDirectiveCFIRestore(StringRef IDVal, /// ParseDirectiveCFIEscape /// ::= .cfi_escape expression[,...] 
bool GenericAsmParser::ParseDirectiveCFIEscape(StringRef IDVal, - SMLoc DirectiveLoc) { + SMLoc DirectiveLoc) { std::string Values; int64_t CurrValue; if (getParser().ParseAbsoluteExpression(CurrValue)) @@ -2998,6 +3143,27 @@ bool GenericAsmParser::ParseDirectiveEndMacro(StringRef Directive, "no current macro definition"); } +/// ParseDirectivePurgeMacro +/// ::= .purgem +bool GenericAsmParser::ParseDirectivePurgeMacro(StringRef Directive, + SMLoc DirectiveLoc) { + StringRef Name; + if (getParser().ParseIdentifier(Name)) + return TokError("expected identifier in '.purgem' directive"); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.purgem' directive"); + + StringMap<Macro*>::iterator I = getParser().MacroMap.find(Name); + if (I == getParser().MacroMap.end()) + return Error(DirectiveLoc, "macro '" + Name + "' is not defined"); + + // Undefine the macro. + delete I->getValue(); + getParser().MacroMap.erase(I); + return false; +} + bool GenericAsmParser::ParseDirectiveLEB128(StringRef DirName, SMLoc) { getParser().CheckForValidSection(); @@ -3017,6 +3183,217 @@ bool GenericAsmParser::ParseDirectiveLEB128(StringRef DirName, SMLoc) { return false; } +Macro *AsmParser::ParseMacroLikeBody(SMLoc DirectiveLoc) { + AsmToken EndToken, StartToken = getTok(); + + unsigned NestLevel = 0; + for (;;) { + // Check whether we have reached the end of the file. + if (getLexer().is(AsmToken::Eof)) { + Error(DirectiveLoc, "no matching '.endr' in definition"); + return 0; + } + + if (Lexer.is(AsmToken::Identifier) && + (getTok().getIdentifier() == ".rept")) { + ++NestLevel; + } + + // Otherwise, check whether we have reached the .endr. + if (Lexer.is(AsmToken::Identifier) && + getTok().getIdentifier() == ".endr") { + if (NestLevel == 0) { + EndToken = getTok(); + Lex(); + if (Lexer.isNot(AsmToken::EndOfStatement)) { + TokError("unexpected token in '.endr' directive"); + return 0; + } + break; + } + --NestLevel; + } + + // Otherwise, scan till the end of the statement. + EatToEndOfStatement(); + } + + const char *BodyStart = StartToken.getLoc().getPointer(); + const char *BodyEnd = EndToken.getLoc().getPointer(); + StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart); + + // We Are Anonymous. + StringRef Name; + std::vector<StringRef> Parameters; + return new Macro(Name, Body, Parameters); +} + +void AsmParser::InstantiateMacroLikeBody(Macro *M, SMLoc DirectiveLoc, + raw_svector_ostream &OS) { + OS << ".endr\n"; + + MemoryBuffer *Instantiation = + MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>"); + + // Create the macro instantiation object and add to the current macro + // instantiation stack. + MacroInstantiation *MI = new MacroInstantiation(M, DirectiveLoc, + getTok().getLoc(), + Instantiation); + ActiveMacros.push_back(MI); + + // Jump to the macro instantiation and prime the lexer. + CurBuffer = SrcMgr.AddNewSourceBuffer(MI->Instantiation, SMLoc()); + Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)); + Lex(); +} + +bool AsmParser::ParseDirectiveRept(SMLoc DirectiveLoc) { + int64_t Count; + if (ParseAbsoluteExpression(Count)) + return TokError("unexpected token in '.rept' directive"); + + if (Count < 0) + return TokError("Count is negative"); + + if (Lexer.isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.rept' directive"); + + // Eat the end of statement. + Lex(); + + // Lex the rept definition. + Macro *M = ParseMacroLikeBody(DirectiveLoc); + if (!M) + return true; + + // Macro instantiation is lexical, unfortunately. 
We construct a new buffer + // to hold the macro body with substitutions. + SmallString<256> Buf; + std::vector<StringRef> Parameters; + const std::vector<MacroArgument> A; + raw_svector_ostream OS(Buf); + while (Count--) { + if (expandMacro(OS, M->Body, Parameters, A, getTok().getLoc())) + return true; + } + InstantiateMacroLikeBody(M, DirectiveLoc, OS); + + return false; +} + +/// ParseDirectiveIrp +/// ::= .irp symbol,values +bool AsmParser::ParseDirectiveIrp(SMLoc DirectiveLoc) { + std::vector<StringRef> Parameters; + StringRef Parameter; + + if (ParseIdentifier(Parameter)) + return TokError("expected identifier in '.irp' directive"); + + Parameters.push_back(Parameter); + + if (Lexer.isNot(AsmToken::Comma)) + return TokError("expected comma in '.irp' directive"); + + Lex(); + + std::vector<MacroArgument> A; + if (ParseMacroArguments(0, A)) + return true; + + // Eat the end of statement. + Lex(); + + // Lex the irp definition. + Macro *M = ParseMacroLikeBody(DirectiveLoc); + if (!M) + return true; + + // Macro instantiation is lexical, unfortunately. We construct a new buffer + // to hold the macro body with substitutions. + SmallString<256> Buf; + raw_svector_ostream OS(Buf); + + for (std::vector<MacroArgument>::iterator i = A.begin(), e = A.end(); i != e; + ++i) { + std::vector<MacroArgument> Args; + Args.push_back(*i); + + if (expandMacro(OS, M->Body, Parameters, Args, getTok().getLoc())) + return true; + } + + InstantiateMacroLikeBody(M, DirectiveLoc, OS); + + return false; +} + +/// ParseDirectiveIrpc +/// ::= .irpc symbol,values +bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) { + std::vector<StringRef> Parameters; + StringRef Parameter; + + if (ParseIdentifier(Parameter)) + return TokError("expected identifier in '.irpc' directive"); + + Parameters.push_back(Parameter); + + if (Lexer.isNot(AsmToken::Comma)) + return TokError("expected comma in '.irpc' directive"); + + Lex(); + + std::vector<MacroArgument> A; + if (ParseMacroArguments(0, A)) + return true; + + if (A.size() != 1 || A.front().size() != 1) + return TokError("unexpected token in '.irpc' directive"); + + // Eat the end of statement. + Lex(); + + // Lex the irpc definition. + Macro *M = ParseMacroLikeBody(DirectiveLoc); + if (!M) + return true; + + // Macro instantiation is lexical, unfortunately. We construct a new buffer + // to hold the macro body with substitutions. + SmallString<256> Buf; + raw_svector_ostream OS(Buf); + + StringRef Values = A.front().front().getString(); + std::size_t I, End = Values.size(); + for (I = 0; I < End; ++I) { + MacroArgument Arg; + Arg.push_back(AsmToken(AsmToken::Identifier, Values.slice(I, I+1))); + + std::vector<MacroArgument> Args; + Args.push_back(Arg); + + if (expandMacro(OS, M->Body, Parameters, Args, getTok().getLoc())) + return true; + } + + InstantiateMacroLikeBody(M, DirectiveLoc, OS); + + return false; +} + +bool AsmParser::ParseDirectiveEndr(SMLoc DirectiveLoc) { + if (ActiveMacros.empty()) + return TokError("unexpected '.endr' directive, no current .rept"); + + // The only .repl that should get here are the ones created by + // InstantiateMacroLikeBody. + assert(getLexer().is(AsmToken::EndOfStatement)); + + HandleMacroExit(); + return false; +} /// \brief Create an MCAsmParser instance.
MCAsmParser *llvm::createMCAsmParser(SourceMgr &SM, diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp index 6f45068..5662fea 100644 --- a/lib/MC/MCParser/DarwinAsmParser.cpp +++ b/lib/MC/MCParser/DarwinAsmParser.cpp @@ -14,6 +14,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/MemoryBuffer.h" @@ -56,6 +57,9 @@ public: AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveTBSS>(".tbss"); AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveZerofill>(".zerofill"); + AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveDataRegion>(".data_region"); + AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveDataRegionEnd>(".end_data_region"); + // Special section directives. AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConst>(".const"); AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConstData>(".const_data"); @@ -113,6 +117,8 @@ public: bool ParseDirectiveSubsectionsViaSymbols(StringRef, SMLoc); bool ParseDirectiveTBSS(StringRef, SMLoc); bool ParseDirectiveZerofill(StringRef, SMLoc); + bool ParseDirectiveDataRegion(StringRef, SMLoc); + bool ParseDirectiveDataRegionEnd(StringRef, SMLoc); // Named Section Directive bool ParseSectionDirectiveConst(StringRef, SMLoc) { @@ -659,6 +665,42 @@ bool DarwinAsmParser::ParseDirectiveZerofill(StringRef, SMLoc) { return false; } +/// ParseDirectiveDataRegion +/// ::= .data_region [ ( jt8 | jt16 | jt32 ) ] +bool DarwinAsmParser::ParseDirectiveDataRegion(StringRef, SMLoc) { + if (getLexer().is(AsmToken::EndOfStatement)) { + Lex(); + getStreamer().EmitDataRegion(MCDR_DataRegion); + return false; + } + StringRef RegionType; + SMLoc Loc = getParser().getTok().getLoc(); + if (getParser().ParseIdentifier(RegionType)) + return TokError("expected region type after '.data_region' directive"); + int Kind = StringSwitch<int>(RegionType) + .Case("jt8", MCDR_DataRegionJT8) + .Case("jt16", MCDR_DataRegionJT16) + .Case("jt32", MCDR_DataRegionJT32) + .Default(-1); + if (Kind == -1) + return Error(Loc, "unknown region type in '.data_region' directive"); + Lex(); + + getStreamer().EmitDataRegion((MCDataRegionType)Kind); + return false; +} + +/// ParseDirectiveDataRegionEnd +/// ::= .end_data_region +bool DarwinAsmParser::ParseDirectiveDataRegionEnd(StringRef, SMLoc) { + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.end_data_region' directive"); + + Lex(); + getStreamer().EmitDataRegion(MCDR_DataRegionEnd); + return false; +} + namespace llvm { MCAsmParserExtension *createDarwinAsmParser() { diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp index ffc400b..9316bb1 100644 --- a/lib/MC/MCParser/ELFAsmParser.cpp +++ b/lib/MC/MCParser/ELFAsmParser.cpp @@ -64,6 +64,7 @@ public: AddDirectiveHandler<&ELFAsmParser::ParseDirectiveType>(".type"); AddDirectiveHandler<&ELFAsmParser::ParseDirectiveIdent>(".ident"); AddDirectiveHandler<&ELFAsmParser::ParseDirectiveSymver>(".symver"); + AddDirectiveHandler<&ELFAsmParser::ParseDirectiveVersion>(".version"); AddDirectiveHandler<&ELFAsmParser::ParseDirectiveWeakref>(".weakref"); AddDirectiveHandler<&ELFAsmParser::ParseDirectiveSymbolAttribute>(".weak"); AddDirectiveHandler<&ELFAsmParser::ParseDirectiveSymbolAttribute>(".local"); @@ -141,6 +142,7 @@ public: bool ParseDirectiveType(StringRef, SMLoc); bool ParseDirectiveIdent(StringRef, 
SMLoc); bool ParseDirectiveSymver(StringRef, SMLoc); + bool ParseDirectiveVersion(StringRef, SMLoc); bool ParseDirectiveWeakref(StringRef, SMLoc); bool ParseDirectiveSymbolAttribute(StringRef, SMLoc); @@ -548,6 +550,32 @@ bool ELFAsmParser::ParseDirectiveSymver(StringRef, SMLoc) { return false; } +/// ParseDirectiveVersion +/// ::= .version string +bool ELFAsmParser::ParseDirectiveVersion(StringRef, SMLoc) { + if (getLexer().isNot(AsmToken::String)) + return TokError("unexpected token in '.version' directive"); + + StringRef Data = getTok().getIdentifier(); + + Lex(); + + const MCSection *Note = + getContext().getELFSection(".note", ELF::SHT_NOTE, 0, + SectionKind::getReadOnly()); + + getStreamer().PushSection(); + getStreamer().SwitchSection(Note); + getStreamer().EmitIntValue(Data.size()+1, 4); // namesz. + getStreamer().EmitIntValue(0, 4); // descsz = 0 (no description). + getStreamer().EmitIntValue(1, 4); // type = NT_VERSION. + getStreamer().EmitBytes(Data, 0); // name. + getStreamer().EmitIntValue(0, 1); // terminate the string. + getStreamer().EmitValueToAlignment(4); // ensure 4 byte alignment. + getStreamer().PopSection(); + return false; +} + /// ParseDirectiveWeakref /// ::= .weakref foo, bar bool ELFAsmParser::ParseDirectiveWeakref(StringRef, SMLoc) { diff --git a/lib/MC/MCPureStreamer.cpp b/lib/MC/MCPureStreamer.cpp index a770c97..9ccab93 100644 --- a/lib/MC/MCPureStreamer.cpp +++ b/lib/MC/MCPureStreamer.cpp @@ -39,7 +39,7 @@ public: virtual void EmitLabel(MCSymbol *Symbol); virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value); virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0, - unsigned Size = 0, unsigned ByteAlignment = 0); + uint64_t Size = 0, unsigned ByteAlignment = 0); virtual void EmitBytes(StringRef Data, unsigned AddrSpace); virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0, unsigned ValueSize = 1, @@ -144,7 +144,7 @@ void MCPureStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) { } void MCPureStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol, - unsigned Size, unsigned ByteAlignment) { + uint64_t Size, unsigned ByteAlignment) { report_fatal_error("not yet implemented in pure streamer"); } diff --git a/lib/MC/MCSectionCOFF.cpp b/lib/MC/MCSectionCOFF.cpp index 90091f0..aac9377 100644 --- a/lib/MC/MCSectionCOFF.cpp +++ b/lib/MC/MCSectionCOFF.cpp @@ -20,7 +20,7 @@ MCSectionCOFF::~MCSectionCOFF() {} // anchor. // should be printed before the section name bool MCSectionCOFF::ShouldOmitSectionDirective(StringRef Name, const MCAsmInfo &MAI) const { - + // FIXME: Does .section .bss/.data/.text work everywhere?? 
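// In practice that means the three standard sections switch with the short
// form while everything else keeps the full directive (syntax shown is an
// assumption, for illustration):
//
//   .text                    <-- ShouldOmitSectionDirective returns true
//   .section .rdata,"dr"     <-- all other COFF sections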
if (Name == ".text" || Name == ".data" || Name == ".bss") return true; @@ -30,7 +30,7 @@ bool MCSectionCOFF::ShouldOmitSectionDirective(StringRef Name, void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, raw_ostream &OS) const { - + // standard sections don't require the '.section' if (ShouldOmitSectionDirective(SectionName, MAI)) { OS << '\t' << getSectionName() << '\n'; @@ -47,7 +47,7 @@ void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, if (getCharacteristics() & COFF::IMAGE_SCN_MEM_DISCARDABLE) OS << 'n'; OS << "\"\n"; - + if (getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT) { switch (Selection) { case COFF::IMAGE_COMDAT_SELECT_NODUPLICATES: diff --git a/lib/MC/MCSectionELF.cpp b/lib/MC/MCSectionELF.cpp index dfd77c3..0775cfa 100644 --- a/lib/MC/MCSectionELF.cpp +++ b/lib/MC/MCSectionELF.cpp @@ -22,7 +22,7 @@ MCSectionELF::~MCSectionELF() {} // anchor. // should be printed before the section name bool MCSectionELF::ShouldOmitSectionDirective(StringRef Name, const MCAsmInfo &MAI) const { - + // FIXME: Does .section .bss/.data/.text work everywhere?? if (Name == ".text" || Name == ".data" || (Name == ".bss" && !MAI.usesELFSectionDirectiveForBSS())) @@ -33,7 +33,7 @@ bool MCSectionELF::ShouldOmitSectionDirective(StringRef Name, void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, raw_ostream &OS) const { - + if (ShouldOmitSectionDirective(SectionName, MAI)) { OS << '\t' << getSectionName() << '\n'; return; @@ -62,7 +62,7 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, } // Handle the weird solaris syntax if desired. - if (MAI.usesSunStyleELFSectionSwitchSyntax() && + if (MAI.usesSunStyleELFSectionSwitchSyntax() && !(Flags & ELF::SHF_MERGE)) { if (Flags & ELF::SHF_ALLOC) OS << ",#alloc"; @@ -75,7 +75,7 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, OS << '\n'; return; } - + OS << ",\""; if (Flags & ELF::SHF_ALLOC) OS << 'a'; @@ -91,13 +91,13 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, OS << 'S'; if (Flags & ELF::SHF_TLS) OS << 'T'; - + // If there are target-specific flags, print them. if (Flags & ELF::XCORE_SHF_CP_SECTION) OS << 'c'; if (Flags & ELF::XCORE_SHF_DP_SECTION) OS << 'd'; - + OS << '"'; OS << ','; diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp index 43e62ff..e363f28 100644 --- a/lib/MC/MCStreamer.cpp +++ b/lib/MC/MCStreamer.cpp @@ -20,12 +20,9 @@ #include <cstdlib> using namespace llvm; -MCStreamer::MCStreamer(MCContext &Ctx) : Context(Ctx), EmitEHFrame(true), - EmitDebugFrame(false), - CurrentW64UnwindInfo(0), - LastSymbol(0), - UniqueCodeBeginSuffix(0), - UniqueDataBeginSuffix(0) { +MCStreamer::MCStreamer(MCContext &Ctx) + : Context(Ctx), EmitEHFrame(true), EmitDebugFrame(false), + CurrentW64UnwindInfo(0), LastSymbol(0) { const MCSection *section = NULL; SectionStack.push_back(std::make_pair(section, section)); } @@ -183,85 +180,6 @@ void MCStreamer::EmitLabel(MCSymbol *Symbol) { LastSymbol = Symbol; } -void MCStreamer::EmitDataRegion() { - if (RegionIndicator == Data) return; - - MCContext &Context = getContext(); - const MCAsmInfo &MAI = Context.getAsmInfo(); - if (!MAI.getSupportsDataRegions()) return; - - // Generate a unique symbol name. 
- MCSymbol *NewSym = Context.GetOrCreateSymbol(MAI.getDataBeginLabelName() + - Twine(UniqueDataBeginSuffix++)); - EmitLabel(NewSym); - - RegionIndicator = Data; -} - -void MCStreamer::EmitCodeRegion() { - if (RegionIndicator == Code) return; - - MCContext &Context = getContext(); - const MCAsmInfo &MAI = Context.getAsmInfo(); - if (!MAI.getSupportsDataRegions()) return; - - // Generate a unique symbol name. - MCSymbol *NewSym = Context.GetOrCreateSymbol(MAI.getCodeBeginLabelName() + - Twine(UniqueCodeBeginSuffix++)); - EmitLabel(NewSym); - - RegionIndicator = Code; -} - -void MCStreamer::EmitJumpTable8Region() { - if (RegionIndicator == JumpTable8) return; - - MCContext &Context = getContext(); - const MCAsmInfo &MAI = Context.getAsmInfo(); - if (!MAI.getSupportsDataRegions()) return; - - // Generate a unique symbol name. - MCSymbol *NewSym = - Context.GetOrCreateSymbol(MAI.getJumpTable8BeginLabelName() + - Twine(UniqueDataBeginSuffix++)); - EmitLabel(NewSym); - - RegionIndicator = JumpTable8; -} - -void MCStreamer::EmitJumpTable16Region() { - if (RegionIndicator == JumpTable16) return; - - MCContext &Context = getContext(); - const MCAsmInfo &MAI = Context.getAsmInfo(); - if (!MAI.getSupportsDataRegions()) return; - - // Generate a unique symbol name. - MCSymbol *NewSym = - Context.GetOrCreateSymbol(MAI.getJumpTable16BeginLabelName() + - Twine(UniqueDataBeginSuffix++)); - EmitLabel(NewSym); - - RegionIndicator = JumpTable16; -} - - -void MCStreamer::EmitJumpTable32Region() { - if (RegionIndicator == JumpTable32) return; - - MCContext &Context = getContext(); - const MCAsmInfo &MAI = Context.getAsmInfo(); - if (!MAI.getSupportsDataRegions()) return; - - // Generate a unique symbol name. - MCSymbol *NewSym = - Context.GetOrCreateSymbol(MAI.getJumpTable32BeginLabelName() + - Twine(UniqueDataBeginSuffix++)); - EmitLabel(NewSym); - - RegionIndicator = JumpTable32; -} - void MCStreamer::EmitCompactUnwindEncoding(uint32_t CompactUnwindEncoding) { EnsureValidFrame(); MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); @@ -283,7 +201,6 @@ void MCStreamer::EmitCFIStartProc() { EmitCFIStartProcImpl(Frame); FrameInfos.push_back(Frame); - RegionIndicator = Code; } void MCStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) { diff --git a/lib/MC/MCSubtargetInfo.cpp b/lib/MC/MCSubtargetInfo.cpp index 86dc108..05c83f7 100644 --- a/lib/MC/MCSubtargetInfo.cpp +++ b/lib/MC/MCSubtargetInfo.cpp @@ -17,11 +17,13 @@ using namespace llvm; +MCSchedModel MCSchedModel::DefaultSchedModel; // For unknown processors. 
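// getSchedModelForCPU below binary-searches a (CPU name, MCSchedModel*)
// table that TableGen emits sorted by name, and substitutes this default
// model when the CPU is unknown. A self-contained sketch of that lookup
// pattern (hypothetical table and values, an assumption, not part of the
// patch):
//
//   #include <algorithm>
//   #include <cstring>
//
//   struct InfoKV { const char *Key; const void *Value; };
//   static bool LessKV(const InfoKV &A, const InfoKV &B) {
//     return std::strcmp(A.Key, B.Key) < 0;
//   }
//
//   static const int DefaultModel = 0, CoreModel = 1;
//   static const InfoKV Table[] = {      // must stay sorted by Key
//     { "core2",   &CoreModel },
//     { "generic", &DefaultModel }
//   };
//
//   static const void *Lookup(const char *CPU) {
//     const InfoKV Key = { CPU, 0 };
//     const InfoKV *E = Table + sizeof(Table) / sizeof(Table[0]);
//     const InfoKV *I = std::lower_bound(Table, E, Key, LessKV);
//     if (I == E || std::strcmp(I->Key, CPU) != 0)
//       return &DefaultModel;            // mirrors the fallback below
//     return I->Value;
//   }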
+ void MCSubtargetInfo::InitMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS, const SubtargetFeatureKV *PF, const SubtargetFeatureKV *PD, - const SubtargetInfoKV *PI, + const SubtargetInfoKV *ProcSched, const InstrStage *IS, const unsigned *OC, const unsigned *FP, @@ -29,10 +31,10 @@ MCSubtargetInfo::InitMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS, TargetTriple = TT; ProcFeatures = PF; ProcDesc = PD; - ProcItins = PI; + ProcSchedModel = ProcSched; Stages = IS; OperandCycles = OC; - ForwardingPathes = FP; + ForwardingPaths = FP; NumFeatures = NF; NumProcs = NP; @@ -68,14 +70,14 @@ uint64_t MCSubtargetInfo::ToggleFeature(StringRef FS) { } -InstrItineraryData -MCSubtargetInfo::getInstrItineraryForCPU(StringRef CPU) const { - assert(ProcItins && "Instruction itineraries information not available!"); +MCSchedModel * +MCSubtargetInfo::getSchedModelForCPU(StringRef CPU) const { + assert(ProcSchedModel && "Processor machine model not available!"); #ifndef NDEBUG for (size_t i = 1; i < NumProcs; i++) { - assert(strcmp(ProcItins[i - 1].Key, ProcItins[i].Key) < 0 && - "Itineraries table is not sorted"); + assert(strcmp(ProcSchedModel[i - 1].Key, ProcSchedModel[i].Key) < 0 && + "Processor machine model table is not sorted"); } #endif @@ -83,14 +85,19 @@ MCSubtargetInfo::getInstrItineraryForCPU(StringRef CPU) const { SubtargetInfoKV KV; KV.Key = CPU.data(); const SubtargetInfoKV *Found = - std::lower_bound(ProcItins, ProcItins+NumProcs, KV); - if (Found == ProcItins+NumProcs || StringRef(Found->Key) != CPU) { + std::lower_bound(ProcSchedModel, ProcSchedModel+NumProcs, KV); + if (Found == ProcSchedModel+NumProcs || StringRef(Found->Key) != CPU) { errs() << "'" << CPU << "' is not a recognized processor for this target" << " (ignoring processor)\n"; - return InstrItineraryData(); + return &MCSchedModel::DefaultSchedModel; } + assert(Found->Value && "Missing processor SchedModel value"); + return (MCSchedModel *)Found->Value; +} - return InstrItineraryData(Stages, OperandCycles, ForwardingPathes, - (InstrItinerary *)Found->Value); +InstrItineraryData +MCSubtargetInfo::getInstrItineraryForCPU(StringRef CPU) const { + MCSchedModel *SchedModel = getSchedModelForCPU(CPU); + return InstrItineraryData(SchedModel, Stages, OperandCycles, ForwardingPaths); } diff --git a/lib/MC/MCSymbol.cpp b/lib/MC/MCSymbol.cpp index e013e77..f7f9184 100644 --- a/lib/MC/MCSymbol.cpp +++ b/lib/MC/MCSymbol.cpp @@ -30,7 +30,7 @@ static bool isAcceptableChar(char C) { /// syntactically correct. static bool NameNeedsQuoting(StringRef Str) { assert(!Str.empty() && "Cannot create an empty MCSymbol"); - + // If any of the characters in the string is an unacceptable character, force // quotes. 
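// For instance (an illustration, not from this patch): a name like
// my_func.eh consists only of acceptable characters and is printed bare,
// while a name containing a space or another odd character is wrapped in
// quotes by MCSymbol::print below:
//
//   my_func.eh               <-- printed as-is
//   "strange name"           <-- forced into quotes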
for (unsigned i = 0, e = Str.size(); i != e; ++i) @@ -72,7 +72,7 @@ void MCSymbol::print(raw_ostream &OS) const { OS << getName(); return; } - + OS << '"' << getName() << '"'; } diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp index 8e4066c..5820a22 100644 --- a/lib/MC/MachObjectWriter.cpp +++ b/lib/MC/MachObjectWriter.cpp @@ -21,6 +21,7 @@ #include "llvm/MC/MCMachOSymbolFlags.h" #include "llvm/MC/MCValue.h" #include "llvm/Object/MachOFormat.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include <vector> @@ -351,6 +352,21 @@ void MachObjectWriter::WriteNlist(MachSymbolData &MSD, Write32(Address); } +void MachObjectWriter::WriteLinkeditLoadCommand(uint32_t Type, + uint32_t DataOffset, + uint32_t DataSize) { + uint64_t Start = OS.tell(); + (void) Start; + + Write32(Type); + Write32(macho::LinkeditLoadCommandSize); + Write32(DataOffset); + Write32(DataSize); + + assert(OS.tell() - Start == macho::LinkeditLoadCommandSize); +} + + void MachObjectWriter::RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFragment *Fragment, @@ -654,6 +670,13 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm, macho::DysymtabLoadCommandSize); } + // Add the data-in-code load command size, if used. + unsigned NumDataRegions = Asm.getDataRegions().size(); + if (NumDataRegions) { + ++NumLoadCommands; + LoadCommandsSize += macho::LinkeditLoadCommandSize; + } + // Compute the total size of the section data, as well as its file size and vm // size. uint64_t SectionDataStart = (is64Bit() ? macho::Header64Size : @@ -701,6 +724,15 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm, RelocTableEnd += NumRelocs * macho::RelocationInfoSize; } + // Write the data-in-code load command, if used. + uint64_t DataInCodeTableEnd = RelocTableEnd + NumDataRegions * 8; + if (NumDataRegions) { + uint64_t DataRegionsOffset = RelocTableEnd; + uint64_t DataRegionsSize = NumDataRegions * 8; + WriteLinkeditLoadCommand(macho::LCT_DataInCode, DataRegionsOffset, + DataRegionsSize); + } + // Write the symbol table load command, if used. if (NumSymbols) { unsigned FirstLocalSymbol = 0; @@ -717,10 +749,10 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm, // If used, the indirect symbols are written after the section data. if (NumIndirectSymbols) - IndirectSymbolOffset = RelocTableEnd; + IndirectSymbolOffset = DataInCodeTableEnd; // The symbol table is written after the indirect symbol data. - uint64_t SymbolTableOffset = RelocTableEnd + IndirectSymbolSize; + uint64_t SymbolTableOffset = DataInCodeTableEnd + IndirectSymbolSize; // The string table is written after symbol table. uint64_t StringTableOffset = @@ -760,6 +792,23 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm, } } + // Write out the data-in-code region payload, if there is one. + for (MCAssembler::const_data_region_iterator + it = Asm.data_region_begin(), ie = Asm.data_region_end(); + it != ie; ++it) { + const DataRegionData *Data = &(*it); + uint64_t Start = getSymbolAddress(&Layout.getAssembler().getSymbolData(*Data->Start), Layout); + uint64_t End = getSymbolAddress(&Layout.getAssembler().getSymbolData(*Data->End), Layout); + DEBUG(dbgs() << "data in code region-- kind: " << Data->Kind + << " start: " << Start << "(" << Data->Start->getName() << ")" + << " end: " << End << "(" << Data->End->getName() << ")" + << " size: " << End - Start + << "\n"); + Write32(Start); + Write16(End - Start); + Write16(Data->Kind); + } + // Write the symbol table data, if used. 
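// Each region recorded by the streamer was just written above as one 8-byte
// data-in-code record. A sketch of the on-disk layout (mirroring the
// macho::DataInCodeTableEntry that MachOObject.cpp reads back below; the
// struct shown here is illustrative):
//
//   struct DataInCodeTableEntry {
//     uint32_t Offset;   // Write32(Start): where the data region begins
//     uint16_t Length;   // Write16(End - Start): region size in bytes
//     uint16_t Kind;     // Write16(Data->Kind): data, jt8, jt16, or jt32
//   };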
if (NumSymbols) { // Write the indirect symbol entries. diff --git a/lib/MC/SubtargetFeature.cpp b/lib/MC/SubtargetFeature.cpp index be41579..0a44e77 100644 --- a/lib/MC/SubtargetFeature.cpp +++ b/lib/MC/SubtargetFeature.cpp @@ -92,7 +92,7 @@ static void Split(std::vector<std::string> &V, const StringRef S) { static std::string Join(const std::vector<std::string> &V) { // Start with empty string. std::string Result; - // If the vector is not empty + // If the vector is not empty if (!V.empty()) { // Start with the first feature Result = V[0]; @@ -104,7 +104,7 @@ static std::string Join(const std::vector<std::string> &V) { Result += V[i]; } } - // Return the features string + // Return the features string return Result; } @@ -205,7 +205,7 @@ void SetImpliedBits(uint64_t &Bits, const SubtargetFeatureKV *FeatureEntry, /// ClearImpliedBits - For each feature that (transitively) implies this /// feature, clear it. -/// +/// static void ClearImpliedBits(uint64_t &Bits, const SubtargetFeatureKV *FeatureEntry, const SubtargetFeatureKV *FeatureTable, @@ -252,7 +252,7 @@ SubtargetFeatures::ToggleFeature(uint64_t Bits, const StringRef Feature, return Bits; } - + /// getFeatureBits - Get feature bits a CPU. /// @@ -279,7 +279,7 @@ uint64_t SubtargetFeatures::getFeatureBits(const StringRef CPU, // Check if help is needed if (CPU == "help") Help(CPUTable, CPUTableSize, FeatureTable, FeatureTableSize); - + // Find CPU entry if CPU name is specified. if (!CPU.empty()) { const SubtargetFeatureKV *CPUEntry = Find(CPU, CPUTable, CPUTableSize); @@ -304,11 +304,11 @@ uint64_t SubtargetFeatures::getFeatureBits(const StringRef CPU, // Iterate through each feature for (size_t i = 0, E = Features.size(); i < E; i++) { const StringRef Feature = Features[i]; - + // Check for help if (Feature == "+help") Help(CPUTable, CPUTableSize, FeatureTable, FeatureTableSize); - + // Find feature in table. const SubtargetFeatureKV *FeatureEntry = Find(StripFlag(Feature), FeatureTable, FeatureTableSize); @@ -349,7 +349,7 @@ void *SubtargetFeatures::getItinerary(const StringRef CPU, // Find entry const SubtargetInfoKV *Entry = Find(CPU, Table, TableSize); - + if (Entry) { return Entry->Value; } else { diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp index 67dc649..b026277 100644 --- a/lib/MC/WinCOFFStreamer.cpp +++ b/lib/MC/WinCOFFStreamer.cpp @@ -67,7 +67,7 @@ public: virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment); virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol, - unsigned Size,unsigned ByteAlignment); + uint64_t Size,unsigned ByteAlignment); virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment); virtual void EmitBytes(StringRef Data, unsigned AddrSpace); @@ -324,7 +324,7 @@ void WinCOFFStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, } void WinCOFFStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol, - unsigned Size,unsigned ByteAlignment) { + uint64_t Size,unsigned ByteAlignment) { llvm_unreachable("not implemented"); } diff --git a/lib/Object/Archive.cpp b/lib/Object/Archive.cpp index c5f15ba..2a5951a 100644 --- a/lib/Object/Archive.cpp +++ b/lib/Object/Archive.cpp @@ -28,7 +28,7 @@ struct ArchiveMemberHeader { char UID[6]; char GID[6]; char AccessMode[8]; - char Size[10]; //< Size of data, not including header or padding. + char Size[10]; ///< Size of data, not including header or padding. char Terminator[2]; ///! 
Get the name without looking up long names. @@ -60,11 +60,11 @@ static const ArchiveMemberHeader *ToHeader(const char *base) { static bool isInternalMember(const ArchiveMemberHeader &amh) { - const char *internals[] = { + static const char *const internals[] = { "/", "//", "#_LLVM_SYM_TAB_#" - }; + }; StringRef name = amh.getName(); for (std::size_t i = 0; i < sizeof(internals) / sizeof(*internals); ++i) { diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp index 8024ac8..7766946d 100644 --- a/lib/Object/COFFObjectFile.cpp +++ b/lib/Object/COFFObjectFile.cpp @@ -624,6 +624,28 @@ error_code COFFObjectFile::getSymbolName(const coff_symbol *symbol, return object_error::success; } +ArrayRef<uint8_t> COFFObjectFile::getSymbolAuxData( + const coff_symbol *symbol) const { + const uint8_t *aux = NULL; + + if ( symbol->NumberOfAuxSymbols > 0 ) { + // AUX data comes immediately after the symbol in COFF + aux = reinterpret_cast<const uint8_t *>(symbol + 1); +# ifndef NDEBUG + // Verify that the aux symbol points to a valid entry in the symbol table. + uintptr_t offset = uintptr_t(aux) - uintptr_t(base()); + if (offset < Header->PointerToSymbolTable + || offset >= Header->PointerToSymbolTable + + (Header->NumberOfSymbols * sizeof(coff_symbol))) + report_fatal_error("Aux Symbol data was outside of symbol table."); + + assert((offset - Header->PointerToSymbolTable) % sizeof(coff_symbol) + == 0 && "Aux Symbol data did not point to the beginning of a symbol"); +# endif + } + return ArrayRef<uint8_t>(aux, symbol->NumberOfAuxSymbols * sizeof(coff_symbol)); +} + error_code COFFObjectFile::getSectionName(const coff_section *Sec, StringRef &Res) const { StringRef Name; @@ -696,6 +718,20 @@ error_code COFFObjectFile::getRelocationType(DataRefImpl Rel, return object_error::success; } +const coff_section *COFFObjectFile::getCOFFSection(section_iterator &It) const { + return toSec(It->getRawDataRefImpl()); +} + +const coff_symbol *COFFObjectFile::getCOFFSymbol(symbol_iterator &It) const { + return toSymb(It->getRawDataRefImpl()); +} + +const coff_relocation *COFFObjectFile::getCOFFRelocation( + relocation_iterator &It) const { + return toRel(It->getRawDataRefImpl()); +} + + #define LLVM_COFF_SWITCH_RELOC_TYPE_NAME(enum) \ case COFF::enum: res = #enum; break; diff --git a/lib/Object/MachOObject.cpp b/lib/Object/MachOObject.cpp index b7e5cdc..00dea3f 100644 --- a/lib/Object/MachOObject.cpp +++ b/lib/Object/MachOObject.cpp @@ -357,6 +357,19 @@ void MachOObject::ReadSymbol64TableEntry(uint64_t SymbolTableOffset, ReadInMemoryStruct(*this, Buffer->getBuffer(), Offset, Res); } +template<> +void SwapStruct(macho::DataInCodeTableEntry &Value) { + SwapValue(Value.Offset); + SwapValue(Value.Length); + SwapValue(Value.Kind); +} +void MachOObject::ReadDataInCodeTableEntry(uint64_t TableOffset, + unsigned Index, + InMemoryStruct<macho::DataInCodeTableEntry> &Res) const { + uint64_t Offset = (TableOffset + + Index * sizeof(macho::DataInCodeTableEntry)); + ReadInMemoryStruct(*this, Buffer->getBuffer(), Offset, Res); +} void MachOObject::ReadULEB128s(uint64_t Index, SmallVectorImpl<uint64_t> &Out) const { diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp index 3bcda17..d229671 100644 --- a/lib/Object/MachOObjectFile.cpp +++ b/lib/Object/MachOObjectFile.cpp @@ -598,13 +598,15 @@ error_code MachOObjectFile::isSectionZeroInit(DataRefImpl DRI, if (MachOObj->is64Bit()) { InMemoryStruct<macho::Section64> Sect; getSection64(DRI, Sect); - Result = (Sect->Flags & MachO::SectionTypeZeroFill 
|| - Sect->Flags & MachO::SectionTypeZeroFillLarge); + unsigned SectionType = Sect->Flags & MachO::SectionFlagMaskSectionType; + Result = (SectionType == MachO::SectionTypeZeroFill || + SectionType == MachO::SectionTypeZeroFillLarge); } else { InMemoryStruct<macho::Section> Sect; getSection(DRI, Sect); - Result = (Sect->Flags & MachO::SectionTypeZeroFill || - Sect->Flags & MachO::SectionTypeZeroFillLarge); + unsigned SectionType = Sect->Flags & MachO::SectionFlagMaskSectionType; + Result = (SectionType == MachO::SectionTypeZeroFill || + SectionType == MachO::SectionTypeZeroFillLarge); } return object_error::success; @@ -786,7 +788,7 @@ error_code MachOObjectFile::getRelocationTypeName(DataRefImpl Rel, switch (Arch) { case Triple::x86: { - const char* Table[] = { + static const char *const Table[] = { "GENERIC_RELOC_VANILLA", "GENERIC_RELOC_PAIR", "GENERIC_RELOC_SECTDIFF", @@ -801,7 +803,7 @@ error_code MachOObjectFile::getRelocationTypeName(DataRefImpl Rel, break; } case Triple::x86_64: { - const char* Table[] = { + static const char *const Table[] = { "X86_64_RELOC_UNSIGNED", "X86_64_RELOC_SIGNED", "X86_64_RELOC_BRANCH", @@ -820,7 +822,7 @@ error_code MachOObjectFile::getRelocationTypeName(DataRefImpl Rel, break; } case Triple::arm: { - const char* Table[] = { + static const char *const Table[] = { "ARM_RELOC_VANILLA", "ARM_RELOC_PAIR", "ARM_RELOC_SECTDIFF", @@ -839,7 +841,7 @@ error_code MachOObjectFile::getRelocationTypeName(DataRefImpl Rel, break; } case Triple::ppc: { - const char* Table[] = { + static const char *const Table[] = { "PPC_RELOC_VANILLA", "PPC_RELOC_PAIR", "PPC_RELOC_BR14", diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp index 9b81fe7..38cfaed 100644 --- a/lib/Support/APInt.cpp +++ b/lib/Support/APInt.cpp @@ -1135,7 +1135,7 @@ APInt APInt::lshr(unsigned shiftAmt) const { // If all the bits were shifted out, the result is 0. This avoids issues // with shifting by the size of the integer type, which produces undefined // results. We define these "undefined results" to always be 0. - if (shiftAmt == BitWidth) + if (shiftAmt >= BitWidth) return APInt(BitWidth, 0); // If none of the bits are shifted out, the result is *this. This avoids @@ -1446,7 +1446,7 @@ APInt::mu APInt::magicu(unsigned LeadingZeros) const { APInt signedMin = APInt::getSignedMinValue(d.getBitWidth()); APInt signedMax = APInt::getSignedMaxValue(d.getBitWidth()); - nc = allOnes - (-d).urem(d); + nc = allOnes - (allOnes - d).urem(d); p = d.getBitWidth() - 1; // initialize p q1 = signedMin.udiv(nc); // initialize q1 = 2p/nc r1 = signedMin - q1*nc; // initialize r1 = rem(2p,nc) diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp index e6fdf16..593315d 100644 --- a/lib/Support/CommandLine.cpp +++ b/lib/Support/CommandLine.cpp @@ -219,10 +219,10 @@ static Option *LookupNearestOption(StringRef Arg, if (!Best || Distance < BestDistance) { Best = O; BestDistance = Distance; - if (RHS.empty() || !PermitValue) - NearestString = OptionNames[i]; - else - NearestString = std::string(OptionNames[i]) + "=" + RHS.str(); + if (RHS.empty() || !PermitValue) + NearestString = OptionNames[i]; + else + NearestString = std::string(OptionNames[i]) + "=" + RHS.str(); } } } diff --git a/lib/Support/ConstantRange.cpp b/lib/Support/ConstantRange.cpp index 5206cf1..720ef36 100644 --- a/lib/Support/ConstantRange.cpp +++ b/lib/Support/ConstantRange.cpp @@ -143,16 +143,17 @@ bool ConstantRange::isSignWrappedSet() const { /// getSetSize - Return the number of elements in this set. 
/// APInt ConstantRange::getSetSize() const { - if (isEmptySet()) - return APInt(getBitWidth(), 0); - if (getBitWidth() == 1) { - if (Lower != Upper) // One of T or F in the set... - return APInt(2, 1); - return APInt(2, 2); // Must be full set... + if (isEmptySet()) + return APInt(getBitWidth()+1, 0); + + if (isFullSet()) { + APInt Size(getBitWidth()+1, 0); + Size.setBit(getBitWidth()); + return Size; } - // Simply subtract the bounds... - return Upper - Lower; + // This is also correct for wrapped sets. + return (Upper - Lower).zext(getBitWidth()+1); } /// getUnsignedMax - Return the largest unsigned value contained in the @@ -248,6 +249,12 @@ ConstantRange ConstantRange::subtract(const APInt &Val) const { return ConstantRange(Lower - Val, Upper - Val); } +/// \brief Subtract the specified range from this range (aka relative complement +/// of the sets). +ConstantRange ConstantRange::difference(const ConstantRange &CR) const { + return intersectWith(CR.inverse()); +} + /// intersectWith - Return the range that results from the intersection of this /// range with another range. The resultant range is guaranteed to include all /// elements contained in both input ranges, and to have the smallest possible @@ -288,7 +295,7 @@ ConstantRange ConstantRange::intersectWith(const ConstantRange &CR) const { if (CR.Upper.ult(Upper)) return CR; - if (CR.Upper.ult(Lower)) + if (CR.Upper.ule(Lower)) return ConstantRange(CR.Lower, Upper); if (getSetSize().ult(CR.getSetSize())) @@ -316,7 +323,7 @@ ConstantRange ConstantRange::intersectWith(const ConstantRange &CR) const { return CR; } - if (CR.Upper.ult(Lower)) { + if (CR.Upper.ule(Lower)) { if (CR.Lower.ult(Lower)) return *this; @@ -420,9 +427,13 @@ ConstantRange ConstantRange::zeroExtend(uint32_t DstTySize) const { unsigned SrcTySize = getBitWidth(); assert(SrcTySize < DstTySize && "Not a value extension"); - if (isFullSet() || isWrappedSet()) + if (isFullSet() || isWrappedSet()) { // Change into [0, 1 << src bit width) - return ConstantRange(APInt(DstTySize,0), APInt(DstTySize,1).shl(SrcTySize)); + APInt LowerExt(DstTySize, 0); + if (!Upper) // special case: [X, 0) -- not really wrapping around + LowerExt = Lower.zext(DstTySize); + return ConstantRange(LowerExt, APInt(DstTySize, 1).shl(SrcTySize)); + } return ConstantRange(Lower.zext(DstTySize), Upper.zext(DstTySize)); } @@ -450,10 +461,53 @@ ConstantRange ConstantRange::signExtend(uint32_t DstTySize) const { /// truncated to the specified type. ConstantRange ConstantRange::truncate(uint32_t DstTySize) const { assert(getBitWidth() > DstTySize && "Not a value truncation"); - if (isFullSet() || getSetSize().getActiveBits() > DstTySize) + if (isEmptySet()) + return ConstantRange(DstTySize, /*isFullSet=*/false); + if (isFullSet()) return ConstantRange(DstTySize, /*isFullSet=*/true); - return ConstantRange(Lower.trunc(DstTySize), Upper.trunc(DstTySize)); + APInt MaxValue = APInt::getMaxValue(DstTySize).zext(getBitWidth()); + APInt MaxBitValue(getBitWidth(), 0); + MaxBitValue.setBit(DstTySize); + + APInt LowerDiv(Lower), UpperDiv(Upper); + ConstantRange Union(DstTySize, /*isFullSet=*/false); + + // Analyze wrapped sets in their two parts: [0, Upper) \/ [Lower, MaxValue] + // We use the non-wrapped set code to analyze the [Lower, MaxValue) part, and + // then we do the union with [MaxValue, Upper) + if (isWrappedSet()) { + // if Upper is greater than Max Value, it covers the whole truncated range. 
+ if (Upper.uge(MaxValue)) + return ConstantRange(DstTySize, /*isFullSet=*/true); + + Union = ConstantRange(APInt::getMaxValue(DstTySize),Upper.trunc(DstTySize)); + UpperDiv = APInt::getMaxValue(getBitWidth()); + + // Union covers the MaxValue case, so return if the remaining range is just + // MaxValue. + if (LowerDiv == UpperDiv) + return Union; + } + + // Chop off the most significant bits that are past the destination bitwidth. + if (LowerDiv.uge(MaxValue)) { + APInt Div(getBitWidth(), 0); + APInt::udivrem(LowerDiv, MaxBitValue, Div, LowerDiv); + UpperDiv = UpperDiv - MaxBitValue * Div; + } + + if (UpperDiv.ule(MaxValue)) + return ConstantRange(LowerDiv.trunc(DstTySize), + UpperDiv.trunc(DstTySize)).unionWith(Union); + + // The truncated value wrapps around. Check if we can do better than fullset. + APInt UpperModulo = UpperDiv - MaxBitValue; + if (UpperModulo.ult(LowerDiv)) + return ConstantRange(LowerDiv.trunc(DstTySize), + UpperModulo.trunc(DstTySize)).unionWith(Union); + + return ConstantRange(DstTySize, /*isFullSet=*/true); } /// zextOrTrunc - make this range have the bit width given by \p DstTySize. The @@ -529,8 +583,6 @@ ConstantRange::multiply(const ConstantRange &Other) const { if (isEmptySet() || Other.isEmptySet()) return ConstantRange(getBitWidth(), /*isFullSet=*/false); - if (isFullSet() || Other.isFullSet()) - return ConstantRange(getBitWidth(), /*isFullSet=*/true); APInt this_min = getUnsignedMin().zext(getBitWidth() * 2); APInt this_max = getUnsignedMax().zext(getBitWidth() * 2); diff --git a/lib/Support/CrashRecoveryContext.cpp b/lib/Support/CrashRecoveryContext.cpp index e2af0bc..e175056 100644 --- a/lib/Support/CrashRecoveryContext.cpp +++ b/lib/Support/CrashRecoveryContext.cpp @@ -223,7 +223,7 @@ void CrashRecoveryContext::Disable() { #include <signal.h> -static int Signals[] = { SIGABRT, SIGBUS, SIGFPE, SIGILL, SIGSEGV, SIGTRAP }; +static const int Signals[] = { SIGABRT, SIGBUS, SIGFPE, SIGILL, SIGSEGV, SIGTRAP }; static const unsigned NumSignals = sizeof(Signals) / sizeof(Signals[0]); static struct sigaction PrevActions[NumSignals]; diff --git a/lib/Support/Errno.cpp b/lib/Support/Errno.cpp index 18c6581..dd218f6 100644 --- a/lib/Support/Errno.cpp +++ b/lib/Support/Errno.cpp @@ -52,7 +52,7 @@ std::string StrError(int errnum) { # endif #elif HAVE_DECL_STRERROR_S // "Windows Secure API" if (errnum) - strerror_s(buffer, errnum); + strerror_s(buffer, MaxErrStrLen - 1, errnum); #elif defined(HAVE_STRERROR) // Copy the thread un-safe result of strerror into // the buffer as fast as possible to minimize impact diff --git a/lib/Support/GraphWriter.cpp b/lib/Support/GraphWriter.cpp index 32126ec..f6aaf83 100644 --- a/lib/Support/GraphWriter.cpp +++ b/lib/Support/GraphWriter.cpp @@ -99,7 +99,6 @@ void llvm::DisplayGraph(const sys::Path &Filename, bool wait, case GraphProgram::NEATO: args.push_back("-f"); args.push_back("neato");break; case GraphProgram::TWOPI: args.push_back("-f"); args.push_back("twopi");break; case GraphProgram::CIRCO: args.push_back("-f"); args.push_back("circo");break; - default: errs() << "Unknown graph layout name; using default.\n"; } args.push_back(0); diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index 0f06964..9a2c39d 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -11,7 +11,13 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/DataStream.h" +#include 
"llvm/Support/Debug.h" #include "llvm/Support/Host.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Config/config.h" #include <string.h> @@ -25,6 +31,12 @@ #ifdef _MSC_VER #include <intrin.h> #endif +#if defined(__APPLE__) && (defined(__ppc__) || defined(__powerpc__)) +#include <mach/mach.h> +#include <mach/mach_host.h> +#include <mach/host_info.h> +#include <mach/machine.h> +#endif //===----------------------------------------------------------------------===// // @@ -230,11 +242,18 @@ std::string sys::getHostCPUName() { case 45: return "corei7-avx"; - case 28: // Intel Atom processor. All processors are manufactured using - // the 45 nm process + // Ivy Bridge: + case 58: + return "core-avx-i"; + + case 28: // Most 45 nm Intel Atom processors + case 38: // 45 nm Atom Lincroft + case 39: // 32 nm Atom Medfield + case 53: // 32 nm Atom Midview + case 54: // 32 nm Atom Midview return "atom"; - default: return "i686"; + default: return (Em64T) ? "x86-64" : "i686"; } case 15: { switch (Model) { @@ -315,6 +334,179 @@ std::string sys::getHostCPUName() { } return "generic"; } +#elif defined(__APPLE__) && (defined(__ppc__) || defined(__powerpc__)) +std::string sys::getHostCPUName() { + host_basic_info_data_t hostInfo; + mach_msg_type_number_t infoCount; + + infoCount = HOST_BASIC_INFO_COUNT; + host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, + &infoCount); + + if (hostInfo.cpu_type != CPU_TYPE_POWERPC) return "generic"; + + switch(hostInfo.cpu_subtype) { + case CPU_SUBTYPE_POWERPC_601: return "601"; + case CPU_SUBTYPE_POWERPC_602: return "602"; + case CPU_SUBTYPE_POWERPC_603: return "603"; + case CPU_SUBTYPE_POWERPC_603e: return "603e"; + case CPU_SUBTYPE_POWERPC_603ev: return "603ev"; + case CPU_SUBTYPE_POWERPC_604: return "604"; + case CPU_SUBTYPE_POWERPC_604e: return "604e"; + case CPU_SUBTYPE_POWERPC_620: return "620"; + case CPU_SUBTYPE_POWERPC_750: return "750"; + case CPU_SUBTYPE_POWERPC_7400: return "7400"; + case CPU_SUBTYPE_POWERPC_7450: return "7450"; + case CPU_SUBTYPE_POWERPC_970: return "970"; + default: ; + } + + return "generic"; +} +#elif defined(__linux__) && (defined(__ppc__) || defined(__powerpc__)) +std::string sys::getHostCPUName() { + // Access to the Processor Version Register (PVR) on PowerPC is privileged, + // and so we must use an operating-system interface to determine the current + // processor type. On Linux, this is exposed through the /proc/cpuinfo file. + const char *generic = "generic"; + + // Note: We cannot mmap /proc/cpuinfo here and then process the resulting + // memory buffer because the 'file' has 0 size (it can be read from only + // as a stream). + + std::string Err; + DataStreamer *DS = getDataFileStreamer("/proc/cpuinfo", &Err); + if (!DS) { + DEBUG(dbgs() << "Unable to open /proc/cpuinfo: " << Err << "\n"); + return generic; + } + + // The cpu line is second (after the 'processor: 0' line), so if this + // buffer is too small then something has changed (or is wrong). + char buffer[1024]; + size_t CPUInfoSize = DS->GetBytes((unsigned char*) buffer, sizeof(buffer)); + delete DS; + + const char *CPUInfoStart = buffer; + const char *CPUInfoEnd = buffer + CPUInfoSize; + + const char *CIP = CPUInfoStart; + + const char *CPUStart = 0; + size_t CPULen = 0; + + // We need to find the first line which starts with cpu, spaces, and a colon. + // After the colon, there may be some additional spaces and then the cpu type. 
+ while (CIP < CPUInfoEnd && CPUStart == 0) { + if (CIP < CPUInfoEnd && *CIP == '\n') + ++CIP; + + if (CIP < CPUInfoEnd && *CIP == 'c') { + ++CIP; + if (CIP < CPUInfoEnd && *CIP == 'p') { + ++CIP; + if (CIP < CPUInfoEnd && *CIP == 'u') { + ++CIP; + while (CIP < CPUInfoEnd && (*CIP == ' ' || *CIP == '\t')) + ++CIP; + + if (CIP < CPUInfoEnd && *CIP == ':') { + ++CIP; + while (CIP < CPUInfoEnd && (*CIP == ' ' || *CIP == '\t')) + ++CIP; + + if (CIP < CPUInfoEnd) { + CPUStart = CIP; + while (CIP < CPUInfoEnd && (*CIP != ' ' && *CIP != '\t' && + *CIP != ',' && *CIP != '\n')) + ++CIP; + CPULen = CIP - CPUStart; + } + } + } + } + } + + if (CPUStart == 0) + while (CIP < CPUInfoEnd && *CIP != '\n') + ++CIP; + } + + if (CPUStart == 0) + return generic; + + return StringSwitch<const char *>(StringRef(CPUStart, CPULen)) + .Case("604e", "604e") + .Case("604", "604") + .Case("7400", "7400") + .Case("7410", "7400") + .Case("7447", "7400") + .Case("7455", "7450") + .Case("G4", "g4") + .Case("POWER4", "970") + .Case("PPC970FX", "970") + .Case("PPC970MP", "970") + .Case("G5", "g5") + .Case("POWER5", "g5") + .Case("A2", "a2") + .Case("POWER6", "pwr6") + .Case("POWER7", "pwr7") + .Default(generic); +} +#elif defined(__linux__) && defined(__arm__) +std::string sys::getHostCPUName() { + // The cpuid register on arm is not accessible from user space. On Linux, + // it is exposed through the /proc/cpuinfo file. + // Note: We cannot mmap /proc/cpuinfo here and then process the resulting + // memory buffer because the 'file' has 0 size (it can be read from only + // as a stream). + + std::string Err; + DataStreamer *DS = getDataFileStreamer("/proc/cpuinfo", &Err); + if (!DS) { + DEBUG(dbgs() << "Unable to open /proc/cpuinfo: " << Err << "\n"); + return "generic"; + } + + // Read 1024 bytes from /proc/cpuinfo, which should contain the CPU part line + // in all cases. + char buffer[1024]; + size_t CPUInfoSize = DS->GetBytes((unsigned char*) buffer, sizeof(buffer)); + delete DS; + + StringRef Str(buffer, CPUInfoSize); + + SmallVector<StringRef, 32> Lines; + Str.split(Lines, "\n"); + + // Look for the CPU implementer line. + StringRef Implementer; + for (unsigned I = 0, E = Lines.size(); I != E; ++I) + if (Lines[I].startswith("CPU implementer")) + Implementer = Lines[I].substr(15).ltrim("\t :"); + + if (Implementer == "0x41") // ARM Ltd. + // Look for the CPU part line. + for (unsigned I = 0, E = Lines.size(); I != E; ++I) + if (Lines[I].startswith("CPU part")) + // The CPU part is a 3 digit hexadecimal number with a 0x prefix. The + // values correspond to the "Part number" in the CP15/c0 register. The + // contents are specified in the various processor manuals. 
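The ARM path below consults two /proc/cpuinfo lines; as a hedged sample (values illustrative, not taken from the patch) they look like:

    CPU implementer : 0x41
    CPU part        : 0xc09

An implementer of 0x41 (ARM Ltd.) with part 0xc09 would make the StringSwitch below return "cortex-a9".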
+ return StringSwitch<const char *>(Lines[I].substr(8).ltrim("\t :")) + .Case("0x926", "arm926ej-s") + .Case("0xb02", "mpcore") + .Case("0xb36", "arm1136j-s") + .Case("0xb56", "arm1156t2-s") + .Case("0xb76", "arm1176jz-s") + .Case("0xc08", "cortex-a8") + .Case("0xc09", "cortex-a9") + .Case("0xc20", "cortex-m0") + .Case("0xc23", "cortex-m3") + .Case("0xc24", "cortex-m4") + .Default("generic"); + + return "generic"; +} #else std::string sys::getHostCPUName() { return "generic"; diff --git a/lib/Support/Memory.cpp b/lib/Support/Memory.cpp index 6806c1d..22f7494 100644 --- a/lib/Support/Memory.cpp +++ b/lib/Support/Memory.cpp @@ -45,7 +45,7 @@ void llvm::sys::Memory::InvalidateInstructionCache(const void *Addr, # if (defined(__POWERPC__) || defined (__ppc__) || \ defined(_POWER) || defined(_ARCH_PPC)) || defined(__arm__) - sys_icache_invalidate(Addr, Len); + sys_icache_invalidate(const_cast<void *>(Addr), Len); # endif #else @@ -67,11 +67,12 @@ void llvm::sys::Memory::InvalidateInstructionCache(const void *Addr, asm volatile("isync"); # elif defined(__arm__) && defined(__GNUC__) // FIXME: Can we safely always call this for __GNUC__ everywhere? - char *Start = (char*) Addr; - char *End = Start + Len; - __clear_cache(Start, End); + const char *Start = static_cast<const char *>(Addr); + const char *End = Start + Len; + __clear_cache(const_cast<char *>(Start), const_cast<char *>(End)); # elif defined(__mips__) - cacheflush((intptr_t)Addr, Len, BCACHE); + const char *Start = static_cast<const char *>(Addr); + cacheflush(const_cast<char *>(Start), Len, BCACHE); # endif #endif // end apple diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp index 16e5c7a..992f03c 100644 --- a/lib/Support/MemoryBuffer.cpp +++ b/lib/Support/MemoryBuffer.cpp @@ -17,6 +17,7 @@ #include "llvm/Config/config.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Errno.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" #include "llvm/Support/Process.h" #include "llvm/Support/Program.h" @@ -214,6 +215,14 @@ error_code MemoryBuffer::getFile(const char *Filename, OwningPtr<MemoryBuffer> &result, int64_t FileSize, bool RequiresNullTerminator) { + // First check that the "file" is not a directory + bool is_dir = false; + error_code err = sys::fs::is_directory(Filename, is_dir); + if (err) + return err; + if (is_dir) + return make_error_code(errc::is_a_directory); + int OpenFlags = O_RDONLY; #ifdef O_BINARY OpenFlags |= O_BINARY; // Open input file in binary mode on win32. @@ -304,16 +313,6 @@ error_code MemoryBuffer::getOpenFile(int FD, const char *Filename, RealMapOffset)) { result.reset(GetNamedBuffer<MemoryBufferMMapFile>( StringRef(Pages + Delta, MapSize), Filename, RequiresNullTerminator)); - - if (RequiresNullTerminator && result->getBufferEnd()[0] != '\0') { - // There could be a racing issue that resulted in the file being larger - // than the FileSize passed by the caller. We already have an assertion - // for this in MemoryBuffer::init() but have a runtime guarantee that - // the buffer will be null-terminated here, so do a copy that adds a - // null-terminator. 
- result.reset(MemoryBuffer::getMemBufferCopy(result->getBuffer(), - Filename)); - } return error_code::success(); } } diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp index dcddeda..db4a56b 100644 --- a/lib/Support/Path.cpp +++ b/lib/Support/Path.cpp @@ -60,8 +60,11 @@ sys::IdentifyFileType(const char *magic, unsigned length) { case '\177': if (magic[1] == 'E' && magic[2] == 'L' && magic[3] == 'F') { - if (length >= 18 && magic[17] == 0) - switch (magic[16]) { + bool Data2MSB = magic[5] == 2; + unsigned high = Data2MSB ? 16 : 17; + unsigned low = Data2MSB ? 17 : 16; + if (length >= 18 && magic[high] == 0) + switch (magic[low]) { default: break; case 1: return ELF_Relocatable_FileType; case 2: return ELF_Executable_FileType; diff --git a/lib/Support/PathV2.cpp b/lib/Support/PathV2.cpp index e2a69a6..46571c0 100644 --- a/lib/Support/PathV2.cpp +++ b/lib/Support/PathV2.cpp @@ -744,6 +744,8 @@ error_code has_magic(const Twine &path, const Twine &magic, bool &result) { /// @brief Identify the magic in magic. file_magic identify_magic(StringRef magic) { + if (magic.size() < 4) + return file_magic::unknown; switch ((unsigned char)magic[0]) { case 0xDE: // 0x0B17C0DE = BC wraper if (magic[1] == (char)0xC0 && magic[2] == (char)0x17 && diff --git a/lib/Support/SourceMgr.cpp b/lib/Support/SourceMgr.cpp index 15278c5..e4e01be 100644 --- a/lib/Support/SourceMgr.cpp +++ b/lib/Support/SourceMgr.cpp @@ -79,9 +79,10 @@ int SourceMgr::FindBufferContainingLoc(SMLoc Loc) const { return -1; } -/// FindLineNumber - Find the line number for the specified location in the -/// specified file. This is not a fast method. -unsigned SourceMgr::FindLineNumber(SMLoc Loc, int BufferID) const { +/// getLineAndColumn - Find the line and column number for the specified +/// location in the specified file. This is not a fast method. +std::pair<unsigned, unsigned> +SourceMgr::getLineAndColumn(SMLoc Loc, int BufferID) const { if (BufferID == -1) BufferID = FindBufferContainingLoc(Loc); assert(BufferID != -1 && "Invalid Location!"); @@ -91,7 +92,8 @@ unsigned SourceMgr::FindLineNumber(SMLoc Loc, int BufferID) const { // location. unsigned LineNo = 1; - const char *Ptr = Buff->getBufferStart(); + const char *BufStart = Buff->getBufferStart(); + const char *Ptr = BufStart; // If we have a line number cache, and if the query is to a later point in the // same file, start searching from the last query location. This optimizes @@ -108,7 +110,6 @@ unsigned SourceMgr::FindLineNumber(SMLoc Loc, int BufferID) const { for (; SMLoc::getFromPointer(Ptr) != Loc; ++Ptr) if (*Ptr == '\n') ++LineNo; - // Allocate the line number cache if it doesn't exist. if (LineNoCache == 0) LineNoCache = new LineNoCacheTy(); @@ -118,7 +119,10 @@ unsigned SourceMgr::FindLineNumber(SMLoc Loc, int BufferID) const { Cache.LastQueryBufferID = BufferID; Cache.LastQuery = Ptr; Cache.LineNoOfQuery = LineNo; - return LineNo; + + size_t NewlineOffs = StringRef(BufStart, Ptr-BufStart).find_last_of("\n\r"); + if (NewlineOffs == StringRef::npos) NewlineOffs = ~(size_t)0; + return std::make_pair(LineNo, Ptr-BufStart-NewlineOffs); } void SourceMgr::PrintIncludeStack(SMLoc IncludeLoc, raw_ostream &OS) const { @@ -145,50 +149,59 @@ SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind, ArrayRef<SMRange> Ranges) const { // First thing to do: find the current buffer containing the specified - // location. 
- int CurBuf = FindBufferContainingLoc(Loc); - assert(CurBuf != -1 && "Invalid or unspecified location!"); - - MemoryBuffer *CurMB = getBufferInfo(CurBuf).Buffer; - - // Scan backward to find the start of the line. - const char *LineStart = Loc.getPointer(); - while (LineStart != CurMB->getBufferStart() && - LineStart[-1] != '\n' && LineStart[-1] != '\r') - --LineStart; - - // Get the end of the line. - const char *LineEnd = Loc.getPointer(); - while (LineEnd != CurMB->getBufferEnd() && - LineEnd[0] != '\n' && LineEnd[0] != '\r') - ++LineEnd; - std::string LineStr(LineStart, LineEnd); - - // Convert any ranges to column ranges that only intersect the line of the - // location. + // location to pull out the source line. SmallVector<std::pair<unsigned, unsigned>, 4> ColRanges; - for (unsigned i = 0, e = Ranges.size(); i != e; ++i) { - SMRange R = Ranges[i]; - if (!R.isValid()) continue; - - // If the line doesn't contain any part of the range, then ignore it. - if (R.Start.getPointer() > LineEnd || R.End.getPointer() < LineStart) - continue; - - // Ignore pieces of the range that go onto other lines. - if (R.Start.getPointer() < LineStart) - R.Start = SMLoc::getFromPointer(LineStart); - if (R.End.getPointer() > LineEnd) - R.End = SMLoc::getFromPointer(LineEnd); + std::pair<unsigned, unsigned> LineAndCol; + const char *BufferID = "<unknown>"; + std::string LineStr; + + if (Loc.isValid()) { + int CurBuf = FindBufferContainingLoc(Loc); + assert(CurBuf != -1 && "Invalid or unspecified location!"); + + MemoryBuffer *CurMB = getBufferInfo(CurBuf).Buffer; + BufferID = CurMB->getBufferIdentifier(); - // Translate from SMLoc ranges to column ranges. - ColRanges.push_back(std::make_pair(R.Start.getPointer()-LineStart, - R.End.getPointer()-LineStart)); + // Scan backward to find the start of the line. + const char *LineStart = Loc.getPointer(); + const char *BufStart = CurMB->getBufferStart(); + while (LineStart != BufStart && LineStart[-1] != '\n' && + LineStart[-1] != '\r') + --LineStart; + + // Get the end of the line. + const char *LineEnd = Loc.getPointer(); + const char *BufEnd = CurMB->getBufferEnd(); + while (LineEnd != BufEnd && LineEnd[0] != '\n' && LineEnd[0] != '\r') + ++LineEnd; + LineStr = std::string(LineStart, LineEnd); + + // Convert any ranges to column ranges that only intersect the line of the + // location. + for (unsigned i = 0, e = Ranges.size(); i != e; ++i) { + SMRange R = Ranges[i]; + if (!R.isValid()) continue; + + // If the line doesn't contain any part of the range, then ignore it. + if (R.Start.getPointer() > LineEnd || R.End.getPointer() < LineStart) + continue; + + // Ignore pieces of the range that go onto other lines. + if (R.Start.getPointer() < LineStart) + R.Start = SMLoc::getFromPointer(LineStart); + if (R.End.getPointer() > LineEnd) + R.End = SMLoc::getFromPointer(LineEnd); + + // Translate from SMLoc ranges to column ranges. 
+ ColRanges.push_back(std::make_pair(R.Start.getPointer()-LineStart, + R.End.getPointer()-LineStart)); + } + + LineAndCol = getLineAndColumn(Loc, CurBuf); } - - return SMDiagnostic(*this, Loc, - CurMB->getBufferIdentifier(), FindLineNumber(Loc, CurBuf), - Loc.getPointer()-LineStart, Kind, Msg.str(), + + return SMDiagnostic(*this, Loc, BufferID, LineAndCol.first, + LineAndCol.second-1, Kind, Msg.str(), LineStr, ColRanges); } @@ -205,9 +218,11 @@ void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind, raw_ostream &OS = errs(); - int CurBuf = FindBufferContainingLoc(Loc); - assert(CurBuf != -1 && "Invalid or unspecified location!"); - PrintIncludeStack(getBufferInfo(CurBuf).IncludeLoc, OS); + if (Loc != SMLoc()) { + int CurBuf = FindBufferContainingLoc(Loc); + assert(CurBuf != -1 && "Invalid or unspecified location!"); + PrintIncludeStack(getBufferInfo(CurBuf).IncludeLoc, OS); + } Diagnostic.print(0, OS, ShowColors); } @@ -228,8 +243,8 @@ SMDiagnostic::SMDiagnostic(const SourceMgr &sm, SMLoc L, const std::string &FN, void SMDiagnostic::print(const char *ProgName, raw_ostream &S, bool ShowColors) const { - // Display colors only if OS goes to a tty. - ShowColors &= S.is_displayed(); + // Display colors only if OS supports colors. + ShowColors &= S.has_colors(); if (ShowColors) S.changeColor(raw_ostream::SAVEDCOLOR, true); @@ -343,5 +358,3 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S, S << '\n'; } - - diff --git a/lib/Support/StreamableMemoryObject.cpp b/lib/Support/StreamableMemoryObject.cpp index c23f07b..fe3752a 100644 --- a/lib/Support/StreamableMemoryObject.cpp +++ b/lib/Support/StreamableMemoryObject.cpp @@ -20,7 +20,7 @@ class RawMemoryObject : public StreamableMemoryObject { public: RawMemoryObject(const unsigned char *Start, const unsigned char *End) : FirstChar(Start), LastChar(End) { - assert(LastChar > FirstChar && "Invalid start/end range"); + assert(LastChar >= FirstChar && "Invalid start/end range"); } virtual uint64_t getBase() const { return 0; } diff --git a/lib/Support/StringMap.cpp b/lib/Support/StringMap.cpp index c131fe0..c2fc261 100644 --- a/lib/Support/StringMap.cpp +++ b/lib/Support/StringMap.cpp @@ -189,7 +189,7 @@ void StringMapImpl::RehashTable() { // grow/rehash the table. if (NumItems*4 > NumBuckets*3) { NewSize = NumBuckets*2; - } else if (NumBuckets-(NumItems+NumTombstones) < NumBuckets/8) { + } else if (NumBuckets-(NumItems+NumTombstones) <= NumBuckets/8) { NewSize = NumBuckets; } else { return; diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp index abe570f..8aab4b2 100644 --- a/lib/Support/StringRef.cpp +++ b/lib/Support/StringRef.cpp @@ -12,6 +12,7 @@ #include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/edit_distance.h" + #include <bitset> using namespace llvm; @@ -230,6 +231,31 @@ StringRef::size_type StringRef::find_last_of(StringRef Chars, return npos; } +/// find_last_not_of - Find the last character in the string that is not +/// \arg C, or npos if not found. +StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { + for (size_type i = min(From, Length) - 1, e = -1; i != e; --i) + if (Data[i] != C) + return i; + return npos; +} + +/// find_last_not_of - Find the last character in the string that is not in +/// \arg Chars, or npos if not found. 
+/// +/// Note: O(size() + Chars.size()) +StringRef::size_type StringRef::find_last_not_of(StringRef Chars, + size_t From) const { + std::bitset<1 << CHAR_BIT> CharBits; + for (size_type i = 0, e = Chars.size(); i != e; ++i) + CharBits.set((unsigned char)Chars[i]); + + for (size_type i = min(From, Length) - 1, e = -1; i != e; --i) + if (!CharBits.test((unsigned char)Data[i])) + return i; + return npos; +} + void StringRef::split(SmallVectorImpl<StringRef> &A, StringRef Separators, int MaxSplit, bool KeepEmpty) const { @@ -272,14 +298,22 @@ static unsigned GetAutoSenseRadix(StringRef &Str) { if (Str.startswith("0x")) { Str = Str.substr(2); return 16; - } else if (Str.startswith("0b")) { + } + + if (Str.startswith("0b")) { Str = Str.substr(2); return 2; - } else if (Str.startswith("0")) { + } + + if (Str.startswith("0o")) { + Str = Str.substr(2); return 8; - } else { - return 10; } + + if (Str.startswith("0")) + return 8; + + return 10; } @@ -383,7 +417,7 @@ bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { unsigned BitWidth = Log2Radix * Str.size(); if (BitWidth < Result.getBitWidth()) BitWidth = Result.getBitWidth(); // don't shrink the result - else + else if (BitWidth > Result.getBitWidth()) Result = Result.zext(BitWidth); APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix diff --git a/lib/Support/TargetRegistry.cpp b/lib/Support/TargetRegistry.cpp index 53c8d84..9c81327 100644 --- a/lib/Support/TargetRegistry.cpp +++ b/lib/Support/TargetRegistry.cpp @@ -23,6 +23,47 @@ TargetRegistry::iterator TargetRegistry::begin() { return iterator(FirstTarget); } +const Target *TargetRegistry::lookupTarget(const std::string &ArchName, + Triple &TheTriple, + std::string &Error) { + // Allocate target machine. First, check whether the user has explicitly + // specified an architecture to compile for. If so we have to look it up by + // name, because it might be a backend that has no mapping to a target triple. + const Target *TheTarget = 0; + if (!ArchName.empty()) { + for (TargetRegistry::iterator it = TargetRegistry::begin(), + ie = TargetRegistry::end(); it != ie; ++it) { + if (ArchName == it->getName()) { + TheTarget = &*it; + break; + } + } + + if (!TheTarget) { + Error = "error: invalid target '" + ArchName + "'.\n"; + return 0; + } + + // Adjust the triple to match (if known), otherwise stick with the + // given triple. + Triple::ArchType Type = Triple::getArchTypeForLLVMName(ArchName); + if (Type != Triple::UnknownArch) + TheTriple.setArch(Type); + } else { + // Get the target specific parser. + std::string TempError; + TheTarget = TargetRegistry::lookupTarget(TheTriple.getTriple(), TempError); + if (TheTarget == 0) { + Error = ": error: unable to get target for '" + + TheTriple.getTriple() + + "', see --version and --triple.\n"; + return 0; + } + } + + return TheTarget; +} + const Target *TargetRegistry::lookupTarget(const std::string &TT, std::string &Error) { // Provide special warning when no targets are initialized. 
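A minimal usage sketch of the new lookupTarget overload added above, in the spirit of an llc-style driver; MArch and TripleStr are hypothetical variables and error handling is abbreviated:

    // Hedged sketch, not part of the patch.
    Triple TheTriple(TripleStr);
    std::string Error;
    const Target *TheTarget =
        TargetRegistry::lookupTarget(MArch, TheTriple, Error);
    if (TheTarget == 0) {
      errs() << Error;  // the overload formats its own message
      return 1;
    }
    // On success, TheTriple's architecture may have been adjusted to MArch.

Note that the two error strings in the overload are formatted differently ("error: ..." versus ": error: ..."), presumably so a program name can be prepended to the second form.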
diff --git a/lib/Support/ThreadLocal.cpp b/lib/Support/ThreadLocal.cpp index 08b12b6..0587aae 100644 --- a/lib/Support/ThreadLocal.cpp +++ b/lib/Support/ThreadLocal.cpp @@ -25,9 +25,18 @@ namespace llvm { using namespace sys; ThreadLocalImpl::ThreadLocalImpl() { } ThreadLocalImpl::~ThreadLocalImpl() { } -void ThreadLocalImpl::setInstance(const void* d) { data = const_cast<void*>(d);} -const void* ThreadLocalImpl::getInstance() { return data; } -void ThreadLocalImpl::removeInstance() { data = 0; } +void ThreadLocalImpl::setInstance(const void* d) { + typedef int SIZE_TOO_BIG[sizeof(d) <= sizeof(data) ? 1 : -1]; + void **pd = reinterpret_cast<void**>(&data); + *pd = const_cast<void*>(d); +} +const void* ThreadLocalImpl::getInstance() { + void **pd = reinterpret_cast<void**>(&data); + return *pd; +} +void ThreadLocalImpl::removeInstance() { + setInstance(0); +} } #else @@ -40,31 +49,30 @@ void ThreadLocalImpl::removeInstance() { data = 0; } namespace llvm { using namespace sys; -ThreadLocalImpl::ThreadLocalImpl() : data(0) { - pthread_key_t* key = new pthread_key_t; +ThreadLocalImpl::ThreadLocalImpl() : data() { + typedef int SIZE_TOO_BIG[sizeof(pthread_key_t) <= sizeof(data) ? 1 : -1]; + pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data); int errorcode = pthread_key_create(key, NULL); assert(errorcode == 0); (void) errorcode; - data = (void*)key; } ThreadLocalImpl::~ThreadLocalImpl() { - pthread_key_t* key = static_cast<pthread_key_t*>(data); + pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data); int errorcode = pthread_key_delete(*key); assert(errorcode == 0); (void) errorcode; - delete key; } void ThreadLocalImpl::setInstance(const void* d) { - pthread_key_t* key = static_cast<pthread_key_t*>(data); + pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data); int errorcode = pthread_setspecific(*key, d); assert(errorcode == 0); (void) errorcode; } const void* ThreadLocalImpl::getInstance() { - pthread_key_t* key = static_cast<pthread_key_t*>(data); + pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data); return pthread_getspecific(*key); } diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index 44a1b38..7b26ea9 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -38,8 +38,8 @@ const char *Triple::getArchTypeName(ArchType Kind) { case x86_64: return "x86_64"; case xcore: return "xcore"; case mblaze: return "mblaze"; - case ptx32: return "ptx32"; - case ptx64: return "ptx64"; + case nvptx: return "nvptx"; + case nvptx64: return "nvptx64"; case le32: return "le32"; case amdil: return "amdil"; } @@ -62,7 +62,12 @@ const char *Triple::getArchTypePrefix(ArchType Kind) { case mblaze: return "mblaze"; - case hexagon: return "hexagon"; + case mips: + case mipsel: + case mips64: + case mips64el:return "mips"; + + case hexagon: return "hexagon"; case r600: return "r600"; @@ -74,8 +79,8 @@ const char *Triple::getArchTypePrefix(ArchType Kind) { case xcore: return "xcore"; - case ptx32: return "ptx"; - case ptx64: return "ptx"; + case nvptx: return "nvptx"; + case nvptx64: return "nvptx"; case le32: return "le32"; case amdil: return "amdil"; } @@ -160,8 +165,8 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { .Case("x86", x86) .Case("x86-64", x86_64) .Case("xcore", xcore) - .Case("ptx32", ptx32) - .Case("ptx64", ptx64) + .Case("nvptx", nvptx) + .Case("nvptx64", nvptx64) .Case("le32", le32) .Case("amdil", amdil) .Default(UnknownArch); @@ -192,8 +197,8 @@ Triple::ArchType Triple::getArchTypeForDarwinArchName(StringRef Str) { 
.Cases("arm", "armv4t", "armv5", "armv6", Triple::arm) .Cases("armv7", "armv7f", "armv7k", "armv7s", "xscale", Triple::arm) .Case("r600", Triple::r600) - .Case("ptx32", Triple::ptx32) - .Case("ptx64", Triple::ptx64) + .Case("nvptx", Triple::nvptx) + .Case("nvptx64", Triple::nvptx64) .Case("amdil", Triple::amdil) .Default(Triple::UnknownArch); } @@ -215,8 +220,8 @@ const char *Triple::getArchNameForAssembler() { .Cases("armv6", "thumbv6", "armv6") .Cases("armv7", "thumbv7", "armv7") .Case("r600", "r600") - .Case("ptx32", "ptx32") - .Case("ptx64", "ptx64") + .Case("nvptx", "nvptx") + .Case("nvptx64", "nvptx64") .Case("le32", "le32") .Case("amdil", "amdil") .Default(NULL); @@ -249,8 +254,8 @@ static Triple::ArchType parseArch(StringRef ArchName) { .Case("sparcv9", Triple::sparcv9) .Case("tce", Triple::tce) .Case("xcore", Triple::xcore) - .Case("ptx32", Triple::ptx32) - .Case("ptx64", Triple::ptx64) + .Case("nvptx", Triple::nvptx) + .Case("nvptx64", Triple::nvptx64) .Case("le32", Triple::le32) .Case("amdil", Triple::amdil) .Default(Triple::UnknownArch); @@ -584,6 +589,29 @@ bool Triple::getMacOSXVersion(unsigned &Major, unsigned &Minor, return true; } +void Triple::getiOSVersion(unsigned &Major, unsigned &Minor, + unsigned &Micro) const { + switch (getOS()) { + default: llvm_unreachable("unexpected OS for Darwin triple"); + case Darwin: + case MacOSX: + // Ignore the version from the triple. This is only handled because the + // the clang driver combines OS X and IOS support into a common Darwin + // toolchain that wants to know the iOS version number even when targeting + // OS X. + Major = 3; + Minor = 0; + Micro = 0; + break; + case IOS: + getOSVersion(Major, Minor, Micro); + // Default to 3.0. + if (Major == 0) + Major = 3; + break; + } +} + void Triple::setTriple(const Twine &Str) { *this = Triple(Str); } @@ -652,8 +680,8 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) { case llvm::Triple::mblaze: case llvm::Triple::mips: case llvm::Triple::mipsel: + case llvm::Triple::nvptx: case llvm::Triple::ppc: - case llvm::Triple::ptx32: case llvm::Triple::r600: case llvm::Triple::sparc: case llvm::Triple::tce: @@ -664,8 +692,8 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) { case llvm::Triple::mips64: case llvm::Triple::mips64el: + case llvm::Triple::nvptx64: case llvm::Triple::ppc64: - case llvm::Triple::ptx64: case llvm::Triple::sparcv9: case llvm::Triple::x86_64: return 64; @@ -701,8 +729,8 @@ Triple Triple::get32BitArchVariant() const { case Triple::mblaze: case Triple::mips: case Triple::mipsel: + case Triple::nvptx: case Triple::ppc: - case Triple::ptx32: case Triple::r600: case Triple::sparc: case Triple::tce: @@ -714,8 +742,8 @@ Triple Triple::get32BitArchVariant() const { case Triple::mips64: T.setArch(Triple::mips); break; case Triple::mips64el: T.setArch(Triple::mipsel); break; + case Triple::nvptx64: T.setArch(Triple::nvptx); break; case Triple::ppc64: T.setArch(Triple::ppc); break; - case Triple::ptx64: T.setArch(Triple::ptx32); break; case Triple::sparcv9: T.setArch(Triple::sparc); break; case Triple::x86_64: T.setArch(Triple::x86); break; } @@ -742,8 +770,8 @@ Triple Triple::get64BitArchVariant() const { case Triple::mips64: case Triple::mips64el: + case Triple::nvptx64: case Triple::ppc64: - case Triple::ptx64: case Triple::sparcv9: case Triple::x86_64: // Already 64-bit. 
@@ -751,8 +779,8 @@ Triple Triple::get64BitArchVariant() const { case Triple::mips: T.setArch(Triple::mips64); break; case Triple::mipsel: T.setArch(Triple::mips64el); break; + case Triple::nvptx: T.setArch(Triple::nvptx64); break; case Triple::ppc: T.setArch(Triple::ppc64); break; - case Triple::ptx32: T.setArch(Triple::ptx64); break; case Triple::sparc: T.setArch(Triple::sparcv9); break; case Triple::x86: T.setArch(Triple::x86_64); break; } diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc index ddc1e0f..b41390a 100644 --- a/lib/Support/Unix/Path.inc +++ b/lib/Support/Unix/Path.inc @@ -884,7 +884,8 @@ const char *Path::MapInFilePages(int FD, size_t FileSize, off_t Offset) { } void Path::UnMapFilePages(const char *BasePtr, size_t FileSize) { - ::munmap((void*)BasePtr, FileSize); + const void *Addr = static_cast<const void *>(BasePtr); + ::munmap(const_cast<void *>(Addr), FileSize); } } // end llvm namespace diff --git a/lib/Support/Unix/PathV2.inc b/lib/Support/Unix/PathV2.inc index 7d259a3..93ccd1a 100644 --- a/lib/Support/Unix/PathV2.inc +++ b/lib/Support/Unix/PathV2.inc @@ -17,12 +17,16 @@ //===----------------------------------------------------------------------===// #include "Unix.h" +#include "llvm/Support/Process.h" #if HAVE_SYS_STAT_H #include <sys/stat.h> #endif #if HAVE_FCNTL_H #include <fcntl.h> #endif +#ifdef HAVE_SYS_MMAN_H +#include <sys/mman.h> +#endif #if HAVE_DIRENT_H # include <dirent.h> # define NAMLEN(dirent) strlen((dirent)->d_name) @@ -274,8 +278,7 @@ error_code exists(const Twine &path, bool &result) { SmallString<128> path_storage; StringRef p = path.toNullTerminatedStringRef(path_storage); - struct stat status; - if (::stat(p.begin(), &status) == -1) { + if (::access(p.begin(), F_OK) == -1) { if (errno != errc::no_such_file_or_directory) return error_code(errno, system_category()); result = false; @@ -287,8 +290,8 @@ error_code exists(const Twine &path, bool &result) { bool equivalent(file_status A, file_status B) { assert(status_known(A) && status_known(B)); - return A.st_dev == B.st_dev && - A.st_ino == B.st_ino; + return A.fs_st_dev == B.fs_st_dev && + A.fs_st_ino == B.fs_st_ino; } error_code equivalent(const Twine &A, const Twine &B, bool &result) { @@ -327,30 +330,62 @@ error_code status(const Twine &path, file_status &result) { return ec; } + perms prms = static_cast<perms>(status.st_mode & perms_mask); + if (S_ISDIR(status.st_mode)) - result = file_status(file_type::directory_file); + result = file_status(file_type::directory_file, prms); else if (S_ISREG(status.st_mode)) - result = file_status(file_type::regular_file); + result = file_status(file_type::regular_file, prms); else if (S_ISBLK(status.st_mode)) - result = file_status(file_type::block_file); + result = file_status(file_type::block_file, prms); else if (S_ISCHR(status.st_mode)) - result = file_status(file_type::character_file); + result = file_status(file_type::character_file, prms); else if (S_ISFIFO(status.st_mode)) - result = file_status(file_type::fifo_file); + result = file_status(file_type::fifo_file, prms); else if (S_ISSOCK(status.st_mode)) - result = file_status(file_type::socket_file); + result = file_status(file_type::socket_file, prms); else - result = file_status(file_type::type_unknown); + result = file_status(file_type::type_unknown, prms); + + result.fs_st_dev = status.st_dev; + result.fs_st_ino = status.st_ino; + + return error_code::success(); +} + +// Modifies permissions on a file. 
+error_code permissions(const Twine &path, perms prms) { + if ((prms & add_perms) && (prms & remove_perms)) + llvm_unreachable("add_perms and remove_perms are mutually exclusive"); - result.st_dev = status.st_dev; - result.st_ino = status.st_ino; + // Get current permissions + file_status info; + if (error_code ec = status(path, info)) { + return ec; + } + + // Set updated permissions. + SmallString<128> path_storage; + StringRef p = path.toNullTerminatedStringRef(path_storage); + perms permsToSet; + if (prms & add_perms) { + permsToSet = (info.permissions() | prms) & perms_mask; + } else if (prms & remove_perms) { + permsToSet = (info.permissions() & ~prms) & perms_mask; + } else { + permsToSet = prms & perms_mask; + } + if (::chmod(p.begin(), static_cast<mode_t>(permsToSet))) { + return error_code(errno, system_category()); + } return error_code::success(); } +// Since this is most often used for temporary files, mode defaults to 0600. error_code unique_file(const Twine &model, int &result_fd, - SmallVectorImpl<char> &result_path, - bool makeAbsolute) { + SmallVectorImpl<char> &result_path, + bool makeAbsolute, unsigned mode) { SmallString<128> Model; model.toVector(Model); // Null terminate. @@ -367,37 +402,20 @@ error_code unique_file(const Twine &model, int &result_fd, } } - // Replace '%' with random chars. From here on, DO NOT modify model. It may be - // needed if the randomly chosen path already exists. - SmallString<128> RandomPath; - RandomPath.reserve(Model.size() + 1); - ::srand(::time(NULL)); + // From here on, DO NOT modify model. It may be needed if the randomly chosen + // path already exists. + SmallString<128> RandomPath = Model; retry_random_path: - // This is opened here instead of above to make it easier to track when to - // close it. Collisions should be rare enough for the possible extra syscalls - // not to matter. - FILE *RandomSource = ::fopen("/dev/urandom", "r"); - RandomPath.set_size(0); - for (SmallVectorImpl<char>::const_iterator i = Model.begin(), - e = Model.end(); i != e; ++i) { - if (*i == '%') { - char val = 0; - if (RandomSource) - val = fgetc(RandomSource); - else - val = ::rand(); - RandomPath.push_back("0123456789abcdef"[val & 15]); - } else - RandomPath.push_back(*i); + // Replace '%' with random chars. + for (unsigned i = 0, e = Model.size(); i != e; ++i) { + if (Model[i] == '%') + RandomPath[i] = "0123456789abcdef"[sys::Process::GetRandomNumber() & 15]; } - if (RandomSource) - ::fclose(RandomSource); - // Try to open + create the file. rety_open_create: - int RandomFD = ::open(RandomPath.c_str(), O_RDWR | O_CREAT | O_EXCL, 0600); + int RandomFD = ::open(RandomPath.c_str(), O_RDWR | O_CREAT | O_EXCL, mode); if (RandomFD == -1) { // If the file existed, try again, otherwise, error. if (errno == errc::file_exists) @@ -513,6 +531,36 @@ error_code get_magic(const Twine &path, uint32_t len, return error_code::success(); } +error_code map_file_pages(const Twine &path, off_t file_offset, size_t size, + bool map_writable, void *&result) { + SmallString<128> path_storage; + StringRef name = path.toNullTerminatedStringRef(path_storage); + int oflags = map_writable ? O_RDWR : O_RDONLY; + int ofd = ::open(name.begin(), oflags); + if ( ofd == -1 ) + return error_code(errno, system_category()); + AutoFD fd(ofd); + int flags = map_writable ? MAP_SHARED : MAP_PRIVATE; + int prot = map_writable ? 
(PROT_READ|PROT_WRITE) : PROT_READ; +#ifdef MAP_FILE + flags |= MAP_FILE; +#endif + result = ::mmap(0, size, prot, flags, fd, file_offset); + if (result == MAP_FAILED) { + return error_code(errno, system_category()); + } + + return error_code::success(); +} + +error_code unmap_file_pages(void *base, size_t size) { + if ( ::munmap(base, size) == -1 ) + return error_code(errno, system_category()); + + return error_code::success(); +} + + } // end namespace fs } // end namespace sys } // end namespace llvm diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc index f640462..174112e 100644 --- a/lib/Support/Unix/Process.inc +++ b/lib/Support/Unix/Process.inc @@ -12,6 +12,8 @@ //===----------------------------------------------------------------------===// #include "Unix.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/Support/TimeValue.h" #ifdef HAVE_SYS_TIME_H #include <sys/time.h> #endif @@ -247,16 +249,18 @@ static bool terminalHasColors() { return false; } +bool Process::FileDescriptorHasColors(int fd) { + // A file descriptor has colors if it is displayed and the terminal has + // colors. + return FileDescriptorIsDisplayed(fd) && terminalHasColors(); +} + bool Process::StandardOutHasColors() { - if (!StandardOutIsDisplayed()) - return false; - return terminalHasColors(); + return FileDescriptorHasColors(STDOUT_FILENO); } bool Process::StandardErrHasColors() { - if (!StandardErrIsDisplayed()) - return false; - return terminalHasColors(); + return FileDescriptorHasColors(STDERR_FILENO); } bool Process::ColorNeedsFlush() { @@ -297,3 +301,33 @@ const char *Process::OutputReverse() { const char *Process::ResetColor() { return "\033[0m"; } + +#if !defined(HAVE_ARC4RANDOM) +static unsigned GetRandomNumberSeed() { + // Attempt to get the initial seed from /dev/urandom, if possible. + if (FILE *RandomSource = ::fopen("/dev/urandom", "r")) { + unsigned seed; + int count = ::fread((void *)&seed, sizeof(seed), 1, RandomSource); + ::fclose(RandomSource); + + // Return the seed if the read was successful. + if (count == 1) + return seed; + } + + // Otherwise, swizzle the current time and the process ID to form a reasonable + // seed. + TimeValue Now = llvm::TimeValue::now(); + return hash_combine(Now.seconds(), Now.nanoseconds(), ::getpid()); +} +#endif + +unsigned llvm::sys::Process::GetRandomNumber() { +#if defined(HAVE_ARC4RANDOM) + return arc4random(); +#else + static int x = (::srand(GetRandomNumberSeed()), 0); + (void)x; + return ::rand(); +#endif +} diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc index f5454cf..1d667ab 100644 --- a/lib/Support/Unix/Signals.inc +++ b/lib/Support/Unix/Signals.inc @@ -15,6 +15,7 @@ #include "Unix.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/Mutex.h" +#include <string> #include <vector> #include <algorithm> #if HAVE_EXECINFO_H @@ -43,7 +44,7 @@ static SmartMutex<true> SignalsMutex; /// InterruptFunction - The function to call if ctrl-c is pressed. static void (*InterruptFunction)() = 0; -static std::vector<sys::Path> FilesToRemove; +static std::vector<std::string> FilesToRemove; static std::vector<std::pair<void(*)(void*), void*> > CallBacksToRun; // IntSigs - Signals that may interrupt the program at any time. @@ -117,10 +118,20 @@ static void UnregisterHandlers() { /// RemoveFilesToRemove - Process the FilesToRemove list. This function /// should be called with the SignalsMutex lock held. +/// NB: This must be an async signal safe function. It cannot allocate or free +/// memory, even in debug builds. 
static void RemoveFilesToRemove() { - while (!FilesToRemove.empty()) { - FilesToRemove.back().eraseFromDisk(true); - FilesToRemove.pop_back(); + // Note: avoid iterators in case of debug iterators that allocate or release + // memory. + for (unsigned i = 0, e = FilesToRemove.size(); i != e; ++i) { + // Note that we don't want to use any external code here, and we don't care + // about errors. We're going to try as hard as we can, as often as we need + // to, to make these files go away. If these aren't files, too bad. + // + // We do, however, rely on a std::string implementation for which repeated + // calls to 'c_str()' don't allocate memory. We pre-call 'c_str()' on all + // of these strings to try to ensure this is safe. + unlink(FilesToRemove[i].c_str()); } } @@ -178,7 +189,19 @@ void llvm::sys::SetInterruptFunction(void (*IF)()) { bool llvm::sys::RemoveFileOnSignal(const sys::Path &Filename, std::string* ErrMsg) { SignalsMutex.acquire(); - FilesToRemove.push_back(Filename); + std::string *OldPtr = FilesToRemove.empty() ? 0 : &FilesToRemove[0]; + FilesToRemove.push_back(Filename.str()); + + // We want to call 'c_str()' on every std::string in this vector so that if + // the underlying implementation requires a re-allocation, it happens here + // rather than inside of the signal handler. If we see the vector grow, we + // have to call it on every entry. If it remains in place, we only need to + // call it on the latest one. + if (OldPtr == &FilesToRemove[0]) + FilesToRemove.back().c_str(); + else + for (unsigned i = 0, e = FilesToRemove.size(); i != e; ++i) + FilesToRemove[i].c_str(); SignalsMutex.release(); @@ -189,10 +212,19 @@ bool llvm::sys::RemoveFileOnSignal(const sys::Path &Filename, // DontRemoveFileOnSignal - The public API void llvm::sys::DontRemoveFileOnSignal(const sys::Path &Filename) { SignalsMutex.acquire(); - std::vector<sys::Path>::reverse_iterator I = - std::find(FilesToRemove.rbegin(), FilesToRemove.rend(), Filename); - if (I != FilesToRemove.rend()) - FilesToRemove.erase(I.base()-1); + std::vector<std::string>::reverse_iterator RI = + std::find(FilesToRemove.rbegin(), FilesToRemove.rend(), Filename.str()); + std::vector<std::string>::iterator I = FilesToRemove.end(); + if (RI != FilesToRemove.rend()) + I = FilesToRemove.erase(RI.base()-1); + + // We need to call c_str() on every element which would have been moved by + // the erase. In a C++98 implementation, where c_str() may require a + // reallocation on its first call, the c_str() call made on insertion could + // have been invalidated when an element was copied down by the erase.
+ for (std::vector<std::string>::iterator E = FilesToRemove.end(); I != E; ++I) + I->c_str(); + SignalsMutex.release(); } diff --git a/lib/Support/Unix/Unix.h b/lib/Support/Unix/Unix.h index b7be311..361f297 100644 --- a/lib/Support/Unix/Unix.h +++ b/lib/Support/Unix/Unix.h @@ -44,16 +44,10 @@ #include <assert.h> #endif -#ifdef TIME_WITH_SYS_TIME +#ifdef HAVE_SYS_TIME_H # include <sys/time.h> -# include <time.h> -#else -# ifdef HAVE_SYS_TIME_H -# include <sys/time.h> -# else -# include <time.h> -# endif #endif +#include <time.h> #ifdef HAVE_SYS_WAIT_H # include <sys/wait.h> diff --git a/lib/Support/Windows/Path.inc b/lib/Support/Windows/Path.inc index d8dc522..2280b34 100644 --- a/lib/Support/Windows/Path.inc +++ b/lib/Support/Windows/Path.inc @@ -188,8 +188,20 @@ static Path *TempDirectory; Path Path::GetTemporaryDirectory(std::string* ErrMsg) { - if (TempDirectory) + if (TempDirectory) { +#if defined(_MSC_VER) + // Visual Studio gets confused and emits a diagnostic about calling exists, + // even though this is the implementation for PathV1. Temporarily + // disable the deprecated warning message + #pragma warning(push) + #pragma warning(disable:4996) +#endif + assert(TempDirectory->exists() && "Who has removed TempDirectory?"); +#if defined(_MSC_VER) + #pragma warning(pop) +#endif return *TempDirectory; + } char pathname[MAX_PATH]; if (!GetTempPath(MAX_PATH, pathname)) { @@ -201,7 +213,7 @@ Path::GetTemporaryDirectory(std::string* ErrMsg) { Path result; result.set(pathname); - // Append a subdirectory passed on our process id so multiple LLVMs don't + // Append a subdirectory based on our process id so multiple LLVMs don't // step on each other's toes. #ifdef __MINGW32__ // Mingw's Win32 header files are broken. diff --git a/lib/Support/Windows/PathV2.inc b/lib/Support/Windows/PathV2.inc index e9ce5d9..66eeab0 100644 --- a/lib/Support/Windows/PathV2.inc +++ b/lib/Support/Windows/PathV2.inc @@ -301,11 +301,21 @@ error_code rename(const Twine &from, const Twine &to) { if (error_code ec = UTF8ToUTF16(f, wide_from)) return ec; if (error_code ec = UTF8ToUTF16(t, wide_to)) return ec; - if (!::MoveFileExW(wide_from.begin(), wide_to.begin(), - MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING)) - return windows_error(::GetLastError()); + error_code ec = error_code::success(); + for (int i = 0; i < 2000; i++) { + if (::MoveFileExW(wide_from.begin(), wide_to.begin(), + MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING)) + return error_code::success(); + ec = windows_error(::GetLastError()); + if (ec != windows_error::access_denied) + break; + // Retry MoveFile() at ACCESS_DENIED. + // System scanners (eg. indexer) might open the source file when + // It is written and closed. + ::Sleep(1); + } - return error_code::success(); + return ec; } error_code resize_file(const Twine &path, uint64_t size) { @@ -487,9 +497,46 @@ handle_status_error: return error_code::success(); } + +// Modifies permissions on a file. +error_code permissions(const Twine &path, perms prms) { +#if 0 // verify code below before enabling: + // If the permissions bits are not trying to modify + // "write" permissions, there is nothing to do. 
+ if (!(prms & (owner_write|group_write|others_write))) + return error_code::success(); + + SmallString<128> path_storage; + SmallVector<wchar_t, 128> path_utf16; + + if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage), + path_utf16)) + return ec; + + DWORD attributes = ::GetFileAttributesW(path_utf16.begin()); + + if (prms & add_perms) { + attributes &= ~FILE_ATTRIBUTE_READONLY; + } + else if (prms & remove_perms) { + attributes |= FILE_ATTRIBUTE_READONLY; + } + else { + assert(0 && "neither add_perms or remove_perms is set"); + } + + if ( ! ::SetFileAttributesW(path_utf16.begin(), attributes)) + return windows_error(::GetLastError()); +#endif + return error_code::success(); +} + + +// FIXME: mode should be used here and default to user r/w only, +// it currently comes in as a UNIX mode. error_code unique_file(const Twine &model, int &result_fd, - SmallVectorImpl<char> &result_path, - bool makeAbsolute) { + SmallVectorImpl<char> &result_path, + bool makeAbsolute, unsigned mode) { // Use result_path as temp storage. result_path.set_size(0); StringRef m = model.toStringRef(result_path); @@ -743,6 +790,19 @@ error_code detail::directory_iterator_increment(detail::DirIterState &it) { return error_code::success(); } +error_code map_file_pages(const Twine &path, off_t file_offset, size_t size, + bool map_writable, void *&result) { + assert(0 && "NOT IMPLEMENTED"); + return windows_error::invalid_function; +} + +error_code unmap_file_pages(void *base, size_t size) { + assert(0 && "NOT IMPLEMENTED"); + return windows_error::invalid_function; +} + + + } // end namespace fs } // end namespace sys } // end namespace llvm diff --git a/lib/Support/Windows/Process.inc b/lib/Support/Windows/Process.inc index 9a388b4..e29eb6d 100644 --- a/lib/Support/Windows/Process.inc +++ b/lib/Support/Windows/Process.inc @@ -133,7 +133,7 @@ bool Process::StandardErrIsDisplayed() { } bool Process::FileDescriptorIsDisplayed(int fd) { - DWORD Mode; // Unused + DWORD Mode; // Unused return (GetConsoleMode((HANDLE)_get_osfhandle(fd), &Mode) != 0); } @@ -153,13 +153,17 @@ unsigned Process::StandardErrColumns() { return Columns; } -// It always has colors. -bool Process::StandardErrHasColors() { - return StandardErrIsDisplayed(); +// The terminal always has colors. +bool Process::FileDescriptorHasColors(int fd) { + return FileDescriptorIsDisplayed(fd); } bool Process::StandardOutHasColors() { - return StandardOutIsDisplayed(); + return FileDescriptorHasColors(1); +} + +bool Process::StandardErrHasColors() { + return FileDescriptorHasColors(2); } namespace { diff --git a/lib/Support/Windows/RWMutex.inc b/lib/Support/Windows/RWMutex.inc index 26b9bba..9593923 100644 --- a/lib/Support/Windows/RWMutex.inc +++ b/lib/Support/Windows/RWMutex.inc @@ -67,9 +67,9 @@ static bool loadSRW() { "ReleaseSRWLockShared"); ::FreeLibrary(hLib); - if (fpInitializeSRWLock != NULL) { - sHasSRW = true; - } + if (fpInitializeSRWLock != NULL) { + sHasSRW = true; + } } } return sHasSRW; diff --git a/lib/Support/Windows/ThreadLocal.inc b/lib/Support/Windows/ThreadLocal.inc index 512462d..057deb3 100644 --- a/lib/Support/Windows/ThreadLocal.inc +++ b/lib/Support/Windows/ThreadLocal.inc @@ -22,26 +22,25 @@ namespace llvm { using namespace sys; -ThreadLocalImpl::ThreadLocalImpl() { - DWORD* tls = new DWORD; +ThreadLocalImpl::ThreadLocalImpl() : data() { + typedef int SIZE_TOO_BIG[sizeof(DWORD) <= sizeof(data) ? 
1 : -1]; + DWORD* tls = reinterpret_cast<DWORD*>(&data); *tls = TlsAlloc(); assert(*tls != TLS_OUT_OF_INDEXES); - data = tls; } ThreadLocalImpl::~ThreadLocalImpl() { - DWORD* tls = static_cast<DWORD*>(data); + DWORD* tls = reinterpret_cast<DWORD*>(&data); TlsFree(*tls); - delete tls; } const void* ThreadLocalImpl::getInstance() { - DWORD* tls = static_cast<DWORD*>(data); + DWORD* tls = reinterpret_cast<DWORD*>(&data); return TlsGetValue(*tls); } void ThreadLocalImpl::setInstance(const void* d){ - DWORD* tls = static_cast<DWORD*>(data); + DWORD* tls = reinterpret_cast<DWORD*>(&data); int errorcode = TlsSetValue(*tls, const_cast<void*>(d)); assert(errorcode != 0); (void)errorcode; diff --git a/lib/Support/YAMLParser.cpp b/lib/Support/YAMLParser.cpp index 330519f..7c353c8 100644 --- a/lib/Support/YAMLParser.cpp +++ b/lib/Support/YAMLParser.cpp @@ -27,12 +27,12 @@ using namespace llvm; using namespace yaml; enum UnicodeEncodingForm { - UEF_UTF32_LE, //< UTF-32 Little Endian - UEF_UTF32_BE, //< UTF-32 Big Endian - UEF_UTF16_LE, //< UTF-16 Little Endian - UEF_UTF16_BE, //< UTF-16 Big Endian - UEF_UTF8, //< UTF-8 or ascii. - UEF_Unknown //< Not a valid Unicode encoding. + UEF_UTF32_LE, ///< UTF-32 Little Endian + UEF_UTF32_BE, ///< UTF-32 Big Endian + UEF_UTF16_LE, ///< UTF-16 Little Endian + UEF_UTF16_BE, ///< UTF-16 Big Endian + UEF_UTF8, ///< UTF-8 or ascii. + UEF_Unknown ///< Not a valid Unicode encoding. }; /// EncodingInfo - Holds the encoding type and length of the byte order mark if @@ -489,9 +489,6 @@ private: /// @brief Can the next token be the start of a simple key? bool IsSimpleKeyAllowed; - /// @brief Is the next token required to start a simple key? - bool IsSimpleKeyRequired; - /// @brief True if an error has occurred. bool Failed; @@ -658,7 +655,7 @@ std::string yaml::escape(StringRef Input) { EscapedInput += "\\r"; else if (*i == 0x1B) EscapedInput += "\\e"; - else if (*i >= 0 && *i < 0x20) { // Control characters not handled above. + else if ((unsigned char)*i < 0x20) { // Control characters not handled above. std::string HexStr = utohexstr(*i); EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. @@ -704,7 +701,6 @@ Scanner::Scanner(StringRef Input, SourceMgr &sm) , FlowLevel(0) , IsStartOfStream(true) , IsSimpleKeyAllowed(true) - , IsSimpleKeyRequired(false) , Failed(false) { InputBuffer = MemoryBuffer::getMemBuffer(Input, "YAML"); SM.AddNewSourceBuffer(InputBuffer, SMLoc()); @@ -755,6 +751,8 @@ Token Scanner::getNext() { } StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { + if (Position == End) + return Position; // Check 7 bit c-printable - b-char. if ( *Position == 0x09 || (*Position >= 0x20 && *Position <= 0x7E)) @@ -778,6 +776,8 @@ StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { } StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { + if (Position == End) + return Position; if (*Position == 0x0D) { if (Position + 1 != End && *(Position + 1) == 0x0A) return Position + 2; @@ -1211,7 +1211,9 @@ bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { ++Current; // Repeat until the previous character was not a '\' or was an escaped // backslash. 
- } while (*(Current - 1) == '\\' && wasEscaped(Start + 1, Current)); + } while ( Current != End + && *(Current - 1) == '\\' + && wasEscaped(Start + 1, Current)); } else { skip(1); while (true) { @@ -1624,9 +1626,7 @@ StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { return UnquotedValue; } // Plain or block. - size_t trimtrail = Value.rfind(' '); - return Value.drop_back( - trimtrail == StringRef::npos ? 0 : Value.size() - trimtrail); + return Value.rtrim(" "); } StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue @@ -1733,7 +1733,9 @@ StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue // TODO: Report error. break; unsigned int UnicodeScalarValue; - UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue); + if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) + // TODO: Report error. + UnicodeScalarValue = 0xFFFD; encodeUTF8(UnicodeScalarValue, Storage); UnquotedValue = UnquotedValue.substr(2); break; @@ -1743,7 +1745,9 @@ StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue // TODO: Report error. break; unsigned int UnicodeScalarValue; - UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue); + if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) + // TODO: Report error. + UnicodeScalarValue = 0xFFFD; encodeUTF8(UnicodeScalarValue, Storage); UnquotedValue = UnquotedValue.substr(4); break; @@ -1753,7 +1757,9 @@ StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue // TODO: Report error. break; unsigned int UnicodeScalarValue; - UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue); + if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) + // TODO: Report error. + UnicodeScalarValue = 0xFFFD; encodeUTF8(UnicodeScalarValue, Storage); UnquotedValue = UnquotedValue.substr(8); break; @@ -2113,5 +2119,3 @@ bool Document::expectToken(int TK) { } return true; } - -OwningPtr<Document> document_iterator::NullDoc; diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp index 86cdca1..fa69c2d 100644 --- a/lib/Support/raw_ostream.cpp +++ b/lib/Support/raw_ostream.cpp @@ -528,7 +528,8 @@ void raw_fd_ostream::write_impl(const char *Ptr, size_t Size) { } else { // Use ::writev() where available. 
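
The three escape-decoding hunks above all adopt the same pattern: StringRef::getAsInteger returns true on failure, and a malformed \x, \u, or \U escape now decays to U+FFFD, the Unicode replacement character, instead of encoding whatever happened to be in the output variable. A small sketch of the pattern (the helper name is ours):

#include "llvm/ADT/StringRef.h"

// Decode a fixed-width hex escape payload, substituting the replacement
// character when the digits do not parse; getAsInteger reports failure by
// returning true and leaves no usable value behind.
static unsigned hexOrReplacement(llvm::StringRef Digits) {
  unsigned Value = 0;
  if (Digits.getAsInteger(16, Value))
    return 0xFFFD;  // malformed escape: U+FFFD REPLACEMENT CHARACTER
  return Value;
}
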
#if defined(HAVE_WRITEV) - struct iovec IOV = { (void*) Ptr, Size }; + const void *Addr = static_cast<const void *>(Ptr); + struct iovec IOV = {const_cast<void *>(Addr), Size }; ret = ::writev(FD, &IOV, 1); #else ret = ::write(FD, Ptr, Size); @@ -650,6 +651,10 @@ bool raw_fd_ostream::is_displayed() const { return sys::Process::FileDescriptorIsDisplayed(FD); } +bool raw_fd_ostream::has_colors() const { + return sys::Process::FileDescriptorHasColors(FD); +} + //===----------------------------------------------------------------------===// // outs(), errs(), nulls() //===----------------------------------------------------------------------===// diff --git a/lib/TableGen/CMakeLists.txt b/lib/TableGen/CMakeLists.txt index 82f72b0..ba7bf14 100644 --- a/lib/TableGen/CMakeLists.txt +++ b/lib/TableGen/CMakeLists.txt @@ -6,6 +6,7 @@ add_llvm_library(LLVMTableGen Error.cpp Main.cpp Record.cpp + StringMatcher.cpp TableGenAction.cpp TableGenBackend.cpp TGLexer.cpp diff --git a/lib/TableGen/Main.cpp b/lib/TableGen/Main.cpp index 01bc55e..7aeef56 100644 --- a/lib/TableGen/Main.cpp +++ b/lib/TableGen/Main.cpp @@ -34,7 +34,9 @@ namespace { cl::init("-")); cl::opt<std::string> - DependFilename("d", cl::desc("Dependency filename"), cl::value_desc("filename"), + DependFilename("d", + cl::desc("Dependency filename"), + cl::value_desc("filename"), cl::init("")); cl::opt<std::string> @@ -53,7 +55,8 @@ int TableGenMain(char *argv0, TableGenAction &Action) { try { // Parse the input file. OwningPtr<MemoryBuffer> File; - if (error_code ec = MemoryBuffer::getFileOrSTDIN(InputFilename.c_str(), File)) { + if (error_code ec = + MemoryBuffer::getFileOrSTDIN(InputFilename.c_str(), File)) { errs() << "Could not open input file '" << InputFilename << "': " << ec.message() <<"\n"; return 1; @@ -93,7 +96,7 @@ int TableGenMain(char *argv0, TableGenAction &Action) { DepOut.os() << OutputFilename << ":"; const std::vector<std::string> &Dependencies = Parser.getDependencies(); for (std::vector<std::string>::const_iterator I = Dependencies.begin(), - E = Dependencies.end(); + E = Dependencies.end(); I != E; ++I) { DepOut.os() << " " << (*I); } diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp index 93eed24..99fdc1f 100644 --- a/lib/TableGen/Record.cpp +++ b/lib/TableGen/Record.cpp @@ -1699,7 +1699,7 @@ void Record::checkName() { assert(TypedName && "Record name is not typed!"); RecTy *Type = TypedName->getType(); if (dynamic_cast<StringRecTy *>(Type) == 0) { - throw "Record name is not a string!"; + throw TGError(getLoc(), "Record name is not a string!"); } } diff --git a/lib/TableGen/StringMatcher.cpp b/lib/TableGen/StringMatcher.cpp new file mode 100644 index 0000000..1668170 --- /dev/null +++ b/lib/TableGen/StringMatcher.cpp @@ -0,0 +1,149 @@ +//===- StringMatcher.cpp - Generate a matcher for input strings -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the StringMatcher class. +// +//===----------------------------------------------------------------------===// + +#include "llvm/TableGen/StringMatcher.h" +#include "llvm/Support/raw_ostream.h" +#include <map> +using namespace llvm; + +/// FindFirstNonCommonLetter - Find the first character in the keys of the +/// string pairs that is not shared across the whole set of strings. 
All +/// strings are assumed to have the same length. +static unsigned +FindFirstNonCommonLetter(const std::vector<const + StringMatcher::StringPair*> &Matches) { + assert(!Matches.empty()); + for (unsigned i = 0, e = Matches[0]->first.size(); i != e; ++i) { + // Check to see if letter i is the same across the set. + char Letter = Matches[0]->first[i]; + + for (unsigned str = 0, e = Matches.size(); str != e; ++str) + if (Matches[str]->first[i] != Letter) + return i; + } + + return Matches[0]->first.size(); +} + +/// EmitStringMatcherForChar - Given a set of strings that are known to be the +/// same length and whose characters leading up to CharNo are the same, emit +/// code to verify that CharNo and later are the same. +/// +/// \return - True if control can leave the emitted code fragment. +bool StringMatcher:: +EmitStringMatcherForChar(const std::vector<const StringPair*> &Matches, + unsigned CharNo, unsigned IndentCount) const { + assert(!Matches.empty() && "Must have at least one string to match!"); + std::string Indent(IndentCount*2+4, ' '); + + // If we have verified that the entire string matches, we're done: output the + // matching code. + if (CharNo == Matches[0]->first.size()) { + assert(Matches.size() == 1 && "Had duplicate keys to match on"); + + // If the to-execute code has \n's in it, indent each subsequent line. + StringRef Code = Matches[0]->second; + + std::pair<StringRef, StringRef> Split = Code.split('\n'); + OS << Indent << Split.first << "\t // \"" << Matches[0]->first << "\"\n"; + + Code = Split.second; + while (!Code.empty()) { + Split = Code.split('\n'); + OS << Indent << Split.first << "\n"; + Code = Split.second; + } + return false; + } + + // Bucket the matches by the character we are comparing. + std::map<char, std::vector<const StringPair*> > MatchesByLetter; + + for (unsigned i = 0, e = Matches.size(); i != e; ++i) + MatchesByLetter[Matches[i]->first[CharNo]].push_back(Matches[i]); + + + // If we have exactly one bucket to match, see how many characters are common + // across the whole set and match all of them at once. + if (MatchesByLetter.size() == 1) { + unsigned FirstNonCommonLetter = FindFirstNonCommonLetter(Matches); + unsigned NumChars = FirstNonCommonLetter-CharNo; + + // Emit code to break out if the prefix doesn't match. + if (NumChars == 1) { + // Do the comparison with if (Str[1] != 'f') + // FIXME: Need to escape general characters. + OS << Indent << "if (" << StrVariableName << "[" << CharNo << "] != '" + << Matches[0]->first[CharNo] << "')\n"; + OS << Indent << " break;\n"; + } else { + // Do the comparison with if memcmp(Str.data()+1, "foo", 3). + // FIXME: Need to escape general strings. + OS << Indent << "if (memcmp(" << StrVariableName << ".data()+" << CharNo + << ", \"" << Matches[0]->first.substr(CharNo, NumChars) << "\", " + << NumChars << "))\n"; + OS << Indent << " break;\n"; + } + + return EmitStringMatcherForChar(Matches, FirstNonCommonLetter, IndentCount); + } + + // Otherwise, we have multiple possible things, emit a switch on the + // character. + OS << Indent << "switch (" << StrVariableName << "[" << CharNo << "]) {\n"; + OS << Indent << "default: break;\n"; + + for (std::map<char, std::vector<const StringPair*> >::iterator LI = + MatchesByLetter.begin(), E = MatchesByLetter.end(); LI != E; ++LI) { + // TODO: escape hard stuff (like \n) if we ever care about it. 
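
It helps to see what this recursion actually emits. For two hypothetical keys, "add" mapping to "return 1;" and "and" mapping to "return 2;", with StrVariableName set to "Str", the generated fragment would look roughly like this (hand-derived from the logic above, not captured from a real run; indentation approximate):

switch (Str.size()) {
default: break;
case 3:  // 2 strings to match.
  if (Str[0] != 'a')
    break;
  switch (Str[1]) {
  default: break;
  case 'd':  // 1 string to match.
    if (Str[2] != 'd')
      break;
    return 1;  // "add"
  case 'n':  // 1 string to match.
    if (Str[2] != 'd')
      break;
    return 2;  // "and"
  }
  break;
}

The fragment is meant to be pasted into a caller's function body, which is why every failure path breaks out of the enclosing switch rather than returning: control falling off the end is the "no match" case.
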
+    OS << Indent << "case '" << LI->first << "':\t // "
+       << LI->second.size() << " string";
+    if (LI->second.size() != 1) OS << 's';
+    OS << " to match.\n";
+    if (EmitStringMatcherForChar(LI->second, CharNo+1, IndentCount+1))
+      OS << Indent << "  break;\n";
+  }
+
+  OS << Indent << "}\n";
+  return true;
+}
+
+
+/// Emit - Top level entry point.
+///
+void StringMatcher::Emit(unsigned Indent) const {
+  // If nothing to match, just fall through.
+  if (Matches.empty()) return;
+
+  // First level categorization: group strings by length.
+  std::map<unsigned, std::vector<const StringPair*> > MatchesByLength;
+
+  for (unsigned i = 0, e = Matches.size(); i != e; ++i)
+    MatchesByLength[Matches[i].first.size()].push_back(&Matches[i]);
+
+  // Output a switch statement on length and categorize the elements within each
+  // bin.
+  OS.indent(Indent*2+2) << "switch (" << StrVariableName << ".size()) {\n";
+  OS.indent(Indent*2+2) << "default: break;\n";
+
+  for (std::map<unsigned, std::vector<const StringPair*> >::iterator LI =
+       MatchesByLength.begin(), E = MatchesByLength.end(); LI != E; ++LI) {
+    OS.indent(Indent*2+2) << "case " << LI->first << ":\t // "
+                          << LI->second.size()
+                          << " string" << (LI->second.size() == 1 ? "" : "s") << " to match.\n";
+    if (EmitStringMatcherForChar(LI->second, 0, Indent))
+      OS.indent(Indent*2+4) << "break;\n";
+  }
+
+  OS.indent(Indent*2+2) << "}\n";
+}
diff --git a/lib/TableGen/TGParser.cpp b/lib/TableGen/TGParser.cpp
index 04c4fc1..9424677 100644
--- a/lib/TableGen/TGParser.cpp
+++ b/lib/TableGen/TGParser.cpp
@@ -292,107 +292,78 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC,
 /// ProcessForeachDefs - Given a record, apply all of the variable
 /// values in all surrounding foreach loops, creating new records for
 /// each combination of values.
-bool TGParser::ProcessForeachDefs(Record *CurRec, MultiClass *CurMultiClass,
-                                  SMLoc Loc) {
+bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc) {
+  if (Loops.empty())
+    return false;
+
   // We want to instantiate a new copy of CurRec for each combination
   // of nested loop iterator values.  We don't want to instantiate
   // any copies until we have values for each loop iterator.
   IterSet IterVals;
-  for (LoopVector::iterator Loop = Loops.begin(), LoopEnd = Loops.end();
-       Loop != LoopEnd;
-       ++Loop) {
-    // Process this loop.
-    if (ProcessForeachDefs(CurRec, CurMultiClass, Loc,
-                           IterVals, *Loop, Loop+1)) {
-      Error(Loc,
-            "Could not process loops for def " + CurRec->getNameInitAsString());
-      return true;
-    }
-  }
-
-  return false;
+  return ProcessForeachDefs(CurRec, Loc, IterVals);
 }
 
 /// ProcessForeachDefs - Given a record, a loop and a loop iterator,
 /// apply each of the variable values in this loop and then process
 /// subloops.
-bool TGParser::ProcessForeachDefs(Record *CurRec, MultiClass *CurMultiClass,
-                                  SMLoc Loc, IterSet &IterVals,
-                                  ForeachLoop &CurLoop,
-                                  LoopVector::iterator NextLoop) {
-  Init *IterVar = CurLoop.IterVar;
-  ListInit *List = dynamic_cast<ListInit *>(CurLoop.ListValue);
-
-  if (List == 0) {
-    Error(Loc, "Loop list is not a list");
-    return true;
-  }
-
-  // Process each value.
-  for (int64_t i = 0; i < List->getSize(); ++i) {
-    Init *ItemVal = List->resolveListElementReference(*CurRec, 0, i);
-    IterVals.push_back(IterRecord(IterVar, ItemVal));
-
-    if (IterVals.size() == Loops.size()) {
-      // Ok, we have all of the iterator values for this point in the
-      // iteration space.  Instantiate a new record to reflect this
-      // combination of values.
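
The new ProcessForeachDefs pair (the hunk continues below) replaces the old loop-plus-helper structure with a single recursion: each level binds one loop's iterator value, and the base case, reached once every iterator is bound, instantiates the record. The shape, reduced to a standalone sketch over plain vectors rather than TableGen Inits:

#include <functional>
#include <iostream>
#include <vector>

// Visit every combination of values, one element drawn from each loop,
// mirroring the IterVals.size() == Loops.size() base case in the patch.
static void forEachCombination(
    const std::vector<std::vector<int> > &Loops, std::vector<int> &IterVals,
    const std::function<void(const std::vector<int> &)> &Instantiate) {
  if (IterVals.size() == Loops.size()) {  // bottom of the recursion
    Instantiate(IterVals);
    return;
  }
  const std::vector<int> &CurLoop = Loops[IterVals.size()];
  for (unsigned i = 0, e = CurLoop.size(); i != e; ++i) {
    IterVals.push_back(CurLoop[i]);  // bind the next iterator...
    forEachCombination(Loops, IterVals, Instantiate);
    IterVals.pop_back();             // ...and unbind before the next value
  }
}

int main() {
  std::vector<std::vector<int> > Loops(2);
  Loops[0].push_back(0); Loops[0].push_back(1);
  Loops[1].push_back(4); Loops[1].push_back(5);
  std::vector<int> IterVals;
  forEachCombination(Loops, IterVals, [](const std::vector<int> &V) {
    std::cout << V[0] << "," << V[1] << "\n";  // 0,4  0,5  1,4  1,5
  });
  return 0;
}
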
- Record *IterRec = new Record(*CurRec); - - // Set the iterator values now. - for (IterSet::iterator i = IterVals.begin(), iend = IterVals.end(); - i != iend; - ++i) { - VarInit *IterVar = dynamic_cast<VarInit *>(i->IterVar); - if (IterVar == 0) { - Error(Loc, "foreach iterator is unresolved"); - return true; - } - - TypedInit *IVal = dynamic_cast<TypedInit *>(i->IterValue); - if (IVal == 0) { - Error(Loc, "foreach iterator value is untyped"); - return true; - } - - IterRec->addValue(RecordVal(IterVar->getName(), IVal->getType(), false)); +bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals){ + // Recursively build a tuple of iterator values. + if (IterVals.size() != Loops.size()) { + assert(IterVals.size() < Loops.size()); + ForeachLoop &CurLoop = Loops[IterVals.size()]; + ListInit *List = dynamic_cast<ListInit *>(CurLoop.ListValue); + if (List == 0) { + Error(Loc, "Loop list is not a list"); + return true; + } - if (SetValue(IterRec, Loc, IterVar->getName(), - std::vector<unsigned>(), IVal)) { - Error(Loc, "when instantiating this def"); - return true; - } + // Process each value. + for (int64_t i = 0; i < List->getSize(); ++i) { + Init *ItemVal = List->resolveListElementReference(*CurRec, 0, i); + IterVals.push_back(IterRecord(CurLoop.IterVar, ItemVal)); + if (ProcessForeachDefs(CurRec, Loc, IterVals)) + return true; + IterVals.pop_back(); + } + return false; + } - // Resolve it next. - IterRec->resolveReferencesTo(IterRec->getValue(IterVar->getName())); + // This is the bottom of the recursion. We have all of the iterator values + // for this point in the iteration space. Instantiate a new record to + // reflect this combination of values. + Record *IterRec = new Record(*CurRec); - // Remove it. - IterRec->removeValue(IterVar->getName()); - } + // Set the iterator values now. + for (unsigned i = 0, e = IterVals.size(); i != e; ++i) { + VarInit *IterVar = IterVals[i].IterVar; + TypedInit *IVal = dynamic_cast<TypedInit *>(IterVals[i].IterValue); + if (IVal == 0) { + Error(Loc, "foreach iterator value is untyped"); + return true; + } - if (Records.getDef(IterRec->getNameInitAsString())) { - Error(Loc, "def already exists: " + IterRec->getNameInitAsString()); - return true; - } + IterRec->addValue(RecordVal(IterVar->getName(), IVal->getType(), false)); - Records.addDef(IterRec); - IterRec->resolveReferences(); + if (SetValue(IterRec, Loc, IterVar->getName(), + std::vector<unsigned>(), IVal)) { + Error(Loc, "when instantiating this def"); + return true; } - if (NextLoop != Loops.end()) { - // Process nested loops. - if (ProcessForeachDefs(CurRec, CurMultiClass, Loc, IterVals, *NextLoop, - NextLoop+1)) { - Error(Loc, - "Could not process loops for def " + - CurRec->getNameInitAsString()); - return true; - } - } + // Resolve it next. + IterRec->resolveReferencesTo(IterRec->getValue(IterVar->getName())); + + // Remove it. + IterRec->removeValue(IterVar->getName()); + } - // We're done with this iterator. - IterVals.pop_back(); + if (Records.getDef(IterRec->getNameInitAsString())) { + Error(Loc, "def already exists: " + IterRec->getNameInitAsString()); + return true; } + + Records.addDef(IterRec); + IterRec->resolveReferences(); return false; } @@ -1726,9 +1697,11 @@ Init *TGParser::ParseDeclaration(Record *CurRec, /// the name of the declared object or a NULL Init on error. Return /// the name of the parsed initializer list through ForeachListName. 
/// -/// ForeachDeclaration ::= ID '=' Value +/// ForeachDeclaration ::= ID '=' '[' ValueList ']' +/// ForeachDeclaration ::= ID '=' '{' RangeList '}' +/// ForeachDeclaration ::= ID '=' RangePiece /// -Init *TGParser::ParseForeachDeclaration(Init *&ForeachListValue) { +VarInit *TGParser::ParseForeachDeclaration(ListInit *&ForeachListValue) { if (Lex.getCode() != tgtok::Id) { TokError("Expected identifier in foreach declaration"); return 0; @@ -1744,26 +1717,59 @@ Init *TGParser::ParseForeachDeclaration(Init *&ForeachListValue) { } Lex.Lex(); // Eat the '=' - // Expect a list initializer. - ForeachListValue = ParseValue(0, 0, ParseForeachMode); + RecTy *IterType = 0; + std::vector<unsigned> Ranges; - TypedInit *TypedList = dynamic_cast<TypedInit *>(ForeachListValue); - if (TypedList == 0) { - TokError("Value list is untyped"); - return 0; + switch (Lex.getCode()) { + default: TokError("Unknown token when expecting a range list"); return 0; + case tgtok::l_square: { // '[' ValueList ']' + Init *List = ParseSimpleValue(0, 0, ParseForeachMode); + ForeachListValue = dynamic_cast<ListInit*>(List); + if (ForeachListValue == 0) { + TokError("Expected a Value list"); + return 0; + } + RecTy *ValueType = ForeachListValue->getType(); + ListRecTy *ListType = dynamic_cast<ListRecTy *>(ValueType); + if (ListType == 0) { + TokError("Value list is not of list type"); + return 0; + } + IterType = ListType->getElementType(); + break; } - RecTy *ValueType = TypedList->getType(); - ListRecTy *ListType = dynamic_cast<ListRecTy *>(ValueType); - if (ListType == 0) { - TokError("Value list is not of list type"); - return 0; + case tgtok::IntVal: { // RangePiece. + if (ParseRangePiece(Ranges)) + return 0; + break; } - RecTy *IterType = ListType->getElementType(); - VarInit *IterVar = VarInit::get(DeclName, IterType); + case tgtok::l_brace: { // '{' RangeList '}' + Lex.Lex(); // eat the '{' + Ranges = ParseRangeList(); + if (Lex.getCode() != tgtok::r_brace) { + TokError("expected '}' at end of bit range list"); + return 0; + } + Lex.Lex(); + break; + } + } + + if (!Ranges.empty()) { + assert(!IterType && "Type already initialized?"); + IterType = IntRecTy::get(); + std::vector<Init*> Values; + for (unsigned i = 0, e = Ranges.size(); i != e; ++i) + Values.push_back(IntInit::get(Ranges[i])); + ForeachListValue = ListInit::get(Values, IterType); + } - return IterVar; + if (!IterType) + return 0; + + return VarInit::get(DeclName, IterType); } /// ParseTemplateArgList - Read a template argument list, which is a non-empty @@ -1932,7 +1938,7 @@ bool TGParser::ParseDef(MultiClass *CurMultiClass) { // Parse ObjectName and make a record for it. Record *CurRec = new Record(ParseObjectName(CurMultiClass), DefLoc, Records); - if (!CurMultiClass) { + if (!CurMultiClass && Loops.empty()) { // Top-level def definition. // Ensure redefinition doesn't happen. @@ -1942,7 +1948,7 @@ bool TGParser::ParseDef(MultiClass *CurMultiClass) { return true; } Records.addDef(CurRec); - } else { + } else if (CurMultiClass) { // Otherwise, a def inside a multiclass, add it to the multiclass. 
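
Stepping back to ParseForeachDeclaration above: with this change a foreach declaration accepts three surface forms, an explicit value list ([0, 1, 2]), a braced range list ({0-3, 7}), and a bare range piece (0-3). The latter two are normalized into the same typed ListInit the first form parses to, so everything downstream sees one representation. Sketched against the Init APIs this hunk itself uses (a reconstruction, not a function from the patch; later LLVM spells these APIs differently):

#include "llvm/TableGen/Record.h"

// Turn an expanded range such as {0, 1, 2, 3} into a ListInit of IntInits,
// exactly the shape a literal [0, 1, 2, 3] would have produced.
static llvm::ListInit *normalizeRange(const std::vector<unsigned> &Ranges) {
  std::vector<llvm::Init*> Values;
  for (unsigned i = 0, e = Ranges.size(); i != e; ++i)
    Values.push_back(llvm::IntInit::get(Ranges[i]));
  return llvm::ListInit::get(Values, llvm::IntRecTy::get());
}
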
for (unsigned i = 0, e = CurMultiClass->DefPrototypes.size(); i != e; ++i) if (CurMultiClass->DefPrototypes[i]->getNameInit() @@ -1978,7 +1984,7 @@ bool TGParser::ParseDef(MultiClass *CurMultiClass) { } } - if (ProcessForeachDefs(CurRec, CurMultiClass, DefLoc)) { + if (ProcessForeachDefs(CurRec, DefLoc)) { Error(DefLoc, "Could not process loops for def" + CurRec->getNameInitAsString()); return true; @@ -1999,8 +2005,8 @@ bool TGParser::ParseForeach(MultiClass *CurMultiClass) { // Make a temporary object to record items associated with the for // loop. - Init *ListValue = 0; - Init *IterName = ParseForeachDeclaration(ListValue); + ListInit *ListValue = 0; + VarInit *IterName = ParseForeachDeclaration(ListValue); if (IterName == 0) return TokError("expected declaration in for"); diff --git a/lib/TableGen/TGParser.h b/lib/TableGen/TGParser.h index b8e7cb1..3d2c72c 100644 --- a/lib/TableGen/TGParser.h +++ b/lib/TableGen/TGParser.h @@ -45,10 +45,11 @@ namespace llvm { /// ForeachLoop - Record the iteration state associated with a for loop. /// This is used to instantiate items in the loop body. struct ForeachLoop { - Init *IterVar; - Init *ListValue; + VarInit *IterVar; + ListInit *ListValue; - ForeachLoop(Init *IVar, Init *LValue) : IterVar(IVar), ListValue(LValue) {} + ForeachLoop(VarInit *IVar, ListInit *LValue) + : IterVar(IVar), ListValue(LValue) {} }; class TGParser { @@ -113,20 +114,17 @@ private: // Semantic analysis methods. // IterRecord: Map an iterator name to a value. struct IterRecord { - Init *IterVar; + VarInit *IterVar; Init *IterValue; - IterRecord(Init *Var, Init *Val) : IterVar(Var), IterValue(Val) {} + IterRecord(VarInit *Var, Init *Val) : IterVar(Var), IterValue(Val) {} }; // IterSet: The set of all iterator values at some point in the // iteration space. typedef std::vector<IterRecord> IterSet; - bool ProcessForeachDefs(Record *CurRec, MultiClass *CurMultiClass, - SMLoc Loc); - bool ProcessForeachDefs(Record *CurRec, MultiClass *CurMultiClass, - SMLoc Loc, IterSet &IterVals, ForeachLoop &CurLoop, - LoopVector::iterator NextLoop); + bool ProcessForeachDefs(Record *CurRec, SMLoc Loc); + bool ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals); private: // Parser methods. bool ParseObjectList(MultiClass *MC = 0); @@ -160,7 +158,7 @@ private: // Parser methods. 
bool ParseTemplateArgList(Record *CurRec); Init *ParseDeclaration(Record *CurRec, bool ParsingTemplateArgs); - Init *ParseForeachDeclaration(Init *&ForeachListValue); + VarInit *ParseForeachDeclaration(ListInit *&ForeachListValue); SubClassReference ParseSubClassReference(Record *CurRec, bool isDefm); SubMultiClassReference ParseSubMultiClassReference(MultiClass *CurMC); diff --git a/lib/TableGen/TableGenBackend.cpp b/lib/TableGen/TableGenBackend.cpp index 09bcc7a..7c8367a 100644 --- a/lib/TableGen/TableGenBackend.cpp +++ b/lib/TableGen/TableGenBackend.cpp @@ -1,4 +1,4 @@ -//===- TableGenBackend.cpp - Base class for TableGen Backends ---*- C++ -*-===// +//===- TableGenBackend.cpp - Utilities for TableGen Backends ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -11,17 +11,27 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/Twine.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/TableGenBackend.h" -#include "llvm/TableGen/Record.h" using namespace llvm; -void TableGenBackend::anchor() { } - -void TableGenBackend::EmitSourceFileHeader(StringRef Desc, - raw_ostream &OS) const { - OS << "//===- TableGen'erated file -------------------------------------*-" - " C++ -*-===//\n//\n// " << Desc << "\n//\n// Automatically generate" - "d file, do not edit!\n//\n//===------------------------------------" - "----------------------------------===//\n\n"; +static void printLine(raw_ostream &OS, const Twine &Prefix, char Fill, + StringRef Suffix) { + uint64_t Pos = OS.tell(); + OS << Prefix; + for (unsigned i = OS.tell() - Pos, e = 80 - Suffix.size(); i != e; ++i) + OS << Fill; + OS << Suffix << '\n'; } +void llvm::emitSourceFileHeader(StringRef Desc, raw_ostream &OS) { + printLine(OS, "/*===- TableGen'erated file ", '-', "*- C++ -*-===*\\"); + printLine(OS, "|*", ' ', "*|"); + printLine(OS, "|* " + Desc, ' ', "*|"); + printLine(OS, "|*", ' ', "*|"); + printLine(OS, "|* Automatically generated file, do not edit!", ' ', "*|"); + printLine(OS, "|*", ' ', "*|"); + printLine(OS, "\\*===", '-', "===*/"); + OS << '\n'; +} diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 9b0cb0c..cd3c0e0 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -141,7 +141,7 @@ def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9", FeatureAvoidPartialCPSR]>; class ProcNoItin<string Name, list<SubtargetFeature> Features> - : Processor<Name, GenericItineraries, Features>; + : Processor<Name, NoItineraries, Features>; // V4 Processors. def : ProcNoItin<"generic", []>; @@ -204,13 +204,13 @@ def : Processor<"arm1156t2f-s", ARMV6Itineraries, [HasV6T2Ops, FeatureVFP2, FeatureDSPThumb2]>; // V7a Processors. 
-def : Processor<"cortex-a8", CortexA8Itineraries, +def : ProcessorModel<"cortex-a8", CortexA8Model, [ProcA8, HasV7Ops, FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureHasRAS]>; -def : Processor<"cortex-a9", CortexA9Itineraries, +def : ProcessorModel<"cortex-a9", CortexA9Model, [ProcA9, HasV7Ops, FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureHasRAS]>; -def : Processor<"cortex-a9-mp", CortexA9Itineraries, +def : ProcessorModel<"cortex-a9-mp", CortexA9Model, [ProcA9, HasV7Ops, FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureMP, FeatureHasRAS]>; diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index 410790a..9a1ce06 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -23,8 +23,8 @@ #include "InstPrinter/ARMInstPrinter.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "MCTargetDesc/ARMMCExpr.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/Module.h" #include "llvm/Type.h" #include "llvm/Assembly/Writer.h" @@ -283,9 +283,16 @@ void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const { } } -void ARMAsmPrinter::EmitFunctionEntryLabel() { - OutStreamer.ForceCodeRegion(); +void ARMAsmPrinter::EmitFunctionBodyEnd() { + // Make sure to terminate any constant pools that were at the end + // of the function. + if (!InConstantPool) + return; + InConstantPool = false; + OutStreamer.EmitDataRegion(MCDR_DataRegionEnd); +} +void ARMAsmPrinter::EmitFunctionEntryLabel() { if (AFI->isThumbFunction()) { OutStreamer.EmitAssemblerFlag(MCAF_Code16); OutStreamer.EmitThumbFunc(CurrentFnSym); @@ -415,7 +422,9 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, if (ExtraCode[1] != 0) return true; // Unknown modifier. switch (ExtraCode[0]) { - default: return true; // Unknown modifier. + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O); case 'a': // Print as a memory address. if (MI->getOperand(OpNum).isReg()) { O << "[" @@ -434,15 +443,18 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, printOperand(MI, OpNum, O); return false; case 'y': // Print a VFP single precision register as indexed double. - // This uses the ordering of the alias table to get the first 'd' register - // that overlaps the 's' register. Also, s0 is an odd register, hence the - // odd modulus check below. if (MI->getOperand(OpNum).isReg()) { unsigned Reg = MI->getOperand(OpNum).getReg(); const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); - O << ARMInstPrinter::getRegisterName(TRI->getAliasSet(Reg)[0]) << - (((Reg % 2) == 1) ? "[0]" : "[1]"); - return false; + // Find the 'd' register that has this 's' register as a sub-register, + // and determine the lane number. + for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR) { + if (!ARM::DPRRegClass.contains(*SR)) + continue; + bool Lane0 = TRI->getSubReg(*SR, ARM::ssub_0) == Reg; + O << ARMInstPrinter::getRegisterName(*SR) << (Lane0 ? "[0]" : "[1]"); + return false; + } } return true; case 'B': // Bitwise inverse of integer or symbol without a preceding #. @@ -934,13 +946,13 @@ void ARMAsmPrinter::EmitJumpTable(const MachineInstr *MI) { const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id unsigned JTI = MO1.getIndex(); - // Tag the jump table appropriately for precise disassembly. - OutStreamer.EmitJumpTable32Region(); - // Emit a label for the jump table. 
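
The 'y' modifier rewrite above no longer leans on alias-table ordering; it asks the register info directly which D register contains the S register and which lane it occupies. Distilled into a helper (same ARMAsmPrinter.cpp context assumed; the function name is ours):

// Find the D register containing SReg and report which lane SReg occupies:
// lane 0 if SReg is the ssub_0 sub-register, lane 1 otherwise.
static bool findDRegAndLane(unsigned SReg, const TargetRegisterInfo *TRI,
                            unsigned &DReg, unsigned &Lane) {
  for (MCSuperRegIterator SR(SReg, TRI); SR.isValid(); ++SR) {
    if (!ARM::DPRRegClass.contains(*SR))
      continue;                       // skip QPR and other super-registers
    DReg = *SR;
    Lane = (TRI->getSubReg(*SR, ARM::ssub_0) == SReg) ? 0 : 1;
    return true;
  }
  return false;                       // no containing D register found
}
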
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm()); OutStreamer.EmitLabel(JTISymbol); + // Mark the jump table as data-in-code. + OutStreamer.EmitDataRegion(MCDR_DataRegionJT32); + // Emit each entry of the table. const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); @@ -969,6 +981,8 @@ void ARMAsmPrinter::EmitJumpTable(const MachineInstr *MI) { OutContext); OutStreamer.EmitValue(Expr, 4); } + // Mark the end of jump table data-in-code region. + OutStreamer.EmitDataRegion(MCDR_DataRegionEnd); } void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) { @@ -978,15 +992,6 @@ void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) { const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id unsigned JTI = MO1.getIndex(); - // Emit a label for the jump table. - if (MI->getOpcode() == ARM::t2TBB_JT) { - OutStreamer.EmitJumpTable8Region(); - } else if (MI->getOpcode() == ARM::t2TBH_JT) { - OutStreamer.EmitJumpTable16Region(); - } else { - OutStreamer.EmitJumpTable32Region(); - } - MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm()); OutStreamer.EmitLabel(JTISymbol); @@ -995,10 +1000,15 @@ void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) { const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; unsigned OffsetWidth = 4; - if (MI->getOpcode() == ARM::t2TBB_JT) + if (MI->getOpcode() == ARM::t2TBB_JT) { OffsetWidth = 1; - else if (MI->getOpcode() == ARM::t2TBH_JT) + // Mark the jump table as data-in-code. + OutStreamer.EmitDataRegion(MCDR_DataRegionJT8); + } else if (MI->getOpcode() == ARM::t2TBH_JT) { OffsetWidth = 2; + // Mark the jump table as data-in-code. + OutStreamer.EmitDataRegion(MCDR_DataRegionJT16); + } for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) { MachineBasicBlock *MBB = JTBBs[i]; @@ -1031,6 +1041,11 @@ void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) { OutContext); OutStreamer.EmitValue(Expr, OffsetWidth); } + // Mark the end of jump table data-in-code region. 32-bit offsets use + // actual branch instructions here, so we don't mark those as a data-region + // at all. + if (OffsetWidth != 4) + OutStreamer.EmitDataRegion(MCDR_DataRegionEnd); } void ARMAsmPrinter::PrintDebugValueComment(const MachineInstr *MI, @@ -1208,8 +1223,11 @@ extern cl::opt<bool> EnableARMEHABI; #include "ARMGenMCPseudoLowering.inc" void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { - if (MI->getOpcode() != ARM::CONSTPOOL_ENTRY) - OutStreamer.EmitCodeRegion(); + // If we just ended a constant pool, mark it as such. + if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) { + OutStreamer.EmitDataRegion(MCDR_DataRegionEnd); + InConstantPool = false; + } // Emit unwinding stuff for frame-related instructions if (EnableARMEHABI && MI->getFlag(MachineInstr::FrameSetup)) @@ -1565,9 +1583,12 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { unsigned LabelId = (unsigned)MI->getOperand(0).getImm(); unsigned CPIdx = (unsigned)MI->getOperand(1).getIndex(); - // Mark the constant pool entry as data if we're not already in a data - // region. - OutStreamer.EmitDataRegion(); + // If this is the first entry of the pool, mark it. 
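
The InConstantPool flag (declared in the header hunk below) turns the old per-entry EmitDataRegion call into an open/close protocol: one MCDR_DataRegion marker at the first CONSTPOOL_ENTRY of a run, one MCDR_DataRegionEnd when the run ends, whether that is at the next real instruction or at the end of the function body. The protocol in isolation, sketched:

// Per-instruction bookkeeping: open a data region when entering a run of
// constant-pool entries, close it when leaving one.
static void noteInstruction(bool IsConstPoolEntry, MCStreamer &Streamer,
                            bool &InConstantPool) {
  if (IsConstPoolEntry && !InConstantPool) {
    Streamer.EmitDataRegion(MCDR_DataRegion);     // first entry of a run
    InConstantPool = true;
  } else if (!IsConstPoolEntry && InConstantPool) {
    Streamer.EmitDataRegion(MCDR_DataRegionEnd);  // the run just ended
    InConstantPool = false;
  }
}

EmitFunctionBodyEnd, added earlier in this file's diff, covers the remaining case: a pool that runs to the end of the function with no trailing instruction to close it.
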
+ if (!InConstantPool) { + OutStreamer.EmitDataRegion(MCDR_DataRegion); + InConstantPool = true; + } + OutStreamer.EmitLabel(GetCPISymbol(LabelId)); const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx]; diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h index af3f75a..3555e8f 100644 --- a/lib/Target/ARM/ARMAsmPrinter.h +++ b/lib/Target/ARM/ARMAsmPrinter.h @@ -44,9 +44,12 @@ class LLVM_LIBRARY_VISIBILITY ARMAsmPrinter : public AsmPrinter { /// MachineFunction. const MachineConstantPool *MCP; + /// InConstantPool - Maintain state when emitting a sequence of constant + /// pool entries so we can properly mark them as data regions. + bool InConstantPool; public: explicit ARMAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer), AFI(NULL), MCP(NULL) { + : AsmPrinter(TM, Streamer), AFI(NULL), MCP(NULL), InConstantPool(false) { Subtarget = &TM.getSubtarget<ARMSubtarget>(); } @@ -70,6 +73,7 @@ public: bool runOnMachineFunction(MachineFunction &F); virtual void EmitConstantPool() {} // we emit constant pools customly! + virtual void EmitFunctionBodyEnd(); virtual void EmitFunctionEntryLabel(); void EmitStartOfAsmFile(Module &M); void EmitEndOfAsmFile(Module &M); diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index c6280f8..714238a 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -51,9 +51,9 @@ WidenVMOVS("widen-vmovs", cl::Hidden, cl::init(true), /// ARM_MLxEntry - Record information about MLA / MLS instructions. struct ARM_MLxEntry { - unsigned MLxOpc; // MLA / MLS opcode - unsigned MulOpc; // Expanded multiplication opcode - unsigned AddSubOpc; // Expanded add / sub opcode + uint16_t MLxOpc; // MLA / MLS opcode + uint16_t MulOpc; // Expanded multiplication opcode + uint16_t AddSubOpc; // Expanded add / sub opcode bool NegAcc; // True if the acc is negated before the add / sub. bool HasLane; // True if instruction has an extra "lane" operand. }; @@ -1531,11 +1531,11 @@ ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { /// This will go away once we can teach tblgen how to set the optional CPSR def /// operand itself. struct AddSubFlagsOpcodePair { - unsigned PseudoOpc; - unsigned MachineOpc; + uint16_t PseudoOpc; + uint16_t MachineOpc; }; -static AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = { +static const AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = { {ARM::ADDSri, ARM::ADDri}, {ARM::ADDSrr, ARM::ADDrr}, {ARM::ADDSrsi, ARM::ADDrsi}, @@ -1563,14 +1563,9 @@ static AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = { }; unsigned llvm::convertAddSubFlagsOpcode(unsigned OldOpc) { - static const int NPairs = - sizeof(AddSubFlagsOpcodeMap) / sizeof(AddSubFlagsOpcodePair); - for (AddSubFlagsOpcodePair *OpcPair = &AddSubFlagsOpcodeMap[0], - *End = &AddSubFlagsOpcodeMap[NPairs]; OpcPair != End; ++OpcPair) { - if (OldOpc == OpcPair->PseudoOpc) { - return OpcPair->MachineOpc; - } - } + for (unsigned i = 0, e = array_lengthof(AddSubFlagsOpcodeMap); i != e; ++i) + if (OldOpc == AddSubFlagsOpcodeMap[i].PseudoOpc) + return AddSubFlagsOpcodeMap[i].MachineOpc; return 0; } @@ -1742,20 +1737,33 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, return Offset == 0; } +/// analyzeCompare - For a comparison instruction, return the source registers +/// in SrcReg and SrcReg2 if having two register operands, and the value it +/// compares against in CmpValue. Return true if the comparison instruction +/// can be analyzed. 
bool ARMBaseInstrInfo:: -AnalyzeCompare(const MachineInstr *MI, unsigned &SrcReg, int &CmpMask, - int &CmpValue) const { +analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2, + int &CmpMask, int &CmpValue) const { switch (MI->getOpcode()) { default: break; case ARM::CMPri: case ARM::t2CMPri: SrcReg = MI->getOperand(0).getReg(); + SrcReg2 = 0; CmpMask = ~0; CmpValue = MI->getOperand(1).getImm(); return true; + case ARM::CMPrr: + case ARM::t2CMPrr: + SrcReg = MI->getOperand(0).getReg(); + SrcReg2 = MI->getOperand(1).getReg(); + CmpMask = ~0; + CmpValue = 0; + return true; case ARM::TSTri: case ARM::t2TSTri: SrcReg = MI->getOperand(0).getReg(); + SrcReg2 = 0; CmpMask = MI->getOperand(1).getImm(); CmpValue = 0; return true; @@ -1793,20 +1801,67 @@ static bool isSuitableForMask(MachineInstr *&MI, unsigned SrcReg, return false; } -/// OptimizeCompareInstr - Convert the instruction supplying the argument to the -/// comparison into one that sets the zero bit in the flags register. -bool ARMBaseInstrInfo:: -OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask, - int CmpValue, const MachineRegisterInfo *MRI) const { - if (CmpValue != 0) - return false; +/// getSwappedCondition - assume the flags are set by MI(a,b), return +/// the condition code if we modify the instructions such that flags are +/// set by MI(b,a). +inline static ARMCC::CondCodes getSwappedCondition(ARMCC::CondCodes CC) { + switch (CC) { + default: return ARMCC::AL; + case ARMCC::EQ: return ARMCC::EQ; + case ARMCC::NE: return ARMCC::NE; + case ARMCC::HS: return ARMCC::LS; + case ARMCC::LO: return ARMCC::HI; + case ARMCC::HI: return ARMCC::LO; + case ARMCC::LS: return ARMCC::HS; + case ARMCC::GE: return ARMCC::LE; + case ARMCC::LT: return ARMCC::GT; + case ARMCC::GT: return ARMCC::LT; + case ARMCC::LE: return ARMCC::GE; + } +} + +/// isRedundantFlagInstr - check whether the first instruction, whose only +/// purpose is to update flags, can be made redundant. +/// CMPrr can be made redundant by SUBrr if the operands are the same. +/// CMPri can be made redundant by SUBri if the operands are the same. +/// This function can be extended later on. +inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg, + unsigned SrcReg2, int ImmValue, + MachineInstr *OI) { + if ((CmpI->getOpcode() == ARM::CMPrr || + CmpI->getOpcode() == ARM::t2CMPrr) && + (OI->getOpcode() == ARM::SUBrr || + OI->getOpcode() == ARM::t2SUBrr) && + ((OI->getOperand(1).getReg() == SrcReg && + OI->getOperand(2).getReg() == SrcReg2) || + (OI->getOperand(1).getReg() == SrcReg2 && + OI->getOperand(2).getReg() == SrcReg))) + return true; - MachineRegisterInfo::def_iterator DI = MRI->def_begin(SrcReg); - if (llvm::next(DI) != MRI->def_end()) - // Only support one definition. - return false; + if ((CmpI->getOpcode() == ARM::CMPri || + CmpI->getOpcode() == ARM::t2CMPri) && + (OI->getOpcode() == ARM::SUBri || + OI->getOpcode() == ARM::t2SUBri) && + OI->getOperand(1).getReg() == SrcReg && + OI->getOperand(2).getImm() == ImmValue) + return true; + return false; +} - MachineInstr *MI = &*DI; +/// optimizeCompareInstr - Convert the instruction supplying the argument to the +/// comparison into one that sets the zero bit in the flags register; +/// Remove a redundant Compare instruction if an earlier instruction can set the +/// flags in the same way as Compare. +/// E.g. SUBrr(r1,r2) and CMPrr(r1,r2). 
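
Concretely, with illustrative registers that are not taken from the patch, the straight-operand case reads:

    sub  r0, r1, r2      @ r0 = r1 - r2
    cmp  r1, r2          @ recomputes r1 - r2 just for the flags
    beq  .Ltaken

which becomes:

    subs r0, r1, r2      @ the sub now sets CPSR itself
    beq  .Ltaken         @ the cmp is deleted

In the swapped case the comparison's users must also have their condition codes re-oriented through getSwappedCondition above: for example, a GE that was based on CMPrr(r2,r1) becomes LE once the flags come from SUBrr(r1,r2).
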
We also handle the case where two +/// operands are swapped: SUBrr(r1,r2) and CMPrr(r2,r1), by updating the +/// condition code of instructions which use the flags. +bool ARMBaseInstrInfo:: +optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, + int CmpMask, int CmpValue, + const MachineRegisterInfo *MRI) const { + // Get the unique definition of SrcReg. + MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); + if (!MI) return false; // Masked compares sometimes use the same register as the corresponding 'and'. if (CmpMask != ~0) { @@ -1825,32 +1880,49 @@ OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask, } } - // Conservatively refuse to convert an instruction which isn't in the same BB - // as the comparison. - if (MI->getParent() != CmpInstr->getParent()) - return false; - - // Check that CPSR isn't set between the comparison instruction and the one we - // want to change. - MachineBasicBlock::iterator I = CmpInstr,E = MI, B = MI->getParent()->begin(); + // Get ready to iterate backward from CmpInstr. + MachineBasicBlock::iterator I = CmpInstr, E = MI, + B = CmpInstr->getParent()->begin(); // Early exit if CmpInstr is at the beginning of the BB. if (I == B) return false; + // There are two possible candidates which can be changed to set CPSR: + // One is MI, the other is a SUB instruction. + // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1). + // For CMPri(r1, CmpValue), we are looking for SUBri(r1, CmpValue). + MachineInstr *Sub = NULL; + if (SrcReg2 != 0) + // MI is not a candidate for CMPrr. + MI = NULL; + else if (MI->getParent() != CmpInstr->getParent() || CmpValue != 0) { + // Conservatively refuse to convert an instruction which isn't in the same + // BB as the comparison. + // For CMPri, we need to check Sub, thus we can't return here. + if (CmpInstr->getOpcode() == ARM::CMPri || + CmpInstr->getOpcode() == ARM::t2CMPri) + MI = NULL; + else + return false; + } + + // Check that CPSR isn't set between the comparison instruction and the one we + // want to change. At the same time, search for Sub. + const TargetRegisterInfo *TRI = &getRegisterInfo(); --I; for (; I != E; --I) { const MachineInstr &Instr = *I; - for (unsigned IO = 0, EO = Instr.getNumOperands(); IO != EO; ++IO) { - const MachineOperand &MO = Instr.getOperand(IO); - if (MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR)) - return false; - if (!MO.isReg()) continue; - + if (Instr.modifiesRegister(ARM::CPSR, TRI) || + Instr.readsRegister(ARM::CPSR, TRI)) // This instruction modifies or uses CPSR after the one we want to // change. We can't do this transformation. - if (MO.getReg() == ARM::CPSR) - return false; + return false; + + // Check whether CmpInstr can be made redundant by the current instruction. + if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) { + Sub = &*I; + break; } if (I == B) @@ -1858,7 +1930,13 @@ OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask, return false; } - // Set the "zero" bit in CPSR. + // Return false if no candidates exist. + if (!MI && !Sub) + return false; + + // The single candidate is called MI. + if (!MI) MI = Sub; + switch (MI->getOpcode()) { default: break; case ARM::RSBrr: @@ -1894,13 +1972,17 @@ OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask, case ARM::EORri: case ARM::t2EORrr: case ARM::t2EORri: { - // Scan forward for the use of CPSR, if it's a conditional code requires - // checking of V bit, then this is not safe to do. 
If we can't find the - // CPSR use (i.e. used in another block), then it's not safe to perform - // the optimization. + // Scan forward for the use of CPSR + // When checking against MI: if it's a conditional code requires + // checking of V bit, then this is not safe to do. + // It is safe to remove CmpInstr if CPSR is redefined or killed. + // If we are done with the basic block, we need to check whether CPSR is + // live-out. + SmallVector<std::pair<MachineOperand*, ARMCC::CondCodes>, 4> + OperandsToUpdate; bool isSafe = false; I = CmpInstr; - E = MI->getParent()->end(); + E = CmpInstr->getParent()->end(); while (!isSafe && ++I != E) { const MachineInstr &Instr = *I; for (unsigned IO = 0, EO = Instr.getNumOperands(); @@ -1918,28 +2000,56 @@ OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask, } // Condition code is after the operand before CPSR. ARMCC::CondCodes CC = (ARMCC::CondCodes)Instr.getOperand(IO-1).getImm(); - switch (CC) { - default: - isSafe = true; - break; - case ARMCC::VS: - case ARMCC::VC: - case ARMCC::GE: - case ARMCC::LT: - case ARMCC::GT: - case ARMCC::LE: - return false; + if (Sub) { + ARMCC::CondCodes NewCC = getSwappedCondition(CC); + if (NewCC == ARMCC::AL) + return false; + // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based + // on CMP needs to be updated to be based on SUB. + // Push the condition code operands to OperandsToUpdate. + // If it is safe to remove CmpInstr, the condition code of these + // operands will be modified. + if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 && + Sub->getOperand(2).getReg() == SrcReg) + OperandsToUpdate.push_back(std::make_pair(&((*I).getOperand(IO-1)), + NewCC)); } + else + switch (CC) { + default: + // CPSR can be used multiple times, we should continue. + break; + case ARMCC::VS: + case ARMCC::VC: + case ARMCC::GE: + case ARMCC::LT: + case ARMCC::GT: + case ARMCC::LE: + return false; + } } } - if (!isSafe) - return false; + // If CPSR is not killed nor re-defined, we should check whether it is + // live-out. If it is live-out, do not optimize. + if (!isSafe) { + MachineBasicBlock *MBB = CmpInstr->getParent(); + for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), + SE = MBB->succ_end(); SI != SE; ++SI) + if ((*SI)->isLiveIn(ARM::CPSR)) + return false; + } // Toggle the optional operand to CPSR. MI->getOperand(5).setReg(ARM::CPSR); MI->getOperand(5).setIsDef(true); CmpInstr->eraseFromParent(); + + // Modify the condition code of operands in OperandsToUpdate. + // Since we have SUB(r1, r2) and CMP(r2, r1), the condition code needs to + // be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc. + for (unsigned i = 0, e = OperandsToUpdate.size(); i < e; i++) + OperandsToUpdate[i].first->setImm(OperandsToUpdate[i].second); return true; } } @@ -2071,9 +2181,9 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, const MCInstrDesc &Desc = MI->getDesc(); unsigned Class = Desc.getSchedClass(); - unsigned UOps = ItinData->Itineraries[Class].NumMicroOps; - if (UOps) - return UOps; + int ItinUOps = ItinData->getNumMicroOps(Class); + if (ItinUOps >= 0) + return ItinUOps; unsigned Opc = MI->getOpcode(); switch (Opc) { @@ -2088,7 +2198,7 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, // // On Cortex-A8, each pair of register loads / stores can be scheduled on the // same cycle. The scheduling for the first load / store must be done - // separately by assuming the the address is not 64-bit aligned. 
+ // separately by assuming the address is not 64-bit aligned. // // On Cortex-A9, the formula is simply (#reg / 2) + (#reg % 2). If the address // is not 64-bit aligned, then AGU would take an extra cycle. For VFP / NEON @@ -2147,19 +2257,19 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, return 2; // 4 registers would be issued: 2, 2. // 5 registers would be issued: 2, 2, 1. - UOps = (NumRegs / 2); + int A8UOps = (NumRegs / 2); if (NumRegs % 2) - ++UOps; - return UOps; + ++A8UOps; + return A8UOps; } else if (Subtarget.isCortexA9()) { - UOps = (NumRegs / 2); + int A9UOps = (NumRegs / 2); // If there are odd number of registers or if it's not 64-bit aligned, // then it takes an extra AGU (Address Generation Unit) cycle. if ((NumRegs % 2) || !MI->hasOneMemOperand() || (*MI->memoperands_begin())->getAlignment() < 8) - ++UOps; - return UOps; + ++A9UOps; + return A9UOps; } else { // Assume the worst. return NumRegs; @@ -2478,82 +2588,14 @@ static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI, return II; } -int -ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, - const MachineInstr *DefMI, unsigned DefIdx, - const MachineInstr *UseMI, unsigned UseIdx) const { - if (DefMI->isCopyLike() || DefMI->isInsertSubreg() || - DefMI->isRegSequence() || DefMI->isImplicitDef()) - return 1; - - if (!ItinData || ItinData->isEmpty()) - return DefMI->mayLoad() ? 3 : 1; - - const MCInstrDesc *DefMCID = &DefMI->getDesc(); - const MCInstrDesc *UseMCID = &UseMI->getDesc(); - const MachineOperand &DefMO = DefMI->getOperand(DefIdx); - unsigned Reg = DefMO.getReg(); - if (Reg == ARM::CPSR) { - if (DefMI->getOpcode() == ARM::FMSTAT) { - // fpscr -> cpsr stalls over 20 cycles on A8 (and earlier?) - return Subtarget.isCortexA9() ? 1 : 20; - } - - // CPSR set and branch can be paired in the same cycle. - if (UseMI->isBranch()) - return 0; - - // Otherwise it takes the instruction latency (generally one). - int Latency = getInstrLatency(ItinData, DefMI); - - // For Thumb2 and -Os, prefer scheduling CPSR setting instruction close to - // its uses. Instructions which are otherwise scheduled between them may - // incur a code size penalty (not able to use the CPSR setting 16-bit - // instructions). - if (Latency > 0 && Subtarget.isThumb2()) { - const MachineFunction *MF = DefMI->getParent()->getParent(); - if (MF->getFunction()->hasFnAttr(Attribute::OptimizeForSize)) - --Latency; - } - return Latency; - } - - unsigned DefAlign = DefMI->hasOneMemOperand() - ? (*DefMI->memoperands_begin())->getAlignment() : 0; - unsigned UseAlign = UseMI->hasOneMemOperand() - ? 
(*UseMI->memoperands_begin())->getAlignment() : 0; - - unsigned DefAdj = 0; - if (DefMI->isBundle()) { - DefMI = getBundledDefMI(&getRegisterInfo(), DefMI, Reg, DefIdx, DefAdj); - if (DefMI->isCopyLike() || DefMI->isInsertSubreg() || - DefMI->isRegSequence() || DefMI->isImplicitDef()) - return 1; - DefMCID = &DefMI->getDesc(); - } - unsigned UseAdj = 0; - if (UseMI->isBundle()) { - unsigned NewUseIdx; - const MachineInstr *NewUseMI = getBundledUseMI(&getRegisterInfo(), UseMI, - Reg, NewUseIdx, UseAdj); - if (NewUseMI) { - UseMI = NewUseMI; - UseIdx = NewUseIdx; - UseMCID = &UseMI->getDesc(); - } - } - - int Latency = getOperandLatency(ItinData, *DefMCID, DefIdx, DefAlign, - *UseMCID, UseIdx, UseAlign); - int Adj = DefAdj + UseAdj; - if (Adj) { - Latency -= (int)(DefAdj + UseAdj); - if (Latency < 1) - return 1; - } - - if (Latency > 1 && - (Subtarget.isCortexA8() || Subtarget.isCortexA9())) { +/// Return the number of cycles to add to (or subtract from) the static +/// itinerary based on the def opcode and alignment. The caller will ensure that +/// adjusted latency is at least one cycle. +static int adjustDefLatency(const ARMSubtarget &Subtarget, + const MachineInstr *DefMI, + const MCInstrDesc *DefMCID, unsigned DefAlign) { + int Adjust = 0; + if (Subtarget.isCortexA8() || Subtarget.isCortexA9()) { // FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2] // variants are one cycle cheaper. switch (DefMCID->getOpcode()) { @@ -2564,7 +2606,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); if (ShImm == 0 || (ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)) - --Latency; + --Adjust; break; } case ARM::t2LDRs: @@ -2574,13 +2616,13 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, // Thumb2 mode: lsl only. unsigned ShAmt = DefMI->getOperand(3).getImm(); if (ShAmt == 0 || ShAmt == 2) - --Latency; + --Adjust; break; } } } - if (DefAlign < 8 && Subtarget.isCortexA9()) + if (DefAlign < 8 && Subtarget.isCortexA9()) { switch (DefMCID->getOpcode()) { default: break; case ARM::VLD1q8: @@ -2689,10 +2731,101 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, case ARM::VLD4LNq32_UPD: // If the address is not 64-bit aligned, the latencies of these // instructions increases by one. - ++Latency; + ++Adjust; break; } + } + return Adjust; +} + + + +int +ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, + unsigned UseIdx) const { + // No operand latency. The caller may fall back to getInstrLatency. 
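
That fallback is a sentinel contract: the body below returns -1 when no per-operand itinerary entry is usable, and callers test for a negative result and degrade to the coarser per-instruction number, the same shape hasHighOperandLatency uses at the end of this file's diff:

  int Latency = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx);
  if (Latency < 0)  // no usable per-operand itinerary entry
    Latency = getInstrLatency(ItinData, DefMI);
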
+ if (!ItinData || ItinData->isEmpty()) + return -1; + const MachineOperand &DefMO = DefMI->getOperand(DefIdx); + unsigned Reg = DefMO.getReg(); + const MCInstrDesc *DefMCID = &DefMI->getDesc(); + const MCInstrDesc *UseMCID = &UseMI->getDesc(); + + unsigned DefAdj = 0; + if (DefMI->isBundle()) { + DefMI = getBundledDefMI(&getRegisterInfo(), DefMI, Reg, DefIdx, DefAdj); + DefMCID = &DefMI->getDesc(); + } + if (DefMI->isCopyLike() || DefMI->isInsertSubreg() || + DefMI->isRegSequence() || DefMI->isImplicitDef()) { + return 1; + } + + unsigned UseAdj = 0; + if (UseMI->isBundle()) { + unsigned NewUseIdx; + const MachineInstr *NewUseMI = getBundledUseMI(&getRegisterInfo(), UseMI, + Reg, NewUseIdx, UseAdj); + if (!NewUseMI) + return -1; + + UseMI = NewUseMI; + UseIdx = NewUseIdx; + UseMCID = &UseMI->getDesc(); + } + + if (Reg == ARM::CPSR) { + if (DefMI->getOpcode() == ARM::FMSTAT) { + // fpscr -> cpsr stalls over 20 cycles on A8 (and earlier?) + return Subtarget.isCortexA9() ? 1 : 20; + } + + // CPSR set and branch can be paired in the same cycle. + if (UseMI->isBranch()) + return 0; + + // Otherwise it takes the instruction latency (generally one). + unsigned Latency = getInstrLatency(ItinData, DefMI); + + // For Thumb2 and -Os, prefer scheduling CPSR setting instruction close to + // its uses. Instructions which are otherwise scheduled between them may + // incur a code size penalty (not able to use the CPSR setting 16-bit + // instructions). + if (Latency > 0 && Subtarget.isThumb2()) { + const MachineFunction *MF = DefMI->getParent()->getParent(); + if (MF->getFunction()->hasFnAttr(Attribute::OptimizeForSize)) + --Latency; + } + return Latency; + } + + if (DefMO.isImplicit() || UseMI->getOperand(UseIdx).isImplicit()) + return -1; + + unsigned DefAlign = DefMI->hasOneMemOperand() + ? (*DefMI->memoperands_begin())->getAlignment() : 0; + unsigned UseAlign = UseMI->hasOneMemOperand() + ? (*UseMI->memoperands_begin())->getAlignment() : 0; + + // Get the itinerary's latency if possible, and handle variable_ops. + int Latency = getOperandLatency(ItinData, *DefMCID, DefIdx, DefAlign, + *UseMCID, UseIdx, UseAlign); + // Unable to find operand latency. The caller may resort to getInstrLatency. + if (Latency < 0) + return Latency; + + // Adjust for IT block position. + int Adj = DefAdj + UseAdj; + + // Adjust for dynamic def-side opcode variants not captured by the itinerary. + Adj += adjustDefLatency(Subtarget, DefMI, DefMCID, DefAlign); + if (Adj >= 0 || (int)Latency > -Adj) { + return Latency + Adj; + } + // Return the itinerary latency, which may be zero but not less than zero. return Latency; } @@ -2892,22 +3025,20 @@ ARMBaseInstrInfo::getOutputLatency(const InstrItineraryData *ItinData, return 1; // If the second MI is predicated, then there is an implicit use dependency. - return getOperandLatency(ItinData, DefMI, DefIdx, DepMI, - DepMI->getNumOperands()); + return getInstrLatency(ItinData, DefMI); } -int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, - const MachineInstr *MI, - unsigned *PredCost) const { +unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost) const { if (MI->isCopyLike() || MI->isInsertSubreg() || MI->isRegSequence() || MI->isImplicitDef()) return 1; - if (!ItinData || ItinData->isEmpty()) - return 1; - + // An instruction scheduler typically runs on unbundled instructions, however + // other passes may query the latency of a bundled instruction. 
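
The guard at the end of getOperandLatency above, repeated in getInstrLatency below, reads obliquely but is just saturation: a negative adjustment is applied only when the adjusted value stays strictly positive, otherwise the raw itinerary latency is returned unchanged. Restated as a helper (the worked numbers are ours):

// Equivalent formulation of the "Adj >= 0 || (int)Latency > -Adj" guard.
static unsigned adjustedLatency(unsigned Latency, int Adj) {
  if (Adj >= 0 || (int)Latency > -Adj)
    return Latency + Adj;
  return Latency;  // e.g. Latency = 1, Adj = -2: keep 1 rather than wrap
}
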
if (MI->isBundle()) { - int Latency = 0; + unsigned Latency = 0; MachineBasicBlock::const_instr_iterator I = MI; MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); while (++I != E && I->isInsideBundle()) { @@ -2918,15 +3049,33 @@ int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, } const MCInstrDesc &MCID = MI->getDesc(); - unsigned Class = MCID.getSchedClass(); - unsigned UOps = ItinData->Itineraries[Class].NumMicroOps; - if (PredCost && (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR))) + if (PredCost && (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR))) { // When predicated, CPSR is an additional source operand for CPSR updating // instructions, this apparently increases their latencies. *PredCost = 1; - if (UOps) - return ItinData->getStageLatency(Class); - return getNumMicroOps(ItinData, MI); + } + // Be sure to call getStageLatency for an empty itinerary in case it has a + // valid MinLatency property. + if (!ItinData) + return MI->mayLoad() ? 3 : 1; + + unsigned Class = MCID.getSchedClass(); + + // For instructions with variable uops, use uops as latency. + if (!ItinData->isEmpty() && ItinData->getNumMicroOps(Class) < 0) + return getNumMicroOps(ItinData, MI); + + // For the common case, fall back on the itinerary's latency. + unsigned Latency = ItinData->getStageLatency(Class); + + // Adjust for dynamic def-side opcode variants not captured by the itinerary. + unsigned DefAlign = MI->hasOneMemOperand() + ? (*MI->memoperands_begin())->getAlignment() : 0; + int Adj = adjustDefLatency(Subtarget, MI, &MCID, DefAlign); + if (Adj >= 0 || (int)Latency > -Adj) { + return Latency + Adj; + } + return Latency; } int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, @@ -2960,7 +3109,10 @@ hasHighOperandLatency(const InstrItineraryData *ItinData, return true; // Hoist VFP / NEON instructions with 4 or higher latency. - int Latency = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx); + int Latency = computeOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx, + /*FindMin=*/false); + if (Latency < 0) + Latency = getInstrLatency(ItinData, DefMI); if (Latency <= 3) return false; return DDomain == ARMII::DomainVFP || DDomain == ARMII::DomainNEON || diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 2fe8507..1a10a4a 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -186,16 +186,20 @@ public: return NumCycles == 1; } - /// AnalyzeCompare - For a comparison instruction, return the source register - /// in SrcReg and the value it compares against in CmpValue. Return true if - /// the comparison instruction can be analyzed. - virtual bool AnalyzeCompare(const MachineInstr *MI, unsigned &SrcReg, - int &CmpMask, int &CmpValue) const; - - /// OptimizeCompareInstr - Convert the instruction to set the zero flag so - /// that we can remove a "comparison with zero". - virtual bool OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, - int CmpMask, int CmpValue, + /// analyzeCompare - For a comparison instruction, return the source registers + /// in SrcReg and SrcReg2 if having two register operands, and the value it + /// compares against in CmpValue. Return true if the comparison instruction + /// can be analyzed. 
+ virtual bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, + unsigned &SrcReg2, int &CmpMask, + int &CmpValue) const; + + /// optimizeCompareInstr - Convert the instruction to set the zero flag so + /// that we can remove a "comparison with zero"; Remove a redundant CMP + /// instruction if the flags can be updated in the same way by an earlier + /// instruction such as SUB. + virtual bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, + unsigned SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const; /// FoldImmediate - 'Reg' is known to be defined by a move immediate @@ -249,8 +253,9 @@ private: const MCInstrDesc &UseMCID, unsigned UseIdx, unsigned UseAlign) const; - int getInstrLatency(const InstrItineraryData *ItinData, - const MachineInstr *MI, unsigned *PredCost = 0) const; + unsigned getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost = 0) const; int getInstrLatency(const InstrItineraryData *ItinData, SDNode *Node) const; diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 3907f75..231bd26 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -62,12 +62,14 @@ ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii, const uint16_t* ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - return (STI.isTargetIOS()) ? CSR_iOS_SaveList : CSR_AAPCS_SaveList; + return (STI.isTargetIOS() && !STI.isAAPCS_ABI()) + ? CSR_iOS_SaveList : CSR_AAPCS_SaveList; } const uint32_t* ARMBaseRegisterInfo::getCallPreservedMask(CallingConv::ID) const { - return (STI.isTargetIOS()) ? CSR_iOS_RegMask : CSR_AAPCS_RegMask; + return (STI.isTargetIOS() && !STI.isAAPCS_ABI()) + ? CSR_iOS_RegMask : CSR_AAPCS_RegMask; } BitVector ARMBaseRegisterInfo:: @@ -257,8 +259,9 @@ ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) } const TargetRegisterClass * -ARMBaseRegisterInfo::getPointerRegClass(unsigned Kind) const { - return ARM::GPRRegisterClass; +ARMBaseRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) + const { + return &ARM::GPRRegClass; } const TargetRegisterClass * @@ -369,7 +372,7 @@ ARMBaseRegisterInfo::getRawAllocationOrder(const TargetRegisterClass *RC, }; // We only support even/odd hints for GPR and rGPR. - if (RC != ARM::GPRRegisterClass && RC != ARM::rGPRRegisterClass) + if (RC != &ARM::GPRRegClass && RC != &ARM::rGPRRegClass) return RC->getRawAllocationOrder(MF); if (HintType == ARMRI::RegPairEven) { @@ -712,6 +715,11 @@ requiresRegisterScavenging(const MachineFunction &MF) const { } bool ARMBaseRegisterInfo:: +trackLivenessAfterRegAlloc(const MachineFunction &MF) const { + return true; +} + +bool ARMBaseRegisterInfo:: requiresFrameIndexScavenging(const MachineFunction &MF) const { return true; } @@ -932,7 +940,8 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB, const MCInstrDesc &MCID = TII.get(ADDriOpc); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this)); + const MachineFunction &MF = *MBB->getParent(); + MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF)); MachineInstrBuilder MIB = AddDefaultPred(BuildMI(*MBB, Ins, DL, MCID, BaseReg) .addFrameIndex(FrameIdx).addImm(Offset)); @@ -1110,7 +1119,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // Must be addrmode4/6. 
MI.getOperand(i).ChangeToRegister(FrameReg, false, false, false); else { - ScratchReg = MF.getRegInfo().createVirtualRegister(ARM::GPRRegisterClass); + ScratchReg = MF.getRegInfo().createVirtualRegister(&ARM::GPRRegClass); if (!AFI->isThumbFunction()) emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, Pred, PredReg, TII); diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h index af79351..da29f7e 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -109,7 +109,8 @@ public: SmallVectorImpl<unsigned> &SubIndices, unsigned &NewSubIdx) const; - const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const; + const TargetRegisterClass* + getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const; const TargetRegisterClass* getCrossCopyRegClass(const TargetRegisterClass *RC) const; @@ -173,6 +174,8 @@ public: virtual bool requiresRegisterScavenging(const MachineFunction &MF) const; + virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const; + virtual bool requiresFrameIndexScavenging(const MachineFunction &MF) const; virtual bool requiresVirtualBaseRegisters(const MachineFunction &MF) const; diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp index bc681be..b8627a2 100644 --- a/lib/Target/ARM/ARMCodeEmitter.cpp +++ b/lib/Target/ARM/ARMCodeEmitter.cpp @@ -1648,7 +1648,7 @@ void ARMCodeEmitter::emitMiscBranchInstruction(const MachineInstr &MI) { static unsigned encodeVFPRd(const MachineInstr &MI, unsigned OpIdx) { unsigned RegD = MI.getOperand(OpIdx).getReg(); unsigned Binary = 0; - bool isSPVFP = ARM::SPRRegisterClass->contains(RegD); + bool isSPVFP = ARM::SPRRegClass.contains(RegD); RegD = getARMRegisterNumbering(RegD); if (!isSPVFP) { Binary |= (RegD & 0x0F) << ARMII::RegRdShift; @@ -1663,7 +1663,7 @@ static unsigned encodeVFPRd(const MachineInstr &MI, unsigned OpIdx) { static unsigned encodeVFPRn(const MachineInstr &MI, unsigned OpIdx) { unsigned RegN = MI.getOperand(OpIdx).getReg(); unsigned Binary = 0; - bool isSPVFP = ARM::SPRRegisterClass->contains(RegN); + bool isSPVFP = ARM::SPRRegClass.contains(RegN); RegN = getARMRegisterNumbering(RegN); if (!isSPVFP) { Binary |= (RegN & 0x0F) << ARMII::RegRnShift; @@ -1678,7 +1678,7 @@ static unsigned encodeVFPRn(const MachineInstr &MI, unsigned OpIdx) { static unsigned encodeVFPRm(const MachineInstr &MI, unsigned OpIdx) { unsigned RegM = MI.getOperand(OpIdx).getReg(); unsigned Binary = 0; - bool isSPVFP = ARM::SPRRegisterClass->contains(RegM); + bool isSPVFP = ARM::SPRRegClass.contains(RegM); RegM = getARMRegisterNumbering(RegM); if (!isSPVFP) { Binary |= (RegM & 0x0F); diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index fc35c7c..a953985 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -69,27 +69,6 @@ static inline unsigned UnknownPadding(unsigned LogAlign, unsigned KnownBits) { return 0; } -/// WorstCaseAlign - Assuming only the low KnownBits bits in Offset are exact, -/// add padding such that: -/// -/// 1. The result is aligned to 1 << LogAlign. -/// -/// 2. No other value of the unknown bits would require more padding. -/// -/// This may add more padding than is required to satisfy just one of the -/// constraints. It is necessary to compute alignment this way to guarantee -/// that we don't underestimate the padding before an aligned block. 
If the -/// real padding before a block is larger than we think, constant pool entries -/// may go out of range. -static inline unsigned WorstCaseAlign(unsigned Offset, unsigned LogAlign, - unsigned KnownBits) { - // Add the worst possible padding that the unknown bits could cause. - Offset += UnknownPadding(LogAlign, KnownBits); - - // Then align the result. - return RoundUpToAlignment(Offset, 1u << LogAlign); -} - namespace { /// ARMConstantIslands - Due to limited PC-relative displacements, ARM /// requires constant pool entries to be scattered among the instructions @@ -109,7 +88,12 @@ namespace { /// Offset - Distance from the beginning of the function to the beginning /// of this basic block. /// - /// The offset is always aligned as required by the basic block. + /// Offsets are computed assuming worst case padding before an aligned + /// block. This means that subtracting basic block offsets always gives a + /// conservative estimate of the real distance which may be smaller. + /// + /// Because worst case padding is used, the computed offset of an aligned + /// block may not actually be aligned. unsigned Offset; /// Size - Size of the basic block in bytes. If the block contains @@ -140,7 +124,12 @@ namespace { /// This number should be used to predict worst case padding when /// splitting the block. unsigned internalKnownBits() const { - return Unalign ? Unalign : KnownBits; + unsigned Bits = Unalign ? Unalign : KnownBits; + // If the block size isn't a multiple of the known bits, assume the + // worst case padding. + if (Size & ((1u << Bits) - 1)) + Bits = CountTrailingZeros_32(Size); + return Bits; } /// Compute the offset immediately following this block. If LogAlign is @@ -152,7 +141,7 @@ namespace { if (!LA) return PO; // Add alignment padding from the terminator. - return WorstCaseAlign(PO, LA, internalKnownBits()); + return PO + UnknownPadding(LA, internalKnownBits()); } /// Compute the number of known low bits of postOffset. If this block @@ -342,9 +331,7 @@ void ARMConstantIslands::verify() { for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); MBBI != E; ++MBBI) { MachineBasicBlock *MBB = MBBI; - unsigned Align = MBB->getAlignment(); unsigned MBBId = MBB->getNumber(); - assert(BBInfo[MBBId].Offset % (1u << Align) == 0); assert(!MBBId || BBInfo[MBBId - 1].postOffset() <= BBInfo[MBBId].Offset); } DEBUG(dbgs() << "Verifying " << CPUsers.size() << " CP users.\n"); @@ -428,7 +415,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { // ARM and Thumb2 functions need to be 4-byte aligned. if (!isThumb1) - MF->EnsureAlignment(2); // 2 = log2(4) + MF->ensureAlignment(2); // 2 = log2(4) // Perform the initial placement of the constant pool entries. To start with, // we put them all at the end of the function. @@ -529,7 +516,7 @@ ARMConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) { // The function needs to be as aligned as the basic blocks. The linker may // move functions around based on their alignment. - MF->EnsureAlignment(BB->getAlignment()); + MF->ensureAlignment(BB->getAlignment()); // Order the entries in BB by descending alignment. That ensures correct // alignment of all entries as long as BB is sufficiently aligned. Keep @@ -828,7 +815,7 @@ void ARMConstantIslands::computeBlockSize(MachineBasicBlock *MBB) { // tBR_JTr contains a .align 2 directive. 
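
A recurring theme in these constant-island hunks: postOffset() no longer rounds offsets up to the alignment boundary, it only adds the worst-case padding implied by the unknown low offset bits, so block offsets become conservative upper bounds, which is why the exact-alignment assertions are dropped from verify() above and, below, from isCPEntryInRange. Only the closing return of UnknownPadding is visible in this diff; a plausible reconstruction of the full definition, offered as an assumption rather than a quotation from the tree:

    // Worst-case padding required to reach a 1 << LogAlign boundary when
    // only the low KnownBits bits of the offset are known to be zero. Once
    // at least LogAlign bits are known, no padding can ever be needed.
    static inline unsigned UnknownPadding(unsigned LogAlign, unsigned KnownBits) {
      if (KnownBits < LogAlign)
        return (1u << LogAlign) - (1u << KnownBits);
      return 0;
    }
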
if (!MBB->empty() && MBB->back().getOpcode() == ARM::tBR_JTr) { BBI.PostAlign = 2; - MBB->getParent()->EnsureAlignment(2); + MBB->getParent()->ensureAlignment(2); } } @@ -1045,7 +1032,6 @@ bool ARMConstantIslands::isCPEntryInRange(MachineInstr *MI, unsigned UserOffset, MachineInstr *CPEMI, unsigned MaxDisp, bool NegOk, bool DoDump) { unsigned CPEOffset = getOffsetOf(CPEMI); - assert(CPEOffset % 4 == 0 && "Misaligned CPE"); if (DoDump) { DEBUG({ @@ -1256,11 +1242,8 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, if (BBHasFallthrough(UserMBB)) { // Size of branch to insert. unsigned Delta = isThumb1 ? 2 : 4; - // End of UserBlock after adding a branch. - unsigned UserBlockEnd = UserBBI.postOffset() + Delta; // Compute the offset where the CPE will begin. - unsigned CPEOffset = WorstCaseAlign(UserBlockEnd, CPELogAlign, - UserBBI.postKnownBits()); + unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta; if (isOffsetInRange(UserOffset, CPEOffset, U)) { DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber() @@ -1299,20 +1282,16 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, // up the insertion point. // Try to split the block so it's fully aligned. Compute the latest split - // point where we can add a 4-byte branch instruction, and then - // WorstCaseAlign to LogAlign. + // point where we can add a 4-byte branch instruction, and then align to + // LogAlign which is the largest possible alignment in the function. unsigned LogAlign = MF->getAlignment(); assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry"); unsigned KnownBits = UserBBI.internalKnownBits(); unsigned UPad = UnknownPadding(LogAlign, KnownBits); - unsigned BaseInsertOffset = UserOffset + U.getMaxDisp(); + unsigned BaseInsertOffset = UserOffset + U.getMaxDisp() - UPad; DEBUG(dbgs() << format("Split in middle of big block before %#x", BaseInsertOffset)); - // Account for alignment and unknown padding. - BaseInsertOffset &= ~((1u << LogAlign) - 1); - BaseInsertOffset -= UPad; - // The 4 in the following is for the unconditional branch we'll be inserting // (allows for long branch on Thumb1). Alignment of the island is handled // inside isOffsetInRange. @@ -1327,11 +1306,11 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, // pool entries following this block; only the last one is in the water list. // Back past any possible branches (allow for a conditional and a maximally // long unconditional). - if (BaseInsertOffset >= BBInfo[UserMBB->getNumber()+1].Offset) - BaseInsertOffset = BBInfo[UserMBB->getNumber()+1].Offset - - (isThumb1 ? 6 : 8); - unsigned EndInsertOffset = - WorstCaseAlign(BaseInsertOffset + 4, LogAlign, KnownBits) + + if (BaseInsertOffset + 8 >= UserBBI.postOffset()) { + BaseInsertOffset = UserBBI.postOffset() - UPad - 8; + DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset)); + } + unsigned EndInsertOffset = BaseInsertOffset + 4 + UPad + CPEMI->getOperand(2).getImm(); MachineBasicBlock::iterator MI = UserMI; ++MI; @@ -1342,6 +1321,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, Offset < BaseInsertOffset; Offset += TII->GetInstSizeInBytes(MI), MI = llvm::next(MI)) { + assert(MI != UserMBB->end() && "Fell off end of block"); if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) { CPUser &U = CPUsers[CPUIndex]; if (!isOffsetInRange(Offset, EndInsertOffset, U)) { @@ -1353,9 +1333,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, // reused within the block, but it doesn't matter much. 
Also assume CPEs // are added in order with alignment padding. We may eventually be able // to pack the aligned CPEs better. - EndInsertOffset = RoundUpToAlignment(EndInsertOffset, - 1u << getCPELogAlign(U.CPEMI)) + - U.CPEMI->getOperand(2).getImm(); + EndInsertOffset += U.CPEMI->getOperand(2).getImm(); CPUIndex++; } diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 5fc0360..a242b13 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -459,22 +459,23 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) { MIB.addOperand(MI.getOperand(OpIdx++)); bool SrcIsKill = MI.getOperand(OpIdx).isKill(); + bool SrcIsUndef = MI.getOperand(OpIdx).isUndef(); unsigned SrcReg = MI.getOperand(OpIdx++).getReg(); unsigned D0, D1, D2, D3; GetDSubRegs(SrcReg, RegSpc, TRI, D0, D1, D2, D3); - MIB.addReg(D0); + MIB.addReg(D0, getUndefRegState(SrcIsUndef)); if (NumRegs > 1 && TableEntry->copyAllListRegs) - MIB.addReg(D1); + MIB.addReg(D1, getUndefRegState(SrcIsUndef)); if (NumRegs > 2 && TableEntry->copyAllListRegs) - MIB.addReg(D2); + MIB.addReg(D2, getUndefRegState(SrcIsUndef)); if (NumRegs > 3 && TableEntry->copyAllListRegs) - MIB.addReg(D3); + MIB.addReg(D3, getUndefRegState(SrcIsUndef)); // Copy the predicate operands. MIB.addOperand(MI.getOperand(OpIdx++)); MIB.addOperand(MI.getOperand(OpIdx++)); - if (SrcIsKill) // Add an implicit kill for the super-reg. + if (SrcIsKill && !SrcIsUndef) // Add an implicit kill for the super-reg. MIB->addRegisterKilled(SrcReg, TRI, true); TransferImpOps(MI, MIB, MIB); @@ -925,7 +926,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, if (isARM) { AddDefaultPred(MIB3); if (Opcode == ARM::MOV_ga_pcrel_ldr) - MIB2->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MIB3->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); } TransferImpOps(MI, MIB1, MIB3); MI.eraseFromParent(); diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 2e1eaca..b96395f 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -47,11 +47,6 @@ #include "llvm/Target/TargetOptions.h" using namespace llvm; -static cl::opt<bool> -DisableARMFastISel("disable-arm-fast-isel", - cl::desc("Turn off experimental ARM fast-isel support"), - cl::init(false), cl::Hidden); - extern cl::opt<bool> EnableARMLongCalls; namespace { @@ -182,7 +177,6 @@ class ARMFastISel : public FastISel { bool ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr, unsigned Alignment = 0, bool isZExt = true, bool allocReg = true); - bool ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr, unsigned Alignment = 0); bool ARMComputeAddress(const Value *Obj, Address &Addr); @@ -195,21 +189,25 @@ class ARMFastISel : public FastISel { unsigned ARMMaterializeGV(const GlobalValue *GV, EVT VT); unsigned ARMMoveToFPReg(EVT VT, unsigned SrcReg); unsigned ARMMoveToIntReg(EVT VT, unsigned SrcReg); - unsigned ARMSelectCallOp(const GlobalValue *GV); + unsigned ARMSelectCallOp(bool UseReg); // Call handling routines. 
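
One subtlety in the ExpandVST hunk above: when the source register-list operand is undef, the flag has to be forwarded to every D sub-register that replaces it, and the implicit super-register kill has to be suppressed, because an undef operand carries no live value to kill. The guard, isolated (using the names from the hunk):

    // Only mark the super-register killed when it actually carried a value;
    // a kill of an undef register would misinform liveness analysis.
    if (SrcIsKill && !SrcIsUndef)
      MIB->addRegisterKilled(SrcReg, TRI, true);
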
private: - CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool Return); + CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, + bool Return, + bool isVarArg); bool ProcessCallArgs(SmallVectorImpl<Value*> &Args, SmallVectorImpl<unsigned> &ArgRegs, SmallVectorImpl<MVT> &ArgVTs, SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags, SmallVectorImpl<unsigned> &RegArgs, CallingConv::ID CC, - unsigned &NumBytes); + unsigned &NumBytes, + bool isVarArg); + unsigned getLibcallReg(const Twine &Name); bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, const Instruction *I, CallingConv::ID CC, - unsigned &NumBytes); + unsigned &NumBytes, bool isVarArg); bool ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call); // OptionalDef handling routines. @@ -719,7 +717,7 @@ unsigned ARMFastISel::TargetMaterializeAlloca(const AllocaInst *AI) { if (!FuncInfo.StaticAllocaMap.count(AI)) return 0; MVT VT; - if (!isLoadTypeLegal(AI->getType(), VT)) return false; + if (!isLoadTypeLegal(AI->getType(), VT)) return 0; DenseMap<const AllocaInst*, int>::iterator SI = FuncInfo.StaticAllocaMap.find(AI); @@ -910,8 +908,9 @@ void ARMFastISel::ARMSimplifyAddress(Address &Addr, EVT VT, bool useAM3) { // put the alloca address into a register, set the base type back to // register and continue. This should almost never happen. if (needsLowering && Addr.BaseType == Address::FrameIndexBase) { - const TargetRegisterClass *RC = isThumb2 ? ARM::tGPRRegisterClass - : ARM::GPRRegisterClass; + const TargetRegisterClass *RC = isThumb2 ? + (const TargetRegisterClass*)&ARM::tGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass; unsigned ResultReg = createResultReg(RC); unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri; AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, @@ -1005,7 +1004,7 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr, useAM3 = true; } } - RC = ARM::GPRRegisterClass; + RC = &ARM::GPRRegClass; break; case MVT::i16: if (isThumb2) { @@ -1017,7 +1016,7 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr, Opc = isZExt ? ARM::LDRH : ARM::LDRSH; useAM3 = true; } - RC = ARM::GPRRegisterClass; + RC = &ARM::GPRRegClass; break; case MVT::i32: if (isThumb2) { @@ -1028,7 +1027,7 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr, } else { Opc = ARM::LDRi12; } - RC = ARM::GPRRegisterClass; + RC = &ARM::GPRRegClass; break; case MVT::f32: if (!Subtarget->hasVFP2()) return false; @@ -1037,7 +1036,7 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr, needVMOV = true; VT = MVT::i32; Opc = isThumb2 ? ARM::t2LDRi12 : ARM::LDRi12; - RC = ARM::GPRRegisterClass; + RC = &ARM::GPRRegClass; } else { Opc = ARM::VLDRS; RC = TLI.getRegClassFor(VT); @@ -1106,8 +1105,9 @@ bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr, // This is mostly going to be Neon/vector support. default: return false; case MVT::i1: { - unsigned Res = createResultReg(isThumb2 ? ARM::tGPRRegisterClass : - ARM::GPRRegisterClass); + unsigned Res = createResultReg(isThumb2 ? + (const TargetRegisterClass*)&ARM::tGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass); unsigned Opc = isThumb2 ? ARM::t2ANDri : ARM::ANDri; AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), Res) @@ -1358,7 +1358,7 @@ bool ARMFastISel::SelectIndirectBr(const Instruction *I) { unsigned Opc = isThumb2 ? 
ARM::tBRIND : ARM::BX; AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc)) .addReg(AddrReg)); - return true; + return true; } bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, @@ -1423,12 +1423,12 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, if (!UseImm) CmpOpc = ARM::t2CMPrr; else - CmpOpc = isNegativeImm ? ARM::t2CMNzri : ARM::t2CMPri; + CmpOpc = isNegativeImm ? ARM::t2CMNri : ARM::t2CMPri; } else { if (!UseImm) CmpOpc = ARM::CMPrr; else - CmpOpc = isNegativeImm ? ARM::CMNzri : ARM::CMPri; + CmpOpc = isNegativeImm ? ARM::CMNri : ARM::CMPri; } break; } @@ -1491,8 +1491,9 @@ bool ARMFastISel::SelectCmp(const Instruction *I) { // Now set a register based on the comparison. Explicitly set the predicates // here. unsigned MovCCOpc = isThumb2 ? ARM::t2MOVCCi : ARM::MOVCCi; - const TargetRegisterClass *RC = isThumb2 ? ARM::rGPRRegisterClass - : ARM::GPRRegisterClass; + const TargetRegisterClass *RC = isThumb2 ? + (const TargetRegisterClass*)&ARM::rGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass; unsigned DestReg = createResultReg(RC); Constant *Zero = ConstantInt::get(Type::getInt32Ty(*Context), 0); unsigned ZeroReg = TargetMaterializeConstant(Zero); @@ -1516,7 +1517,7 @@ bool ARMFastISel::SelectFPExt(const Instruction *I) { unsigned Op = getRegForValue(V); if (Op == 0) return false; - unsigned Result = createResultReg(ARM::DPRRegisterClass); + unsigned Result = createResultReg(&ARM::DPRRegClass); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::VCVTDS), Result) .addReg(Op)); @@ -1535,7 +1536,7 @@ bool ARMFastISel::SelectFPTrunc(const Instruction *I) { unsigned Op = getRegForValue(V); if (Op == 0) return false; - unsigned Result = createResultReg(ARM::SPRRegisterClass); + unsigned Result = createResultReg(&ARM::SPRRegClass); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::VCVTSD), Result) .addReg(Op)); @@ -1736,7 +1737,7 @@ bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) { // type and the target independent selector doesn't know how to handle it. if (DestVT != MVT::i16 && DestVT != MVT::i8 && DestVT != MVT::i1) return false; - + unsigned Opc; switch (ISDOpcode) { default: return false; @@ -1809,10 +1810,11 @@ bool ARMFastISel::SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode) { // Call Handling Code -// This is largely taken directly from CCAssignFnForNode - we don't support -// varargs in FastISel so that part has been removed. +// This is largely taken directly from CCAssignFnForNode // TODO: We may not support all of this. -CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, bool Return) { +CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, + bool Return, + bool isVarArg) { switch (CC) { default: llvm_unreachable("Unsupported calling convention"); @@ -1825,14 +1827,17 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, bool Return) { // Use target triple & subtarget features to do actual dispatch. if (Subtarget->isAAPCS_ABI()) { if (Subtarget->hasVFP2() && - TM.Options.FloatABIType == FloatABI::Hard) + TM.Options.FloatABIType == FloatABI::Hard && !isVarArg) return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); else return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS); } else return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); case CallingConv::ARM_AAPCS_VFP: - return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); + if (!isVarArg) + return (Return ? 
RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); + // Fall through to soft float variant, variadic functions don't + // use hard floating point ABI. case CallingConv::ARM_AAPCS: return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS); case CallingConv::ARM_APCS: @@ -1846,10 +1851,12 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags, SmallVectorImpl<unsigned> &RegArgs, CallingConv::ID CC, - unsigned &NumBytes) { + unsigned &NumBytes, + bool isVarArg) { SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context); - CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC, false)); + CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, ArgLocs, *Context); + CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, + CCAssignFnForCall(CC, false, isVarArg)); // Check that we can handle all of the arguments. If we can't, then bail out // now before we add code to the MBB. @@ -1981,7 +1988,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, const Instruction *I, CallingConv::ID CC, - unsigned &NumBytes) { + unsigned &NumBytes, bool isVarArg) { // Issue CALLSEQ_END unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, @@ -1991,8 +1998,8 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, // Now the return value. if (RetVT != MVT::isVoid) { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context); - CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true)); + CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, RVLocs, *Context); + CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg)); // Copy all of the result registers out of their specified physreg. if (RVLocs.size() == 2 && RetVT == MVT::f64) { @@ -2041,9 +2048,6 @@ bool ARMFastISel::SelectRet(const Instruction *I) { if (!FuncInfo.CanLowerReturn) return false; - if (F.isVarArg()) - return false; - CallingConv::ID CC = F.getCallingConv(); if (Ret->getNumOperands() > 0) { SmallVector<ISD::OutputArg, 4> Outs; @@ -2053,7 +2057,8 @@ bool ARMFastISel::SelectRet(const Instruction *I) { // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ValLocs; CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,I->getContext()); - CCInfo.AnalyzeReturn(Outs, CCAssignFnForCall(CC, true /* is Ret */)); + CCInfo.AnalyzeReturn(Outs, CCAssignFnForCall(CC, true /* is Ret */, + F.isVarArg())); const Value *RV = Ret->getOperand(0); unsigned Reg = getRegForValue(RV); @@ -2110,12 +2115,17 @@ bool ARMFastISel::SelectRet(const Instruction *I) { return true; } -unsigned ARMFastISel::ARMSelectCallOp(const GlobalValue *GV) { - if (isThumb2) { - return ARM::tBL; - } else { - return ARM::BL; - } +unsigned ARMFastISel::ARMSelectCallOp(bool UseReg) { + if (UseReg) + return isThumb2 ? ARM::tBLXr : ARM::BLX; + else + return isThumb2 ? 
ARM::tBL : ARM::BL; +} + +unsigned ARMFastISel::getLibcallReg(const Twine &Name) { + GlobalValue *GV = new GlobalVariable(Type::getInt32Ty(*Context), false, + GlobalValue::ExternalLinkage, 0, Name); + return ARMMaterializeGV(GV, TLI.getValueType(GV->getType())); } // A quick function that will emit a call for a named libcall in F with the @@ -2136,8 +2146,14 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { else if (!isTypeLegal(RetTy, RetVT)) return false; - // TODO: For now if we have long calls specified we don't handle the call. - if (EnableARMLongCalls) return false; + // Can't handle non-double multi-reg retvals. + if (RetVT != MVT::isVoid && RetVT != MVT::i32) { + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context); + CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, false)); + if (RVLocs.size() >= 2 && RetVT != MVT::f64) + return false; + } // Set up the argument vectors. SmallVector<Value*, 8> Args; @@ -2170,23 +2186,36 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { // Handle the arguments now that we've gotten them. SmallVector<unsigned, 4> RegArgs; unsigned NumBytes; - if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes)) + if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, + RegArgs, CC, NumBytes, false)) return false; + unsigned CalleeReg = 0; + if (EnableARMLongCalls) { + CalleeReg = getLibcallReg(TLI.getLibcallName(Call)); + if (CalleeReg == 0) return false; + } + // Issue the call. - MachineInstrBuilder MIB; - unsigned CallOpc = ARMSelectCallOp(NULL); - if (isThumb2) - // Explicitly adding the predicate here. - MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(CallOpc))) - .addExternalSymbol(TLI.getLibcallName(Call)); - else + unsigned CallOpc = ARMSelectCallOp(EnableARMLongCalls); + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, + DL, TII.get(CallOpc)); + if (isThumb2) { // Explicitly adding the predicate here. - MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(CallOpc)) - .addExternalSymbol(TLI.getLibcallName(Call))); + AddDefaultPred(MIB); + if (EnableARMLongCalls) + MIB.addReg(CalleeReg); + else + MIB.addExternalSymbol(TLI.getLibcallName(Call)); + } else { + if (EnableARMLongCalls) + MIB.addReg(CalleeReg); + else + MIB.addExternalSymbol(TLI.getLibcallName(Call)); + // Explicitly adding the predicate here. + AddDefaultPred(MIB); + } // Add implicit physical register uses to the call. for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) MIB.addReg(RegArgs[i]); @@ -2197,7 +2226,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { // Finish off the call including any return values. SmallVector<unsigned, 4> UsedRegs; - if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes)) return false; + if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, false)) return false; // Set all unused physreg defs as dead. static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); @@ -2213,22 +2242,15 @@ bool ARMFastISel::SelectCall(const Instruction *I, // Can't handle inline asm. if (isa<InlineAsm>(Callee)) return false; - // Only handle global variable Callees. - const GlobalValue *GV = dyn_cast<GlobalValue>(Callee); - if (!GV) - return false; - // Check the calling convention. ImmutableCallSite CS(CI); CallingConv::ID CC = CS.getCallingConv(); // TODO: Avoid some calling conventions? - // Let SDISel handle vararg functions. 
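
The calling-convention plumbing threaded through this file hinges on one rule: the VFP hard-float convention never applies to variadic calls, which is why CCAssignFnForCall above falls through from ARM_AAPCS_VFP to the base AAPCS convention when isVarArg is set. Reduced to a predicate (a sketch of the condition as it appears above, not a function in the patch):

    // Hard-float register passing applies only to non-variadic calls on a
    // VFP2-capable AAPCS target built with -float-abi=hard.
    static bool useHardFloatCC(bool HasVFP2, bool HardFloatABI, bool IsVarArg) {
      return HasVFP2 && HardFloatABI && !IsVarArg;
    }
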
PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType()); FunctionType *FTy = cast<FunctionType>(PT->getElementType()); - if (FTy->isVarArg()) - return false; + bool isVarArg = FTy->isVarArg(); // Handle *simple* calls for now. Type *RetTy = I->getType(); @@ -2239,8 +2261,15 @@ bool ARMFastISel::SelectCall(const Instruction *I, RetVT != MVT::i8 && RetVT != MVT::i1) return false; - // TODO: For now if we have long calls specified we don't handle the call. - if (EnableARMLongCalls) return false; + // Can't handle non-double multi-reg retvals. + if (RetVT != MVT::isVoid && RetVT != MVT::i1 && RetVT != MVT::i8 && + RetVT != MVT::i16 && RetVT != MVT::i32) { + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, RVLocs, *Context); + CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg)); + if (RVLocs.size() >= 2 && RetVT != MVT::f64) + return false; + } // Set up the argument vectors. SmallVector<Value*, 8> Args; @@ -2295,33 +2324,49 @@ bool ARMFastISel::SelectCall(const Instruction *I, // Handle the arguments now that we've gotten them. SmallVector<unsigned, 4> RegArgs; unsigned NumBytes; - if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes)) + if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, + RegArgs, CC, NumBytes, isVarArg)) return false; + bool UseReg = false; + const GlobalValue *GV = dyn_cast<GlobalValue>(Callee); + if (!GV || EnableARMLongCalls) UseReg = true; + + unsigned CalleeReg = 0; + if (UseReg) { + if (IntrMemName) + CalleeReg = getLibcallReg(IntrMemName); + else + CalleeReg = getRegForValue(Callee); + + if (CalleeReg == 0) return false; + } + // Issue the call. - MachineInstrBuilder MIB; - unsigned CallOpc = ARMSelectCallOp(GV); - // Explicitly adding the predicate here. + unsigned CallOpc = ARMSelectCallOp(UseReg); + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, + DL, TII.get(CallOpc)); if(isThumb2) { // Explicitly adding the predicate here. - MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(CallOpc))); - if (!IntrMemName) + AddDefaultPred(MIB); + if (UseReg) + MIB.addReg(CalleeReg); + else if (!IntrMemName) MIB.addGlobalAddress(GV, 0, 0); - else + else MIB.addExternalSymbol(IntrMemName, 0); } else { - if (!IntrMemName) - // Explicitly adding the predicate here. - MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(CallOpc)) - .addGlobalAddress(GV, 0, 0)); + if (UseReg) + MIB.addReg(CalleeReg); + else if (!IntrMemName) + MIB.addGlobalAddress(GV, 0, 0); else - MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(CallOpc)) - .addExternalSymbol(IntrMemName, 0)); + MIB.addExternalSymbol(IntrMemName, 0); + + // Explicitly adding the predicate here. + AddDefaultPred(MIB); } - + // Add implicit physical register uses to the call. for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) MIB.addReg(RegArgs[i]); @@ -2332,7 +2377,8 @@ bool ARMFastISel::SelectCall(const Instruction *I, // Finish off the call including any return values. SmallVector<unsigned, 4> UsedRegs; - if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes)) return false; + if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, isVarArg)) + return false; // Set all unused physreg defs as dead. static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); @@ -2383,6 +2429,42 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) { // FIXME: Handle more intrinsics. 
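
Both ARMEmitLibcall and SelectCall above now refuse return values that would be split across two or more locations unless the pair is a soft-float f64, which FinishCall already handles as a register pair (the RVLocs.size() == 2 path earlier in this file). The shared guard, restated as a sketch (assuming LLVM's MVT, names hypothetical):

    // A call result is supported if it lands in a single location, or in
    // exactly the f64 register pair that FinishCall merges back together.
    static bool isSupportedCallResult(MVT RetVT, unsigned NumRetLocs) {
      return NumRetLocs < 2 || RetVT == MVT::f64;
    }
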
switch (I.getIntrinsicID()) { default: return false; + case Intrinsic::frameaddress: { + MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + unsigned LdrOpc; + const TargetRegisterClass *RC; + if (isThumb2) { + LdrOpc = ARM::t2LDRi12; + RC = (const TargetRegisterClass*)&ARM::tGPRRegClass; + } else { + LdrOpc = ARM::LDRi12; + RC = (const TargetRegisterClass*)&ARM::GPRRegClass; + } + + const ARMBaseRegisterInfo *RegInfo = + static_cast<const ARMBaseRegisterInfo*>(TM.getRegisterInfo()); + unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF)); + unsigned SrcReg = FramePtr; + + // Recursively load frame address + // ldr r0 [fp] + // ldr r0 [r0] + // ldr r0 [r0] + // ... + unsigned DestReg; + unsigned Depth = cast<ConstantInt>(I.getOperand(0))->getZExtValue(); + while (Depth--) { + DestReg = createResultReg(RC); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(LdrOpc), DestReg) + .addReg(SrcReg).addImm(0)); + SrcReg = DestReg; + } + UpdateValueMap(&I, SrcReg); + return true; + } case Intrinsic::memcpy: case Intrinsic::memmove: { const MemTransferInst &MTI = cast<MemTransferInst>(I); @@ -2406,10 +2488,10 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) { return true; } } - + if (!MTI.getLength()->getType()->isIntegerTy(32)) return false; - + if (MTI.getSourceAddressSpace() > 255 || MTI.getDestAddressSpace() > 255) return false; @@ -2421,20 +2503,24 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) { // Don't handle volatile. if (MSI.isVolatile()) return false; - + if (!MSI.getLength()->getType()->isIntegerTy(32)) return false; - + if (MSI.getDestAddressSpace() > 255) return false; - + return SelectCall(&I, "memset"); } + case Intrinsic::trap: { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::TRAP)); + return true; + } } } bool ARMFastISel::SelectTrunc(const Instruction *I) { - // The high bits for a type smaller than the register size are assumed to be + // The high bits for a type smaller than the register size are assumed to be // undefined. Value *Op = I->getOperand(0); @@ -2625,7 +2711,7 @@ bool ARMFastISel::TryToFoldLoad(MachineInstr *MI, unsigned OpNo, // See if we can handle this address. Address Addr; if (!ARMComputeAddress(LI->getOperand(0), Addr)) return false; - + unsigned ResultReg = MI->getOperand(0).getReg(); if (!ARMEmitLoad(VT, ResultReg, Addr, LI->getAlignment(), isZExt, false)) return false; @@ -2640,8 +2726,7 @@ namespace llvm { // Darwin and thumb1 only for now. const ARMSubtarget *Subtarget = &TM.getSubtarget<ARMSubtarget>(); - if (Subtarget->isTargetIOS() && !Subtarget->isThumb1Only() && - !DisableARMFastISel) + if (Subtarget->isTargetIOS() && !Subtarget->isThumb1Only()) return new ARMFastISel(funcInfo); return 0; } diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 402ecb0..2629496 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -790,7 +790,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, // The writeback is only needed when emitting two vst1.64 instructions. if (NumAlignedDPRCS2Regs >= 6) { unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, - ARM::QQPRRegisterClass); + &ARM::QQPRRegClass); MBB.addLiveIn(SupReg); AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Qwb_fixed), ARM::R4) @@ -808,7 +808,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, // 16-byte aligned vst1.64 with 4 d-regs, no writeback. 
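
Returning to the Intrinsic::frameaddress case added to ARMFastISel above: the selected code is just a chain of loads, one ldr per requested depth, each following the saved frame pointer stored at the address the current frame pointer points to. In C++ terms the emitted sequence computes roughly the following (an illustration of the idea, assuming the usual ARM frame layout where [FP] holds the caller's frame pointer):

    // Hypothetical illustration, not code from the patch.
    void *frameAddress(void *FP, unsigned Depth) {
      while (Depth--)
        FP = *static_cast<void **>(FP); // ldr rD, [rS, #0]
      return FP;
    }
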
if (NumAlignedDPRCS2Regs >= 4) { unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, - ARM::QQPRRegisterClass); + &ARM::QQPRRegClass); MBB.addLiveIn(SupReg); AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Q)) .addReg(ARM::R4).addImm(16).addReg(NextReg) @@ -820,7 +820,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, // 16-byte aligned vst1.64 with 2 d-regs. if (NumAlignedDPRCS2Regs >= 2) { unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, - ARM::QPRRegisterClass); + &ARM::QPRRegClass); MBB.addLiveIn(SupReg); AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1q64)) .addReg(ARM::R4).addImm(16).addReg(SupReg)); @@ -908,7 +908,7 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB, // 16-byte aligned vld1.64 with 4 d-regs and writeback. if (NumAlignedDPRCS2Regs >= 6) { unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, - ARM::QQPRRegisterClass); + &ARM::QQPRRegClass); AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Qwb_fixed), NextReg) .addReg(ARM::R4, RegState::Define) .addReg(ARM::R4, RegState::Kill).addImm(16) @@ -924,7 +924,7 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB, // 16-byte aligned vld1.64 with 4 d-regs, no writeback. if (NumAlignedDPRCS2Regs >= 4) { unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, - ARM::QQPRRegisterClass); + &ARM::QQPRRegClass); AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Q), NextReg) .addReg(ARM::R4).addImm(16) .addReg(SupReg, RegState::ImplicitDefine)); @@ -935,7 +935,7 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB, // 16-byte aligned vld1.64 with 2 d-regs. if (NumAlignedDPRCS2Regs >= 2) { unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, - ARM::QPRRegisterClass); + &ARM::QPRRegClass); AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1q64), SupReg) .addReg(ARM::R4).addImm(16)); NextReg += 2; @@ -1244,7 +1244,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, CanEliminateFrame = false; } - if (!ARM::GPRRegisterClass->contains(Reg)) + if (!ARM::GPRRegClass.contains(Reg)) continue; if (Spilled) { @@ -1404,7 +1404,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, } else if (!AFI->isThumb1OnlyFunction()) { // note: Thumb1 functions spill to R12, not the stack. Reserve a slot // closest to SP or frame pointer. - const TargetRegisterClass *RC = ARM::GPRRegisterClass; + const TargetRegisterClass *RC = &ARM::GPRRegClass; RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false)); diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 1eafbbc..1953192 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -210,29 +210,29 @@ private: /// loads of D registers and even subregs and odd subregs of Q registers. /// For NumVecs <= 2, QOpcodes1 is not used. SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, - unsigned *DOpcodes, - unsigned *QOpcodes0, unsigned *QOpcodes1); + const uint16_t *DOpcodes, + const uint16_t *QOpcodes0, const uint16_t *QOpcodes1); /// SelectVST - Select NEON store intrinsics. NumVecs should /// be 1, 2, 3 or 4. The opcode arrays specify the instructions used for /// stores of D registers and even subregs and odd subregs of Q registers. /// For NumVecs <= 2, QOpcodes1 is not used. 
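
The bulk of the remaining ISel changes, beginning with the SelectVST declaration just below, mechanically convert the per-call opcode arrays from unsigned to static const uint16_t: instruction opcodes fit comfortably in 16 bits, and the static const form turns a stack array rebuilt on every call into a single table emitted once into read-only data. The shape of the change, with made-up values standing in for real opcodes:

    // Before: initialized on the stack at every call, four bytes per entry.
    //   unsigned Opcodes[] = { 0x1a0, 0x1a1, 0x1a2 };
    // After: one shared read-only table, two bytes per entry.
    static const uint16_t Opcodes[] = { 0x1a0, 0x1a1, 0x1a2 };

One stray spot in the arm_neon_vst2 case further down still reads static uint16_t without const, presumably an oversight: every neighboring table is const, and the Select* parameters take const uint16_t *.
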
SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, - unsigned *DOpcodes, - unsigned *QOpcodes0, unsigned *QOpcodes1); + const uint16_t *DOpcodes, + const uint16_t *QOpcodes0, const uint16_t *QOpcodes1); /// SelectVLDSTLane - Select NEON load/store lane intrinsics. NumVecs should /// be 2, 3 or 4. The opcode arrays specify the instructions used for /// load/store of D registers and Q registers. SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating, unsigned NumVecs, - unsigned *DOpcodes, unsigned *QOpcodes); + const uint16_t *DOpcodes, const uint16_t *QOpcodes); /// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs /// should be 2, 3 or 4. The opcode array specifies the instructions used /// for loading D registers. (Q registers are not supported.) SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs, - unsigned *Opcodes); + const uint16_t *Opcodes); /// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2, /// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be @@ -583,8 +583,6 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, } - - //----- AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N, @@ -1597,8 +1595,9 @@ static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) { } SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, - unsigned *DOpcodes, unsigned *QOpcodes0, - unsigned *QOpcodes1) { + const uint16_t *DOpcodes, + const uint16_t *QOpcodes0, + const uint16_t *QOpcodes1) { assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); @@ -1729,8 +1728,9 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, } SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, - unsigned *DOpcodes, unsigned *QOpcodes0, - unsigned *QOpcodes1) { + const uint16_t *DOpcodes, + const uint16_t *QOpcodes0, + const uint16_t *QOpcodes1) { assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); @@ -1875,8 +1875,8 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating, unsigned NumVecs, - unsigned *DOpcodes, - unsigned *QOpcodes) { + const uint16_t *DOpcodes, + const uint16_t *QOpcodes) { assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); @@ -1994,7 +1994,8 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, } SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, - unsigned NumVecs, unsigned *Opcodes) { + unsigned NumVecs, + const uint16_t *Opcodes) { assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); @@ -2893,176 +2894,199 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { } case ARMISD::VLD2DUP: { - unsigned Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16, - ARM::VLD2DUPd32 }; + static const uint16_t Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16, + ARM::VLD2DUPd32 }; return SelectVLDDup(N, false, 2, Opcodes); } case ARMISD::VLD3DUP: { - unsigned Opcodes[] = { ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd16Pseudo, - ARM::VLD3DUPd32Pseudo }; + static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo, + ARM::VLD3DUPd16Pseudo, + ARM::VLD3DUPd32Pseudo }; return SelectVLDDup(N, false, 3, Opcodes); } case ARMISD::VLD4DUP: { - unsigned Opcodes[] = { ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd16Pseudo, - 
ARM::VLD4DUPd32Pseudo }; + static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo, + ARM::VLD4DUPd16Pseudo, + ARM::VLD4DUPd32Pseudo }; return SelectVLDDup(N, false, 4, Opcodes); } case ARMISD::VLD2DUP_UPD: { - unsigned Opcodes[] = { ARM::VLD2DUPd8wb_fixed, ARM::VLD2DUPd16wb_fixed, - ARM::VLD2DUPd32wb_fixed }; + static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed, + ARM::VLD2DUPd16wb_fixed, + ARM::VLD2DUPd32wb_fixed }; return SelectVLDDup(N, true, 2, Opcodes); } case ARMISD::VLD3DUP_UPD: { - unsigned Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd16Pseudo_UPD, - ARM::VLD3DUPd32Pseudo_UPD }; + static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD, + ARM::VLD3DUPd16Pseudo_UPD, + ARM::VLD3DUPd32Pseudo_UPD }; return SelectVLDDup(N, true, 3, Opcodes); } case ARMISD::VLD4DUP_UPD: { - unsigned Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd16Pseudo_UPD, - ARM::VLD4DUPd32Pseudo_UPD }; + static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD, + ARM::VLD4DUPd16Pseudo_UPD, + ARM::VLD4DUPd32Pseudo_UPD }; return SelectVLDDup(N, true, 4, Opcodes); } case ARMISD::VLD1_UPD: { - unsigned DOpcodes[] = { ARM::VLD1d8wb_fixed, ARM::VLD1d16wb_fixed, - ARM::VLD1d32wb_fixed, ARM::VLD1d64wb_fixed }; - unsigned QOpcodes[] = { ARM::VLD1q8wb_fixed, - ARM::VLD1q16wb_fixed, - ARM::VLD1q32wb_fixed, - ARM::VLD1q64wb_fixed }; + static const uint16_t DOpcodes[] = { ARM::VLD1d8wb_fixed, + ARM::VLD1d16wb_fixed, + ARM::VLD1d32wb_fixed, + ARM::VLD1d64wb_fixed }; + static const uint16_t QOpcodes[] = { ARM::VLD1q8wb_fixed, + ARM::VLD1q16wb_fixed, + ARM::VLD1q32wb_fixed, + ARM::VLD1q64wb_fixed }; return SelectVLD(N, true, 1, DOpcodes, QOpcodes, 0); } case ARMISD::VLD2_UPD: { - unsigned DOpcodes[] = { ARM::VLD2d8wb_fixed, - ARM::VLD2d16wb_fixed, - ARM::VLD2d32wb_fixed, - ARM::VLD1q64wb_fixed}; - unsigned QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed, - ARM::VLD2q16PseudoWB_fixed, - ARM::VLD2q32PseudoWB_fixed }; + static const uint16_t DOpcodes[] = { ARM::VLD2d8wb_fixed, + ARM::VLD2d16wb_fixed, + ARM::VLD2d32wb_fixed, + ARM::VLD1q64wb_fixed}; + static const uint16_t QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed, + ARM::VLD2q16PseudoWB_fixed, + ARM::VLD2q32PseudoWB_fixed }; return SelectVLD(N, true, 2, DOpcodes, QOpcodes, 0); } case ARMISD::VLD3_UPD: { - unsigned DOpcodes[] = { ARM::VLD3d8Pseudo_UPD, ARM::VLD3d16Pseudo_UPD, - ARM::VLD3d32Pseudo_UPD, ARM::VLD1q64wb_fixed}; - unsigned QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD, - ARM::VLD3q16Pseudo_UPD, - ARM::VLD3q32Pseudo_UPD }; - unsigned QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD, - ARM::VLD3q16oddPseudo_UPD, - ARM::VLD3q32oddPseudo_UPD }; + static const uint16_t DOpcodes[] = { ARM::VLD3d8Pseudo_UPD, + ARM::VLD3d16Pseudo_UPD, + ARM::VLD3d32Pseudo_UPD, + ARM::VLD1q64wb_fixed}; + static const uint16_t QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD, + ARM::VLD3q16Pseudo_UPD, + ARM::VLD3q32Pseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD, + ARM::VLD3q16oddPseudo_UPD, + ARM::VLD3q32oddPseudo_UPD }; return SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1); } case ARMISD::VLD4_UPD: { - unsigned DOpcodes[] = { ARM::VLD4d8Pseudo_UPD, ARM::VLD4d16Pseudo_UPD, - ARM::VLD4d32Pseudo_UPD, ARM::VLD1q64wb_fixed}; - unsigned QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD, - ARM::VLD4q16Pseudo_UPD, - ARM::VLD4q32Pseudo_UPD }; - unsigned QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD, - ARM::VLD4q16oddPseudo_UPD, - ARM::VLD4q32oddPseudo_UPD }; + static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo_UPD, + ARM::VLD4d16Pseudo_UPD, + ARM::VLD4d32Pseudo_UPD, + 
ARM::VLD1q64wb_fixed}; + static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD, + ARM::VLD4q16Pseudo_UPD, + ARM::VLD4q32Pseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD, + ARM::VLD4q16oddPseudo_UPD, + ARM::VLD4q32oddPseudo_UPD }; return SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); } case ARMISD::VLD2LN_UPD: { - unsigned DOpcodes[] = { ARM::VLD2LNd8Pseudo_UPD, ARM::VLD2LNd16Pseudo_UPD, - ARM::VLD2LNd32Pseudo_UPD }; - unsigned QOpcodes[] = { ARM::VLD2LNq16Pseudo_UPD, - ARM::VLD2LNq32Pseudo_UPD }; + static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo_UPD, + ARM::VLD2LNd16Pseudo_UPD, + ARM::VLD2LNd32Pseudo_UPD }; + static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo_UPD, + ARM::VLD2LNq32Pseudo_UPD }; return SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes); } case ARMISD::VLD3LN_UPD: { - unsigned DOpcodes[] = { ARM::VLD3LNd8Pseudo_UPD, ARM::VLD3LNd16Pseudo_UPD, - ARM::VLD3LNd32Pseudo_UPD }; - unsigned QOpcodes[] = { ARM::VLD3LNq16Pseudo_UPD, - ARM::VLD3LNq32Pseudo_UPD }; + static const uint16_t DOpcodes[] = { ARM::VLD3LNd8Pseudo_UPD, + ARM::VLD3LNd16Pseudo_UPD, + ARM::VLD3LNd32Pseudo_UPD }; + static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo_UPD, + ARM::VLD3LNq32Pseudo_UPD }; return SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes); } case ARMISD::VLD4LN_UPD: { - unsigned DOpcodes[] = { ARM::VLD4LNd8Pseudo_UPD, ARM::VLD4LNd16Pseudo_UPD, - ARM::VLD4LNd32Pseudo_UPD }; - unsigned QOpcodes[] = { ARM::VLD4LNq16Pseudo_UPD, - ARM::VLD4LNq32Pseudo_UPD }; + static const uint16_t DOpcodes[] = { ARM::VLD4LNd8Pseudo_UPD, + ARM::VLD4LNd16Pseudo_UPD, + ARM::VLD4LNd32Pseudo_UPD }; + static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo_UPD, + ARM::VLD4LNq32Pseudo_UPD }; return SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes); } case ARMISD::VST1_UPD: { - unsigned DOpcodes[] = { ARM::VST1d8wb_fixed, ARM::VST1d16wb_fixed, - ARM::VST1d32wb_fixed, ARM::VST1d64wb_fixed }; - unsigned QOpcodes[] = { ARM::VST1q8wb_fixed, - ARM::VST1q16wb_fixed, - ARM::VST1q32wb_fixed, - ARM::VST1q64wb_fixed }; + static const uint16_t DOpcodes[] = { ARM::VST1d8wb_fixed, + ARM::VST1d16wb_fixed, + ARM::VST1d32wb_fixed, + ARM::VST1d64wb_fixed }; + static const uint16_t QOpcodes[] = { ARM::VST1q8wb_fixed, + ARM::VST1q16wb_fixed, + ARM::VST1q32wb_fixed, + ARM::VST1q64wb_fixed }; return SelectVST(N, true, 1, DOpcodes, QOpcodes, 0); } case ARMISD::VST2_UPD: { - unsigned DOpcodes[] = { ARM::VST2d8wb_fixed, - ARM::VST2d16wb_fixed, - ARM::VST2d32wb_fixed, - ARM::VST1q64wb_fixed}; - unsigned QOpcodes[] = { ARM::VST2q8PseudoWB_fixed, - ARM::VST2q16PseudoWB_fixed, - ARM::VST2q32PseudoWB_fixed }; + static const uint16_t DOpcodes[] = { ARM::VST2d8wb_fixed, + ARM::VST2d16wb_fixed, + ARM::VST2d32wb_fixed, + ARM::VST1q64wb_fixed}; + static const uint16_t QOpcodes[] = { ARM::VST2q8PseudoWB_fixed, + ARM::VST2q16PseudoWB_fixed, + ARM::VST2q32PseudoWB_fixed }; return SelectVST(N, true, 2, DOpcodes, QOpcodes, 0); } case ARMISD::VST3_UPD: { - unsigned DOpcodes[] = { ARM::VST3d8Pseudo_UPD, ARM::VST3d16Pseudo_UPD, - ARM::VST3d32Pseudo_UPD,ARM::VST1d64TPseudoWB_fixed}; - unsigned QOpcodes0[] = { ARM::VST3q8Pseudo_UPD, - ARM::VST3q16Pseudo_UPD, - ARM::VST3q32Pseudo_UPD }; - unsigned QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD, - ARM::VST3q16oddPseudo_UPD, - ARM::VST3q32oddPseudo_UPD }; + static const uint16_t DOpcodes[] = { ARM::VST3d8Pseudo_UPD, + ARM::VST3d16Pseudo_UPD, + ARM::VST3d32Pseudo_UPD, + ARM::VST1d64TPseudoWB_fixed}; + static const uint16_t QOpcodes0[] = { 
ARM::VST3q8Pseudo_UPD, + ARM::VST3q16Pseudo_UPD, + ARM::VST3q32Pseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD, + ARM::VST3q16oddPseudo_UPD, + ARM::VST3q32oddPseudo_UPD }; return SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1); } case ARMISD::VST4_UPD: { - unsigned DOpcodes[] = { ARM::VST4d8Pseudo_UPD, ARM::VST4d16Pseudo_UPD, - ARM::VST4d32Pseudo_UPD,ARM::VST1d64QPseudoWB_fixed}; - unsigned QOpcodes0[] = { ARM::VST4q8Pseudo_UPD, - ARM::VST4q16Pseudo_UPD, - ARM::VST4q32Pseudo_UPD }; - unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD, - ARM::VST4q16oddPseudo_UPD, - ARM::VST4q32oddPseudo_UPD }; + static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo_UPD, + ARM::VST4d16Pseudo_UPD, + ARM::VST4d32Pseudo_UPD, + ARM::VST1d64QPseudoWB_fixed}; + static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD, + ARM::VST4q16Pseudo_UPD, + ARM::VST4q32Pseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD, + ARM::VST4q16oddPseudo_UPD, + ARM::VST4q32oddPseudo_UPD }; return SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); } case ARMISD::VST2LN_UPD: { - unsigned DOpcodes[] = { ARM::VST2LNd8Pseudo_UPD, ARM::VST2LNd16Pseudo_UPD, - ARM::VST2LNd32Pseudo_UPD }; - unsigned QOpcodes[] = { ARM::VST2LNq16Pseudo_UPD, - ARM::VST2LNq32Pseudo_UPD }; + static const uint16_t DOpcodes[] = { ARM::VST2LNd8Pseudo_UPD, + ARM::VST2LNd16Pseudo_UPD, + ARM::VST2LNd32Pseudo_UPD }; + static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo_UPD, + ARM::VST2LNq32Pseudo_UPD }; return SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes); } case ARMISD::VST3LN_UPD: { - unsigned DOpcodes[] = { ARM::VST3LNd8Pseudo_UPD, ARM::VST3LNd16Pseudo_UPD, - ARM::VST3LNd32Pseudo_UPD }; - unsigned QOpcodes[] = { ARM::VST3LNq16Pseudo_UPD, - ARM::VST3LNq32Pseudo_UPD }; + static const uint16_t DOpcodes[] = { ARM::VST3LNd8Pseudo_UPD, + ARM::VST3LNd16Pseudo_UPD, + ARM::VST3LNd32Pseudo_UPD }; + static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo_UPD, + ARM::VST3LNq32Pseudo_UPD }; return SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes); } case ARMISD::VST4LN_UPD: { - unsigned DOpcodes[] = { ARM::VST4LNd8Pseudo_UPD, ARM::VST4LNd16Pseudo_UPD, - ARM::VST4LNd32Pseudo_UPD }; - unsigned QOpcodes[] = { ARM::VST4LNq16Pseudo_UPD, - ARM::VST4LNq32Pseudo_UPD }; + static const uint16_t DOpcodes[] = { ARM::VST4LNd8Pseudo_UPD, + ARM::VST4LNd16Pseudo_UPD, + ARM::VST4LNd32Pseudo_UPD }; + static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo_UPD, + ARM::VST4LNq32Pseudo_UPD }; return SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes); } @@ -3179,124 +3203,144 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { } case Intrinsic::arm_neon_vld1: { - unsigned DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16, - ARM::VLD1d32, ARM::VLD1d64 }; - unsigned QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16, - ARM::VLD1q32, ARM::VLD1q64}; + static const uint16_t DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16, + ARM::VLD1d32, ARM::VLD1d64 }; + static const uint16_t QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16, + ARM::VLD1q32, ARM::VLD1q64}; return SelectVLD(N, false, 1, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vld2: { - unsigned DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16, - ARM::VLD2d32, ARM::VLD1q64 }; - unsigned QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo, - ARM::VLD2q32Pseudo }; + static const uint16_t DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16, + ARM::VLD2d32, ARM::VLD1q64 }; + static const uint16_t QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo, + ARM::VLD2q32Pseudo }; return SelectVLD(N, false, 2, 
DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vld3: { - unsigned DOpcodes[] = { ARM::VLD3d8Pseudo, ARM::VLD3d16Pseudo, - ARM::VLD3d32Pseudo, ARM::VLD1d64TPseudo }; - unsigned QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD, - ARM::VLD3q16Pseudo_UPD, - ARM::VLD3q32Pseudo_UPD }; - unsigned QOpcodes1[] = { ARM::VLD3q8oddPseudo, - ARM::VLD3q16oddPseudo, - ARM::VLD3q32oddPseudo }; + static const uint16_t DOpcodes[] = { ARM::VLD3d8Pseudo, + ARM::VLD3d16Pseudo, + ARM::VLD3d32Pseudo, + ARM::VLD1d64TPseudo }; + static const uint16_t QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD, + ARM::VLD3q16Pseudo_UPD, + ARM::VLD3q32Pseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo, + ARM::VLD3q16oddPseudo, + ARM::VLD3q32oddPseudo }; return SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); } case Intrinsic::arm_neon_vld4: { - unsigned DOpcodes[] = { ARM::VLD4d8Pseudo, ARM::VLD4d16Pseudo, - ARM::VLD4d32Pseudo, ARM::VLD1d64QPseudo }; - unsigned QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD, - ARM::VLD4q16Pseudo_UPD, - ARM::VLD4q32Pseudo_UPD }; - unsigned QOpcodes1[] = { ARM::VLD4q8oddPseudo, - ARM::VLD4q16oddPseudo, - ARM::VLD4q32oddPseudo }; + static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo, + ARM::VLD4d16Pseudo, + ARM::VLD4d32Pseudo, + ARM::VLD1d64QPseudo }; + static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD, + ARM::VLD4q16Pseudo_UPD, + ARM::VLD4q32Pseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo, + ARM::VLD4q16oddPseudo, + ARM::VLD4q32oddPseudo }; return SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); } case Intrinsic::arm_neon_vld2lane: { - unsigned DOpcodes[] = { ARM::VLD2LNd8Pseudo, ARM::VLD2LNd16Pseudo, - ARM::VLD2LNd32Pseudo }; - unsigned QOpcodes[] = { ARM::VLD2LNq16Pseudo, ARM::VLD2LNq32Pseudo }; + static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo, + ARM::VLD2LNd16Pseudo, + ARM::VLD2LNd32Pseudo }; + static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo, + ARM::VLD2LNq32Pseudo }; return SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes); } case Intrinsic::arm_neon_vld3lane: { - unsigned DOpcodes[] = { ARM::VLD3LNd8Pseudo, ARM::VLD3LNd16Pseudo, - ARM::VLD3LNd32Pseudo }; - unsigned QOpcodes[] = { ARM::VLD3LNq16Pseudo, ARM::VLD3LNq32Pseudo }; + static const uint16_t DOpcodes[] = { ARM::VLD3LNd8Pseudo, + ARM::VLD3LNd16Pseudo, + ARM::VLD3LNd32Pseudo }; + static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo, + ARM::VLD3LNq32Pseudo }; return SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes); } case Intrinsic::arm_neon_vld4lane: { - unsigned DOpcodes[] = { ARM::VLD4LNd8Pseudo, ARM::VLD4LNd16Pseudo, - ARM::VLD4LNd32Pseudo }; - unsigned QOpcodes[] = { ARM::VLD4LNq16Pseudo, ARM::VLD4LNq32Pseudo }; + static const uint16_t DOpcodes[] = { ARM::VLD4LNd8Pseudo, + ARM::VLD4LNd16Pseudo, + ARM::VLD4LNd32Pseudo }; + static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo, + ARM::VLD4LNq32Pseudo }; return SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes); } case Intrinsic::arm_neon_vst1: { - unsigned DOpcodes[] = { ARM::VST1d8, ARM::VST1d16, - ARM::VST1d32, ARM::VST1d64 }; - unsigned QOpcodes[] = { ARM::VST1q8, ARM::VST1q16, - ARM::VST1q32, ARM::VST1q64 }; + static const uint16_t DOpcodes[] = { ARM::VST1d8, ARM::VST1d16, + ARM::VST1d32, ARM::VST1d64 }; + static const uint16_t QOpcodes[] = { ARM::VST1q8, ARM::VST1q16, + ARM::VST1q32, ARM::VST1q64 }; return SelectVST(N, false, 1, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vst2: { - unsigned DOpcodes[] = { ARM::VST2d8, ARM::VST2d16, - ARM::VST2d32, ARM::VST1q64 }; - unsigned 
QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo,
-                            ARM::VST2q32Pseudo };
+    static const uint16_t DOpcodes[] = { ARM::VST2d8, ARM::VST2d16,
+                                         ARM::VST2d32, ARM::VST1q64 };
+    static const uint16_t QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo,
+                                         ARM::VST2q32Pseudo };
     return SelectVST(N, false, 2, DOpcodes, QOpcodes, 0);
   }
   case Intrinsic::arm_neon_vst3: {
-    unsigned DOpcodes[] = { ARM::VST3d8Pseudo, ARM::VST3d16Pseudo,
-                            ARM::VST3d32Pseudo, ARM::VST1d64TPseudo };
-    unsigned QOpcodes0[] = { ARM::VST3q8Pseudo_UPD,
-                             ARM::VST3q16Pseudo_UPD,
-                             ARM::VST3q32Pseudo_UPD };
-    unsigned QOpcodes1[] = { ARM::VST3q8oddPseudo,
-                             ARM::VST3q16oddPseudo,
-                             ARM::VST3q32oddPseudo };
+    static const uint16_t DOpcodes[] = { ARM::VST3d8Pseudo,
+                                         ARM::VST3d16Pseudo,
+                                         ARM::VST3d32Pseudo,
+                                         ARM::VST1d64TPseudo };
+    static const uint16_t QOpcodes0[] = { ARM::VST3q8Pseudo_UPD,
+                                          ARM::VST3q16Pseudo_UPD,
+                                          ARM::VST3q32Pseudo_UPD };
+    static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo,
+                                          ARM::VST3q16oddPseudo,
+                                          ARM::VST3q32oddPseudo };
     return SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
   }
   case Intrinsic::arm_neon_vst4: {
-    unsigned DOpcodes[] = { ARM::VST4d8Pseudo, ARM::VST4d16Pseudo,
-                            ARM::VST4d32Pseudo, ARM::VST1d64QPseudo };
-    unsigned QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
-                             ARM::VST4q16Pseudo_UPD,
-                             ARM::VST4q32Pseudo_UPD };
-    unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo,
-                             ARM::VST4q16oddPseudo,
-                             ARM::VST4q32oddPseudo };
+    static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo,
+                                         ARM::VST4d16Pseudo,
+                                         ARM::VST4d32Pseudo,
+                                         ARM::VST1d64QPseudo };
+    static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
+                                          ARM::VST4q16Pseudo_UPD,
+                                          ARM::VST4q32Pseudo_UPD };
+    static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo,
+                                          ARM::VST4q16oddPseudo,
+                                          ARM::VST4q32oddPseudo };
     return SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
   }
   case Intrinsic::arm_neon_vst2lane: {
-    unsigned DOpcodes[] = { ARM::VST2LNd8Pseudo, ARM::VST2LNd16Pseudo,
-                            ARM::VST2LNd32Pseudo };
-    unsigned QOpcodes[] = { ARM::VST2LNq16Pseudo, ARM::VST2LNq32Pseudo };
+    static const uint16_t DOpcodes[] = { ARM::VST2LNd8Pseudo,
+                                         ARM::VST2LNd16Pseudo,
+                                         ARM::VST2LNd32Pseudo };
+    static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo,
+                                         ARM::VST2LNq32Pseudo };
     return SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes);
   }
   case Intrinsic::arm_neon_vst3lane: {
-    unsigned DOpcodes[] = { ARM::VST3LNd8Pseudo, ARM::VST3LNd16Pseudo,
-                            ARM::VST3LNd32Pseudo };
-    unsigned QOpcodes[] = { ARM::VST3LNq16Pseudo, ARM::VST3LNq32Pseudo };
+    static const uint16_t DOpcodes[] = { ARM::VST3LNd8Pseudo,
+                                         ARM::VST3LNd16Pseudo,
+                                         ARM::VST3LNd32Pseudo };
+    static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo,
+                                         ARM::VST3LNq32Pseudo };
     return SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes);
   }
   case Intrinsic::arm_neon_vst4lane: {
-    unsigned DOpcodes[] = { ARM::VST4LNd8Pseudo, ARM::VST4LNd16Pseudo,
-                            ARM::VST4LNd32Pseudo };
-    unsigned QOpcodes[] = { ARM::VST4LNq16Pseudo, ARM::VST4LNq32Pseudo };
+    static const uint16_t DOpcodes[] = { ARM::VST4LNd8Pseudo,
+                                         ARM::VST4LNd16Pseudo,
+                                         ARM::VST4LNd32Pseudo };
+    static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo,
+                                         ARM::VST4LNq32Pseudo };
     return SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes);
   }
   }
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index a103c94..04370c0 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -52,6 +52,7 @@ using namespace llvm;
 STATISTIC(NumTailCalls, "Number of tail
calls"); STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); +STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); // This option should go away when tail calls fully work. static cl::opt<bool> @@ -153,12 +154,12 @@ void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT, } void ARMTargetLowering::addDRTypeForNEON(EVT VT) { - addRegisterClass(VT, ARM::DPRRegisterClass); + addRegisterClass(VT, &ARM::DPRRegClass); addTypeForNEON(VT, MVT::f64, MVT::v2i32); } void ARMTargetLowering::addQRTypeForNEON(EVT VT) { - addRegisterClass(VT, ARM::QPRRegisterClass); + addRegisterClass(VT, &ARM::QPRRegClass); addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); } @@ -431,14 +432,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) } if (Subtarget->isThumb1Only()) - addRegisterClass(MVT::i32, ARM::tGPRRegisterClass); + addRegisterClass(MVT::i32, &ARM::tGPRRegClass); else - addRegisterClass(MVT::i32, ARM::GPRRegisterClass); + addRegisterClass(MVT::i32, &ARM::GPRRegClass); if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { - addRegisterClass(MVT::f32, ARM::SPRRegisterClass); + addRegisterClass(MVT::f32, &ARM::SPRRegClass); if (!Subtarget->isFPOnlySP()) - addRegisterClass(MVT::f64, ARM::DPRRegisterClass); + addRegisterClass(MVT::f64, &ARM::DPRRegClass); setTruncStoreAction(MVT::f64, MVT::f32, Expand); } @@ -824,6 +825,9 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) benefitFromCodePlacementOpt = true; + // Prefer likely predicted branches to selects on out-of-order cores. + predictableSelectIsExpensive = Subtarget->isCortexA9(); + setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2); } @@ -849,7 +853,7 @@ ARMTargetLowering::findRepresentativeClass(EVT VT) const{ // the cost is 1 for both f32 and f64. case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16: case MVT::v2i32: case MVT::v1i64: case MVT::v2f32: - RRC = ARM::DPRRegisterClass; + RRC = &ARM::DPRRegClass; // When NEON is used for SP, only half of the register file is available // because operations that define both SP and DP results will be constrained // to the VFP2 class (D0-D15). We currently model this constraint prior to @@ -859,15 +863,15 @@ ARMTargetLowering::findRepresentativeClass(EVT VT) const{ break; case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: - RRC = ARM::DPRRegisterClass; + RRC = &ARM::DPRRegClass; Cost = 2; break; case MVT::v4i64: - RRC = ARM::DPRRegisterClass; + RRC = &ARM::DPRRegClass; Cost = 4; break; case MVT::v8i64: - RRC = ARM::DPRRegisterClass; + RRC = &ARM::DPRRegClass; Cost = 8; break; } @@ -891,6 +895,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG"; case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD"; case ARMISD::CMP: return "ARMISD::CMP"; + case ARMISD::CMN: return "ARMISD::CMN"; case ARMISD::CMPZ: return "ARMISD::CMPZ"; case ARMISD::CMPFP: return "ARMISD::CMPFP"; case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0"; @@ -1027,9 +1032,9 @@ const TargetRegisterClass *ARMTargetLowering::getRegClassFor(EVT VT) const { // load / store 4 to 8 consecutive D registers. 
if (Subtarget->hasNEON()) { if (VT == MVT::v4i64) - return ARM::QQPRRegisterClass; - else if (VT == MVT::v8i64) - return ARM::QQQQPRRegisterClass; + return &ARM::QQPRRegClass; + if (VT == MVT::v8i64) + return &ARM::QQQQPRRegClass; } return TargetLowering::getRegClassFor(VT); } @@ -1286,14 +1291,20 @@ void ARMTargetLowering::PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG, /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter /// nodes. SDValue -ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, +ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool doesNotRet = CLI.DoesNotReturn; + bool isVarArg = CLI.IsVarArg; + MachineFunction &MF = DAG.getMachineFunction(); bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool IsSibCall = false; @@ -1415,21 +1426,22 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CCInfo.clearFirstByValReg(); } - unsigned LocMemOffset = VA.getLocMemOffset(); - SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset); - SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, - StkPtrOff); - SDValue SrcOffset = DAG.getIntPtrConstant(4*offset); - SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset); - SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, - MVT::i32); - MemOpChains.push_back(DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, - Flags.getByValAlign(), - /*isVolatile=*/false, - /*AlwaysInline=*/false, - MachinePointerInfo(0), - MachinePointerInfo(0))); - + if (Flags.getByValSize() - 4*offset > 0) { + unsigned LocMemOffset = VA.getLocMemOffset(); + SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset); + SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, + StkPtrOff); + SDValue SrcOffset = DAG.getIntPtrConstant(4*offset); + SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset); + SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, + MVT::i32); + SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), MVT::i32); + + SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; + MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, + Ops, array_lengthof(Ops))); + } } else if (!IsSibCall) { assert(VA.isMemLoc()); @@ -2095,12 +2107,13 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); Args.push_back(Entry); // FIXME: is there useful debug info available here? 
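An aside on the LowerCall change above: it is an instance of the parameter-object pattern, collapsing roughly a dozen positional arguments into one CallLoweringInfo that each target unpacks. A hedged miniature of the idea in plain, self-contained C++ (field names invented; the real struct also carries SDValue chain, callee, and argument vectors):

    struct CallLoweringSketch {
      bool isVarArg = false;
      bool isTailCall = false;
      bool doesNotReturn = false;
      bool isReturnValueUsed = true;
      // ... chain, callee, outgoing/incoming argument lists, convention.
    };
    // Adding a new call property now means extending this struct, not
    // re-threading another parameter through every target's LowerCall.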
- std::pair<SDValue, SDValue> CallResult = - LowerCallTo(Chain, (Type *) Type::getInt32Ty(*DAG.getContext()), + TargetLowering::CallLoweringInfo CLI(Chain, + (Type *) Type::getInt32Ty(*DAG.getContext()), false, false, false, false, 0, CallingConv::C, /*isTailCall=*/false, /*doesNotRet=*/false, /*isReturnValueUsed=*/true, DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl); + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); return CallResult.first; } @@ -2108,7 +2121,8 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, // "local exec" model. SDValue ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, - SelectionDAG &DAG) const { + SelectionDAG &DAG, + TLSModel::Model model) const { const GlobalValue *GV = GA->getGlobal(); DebugLoc dl = GA->getDebugLoc(); SDValue Offset; @@ -2117,7 +2131,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, // Get the Thread Pointer SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); - if (GV->isDeclaration()) { + if (model == TLSModel::InitialExec) { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); @@ -2142,6 +2156,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, false, false, false, 0); } else { // local exec model + assert(model == TLSModel::LocalExec); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); @@ -2162,12 +2177,18 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetELF() && "TLS not implemented for non-ELF targets"); GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); - // If the relocation model is PIC, use the "General Dynamic" TLS Model, - // otherwise use the "Local Exec" TLS Model - if (getTargetMachine().getRelocationModel() == Reloc::PIC_) - return LowerToTLSGeneralDynamicModel(GA, DAG); - else - return LowerToTLSExecModels(GA, DAG); + + TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); + + switch (model) { + case TLSModel::GeneralDynamic: + case TLSModel::LocalDynamic: + return LowerToTLSGeneralDynamicModel(GA, DAG); + case TLSModel::InitialExec: + case TLSModel::LocalExec: + return LowerToTLSExecModels(GA, DAG, model); + } + llvm_unreachable("bogus TLS model"); } SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, @@ -2457,9 +2478,9 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, const TargetRegisterClass *RC; if (AFI->isThumb1OnlyFunction()) - RC = ARM::tGPRRegisterClass; + RC = &ARM::tGPRRegClass; else - RC = ARM::GPRRegisterClass; + RC = &ARM::GPRRegClass; // Transform the arguments stored in physical registers into virtual ones. 
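For orientation, the TLS lowering above now branches on the model returned by getTLSModel() instead of inferring it from GV->isDeclaration(), which also honors an explicit tls_model attribute on the global. The mapping, sketched as self-contained C++ (the enum mirrors llvm::TLSModel; this is not the actual API):

    enum class TLS { GeneralDynamic, LocalDynamic, InitialExec, LocalExec };

    // Dynamic models call __tls_get_addr; exec models add a known (or
    // GOT-loaded) offset to the thread pointer, as in the code above.
    bool usesThreadPointerPath(TLS m) {
      return m == TLS::InitialExec || m == TLS::LocalExec;
    }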
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); @@ -2543,9 +2564,9 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, for (; firstRegToSaveIndex < 4; ++firstRegToSaveIndex) { const TargetRegisterClass *RC; if (AFI->isThumb1OnlyFunction()) - RC = ARM::tGPRRegisterClass; + RC = &ARM::tGPRRegClass; else - RC = ARM::GPRRegisterClass; + RC = &ARM::GPRRegClass; unsigned VReg = MF.addLiveIn(GPRArgRegs[firstRegToSaveIndex], RC); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); @@ -2627,14 +2648,15 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, const TargetRegisterClass *RC; if (RegVT == MVT::f32) - RC = ARM::SPRRegisterClass; + RC = &ARM::SPRRegClass; else if (RegVT == MVT::f64) - RC = ARM::DPRRegisterClass; + RC = &ARM::DPRRegClass; else if (RegVT == MVT::v2f64) - RC = ARM::QPRRegisterClass; + RC = &ARM::QPRRegClass; else if (RegVT == MVT::i32) - RC = (AFI->isThumb1OnlyFunction() ? - ARM::tGPRRegisterClass : ARM::GPRRegisterClass); + RC = AFI->isThumb1OnlyFunction() ? + (const TargetRegisterClass*)&ARM::tGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass; else llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); @@ -4791,7 +4813,9 @@ static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) { for (unsigned i = 0; i != NumElts; ++i) { ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); const APInt &CInt = C->getAPIntValue(); - Ops.push_back(DAG.getConstant(CInt.trunc(EltSize), TruncVT)); + // Element types smaller than 32 bits are not legal, so use i32 elements. + // The values are implicitly truncated so sext vs. zext doesn't matter. + Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32)); } return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts); @@ -5252,14 +5276,14 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI, bool isThumb2 = Subtarget->isThumb2(); MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - unsigned scratch = - MRI.createVirtualRegister(isThumb2 ? ARM::rGPRRegisterClass - : ARM::GPRRegisterClass); + unsigned scratch = MRI.createVirtualRegister(isThumb2 ? + (const TargetRegisterClass*)&ARM::rGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass); if (isThumb2) { - MRI.constrainRegClass(dest, ARM::rGPRRegisterClass); - MRI.constrainRegClass(oldval, ARM::rGPRRegisterClass); - MRI.constrainRegClass(newval, ARM::rGPRRegisterClass); + MRI.constrainRegClass(dest, &ARM::rGPRRegClass); + MRI.constrainRegClass(oldval, &ARM::rGPRRegClass); + MRI.constrainRegClass(newval, &ARM::rGPRRegClass); } unsigned ldrOpc, strOpc; @@ -5362,8 +5386,8 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); if (isThumb2) { - MRI.constrainRegClass(dest, ARM::rGPRRegisterClass); - MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass); + MRI.constrainRegClass(dest, &ARM::rGPRRegClass); + MRI.constrainRegClass(ptr, &ARM::rGPRRegClass); } unsigned ldrOpc, strOpc; @@ -5394,8 +5418,9 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); - const TargetRegisterClass *TRC = - isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass; + const TargetRegisterClass *TRC = isThumb2 ? + (const TargetRegisterClass*)&ARM::tGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass; unsigned scratch = MRI.createVirtualRegister(TRC); unsigned scratch2 = (!BinOpcode) ? 
incr : MRI.createVirtualRegister(TRC); @@ -5469,8 +5494,8 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI, MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); if (isThumb2) { - MRI.constrainRegClass(dest, ARM::rGPRRegisterClass); - MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass); + MRI.constrainRegClass(dest, &ARM::rGPRRegClass); + MRI.constrainRegClass(ptr, &ARM::rGPRRegClass); } unsigned ldrOpc, strOpc, extendOpc; @@ -5504,8 +5529,9 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI, BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); - const TargetRegisterClass *TRC = - isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass; + const TargetRegisterClass *TRC = isThumb2 ? + (const TargetRegisterClass*)&ARM::tGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass; unsigned scratch = MRI.createVirtualRegister(TRC); unsigned scratch2 = MRI.createVirtualRegister(TRC); @@ -5531,7 +5557,7 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI, // Sign extend the value, if necessary. if (signExtend && extendOpc) { - oldval = MRI.createVirtualRegister(ARM::GPRRegisterClass); + oldval = MRI.createVirtualRegister(&ARM::GPRRegClass); AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval) .addReg(dest) .addImm(0)); @@ -5586,9 +5612,9 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); if (isThumb2) { - MRI.constrainRegClass(destlo, ARM::rGPRRegisterClass); - MRI.constrainRegClass(desthi, ARM::rGPRRegisterClass); - MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass); + MRI.constrainRegClass(destlo, &ARM::rGPRRegClass); + MRI.constrainRegClass(desthi, &ARM::rGPRRegClass); + MRI.constrainRegClass(ptr, &ARM::rGPRRegClass); } unsigned ldrOpc = isThumb2 ? ARM::t2LDREXD : ARM::LDREXD; @@ -5614,8 +5640,9 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); - const TargetRegisterClass *TRC = - isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass; + const TargetRegisterClass *TRC = isThumb2 ? + (const TargetRegisterClass*)&ARM::tGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass; unsigned storesuccess = MRI.createVirtualRegister(TRC); // thisMBB: @@ -5722,8 +5749,9 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj); unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); - const TargetRegisterClass *TRC = - isThumb ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass; + const TargetRegisterClass *TRC = isThumb ? + (const TargetRegisterClass*)&ARM::tGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass; // Grab constant pool and fixed stack memory operands. MachineMemOperand *CPMMO = @@ -5827,8 +5855,9 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { MachineFrameInfo *MFI = MF->getFrameInfo(); int FI = MFI->getFunctionContextIndex(); - const TargetRegisterClass *TRC = - Subtarget->isThumb() ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass; + const TargetRegisterClass *TRC = Subtarget->isThumb() ? + (const TargetRegisterClass*)&ARM::tGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRnopcRegClass; // Get a mapping of the call site numbers to all of the landing pads they're // associated with. 
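As a reading aid for the atomic expansions above: EmitAtomicBinaryMinMax builds an ldrex / cmp / predicated-mov / strex retry loop. This standalone C++ sketch has the same semantics, with std::atomic standing in for the exclusive monitor (illustrative only, not the LLVM code):

    #include <algorithm>
    #include <atomic>
    #include <cstdint>

    int32_t atomicMinSketch(std::atomic<int32_t> &mem, int32_t incr) {
      int32_t old = mem.load();                  // ldrex
      for (;;) {
        int32_t val = std::min(old, incr);       // cmp + predicated mov
        if (mem.compare_exchange_weak(old, val)) // strex; loop on failure
          return old;                            // dest receives the old value
      }
    }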
@@ -6176,14 +6205,12 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
       for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
         unsigned Reg = SavedRegs[i];
         if (Subtarget->isThumb2() &&
-            !ARM::tGPRRegisterClass->contains(Reg) &&
-            !ARM::hGPRRegisterClass->contains(Reg))
+            !ARM::tGPRRegClass.contains(Reg) &&
+            !ARM::hGPRRegClass.contains(Reg))
           continue;
-        else if (Subtarget->isThumb1Only() &&
-                 !ARM::tGPRRegisterClass->contains(Reg))
+        if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
           continue;
-        else if (!Subtarget->isThumb() &&
-                 !ARM::GPRRegisterClass->contains(Reg))
+        if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
           continue;
         if (!DefRegs[Reg])
           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
@@ -6214,6 +6241,304 @@ MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
   llvm_unreachable("Expecting a BB with two successors!");
 }
+MachineBasicBlock *ARMTargetLowering::
+EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
+  // This pseudo instruction has 4 operands: dst, src, size, alignment.
+  // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
+  // Otherwise, we will generate unrolled scalar copies.
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  unsigned dest = MI->getOperand(0).getReg();
+  unsigned src = MI->getOperand(1).getReg();
+  unsigned SizeVal = MI->getOperand(2).getImm();
+  unsigned Align = MI->getOperand(3).getImm();
+  DebugLoc dl = MI->getDebugLoc();
+
+  bool isThumb2 = Subtarget->isThumb2();
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  unsigned ldrOpc, strOpc, UnitSize = 0;
+
+  const TargetRegisterClass *TRC = isThumb2 ?
+    (const TargetRegisterClass*)&ARM::tGPRRegClass :
+    (const TargetRegisterClass*)&ARM::GPRRegClass;
+  const TargetRegisterClass *TRC_Vec = 0;
+
+  if (Align & 1) {
+    ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
+    strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
+    UnitSize = 1;
+  } else if (Align & 2) {
+    ldrOpc = isThumb2 ? ARM::t2LDRH_POST : ARM::LDRH_POST;
+    strOpc = isThumb2 ? ARM::t2STRH_POST : ARM::STRH_POST;
+    UnitSize = 2;
+  } else {
+    // Check whether we can use NEON instructions.
+    if (!MF->getFunction()->hasFnAttr(Attribute::NoImplicitFloat) &&
+        Subtarget->hasNEON()) {
+      if ((Align % 16 == 0) && SizeVal >= 16) {
+        ldrOpc = ARM::VLD1q32wb_fixed;
+        strOpc = ARM::VST1q32wb_fixed;
+        UnitSize = 16;
+        TRC_Vec = (const TargetRegisterClass*)&ARM::DPairRegClass;
+      }
+      else if ((Align % 8 == 0) && SizeVal >= 8) {
+        ldrOpc = ARM::VLD1d32wb_fixed;
+        strOpc = ARM::VST1d32wb_fixed;
+        UnitSize = 8;
+        TRC_Vec = (const TargetRegisterClass*)&ARM::DPRRegClass;
+      }
+    }
+    // Can't use NEON instructions.
+    if (UnitSize == 0) {
+      ldrOpc = isThumb2 ? ARM::t2LDR_POST : ARM::LDR_POST_IMM;
+      strOpc = isThumb2 ? ARM::t2STR_POST : ARM::STR_POST_IMM;
+      UnitSize = 4;
+    }
+  }
+
+  unsigned BytesLeft = SizeVal % UnitSize;
+  unsigned LoopSize = SizeVal - BytesLeft;
+
+  if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
+    // Use LDR and STR to copy.
+    // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
+    // [destOut] = STR_POST(scratch, destIn, UnitSize)
+    unsigned srcIn = src;
+    unsigned destIn = dest;
+    for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
+      unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ?
TRC_Vec:TRC);
+      unsigned srcOut = MRI.createVirtualRegister(TRC);
+      unsigned destOut = MRI.createVirtualRegister(TRC);
+      if (UnitSize >= 8) {
+        AddDefaultPred(BuildMI(*BB, MI, dl,
+          TII->get(ldrOpc), scratch)
+          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(0));
+
+        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
+          .addReg(destIn).addImm(0).addReg(scratch));
+      } else if (isThumb2) {
+        AddDefaultPred(BuildMI(*BB, MI, dl,
+          TII->get(ldrOpc), scratch)
+          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(UnitSize));
+
+        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
+          .addReg(scratch).addReg(destIn)
+          .addImm(UnitSize));
+      } else {
+        AddDefaultPred(BuildMI(*BB, MI, dl,
+          TII->get(ldrOpc), scratch)
+          .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0)
+          .addImm(UnitSize));
+
+        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
+          .addReg(scratch).addReg(destIn)
+          .addReg(0).addImm(UnitSize));
+      }
+      srcIn = srcOut;
+      destIn = destOut;
+    }
+
+    // Handle the leftover bytes with LDRB and STRB.
+    // [scratch, srcOut] = LDRB_POST(srcIn, 1)
+    // [destOut] = STRB_POST(scratch, destIn, 1)
+    ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
+    strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
+    for (unsigned i = 0; i < BytesLeft; i++) {
+      unsigned scratch = MRI.createVirtualRegister(TRC);
+      unsigned srcOut = MRI.createVirtualRegister(TRC);
+      unsigned destOut = MRI.createVirtualRegister(TRC);
+      if (isThumb2) {
+        AddDefaultPred(BuildMI(*BB, MI, dl,
+          TII->get(ldrOpc),scratch)
+          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
+
+        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
+          .addReg(scratch).addReg(destIn)
+          .addImm(1));
+      } else {
+        AddDefaultPred(BuildMI(*BB, MI, dl,
+          TII->get(ldrOpc),scratch)
+          .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0).addImm(1));
+
+        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
+          .addReg(scratch).addReg(destIn)
+          .addReg(0).addImm(1));
+      }
+      srcIn = srcOut;
+      destIn = destOut;
+    }
+    MI->eraseFromParent();   // The instruction is gone now.
+    return BB;
+  }
+
+  // Expand the pseudo op to a loop.
+  // thisMBB:
+  //   ...
+  //   movw varEnd, # --> with thumb2
+  //   movt varEnd, #
+  //   ldrcp varEnd, idx --> without thumb2
+  //   fallthrough --> loopMBB
+  // loopMBB:
+  //   PHI varPhi, varEnd, varLoop
+  //   PHI srcPhi, src, srcLoop
+  //   PHI destPhi, dst, destLoop
+  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
+  //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
+  //   subs varLoop, varPhi, #UnitSize
+  //   bne loopMBB
+  //   fallthrough --> exitMBB
+  // exitMBB:
+  //   epilogue to handle left-over bytes
+  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
+  //   [destOut] = STRB_POST(scratch, destLoop, 1)
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, loopMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Load an immediate to varEnd.
+  unsigned varEnd = MRI.createVirtualRegister(TRC);
+  if (isThumb2) {
+    unsigned VReg1 = varEnd;
+    if ((LoopSize & 0xFFFF0000) != 0)
+      VReg1 = MRI.createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), VReg1)
+                   .addImm(LoopSize & 0xFFFF));
+
+    if ((LoopSize & 0xFFFF0000) != 0)
+      AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd)
+                     .addReg(VReg1)
+                     .addImm(LoopSize >> 16));
+  } else {
+    MachineConstantPool *ConstantPool = MF->getConstantPool();
+    Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
+    const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
+
+    // MachineConstantPool wants an explicit alignment.
+    unsigned Align = getTargetData()->getPrefTypeAlignment(Int32Ty);
+    if (Align == 0)
+      Align = getTargetData()->getTypeAllocSize(C->getType());
+    unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDRcp))
+                   .addReg(varEnd, RegState::Define)
+                   .addConstantPoolIndex(Idx)
+                   .addImm(0));
+  }
+  BB->addSuccessor(loopMBB);
+
+  // Generate the loop body:
+  //   varPhi = PHI(varLoop, varEnd)
+  //   srcPhi = PHI(srcLoop, src)
+  //   destPhi = PHI(destLoop, dst)
+  MachineBasicBlock *entryBB = BB;
+  BB = loopMBB;
+  unsigned varLoop = MRI.createVirtualRegister(TRC);
+  unsigned varPhi = MRI.createVirtualRegister(TRC);
+  unsigned srcLoop = MRI.createVirtualRegister(TRC);
+  unsigned srcPhi = MRI.createVirtualRegister(TRC);
+  unsigned destLoop = MRI.createVirtualRegister(TRC);
+  unsigned destPhi = MRI.createVirtualRegister(TRC);
+
+  BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
+    .addReg(varLoop).addMBB(loopMBB)
+    .addReg(varEnd).addMBB(entryBB);
+  BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
+    .addReg(srcLoop).addMBB(loopMBB)
+    .addReg(src).addMBB(entryBB);
+  BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
+    .addReg(destLoop).addMBB(loopMBB)
+    .addReg(dest).addMBB(entryBB);
+
+  // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
+  // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
+  unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC);
+  if (UnitSize >= 8) {
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
+      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(0));
+
+    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
+      .addReg(destPhi).addImm(0).addReg(scratch));
+  } else if (isThumb2) {
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
+      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(UnitSize));
+
+    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
+      .addReg(scratch).addReg(destPhi)
+      .addImm(UnitSize));
+  } else {
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
+      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addReg(0)
+      .addImm(UnitSize));
+
+    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
+      .addReg(scratch).addReg(destPhi)
+      .addReg(0).addImm(UnitSize));
+  }
+
+  // Decrement loop variable by UnitSize.
+  MachineInstrBuilder MIB = BuildMI(BB, dl,
+    TII->get(isThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
+  AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
+  MIB->getOperand(5).setReg(ARM::CPSR);
+  MIB->getOperand(5).setIsDef(true);
+
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+
+  // loopMBB can loop back to loopMBB or fall through to exitMBB.
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  // Add epilogue to handle BytesLeft.
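A side note on the trip-count materialization at the top of this block: on Thumb2 any 32-bit constant is built with at most two instructions, movw for the low halfword and movt for the high one, while plain ARM without those instructions falls back to a constant-pool load (LDRcp). A self-contained C++ sketch of the split (illustrative only, not LLVM code):

    #include <cstdint>

    uint32_t movwMovtSketch(uint32_t v) {
      uint32_t lo = v & 0xFFFF; // t2MOVi16: write the low halfword
      uint32_t hi = v >> 16;    // t2MOVTi16: write the high halfword, keep low
      return (hi << 16) | lo;   // same value, built in two steps
    }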
+ BB = exitMBB; + MachineInstr *StartOfExit = exitMBB->begin(); + ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM; + strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM; + + // [scratch, srcOut] = LDRB_POST(srcLoop, 1) + // [destOut] = STRB_POST(scratch, destLoop, 1) + unsigned srcIn = srcLoop; + unsigned destIn = destLoop; + for (unsigned i = 0; i < BytesLeft; i++) { + unsigned scratch = MRI.createVirtualRegister(TRC); + unsigned srcOut = MRI.createVirtualRegister(TRC); + unsigned destOut = MRI.createVirtualRegister(TRC); + if (isThumb2) { + AddDefaultPred(BuildMI(*BB, StartOfExit, dl, + TII->get(ldrOpc),scratch) + .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1)); + + AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut) + .addReg(scratch).addReg(destIn) + .addImm(1)); + } else { + AddDefaultPred(BuildMI(*BB, StartOfExit, dl, + TII->get(ldrOpc),scratch) + .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0).addImm(1)); + + AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut) + .addReg(scratch).addReg(destIn) + .addReg(0).addImm(1)); + } + srcIn = srcOut; + destIn = destOut; + } + + MI->eraseFromParent(); // The instruction is gone now. + return BB; +} + MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { @@ -6517,10 +6842,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineRegisterInfo &MRI = Fn->getRegInfo(); // In Thumb mode S must not be specified if source register is the SP or // PC and if destination register is the SP, so restrict register class - unsigned NewMovDstReg = MRI.createVirtualRegister( - isThumb2 ? ARM::rGPRRegisterClass : ARM::GPRRegisterClass); - unsigned NewRsbDstReg = MRI.createVirtualRegister( - isThumb2 ? ARM::rGPRRegisterClass : ARM::GPRRegisterClass); + unsigned NewRsbDstReg = MRI.createVirtualRegister(isThumb2 ? + (const TargetRegisterClass*)&ARM::rGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass); // Transfer the remainder of BB and its successor edges to sinkMBB. SinkBB->splice(SinkBB->begin(), BB, @@ -6534,12 +6858,10 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // fall through to SinkMBB RSBBB->addSuccessor(SinkBB); - // insert a movs at the end of BB - BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVr : ARM::MOVr), - NewMovDstReg) - .addReg(ABSSrcReg, RegState::Kill) - .addImm((unsigned)ARMCC::AL).addReg(0) - .addReg(ARM::CPSR, RegState::Define); + // insert a cmp at the end of BB + AddDefaultPred(BuildMI(BB, dl, + TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) + .addReg(ABSSrcReg).addImm(0)); // insert a bcc with opposite CC to ARMCC::MI at the end of BB BuildMI(BB, dl, @@ -6551,7 +6873,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // by if-conversion pass BuildMI(*RSBBB, RSBBB->begin(), dl, TII->get(isThumb2 ? 
ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) - .addReg(NewMovDstReg, RegState::Kill) + .addReg(ABSSrcReg, RegState::Kill) .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); // insert PHI in SinkBB, @@ -6559,7 +6881,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, BuildMI(*SinkBB, SinkBB->begin(), dl, TII->get(ARM::PHI), ABSDstReg) .addReg(NewRsbDstReg).addMBB(RSBBB) - .addReg(NewMovDstReg).addMBB(BB); + .addReg(ABSSrcReg).addMBB(BB); // remove ABS instruction MI->eraseFromParent(); @@ -6567,6 +6889,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // return last added BB return SinkBB; } + case ARM::COPY_STRUCT_BYVAL_I32: + ++NumLoopByVals; + return EmitStructByval(MI, BB); } } @@ -7353,7 +7678,7 @@ static SDValue PerformSTORECombine(SDNode *N, if (St->isVolatile()) return SDValue(); - // Optimize trunc store (of multiple scalars) to shuffle and store. First, + // Optimize trunc store (of multiple scalars) to shuffle and store. First, // pack all of the elements in one place. Next, store to memory in fewer // chunks. SDValue StVal = St->getValue(); @@ -8721,12 +9046,19 @@ bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { return Imm >= 0 && Imm <= 255; } -/// isLegalAddImmediate - Return true if the specified immediate is legal -/// add immediate, that is the target has add instructions which can add -/// a register with the immediate without having to materialize the +/// isLegalAddImmediate - Return true if the specified immediate is a legal add +/// *or sub* immediate, that is the target has add or sub instructions which can +/// add a register with the immediate without having to materialize the /// immediate into a register. bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { - return ARM_AM::getSOImmVal(Imm) != -1; + // Same encoding for add/sub, just flip the sign. + int64_t AbsImm = llvm::abs64(Imm); + if (!Subtarget->isThumb()) + return ARM_AM::getSOImmVal(AbsImm) != -1; + if (Subtarget->isThumb2()) + return ARM_AM::getT2SOImmVal(AbsImm) != -1; + // Thumb1 only has 8-bit unsigned immediate. + return AbsImm >= 0 && AbsImm <= 255; } static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, @@ -9030,39 +9362,38 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, switch (Constraint[0]) { case 'l': // Low regs or general regs. if (Subtarget->isThumb()) - return RCPair(0U, ARM::tGPRRegisterClass); - else - return RCPair(0U, ARM::GPRRegisterClass); + return RCPair(0U, &ARM::tGPRRegClass); + return RCPair(0U, &ARM::GPRRegClass); case 'h': // High regs or no regs. 
if (Subtarget->isThumb()) - return RCPair(0U, ARM::hGPRRegisterClass); + return RCPair(0U, &ARM::hGPRRegClass); break; case 'r': - return RCPair(0U, ARM::GPRRegisterClass); + return RCPair(0U, &ARM::GPRRegClass); case 'w': if (VT == MVT::f32) - return RCPair(0U, ARM::SPRRegisterClass); + return RCPair(0U, &ARM::SPRRegClass); if (VT.getSizeInBits() == 64) - return RCPair(0U, ARM::DPRRegisterClass); + return RCPair(0U, &ARM::DPRRegClass); if (VT.getSizeInBits() == 128) - return RCPair(0U, ARM::QPRRegisterClass); + return RCPair(0U, &ARM::QPRRegClass); break; case 'x': if (VT == MVT::f32) - return RCPair(0U, ARM::SPR_8RegisterClass); + return RCPair(0U, &ARM::SPR_8RegClass); if (VT.getSizeInBits() == 64) - return RCPair(0U, ARM::DPR_8RegisterClass); + return RCPair(0U, &ARM::DPR_8RegClass); if (VT.getSizeInBits() == 128) - return RCPair(0U, ARM::QPR_8RegisterClass); + return RCPair(0U, &ARM::QPR_8RegClass); break; case 't': if (VT == MVT::f32) - return RCPair(0U, ARM::SPRRegisterClass); + return RCPair(0U, &ARM::SPRRegClass); break; } } if (StringRef("{cc}").equals_lower(Constraint)) - return std::make_pair(unsigned(ARM::CPSR), ARM::CCRRegisterClass); + return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); } diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 352d980..7ad48b9 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -41,6 +41,9 @@ namespace llvm { // PIC mode. WrapperJT, // WrapperJT - A wrapper node for TargetJumpTable + // Add pseudo op to model memcpy for struct byval. + COPY_STRUCT_BYVAL, + CALL, // Function call. CALL_PRED, // Function call that's predicable. CALL_NOLINK, // Function call with branch not branch-and-link. @@ -53,6 +56,7 @@ namespace llvm { PIC_ADD, // Add with a PC operand and a PIC label. CMP, // ARM compare instructions. + CMN, // ARM CMN instructions. CMPZ, // ARM compare that sets only Z flag. CMPFP, // ARM VFP compare instruction, sets FPSCR. CMPFPw0, // ARM VFP compare against zero instruction, sets FPSCR. @@ -422,7 +426,8 @@ namespace llvm { SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG) const; SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA, - SelectionDAG &DAG) const; + SelectionDAG &DAG, + TLSModel::Model model) const; SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; @@ -462,13 +467,7 @@ namespace llvm { unsigned &VARegSize, unsigned &VARegSaveSize) const; virtual SDValue - LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; /// HandleByVal - Target-specific cleanup for ByVal support. 
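Before the header hunk that follows (which declares the EmitStructByval hook defined earlier in this patch), a summary of what that expansion computes: pick a unit size from the alignment, copy the bulk with a post-increment loop, and mop up the remainder a byte at a time. A standalone C++ sketch under the same rules (illustrative only; the NEON 8- and 16-byte unit paths are folded into memcpy here):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    void byvalCopySketch(uint8_t *dst, const uint8_t *src, size_t size,
                         unsigned align) {
      // Same unit selection as the expansion: byte, halfword, or word.
      size_t unit = (align & 1) ? 1 : (align & 2) ? 2 : 4;
      size_t bytesLeft = size % unit;
      size_t loopSize = size - bytesLeft;
      for (size_t i = 0; i < loopSize; i += unit) // LDR_POST/STR_POST loop
        std::memcpy(dst + i, src + i, unit);
      for (size_t i = loopSize; i < size; ++i)    // LDRB/STRB epilogue
        dst[i] = src[i];
    }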
@@ -532,6 +531,9 @@ namespace llvm { MachineBasicBlock *MBB) const; bool RemapAddSubWithFlags(MachineInstr *MI, MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitStructByval(MachineInstr *MI, + MachineBasicBlock *MBB) const; }; enum NEONModImmType { diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index f04926a..c8966fb 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -827,6 +827,8 @@ class AExtI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin, let Inst{7-4} = 0b0111; let Inst{9-8} = 0b00; let Inst{27-20} = opcod; + + let Unpredictable{9-8} = 0b11; } // Misc Arithmetic instructions. @@ -1862,7 +1864,6 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, string opc, string dt, string asm, string cstr, list<dag> pattern> : N3VCommon<op24, op23, op21_20, op11_8, op6, op4, oops, iops, f, itin, opc, dt, asm, cstr, pattern> { - // Instruction operands. bits<5> Vd; bits<5> Vn; diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp index b8f607e..31b0c41 100644 --- a/lib/Target/ARM/ARMInstrInfo.cpp +++ b/lib/Target/ARM/ARMInstrInfo.cpp @@ -31,7 +31,8 @@ ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) /// getNoopForMachoTarget - Return the noop instruction to use for a noop. void ARMInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { if (hasNOP()) { - NopInst.setOpcode(ARM::NOP); + NopInst.setOpcode(ARM::HINT); + NopInst.addOperand(MCOperand::CreateImm(0)); NopInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); NopInst.addOperand(MCOperand::CreateReg(0)); } else { diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 1eb561d..1b8fc3f 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -18,6 +18,9 @@ // Type profiles. def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; def SDT_ARMCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>; +def SDT_ARMStructByVal : SDTypeProfile<0, 4, + [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; def SDT_ARMSaveCallPC : SDTypeProfile<0, 1, []>; @@ -90,6 +93,10 @@ def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart, [SDNPHasChain, SDNPOutGlue]>; def ARMcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_ARMCallSeqEnd, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def ARMcopystructbyval : SDNode<"ARMISD::COPY_STRUCT_BYVAL" , + SDT_ARMStructByVal, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, + SDNPMayStore, SDNPMayLoad]>; def ARMcall : SDNode<"ARMISD::CALL", SDT_ARMcall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, @@ -121,6 +128,9 @@ def ARMBcci64 : SDNode<"ARMISD::BCC_i64", SDT_ARMBCC_i64, def ARMcmp : SDNode<"ARMISD::CMP", SDT_ARMCmp, [SDNPOutGlue]>; +def ARMcmn : SDNode<"ARMISD::CMN", SDT_ARMCmp, + [SDNPOutGlue]>; + def ARMcmpZ : SDNode<"ARMISD::CMPZ", SDT_ARMCmp, [SDNPOutGlue, SDNPCommutative]>; @@ -161,53 +171,59 @@ def ARMbfi : SDNode<"ARMISD::BFI", SDT_ARMBFI>; // ARM Instruction Predicate Definitions. 
// def HasV4T : Predicate<"Subtarget->hasV4TOps()">, - AssemblerPredicate<"HasV4TOps">; + AssemblerPredicate<"HasV4TOps", "armv4t">; def NoV4T : Predicate<"!Subtarget->hasV4TOps()">; def HasV5T : Predicate<"Subtarget->hasV5TOps()">; def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">, - AssemblerPredicate<"HasV5TEOps">; + AssemblerPredicate<"HasV5TEOps", "armv5te">; def HasV6 : Predicate<"Subtarget->hasV6Ops()">, - AssemblerPredicate<"HasV6Ops">; + AssemblerPredicate<"HasV6Ops", "armv6">; def NoV6 : Predicate<"!Subtarget->hasV6Ops()">; def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">, - AssemblerPredicate<"HasV6T2Ops">; + AssemblerPredicate<"HasV6T2Ops", "armv6t2">; def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">; def HasV7 : Predicate<"Subtarget->hasV7Ops()">, - AssemblerPredicate<"HasV7Ops">; + AssemblerPredicate<"HasV7Ops", "armv7">; def NoVFP : Predicate<"!Subtarget->hasVFP2()">; def HasVFP2 : Predicate<"Subtarget->hasVFP2()">, - AssemblerPredicate<"FeatureVFP2">; + AssemblerPredicate<"FeatureVFP2", "VFP2">; def HasVFP3 : Predicate<"Subtarget->hasVFP3()">, - AssemblerPredicate<"FeatureVFP3">; + AssemblerPredicate<"FeatureVFP3", "VFP3">; def HasVFP4 : Predicate<"Subtarget->hasVFP4()">, - AssemblerPredicate<"FeatureVFP4">; + AssemblerPredicate<"FeatureVFP4", "VFP4">; def HasNEON : Predicate<"Subtarget->hasNEON()">, - AssemblerPredicate<"FeatureNEON">; + AssemblerPredicate<"FeatureNEON", "NEON">; def HasFP16 : Predicate<"Subtarget->hasFP16()">, - AssemblerPredicate<"FeatureFP16">; + AssemblerPredicate<"FeatureFP16","half-float">; def HasDivide : Predicate<"Subtarget->hasDivide()">, - AssemblerPredicate<"FeatureHWDiv">; + AssemblerPredicate<"FeatureHWDiv", "divide">; def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">, - AssemblerPredicate<"FeatureT2XtPk">; + AssemblerPredicate<"FeatureT2XtPk", + "pack/extract">; def HasThumb2DSP : Predicate<"Subtarget->hasThumb2DSP()">, - AssemblerPredicate<"FeatureDSPThumb2">; + AssemblerPredicate<"FeatureDSPThumb2", + "thumb2-dsp">; def HasDB : Predicate<"Subtarget->hasDataBarrier()">, - AssemblerPredicate<"FeatureDB">; + AssemblerPredicate<"FeatureDB", + "data-barriers">; def HasMP : Predicate<"Subtarget->hasMPExtension()">, - AssemblerPredicate<"FeatureMP">; + AssemblerPredicate<"FeatureMP", + "mp-extensions">; def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">; def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">; def IsThumb : Predicate<"Subtarget->isThumb()">, - AssemblerPredicate<"ModeThumb">; + AssemblerPredicate<"ModeThumb", "thumb">; def IsThumb1Only : Predicate<"Subtarget->isThumb1Only()">; def IsThumb2 : Predicate<"Subtarget->isThumb2()">, - AssemblerPredicate<"ModeThumb,FeatureThumb2">; + AssemblerPredicate<"ModeThumb,FeatureThumb2", + "thumb2">; def IsMClass : Predicate<"Subtarget->isMClass()">, - AssemblerPredicate<"FeatureMClass">; + AssemblerPredicate<"FeatureMClass", "armv7m">; def IsARClass : Predicate<"!Subtarget->isMClass()">, - AssemblerPredicate<"!FeatureMClass">; + AssemblerPredicate<"!FeatureMClass", + "armv7a/r">; def IsARM : Predicate<"!Subtarget->isThumb()">, - AssemblerPredicate<"!ModeThumb">; + AssemblerPredicate<"!ModeThumb", "arm-mode">; def IsIOS : Predicate<"Subtarget->isTargetIOS()">; def IsNotIOS : Predicate<"!Subtarget->isTargetIOS()">; def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; @@ -220,7 +236,8 @@ def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">; // Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available. 
// But only select them if more precision in FP computation is allowed.
 // Do not use them for Darwin platforms.
-def UseFusedMAC : Predicate<"!TM.Options.NoExcessFPPrecision && "
+def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion =="
+                            " FPOpFusion::Fast) && "
                             "!Subtarget->isTargetDarwin()">;
 def DontUseFusedMAC : Predicate<"!Subtarget->hasVFP4() || "
                                 "Subtarget->isTargetDarwin()">;
@@ -236,9 +253,9 @@ class RegConstraint<string C> {
 // ARM specific transformation functions and pattern fragments.
 //
-// so_imm_neg_XFORM - Return a so_imm value packed into the format described for
-// so_imm_neg def below.
-def so_imm_neg_XFORM : SDNodeXForm<imm, [{
+// imm_neg_XFORM - Return an imm value packed into the format described for
+// imm_neg defs below.
+def imm_neg_XFORM : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(-(int)N->getZExtValue(), MVT::i32);
 }]>;
@@ -257,7 +274,7 @@ def so_imm_neg_asmoperand : AsmOperandClass { let Name = "ARMSOImmNeg"; }
 def so_imm_neg : Operand<i32>, PatLeaf<(imm), [{
     int64_t Value = -(int)N->getZExtValue();
     return Value && ARM_AM::getSOImmVal(Value) != -1;
-  }], so_imm_neg_XFORM> {
+  }], imm_neg_XFORM> {
   let ParserMatchClass = so_imm_neg_asmoperand;
 }
@@ -570,7 +587,10 @@ def imm1_31 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 32; }]> {
 }
 /// imm0_15 predicate - Immediate in the range [0,15].
-def Imm0_15AsmOperand: ImmAsmOperand { let Name = "Imm0_15"; }
+def Imm0_15AsmOperand: ImmAsmOperand {
+  let Name = "Imm0_15";
+  let DiagnosticType = "ImmRange0_15";
+}
 def imm0_15 : Operand<i32>, ImmLeaf<i32, [{
   return Imm >= 0 && Imm < 16;
 }]> {
@@ -615,6 +635,11 @@ def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
   let ParserMatchClass = Imm0_65535AsmOperand;
 }
+// imm0_65535_neg - An immediate whose negative value is in the range [0,65535].
+def imm0_65535_neg : Operand<i32>, ImmLeaf<i32, [{
+  return -Imm >= 0 && -Imm < 65536;
+}]>;
+
 // imm0_65535_expr - For movt/movw - 16-bit immediate that can also reference
 // a relocatable expression.
 //
@@ -940,6 +965,7 @@ include "ARMInstrFormats.td"
 /// AsI1_bin_irs - Defines a set of (op r, {so_imm|r|so_reg}) patterns for a
 /// binop that produces a value.
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
 multiclass AsI1_bin_irs<bits<4> opcod, string opc,
                         InstrItinClass iii, InstrItinClass iir,
                         InstrItinClass iis, PatFrag opnode,
                         string baseOpc, bit Commutable = 0> {
@@ -1003,35 +1029,12 @@ multiclass AsI1_bin_irs<bits<4> opcod, string opc,
     let Inst{4} = 1;
     let Inst{3-0} = shift{3-0};
   }
-
-  // Assembly aliases for optional destination operand when it's the same
-  // as the source operand.
-  def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"),
-       (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPR:$Rdn, GPR:$Rdn,
-        so_imm:$imm, pred:$p,
-        cc_out:$s)>,
-       Requires<[IsARM]>;
-  def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $Rm"),
-       (!cast<Instruction>(!strconcat(baseOpc, "rr")) GPR:$Rdn, GPR:$Rdn,
-        GPR:$Rm, pred:$p,
-        cc_out:$s)>,
-       Requires<[IsARM]>;
-  def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
-       (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn,
-        so_reg_imm:$shift, pred:$p,
-        cc_out:$s)>,
-       Requires<[IsARM]>;
-  def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
-       (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPR:$Rdn, GPR:$Rdn,
-        so_reg_reg:$shift, pred:$p,
-        cc_out:$s)>,
-       Requires<[IsARM]>;
-
 }
 /// AsI1_rbin_irs - Same as AsI1_bin_irs except the order of operands are
/// reversed.
The 'rr' form is only defined for the disassembler; for codegen /// it is equivalent to the AsI1_bin_irs counterpart. +let TwoOperandAliasConstraint = "$Rn = $Rd" in multiclass AsI1_rbin_irs<bits<4> opcod, string opc, InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, PatFrag opnode, string baseOpc, bit Commutable = 0> { @@ -1094,30 +1097,6 @@ multiclass AsI1_rbin_irs<bits<4> opcod, string opc, let Inst{4} = 1; let Inst{3-0} = shift{3-0}; } - - // Assembly aliases for optional destination operand when it's the same - // as the source operand. - def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"), - (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPR:$Rdn, GPR:$Rdn, - so_imm:$imm, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; - def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $Rm"), - (!cast<Instruction>(!strconcat(baseOpc, "rr")) GPR:$Rdn, GPR:$Rdn, - GPR:$Rm, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; - def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"), - (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn, - so_reg_imm:$shift, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; - def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"), - (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPR:$Rdn, GPR:$Rdn, - so_reg_reg:$shift, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; - } /// AsI1_bin_s_irs - Same as AsI1_bin_irs except it sets the 's' bit by default. @@ -1304,6 +1283,7 @@ class AI_exta_rrot_np<bits<8> opcod, string opc> } /// AI1_adde_sube_irs - Define instructions and patterns for adde and sube. +let TwoOperandAliasConstraint = "$Rn = $Rd" in multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, string baseOpc, bit Commutable = 0> { let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in { @@ -1351,7 +1331,8 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, def rsr : AsI1<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift", - [(set GPRnopc:$Rd, CPSR, (opnode GPRnopc:$Rn, so_reg_reg:$shift, CPSR))]>, + [(set GPRnopc:$Rd, CPSR, + (opnode GPRnopc:$Rn, so_reg_reg:$shift, CPSR))]>, Requires<[IsARM]> { bits<4> Rd; bits<4> Rn; @@ -1366,32 +1347,10 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, let Inst{3-0} = shift{3-0}; } } - - // Assembly aliases for optional destination operand when it's the same - // as the source operand. 
- def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"), - (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPR:$Rdn, GPR:$Rdn, - so_imm:$imm, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; - def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $Rm"), - (!cast<Instruction>(!strconcat(baseOpc, "rr")) GPR:$Rdn, GPR:$Rdn, - GPR:$Rm, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; - def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"), - (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn, - so_reg_imm:$shift, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; - def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"), - (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPRnopc:$Rdn, GPRnopc:$Rdn, - so_reg_reg:$shift, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; } /// AI1_rsc_irs - Define instructions and patterns for rsc +let TwoOperandAliasConstraint = "$Rn = $Rd" in multiclass AI1_rsc_irs<bits<4> opcod, string opc, PatFrag opnode, string baseOpc> { let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in { @@ -1450,29 +1409,6 @@ multiclass AI1_rsc_irs<bits<4> opcod, string opc, PatFrag opnode, let Inst{3-0} = shift{3-0}; } } - - // Assembly aliases for optional destination operand when it's the same - // as the source operand. - def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"), - (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPR:$Rdn, GPR:$Rdn, - so_imm:$imm, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; - def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $Rm"), - (!cast<Instruction>(!strconcat(baseOpc, "rr")) GPR:$Rdn, GPR:$Rdn, - GPR:$Rm, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; - def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"), - (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn, - so_reg_imm:$shift, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; - def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"), - (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPR:$Rdn, GPR:$Rdn, - so_reg_reg:$shift, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; } let canFoldAsLoad = 1, isReMaterializable = 1 in { @@ -1511,9 +1447,10 @@ multiclass AI_ldr1nopc<bit isByte, string opc, InstrItinClass iii, // Note: We use the complex addrmode_imm12 rather than just an input // GPR and a constrained immediate so that we can use this to match // frame index references and avoid matching constant pool references. 
- def i12: AI2ldst<0b010, 1, isByte, (outs GPRnopc:$Rt), (ins addrmode_imm12:$addr), + def i12: AI2ldst<0b010, 1, isByte, (outs GPRnopc:$Rt), + (ins addrmode_imm12:$addr), AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr", - [(set GPRnopc:$Rt, (opnode addrmode_imm12:$addr))]> { + [(set GPRnopc:$Rt, (opnode addrmode_imm12:$addr))]> { bits<4> Rt; bits<17> addr; let Inst{23} = addr{12}; // U (add = ('U' == 1)) @@ -1521,9 +1458,10 @@ multiclass AI_ldr1nopc<bit isByte, string opc, InstrItinClass iii, let Inst{15-12} = Rt; let Inst{11-0} = addr{11-0}; // imm12 } - def rs : AI2ldst<0b011, 1, isByte, (outs GPRnopc:$Rt), (ins ldst_so_reg:$shift), - AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift", - [(set GPRnopc:$Rt, (opnode ldst_so_reg:$shift))]> { + def rs : AI2ldst<0b011, 1, isByte, (outs GPRnopc:$Rt), + (ins ldst_so_reg:$shift), + AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift", + [(set GPRnopc:$Rt, (opnode ldst_so_reg:$shift))]> { bits<4> Rt; bits<17> shift; let shift{4} = 0; // Inst{4} = 0 @@ -1581,9 +1519,10 @@ multiclass AI_str1nopc<bit isByte, string opc, InstrItinClass iii, let Inst{15-12} = Rt; let Inst{11-0} = addr{11-0}; // imm12 } - def rs : AI2ldst<0b011, 0, isByte, (outs), (ins GPRnopc:$Rt, ldst_so_reg:$shift), - AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift", - [(opnode GPRnopc:$Rt, ldst_so_reg:$shift)]> { + def rs : AI2ldst<0b011, 0, isByte, (outs), + (ins GPRnopc:$Rt, ldst_so_reg:$shift), + AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift", + [(opnode GPRnopc:$Rt, ldst_so_reg:$shift)]> { bits<4> Rt; bits<17> shift; let shift{4} = 0; // Inst{4} = 0 @@ -1655,33 +1594,18 @@ def ATOMCMPXCHG6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), NoItinerary, []>; } -def NOP : AI<(outs), (ins), MiscFrm, NoItinerary, "nop", "", []>, - Requires<[IsARM, HasV6T2]> { - let Inst{27-16} = 0b001100100000; - let Inst{15-8} = 0b11110000; - let Inst{7-0} = 0b00000000; +def HINT : AI<(outs), (ins imm0_255:$imm), MiscFrm, NoItinerary, + "hint", "\t$imm", []>, Requires<[IsARM, HasV6]> { + bits<8> imm; + let Inst{27-8} = 0b00110010000011110000; + let Inst{7-0} = imm; } -def YIELD : AI<(outs), (ins), MiscFrm, NoItinerary, "yield", "", []>, - Requires<[IsARM, HasV6T2]> { - let Inst{27-16} = 0b001100100000; - let Inst{15-8} = 0b11110000; - let Inst{7-0} = 0b00000001; -} - -def WFE : AI<(outs), (ins), MiscFrm, NoItinerary, "wfe", "", []>, - Requires<[IsARM, HasV6T2]> { - let Inst{27-16} = 0b001100100000; - let Inst{15-8} = 0b11110000; - let Inst{7-0} = 0b00000010; -} - -def WFI : AI<(outs), (ins), MiscFrm, NoItinerary, "wfi", "", []>, - Requires<[IsARM, HasV6T2]> { - let Inst{27-16} = 0b001100100000; - let Inst{15-8} = 0b11110000; - let Inst{7-0} = 0b00000011; -} +def : InstAlias<"nop$p", (HINT 0, pred:$p)>, Requires<[IsARM, HasV6T2]>; +def : InstAlias<"yield$p", (HINT 1, pred:$p)>, Requires<[IsARM, HasV6T2]>; +def : InstAlias<"wfe$p", (HINT 2, pred:$p)>, Requires<[IsARM, HasV6T2]>; +def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6T2]>; +def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6T2]>; def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel", "\t$Rd, $Rn, $Rm", []>, Requires<[IsARM, HasV6]> { @@ -1694,16 +1618,10 @@ def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel", let Inst{27-20} = 0b01101000; let Inst{7-4} = 0b1011; let Inst{11-8} = 0b1111; + let Unpredictable{11-8} = 0b1111; } -def SEV : AI<(outs), (ins), MiscFrm, NoItinerary, "sev", "", - []>, Requires<[IsARM, HasV6T2]> { - let Inst{27-16} = 0b001100100000; - let Inst{15-8} 
= 0b11110000; - let Inst{7-0} = 0b00000100; -} - -// The i32imm operand $val can be used by a debugger to store more information +// The 16-bit operand $val can be used by a debugger to store more information // about the breakpoint. def BKPT : AI<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary, "bkpt", "\t$val", []>, Requires<[IsARM]> { @@ -1922,7 +1840,7 @@ let isCall = 1, // at least be a pseudo instruction expanding to the predicated version // at MC lowering time. Defs = [LR], Uses = [SP] in { - def BL : ABXI<0b1011, (outs), (ins bl_target:$func, variable_ops), + def BL : ABXI<0b1011, (outs), (ins bl_target:$func), IIC_Br, "bl\t$func", [(ARMcall tglobaladdr:$func)]>, Requires<[IsARM]> { @@ -1932,7 +1850,7 @@ let isCall = 1, let DecoderMethod = "DecodeBranchImmInstruction"; } - def BL_pred : ABI<0b1011, (outs), (ins bl_target:$func, variable_ops), + def BL_pred : ABI<0b1011, (outs), (ins bl_target:$func), IIC_Br, "bl", "\t$func", [(ARMcall_pred tglobaladdr:$func)]>, Requires<[IsARM]> { @@ -1942,7 +1860,7 @@ let isCall = 1, } // ARMv5T and above - def BLX : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm, + def BLX : AXI<(outs), (ins GPR:$func), BrMiscFrm, IIC_Br, "blx\t$func", [(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T]> { @@ -1951,7 +1869,7 @@ let isCall = 1, let Inst{3-0} = func; } - def BLX_pred : AI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm, + def BLX_pred : AI<(outs), (ins GPR:$func), BrMiscFrm, IIC_Br, "blx", "\t$func", [(ARMcall_pred GPR:$func)]>, Requires<[IsARM, HasV5T]> { @@ -1962,19 +1880,18 @@ let isCall = 1, // ARMv4T // Note: Restrict $func to the tGPR regclass to prevent it being in LR. - def BX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func, variable_ops), + def BX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func), 8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, Requires<[IsARM, HasV4T]>; // ARMv4 - def BMOVPCRX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func, variable_ops), + def BMOVPCRX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func), 8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, Requires<[IsARM, NoV4T]>; // mov lr, pc; b if callee is marked noreturn to avoid confusing the // return stack predictor. - def BMOVPCB_CALL : ARMPseudoInst<(outs), - (ins bl_target:$func, variable_ops), + def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins bl_target:$func), 8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>, Requires<[IsARM]>; } @@ -2044,18 +1961,16 @@ def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func", // Tail calls. 
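The `variable_ops` deletions in the BL/BLX definitions above, and in the tail-call pseudos that follow, share one rationale: argument registers are attached to a call as explicit register operands when the call is built, so the static operand list no longer needs an open-ended tail. A minimal sketch of the before/after shape, using local stand-in records so it parses with llvm-tblgen on its own (the real `ins` and `variable_ops` markers are defined in llvm/Target/Target.td):

// Stand-ins for this sketch only.
def ins;
def variable_ops;
def func;   // stand-in for the bl_target operand
class CallSketch<dag iops> { dag InOperandList = iops; }
// Before: any number of extra operands may be appended per instance.
def BL_before : CallSketch<(ins func, variable_ops)>;
// After: only the declared operands are accepted; argument registers are
// added as explicit uses when the call node is created.
def BL_after : CallSketch<(ins func)>;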
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { - def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst, variable_ops), - IIC_Br, []>; + def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst), IIC_Br, []>; - def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst, variable_ops), - IIC_Br, []>; + def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>; - def TAILJMPd : ARMPseudoExpand<(outs), (ins br_target:$dst, variable_ops), + def TAILJMPd : ARMPseudoExpand<(outs), (ins br_target:$dst), 4, IIC_Br, [], (Bcc br_target:$dst, (ops 14, zero_reg))>, Requires<[IsARM]>; - def TAILJMPr : ARMPseudoExpand<(outs), (ins tcGPR:$dst, variable_ops), + def TAILJMPr : ARMPseudoExpand<(outs), (ins tcGPR:$dst), 4, IIC_Br, [], (BX GPR:$dst)>, Requires<[IsARM]>; @@ -2509,6 +2424,7 @@ multiclass AI2_stridx<bit isByte, string opc, let Inst{23} = offset{12}; let Inst{19-16} = addr; let Inst{11-0} = offset{11-0}; + let Inst{4} = 0; let DecoderMethod = "DecodeAddrMode2IdxInstruction"; } @@ -2768,7 +2684,7 @@ defm STRHT : AI3strT<0b1011, "strht">; multiclass arm_ldst_mult<string asm, string sfx, bit L_bit, bit P_bit, Format f, InstrItinClass itin, InstrItinClass itin_upd> { // IA is the default, so no need for an explicit suffix on the - // mnemonic here. Without it is the cannonical spelling. + // mnemonic here. Without it is the canonical spelling. def IA : AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), IndexModeNone, f, itin, @@ -3163,6 +3079,11 @@ def : ARMPat<(add GPR:$src, so_imm_neg:$imm), def : ARMPat<(ARMaddc GPR:$src, so_imm_neg:$imm), (SUBSri GPR:$src, so_imm_neg:$imm)>; +def : ARMPat<(add GPR:$src, imm0_65535_neg:$imm), + (SUBrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>; +def : ARMPat<(ARMaddc GPR:$src, imm0_65535_neg:$imm), + (SUBSrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>; + // The with-carry-in form matches bitwise not instead of the negation. // Effectively, the inverse interpretation of the carry flag already accounts // for part of the negation. @@ -3190,7 +3111,7 @@ class AAI<bits<8> op27_20, bits<8> op11_4, string opc, let Inst{19-16} = Rn; let Inst{15-12} = Rd; let Inst{3-0} = Rm; - + let Unpredictable{11-8} = 0b1111; } @@ -3482,27 +3403,28 @@ class AsMul1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, // FIXME: The v5 pseudos are only necessary for the additional Constraint // property. Remove them when it's possible to add those properties -// on an individual MachineInstr, not just an instuction description. -let isCommutable = 1 in { -def MUL : AsMul1I32<0b0000000, (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm), - IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", - [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))]>, - Requires<[IsARM, HasV6]> { +// on an individual MachineInstr, not just an instruction description. 
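The `let` block that follows introduces `TwoOperandAliasConstraint`, the hook that makes the hand-written two-operand shift and `mul` aliases deleted later in this patch redundant: naming a source operand as tied to the destination lets the generated assembly matcher also accept `mul r0, r1` as shorthand for `mul r0, r0, r1`. A simplified, self-contained sketch of the field (the real declaration lives on `Instruction` in llvm/Target/Target.td; the class and def names here are made up):

// Stand-in class for illustration only.
class InstructionSketch {
  string AsmString = "";
  string TwoOperandAliasConstraint = "";
}
def MUL_sketch : InstructionSketch {
  let AsmString = "mul\t$Rd, $Rn, $Rm";
  // Tying $Rn to $Rd also admits the two-operand spelling "mul $Rd, $Rm".
  let TwoOperandAliasConstraint = "$Rn = $Rd";
}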
+let isCommutable = 1, TwoOperandAliasConstraint = "$Rn = $Rd" in { +def MUL : AsMul1I32<0b0000000, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm), + IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", + [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))]>, + Requires<[IsARM, HasV6]> { let Inst{15-12} = 0b0000; let Unpredictable{15-12} = 0b1111; } let Constraints = "@earlyclobber $Rd" in def MULv5: ARMPseudoExpand<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm, - pred:$p, cc_out:$s), - 4, IIC_iMUL32, - [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))], - (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>, - Requires<[IsARM, NoV6]>; + pred:$p, cc_out:$s), + 4, IIC_iMUL32, + [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))], + (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>, + Requires<[IsARM, NoV6]>; } def MLA : AsMul1I32<0b0000001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), - IIC_iMAC32, "mla", "\t$Rd, $Rn, $Rm, $Ra", + IIC_iMAC32, "mla", "\t$Rd, $Rn, $Rm, $Ra", [(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))]>, Requires<[IsARM, HasV6]> { bits<4> Ra; @@ -3511,8 +3433,8 @@ def MLA : AsMul1I32<0b0000001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), let Constraints = "@earlyclobber $Rd" in def MLAv5: ARMPseudoExpand<(outs GPR:$Rd), - (ins GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s), - 4, IIC_iMAC32, + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s), + 4, IIC_iMAC32, [(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))], (MLA GPR:$Rd, GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s)>, Requires<[IsARM, NoV6]>; @@ -3630,8 +3552,7 @@ def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd), def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), - IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra", - [(set GPR:$Rd, (sub GPR:$Ra, (mulhs GPR:$Rn, GPR:$Rm)))]>, + IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra", []>, Requires<[IsARM, HasV6]>; def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd), @@ -3912,49 +3833,85 @@ def : ARMPat<(ARMcmpZ GPR:$src, so_reg_imm:$rhs), def : ARMPat<(ARMcmpZ GPR:$src, so_reg_reg:$rhs), (CMPrsr GPR:$src, so_reg_reg:$rhs)>; -// FIXME: We have to be careful when using the CMN instruction and comparison -// with 0. One would expect these two pieces of code should give identical -// results: -// -// rsbs r1, r1, 0 -// cmp r0, r1 -// mov r0, #0 -// it ls -// mov r0, #1 -// -// and: -// -// cmn r0, r1 -// mov r0, #0 -// it ls -// mov r0, #1 -// -// However, the CMN gives the *opposite* result when r1 is 0. This is because -// the carry flag is set in the CMP case but not in the CMN case. In short, the -// CMP instruction doesn't perform a truncate of the (logical) NOT of 0 plus the -// value of r0 and the carry bit (because the "carry bit" parameter to -// AddWithCarry is defined as 1 in this case, the carry flag will always be set -// when r0 >= 0). The CMN instruction doesn't perform a NOT of 0 so there is -// never a "carry" when this AddWithCarry is performed (because the "carry bit" -// parameter to AddWithCarry is defined as 0). -// -// When x is 0 and unsigned: -// -// x = 0 -// ~x = 0xFFFF FFFF -// ~x + 1 = 0x1 0000 0000 -// (-x = 0) != (0x1 0000 0000 = ~x + 1) -// -// Therefore, we should disable CMN when comparing against zero, until we can -// limit when the CMN instruction is used (when we know that the RHS is not 0 or -// when it's a comparison which doesn't look at the 'carry' flag). -// -// (See the ARM docs for the "AddWithCarry" pseudo-code.) 
-// -// This is related to <rdar://problem/7569620>. -// -//defm CMN : AI1_cmp_irs<0b1011, "cmn", -// BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>; +// CMN register-integer +let isCompare = 1, Defs = [CPSR] in { +def CMNri : AI1<0b1011, (outs), (ins GPR:$Rn, so_imm:$imm), DPFrm, IIC_iCMPi, + "cmn", "\t$Rn, $imm", + [(ARMcmn GPR:$Rn, so_imm:$imm)]> { + bits<4> Rn; + bits<12> imm; + let Inst{25} = 1; + let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b0000; + let Inst{11-0} = imm; + + let Unpredictable{15-12} = 0b1111; +} + +// CMN register-register/shift +def CMNzrr : AI1<0b1011, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, IIC_iCMPr, + "cmn", "\t$Rn, $Rm", + [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))> + GPR:$Rn, GPR:$Rm)]> { + bits<4> Rn; + bits<4> Rm; + let isCommutable = 1; + let Inst{25} = 0; + let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b0000; + let Inst{11-4} = 0b00000000; + let Inst{3-0} = Rm; + + let Unpredictable{15-12} = 0b1111; +} + +def CMNzrsi : AI1<0b1011, (outs), + (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, IIC_iCMPsr, + "cmn", "\t$Rn, $shift", + [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))> + GPR:$Rn, so_reg_imm:$shift)]> { + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b0000; + let Inst{11-5} = shift{11-5}; + let Inst{4} = 0; + let Inst{3-0} = shift{3-0}; + + let Unpredictable{15-12} = 0b1111; +} + +def CMNzrsr : AI1<0b1011, (outs), + (ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, IIC_iCMPsr, + "cmn", "\t$Rn, $shift", + [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))> + GPRnopc:$Rn, so_reg_reg:$shift)]> { + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b0000; + let Inst{11-8} = shift{11-8}; + let Inst{7} = 0; + let Inst{6-5} = shift{6-5}; + let Inst{4} = 1; + let Inst{3-0} = shift{3-0}; + + let Unpredictable{15-12} = 0b1111; +} + +} + +def : ARMPat<(ARMcmp GPR:$src, so_imm_neg:$imm), + (CMNri GPR:$src, so_imm_neg:$imm)>; + +def : ARMPat<(ARMcmpZ GPR:$src, so_imm_neg:$imm), + (CMNri GPR:$src, so_imm_neg:$imm)>; // Note that TST/TEQ don't set all the same flags that CMP does! defm TST : AI1_cmp_irs<0b1000, "tst", @@ -3964,16 +3921,6 @@ defm TEQ : AI1_cmp_irs<0b1001, "teq", IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr, BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>, 1>; -defm CMNz : AI1_cmp_irs<0b1011, "cmn", - IIC_iCMPi, IIC_iCMPr, IIC_iCMPsr, - BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>>; - -//def : ARMPat<(ARMcmp GPR:$src, so_imm_neg:$imm), -// (CMNri GPR:$src, so_imm_neg:$imm)>; - -def : ARMPat<(ARMcmpZ GPR:$src, so_imm_neg:$imm), - (CMNzri GPR:$src, so_imm_neg:$imm)>; - // Pseudo i64 compares for some floating point compares. let usesCustomInserter = 1, isBranch = 1, isTerminator = 1, Defs = [CPSR] in { @@ -4242,6 +4189,13 @@ let usesCustomInserter = 1 in { } } +let usesCustomInserter = 1 in { + def COPY_STRUCT_BYVAL_I32 : PseudoInst< + (outs), (ins GPR:$dst, GPR:$src, i32imm:$size, i32imm:$alignment), + NoItinerary, + [(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size, imm:$alignment)]>; +} + let mayLoad = 1 in { def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr), NoItinerary, @@ -4280,10 +4234,10 @@ def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex", []>, // SWP/SWPB are deprecated in V6/V7. 
let mayLoad = 1, mayStore = 1 in { -def SWP : AIswp<0, (outs GPRnopc:$Rt), (ins GPRnopc:$Rt2, addr_offset_none:$addr), - "swp", []>; -def SWPB: AIswp<1, (outs GPRnopc:$Rt), (ins GPRnopc:$Rt2, addr_offset_none:$addr), - "swpb", []>; +def SWP : AIswp<0, (outs GPRnopc:$Rt), + (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swp", []>; +def SWPB: AIswp<1, (outs GPRnopc:$Rt), + (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swpb", []>; } //===----------------------------------------------------------------------===// @@ -4609,8 +4563,8 @@ class MovRRCopro<string opc, bit direction, list<dag> pattern = []> } def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */, - [(int_arm_mcrr imm:$cop, imm:$opc1, GPRnopc:$Rt, GPRnopc:$Rt2, - imm:$CRm)]>; + [(int_arm_mcrr imm:$cop, imm:$opc1, GPRnopc:$Rt, + GPRnopc:$Rt2, imm:$CRm)]>; def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>; class MovRRCopro2<string opc, bit direction, list<dag> pattern = []> @@ -4637,8 +4591,8 @@ class MovRRCopro2<string opc, bit direction, list<dag> pattern = []> } def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */, - [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPRnopc:$Rt, GPRnopc:$Rt2, - imm:$CRm)]>; + [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPRnopc:$Rt, + GPRnopc:$Rt2, imm:$CRm)]>; def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */>; //===----------------------------------------------------------------------===// @@ -4658,7 +4612,8 @@ def MRS : ABI<0b0001, (outs GPRnopc:$Rd), (ins), NoItinerary, let Unpredictable{11-0} = 0b110100001111; } -def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPRnopc:$Rd, pred:$p)>, Requires<[IsARM]>; +def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPRnopc:$Rd, pred:$p)>, + Requires<[IsARM]>; // The MRSsys instruction is the MRS instruction from the ARM ARM, // section B9.3.9, with the R bit set to 1. @@ -5114,7 +5069,7 @@ def : ARMInstAlias<"add${s}${p} $Rd, $imm", (SUBri GPR:$Rd, GPR:$Rd, so_imm_neg:$imm, pred:$p, cc_out:$s)>; // Same for CMP <--> CMN via so_imm_neg def : ARMInstAlias<"cmp${p} $Rd, $imm", - (CMNzri rGPR:$Rd, so_imm_neg:$imm, pred:$p)>; + (CMNri rGPR:$Rd, so_imm_neg:$imm, pred:$p)>; def : ARMInstAlias<"cmn${p} $Rd, $imm", (CMPri rGPR:$Rd, so_imm_neg:$imm, pred:$p)>; @@ -5123,6 +5078,7 @@ def : ARMInstAlias<"cmn${p} $Rd, $imm", // FIXME: We need C++ parser hooks to map the alias to the MOV // encoding. It seems we should be able to do that sort of thing // in tblgen, but it could get ugly. +let TwoOperandAliasConstraint = "$Rm = $Rd" in { def ASRi : ARMAsmPseudo<"asr${s}${p} $Rd, $Rm, $imm", (ins GPR:$Rd, GPR:$Rm, imm0_32:$imm, pred:$p, cc_out:$s)>; @@ -5135,8 +5091,10 @@ def LSLi : ARMAsmPseudo<"lsl${s}${p} $Rd, $Rm, $imm", def RORi : ARMAsmPseudo<"ror${s}${p} $Rd, $Rm, $imm", (ins GPR:$Rd, GPR:$Rm, imm0_31:$imm, pred:$p, cc_out:$s)>; +} def RRXi : ARMAsmPseudo<"rrx${s}${p} $Rd, $Rm", (ins GPRnopc:$Rd, GPRnopc:$Rm, pred:$p, cc_out:$s)>; +let TwoOperandAliasConstraint = "$Rn = $Rd" in { def ASRr : ARMAsmPseudo<"asr${s}${p} $Rd, $Rn, $Rm", (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>; @@ -5149,32 +5107,7 @@ def LSLr : ARMAsmPseudo<"lsl${s}${p} $Rd, $Rn, $Rm", def RORr : ARMAsmPseudo<"ror${s}${p} $Rd, $Rn, $Rm", (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>; -// shifter instructions also support a two-operand form. 
-def : ARMInstAlias<"asr${s}${p} $Rm, $imm", - (ASRi GPR:$Rm, GPR:$Rm, imm0_32:$imm, pred:$p, cc_out:$s)>; -def : ARMInstAlias<"lsr${s}${p} $Rm, $imm", - (LSRi GPR:$Rm, GPR:$Rm, imm0_32:$imm, pred:$p, cc_out:$s)>; -def : ARMInstAlias<"lsl${s}${p} $Rm, $imm", - (LSLi GPR:$Rm, GPR:$Rm, imm0_31:$imm, pred:$p, cc_out:$s)>; -def : ARMInstAlias<"ror${s}${p} $Rm, $imm", - (RORi GPR:$Rm, GPR:$Rm, imm0_31:$imm, pred:$p, cc_out:$s)>; -def : ARMInstAlias<"asr${s}${p} $Rn, $Rm", - (ASRr GPRnopc:$Rn, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, - cc_out:$s)>; -def : ARMInstAlias<"lsr${s}${p} $Rn, $Rm", - (LSRr GPRnopc:$Rn, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, - cc_out:$s)>; -def : ARMInstAlias<"lsl${s}${p} $Rn, $Rm", - (LSLr GPRnopc:$Rn, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, - cc_out:$s)>; -def : ARMInstAlias<"ror${s}${p} $Rn, $Rm", - (RORr GPRnopc:$Rn, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, - cc_out:$s)>; - - -// 'mul' instruction can be specified with only two operands. -def : ARMInstAlias<"mul${s}${p} $Rn, $Rm", - (MUL rGPR:$Rn, rGPR:$Rm, rGPR:$Rn, pred:$p, cc_out:$s)>; +} // "neg" is and alias for "rsb rd, rn, #0" def : ARMInstAlias<"neg${s}${p} $Rd, $Rm", diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index fd8ac0b..d4afa33 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -1962,7 +1962,7 @@ def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16, let Inst{4} = Rn{5}; } -def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt, +def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt, addrmode6oneL32> { let Inst{7} = lane{0}; let Inst{5-4} = Rn{5-4}; @@ -2300,14 +2300,14 @@ class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd), (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>; class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd), (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>; @@ -2325,7 +2325,7 @@ class N2VN<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType TyD, ValueType TyQ, Intrinsic IntOp> + ValueType TyD, ValueType TyQ, SDPatternOperator IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$Vd), (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", [(set DPR:$Vd, (TyD (IntOp (TyQ QPR:$Vm))))]>; @@ -2343,7 +2343,7 @@ class N2VL<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, class N2VLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType TyQ, ValueType TyD, Intrinsic IntOp> + ValueType TyQ, ValueType TyD, SDPatternOperator IntOp> : N2V<op24_23, op21_20, 
op19_18, op17_16, op11_7, op6, op4, (outs QPR:$Vd), (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", [(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vm))))]>; @@ -2368,6 +2368,8 @@ class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = Commutable; } // Same as N3VD but no data type. @@ -2379,6 +2381,8 @@ class N3VDX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, "$Vd, $Vn, $Vm", "", [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>{ + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = Commutable; } @@ -2391,6 +2395,8 @@ class N3VDSL<bits<2> op21_20, bits<4> op11_8, [(set (Ty DPR:$Vd), (Ty (ShOp (Ty DPR:$Vn), (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),imm:$lane)))))]> { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = 0; } class N3VDSL16<bits<2> op21_20, bits<4> op11_8, @@ -2401,6 +2407,8 @@ class N3VDSL16<bits<2> op21_20, bits<4> op11_8, [(set (Ty DPR:$Vd), (Ty (ShOp (Ty DPR:$Vn), (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = 0; } @@ -2411,6 +2419,8 @@ class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = Commutable; } class N3VQX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, @@ -2420,6 +2430,8 @@ class N3VQX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin, OpcodeStr, "$Vd, $Vn, $Vm", "", [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>{ + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = Commutable; } class N3VQSL<bits<2> op21_20, bits<4> op11_8, @@ -2432,6 +2444,8 @@ class N3VQSL<bits<2> op21_20, bits<4> op11_8, (ResTy (ShOp (ResTy QPR:$Vn), (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm), imm:$lane)))))]> { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = 0; } class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, @@ -2443,21 +2457,25 @@ class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, (ResTy (ShOp (ResTy QPR:$Vn), (ResTy (NEONvduplane (OpTy DPR_8:$Vm), imm:$lane)))))]> { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = 0; } // Basic 3-register intrinsics, both double- and quad-register. 
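The recurring parameter change in these NEON intrinsic classes, `Intrinsic IntOp` becoming `SDPatternOperator IntOp` (starting with N3VDInt just below), widens the accepted record type so the same class or multiclass can be instantiated with either a target intrinsic or a generic SelectionDAG node; that is what later allows vclz and vcnt to be selected from the generic `ctlz`/`ctpop` nodes instead of `int_arm_neon_vclz`/`int_arm_neon_vcnt`. The class relationship, sketched and much simplified from TargetSelectionDAG.td and Intrinsics.td:

// SDPatternOperator is the common base, so a parameter of that type can be
// bound to either kind of record (bodies omitted; this still parses alone).
class SDPatternOperator;
class SDNode : SDPatternOperator;      // generic nodes, e.g. ctlz, ctpop
class Intrinsic : SDPatternOperator;   // target intrinsics, e.g. int_arm_neon_vqadds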
class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, Format f, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), f, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = Commutable; } class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, - string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp> + string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp> : N3VLane32<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", @@ -2468,7 +2486,7 @@ class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, let isCommutable = 0; } class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, - string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp> + string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp> : N3VLane16<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", @@ -2479,26 +2497,29 @@ class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, } class N3VDIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, Format f, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs DPR:$Vd), (ins DPR:$Vm, DPR:$Vn), f, itin, OpcodeStr, Dt, "$Vd, $Vm, $Vn", "", [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (OpTy DPR:$Vn))))]> { + let TwoOperandAliasConstraint = "$Vm = $Vd"; let isCommutable = 0; } class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, Format f, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 1, op4, (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), f, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> { + // All of these have a two-operand InstAlias. 
+ let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = Commutable; } class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N3VLane32<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", @@ -2510,7 +2531,7 @@ class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, } class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N3VLane16<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", @@ -2522,11 +2543,12 @@ class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, } class N3VQIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, Format f, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N3V<op24, op23, op21_20, op11_8, 1, op4, (outs QPR:$Vd), (ins QPR:$Vm, QPR:$Vn), f, itin, OpcodeStr, Dt, "$Vd, $Vm, $Vn", "", [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm), (OpTy QPR:$Vn))))]> { + let TwoOperandAliasConstraint = "$Vm = $Vd"; let isCommutable = 0; } @@ -2606,7 +2628,7 @@ class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, // Neon Intrinsic-Op instructions (VABA): double- and quad-register. class N3VDIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType Ty, Intrinsic IntOp, SDNode OpNode> + ValueType Ty, SDPatternOperator IntOp, SDNode OpNode> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", @@ -2614,7 +2636,7 @@ class N3VDIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, (Ty (IntOp (Ty DPR:$Vn), (Ty DPR:$Vm))))))]>; class N3VQIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType Ty, Intrinsic IntOp, SDNode OpNode> + ValueType Ty, SDPatternOperator IntOp, SDNode OpNode> : N3V<op24, op23, op21_20, op11_8, 1, op4, (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", @@ -2625,7 +2647,7 @@ class N3VQIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // The destination register is also used as the first source operand register. 
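The comment above is realized through the `"$src1 = $Vd"` string these accumulating classes pass as their constraint argument: it ends up in the instruction's `Constraints` field and forces the register allocator to put the accumulator input and the result in the same register, i.e. a read-modify-write operand. A self-contained sketch of the idiom (stand-in class; in the NEON definitions the string is the `cstr` parameter threaded down to the real Instruction record):

class AccSketch<string cstr> { string Constraints = cstr; }
// The result register overwrites the $src1 accumulator in place.
def VMLA_sketch : AccSketch<"$src1 = $Vd">;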
class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", @@ -2633,7 +2655,7 @@ class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>; class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N3V<op24, op23, op21_20, op11_8, 1, op4, (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", @@ -2678,7 +2700,7 @@ class N3VLMulOpSL16<bit op24, bits<2> op21_20, bits<4> op11_8, // Long Intrinsic-Op vector operations with explicit extend (VABAL). class N3VLIntExtOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp, + ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, SDNode ExtOp, SDNode OpNode> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, @@ -2691,7 +2713,7 @@ class N3VLIntExtOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // a quad-register and is also used as the first source operand register. class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType TyQ, ValueType TyD, Intrinsic IntOp> + ValueType TyQ, ValueType TyD, SDPatternOperator IntOp> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", @@ -2699,7 +2721,7 @@ class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$Vn), (TyD DPR:$Vm))))]>; class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), @@ -2712,7 +2734,7 @@ class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, imm:$lane)))))]>; class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), @@ -2727,7 +2749,7 @@ class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8, // Narrowing 3-register intrinsics. 
class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, ValueType TyD, ValueType TyQ, - Intrinsic IntOp, bit Commutable> + SDPatternOperator IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs DPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINi4D, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", @@ -2780,7 +2802,7 @@ class N3VLExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // Long 3-register intrinsics with explicit extend (VABDL). class N3VLIntExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp, + ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, SDNode ExtOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, @@ -2793,7 +2815,7 @@ class N3VLIntExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // Long 3-register intrinsics. class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType TyQ, ValueType TyD, Intrinsic IntOp, bit Commutable> + ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", @@ -2802,7 +2824,7 @@ class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, } class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", @@ -2812,7 +2834,7 @@ class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, imm:$lane)))))]>; class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", @@ -2830,6 +2852,8 @@ class N3VW<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", [(set QPR:$Vd, (OpNode (TyQ QPR:$Vn), (TyQ (ExtOp (TyD DPR:$Vm)))))]> { + // All of these have a two-operand InstAlias. 
+ let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = Commutable; } @@ -2837,14 +2861,14 @@ class N3VW<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N2VDPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd), (ins DPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "", [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>; class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd), (ins QPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "", [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>; @@ -2855,7 +2879,7 @@ class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vm), IIC_VPALiD, OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd", @@ -2863,7 +2887,7 @@ class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vm), IIC_VPALiQ, OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd", @@ -2871,6 +2895,7 @@ class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, // Shift by immediate, // both double- and quad-register. +let TwoOperandAliasConstraint = "$Vm = $Vd" in { class N2VDSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, Format f, InstrItinClass itin, Operand ImmTy, string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode> @@ -2885,6 +2910,7 @@ class N2VQSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, (outs QPR:$Vd), (ins QPR:$Vm, ImmTy:$SIMM), f, itin, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", [(set QPR:$Vd, (Ty (OpNode (Ty QPR:$Vm), (i32 imm:$SIMM))))]>; +} // Long shift by immediate. class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, @@ -2908,6 +2934,7 @@ class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, // Shift right by immediate and accumulate, // both double- and quad-register. +let TwoOperandAliasConstraint = "$Vm = $Vd" in { class N2VDShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, Operand ImmTy, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> @@ -2924,9 +2951,11 @@ class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", [(set QPR:$Vd, (Ty (add QPR:$src1, (Ty (ShOp QPR:$Vm, (i32 imm:$SIMM))))))]>; +} // Shift by immediate and insert, // both double- and quad-register. 
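The shift-and-insert classes below are wrapped, like the plain shift and shift-accumulate classes above, in a top-level `let TwoOperandAliasConstraint = "$Vm = $Vd" in { ... }` region. A top-level `let` overrides the named field in every record defined inside its braces, which is why a single added line can retire a whole block of hand-written aliases. A minimal parseable sketch with made-up names:

class InstSketch { string TwoOperandAliasConstraint = ""; }
// Every def inside the braces receives the same field override.
let TwoOperandAliasConstraint = "$Vm = $Vd" in {
def VSHR_sketch : InstSketch;
def VSRA_sketch : InstSketch;
}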
+let TwoOperandAliasConstraint = "$Vm = $Vd" in { class N2VDShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, Operand ImmTy, Format f, string OpcodeStr, string Dt, ValueType Ty,SDNode ShOp> @@ -2941,19 +2970,20 @@ class N2VQShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, (ins QPR:$src1, QPR:$Vm, ImmTy:$SIMM), f, IIC_VSHLiQ, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", [(set QPR:$Vd, (Ty (ShOp QPR:$src1, QPR:$Vm, (i32 imm:$SIMM))))]>; +} // Convert, with fractional bits immediate, // both double- and quad-register. class N2VCvtD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, - Intrinsic IntOp> + SDPatternOperator IntOp> : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$Vd), (ins DPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm, IIC_VUNAD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (i32 imm:$SIMM))))]>; class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, - Intrinsic IntOp> + SDPatternOperator IntOp> : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$Vd), (ins QPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm, IIC_VUNAQ, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", @@ -3023,7 +3053,7 @@ multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, multiclass N2VInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, bits<5> op11_7, bit op4, InstrItinClass itinD, InstrItinClass itinQ, - string OpcodeStr, string Dt, Intrinsic IntOp> { + string OpcodeStr, string Dt, SDPatternOperator IntOp> { // 64-bit vector types. def v8i8 : N2VDInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, itinD, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>; @@ -3064,7 +3094,7 @@ multiclass N2VN_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - Intrinsic IntOp> { + SDPatternOperator IntOp> { def v8i8 : N2VNInt<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4, itin, OpcodeStr, !strconcat(Dt, "16"), v8i8, v8i16, IntOp>; @@ -3152,7 +3182,7 @@ multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, - Intrinsic IntOp, bit Commutable = 0> { + SDPatternOperator IntOp, bit Commutable = 0> { // 64-bit vector types. def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, f, itinD16, OpcodeStr, !strconcat(Dt, "16"), @@ -3173,7 +3203,7 @@ multiclass N3VInt_HSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, - Intrinsic IntOp> { + SDPatternOperator IntOp> { // 64-bit vector types. 
def v4i16 : N3VDIntSh<op24, op23, 0b01, op11_8, op4, f, itinD16, OpcodeStr, !strconcat(Dt, "16"), @@ -3194,7 +3224,7 @@ multiclass N3VInt_HSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, multiclass N3VIntSL_HS<bits<4> op11_8, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, - string OpcodeStr, string Dt, Intrinsic IntOp> { + string OpcodeStr, string Dt, SDPatternOperator IntOp> { def v4i16 : N3VDIntSL16<0b01, op11_8, itinD16, OpcodeStr, !strconcat(Dt, "16"), v4i16, IntOp>; def v2i32 : N3VDIntSL<0b10, op11_8, itinD32, @@ -3210,7 +3240,7 @@ multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, - Intrinsic IntOp, bit Commutable = 0> + SDPatternOperator IntOp, bit Commutable = 0> : N3VInt_HS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, OpcodeStr, Dt, IntOp, Commutable> { def v8i8 : N3VDInt<op24, op23, 0b00, op11_8, op4, f, itinD16, @@ -3224,7 +3254,7 @@ multiclass N3VInt_QHSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, - Intrinsic IntOp> + SDPatternOperator IntOp> : N3VInt_HSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, OpcodeStr, Dt, IntOp> { def v8i8 : N3VDIntSh<op24, op23, 0b00, op11_8, op4, f, itinD16, @@ -3241,7 +3271,7 @@ multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, - Intrinsic IntOp, bit Commutable = 0> + SDPatternOperator IntOp, bit Commutable = 0> : N3VInt_QHS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, OpcodeStr, Dt, IntOp, Commutable> { def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, f, itinD32, @@ -3255,7 +3285,7 @@ multiclass N3VInt_QHSDSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, - Intrinsic IntOp> + SDPatternOperator IntOp> : N3VInt_QHSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, OpcodeStr, Dt, IntOp> { def v1i64 : N3VDIntSh<op24, op23, 0b11, op11_8, op4, f, itinD32, @@ -3270,7 +3300,7 @@ multiclass N3VInt_QHSDSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, // source operand element sizes of 16, 32 and 64 bits: multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, - Intrinsic IntOp, bit Commutable = 0> { + SDPatternOperator IntOp, bit Commutable = 0> { def v8i8 : N3VNInt<op24, op23, 0b00, op11_8, op4, OpcodeStr, !strconcat(Dt, "16"), v8i8, v8i16, IntOp, Commutable>; @@ -3330,7 +3360,7 @@ multiclass N3VLExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, InstrItinClass itin16, InstrItinClass itin32, string OpcodeStr, string Dt, - Intrinsic IntOp, bit Commutable = 0> { + SDPatternOperator IntOp, bit Commutable = 0> { def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, itin16, OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp, Commutable>; @@ -3341,7 +3371,7 @@ multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, - Intrinsic IntOp> { + 
SDPatternOperator IntOp> { def v4i16 : N3VLIntSL16<op24, 0b01, op11_8, itin, OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>; def v2i32 : N3VLIntSL<op24, 0b10, op11_8, itin, @@ -3352,7 +3382,7 @@ multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8, multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, InstrItinClass itin16, InstrItinClass itin32, string OpcodeStr, string Dt, - Intrinsic IntOp, bit Commutable = 0> + SDPatternOperator IntOp, bit Commutable = 0> : N3VLInt_HS<op24, op23, op11_8, op4, itin16, itin32, OpcodeStr, Dt, IntOp, Commutable> { def v8i16 : N3VLInt<op24, op23, 0b00, op11_8, op4, itin16, @@ -3363,7 +3393,7 @@ multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, // ....with explicit extend (VABDL). multiclass N3VLIntExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - Intrinsic IntOp, SDNode ExtOp, bit Commutable = 0> { + SDPatternOperator IntOp, SDNode ExtOp, bit Commutable = 0> { def v8i16 : N3VLIntExt<op24, op23, 0b00, op11_8, op4, itin, OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp, ExtOp, Commutable>; @@ -3436,7 +3466,7 @@ multiclass N3VMulOpSL_HS<bits<4> op11_8, // element sizes of 8, 16 and 32 bits: multiclass N3VIntOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, InstrItinClass itinD, InstrItinClass itinQ, - string OpcodeStr, string Dt, Intrinsic IntOp, + string OpcodeStr, string Dt, SDPatternOperator IntOp, SDNode OpNode> { // 64-bit vector types. def v8i8 : N3VDIntOp<op24, op23, 0b00, op11_8, op4, itinD, @@ -3459,7 +3489,7 @@ multiclass N3VIntOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, // element sizes of 8, 16 and 32 bits: multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, InstrItinClass itinD, InstrItinClass itinQ, - string OpcodeStr, string Dt, Intrinsic IntOp> { + string OpcodeStr, string Dt, SDPatternOperator IntOp> { // 64-bit vector types. 
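The `def v8i8 : ...` entries that follow are templates inside a multiclass, stamped out once per element size: a later `defm` line (for example the VQADD family elsewhere in this file) concatenates the `defm` name with each inner `def` name, yielding records such as VQADDv8i8 and VQADDv4i16. A self-contained sketch of that expansion, with made-up names:

class InstSketch<string mnem> { string Mnemonic = mnem; }
multiclass PerSize<string mnem> {
  def v8i8  : InstSketch<mnem # ".8">;
  def v4i16 : InstSketch<mnem # ".16">;
  def v2i32 : InstSketch<mnem # ".32">;
}
// Produces VADDSketchv8i8, VADDSketchv4i16 and VADDSketchv2i32.
defm VADDSketch : PerSize<"vadd">;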
def v8i8 : N3VDInt3<op24, op23, 0b00, op11_8, op4, itinD, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>; @@ -3506,7 +3536,7 @@ multiclass N3VLMulOpSL_HS<bit op24, bits<4> op11_8, string OpcodeStr, // First with only element sizes of 16 and 32 bits: multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4, InstrItinClass itin16, InstrItinClass itin32, - string OpcodeStr, string Dt, Intrinsic IntOp> { + string OpcodeStr, string Dt, SDPatternOperator IntOp> { def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4, itin16, OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>; def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4, itin32, @@ -3514,7 +3544,7 @@ multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4, } multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8, - string OpcodeStr, string Dt, Intrinsic IntOp> { + string OpcodeStr, string Dt, SDPatternOperator IntOp> { def v4i16 : N3VLInt3SL16<op24, 0b01, op11_8, IIC_VMACi16D, OpcodeStr, !strconcat(Dt,"16"), v4i32, v4i16, IntOp>; def v2i32 : N3VLInt3SL<op24, 0b10, op11_8, IIC_VMACi32D, @@ -3524,7 +3554,7 @@ multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8, // ....then also with element size of 8 bits: multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, InstrItinClass itin16, InstrItinClass itin32, - string OpcodeStr, string Dt, Intrinsic IntOp> + string OpcodeStr, string Dt, SDPatternOperator IntOp> : N3VLInt3_HS<op24, op23, op11_8, op4, itin16, itin32, OpcodeStr, Dt, IntOp> { def v8i16 : N3VLInt3<op24, op23, 0b00, op11_8, op4, itin16, OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>; @@ -3533,7 +3563,7 @@ multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, // ....with explicit extend (VABAL). multiclass N3VLIntExtOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - Intrinsic IntOp, SDNode ExtOp, SDNode OpNode> { + SDPatternOperator IntOp, SDNode ExtOp, SDNode OpNode> { def v8i16 : N3VLIntExtOp<op24, op23, 0b00, op11_8, op4, itin, OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp, ExtOp, OpNode>; @@ -3550,7 +3580,7 @@ multiclass N3VLIntExtOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, // element sizes of 8, 16 and 32 bits: multiclass N2VPLInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, bits<5> op11_7, bit op4, - string OpcodeStr, string Dt, Intrinsic IntOp> { + string OpcodeStr, string Dt, SDPatternOperator IntOp> { // 64-bit vector types. def v8i8 : N2VDPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>; @@ -3573,7 +3603,7 @@ multiclass N2VPLInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, // element sizes of 8, 16 and 32 bits: multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, bits<5> op11_7, bit op4, - string OpcodeStr, string Dt, Intrinsic IntOp> { + string OpcodeStr, string Dt, SDPatternOperator IntOp> { // 64-bit vector types. def v8i8 : N2VDPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4, OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>; @@ -3668,33 +3698,6 @@ multiclass N2VShR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, N2RegVShRFrm, itin, shr_imm64, OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>; // imm6 = xxxxxx - - // Aliases for two-operand forms (source and dest regs the same). 
- def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "8 $Vdn, $imm"), - (!cast<Instruction>(!strconcat(baseOpc, "v8i8")) - DPR:$Vdn, DPR:$Vdn, shr_imm8:$imm, pred:$p)>; - def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "16 $Vdn, $imm"), - (!cast<Instruction>(!strconcat(baseOpc, "v4i16")) - DPR:$Vdn, DPR:$Vdn, shr_imm16:$imm, pred:$p)>; - def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "32 $Vdn, $imm"), - (!cast<Instruction>(!strconcat(baseOpc, "v2i32")) - DPR:$Vdn, DPR:$Vdn, shr_imm32:$imm, pred:$p)>; - def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "64 $Vdn, $imm"), - (!cast<Instruction>(!strconcat(baseOpc, "v1i64")) - DPR:$Vdn, DPR:$Vdn, shr_imm64:$imm, pred:$p)>; - - def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "8 $Vdn, $imm"), - (!cast<Instruction>(!strconcat(baseOpc, "v16i8")) - QPR:$Vdn, QPR:$Vdn, shr_imm8:$imm, pred:$p)>; - def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "16 $Vdn, $imm"), - (!cast<Instruction>(!strconcat(baseOpc, "v8i16")) - QPR:$Vdn, QPR:$Vdn, shr_imm16:$imm, pred:$p)>; - def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "32 $Vdn, $imm"), - (!cast<Instruction>(!strconcat(baseOpc, "v4i32")) - QPR:$Vdn, QPR:$Vdn, shr_imm32:$imm, pred:$p)>; - def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "64 $Vdn, $imm"), - (!cast<Instruction>(!strconcat(baseOpc, "v2i64")) - QPR:$Vdn, QPR:$Vdn, shr_imm64:$imm, pred:$p)>; } // Neon Shift-Accumulate vector operations, @@ -4133,16 +4136,16 @@ def VFMSfq : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32", Requires<[HasVFP4,UseFusedMAC]>; // Match @llvm.fma.* intrinsics -def : Pat<(v2f32 (fma DPR:$src1, DPR:$Vn, DPR:$Vm)), +def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)), (VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>, Requires<[HasVFP4]>; -def : Pat<(v4f32 (fma QPR:$src1, QPR:$Vn, QPR:$Vm)), +def : Pat<(v4f32 (fma QPR:$Vn, QPR:$Vm, QPR:$src1)), (VFMAfq QPR:$src1, QPR:$Vn, QPR:$Vm)>, Requires<[HasVFP4]>; -def : Pat<(v2f32 (fma (fneg DPR:$src1), DPR:$Vn, DPR:$Vm)), +def : Pat<(v2f32 (fma (fneg DPR:$Vn), DPR:$Vm, DPR:$src1)), (VFMSfd DPR:$src1, DPR:$Vn, DPR:$Vm)>, Requires<[HasVFP4]>; -def : Pat<(v4f32 (fma (fneg QPR:$src1), QPR:$Vn, QPR:$Vm)), +def : Pat<(v4f32 (fma (fneg QPR:$Vn), QPR:$Vm, QPR:$src1)), (VFMSfq QPR:$src1, QPR:$Vn, QPR:$Vm)>, Requires<[HasVFP4]>; @@ -4305,6 +4308,7 @@ def VORRiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 0, 1, // VBIC : Vector Bitwise Bit Clear (AND NOT) +let TwoOperandAliasConstraint = "$Vn = $Vd" in { def VBICd : N3VX<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD, "vbic", "$Vd, $Vn, $Vm", "", @@ -4315,6 +4319,7 @@ def VBICq : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd), "vbic", "$Vd, $Vn, $Vm", "", [(set QPR:$Vd, (v4i32 (and QPR:$Vn, (vnotq QPR:$Vm))))]>; +} def VBICiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 1, 1, (outs DPR:$Vd), (ins nImmSplatI16:$SIMM, DPR:$src), @@ -4820,14 +4825,14 @@ defm VCLS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0, // VCLZ : Vector Count Leading Zeros defm VCLZ : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0, IIC_VCNTiD, IIC_VCNTiQ, "vclz", "i", - int_arm_neon_vclz>; + ctlz>; // VCNT : Vector Count One Bits def VCNTd : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, IIC_VCNTiD, "vcnt", "8", - v8i8, v8i8, int_arm_neon_vcnt>; + v8i8, v8i8, ctpop>; def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, IIC_VCNTiQ, "vcnt", "8", - v16i8, v16i8, int_arm_neon_vcnt>; + v16i8, v16i8, ctpop>; // Vector Swap def VSWPd : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 0, 0, @@ -5308,6 +5313,9 @@ def : 
AlignedVEXTq<v2f32, v4f32, DSubReg_i32_reg>; // VEXT : Vector Extract + +// All of these have a two-operand InstAlias. +let TwoOperandAliasConstraint = "$Vn = $Vd" in { class VEXTd<string OpcodeStr, string Dt, ValueType Ty, Operand immTy> : N3V<0,1,0b11,{?,?,?,?},0,0, (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm, immTy:$index), NVExtFrm, @@ -5327,6 +5335,7 @@ class VEXTq<string OpcodeStr, string Dt, ValueType Ty, Operand immTy> bits<4> index; let Inst{11-8} = index{3-0}; } +} def VEXTd8 : VEXTd<"vext", "8", v8i8, imm0_7> { let Inst{11-8} = index{3-0}; @@ -5588,47 +5597,51 @@ def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>; // Vector lengthening move with load, matching extending loads. // extload, zextload and sextload for a standard lengthening load. Example: -// Lengthen_Single<"8", "i16", "i8"> = Pat<(v8i16 (extloadvi8 addrmode5:$addr)) -// (VMOVLuv8i16 (VLDRD addrmode5:$addr))>; +// Lengthen_Single<"8", "i16", "i8"> = +// Pat<(v8i16 (extloadvi8 addrmode6oneL32:$addr)) +// (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr, +// (f64 (IMPLICIT_DEF)), (i32 0)))>; multiclass Lengthen_Single<string DestLanes, string DestTy, string SrcTy> { def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("extloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)), (!cast<Instruction>("VMOVLuv" # DestLanes # DestTy) - (VLDRD addrmode5:$addr))>; + (VLD1LNd32 addrmode6oneL32:$addr, + (f64 (IMPLICIT_DEF)), (i32 0)))>; def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("zextloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)), (!cast<Instruction>("VMOVLuv" # DestLanes # DestTy) - (VLDRD addrmode5:$addr))>; + (VLD1LNd32 addrmode6oneL32:$addr, + (f64 (IMPLICIT_DEF)), (i32 0)))>; def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("sextloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)), (!cast<Instruction>("VMOVLsv" # DestLanes # DestTy) - (VLDRD addrmode5:$addr))>; + (VLD1LNd32 addrmode6oneL32:$addr, + (f64 (IMPLICIT_DEF)), (i32 0)))>; } // extload, zextload and sextload for a lengthening load which only uses // half the lanes available. 
Example: // Lengthen_HalfSingle<"4", "i16", "8", "i16", "i8"> = -// Pat<(v4i16 (extloadvi8 addrmode5:$addr)) -// (EXTRACT_SUBREG (VMOVLuv8i16 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), -// (VLDRS addrmode5:$addr), -// ssub_0)), +// Pat<(v4i16 (extloadvi8 addrmode6oneL32:$addr)), +// (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr, +// (f64 (IMPLICIT_DEF)), (i32 0))), // dsub_0)>; multiclass Lengthen_HalfSingle<string DestLanes, string DestTy, string SrcTy, string InsnLanes, string InsnTy> { def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("extloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)), (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy) - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), ssub_0)), + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)>; def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("zextloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)), (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy) - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), ssub_0)), + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)>; def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("sextloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)), (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # InsnLanes # InsnTy) - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), ssub_0)), + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)>; } @@ -5637,33 +5650,33 @@ multiclass Lengthen_HalfSingle<string DestLanes, string DestTy, string SrcTy, // // Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32", qsub_0> = // Pat<(v4i32 (extloadvi8 addrmode5:$addr)) -// (EXTRACT_SUBREG (VMOVLuv4i32 -// (EXTRACT_SUBREG (VMOVLuv8i16 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), -// (VLDRS addrmode5:$addr), -// ssub_0)), +// (EXTRACT_SUBREG (VMOVLuv4i32 +// (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr, +// (f64 (IMPLICIT_DEF)), +// (i32 0))), // dsub_0)), -// qsub_0)>; +// dsub_0)>; multiclass Lengthen_Double<string DestLanes, string DestTy, string SrcTy, string Insn1Lanes, string Insn1Ty, string Insn2Lanes, string Insn2Ty> { def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("extloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)), (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty) (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty) - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), - ssub_0)), dsub_0))>; + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), + dsub_0))>; def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("zextloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)), (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty) (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty) - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), - ssub_0)), dsub_0))>; + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), + dsub_0))>; def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("sextloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)), (!cast<Instruction>("VMOVLsv" # 
Insn2Lanes # Insn2Ty) (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty) - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), - ssub_0)), dsub_0))>; + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), + dsub_0))>; } // extload, zextload and sextload for a lengthening load followed by another @@ -5671,36 +5684,35 @@ multiclass Lengthen_Double<string DestLanes, string DestTy, string SrcTy, // requiring half the available lanes (a 64-bit outcome instead of a 128-bit). // // Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32"> = -// Pat<(v4i32 (extloadvi8 addrmode5:$addr)) -// (EXTRACT_SUBREG (VMOVLuv4i32 -// (EXTRACT_SUBREG (VMOVLuv8i16 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), -// (VLDRS addrmode5:$addr), -// ssub_0)), -// dsub_0)), -// dsub_0)>; +// Pat<(v4i32 (extloadvi8 addrmode5:$addr)) +// (EXTRACT_SUBREG (VMOVLuv4i32 +// (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr, +// (f64 (IMPLICIT_DEF)), (i32 0))), +// dsub_0)), +// dsub_0)>; multiclass Lengthen_HalfDouble<string DestLanes, string DestTy, string SrcTy, string Insn1Lanes, string Insn1Ty, string Insn2Lanes, string Insn2Ty> { def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("extloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)), (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty) (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty) - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), - ssub_0)), dsub_0)), + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), + dsub_0)), dsub_0)>; def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("zextloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)), (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty) (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty) - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), - ssub_0)), dsub_0)), + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), + dsub_0)), dsub_0)>; def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("sextloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)), (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty) (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty) - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), - ssub_0)), dsub_0)), + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), + dsub_0)), dsub_0)>; } @@ -5720,18 +5732,18 @@ defm : Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32">; defm : Lengthen_Double<"2", "i64", "i16", "4", "i32", "2", "i64">; // Triple lengthening - v2i8 -> v2i16 -> v2i32 -> v2i64 -def : Pat<(v2i64 (extloadvi8 addrmode5:$addr)), +def : Pat<(v2i64 (extloadvi8 addrmode6oneL32:$addr)), (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), ssub_0)), - dsub_0)), dsub_0))>; -def : Pat<(v2i64 (zextloadvi8 addrmode5:$addr)), + (VLD1LNd32 addrmode6oneL32:$addr, + (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; +def : Pat<(v2i64 (zextloadvi8 addrmode6oneL32:$addr)), (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), ssub_0)), - dsub_0)), dsub_0))>; -def : Pat<(v2i64 (sextloadvi8 
addrmode5:$addr)), + (VLD1LNd32 addrmode6oneL32:$addr, + (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; +def : Pat<(v2i64 (sextloadvi8 addrmode6oneL32:$addr)), (VMOVLsv2i64 (EXTRACT_SUBREG (VMOVLsv4i32 (EXTRACT_SUBREG (VMOVLsv8i16 - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), ssub_0)), - dsub_0)), dsub_0))>; + (VLD1LNd32 addrmode6oneL32:$addr, + (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; //===----------------------------------------------------------------------===// // Assembler aliases @@ -5742,69 +5754,6 @@ def : VFP2InstAlias<"fmdhr${p} $Dd, $Rn", def : VFP2InstAlias<"fmdlr${p} $Dd, $Rn", (VSETLNi32 DPR:$Dd, GPR:$Rn, 0, pred:$p)>; - -// VADD two-operand aliases. -def : NEONInstAlias<"vadd${p}.i8 $Vdn, $Vm", - (VADDv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vadd${p}.i16 $Vdn, $Vm", - (VADDv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vadd${p}.i32 $Vdn, $Vm", - (VADDv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vadd${p}.i64 $Vdn, $Vm", - (VADDv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vadd${p}.i8 $Vdn, $Vm", - (VADDv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vadd${p}.i16 $Vdn, $Vm", - (VADDv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vadd${p}.i32 $Vdn, $Vm", - (VADDv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vadd${p}.i64 $Vdn, $Vm", - (VADDv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vadd${p}.f32 $Vdn, $Vm", - (VADDfd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vadd${p}.f32 $Vdn, $Vm", - (VADDfq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - -// VSUB two-operand aliases. -def : NEONInstAlias<"vsub${p}.i8 $Vdn, $Vm", - (VSUBv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vsub${p}.i16 $Vdn, $Vm", - (VSUBv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vsub${p}.i32 $Vdn, $Vm", - (VSUBv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vsub${p}.i64 $Vdn, $Vm", - (VSUBv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vsub${p}.i8 $Vdn, $Vm", - (VSUBv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vsub${p}.i16 $Vdn, $Vm", - (VSUBv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vsub${p}.i32 $Vdn, $Vm", - (VSUBv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vsub${p}.i64 $Vdn, $Vm", - (VSUBv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vsub${p}.f32 $Vdn, $Vm", - (VSUBfd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vsub${p}.f32 $Vdn, $Vm", - (VSUBfq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - -// VADDW two-operand aliases. -def : NEONInstAlias<"vaddw${p}.s8 $Vdn, $Vm", - (VADDWsv8i16 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vaddw${p}.s16 $Vdn, $Vm", - (VADDWsv4i32 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vaddw${p}.s32 $Vdn, $Vm", - (VADDWsv2i64 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vaddw${p}.u8 $Vdn, $Vm", - (VADDWuv8i16 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vaddw${p}.u16 $Vdn, $Vm", - (VADDWuv4i32 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vaddw${p}.u32 $Vdn, $Vm", - (VADDWuv2i64 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>; - // VAND/VBIC/VEOR/VORR accept but do not require a type suffix. 
defm : NEONDTAnyInstAlias<"vand${p}", "$Vd, $Vn, $Vm", (VANDd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>; @@ -5823,23 +5772,6 @@ defm : NEONDTAnyInstAlias<"vorr${p}", "$Vd, $Vn, $Vm", defm : NEONDTAnyInstAlias<"vorr${p}", "$Vd, $Vn, $Vm", (VORRq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>; // ... two-operand aliases -def : NEONInstAlias<"vand${p} $Vdn, $Vm", - (VANDd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vand${p} $Vdn, $Vm", - (VANDq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vbic${p} $Vdn, $Vm", - (VBICd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vbic${p} $Vdn, $Vm", - (VBICq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"veor${p} $Vdn, $Vm", - (VEORd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"veor${p} $Vdn, $Vm", - (VEORq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vorr${p} $Vdn, $Vm", - (VORRd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vorr${p} $Vdn, $Vm", - (VORRq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - defm : NEONDTAnyInstAlias<"vand${p}", "$Vdn, $Vm", (VANDd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; defm : NEONDTAnyInstAlias<"vand${p}", "$Vdn, $Vm", @@ -5853,212 +5785,6 @@ defm : NEONDTAnyInstAlias<"vorr${p}", "$Vdn, $Vm", defm : NEONDTAnyInstAlias<"vorr${p}", "$Vdn, $Vm", (VORRq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -// VMUL two-operand aliases. -def : NEONInstAlias<"vmul${p}.p8 $Qdn, $Qm", - (VMULpq QPR:$Qdn, QPR:$Qdn, QPR:$Qm, pred:$p)>; -def : NEONInstAlias<"vmul${p}.i8 $Qdn, $Qm", - (VMULv16i8 QPR:$Qdn, QPR:$Qdn, QPR:$Qm, pred:$p)>; -def : NEONInstAlias<"vmul${p}.i16 $Qdn, $Qm", - (VMULv8i16 QPR:$Qdn, QPR:$Qdn, QPR:$Qm, pred:$p)>; -def : NEONInstAlias<"vmul${p}.i32 $Qdn, $Qm", - (VMULv4i32 QPR:$Qdn, QPR:$Qdn, QPR:$Qm, pred:$p)>; - -def : NEONInstAlias<"vmul${p}.p8 $Ddn, $Dm", - (VMULpd DPR:$Ddn, DPR:$Ddn, DPR:$Dm, pred:$p)>; -def : NEONInstAlias<"vmul${p}.i8 $Ddn, $Dm", - (VMULv8i8 DPR:$Ddn, DPR:$Ddn, DPR:$Dm, pred:$p)>; -def : NEONInstAlias<"vmul${p}.i16 $Ddn, $Dm", - (VMULv4i16 DPR:$Ddn, DPR:$Ddn, DPR:$Dm, pred:$p)>; -def : NEONInstAlias<"vmul${p}.i32 $Ddn, $Dm", - (VMULv2i32 DPR:$Ddn, DPR:$Ddn, DPR:$Dm, pred:$p)>; - -def : NEONInstAlias<"vmul${p}.f32 $Qdn, $Qm", - (VMULfq QPR:$Qdn, QPR:$Qdn, QPR:$Qm, pred:$p)>; -def : NEONInstAlias<"vmul${p}.f32 $Ddn, $Dm", - (VMULfd DPR:$Ddn, DPR:$Ddn, DPR:$Dm, pred:$p)>; - -def : NEONInstAlias<"vmul${p}.i16 $Ddn, $Dm$lane", - (VMULslv4i16 DPR:$Ddn, DPR:$Ddn, DPR_8:$Dm, - VectorIndex16:$lane, pred:$p)>; -def : NEONInstAlias<"vmul${p}.i16 $Qdn, $Dm$lane", - (VMULslv8i16 QPR:$Qdn, QPR:$Qdn, DPR_8:$Dm, - VectorIndex16:$lane, pred:$p)>; - -def : NEONInstAlias<"vmul${p}.i32 $Ddn, $Dm$lane", - (VMULslv2i32 DPR:$Ddn, DPR:$Ddn, DPR_VFP2:$Dm, - VectorIndex32:$lane, pred:$p)>; -def : NEONInstAlias<"vmul${p}.i32 $Qdn, $Dm$lane", - (VMULslv4i32 QPR:$Qdn, QPR:$Qdn, DPR_VFP2:$Dm, - VectorIndex32:$lane, pred:$p)>; - -def : NEONInstAlias<"vmul${p}.f32 $Ddn, $Dm$lane", - (VMULslfd DPR:$Ddn, DPR:$Ddn, DPR_VFP2:$Dm, - VectorIndex32:$lane, pred:$p)>; -def : NEONInstAlias<"vmul${p}.f32 $Qdn, $Dm$lane", - (VMULslfq QPR:$Qdn, QPR:$Qdn, DPR_VFP2:$Dm, - VectorIndex32:$lane, pred:$p)>; - -// VQADD (register) two-operand aliases. 
-def : NEONInstAlias<"vqadd${p}.s8 $Vdn, $Vm", - (VQADDsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.s16 $Vdn, $Vm", - (VQADDsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.s32 $Vdn, $Vm", - (VQADDsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.s64 $Vdn, $Vm", - (VQADDsv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.u8 $Vdn, $Vm", - (VQADDuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.u16 $Vdn, $Vm", - (VQADDuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.u32 $Vdn, $Vm", - (VQADDuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.u64 $Vdn, $Vm", - (VQADDuv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vqadd${p}.s8 $Vdn, $Vm", - (VQADDsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.s16 $Vdn, $Vm", - (VQADDsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.s32 $Vdn, $Vm", - (VQADDsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.s64 $Vdn, $Vm", - (VQADDsv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.u8 $Vdn, $Vm", - (VQADDuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.u16 $Vdn, $Vm", - (VQADDuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.u32 $Vdn, $Vm", - (VQADDuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.u64 $Vdn, $Vm", - (VQADDuv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - -// VSHL (immediate) two-operand aliases. -def : NEONInstAlias<"vshl${p}.i8 $Vdn, $imm", - (VSHLiv8i8 DPR:$Vdn, DPR:$Vdn, imm0_7:$imm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.i16 $Vdn, $imm", - (VSHLiv4i16 DPR:$Vdn, DPR:$Vdn, imm0_15:$imm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.i32 $Vdn, $imm", - (VSHLiv2i32 DPR:$Vdn, DPR:$Vdn, imm0_31:$imm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.i64 $Vdn, $imm", - (VSHLiv1i64 DPR:$Vdn, DPR:$Vdn, imm0_63:$imm, pred:$p)>; - -def : NEONInstAlias<"vshl${p}.i8 $Vdn, $imm", - (VSHLiv16i8 QPR:$Vdn, QPR:$Vdn, imm0_7:$imm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.i16 $Vdn, $imm", - (VSHLiv8i16 QPR:$Vdn, QPR:$Vdn, imm0_15:$imm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.i32 $Vdn, $imm", - (VSHLiv4i32 QPR:$Vdn, QPR:$Vdn, imm0_31:$imm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.i64 $Vdn, $imm", - (VSHLiv2i64 QPR:$Vdn, QPR:$Vdn, imm0_63:$imm, pred:$p)>; - -// VSHL (register) two-operand aliases. 
-def : NEONInstAlias<"vshl${p}.s8 $Vdn, $Vm", - (VSHLsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.s16 $Vdn, $Vm", - (VSHLsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.s32 $Vdn, $Vm", - (VSHLsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.s64 $Vdn, $Vm", - (VSHLsv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.u8 $Vdn, $Vm", - (VSHLuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.u16 $Vdn, $Vm", - (VSHLuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.u32 $Vdn, $Vm", - (VSHLuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.u64 $Vdn, $Vm", - (VSHLuv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vshl${p}.s8 $Vdn, $Vm", - (VSHLsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.s16 $Vdn, $Vm", - (VSHLsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.s32 $Vdn, $Vm", - (VSHLsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.s64 $Vdn, $Vm", - (VSHLsv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.u8 $Vdn, $Vm", - (VSHLuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.u16 $Vdn, $Vm", - (VSHLuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.u32 $Vdn, $Vm", - (VSHLuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.u64 $Vdn, $Vm", - (VSHLuv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - -// VSHR (immediate) two-operand aliases. -def : NEONInstAlias<"vshr${p}.s8 $Vdn, $imm", - (VSHRsv8i8 DPR:$Vdn, DPR:$Vdn, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.s16 $Vdn, $imm", - (VSHRsv4i16 DPR:$Vdn, DPR:$Vdn, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.s32 $Vdn, $imm", - (VSHRsv2i32 DPR:$Vdn, DPR:$Vdn, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.s64 $Vdn, $imm", - (VSHRsv1i64 DPR:$Vdn, DPR:$Vdn, shr_imm64:$imm, pred:$p)>; - -def : NEONInstAlias<"vshr${p}.s8 $Vdn, $imm", - (VSHRsv16i8 QPR:$Vdn, QPR:$Vdn, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.s16 $Vdn, $imm", - (VSHRsv8i16 QPR:$Vdn, QPR:$Vdn, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.s32 $Vdn, $imm", - (VSHRsv4i32 QPR:$Vdn, QPR:$Vdn, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.s64 $Vdn, $imm", - (VSHRsv2i64 QPR:$Vdn, QPR:$Vdn, shr_imm64:$imm, pred:$p)>; - -def : NEONInstAlias<"vshr${p}.u8 $Vdn, $imm", - (VSHRuv8i8 DPR:$Vdn, DPR:$Vdn, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.u16 $Vdn, $imm", - (VSHRuv4i16 DPR:$Vdn, DPR:$Vdn, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.u32 $Vdn, $imm", - (VSHRuv2i32 DPR:$Vdn, DPR:$Vdn, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.u64 $Vdn, $imm", - (VSHRuv1i64 DPR:$Vdn, DPR:$Vdn, shr_imm64:$imm, pred:$p)>; - -def : NEONInstAlias<"vshr${p}.u8 $Vdn, $imm", - (VSHRuv16i8 QPR:$Vdn, QPR:$Vdn, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.u16 $Vdn, $imm", - (VSHRuv8i16 QPR:$Vdn, QPR:$Vdn, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.u32 $Vdn, $imm", - (VSHRuv4i32 QPR:$Vdn, QPR:$Vdn, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.u64 $Vdn, $imm", - (VSHRuv2i64 QPR:$Vdn, QPR:$Vdn, shr_imm64:$imm, pred:$p)>; - -// VRSHL two-operand aliases. 
-def : NEONInstAlias<"vrshl${p}.s8 $Vdn, $Vm", - (VRSHLsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.s16 $Vdn, $Vm", - (VRSHLsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.s32 $Vdn, $Vm", - (VRSHLsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.s64 $Vdn, $Vm", - (VRSHLsv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.u8 $Vdn, $Vm", - (VRSHLuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.u16 $Vdn, $Vm", - (VRSHLuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.u32 $Vdn, $Vm", - (VRSHLuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.u64 $Vdn, $Vm", - (VRSHLuv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vrshl${p}.s8 $Vdn, $Vm", - (VRSHLsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.s16 $Vdn, $Vm", - (VRSHLsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.s32 $Vdn, $Vm", - (VRSHLsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.s64 $Vdn, $Vm", - (VRSHLsv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.u8 $Vdn, $Vm", - (VRSHLuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.u16 $Vdn, $Vm", - (VRSHLuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.u32 $Vdn, $Vm", - (VRSHLuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.u64 $Vdn, $Vm", - (VRSHLuv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - // VLD1 single-lane pseudo-instructions. These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. def VLD1LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr", @@ -6223,17 +5949,17 @@ def VST2LNqWB_register_Asm_32 : // VLD3 all-lanes pseudo-instructions. These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. -def VLD3DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr", +def VLD3DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr", (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>; -def VLD3DUPdAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr", +def VLD3DUPdAsm_16: NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr", (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>; -def VLD3DUPdAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr", +def VLD3DUPdAsm_32: NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr", (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>; -def VLD3DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr", +def VLD3DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr", (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>; -def VLD3DUPqAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr", +def VLD3DUPqAsm_16: NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr", (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>; -def VLD3DUPqAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr", +def VLD3DUPqAsm_32: NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr", (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>; def VLD3DUPdWB_fixed_Asm_8 : @@ -6499,17 +6225,17 @@ def VST3qWB_register_Asm_32 : // VLD4 all-lanes pseudo-instructions. 
These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. -def VLD4DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr", +def VLD4DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr", (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>; -def VLD4DUPdAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr", +def VLD4DUPdAsm_16: NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr", (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>; -def VLD4DUPdAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr", +def VLD4DUPdAsm_32: NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr", (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>; -def VLD4DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr", +def VLD4DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr", (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>; -def VLD4DUPqAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr", +def VLD4DUPqAsm_16: NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr", (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>; -def VLD4DUPqAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr", +def VLD4DUPqAsm_32: NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr", (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>; def VLD4DUPdWB_fixed_Asm_8 : @@ -6845,277 +6571,6 @@ def : NEONInstAlias<"vclt${p}.u32 $Qd, $Qn, $Qm", def : NEONInstAlias<"vclt${p}.f32 $Qd, $Qn, $Qm", (VCGTfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; -// Two-operand variants for VEXT -def : NEONInstAlias<"vext${p}.8 $Vdn, $Vm, $imm", - (VEXTd8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, imm0_7:$imm, pred:$p)>; -def : NEONInstAlias<"vext${p}.16 $Vdn, $Vm, $imm", - (VEXTd16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, imm0_3:$imm, pred:$p)>; -def : NEONInstAlias<"vext${p}.32 $Vdn, $Vm, $imm", - (VEXTd32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, imm0_1:$imm, pred:$p)>; - -def : NEONInstAlias<"vext${p}.8 $Vdn, $Vm, $imm", - (VEXTq8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, imm0_15:$imm, pred:$p)>; -def : NEONInstAlias<"vext${p}.16 $Vdn, $Vm, $imm", - (VEXTq16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, imm0_7:$imm, pred:$p)>; -def : NEONInstAlias<"vext${p}.32 $Vdn, $Vm, $imm", - (VEXTq32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, imm0_3:$imm, pred:$p)>; -def : NEONInstAlias<"vext${p}.64 $Vdn, $Vm, $imm", - (VEXTq64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, imm0_1:$imm, pred:$p)>; - -// Two-operand variants for VQDMULH -def : NEONInstAlias<"vqdmulh${p}.s16 $Vdn, $Vm", - (VQDMULHv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqdmulh${p}.s32 $Vdn, $Vm", - (VQDMULHv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vqdmulh${p}.s16 $Vdn, $Vm", - (VQDMULHv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqdmulh${p}.s32 $Vdn, $Vm", - (VQDMULHv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - -// Two-operand variants for VMAX. 
-def : NEONInstAlias<"vmax${p}.s8 $Vdn, $Vm", - (VMAXsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.s16 $Vdn, $Vm", - (VMAXsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.s32 $Vdn, $Vm", - (VMAXsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.u8 $Vdn, $Vm", - (VMAXuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.u16 $Vdn, $Vm", - (VMAXuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.u32 $Vdn, $Vm", - (VMAXuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.f32 $Vdn, $Vm", - (VMAXfd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vmax${p}.s8 $Vdn, $Vm", - (VMAXsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.s16 $Vdn, $Vm", - (VMAXsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.s32 $Vdn, $Vm", - (VMAXsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.u8 $Vdn, $Vm", - (VMAXuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.u16 $Vdn, $Vm", - (VMAXuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.u32 $Vdn, $Vm", - (VMAXuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.f32 $Vdn, $Vm", - (VMAXfq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - -// Two-operand variants for VMIN. -def : NEONInstAlias<"vmin${p}.s8 $Vdn, $Vm", - (VMINsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.s16 $Vdn, $Vm", - (VMINsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.s32 $Vdn, $Vm", - (VMINsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.u8 $Vdn, $Vm", - (VMINuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.u16 $Vdn, $Vm", - (VMINuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.u32 $Vdn, $Vm", - (VMINuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.f32 $Vdn, $Vm", - (VMINfd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vmin${p}.s8 $Vdn, $Vm", - (VMINsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.s16 $Vdn, $Vm", - (VMINsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.s32 $Vdn, $Vm", - (VMINsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.u8 $Vdn, $Vm", - (VMINuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.u16 $Vdn, $Vm", - (VMINuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.u32 $Vdn, $Vm", - (VMINuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.f32 $Vdn, $Vm", - (VMINfq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - -// Two-operand variants for VPADD. -def : NEONInstAlias<"vpadd${p}.i8 $Vdn, $Vm", - (VPADDi8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vpadd${p}.i16 $Vdn, $Vm", - (VPADDi16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vpadd${p}.i32 $Vdn, $Vm", - (VPADDi32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vpadd${p}.f32 $Vdn, $Vm", - (VPADDf DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -// Two-operand variants for VSRA. - // Signed. 
-def : NEONInstAlias<"vsra${p}.s8 $Vdm, $imm", - (VSRAsv8i8 DPR:$Vdm, DPR:$Vdm, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.s16 $Vdm, $imm", - (VSRAsv4i16 DPR:$Vdm, DPR:$Vdm, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.s32 $Vdm, $imm", - (VSRAsv2i32 DPR:$Vdm, DPR:$Vdm, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.s64 $Vdm, $imm", - (VSRAsv1i64 DPR:$Vdm, DPR:$Vdm, shr_imm64:$imm, pred:$p)>; - -def : NEONInstAlias<"vsra${p}.s8 $Vdm, $imm", - (VSRAsv16i8 QPR:$Vdm, QPR:$Vdm, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.s16 $Vdm, $imm", - (VSRAsv8i16 QPR:$Vdm, QPR:$Vdm, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.s32 $Vdm, $imm", - (VSRAsv4i32 QPR:$Vdm, QPR:$Vdm, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.s64 $Vdm, $imm", - (VSRAsv2i64 QPR:$Vdm, QPR:$Vdm, shr_imm64:$imm, pred:$p)>; - - // Unsigned. -def : NEONInstAlias<"vsra${p}.u8 $Vdm, $imm", - (VSRAuv8i8 DPR:$Vdm, DPR:$Vdm, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.u16 $Vdm, $imm", - (VSRAuv4i16 DPR:$Vdm, DPR:$Vdm, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.u32 $Vdm, $imm", - (VSRAuv2i32 DPR:$Vdm, DPR:$Vdm, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.u64 $Vdm, $imm", - (VSRAuv1i64 DPR:$Vdm, DPR:$Vdm, shr_imm64:$imm, pred:$p)>; - -def : NEONInstAlias<"vsra${p}.u8 $Vdm, $imm", - (VSRAuv16i8 QPR:$Vdm, QPR:$Vdm, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.u16 $Vdm, $imm", - (VSRAuv8i16 QPR:$Vdm, QPR:$Vdm, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.u32 $Vdm, $imm", - (VSRAuv4i32 QPR:$Vdm, QPR:$Vdm, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.u64 $Vdm, $imm", - (VSRAuv2i64 QPR:$Vdm, QPR:$Vdm, shr_imm64:$imm, pred:$p)>; - -// Two-operand variants for VSRI. -def : NEONInstAlias<"vsri${p}.8 $Vdm, $imm", - (VSRIv8i8 DPR:$Vdm, DPR:$Vdm, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vsri${p}.16 $Vdm, $imm", - (VSRIv4i16 DPR:$Vdm, DPR:$Vdm, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vsri${p}.32 $Vdm, $imm", - (VSRIv2i32 DPR:$Vdm, DPR:$Vdm, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vsri${p}.64 $Vdm, $imm", - (VSRIv1i64 DPR:$Vdm, DPR:$Vdm, shr_imm64:$imm, pred:$p)>; - -def : NEONInstAlias<"vsri${p}.8 $Vdm, $imm", - (VSRIv16i8 QPR:$Vdm, QPR:$Vdm, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vsri${p}.16 $Vdm, $imm", - (VSRIv8i16 QPR:$Vdm, QPR:$Vdm, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vsri${p}.32 $Vdm, $imm", - (VSRIv4i32 QPR:$Vdm, QPR:$Vdm, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vsri${p}.64 $Vdm, $imm", - (VSRIv2i64 QPR:$Vdm, QPR:$Vdm, shr_imm64:$imm, pred:$p)>; - -// Two-operand variants for VSLI. 
-def : NEONInstAlias<"vsli${p}.8 $Vdm, $imm", - (VSLIv8i8 DPR:$Vdm, DPR:$Vdm, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vsli${p}.16 $Vdm, $imm", - (VSLIv4i16 DPR:$Vdm, DPR:$Vdm, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vsli${p}.32 $Vdm, $imm", - (VSLIv2i32 DPR:$Vdm, DPR:$Vdm, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vsli${p}.64 $Vdm, $imm", - (VSLIv1i64 DPR:$Vdm, DPR:$Vdm, shr_imm64:$imm, pred:$p)>; - -def : NEONInstAlias<"vsli${p}.8 $Vdm, $imm", - (VSLIv16i8 QPR:$Vdm, QPR:$Vdm, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vsli${p}.16 $Vdm, $imm", - (VSLIv8i16 QPR:$Vdm, QPR:$Vdm, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vsli${p}.32 $Vdm, $imm", - (VSLIv4i32 QPR:$Vdm, QPR:$Vdm, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vsli${p}.64 $Vdm, $imm", - (VSLIv2i64 QPR:$Vdm, QPR:$Vdm, shr_imm64:$imm, pred:$p)>; - -// Two-operand variants for VHSUB. - // Signed. -def : NEONInstAlias<"vhsub${p}.s8 $Vdn, $Vm", - (VHSUBsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhsub${p}.s16 $Vdn, $Vm", - (VHSUBsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhsub${p}.s32 $Vdn, $Vm", - (VHSUBsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vhsub${p}.s8 $Vdn, $Vm", - (VHSUBsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhsub${p}.s16 $Vdn, $Vm", - (VHSUBsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhsub${p}.s32 $Vdn, $Vm", - (VHSUBsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - - // Unsigned. -def : NEONInstAlias<"vhsub${p}.u8 $Vdn, $Vm", - (VHSUBuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhsub${p}.u16 $Vdn, $Vm", - (VHSUBuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhsub${p}.u32 $Vdn, $Vm", - (VHSUBuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vhsub${p}.u8 $Vdn, $Vm", - (VHSUBuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhsub${p}.u16 $Vdn, $Vm", - (VHSUBuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhsub${p}.u32 $Vdn, $Vm", - (VHSUBuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - - -// Two-operand variants for VHADD. - // Signed. -def : NEONInstAlias<"vhadd${p}.s8 $Vdn, $Vm", - (VHADDsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhadd${p}.s16 $Vdn, $Vm", - (VHADDsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhadd${p}.s32 $Vdn, $Vm", - (VHADDsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vhadd${p}.s8 $Vdn, $Vm", - (VHADDsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhadd${p}.s16 $Vdn, $Vm", - (VHADDsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhadd${p}.s32 $Vdn, $Vm", - (VHADDsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - - // Unsigned. -def : NEONInstAlias<"vhadd${p}.u8 $Vdn, $Vm", - (VHADDuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhadd${p}.u16 $Vdn, $Vm", - (VHADDuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhadd${p}.u32 $Vdn, $Vm", - (VHADDuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vhadd${p}.u8 $Vdn, $Vm", - (VHADDuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhadd${p}.u16 $Vdn, $Vm", - (VHADDuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhadd${p}.u32 $Vdn, $Vm", - (VHADDuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - -// Two-operand variants for VRHADD. - // Signed. 
-def : NEONInstAlias<"vrhadd${p}.s8 $Vdn, $Rm", - (VRHADDsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>; -def : NEONInstAlias<"vrhadd${p}.s16 $Vdn, $Rm", - (VRHADDsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>; -def : NEONInstAlias<"vrhadd${p}.s32 $Vdn, $Rm", - (VRHADDsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>; - -def : NEONInstAlias<"vrhadd${p}.s8 $Vdn, $Rm", - (VRHADDsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>; -def : NEONInstAlias<"vrhadd${p}.s16 $Vdn, $Rm", - (VRHADDsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>; -def : NEONInstAlias<"vrhadd${p}.s32 $Vdn, $Rm", - (VRHADDsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>; - - // Unsigned. -def : NEONInstAlias<"vrhadd${p}.u8 $Vdn, $Rm", - (VRHADDuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>; -def : NEONInstAlias<"vrhadd${p}.u16 $Vdn, $Rm", - (VRHADDuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>; -def : NEONInstAlias<"vrhadd${p}.u32 $Vdn, $Rm", - (VRHADDuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>; - -def : NEONInstAlias<"vrhadd${p}.u8 $Vdn, $Rm", - (VRHADDuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>; -def : NEONInstAlias<"vrhadd${p}.u16 $Vdn, $Rm", - (VRHADDuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>; -def : NEONInstAlias<"vrhadd${p}.u32 $Vdn, $Rm", - (VRHADDuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>; - // VSWP allows, but does not require, a type suffix. defm : NEONDTAnyInstAlias<"vswp${p}", "$Vd, $Vm", (VSWPd DPR:$Vd, DPR:$Vm, pred:$p)>; diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index 6335229..554f6d9 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -32,9 +32,6 @@ def imm_sr : Operand<i32>, PatLeaf<(imm), [{ let ParserMatchClass = ThumbSRImmAsmOperand; } -def imm_neg_XFORM : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(-(int)N->getZExtValue(), MVT::i32); -}]>; def imm_comp_XFORM : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), MVT::i32); }]>; @@ -258,16 +255,20 @@ def tNOP : T1pI<(outs), (ins), NoItinerary, "nop", "", []>, Requires<[IsThumb2]>; def tYIELD : T1pI<(outs), (ins), NoItinerary, "yield", "", []>, - T1SystemEncoding<0x10>; // A8.6.410 + T1SystemEncoding<0x10>, // A8.6.410 + Requires<[IsThumb2]>; def tWFE : T1pI<(outs), (ins), NoItinerary, "wfe", "", []>, - T1SystemEncoding<0x20>; // A8.6.408 + T1SystemEncoding<0x20>, // A8.6.408 + Requires<[IsThumb2]>; def tWFI : T1pI<(outs), (ins), NoItinerary, "wfi", "", []>, - T1SystemEncoding<0x30>; // A8.6.409 + T1SystemEncoding<0x30>, // A8.6.409 + Requires<[IsThumb2]>; def tSEV : T1pI<(outs), (ins), NoItinerary, "sev", "", []>, - T1SystemEncoding<0x40>; // A8.6.157 + T1SystemEncoding<0x40>, // A8.6.157 + Requires<[IsThumb2]>; // The imm operand $val can be used by a debugger to store more information // about the breakpoint. 
@@ -363,8 +364,8 @@ def : tInstAlias<"sub${p} sp, sp, $imm", (tSUBspi SP, t_imm0_508s4:$imm, pred:$p)>; // ADD <Rm>, sp -def tADDrSP : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPRsp:$sp), IIC_iALUr, - "add", "\t$Rdn, $sp, $Rn", []>, +def tADDrSP : T1pI<(outs GPR:$Rdn), (ins GPRsp:$sp, GPR:$Rn), IIC_iALUr, + "add", "\t$Rdn, $sp, $Rn", []>, T1Special<{0,0,?,?}> { // A8.6.9 Encoding T1 bits<4> Rdn; @@ -419,34 +420,35 @@ let isCall = 1, Defs = [LR], Uses = [SP] in { // Also used for Thumb2 def tBL : TIx2<0b11110, 0b11, 1, - (outs), (ins pred:$p, t_bltarget:$func, variable_ops), IIC_Br, + (outs), (ins pred:$p, t_bltarget:$func), IIC_Br, "bl${p}\t$func", [(ARMtcall tglobaladdr:$func)]>, Requires<[IsThumb]> { - bits<22> func; - let Inst{26} = func{21}; + bits<24> func; + let Inst{26} = func{23}; let Inst{25-16} = func{20-11}; - let Inst{13} = 1; - let Inst{11} = 1; + let Inst{13} = func{22}; + let Inst{11} = func{21}; let Inst{10-0} = func{10-0}; } // ARMv5T and above, also used for Thumb2 def tBLXi : TIx2<0b11110, 0b11, 0, - (outs), (ins pred:$p, t_blxtarget:$func, variable_ops), IIC_Br, + (outs), (ins pred:$p, t_blxtarget:$func), IIC_Br, "blx${p}\t$func", [(ARMcall tglobaladdr:$func)]>, Requires<[IsThumb, HasV5T]> { - bits<21> func; + bits<24> func; + let Inst{26} = func{23}; let Inst{25-16} = func{20-11}; - let Inst{13} = 1; - let Inst{11} = 1; + let Inst{13} = func{22}; + let Inst{11} = func{21}; let Inst{10-1} = func{10-1}; let Inst{0} = 0; // func{0} is assumed zero } // Also used for Thumb2 - def tBLXr : TI<(outs), (ins pred:$p, GPR:$func, variable_ops), IIC_Br, + def tBLXr : TI<(outs), (ins pred:$p, GPR:$func), IIC_Br, "blx${p}\t$func", [(ARMtcall GPR:$func)]>, Requires<[IsThumb, HasV5T]>, @@ -457,7 +459,7 @@ let isCall = 1, } // ARMv4T - def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func, variable_ops), + def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func), 4, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, Requires<[IsThumb, IsThumb1Only]>; @@ -504,7 +506,7 @@ let isBranch = 1, isTerminator = 1 in let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { // IOS versions. let Uses = [SP] in { - def tTAILJMPr : tPseudoExpand<(outs), (ins tcGPR:$dst, variable_ops), + def tTAILJMPr : tPseudoExpand<(outs), (ins tcGPR:$dst), 4, IIC_Br, [], (tBX GPR:$dst, (ops 14, zero_reg))>, Requires<[IsThumb]>; @@ -514,7 +516,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { // Non-IOS version: let Uses = [SP] in { def tTAILJMPdND : tPseudoExpand<(outs), - (ins t_brtarget:$dst, pred:$p, variable_ops), + (ins t_brtarget:$dst, pred:$p), 4, IIC_Br, [], (tB t_brtarget:$dst, pred:$p)>, Requires<[IsThumb, IsNotIOS]>; @@ -1398,7 +1400,7 @@ def : InstAlias<"nop", (tMOVr R8, R8, 14, 0)>,Requires<[IsThumb, IsThumb1Only]>; // For round-trip assembly/disassembly, we have to handle a CPS instruction // without any iflags. That's not, strictly speaking, valid syntax, but it's -// a useful extention and assembles to defined behaviour (the insn does +// a useful extension and assembles to defined behaviour (the insn does // nothing). 
def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>; def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index e6fb9d5..d83530a 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -62,6 +62,15 @@ def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(-((int)N->getZExtValue()), MVT::i32); }]>; +// so_imm_notSext_XFORM - Return a so_imm value packed into the format +// described for so_imm_notSext def below, with sign extension from 16 +// bits. +def t2_so_imm_notSext16_XFORM : SDNodeXForm<imm, [{ + APInt apIntN = N->getAPIntValue(); + unsigned N16bitSignExt = apIntN.trunc(16).sext(32).getZExtValue(); + return CurDAG->getTargetConstant(~N16bitSignExt, MVT::i32); +}]>; + // t2_so_imm - Match a 32-bit immediate operand, which is an // 8-bit immediate rotated by an arbitrary number of bits, or an 8-bit // immediate splatted into multiple bytes of the word. @@ -86,6 +95,17 @@ def t2_so_imm_not : Operand<i32>, PatLeaf<(imm), [{ let ParserMatchClass = t2_so_imm_not_asmoperand; } +// t2_so_imm_notSext - match an immediate that is a complement of a t2_so_imm +// if the upper 16 bits are zero. +def t2_so_imm_notSext : Operand<i32>, PatLeaf<(imm), [{ + APInt apIntN = N->getAPIntValue(); + if (!apIntN.isIntN(16)) return false; + unsigned N16bitSignExt = apIntN.trunc(16).sext(32).getZExtValue(); + return ARM_AM::getT2SOImmVal(~N16bitSignExt) != -1; + }], t2_so_imm_notSext16_XFORM> { + let ParserMatchClass = t2_so_imm_not_asmoperand; +} + // t2_so_imm_neg - Match an immediate that is a negation of a t2_so_imm. def t2_so_imm_neg_asmoperand : AsmOperandClass { let Name = "T2SOImmNeg"; } def t2_so_imm_neg : Operand<i32>, PatLeaf<(imm), [{ @@ -668,16 +688,16 @@ let hasPostISelHook = 1, Defs = [CPSR] in { multiclass T2I_rbin_s_is<PatFrag opnode> { // shifted imm def ri : t2PseudoInst<(outs rGPR:$Rd), - (ins GPRnopc:$Rn, t2_so_imm:$imm, pred:$p), + (ins rGPR:$Rn, t2_so_imm:$imm, pred:$p), 4, IIC_iALUi, [(set rGPR:$Rd, CPSR, (opnode t2_so_imm:$imm, - GPRnopc:$Rn))]>; + rGPR:$Rn))]>; // shifted register def rs : t2PseudoInst<(outs rGPR:$Rd), - (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm, pred:$p), + (ins rGPR:$Rn, t2_so_reg:$ShiftedRm, pred:$p), 4, IIC_iALUsi, [(set rGPR:$Rd, CPSR, (opnode t2_so_reg:$ShiftedRm, - GPRnopc:$Rn))]>; + rGPR:$Rn))]>; } } @@ -1911,11 +1931,16 @@ def : T2Pat<(add GPR:$src, t2_so_imm_neg:$imm), (t2SUBri GPR:$src, t2_so_imm_neg:$imm)>; def : T2Pat<(add GPR:$src, imm0_4095_neg:$imm), (t2SUBri12 GPR:$src, imm0_4095_neg:$imm)>; +def : T2Pat<(add GPR:$src, imm0_65535_neg:$imm), + (t2SUBrr GPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>; + let AddedComplexity = 1 in def : T2Pat<(ARMaddc rGPR:$src, imm0_255_neg:$imm), (t2SUBSri rGPR:$src, imm0_255_neg:$imm)>; def : T2Pat<(ARMaddc rGPR:$src, t2_so_imm_neg:$imm), (t2SUBSri rGPR:$src, t2_so_imm_neg:$imm)>; +def : T2Pat<(ARMaddc rGPR:$src, imm0_65535_neg:$imm), + (t2SUBSrr rGPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>; // The with-carry-in form matches bitwise not instead of the negation. // Effectively, the inverse interpretation of the carry flag already accounts // for part of the negation. 
@@ -1924,6 +1949,8 @@ def : T2Pat<(ARMadde rGPR:$src, imm0_255_not:$imm, CPSR), (t2SBCri rGPR:$src, imm0_255_not:$imm)>; def : T2Pat<(ARMadde rGPR:$src, t2_so_imm_not:$imm, CPSR), (t2SBCri rGPR:$src, t2_so_imm_not:$imm)>; +def : T2Pat<(ARMadde rGPR:$src, imm0_65535_neg:$imm, CPSR), + (t2SBCrr rGPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>; // Select Bytes -- for disassembly only @@ -2134,8 +2161,8 @@ defm t2ROR : T2I_sh_ir<0b11, "ror", imm0_31, BinOpFrag<(rotr node:$LHS, node:$RHS)>, "t2ROR">; // (rotr x, (and y, 0x...1f)) ==> (ROR x, y) -def : Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)), - (t2RORrr rGPR:$lhs, rGPR:$rhs)>; +def : T2Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)), + (t2RORrr rGPR:$lhs, rGPR:$rhs)>; let Uses = [CPSR] in { def t2RRX : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi, @@ -2332,6 +2359,17 @@ let AddedComplexity = 1 in def : T2Pat<(and rGPR:$src, t2_so_imm_not:$imm), (t2BICri rGPR:$src, t2_so_imm_not:$imm)>; +// top16Zero - answer true if the upper 16 bits of $src are 0, false otherwise +def top16Zero: PatLeaf<(i32 rGPR:$src), [{ + return CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(32, 16)); + }]>; + +// so_imm_notSext is needed instead of so_imm_not, as the value of imm +// will match the extended, not the original bitWidth for $src. +def : T2Pat<(and top16Zero:$src, t2_so_imm_notSext:$imm), + (t2BICri rGPR:$src, t2_so_imm_notSext:$imm)>; + + // FIXME: Disable this pattern on Darwin to workaround an assembler bug. def : T2Pat<(or rGPR:$src, t2_so_imm_not:$imm), (t2ORNri rGPR:$src, t2_so_imm_not:$imm)>, @@ -2849,20 +2887,64 @@ def : T2Pat<(ARMcmpZ GPRnopc:$lhs, rGPR:$rhs), def : T2Pat<(ARMcmpZ GPRnopc:$lhs, t2_so_reg:$rhs), (t2CMPrs GPRnopc:$lhs, t2_so_reg:$rhs)>; -//FIXME: Disable CMN, as CCodes are backwards from compare expectations -// Compare-to-zero still works out, just not the relationals -//defm t2CMN : T2I_cmp_irs<0b1000, "cmn", -// BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>; -defm t2CMNz : T2I_cmp_irs<0b1000, "cmn", - IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi, - BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>, - "t2CMNz">; +let isCompare = 1, Defs = [CPSR] in { + // shifted imm + def t2CMNri : T2OneRegCmpImm< + (outs), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iCMPi, + "cmn", ".w\t$Rn, $imm", + [(ARMcmn GPRnopc:$Rn, (ineg t2_so_imm:$imm))]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = 0b1000; + let Inst{20} = 1; // The S bit. + let Inst{15} = 0; + let Inst{11-8} = 0b1111; // Rd + } + // register + def t2CMNzrr : T2TwoRegCmp< + (outs), (ins GPRnopc:$Rn, rGPR:$Rm), IIC_iCMPr, + "cmn", ".w\t$Rn, $Rm", + [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))> + GPRnopc:$Rn, rGPR:$Rm)]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = 0b1000; + let Inst{20} = 1; // The S bit. + let Inst{14-12} = 0b000; // imm3 + let Inst{11-8} = 0b1111; // Rd + let Inst{7-6} = 0b00; // imm2 + let Inst{5-4} = 0b00; // type + } + // shifted register + def t2CMNzrs : T2OneRegCmpShiftedReg< + (outs), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), IIC_iCMPsi, + "cmn", ".w\t$Rn, $ShiftedRm", + [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))> + GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = 0b1000; + let Inst{20} = 1; // The S bit. + let Inst{11-8} = 0b1111; // Rd + } +} -//def : T2Pat<(ARMcmp GPR:$src, t2_so_imm_neg:$imm), -// (t2CMNri GPR:$src, t2_so_imm_neg:$imm)>; +// Assembler aliases w/o the ".w" suffix. 
+// No alias here for 'rr' version as not all instantiations of this multiclass +// want one (CMP in particular, does not). +def : t2InstAlias<!strconcat("cmn", "${p}", " $Rn, $imm"), + (!cast<Instruction>(!strconcat("t2CMN", "ri")) GPRnopc:$Rn, + t2_so_imm:$imm, pred:$p)>; +def : t2InstAlias<!strconcat("cmn", "${p}", " $Rn, $shift"), + (!cast<Instruction>(!strconcat("t2CMNz", "rs")) GPRnopc:$Rn, + t2_so_reg:$shift, + pred:$p)>; -def : T2Pat<(ARMcmpZ GPRnopc:$src, t2_so_imm_neg:$imm), - (t2CMNzri GPRnopc:$src, t2_so_imm_neg:$imm)>; +def : T2Pat<(ARMcmp GPR:$src, t2_so_imm_neg:$imm), + (t2CMNri GPR:$src, t2_so_imm_neg:$imm)>; + +def : T2Pat<(ARMcmpZ GPRnopc:$src, t2_so_imm_neg:$imm), + (t2CMNri GPRnopc:$src, t2_so_imm_neg:$imm)>; defm t2TST : T2I_cmp_irs<0b0000, "tst", IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi, @@ -3017,7 +3099,7 @@ def t2DSB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary, def t2ISB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary, "isb", "\t$opt", - []>, Requires<[IsThumb2, HasDB]> { + []>, Requires<[IsThumb, HasDB]> { bits<4> opt; let Inst{31-4} = 0xf3bf8f6; let Inst{3-0} = opt; @@ -3271,7 +3353,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { // IOS version. let Uses = [SP] in def tTAILJMPd: tPseudoExpand<(outs), - (ins uncondbrtarget:$dst, pred:$p, variable_ops), + (ins uncondbrtarget:$dst, pred:$p), 4, IIC_Br, [], (t2B uncondbrtarget:$dst, pred:$p)>, Requires<[IsThumb2, IsIOS]>; @@ -3281,7 +3363,7 @@ let isCall = 1, Defs = [LR], Uses = [SP] in { // mov lr, pc; b if callee is marked noreturn to avoid confusing the // return stack predictor. def t2BMOVPCB_CALL : tPseudoInst<(outs), - (ins t_bltarget:$func, variable_ops), + (ins t_bltarget:$func), 6, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>, Requires<[IsThumb]>; } @@ -3382,21 +3464,18 @@ let imod = 0, iflags = 0, M = 1 in // A6.3.4 Branches and miscellaneous control // Table A6-14 Change Processor State, and hint instructions -class T2I_hint<bits<8> op7_0, string opc, string asm> - : T2I<(outs), (ins), NoItinerary, opc, asm, []> { - let Inst{31-20} = 0xf3a; - let Inst{19-16} = 0b1111; - let Inst{15-14} = 0b10; - let Inst{12} = 0; - let Inst{10-8} = 0b000; - let Inst{7-0} = op7_0; +def t2HINT : T2I<(outs), (ins imm0_255:$imm), NoItinerary, "hint", "\t$imm",[]>{ + bits<8> imm; + let Inst{31-8} = 0b111100111010111110000000; + let Inst{7-0} = imm; } -def t2NOP : T2I_hint<0b00000000, "nop", ".w">; -def t2YIELD : T2I_hint<0b00000001, "yield", ".w">; -def t2WFE : T2I_hint<0b00000010, "wfe", ".w">; -def t2WFI : T2I_hint<0b00000011, "wfi", ".w">; -def t2SEV : T2I_hint<0b00000100, "sev", ".w">; +def : t2InstAlias<"hint$p.w $imm", (t2HINT imm0_255:$imm, pred:$p)>; +def : t2InstAlias<"nop$p.w", (t2HINT 0, pred:$p)>; +def : t2InstAlias<"yield$p.w", (t2HINT 1, pred:$p)>; +def : t2InstAlias<"wfe$p.w", (t2HINT 2, pred:$p)>; +def : t2InstAlias<"wfi$p.w", (t2HINT 3, pred:$p)>; +def : t2InstAlias<"sev$p.w", (t2HINT 4, pred:$p)>; def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt", []> { bits<4> opt; @@ -3622,8 +3701,8 @@ defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l">; // A/R class MRS. // // A/R class can only move from CPSR or SPSR. 
-def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr", []>, - Requires<[IsThumb2,IsARClass]> { +def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr", + []>, Requires<[IsThumb2,IsARClass]> { bits<4> Rd; let Inst{31-12} = 0b11110011111011111000; let Inst{11-8} = Rd; @@ -3632,8 +3711,8 @@ def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr", []> def : t2InstAlias<"mrs${p} $Rd, cpsr", (t2MRS_AR GPR:$Rd, pred:$p)>; -def t2MRSsys_AR: T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr", []>, - Requires<[IsThumb2,IsARClass]> { +def t2MRSsys_AR: T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr", + []>, Requires<[IsThumb2,IsARClass]> { bits<4> Rd; let Inst{31-12} = 0b11110011111111111000; let Inst{11-8} = Rd; @@ -3646,7 +3725,7 @@ def t2MRSsys_AR: T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr", [ // the A/R class (a full msr_mask). def t2MRS_M : T2I<(outs rGPR:$Rd), (ins msr_mask:$mask), NoItinerary, "mrs", "\t$Rd, $mask", []>, - Requires<[IsThumb2,IsMClass]> { + Requires<[IsThumb,IsMClass]> { bits<4> Rd; bits<8> mask; let Inst{31-12} = 0b11110011111011111000; @@ -3682,14 +3761,14 @@ def t2MSR_AR : T2I<(outs), (ins msr_mask:$mask, rGPR:$Rn), // Move from ARM core register to Special Register def t2MSR_M : T2I<(outs), (ins msr_mask:$SYSm, rGPR:$Rn), NoItinerary, "msr", "\t$SYSm, $Rn", []>, - Requires<[IsThumb2,IsMClass]> { - bits<8> SYSm; + Requires<[IsThumb,IsMClass]> { + bits<12> SYSm; bits<4> Rn; let Inst{31-21} = 0b11110011100; let Inst{20} = 0b0; let Inst{19-16} = Rn; let Inst{15-12} = 0b1000; - let Inst{7-0} = SYSm; + let Inst{11-0} = SYSm; } @@ -3969,6 +4048,17 @@ def : t2InstAlias<"add${s}${p} $Rdn, $imm", def : t2InstAlias<"add${p} $Rdn, $imm", (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>; +def : t2InstAlias<"add${s}${p}.w $Rd, $Rn, $imm", + (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p, + cc_out:$s)>; +def : t2InstAlias<"addw${p} $Rd, $Rn, $imm", + (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>; +def : t2InstAlias<"add${s}${p}.w $Rdn, $imm", + (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p, + cc_out:$s)>; +def : t2InstAlias<"addw${p} $Rdn, $imm", + (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>; + // Aliases for SUB without the ".w" optional width specifier. def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $imm", @@ -4002,9 +4092,9 @@ def : t2InstAlias<"tst${p} $Rn, $Rm", (t2TSTrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>; // Memory barriers -def : InstAlias<"dmb", (t2DMB 0xf)>, Requires<[IsThumb2, HasDB]>; -def : InstAlias<"dsb", (t2DSB 0xf)>, Requires<[IsThumb2, HasDB]>; -def : InstAlias<"isb", (t2ISB 0xf)>, Requires<[IsThumb2, HasDB]>; +def : InstAlias<"dmb", (t2DMB 0xf)>, Requires<[IsThumb, HasDB]>; +def : InstAlias<"dsb", (t2DSB 0xf)>, Requires<[IsThumb, HasDB]>; +def : InstAlias<"isb", (t2ISB 0xf)>, Requires<[IsThumb, HasDB]>; // Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional // width specifier. 
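(The add/addw aliases just above all encode one idea: an add whose immediate is negative, but whose negation is encodable, is accepted and emitted as the corresponding sub; the next hunk does the same for cmp/cmn. Below is a toy sketch of that rewrite with invented names, not the real AsmParser, which reaches the same result through the TableGen-generated alias matcher.)

#include <cstdint>
#include <cstdio>
#include <string>

struct Insn {
  std::string mnemonic;
  int64_t imm;
};

// Fold a negative immediate into the opposite opcode, the way the
// t2InstAlias definitions map "add ... #-N" onto t2SUBri/t2SUBri12.
Insn normalizeAddSub(Insn in) {
  if (in.imm < 0) {
    if (in.mnemonic == "add") return {"sub", -in.imm};
    if (in.mnemonic == "sub") return {"add", -in.imm};
  }
  return in;
}

int main() {
  Insn out = normalizeAddSub({"add", -1024});
  printf("%s #%lld\n", out.mnemonic.c_str(), (long long)out.imm); // sub #1024
  return 0;
}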
@@ -4213,7 +4303,7 @@ def : t2InstAlias<"add${s}${p} $Rd, $imm", pred:$p, cc_out:$s)>; // Same for CMP <--> CMN via t2_so_imm_neg def : t2InstAlias<"cmp${p} $Rd, $imm", - (t2CMNzri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>; + (t2CMNri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>; def : t2InstAlias<"cmn${p} $Rd, $imm", (t2CMPri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>; diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index 3600b88..23c132e 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -221,11 +221,13 @@ defm : VFPDTAnyInstAlias<"vpop${p}", "$r", // FP Binary Operations. // +let TwoOperandAliasConstraint = "$Dn = $Dd" in def VADDD : ADbI<0b11100, 0b11, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpALU64, "vadd", ".f64\t$Dd, $Dn, $Dm", [(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>; +let TwoOperandAliasConstraint = "$Sn = $Sd" in def VADDS : ASbIn<0b11100, 0b11, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm", @@ -235,11 +237,13 @@ def VADDS : ASbIn<0b11100, 0b11, 0, 0, let D = VFPNeonA8Domain; } +let TwoOperandAliasConstraint = "$Dn = $Dd" in def VSUBD : ADbI<0b11100, 0b11, 1, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpALU64, "vsub", ".f64\t$Dd, $Dn, $Dm", [(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>; +let TwoOperandAliasConstraint = "$Sn = $Sd" in def VSUBS : ASbIn<0b11100, 0b11, 1, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm", @@ -249,21 +253,25 @@ def VSUBS : ASbIn<0b11100, 0b11, 1, 0, let D = VFPNeonA8Domain; } +let TwoOperandAliasConstraint = "$Dn = $Dd" in def VDIVD : ADbI<0b11101, 0b00, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpDIV64, "vdiv", ".f64\t$Dd, $Dn, $Dm", [(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>; +let TwoOperandAliasConstraint = "$Sn = $Sd" in def VDIVS : ASbI<0b11101, 0b00, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm", [(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>; +let TwoOperandAliasConstraint = "$Dn = $Dd" in def VMULD : ADbI<0b11100, 0b10, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpMUL64, "vmul", ".f64\t$Dd, $Dn, $Dm", [(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>; +let TwoOperandAliasConstraint = "$Sn = $Sd" in def VMULS : ASbIn<0b11100, 0b10, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm", @@ -559,8 +567,8 @@ def VMOVRRS : AVConv3I<0b11000101, 0b1010, bits<4> Rt2; // Encode instruction operands. - let Inst{3-0} = src1{3-0}; - let Inst{5} = src1{4}; + let Inst{3-0} = src1{4-1}; + let Inst{5} = src1{0}; let Inst{15-12} = Rt; let Inst{19-16} = Rt2; @@ -609,8 +617,8 @@ def VMOVSRR : AVConv5I<0b11000100, 0b1010, bits<4> src2; // Encode instruction operands. 
- let Inst{3-0} = dst1{3-0}; - let Inst{5} = dst1{4}; + let Inst{3-0} = dst1{4-1}; + let Inst{5} = dst1{0}; let Inst{15-12} = src1; let Inst{19-16} = src2; @@ -819,9 +827,9 @@ let Constraints = "$a = $dst" in { // FP to Fixed-Point: // Single Precision register -class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, bit op5, - dag oops, dag iops, InstrItinClass itin, string opc, string asm, - list<dag> pattern> +class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, + bit op5, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern> { bits<5> dst; // if dp_operation then UInt(D:Vd) else UInt(Vd:D); @@ -830,9 +838,9 @@ class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, bi } // Double Precision register -class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, bit op5, - dag oops, dag iops, InstrItinClass itin, string opc, string asm, - list<dag> pattern> +class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, + bit op5, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern> { bits<5> dst; // if dp_operation then UInt(D:Vd) else UInt(Vd:D); @@ -1081,10 +1089,11 @@ def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>; // Match @llvm.fma.* intrinsics -def : Pat<(f64 (fma DPR:$Ddin, DPR:$Dn, DPR:$Dm)), +// (fma x, y, z) -> (vfms z, x, y) +def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)), (VFMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4]>; -def : Pat<(f32 (fma SPR:$Sdin, SPR:$Sn, SPR:$Sm)), +def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)), (VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; @@ -1115,18 +1124,18 @@ def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>; // Match @llvm.fma.* intrinsics -// (fma (fneg x), y, z) -> (vfms x, y, z) -def : Pat<(f64 (fma (fneg DPR:$Ddin), DPR:$Dn, DPR:$Dm)), +// (fma (fneg x), y, z) -> (vfms z, x, y) +def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)), (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4]>; -def : Pat<(f32 (fma (fneg SPR:$Sdin), SPR:$Sn, SPR:$Sm)), +def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)), (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -// (fneg (fma x, (fneg y), z) -> (vfms x, y, z) -def : Pat<(fneg (f64 (fma DPR:$Ddin, (fneg DPR:$Dn), DPR:$Dm))), +// (fma x, (fneg y), z) -> (vfms z, x, y) +def : Pat<(f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin)), (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4]>; -def : Pat<(fneg (f32 (fma SPR:$Sdin, (fneg SPR:$Sn), SPR:$Sm))), +def : Pat<(f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin)), (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; @@ -1157,18 +1166,18 @@ def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin), Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>; // Match @llvm.fma.* intrinsics -// (fneg (fma x, y, z)) -> (vfnma x, y, z) -def : Pat<(fneg (fma (f64 DPR:$Ddin), (f64 DPR:$Dn), (f64 DPR:$Dm))), +// (fneg (fma x, y, z)) -> (vfnma z, x, y) +def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))), (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4]>; -def : Pat<(fneg (fma (f32 SPR:$Sdin), (f32 SPR:$Sn), (f32 SPR:$Sm))), +def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 
SPR:$Sm), (f32 SPR:$Sdin))), (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -// (fma (fneg x), y, (fneg z)) -> (vfnma x, y, z) -def : Pat<(f64 (fma (fneg DPR:$Ddin), DPR:$Dn, (fneg DPR:$Dm))), +// (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y) +def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))), (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4]>; -def : Pat<(f32 (fma (fneg SPR:$Sdin), SPR:$Sn, (fneg SPR:$Sm))), +def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))), (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; @@ -1198,18 +1207,26 @@ def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>; // Match @llvm.fma.* intrinsics -// (fneg (fma (fneg x), y, z)) -> (vnfms x, y, z) -def : Pat<(fneg (f64 (fma (fneg DPR:$Ddin), DPR:$Dn, DPR:$Dm))), + +// (fma x, y, (fneg z)) -> (vfnms z, x, y)) +def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))), + (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, + Requires<[HasVFP4]>; +def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))), + (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, + Requires<[HasVFP4]>; +// (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y) +def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))), (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4]>; -def : Pat<(fneg (f32 (fma (fneg SPR:$Sdin), SPR:$Sn, SPR:$Sm))), +def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))), (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -// (fma x, (fneg y), z) -> (vnfms x, y, z) -def : Pat<(f64 (fma DPR:$Ddin, (fneg DPR:$Dn), DPR:$Dm)), +// (fneg (fma x, (fneg y), z) -> (vfnms z, x, y) +def : Pat<(fneg (f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin))), (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4]>; -def : Pat<(f32 (fma SPR:$Sdin, (fneg SPR:$Sn), SPR:$Sm)), +def : Pat<(fneg (f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin))), (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; @@ -1426,22 +1443,6 @@ def : VFP2InstAlias<"vldr${p}.64 $Dd, $addr", def : VFP2InstAlias<"vstr${p}.64 $Dd, $addr", (VSTRD DPR:$Dd, addrmode5:$addr, pred:$p)>; -// VMUL has a two-operand form (implied destination operand) -def : VFP2InstAlias<"vmul${p}.f64 $Dn, $Dm", - (VMULD DPR:$Dn, DPR:$Dn, DPR:$Dm, pred:$p)>; -def : VFP2InstAlias<"vmul${p}.f32 $Sn, $Sm", - (VMULS SPR:$Sn, SPR:$Sn, SPR:$Sm, pred:$p)>; -// VADD has a two-operand form (implied destination operand) -def : VFP2InstAlias<"vadd${p}.f64 $Dn, $Dm", - (VADDD DPR:$Dn, DPR:$Dn, DPR:$Dm, pred:$p)>; -def : VFP2InstAlias<"vadd${p}.f32 $Sn, $Sm", - (VADDS SPR:$Sn, SPR:$Sn, SPR:$Sm, pred:$p)>; -// VSUB has a two-operand form (implied destination operand) -def : VFP2InstAlias<"vsub${p}.f64 $Dn, $Dm", - (VSUBD DPR:$Dn, DPR:$Dn, DPR:$Dm, pred:$p)>; -def : VFP2InstAlias<"vsub${p}.f32 $Sn, $Sm", - (VSUBS SPR:$Sn, SPR:$Sn, SPR:$Sm, pred:$p)>; - // VMOV can accept optional 32-bit or less data type suffix suffix. 
def : VFP2InstAlias<"vmov${p}.8 $Rt, $Sn", (VMOVRS GPR:$Rt, SPR:$Sn, pred:$p)>; diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 9ef2ace..cb1b2a2 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -1177,8 +1177,6 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, BaseReg, false, BaseUndef, false, OffUndef, Pred, PredReg, TII, isT2); NewBBI = llvm::prior(MBBI); - if (isT2 && NewOpc == ARM::t2LDRi8 && OffImm+4 >= 0) - NewOpc = ARM::t2LDRi12; InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, EvenReg, EvenDeadKill, false, BaseReg, BaseKill, BaseUndef, OffKill, OffUndef, @@ -1326,7 +1324,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { // First advance to the instruction just before the start of the chain. AdvanceRS(MBB, MemOps); // Find a scratch register. - unsigned Scratch = RS->FindUnusedReg(ARM::GPRRegisterClass); + unsigned Scratch = RS->FindUnusedReg(&ARM::GPRRegClass); // Process the load / store instructions. RS->forward(prior(MBBI)); @@ -1739,7 +1737,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, Ops.pop_back(); const MCInstrDesc &MCID = TII->get(NewOpc); - const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI); + const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF); MRI->constrainRegClass(EvenReg, TRC); MRI->constrainRegClass(OddReg, TRC); diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td index 1466e98..3857647 100644 --- a/lib/Target/ARM/ARMRegisterInfo.td +++ b/lib/Target/ARM/ARMRegisterInfo.td @@ -267,21 +267,16 @@ def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, // Subset of DPR that are accessible with VFP2 (and so that also have // 32-bit SPR subregs). def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, - (trunc DPR, 16)> { - let SubRegClasses = [(SPR ssub_0, ssub_1)]; -} + (trunc DPR, 16)>; // Subset of DPR which can be used as a source of NEON scalars for 16-bit // operations def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, - (trunc DPR, 8)> { - let SubRegClasses = [(SPR_8 ssub_0, ssub_1)]; -} + (trunc DPR, 8)>; // Generic 128-bit vector register class. def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, (sequence "Q%u", 0, 15)> { - let SubRegClasses = [(DPR dsub_0, dsub_1)]; // Allocate non-VFP2 aliases Q8-Q15 first. let AltOrders = [(rotl QPR, 8)]; let AltOrderSelect = [{ return 1; }]; @@ -289,17 +284,11 @@ def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, // Subset of QPR that have 32-bit SPR subregs. def QPR_VFP2 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - 128, (trunc QPR, 8)> { - let SubRegClasses = [(SPR ssub_0, ssub_1, ssub_2, ssub_3), - (DPR_VFP2 dsub_0, dsub_1)]; -} + 128, (trunc QPR, 8)>; // Subset of QPR that have DPR_8 and SPR_8 subregs. def QPR_8 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - 128, (trunc QPR, 4)> { - let SubRegClasses = [(SPR_8 ssub_0, ssub_1, ssub_2, ssub_3), - (DPR_8 dsub_0, dsub_1)]; -} + 128, (trunc QPR, 4)>; // Pseudo-registers representing odd-even pairs of D registers. The even-odd // pairs are already represented by the Q registers. 
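Note on the @llvm.fma.* pattern rewrites in ARMInstrVFP.td above: the intrinsic fma(a, b, c) computes a*b + c, while VFMAD/VFMAS take the accumulator in the tied destination operand, i.e. VFMAD $Ddin, $Dn, $Dm computes Ddin + Dn*Dm. The old patterns bound the first intrinsic operand to $Ddin, treating a multiplicand as the accumulator; the rewritten patterns bind c to $Ddin and a, b to $Dn, $Dm. A minimal C++ sketch of the two semantics being matched up (illustrative only, not LLVM code):

  #include <cmath>

  // What the intrinsic promises: a*b + c.
  double fma_intrinsic(double a, double b, double c) { return std::fma(a, b, c); }

  // What VFMAD $Ddin, $Dn, $Dm computes: the accumulator is the tied destination.
  double vfmad(double Ddin, double Dn, double Dm) { return Ddin + Dn * Dm; }

  // Hence (fma a, b, c) must lower to vfmad(c, a, b), which is exactly the
  // operand order the corrected patterns use:
  //   (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin) -> (VFMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)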
@@ -338,8 +327,6 @@ def Tuples2Q : RegisterTuples<[qsub_0, qsub_1], [(shl QPR, 0), (shl QPR, 1)]>; // Pseudo 256-bit vector register class to model pairs of Q registers // (4 consecutive D registers). def QQPR : RegisterClass<"ARM", [v4i64], 256, (add Tuples2Q)> { - let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3), - (QPR qsub_0, qsub_1)]; // Allocate non-VFP2 aliases first. let AltOrders = [(rotl QQPR, 8)]; let AltOrderSelect = [{ return 1; }]; @@ -363,9 +350,6 @@ def Tuples2QQ : RegisterTuples<[qqsub_0, qqsub_1], // Pseudo 512-bit vector register class to model 4 consecutive Q registers // (8 consecutive D registers). def QQQQPR : RegisterClass<"ARM", [v8i64], 256, (add Tuples2QQ)> { - let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3, - dsub_4, dsub_5, dsub_6, dsub_7), - (QPR qsub_0, qsub_1, qsub_2, qsub_3)]; // Allocate non-VFP2 aliases first. let AltOrders = [(rotl QQQQPR, 8)]; let AltOrderSelect = [{ return 1; }]; diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td index 45486fd..81d2fa3 100644 --- a/lib/Target/ARM/ARMSchedule.td +++ b/lib/Target/ARM/ARMSchedule.td @@ -70,11 +70,11 @@ def IIC_iLoad_bh_siu : InstrItinClass; def IIC_iLoad_d_i : InstrItinClass; def IIC_iLoad_d_r : InstrItinClass; def IIC_iLoad_d_ru : InstrItinClass; -def IIC_iLoad_m : InstrItinClass<0>; // micro-coded -def IIC_iLoad_mu : InstrItinClass<0>; // micro-coded -def IIC_iLoad_mBr : InstrItinClass<0>; // micro-coded -def IIC_iPop : InstrItinClass<0>; // micro-coded -def IIC_iPop_Br : InstrItinClass<0>; // micro-coded +def IIC_iLoad_m : InstrItinClass; +def IIC_iLoad_mu : InstrItinClass; +def IIC_iLoad_mBr : InstrItinClass; +def IIC_iPop : InstrItinClass; +def IIC_iPop_Br : InstrItinClass; def IIC_iLoadiALU : InstrItinClass; def IIC_iStore_i : InstrItinClass; def IIC_iStore_r : InstrItinClass; @@ -91,8 +91,8 @@ def IIC_iStore_bh_siu : InstrItinClass; def IIC_iStore_d_i : InstrItinClass; def IIC_iStore_d_r : InstrItinClass; def IIC_iStore_d_ru : InstrItinClass; -def IIC_iStore_m : InstrItinClass<0>; // micro-coded -def IIC_iStore_mu : InstrItinClass<0>; // micro-coded +def IIC_iStore_m : InstrItinClass; +def IIC_iStore_mu : InstrItinClass; def IIC_Preload : InstrItinClass; def IIC_Br : InstrItinClass; def IIC_fpSTAT : InstrItinClass; @@ -126,12 +126,12 @@ def IIC_fpSQRT32 : InstrItinClass; def IIC_fpSQRT64 : InstrItinClass; def IIC_fpLoad32 : InstrItinClass; def IIC_fpLoad64 : InstrItinClass; -def IIC_fpLoad_m : InstrItinClass<0>; // micro-coded -def IIC_fpLoad_mu : InstrItinClass<0>; // micro-coded +def IIC_fpLoad_m : InstrItinClass; +def IIC_fpLoad_mu : InstrItinClass; def IIC_fpStore32 : InstrItinClass; def IIC_fpStore64 : InstrItinClass; -def IIC_fpStore_m : InstrItinClass<0>; // micro-coded -def IIC_fpStore_mu : InstrItinClass<0>; // micro-coded +def IIC_fpStore_m : InstrItinClass; +def IIC_fpStore_mu : InstrItinClass; def IIC_VLD1 : InstrItinClass; def IIC_VLD1x2 : InstrItinClass; def IIC_VLD1x3 : InstrItinClass; @@ -258,8 +258,6 @@ def IIC_VTBX4 : InstrItinClass; //===----------------------------------------------------------------------===// // Processor instruction itineraries. 
-def GenericItineraries : ProcessorItineraries<[], [], []>; - include "ARMScheduleV6.td" include "ARMScheduleA8.td" include "ARMScheduleA9.td" diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td index 8b1fb93..56197d4 100644 --- a/lib/Target/ARM/ARMScheduleA8.td +++ b/lib/Target/ARM/ARMScheduleA8.td @@ -151,28 +151,30 @@ def CortexA8Itineraries : ProcessorItineraries< // Load multiple, def is the 5th operand. Pipeline 0 only. // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers. InstrItinData<IIC_iLoad_m , [InstrStage<2, [A8_Pipe0], 0>, - InstrStage<2, [A8_LSPipe]>], [1, 1, 1, 1, 3]>, + InstrStage<2, [A8_LSPipe]>], + [1, 1, 1, 1, 3], [], -1>, // dynamic uops // // Load multiple + update, defs are the 1st and 5th operands. InstrItinData<IIC_iLoad_mu , [InstrStage<3, [A8_Pipe0], 0>, - InstrStage<3, [A8_LSPipe]>], [2, 1, 1, 1, 3]>, + InstrStage<3, [A8_LSPipe]>], + [2, 1, 1, 1, 3], [], -1>, // dynamic uops // // Load multiple plus branch InstrItinData<IIC_iLoad_mBr, [InstrStage<3, [A8_Pipe0], 0>, InstrStage<3, [A8_LSPipe]>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>], - [1, 2, 1, 1, 3]>, + [1, 2, 1, 1, 3], [], -1>, // dynamic uops // // Pop, def is the 3rd operand. InstrItinData<IIC_iPop , [InstrStage<3, [A8_Pipe0], 0>, - InstrStage<3, [A8_LSPipe]>], [1, 1, 3]>, + InstrStage<3, [A8_LSPipe]>], + [1, 1, 3], [], -1>, // dynamic uops // // Push, def is the 3th operand. InstrItinData<IIC_iPop_Br, [InstrStage<3, [A8_Pipe0], 0>, InstrStage<3, [A8_LSPipe]>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>], - [1, 1, 3]>, - + [1, 1, 3], [], -1>, // dynamic uops // // iLoadi + iALUr for t2LDRpci_pic. InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, @@ -227,12 +229,13 @@ def CortexA8Itineraries : ProcessorItineraries< // Store multiple. Pipeline 0 only. // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers. 
InstrItinData<IIC_iStore_m , [InstrStage<2, [A8_Pipe0], 0>, - InstrStage<2, [A8_LSPipe]>]>, + InstrStage<2, [A8_LSPipe]>], + [], [], -1>, // dynamic uops // // Store multiple + update InstrItinData<IIC_iStore_mu, [InstrStage<2, [A8_Pipe0], 0>, - InstrStage<2, [A8_LSPipe]>], [2]>, - + InstrStage<2, [A8_LSPipe]>], + [2], [], -1>, // dynamic uops // // Preload InstrItinData<IIC_Preload, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, @@ -393,14 +396,16 @@ def CortexA8Itineraries : ProcessorItineraries< InstrStage<1, [A8_NLSPipe], 0>, InstrStage<1, [A8_LSPipe]>, InstrStage<1, [A8_NLSPipe], 0>, - InstrStage<1, [A8_LSPipe]>], [1, 1, 1, 2]>, + InstrStage<1, [A8_LSPipe]>], + [1, 1, 1, 2], [], -1>, // dynamic uops // // FP Load Multiple + update InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NLSPipe], 0>, InstrStage<1, [A8_LSPipe]>, InstrStage<1, [A8_NLSPipe], 0>, - InstrStage<1, [A8_LSPipe]>], [2, 1, 1, 1, 2]>, + InstrStage<1, [A8_LSPipe]>], + [2, 1, 1, 1, 2], [], -1>, // dynamic uops // // Single-precision FP Store InstrItinData<IIC_fpStore32,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, @@ -419,15 +424,16 @@ def CortexA8Itineraries : ProcessorItineraries< InstrStage<1, [A8_NLSPipe], 0>, InstrStage<1, [A8_LSPipe]>, InstrStage<1, [A8_NLSPipe], 0>, - InstrStage<1, [A8_LSPipe]>], [1, 1, 1, 1]>, + InstrStage<1, [A8_LSPipe]>], + [1, 1, 1, 1], [], -1>, // dynamic uops // // FP Store Multiple + update InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NLSPipe], 0>, InstrStage<1, [A8_LSPipe]>, InstrStage<1, [A8_NLSPipe], 0>, - InstrStage<1, [A8_LSPipe]>], [2, 1, 1, 1, 1]>, - + InstrStage<1, [A8_LSPipe]>], + [2, 1, 1, 1, 1], [], -1>, // dynamic uops // NEON // Issue through integer pipeline, and execute in NEON unit. // @@ -1051,3 +1057,18 @@ def CortexA8Itineraries : ProcessorItineraries< InstrStage<1, [A8_NPipe], 0>, InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]> ]>; + +// ===---------------------------------------------------------------------===// +// This following definitions describe the simple machine model which +// will replace itineraries. + +// Cortex-A8 machine model for scheduling and other instruction cost heuristics. +def CortexA8Model : SchedMachineModel { + let IssueWidth = 2; // 2 micro-ops are dispatched per cycle. + let MinLatency = -1; // OperandCycles are interpreted as MinLatency. + let LoadLatency = 2; // Optimistic load latency assuming bypass. + // This is overriden by OperandCycles if the + // Itineraries are queried instead. + + let Itineraries = CortexA8Itineraries; +} diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index 0d710cc..738974e 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -11,6 +11,10 @@ // //===----------------------------------------------------------------------===// +// ===---------------------------------------------------------------------===// +// This section contains legacy support for itineraries. This is +// required until SD and PostRA schedulers are replaced by MachineScheduler. + // // Ad-hoc scheduling information derived from pretty vague "Cortex-A9 Technical // Reference Manual". 
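Note on the itinerary entries above: InstrItinData now carries an optional trailing micro-op count, where -1 ("dynamic uops") marks load/store-multiple and push/pop style instructions whose micro-op count depends on the register list and so cannot be a table constant. A hedged C++ sketch of the convention (the helper below is hypothetical, not the LLVM API):

  // Resolve a micro-op count from an itinerary entry, assuming the table
  // convention above: a non-negative value is a fixed count, -1 means the
  // count is dynamic and must be derived from the instruction itself.
  int resolveNumMicroOps(int ItinUOps, unsigned RegListLen) {
    if (ItinUOps >= 0)
      return ItinUOps;                      // fixed count straight from the table
    // Hypothetical dynamic estimate for an LDM/STM-style instruction:
    // roughly one micro-op per transferred register.
    return static_cast<int>(RegListLen);
  }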
@@ -280,7 +284,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_AGU], 1>, InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 3], - [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>, + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass], + -1>, // dynamic uops // // Load multiple + update, defs are the 1st and 5th operands. InstrItinData<IIC_iLoad_mu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -288,7 +293,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_AGU], 1>, InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 3], - [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>, + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass], + -1>, // dynamic uops // // Load multiple plus branch InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -297,7 +303,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_LSUnit]>, InstrStage<1, [A9_Branch]>], [1, 2, 1, 1, 3], - [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>, + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass], + -1>, // dynamic uops // // Pop, def is the 3rd operand. InstrItinData<IIC_iPop , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -305,7 +312,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_AGU], 1>, InstrStage<2, [A9_LSUnit]>], [1, 1, 3], - [NoBypass, NoBypass, A9_LdBypass]>, + [NoBypass, NoBypass, A9_LdBypass], + -1>, // dynamic uops // // Pop + branch, def is the 3rd operand. InstrItinData<IIC_iPop_Br, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -314,8 +322,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_LSUnit]>, InstrStage<1, [A9_Branch]>], [1, 1, 3], - [NoBypass, NoBypass, A9_LdBypass]>, - + [NoBypass, NoBypass, A9_LdBypass], + -1>, // dynamic uops // // iLoadi + iALUr for t2LDRpci_pic. InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -409,14 +417,15 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_iStore_m , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_AGU], 0>, - InstrStage<2, [A9_LSUnit]>]>, + InstrStage<2, [A9_LSUnit]>], + [], [], -1>, // dynamic uops // // Store multiple + update InstrItinData<IIC_iStore_mu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_AGU], 0>, - InstrStage<2, [A9_LSUnit]>], [2]>, - + InstrStage<2, [A9_LSUnit]>], + [2], [], -1>, // dynamic uops // // Preload InstrItinData<IIC_Preload, [InstrStage<1, [A9_Issue0, A9_Issue1]>], [1, 1]>, @@ -713,7 +722,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1], [], -1>, // dynamic uops // // FP Load Multiple + update // FIXME: assumes 2 doubles which requires 2 LS cycles. 
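The SchedMachineModel definitions bracketing these itineraries (CortexA8Model above, CortexA9Model below) make IssueWidth a declared property of the model rather than something computed at runtime from stage-1 units (see the removal of ARMSubtarget::computeIssueWidth below), and they encode different MinLatency conventions: -1 means the itinerary's OperandCycles are interpreted as minimum latencies (the in-order A8), while 0 allows dependent instructions within the same dispatch group (the A9). A small sketch of how a client might read the field (assumed convention per the comments in the definitions, not a real LLVM interface):

  // MinLatency interpretation, following the comments in the model defs.
  int minLatency(int ModelMinLatency, int OperandCycles) {
    if (ModelMinLatency < 0)
      return OperandCycles;     // -1: OperandCycles are the MinLatency
    return ModelMinLatency;     // 0: back-to-back issue within a group
  }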
@@ -722,7 +732,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1], [], -1>, // dynamic uops // // Single-precision FP Store InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -749,7 +760,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1], [], -1>, // dynamic uops // // FP Store Multiple + update // FIXME: assumes 2 doubles which requires 2 LS cycles. @@ -758,7 +770,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1], [], -1>, // dynamic uops // NEON // VLD1 InstrItinData<IIC_VLD1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -1861,3 +1874,21 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_NPipe]>], [4, 1, 2, 2, 3, 3, 1]> ]>; + +// ===---------------------------------------------------------------------===// +// This following definitions describe the simple machine model which +// will replace itineraries. + +// Cortex-A9 machine model for scheduling and other instruction cost heuristics. +def CortexA9Model : SchedMachineModel { + let IssueWidth = 2; // 2 micro-ops are dispatched per cycle. + let MinLatency = 0; // Data dependencies are allowed within dispatch groups. + let LoadLatency = 2; // Optimistic load latency assuming bypass. + // This is overriden by OperandCycles if the + // Itineraries are queried instead. + + let Itineraries = CortexA9Itineraries; +} + +// TODO: Add Cortex-A9 processor and scheduler resources. + diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp index e2530d0..31d5d38 100644 --- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -179,8 +179,7 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, Args.push_back(Entry); // Emit __eabi_memset call - std::pair<SDValue,SDValue> CallResult = - TLI.LowerCallTo(Chain, + TargetLowering::CallLoweringInfo CLI(Chain, Type::getVoidTy(*DAG.getContext()), // return type false, // return sign ext false, // return zero ext @@ -193,7 +192,9 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, false, // is return val used DAG.getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET), TLI.getPointerTy()), // callee - Args, DAG, dl); // arg list, DAG and debug + Args, DAG, dl); + std::pair<SDValue,SDValue> CallResult = + TLI.LowerCallTo(CLI); return CallResult.second; } diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index ca172ed..e067a9f 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -67,6 +67,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU, , HasDataBarrier(false) , Pref32BitThumb(false) , AvoidCPSRPartialUpdate(false) + , HasRAS(false) , HasMPExtension(false) , FPOnlySP(false) , AllowsUnalignedMem(false) @@ -82,7 +83,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU, // Insert the architecture feature derived from the target triple into the // feature string. 
This is important for setting features that are implied // based on the architecture version. - std::string ArchFS = ARM_MC::ParseARMTriple(TT); + std::string ArchFS = ARM_MC::ParseARMTriple(TT, CPUString); if (!FS.empty()) { if (!ArchFS.empty()) ArchFS = ArchFS + "," + FS; @@ -99,10 +100,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU, // Initialize scheduling itinerary for the specified CPU. InstrItins = getInstrItineraryForCPU(CPUString); - // After parsing Itineraries, set ItinData.IssueWidth. - computeIssueWidth(); - - if (TT.find("eabi") != std::string::npos) + if ((TT.find("eabi") != std::string::npos) || (isTargetIOS() && isMClass())) // FIXME: We might want to separate AAPCS and EABI. Some systems, e.g. // Darwin-EABI conforms to AACPS but not the rest of EABI. TargetABI = ARM_ABI_AAPCS; @@ -192,22 +190,6 @@ unsigned ARMSubtarget::getMispredictionPenalty() const { return 10; } -void ARMSubtarget::computeIssueWidth() { - unsigned allStage1Units = 0; - for (const InstrItinerary *itin = InstrItins.Itineraries; - itin->FirstStage != ~0U; ++itin) { - const InstrStage *IS = InstrItins.Stages + itin->FirstStage; - allStage1Units |= IS->getUnits(); - } - InstrItins.IssueWidth = 0; - while (allStage1Units) { - ++InstrItins.IssueWidth; - // clear the lowest bit - allStage1Units ^= allStage1Units & ~(allStage1Units - 1); - } - assert(InstrItins.IssueWidth <= 2 && "itinerary bug, too many stage 1 units"); -} - bool ARMSubtarget::enablePostRAScheduler( CodeGenOpt::Level OptLevel, TargetSubtargetInfo::AntiDepBreakMode& Mode, diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 047efc2..171c9ad 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -136,22 +136,22 @@ TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) { bool ARMPassConfig::addPreISel() { if (TM->getOptLevel() != CodeGenOpt::None && EnableGlobalMerge) - PM.add(createGlobalMergePass(TM->getTargetLowering())); + addPass(createGlobalMergePass(TM->getTargetLowering())); return false; } bool ARMPassConfig::addInstSelector() { - PM.add(createARMISelDag(getARMTargetMachine(), getOptLevel())); + addPass(createARMISelDag(getARMTargetMachine(), getOptLevel())); return false; } bool ARMPassConfig::addPreRegAlloc() { // FIXME: temporarily disabling load / store optimization pass for Thumb1. if (getOptLevel() != CodeGenOpt::None && !getARMSubtarget().isThumb1Only()) - PM.add(createARMLoadStoreOptimizationPass(true)); + addPass(createARMLoadStoreOptimizationPass(true)); if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isCortexA9()) - PM.add(createMLxExpansionPass()); + addPass(createMLxExpansionPass()); return true; } @@ -159,23 +159,23 @@ bool ARMPassConfig::addPreSched2() { // FIXME: temporarily disabling load / store optimization pass for Thumb1. if (getOptLevel() != CodeGenOpt::None) { if (!getARMSubtarget().isThumb1Only()) { - PM.add(createARMLoadStoreOptimizationPass()); + addPass(createARMLoadStoreOptimizationPass()); printAndVerify("After ARM load / store optimizer"); } if (getARMSubtarget().hasNEON()) - PM.add(createExecutionDependencyFixPass(&ARM::DPRRegClass)); + addPass(createExecutionDependencyFixPass(&ARM::DPRRegClass)); } // Expand some pseudo instructions into multiple instructions to allow // proper scheduling. 
- PM.add(createARMExpandPseudoPass()); + addPass(createARMExpandPseudoPass()); if (getOptLevel() != CodeGenOpt::None) { if (!getARMSubtarget().isThumb1Only()) - addPass(IfConverterID); + addPass(&IfConverterID); } if (getARMSubtarget().isThumb2()) - PM.add(createThumb2ITBlockPass()); + addPass(createThumb2ITBlockPass()); return true; } @@ -183,13 +183,13 @@ bool ARMPassConfig::addPreSched2() { bool ARMPassConfig::addPreEmitPass() { if (getARMSubtarget().isThumb2()) { if (!getARMSubtarget().prefers32BitThumb()) - PM.add(createThumb2SizeReductionPass()); + addPass(createThumb2SizeReductionPass()); // Constant island pass work on unbundled instructions. - addPass(UnpackMachineBundlesID); + addPass(&UnpackMachineBundlesID); } - PM.add(createARMConstantIslandPass()); + addPass(createARMConstantIslandPass()); return true; } diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp index a5ea1c2..3d85ca7 100644 --- a/lib/Target/ARM/ARMTargetObjectFile.cpp +++ b/lib/Target/ARM/ARMTargetObjectFile.cpp @@ -24,20 +24,11 @@ using namespace dwarf; void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { + bool isAAPCS_ABI = TM.getSubtarget<ARMSubtarget>().isAAPCS_ABI(); TargetLoweringObjectFileELF::Initialize(Ctx, TM); - isAAPCS_ABI = TM.getSubtarget<ARMSubtarget>().isAAPCS_ABI(); + InitializeELF(isAAPCS_ABI); if (isAAPCS_ABI) { - StaticCtorSection = - getContext().getELFSection(".init_array", ELF::SHT_INIT_ARRAY, - ELF::SHF_WRITE | - ELF::SHF_ALLOC, - SectionKind::getDataRel()); - StaticDtorSection = - getContext().getELFSection(".fini_array", ELF::SHT_FINI_ARRAY, - ELF::SHF_WRITE | - ELF::SHF_ALLOC, - SectionKind::getDataRel()); LSDASection = NULL; } @@ -47,33 +38,3 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, 0, SectionKind::getMetadata()); } - -const MCSection * -ARMElfTargetObjectFile::getStaticCtorSection(unsigned Priority) const { - if (!isAAPCS_ABI) - return TargetLoweringObjectFileELF::getStaticCtorSection(Priority); - - if (Priority == 65535) - return StaticCtorSection; - - // Emit ctors in priority order. - std::string Name = std::string(".init_array.") + utostr(Priority); - return getContext().getELFSection(Name, ELF::SHT_INIT_ARRAY, - ELF::SHF_ALLOC | ELF::SHF_WRITE, - SectionKind::getDataRel()); -} - -const MCSection * -ARMElfTargetObjectFile::getStaticDtorSection(unsigned Priority) const { - if (!isAAPCS_ABI) - return TargetLoweringObjectFileELF::getStaticDtorSection(Priority); - - if (Priority == 65535) - return StaticDtorSection; - - // Emit dtors in priority order. 
- std::string Name = std::string(".fini_array.") + utostr(Priority); - return getContext().getELFSection(Name, ELF::SHT_FINI_ARRAY, - ELF::SHF_ALLOC | ELF::SHF_WRITE, - SectionKind::getDataRel()); -} diff --git a/lib/Target/ARM/ARMTargetObjectFile.h b/lib/Target/ARM/ARMTargetObjectFile.h index ff21060..c6a7261 100644 --- a/lib/Target/ARM/ARMTargetObjectFile.h +++ b/lib/Target/ARM/ARMTargetObjectFile.h @@ -20,7 +20,6 @@ class TargetMachine; class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF { protected: const MCSection *AttributesSection; - bool isAAPCS_ABI; public: ARMElfTargetObjectFile() : TargetLoweringObjectFileELF(), @@ -32,9 +31,6 @@ public: virtual const MCSection *getAttributesSection() const { return AttributesSection; } - - const MCSection * getStaticCtorSection(unsigned Priority) const; - const MCSection * getStaticDtorSection(unsigned Priority) const; }; } // end namespace llvm diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 2c53e3f..4497720 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -236,7 +236,10 @@ public: Match_RequiresITBlock = FIRST_TARGET_MATCH_RESULT_TY, Match_RequiresNotITBlock, Match_RequiresV6, - Match_RequiresThumb2 + Match_RequiresThumb2, +#define GET_OPERAND_DIAGNOSTIC_TYPES +#include "ARMGenAsmMatcher.inc" + }; ARMAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser) @@ -914,7 +917,9 @@ public: // Immediate offset in range [-255, 255]. if (!Memory.OffsetImm) return true; int64_t Val = Memory.OffsetImm->getValue(); - return Val > -256 && Val < 256; + // The #-0 offset is encoded as INT32_MIN, and we have to check + // for this too. + return (Val > -256 && Val < 256) || Val == INT32_MIN; } bool isAM3Offset() const { if (Kind != k_Immediate && Kind != k_PostIndexRegister) @@ -1446,8 +1451,10 @@ public: assert(isRegShiftedImm() && "addRegShiftedImmOperands() on non RegShiftedImm!"); Inst.addOperand(MCOperand::CreateReg(RegShiftedImm.SrcReg)); + // Shift of #32 is encoded as 0 where permitted + unsigned Imm = (RegShiftedImm.ShiftImm == 32 ? 
0 : RegShiftedImm.ShiftImm); Inst.addOperand(MCOperand::CreateImm( - ARM_AM::getSORegOpc(RegShiftedImm.ShiftTy, RegShiftedImm.ShiftImm))); + ARM_AM::getSORegOpc(RegShiftedImm.ShiftTy, Imm))); } void addShifterImmOperands(MCInst &Inst, unsigned N) const { @@ -2301,7 +2308,7 @@ void ARMOperand::print(raw_ostream &OS) const { OS << "<ccout " << getReg() << ">"; break; case k_ITCondMask: { - static const char *MaskStr[] = { + static const char *const MaskStr[] = { "()", "(t)", "(e)", "(tt)", "(et)", "(te)", "(ee)", "(ttt)", "(ett)", "(tet)", "(eet)", "(tte)", "(ete)", "(tee)", "(eee)" }; @@ -2672,7 +2679,7 @@ parseITCondCode(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { const AsmToken &Tok = Parser.getTok(); if (!Tok.is(AsmToken::Identifier)) return MatchOperand_NoMatch; - unsigned CC = StringSwitch<unsigned>(Tok.getString()) + unsigned CC = StringSwitch<unsigned>(Tok.getString().lower()) .Case("eq", ARMCC::EQ) .Case("ne", ARMCC::NE) .Case("hs", ARMCC::HS) @@ -3249,10 +3256,11 @@ ARMAsmParser::OperandMatchResultTy ARMAsmParser:: parseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { SMLoc S = Parser.getTok().getLoc(); const AsmToken &Tok = Parser.getTok(); - assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + if (!Tok.is(AsmToken::Identifier)) + return MatchOperand_NoMatch; StringRef OptStr = Tok.getString(); - unsigned Opt = StringSwitch<unsigned>(OptStr.slice(0, OptStr.size())) + unsigned Opt = StringSwitch<unsigned>(OptStr.slice(0, OptStr.size()).lower()) .Case("sy", ARM_MB::SY) .Case("st", ARM_MB::ST) .Case("sh", ARM_MB::ISH) @@ -3280,7 +3288,8 @@ ARMAsmParser::OperandMatchResultTy ARMAsmParser:: parseProcIFlagsOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { SMLoc S = Parser.getTok().getLoc(); const AsmToken &Tok = Parser.getTok(); - assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + if (!Tok.is(AsmToken::Identifier)) + return MatchOperand_NoMatch; StringRef IFlagsStr = Tok.getString(); // An iflags string of "none" is interpreted to mean that none of the AIF @@ -3320,26 +3329,51 @@ parseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // See ARMv6-M 10.1.1 std::string Name = Mask.lower(); unsigned FlagsVal = StringSwitch<unsigned>(Name) - .Case("apsr", 0) - .Case("iapsr", 1) - .Case("eapsr", 2) - .Case("xpsr", 3) - .Case("ipsr", 5) - .Case("epsr", 6) - .Case("iepsr", 7) - .Case("msp", 8) - .Case("psp", 9) - .Case("primask", 16) - .Case("basepri", 17) - .Case("basepri_max", 18) - .Case("faultmask", 19) - .Case("control", 20) + // Note: in the documentation: + // ARM deprecates using MSR APSR without a _<bits> qualifier as an alias + // for MSR APSR_nzcvq. + // but we do make it an alias here. This is so to get the "mask encoding" + // bits correct on MSR APSR writes. + // + // FIXME: Note the 0xc00 "mask encoding" bits version of the registers + // should really only be allowed when writing a special register. Note + // they get dropped in the MRS instruction reading a special register as + // the SYSm field is only 8 bits. + // + // FIXME: the _g and _nzcvqg versions are only allowed if the processor + // includes the DSP extension but that is not checked. 
+ .Case("apsr", 0x800) + .Case("apsr_nzcvq", 0x800) + .Case("apsr_g", 0x400) + .Case("apsr_nzcvqg", 0xc00) + .Case("iapsr", 0x801) + .Case("iapsr_nzcvq", 0x801) + .Case("iapsr_g", 0x401) + .Case("iapsr_nzcvqg", 0xc01) + .Case("eapsr", 0x802) + .Case("eapsr_nzcvq", 0x802) + .Case("eapsr_g", 0x402) + .Case("eapsr_nzcvqg", 0xc02) + .Case("xpsr", 0x803) + .Case("xpsr_nzcvq", 0x803) + .Case("xpsr_g", 0x403) + .Case("xpsr_nzcvqg", 0xc03) + .Case("ipsr", 0x805) + .Case("epsr", 0x806) + .Case("iepsr", 0x807) + .Case("msp", 0x808) + .Case("psp", 0x809) + .Case("primask", 0x810) + .Case("basepri", 0x811) + .Case("basepri_max", 0x812) + .Case("faultmask", 0x813) + .Case("control", 0x814) .Default(~0U); if (FlagsVal == ~0U) return MatchOperand_NoMatch; - if (!hasV7Ops() && FlagsVal >= 17 && FlagsVal <= 19) + if (!hasV7Ops() && FlagsVal >= 0x811 && FlagsVal <= 0x813) // basepri, basepri_max and faultmask only valid for V7m. return MatchOperand_NoMatch; @@ -5315,6 +5349,16 @@ validateInstruction(MCInst &Inst, "registers must be in range r0-r7"); break; } + case ARM::tADDrSP: { + // If the non-SP source operand and the destination operand are not the + // same, we need thumb2 (for the wide encoding), or we have an error. + if (!isThumbTwo() && + Inst.getOperand(0).getReg() != Inst.getOperand(2).getReg()) { + return Error(Operands[4]->getStartLoc(), + "source register must be the same as destination"); + } + break; + } } return false; @@ -6750,8 +6794,8 @@ processInstruction(MCInst &Inst, case ARM_AM::ror: newOpc = ARM::t2RORri; isNarrow = false; break; case ARM_AM::rrx: isNarrow = false; newOpc = ARM::t2RRX; break; } - unsigned Ammount = ARM_AM::getSORegOffset(Inst.getOperand(2).getImm()); - if (Ammount == 32) Ammount = 0; + unsigned Amount = ARM_AM::getSORegOffset(Inst.getOperand(2).getImm()); + if (Amount == 32) Amount = 0; TmpInst.setOpcode(newOpc); TmpInst.addOperand(Inst.getOperand(0)); // Rd if (isNarrow) @@ -6759,7 +6803,7 @@ processInstruction(MCInst &Inst, Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0)); TmpInst.addOperand(Inst.getOperand(1)); // Rn if (newOpc != ARM::t2RRX) - TmpInst.addOperand(MCOperand::CreateImm(Ammount)); + TmpInst.addOperand(MCOperand::CreateImm(Amount)); TmpInst.addOperand(Inst.getOperand(3)); // CondCode TmpInst.addOperand(Inst.getOperand(4)); if (!isNarrow) @@ -6809,6 +6853,9 @@ processInstruction(MCInst &Inst, // A shift by zero is a plain MOVr, not a MOVsi. unsigned Amt = Inst.getOperand(2).getImm(); unsigned Opc = Amt == 0 ? ARM::MOVr : ARM::MOVsi; + // A shift by 32 should be encoded as 0 when permitted + if (Amt == 32 && (ShiftTy == ARM_AM::lsr || ShiftTy == ARM_AM::asr)) + Amt = 0; unsigned Shifter = ARM_AM::getSORegOpc(ShiftTy, Amt); MCInst TmpInst; TmpInst.setOpcode(Opc); @@ -6985,6 +7032,16 @@ processInstruction(MCInst &Inst, Inst = TmpInst; return true; } + case ARM::tADDrSP: { + // If the non-SP source operand and the destination operand are not the + // same, we need to use the 32-bit encoding if it's available. + if (Inst.getOperand(0).getReg() != Inst.getOperand(2).getReg()) { + Inst.setOpcode(ARM::t2ADDrr); + Inst.addOperand(MCOperand::CreateReg(0)); // cc_out + return true; + } + break; + } case ARM::tB: // A Thumb conditional branch outside of an IT block is a tBcc. 
if (Inst.getOperand(1).getImm() != ARMCC::AL && !inITBlock()) { @@ -7154,7 +7211,9 @@ processInstruction(MCInst &Inst, } case ARM::MOVsi: { ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(Inst.getOperand(2).getImm()); - if (SOpc == ARM_AM::rrx) return false; + // rrx shifts and asr/lsr of #32 is encoded as 0 + if (SOpc == ARM_AM::rrx || SOpc == ARM_AM::asr || SOpc == ARM_AM::lsr) + return false; if (ARM_AM::getSORegOffset(Inst.getOperand(2).getImm()) == 0) { // Shifting by zero is accepted as a vanilla 'MOVr' MCInst TmpInst; @@ -7188,7 +7247,9 @@ processInstruction(MCInst &Inst, case ARM::ADDrsi: newOpc = ARM::ADDrr; break; } // If the shift is by zero, use the non-shifted instruction definition. - if (ARM_AM::getSORegOffset(Inst.getOperand(3).getImm()) == 0) { + // The exception is for right shifts, where 0 == 32 + if (ARM_AM::getSORegOffset(Inst.getOperand(3).getImm()) == 0 && + !(SOpc == ARM_AM::lsr || SOpc == ARM_AM::asr)) { MCInst TmpInst; TmpInst.setOpcode(newOpc); TmpInst.addOperand(Inst.getOperand(0)); @@ -7207,9 +7268,7 @@ processInstruction(MCInst &Inst, // The mask bits for all but the first condition are represented as // the low bit of the condition code value implies 't'. We currently // always have 1 implies 't', so XOR toggle the bits if the low bit - // of the condition code is zero. The encoding also expects the low - // bit of the condition to be encoded as bit 4 of the mask operand, - // so mask that in if needed + // of the condition code is zero. MCOperand &MO = Inst.getOperand(1); unsigned Mask = MO.getImm(); unsigned OrigMask = Mask; @@ -7218,8 +7277,7 @@ processInstruction(MCInst &Inst, assert(Mask && TZ <= 3 && "illegal IT mask value!"); for (unsigned i = 3; i != TZ; --i) Mask ^= 1 << i; - } else - Mask |= 0x10; + } MO.setImm(Mask); // Set up the IT block state according to the IT instruction we just @@ -7231,6 +7289,86 @@ processInstruction(MCInst &Inst, ITState.FirstCond = true; break; } + case ARM::t2LSLrr: + case ARM::t2LSRrr: + case ARM::t2ASRrr: + case ARM::t2SBCrr: + case ARM::t2RORrr: + case ARM::t2BICrr: + { + // Assemblers should use the narrow encodings of these instructions when permissible. + if ((isARMLowRegister(Inst.getOperand(1).getReg()) && + isARMLowRegister(Inst.getOperand(2).getReg())) && + Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() && + ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) || + (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) && + (!static_cast<ARMOperand*>(Operands[3])->isToken() || + !static_cast<ARMOperand*>(Operands[3])->getToken().equals_lower(".w"))) { + unsigned NewOpc; + switch (Inst.getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case ARM::t2LSLrr: NewOpc = ARM::tLSLrr; break; + case ARM::t2LSRrr: NewOpc = ARM::tLSRrr; break; + case ARM::t2ASRrr: NewOpc = ARM::tASRrr; break; + case ARM::t2SBCrr: NewOpc = ARM::tSBC; break; + case ARM::t2RORrr: NewOpc = ARM::tROR; break; + case ARM::t2BICrr: NewOpc = ARM::tBIC; break; + } + MCInst TmpInst; + TmpInst.setOpcode(NewOpc); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(5)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(Inst.getOperand(2)); + TmpInst.addOperand(Inst.getOperand(3)); + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } + return false; + } + case ARM::t2ANDrr: + case ARM::t2EORrr: + case ARM::t2ADCrr: + case ARM::t2ORRrr: + { + // Assemblers should use the narrow encodings of these instructions when permissible. 
+ // These instructions are special in that they are commutable, so shorter encodings + // are available more often. + if ((isARMLowRegister(Inst.getOperand(1).getReg()) && + isARMLowRegister(Inst.getOperand(2).getReg())) && + (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() || + Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg()) && + ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) || + (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) && + (!static_cast<ARMOperand*>(Operands[3])->isToken() || + !static_cast<ARMOperand*>(Operands[3])->getToken().equals_lower(".w"))) { + unsigned NewOpc; + switch (Inst.getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case ARM::t2ADCrr: NewOpc = ARM::tADC; break; + case ARM::t2ANDrr: NewOpc = ARM::tAND; break; + case ARM::t2EORrr: NewOpc = ARM::tEOR; break; + case ARM::t2ORRrr: NewOpc = ARM::tORR; break; + } + MCInst TmpInst; + TmpInst.setOpcode(NewOpc); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(5)); + if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) { + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(Inst.getOperand(2)); + } else { + TmpInst.addOperand(Inst.getOperand(2)); + TmpInst.addOperand(Inst.getOperand(1)); + } + TmpInst.addOperand(Inst.getOperand(3)); + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } + return false; + } } return false; } @@ -7277,6 +7415,7 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) { return Match_Success; } +static const char *getSubtargetFeatureName(unsigned Val); bool ARMAsmParser:: MatchAndEmitInstruction(SMLoc IDLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands, @@ -7317,9 +7456,21 @@ MatchAndEmitInstruction(SMLoc IDLoc, Inst.setLoc(IDLoc); Out.EmitInstruction(Inst); return false; - case Match_MissingFeature: - Error(IDLoc, "instruction requires a CPU feature not currently enabled"); - return true; + case Match_MissingFeature: { + assert(ErrorInfo && "Unknown missing feature!"); + // Special case the error message for the very common case where only + // a single subtarget feature is missing (Thumb vs. ARM, e.g.). + std::string Msg = "instruction requires:"; + unsigned Mask = 1; + for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) { + if (ErrorInfo & Mask) { + Msg += " "; + Msg += getSubtargetFeatureName(ErrorInfo & Mask); + } + Mask <<= 1; + } + return Error(IDLoc, Msg); + } case Match_InvalidOperand: { SMLoc ErrorLoc = IDLoc; if (ErrorInfo != ~0U) { @@ -7336,7 +7487,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, return Error(IDLoc, "invalid instruction", ((ARMOperand*)Operands[0])->getLocRange()); case Match_ConversionFail: - // The converter function will have already emited a diagnostic. + // The converter function will have already emitted a diagnostic. 
return true; case Match_RequiresNotITBlock: return Error(IDLoc, "flag setting instruction only valid outside IT block"); @@ -7346,6 +7497,11 @@ MatchAndEmitInstruction(SMLoc IDLoc, return Error(IDLoc, "instruction variant requires ARMv6 or later"); case Match_RequiresThumb2: return Error(IDLoc, "instruction variant requires Thumb2"); + case Match_ImmRange0_15: { + SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc(); + if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; + return Error(ErrorLoc, "immediate operand must be in the range [0,15]"); + } } llvm_unreachable("Implement any new match types added!"); @@ -7582,5 +7738,6 @@ extern "C" void LLVMInitializeARMAsmParser() { } #define GET_REGISTER_MATCHER +#define GET_SUBTARGET_FEATURE_NAME #define GET_MATCHER_IMPLEMENTATION #include "ARMGenAsmMatcher.inc" diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt index 9a2aab5..ac916cc 100644 --- a/lib/Target/ARM/CMakeLists.txt +++ b/lib/Target/ARM/CMakeLists.txt @@ -49,6 +49,8 @@ add_llvm_target(ARMCodeGen Thumb2SizeReduction.cpp ) +add_dependencies(LLVMARMCodeGen intrinsics_gen) + # workaround for hanging compilation on MSVC9, 10 if( MSVC_VERSION EQUAL 1600 OR MSVC_VERSION EQUAL 1500 ) set_property( diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 912935d..47cca2a 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -24,12 +24,66 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" +#include <vector> using namespace llvm; typedef MCDisassembler::DecodeStatus DecodeStatus; namespace { + // Handles the condition code status of instructions in IT blocks + class ITStatus + { + public: + // Returns the condition code for instruction in IT block + unsigned getITCC() { + unsigned CC = ARMCC::AL; + if (instrInITBlock()) + CC = ITStates.back(); + return CC; + } + + // Advances the IT block state to the next T or E + void advanceITState() { + ITStates.pop_back(); + } + + // Returns true if the current instruction is in an IT block + bool instrInITBlock() { + return !ITStates.empty(); + } + + // Returns true if current instruction is the last instruction in an IT block + bool instrLastInITBlock() { + return ITStates.size() == 1; + } + + // Called when decoding an IT instruction. Sets the IT state for the following + // instructions that for the IT block. Firstcond and Mask correspond to the + // fields in the IT instruction encoding. + void setITState(char Firstcond, char Mask) { + // (3 - the number of trailing zeros) is the number of then / else. + unsigned CondBit0 = Firstcond & 1; + unsigned NumTZ = CountTrailingZeros_32(Mask); + unsigned char CCBits = static_cast<unsigned char>(Firstcond & 0xf); + assert(NumTZ <= 3 && "Invalid IT mask!"); + // push condition codes onto the stack the correct order for the pops + for (unsigned Pos = NumTZ+1; Pos <= 3; ++Pos) { + bool T = ((Mask >> Pos) & 1) == CondBit0; + if (T) + ITStates.push_back(CCBits); + else + ITStates.push_back(CCBits ^ 1); + } + ITStates.push_back(CCBits); + } + + private: + std::vector<unsigned char> ITStates; + }; +} + +namespace { /// ARMDisassembler - ARM disassembler for all ARM platforms. class ARMDisassembler : public MCDisassembler { public: @@ -78,7 +132,7 @@ public: /// getEDInfo - See MCDisassembler. 
const EDInstInfo *getEDInfo() const; private: - mutable std::vector<unsigned> ITBlock; + mutable ITStatus ITBlock; DecodeStatus AddThumbPredicate(MCInst&) const; void UpdateThumbVFPPredicate(MCInst&) const; }; @@ -549,7 +603,7 @@ static bool tryAddingSymbolicOperand(uint64_t Address, int32_t Value, /// These can often be values in a literal pool near the Address of the /// instruction. The Address of the instruction and its immediate Value are /// used as a possible literal pool entry. The SymbolLookUp call back will -/// return the name of a symbol referenced by the the literal pool's entry if +/// return the name of a symbol referenced by the literal pool's entry if /// the referenced address is that of a symbol. Or it will return a pointer to /// a literal 'C' string if the referenced address of the literal pool's entry /// is an address into a section with 'C' string literals. @@ -612,7 +666,7 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const { case ARM::tSETEND: // Some instructions (mostly conditional branches) are not // allowed in IT blocks. - if (!ITBlock.empty()) + if (ITBlock.instrInITBlock()) S = SoftFail; else return Success; @@ -623,7 +677,7 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const { case ARM::t2TBH: // Some instructions (mostly unconditional branches) can // only appears at the end of, or outside of, an IT. - if (ITBlock.size() > 1) + if (ITBlock.instrInITBlock() && !ITBlock.instrLastInITBlock()) S = SoftFail; break; default: @@ -633,13 +687,11 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const { // If we're in an IT block, base the predicate on that. Otherwise, // assume a predicate of AL. unsigned CC; - if (!ITBlock.empty()) { - CC = ITBlock.back(); - if (CC == 0xF) - CC = ARMCC::AL; - ITBlock.pop_back(); - } else + CC = ITBlock.getITCC(); + if (CC == 0xF) CC = ARMCC::AL; + if (ITBlock.instrInITBlock()) + ITBlock.advanceITState(); const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo; unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands; @@ -674,11 +726,9 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const { // context as a post-pass. void ThumbDisassembler::UpdateThumbVFPPredicate(MCInst &MI) const { unsigned CC; - if (!ITBlock.empty()) { - CC = ITBlock.back(); - ITBlock.pop_back(); - } else - CC = ARMCC::AL; + CC = ITBlock.getITCC(); + if (ITBlock.instrInITBlock()) + ITBlock.advanceITState(); const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo; MCInst::iterator I = MI.begin(); @@ -726,7 +776,7 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, result = decodeThumbSBitInstruction16(MI, insn16, Address, this, STI); if (result) { Size = 2; - bool InITBlock = !ITBlock.empty(); + bool InITBlock = ITBlock.instrInITBlock(); Check(result, AddThumbPredicate(MI)); AddThumb1SBit(MI, InITBlock); return result; @@ -739,7 +789,7 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // Nested IT blocks are UNPREDICTABLE. Must be checked before we add // the Thumb predicate. - if (MI.getOpcode() == ARM::t2IT && !ITBlock.empty()) + if (MI.getOpcode() == ARM::t2IT && ITBlock.instrInITBlock()) result = MCDisassembler::SoftFail; Check(result, AddThumbPredicate(MI)); @@ -749,21 +799,9 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // to the subsequent instructions. if (MI.getOpcode() == ARM::t2IT) { - // (3 - the number of trailing zeros) is the number of then / else. 
- unsigned firstcond = MI.getOperand(0).getImm(); + unsigned Firstcond = MI.getOperand(0).getImm(); unsigned Mask = MI.getOperand(1).getImm(); - unsigned CondBit0 = Mask >> 4 & 1; - unsigned NumTZ = CountTrailingZeros_32(Mask); - assert(NumTZ <= 3 && "Invalid IT mask!"); - for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) { - bool T = ((Mask >> Pos) & 1) == CondBit0; - if (T) - ITBlock.insert(ITBlock.begin(), firstcond); - else - ITBlock.insert(ITBlock.begin(), firstcond ^ 1); - } - - ITBlock.push_back(firstcond); + ITBlock.setITState(Firstcond, Mask); } return result; @@ -783,7 +821,7 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, result = decodeThumbInstruction32(MI, insn32, Address, this, STI); if (result != MCDisassembler::Fail) { Size = 4; - bool InITBlock = ITBlock.size(); + bool InITBlock = ITBlock.instrInITBlock(); Check(result, AddThumbPredicate(MI)); AddThumb1SBit(MI, InITBlock); return result; @@ -1186,8 +1224,8 @@ static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; - unsigned Vd = fieldFromInstruction32(Val, 8, 4); - unsigned regs = Val & 0xFF; + unsigned Vd = fieldFromInstruction32(Val, 8, 5); + unsigned regs = fieldFromInstruction32(Val, 0, 8); if (!Check(S, DecodeSPRRegisterClass(Inst, Vd, Address, Decoder))) return MCDisassembler::Fail; @@ -1203,8 +1241,10 @@ static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; - unsigned Vd = fieldFromInstruction32(Val, 8, 4); - unsigned regs = (Val & 0xFF) / 2; + unsigned Vd = fieldFromInstruction32(Val, 8, 5); + unsigned regs = fieldFromInstruction32(Val, 0, 8); + + regs = regs >> 1; if (!Check(S, DecodeDPRRegisterClass(Inst, Vd, Address, Decoder))) return MCDisassembler::Fail; @@ -2976,7 +3016,7 @@ static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val, static DecodeStatus DecodeT2BROperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { - if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<22>(Val<<1) + 4, + if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<21>(Val) + 4, true, 4, Inst, Decoder)) Inst.addOperand(MCOperand::CreateImm(SignExtend32<21>(Val))); return MCDisassembler::Success; @@ -3258,9 +3298,9 @@ static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn, if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder))) return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateReg(ARM::SP)); if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder))) return MCDisassembler::Fail; - Inst.addOperand(MCOperand::CreateReg(ARM::SP)); } else if (Inst.getOpcode() == ARM::tADDspr) { unsigned Rm = fieldFromInstruction16(Insn, 3, 4); @@ -3299,10 +3339,25 @@ static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn, static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { + // Val is passed in as S:J1:J2:imm10H:imm10L:'0' + // Note only one trailing zero not two. Also the J1 and J2 values are from + // the encoded instruction. 
So here change to I1 and I2 values via: + // I1 = NOT(J1 EOR S); + // I2 = NOT(J2 EOR S); + // and build the imm32 with two trailing zeros as documented: + // imm32 = SignExtend(S:I1:I2:imm10H:imm10L:'00', 32); + unsigned S = (Val >> 23) & 1; + unsigned J1 = (Val >> 22) & 1; + unsigned J2 = (Val >> 21) & 1; + unsigned I1 = !(J1 ^ S); + unsigned I2 = !(J2 ^ S); + unsigned tmp = (Val & ~0x600000) | (I1 << 22) | (I2 << 21); + int imm32 = SignExtend32<25>(tmp << 1); + if (!tryAddingSymbolicOperand(Address, - (Address & ~2u) + SignExtend32<22>(Val << 1) + 4, + (Address & ~2u) + imm32 + 4, true, 4, Inst, Decoder)) - Inst.addOperand(MCOperand::CreateImm(SignExtend32<22>(Val << 1))); + Inst.addOperand(MCOperand::CreateImm(imm32)); return MCDisassembler::Success; } @@ -3408,17 +3463,32 @@ static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val, static DecodeStatus DecodeThumbBCCTargetOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder){ - if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<8>(Val<<1) + 4, + if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<9>(Val<<1) + 4, true, 2, Inst, Decoder)) - Inst.addOperand(MCOperand::CreateImm(SignExtend32<8>(Val << 1))); + Inst.addOperand(MCOperand::CreateImm(SignExtend32<9>(Val << 1))); return MCDisassembler::Success; } static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder){ - if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<22>(Val<<1) + 4, + // Val is passed in as S:J1:J2:imm10:imm11 + // Note no trailing zero after imm11. Also the J1 and J2 values are from + // the encoded instruction. So here change to I1 and I2 values via: + // I1 = NOT(J1 EOR S); + // I2 = NOT(J2 EOR S); + // and build the imm32 with one trailing zero as documented: + // imm32 = SignExtend(S:I1:I2:imm10:imm11:'0', 32); + unsigned S = (Val >> 23) & 1; + unsigned J1 = (Val >> 22) & 1; + unsigned J2 = (Val >> 21) & 1; + unsigned I1 = !(J1 ^ S); + unsigned I2 = !(J2 ^ S); + unsigned tmp = (Val & ~0x600000) | (I1 << 22) | (I2 << 21); + int imm32 = SignExtend32<25>(tmp << 1); + + if (!tryAddingSymbolicOperand(Address, Address + imm32 + 4, true, 4, Inst, Decoder)) - Inst.addOperand(MCOperand::CreateImm(SignExtend32<22>(Val << 1))); + Inst.addOperand(MCOperand::CreateImm(imm32)); return MCDisassembler::Success; } @@ -4128,9 +4198,9 @@ static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn, DecodeStatus S = MCDisassembler::Success; unsigned Rt = fieldFromInstruction32(Insn, 12, 4); unsigned Rt2 = fieldFromInstruction32(Insn, 16, 4); - unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + unsigned Rm = fieldFromInstruction32(Insn, 5, 1); unsigned pred = fieldFromInstruction32(Insn, 28, 4); - Rm |= fieldFromInstruction32(Insn, 5, 1) << 4; + Rm |= fieldFromInstruction32(Insn, 0, 4) << 1; if (Rt == 0xF || Rt2 == 0xF || Rm == 0x1F) S = MCDisassembler::SoftFail; @@ -4154,9 +4224,9 @@ static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn, DecodeStatus S = MCDisassembler::Success; unsigned Rt = fieldFromInstruction32(Insn, 12, 4); unsigned Rt2 = fieldFromInstruction32(Insn, 16, 4); - unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + unsigned Rm = fieldFromInstruction32(Insn, 5, 1); unsigned pred = fieldFromInstruction32(Insn, 28, 4); - Rm |= fieldFromInstruction32(Insn, 5, 1) << 4; + Rm |= fieldFromInstruction32(Insn, 0, 4) << 1; if (Rt == 0xF || Rt2 == 0xF || Rm == 0x1F) S = MCDisassembler::SoftFail; @@ -4179,19 +4249,14 @@ static DecodeStatus DecodeIT(MCInst 
&Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned pred = fieldFromInstruction16(Insn, 4, 4); - // The InstPrinter needs to have the low bit of the predicate in - // the mask operand to be able to print it properly. - unsigned mask = fieldFromInstruction16(Insn, 0, 5); + unsigned mask = fieldFromInstruction16(Insn, 0, 4); if (pred == 0xF) { pred = 0xE; S = MCDisassembler::SoftFail; } - if ((mask & 0xF) == 0) { - // Preserve the high bit of the mask, which is the low bit of - // the predicate. - mask &= 0x10; + if (mask == 0x0) { mask |= 0x8; S = MCDisassembler::SoftFail; } diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index cbd81c1..2f6b1b0 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -52,6 +52,27 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) { unsigned Opcode = MI->getOpcode(); + // Check for HINT instructions w/ canonical names. + if (Opcode == ARM::HINT || Opcode == ARM::t2HINT) { + switch (MI->getOperand(0).getImm()) { + case 0: O << "\tnop"; break; + case 1: O << "\tyield"; break; + case 2: O << "\twfe"; break; + case 3: O << "\twfi"; break; + case 4: O << "\tsev"; break; + default: + // Anything else should just print normally. + printInstruction(MI, O); + printAnnotation(O, Annot); + return; + } + printPredicateOperand(MI, 1, O); + if (Opcode == ARM::t2HINT) + O << ".w"; + printAnnotation(O, Annot); + return; + } + // Check for MOVs and print canonical forms, instead. if (Opcode == ARM::MOVsr) { // FIXME: Thumb variants? @@ -426,9 +447,13 @@ void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, return; } - if (unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm())) + //If the op is sub we have to print the immediate even if it is 0 + unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm()); + ARM_AM::AddrOpc op = ARM_AM::getAM3Op(MO3.getImm()); + + if (ImmOffs || (op == ARM_AM::sub)) O << ", #" - << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm())) + << ARM_AM::getAddrOpcStr(op) << ImmOffs; O << ']'; } @@ -643,22 +668,50 @@ void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum, unsigned Mask = Op.getImm() & 0xf; if (getAvailableFeatures() & ARM::FeatureMClass) { - switch (Op.getImm()) { + unsigned SYSm = Op.getImm(); + unsigned Opcode = MI->getOpcode(); + // For reads of the special registers ignore the "mask encoding" bits + // which are only for writes. 
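// Note: with the parser change above, these immediates are mask:SYSm
// values: bits 11-10 hold the MSR "mask encoding" (0b10 = _nzcvq,
// 0b01 = _g, 0b11 = _nzcvqg) and bits 7-0 hold SYSm. For example,
// "msr apsr_g, r0" parses to 0x400 (mask 0b01, SYSm 0) and
// "msr basepri, r0" to 0x811 (mask 0b10, SYSm 0x11). MRS encodes only
// the 8-bit SYSm field, which is why reads are masked down to it here.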
+ if (Opcode == ARM::t2MRS_M) + SYSm &= 0xff; + switch (SYSm) { default: llvm_unreachable("Unexpected mask value!"); - case 0: O << "apsr"; return; - case 1: O << "iapsr"; return; - case 2: O << "eapsr"; return; - case 3: O << "xpsr"; return; - case 5: O << "ipsr"; return; - case 6: O << "epsr"; return; - case 7: O << "iepsr"; return; - case 8: O << "msp"; return; - case 9: O << "psp"; return; - case 16: O << "primask"; return; - case 17: O << "basepri"; return; - case 18: O << "basepri_max"; return; - case 19: O << "faultmask"; return; - case 20: O << "control"; return; + case 0: + case 0x800: O << "apsr"; return; // with _nzcvq bits is an alias for aspr + case 0x400: O << "apsr_g"; return; + case 0xc00: O << "apsr_nzcvqg"; return; + case 1: + case 0x801: O << "iapsr"; return; // with _nzcvq bits is an alias for iapsr + case 0x401: O << "iapsr_g"; return; + case 0xc01: O << "iapsr_nzcvqg"; return; + case 2: + case 0x802: O << "eapsr"; return; // with _nzcvq bits is an alias for eapsr + case 0x402: O << "eapsr_g"; return; + case 0xc02: O << "eapsr_nzcvqg"; return; + case 3: + case 0x803: O << "xpsr"; return; // with _nzcvq bits is an alias for xpsr + case 0x403: O << "xpsr_g"; return; + case 0xc03: O << "xpsr_nzcvqg"; return; + case 5: + case 0x805: O << "ipsr"; return; + case 6: + case 0x806: O << "epsr"; return; + case 7: + case 0x807: O << "iepsr"; return; + case 8: + case 0x808: O << "msp"; return; + case 9: + case 0x809: O << "psp"; return; + case 0x10: + case 0x810: O << "primask"; return; + case 0x11: + case 0x811: O << "basepri"; return; + case 0x12: + case 0x812: O << "basepri_max"; return; + case 0x13: + case 0x813: O << "faultmask"; return; + case 0x14: + case 0x814: O << "control"; return; } } @@ -754,7 +807,8 @@ void ARMInstPrinter::printThumbITMask(const MCInst *MI, unsigned OpNum, raw_ostream &O) { // (3 - the number of trailing zeros) is the number of then / else. unsigned Mask = MI->getOperand(OpNum).getImm(); - unsigned CondBit0 = Mask >> 4 & 1; + unsigned Firstcond = MI->getOperand(OpNum-1).getImm(); + unsigned CondBit0 = Firstcond & 1; unsigned NumTZ = CountTrailingZeros_32(Mask); assert(NumTZ <= 3 && "Invalid IT mask!"); for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) { diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index d10bfc1..ac6ce64 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -12,6 +12,7 @@ #include "MCTargetDesc/ARMFixupKinds.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" @@ -84,7 +85,8 @@ public: { "fixup_arm_thumb_bl", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_arm_thumb_blx", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_arm_thumb_cb", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_arm_thumb_cp", 0, 8, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_thumb_cp", 0, 8, MCFixupKindInfo::FKF_IsPCRel | + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, { "fixup_arm_thumb_bcc", 0, 8, MCFixupKindInfo::FKF_IsPCRel }, // movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16 - 19. 
{ "fixup_arm_movt_hi16", 0, 20, 0 }, @@ -110,32 +112,7 @@ public: void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFixup &Fixup, const MCFragment *DF, MCValue &Target, uint64_t &Value, - bool &IsResolved) { - const MCSymbolRefExpr *A = Target.getSymA(); - // Some fixups to thumb function symbols need the low bit (thumb bit) - // twiddled. - if ((unsigned)Fixup.getKind() != ARM::fixup_arm_ldst_pcrel_12 && - (unsigned)Fixup.getKind() != ARM::fixup_t2_ldst_pcrel_12 && - (unsigned)Fixup.getKind() != ARM::fixup_arm_adr_pcrel_12 && - (unsigned)Fixup.getKind() != ARM::fixup_thumb_adr_pcrel_10 && - (unsigned)Fixup.getKind() != ARM::fixup_t2_adr_pcrel_12 && - (unsigned)Fixup.getKind() != ARM::fixup_arm_thumb_cp) { - if (A) { - const MCSymbol &Sym = A->getSymbol().AliasedSymbol(); - if (Asm.isThumbFunc(&Sym)) - Value |= 1; - } - } - // We must always generate a relocation for BL/BLX instructions if we have - // a symbol to reference, as the linker relies on knowing the destination - // symbol's thumb-ness to get interworking right. - if (A && ((unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_blx || - (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl || - (unsigned)Fixup.getKind() == ARM::fixup_arm_blx || - (unsigned)Fixup.getKind() == ARM::fixup_arm_uncondbl || - (unsigned)Fixup.getKind() == ARM::fixup_arm_condbl)) - IsResolved = false; - } + bool &IsResolved); bool mayNeedRelaxation(const MCInst &Inst) const; @@ -269,7 +246,9 @@ bool ARMAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { return true; } -static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { +static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, + MCContext *Ctx = NULL) { + unsigned Kind = Fixup.getKind(); switch (Kind) { default: llvm_unreachable("Unknown fixup kind!"); @@ -322,7 +301,8 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { Value = -Value; isAdd = false; } - assert ((Value < 4096) && "Out of range pc-relative fixup value!"); + if (Ctx && Value >= 4096) + Ctx->FatalError(Fixup.getLoc(), "out of range pc-relative fixup value"); Value |= isAdd << 23; // Same addressing mode as fixup_arm_pcrel_10, @@ -345,8 +325,8 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { Value = -Value; opc = 2; // 0b0010 } - assert(ARM_AM::getSOImmVal(Value) != -1 && - "Out of range pc-relative fixup value!"); + if (Ctx && ARM_AM::getSOImmVal(Value) == -1) + Ctx->FatalError(Fixup.getLoc(), "out of range pc-relative fixup value"); // Encode the immediate and shift the opcode into place. return ARM_AM::getSOImmVal(Value) | (opc << 21); } @@ -414,39 +394,65 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { return swapped; } case ARM::fixup_arm_thumb_bl: { - // The value doesn't encode the low bit (always zero) and is offset by - // four. The value is encoded into disjoint bit positions in the destination - // opcode. x = unchanged, I = immediate value bit, S = sign extension bit - // - // BL: xxxxxSIIIIIIIIII xxxxxIIIIIIIIIII - // - // Note that the halfwords are stored high first, low second; so we need - // to transpose the fixup value here to map properly. - unsigned isNeg = (int64_t(Value - 4) < 0) ? 1 : 0; - uint32_t Binary = 0; - Value = 0x3fffff & ((Value - 4) >> 1); - Binary = (Value & 0x7ff) << 16; // Low imm11 value. - Binary |= (Value & 0x1ffc00) >> 11; // High imm10 value. - Binary |= isNeg << 10; // Sign bit. 
- return Binary; + // The value doesn't encode the low bit (always zero) and is offset by + // four. The 32-bit immediate value is encoded as + // imm32 = SignExtend(S:I1:I2:imm10:imm11:0) + // where I1 = NOT(J1 ^ S) and I2 = NOT(J2 ^ S). + // The value is encoded into disjoint bit positions in the destination + // opcode. x = unchanged, I = immediate value bit, S = sign extension bit, + // J = either J1 or J2 bit + // + // BL: xxxxxSIIIIIIIIII xxJxJIIIIIIIIIII + // + // Note that the halfwords are stored high first, low second; so we need + // to transpose the fixup value here to map properly. + uint32_t offset = (Value - 4) >> 1; + uint32_t signBit = (offset & 0x800000) >> 23; + uint32_t I1Bit = (offset & 0x400000) >> 22; + uint32_t J1Bit = (I1Bit ^ 0x1) ^ signBit; + uint32_t I2Bit = (offset & 0x200000) >> 21; + uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit; + uint32_t imm10Bits = (offset & 0x1FF800) >> 11; + uint32_t imm11Bits = (offset & 0x000007FF); + + uint32_t Binary = 0; + uint32_t firstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10Bits); + uint32_t secondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) | + (uint16_t)imm11Bits); + Binary |= secondHalf << 16; + Binary |= firstHalf; + return Binary; + } case ARM::fixup_arm_thumb_blx: { - // The value doesn't encode the low two bits (always zero) and is offset by - // four (see fixup_arm_thumb_cp). The value is encoded into disjoint bit - // positions in the destination opcode. x = unchanged, I = immediate value - // bit, S = sign extension bit, 0 = zero. - // - // BLX: xxxxxSIIIIIIIIII xxxxxIIIIIIIIII0 - // - // Note that the halfwords are stored high first, low second; so we need - // to transpose the fixup value here to map properly. - unsigned isNeg = (int64_t(Value-4) < 0) ? 1 : 0; - uint32_t Binary = 0; - Value = 0xfffff & ((Value - 2) >> 2); - Binary = (Value & 0x3ff) << 17; // Low imm10L value. - Binary |= (Value & 0xffc00) >> 10; // High imm10H value. - Binary |= isNeg << 10; // Sign bit. - return Binary; + // The value doesn't encode the low two bits (always zero) and is offset by + // four (see fixup_arm_thumb_cp). The 32-bit immediate value is encoded as + // imm32 = SignExtend(S:I1:I2:imm10H:imm10L:00) + // where I1 = NOT(J1 ^ S) and I2 = NOT(J2 ^ S). + // The value is encoded into disjoint bit positions in the destination + // opcode. x = unchanged, I = immediate value bit, S = sign extension bit, + // J = either J1 or J2 bit, 0 = zero. + // + // BLX: xxxxxSIIIIIIIIII xxJxJIIIIIIIIII0 + // + // Note that the halfwords are stored high first, low second; so we need + // to transpose the fixup value here to map properly. + uint32_t offset = (Value - 2) >> 2; + uint32_t signBit = (offset & 0x400000) >> 22; + uint32_t I1Bit = (offset & 0x200000) >> 21; + uint32_t J1Bit = (I1Bit ^ 0x1) ^ signBit; + uint32_t I2Bit = (offset & 0x100000) >> 20; + uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit; + uint32_t imm10HBits = (offset & 0xFFC00) >> 10; + uint32_t imm10LBits = (offset & 0x3FF); + + uint32_t Binary = 0; + uint32_t firstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10HBits); + uint32_t secondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) | + ((uint16_t)imm10LBits) << 1); + Binary |= secondHalf << 16; + Binary |= firstHalf; + return Binary; } case ARM::fixup_arm_thumb_cp: // Offset by 4, and don't encode the low two bits. 
Two bytes of that @@ -473,7 +479,8 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { isAdd = false; } // The value has the low 4 bits encoded in [3:0] and the high 4 in [11:8]. - assert ((Value < 256) && "Out of range pc-relative fixup value!"); + if (Ctx && Value >= 256) + Ctx->FatalError(Fixup.getLoc(), "out of range pc-relative fixup value"); Value = (Value & 0xf) | ((Value & 0xf0) << 4); return Value | (isAdd << 23); } @@ -491,7 +498,8 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { } // These values don't encode the low two bits since they're always zero. Value >>= 2; - assert ((Value < 256) && "Out of range pc-relative fixup value!"); + if (Ctx && Value >= 256) + Ctx->FatalError(Fixup.getLoc(), "out of range pc-relative fixup value"); Value |= isAdd << 23; // Same addressing mode as fixup_arm_pcrel_10, but with 16-bit halfwords @@ -507,6 +515,43 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { } } +void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFixup &Fixup, + const MCFragment *DF, + MCValue &Target, uint64_t &Value, + bool &IsResolved) { + const MCSymbolRefExpr *A = Target.getSymA(); + // Some fixups to thumb function symbols need the low bit (thumb bit) + // twiddled. + if ((unsigned)Fixup.getKind() != ARM::fixup_arm_ldst_pcrel_12 && + (unsigned)Fixup.getKind() != ARM::fixup_t2_ldst_pcrel_12 && + (unsigned)Fixup.getKind() != ARM::fixup_arm_adr_pcrel_12 && + (unsigned)Fixup.getKind() != ARM::fixup_thumb_adr_pcrel_10 && + (unsigned)Fixup.getKind() != ARM::fixup_t2_adr_pcrel_12 && + (unsigned)Fixup.getKind() != ARM::fixup_arm_thumb_cp) { + if (A) { + const MCSymbol &Sym = A->getSymbol().AliasedSymbol(); + if (Asm.isThumbFunc(&Sym)) + Value |= 1; + } + } + // We must always generate a relocation for BL/BLX instructions if we have + // a symbol to reference, as the linker relies on knowing the destination + // symbol's thumb-ness to get interworking right. + if (A && ((unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_blx || + (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl || + (unsigned)Fixup.getKind() == ARM::fixup_arm_blx || + (unsigned)Fixup.getKind() == ARM::fixup_arm_uncondbl || + (unsigned)Fixup.getKind() == ARM::fixup_arm_condbl)) + IsResolved = false; + + // Try to get the encoded value for the fixup as-if we're mapping it into + // the instruction. This allows adjustFixupValue() to issue a diagnostic + // if the value aren't invalid. + (void)adjustFixupValue(Fixup, Value, &Asm.getContext()); +} + namespace { // FIXME: This should be in a separate file. @@ -530,7 +575,7 @@ public: void ELFARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value) const { unsigned NumBytes = 4; // FIXME: 2 for Thumb - Value = adjustFixupValue(Fixup.getKind(), Value); + Value = adjustFixupValue(Fixup, Value); if (!Value) return; // Doesn't change encoding. unsigned Offset = Fixup.getOffset(); @@ -615,7 +660,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { void DarwinARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value) const { unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - Value = adjustFixupValue(Fixup.getKind(), Value); + Value = adjustFixupValue(Fixup, Value); if (!Value) return; // Doesn't change encoding. 
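The S/I1/I2 to J1/J2 transposition implemented above in adjustFixupValue() for the Thumb BL/BLX fixups is the exact inverse of the reconstruction added to DecodeThumbBLTargetOperand() earlier in this patch. Below is a minimal standalone sketch (illustrative names, not part of the patch) that round-trips a PC-adjusted byte offset through both directions; it also shows why DecodeThumbBCCTargetOperand() now needs SignExtend32<9> rather than SignExtend32<8> once the 8-bit field has been shifted left by one.

  #include <cassert>
  #include <cstdint>

  // Mirrors llvm::SignExtend32<B>: sign-extend the low B bits of x.
  template <unsigned B> int32_t signExtend32(uint32_t x) {
    return int32_t(x << (32 - B)) >> (32 - B);
  }

  // Encode an even, PC-adjusted byte offset into the two transposed
  // halfwords, following the fixup_arm_thumb_bl case above.
  uint32_t encodeThumbBL(int32_t imm32) {
    uint32_t offset = uint32_t(imm32) >> 1;
    uint32_t S  = (offset >> 23) & 1;
    uint32_t I1 = (offset >> 22) & 1, I2 = (offset >> 21) & 1;
    uint32_t J1 = (I1 ^ 1) ^ S, J2 = (I2 ^ 1) ^ S;  // J = NOT(I) EOR S
    uint32_t firstHalf  = (S << 10) | ((offset >> 11) & 0x3ff);       // S:imm10
    uint32_t secondHalf = (J1 << 13) | (J2 << 11) | (offset & 0x7ff); // J1:J2:imm11
    return (secondHalf << 16) | firstHalf;  // halfwords stored high first
  }

  // Rebuild S:J1:J2:imm10:imm11 from the halfwords and decode it the way
  // DecodeThumbBLTargetOperand() does: I = NOT(J EOR S).
  int32_t decodeThumbBL(uint32_t Binary) {
    uint32_t firstHalf = Binary & 0xffff, secondHalf = Binary >> 16;
    uint32_t Val = ((firstHalf  & 0x400)  << 13)   // S     -> bit 23
                 | ((secondHalf & 0x2000) << 9)    // J1    -> bit 22
                 | ((secondHalf & 0x800)  << 10)   // J2    -> bit 21
                 | ((firstHalf  & 0x3ff)  << 11)   // imm10 -> bits 20:11
                 |  (secondHalf & 0x7ff);          // imm11 -> bits 10:0
    unsigned S  = (Val >> 23) & 1;
    unsigned I1 = !(((Val >> 22) & 1) ^ S);
    unsigned I2 = !(((Val >> 21) & 1) ^ S);
    uint32_t tmp = (Val & ~0x600000u) | (I1 << 22) | (I2 << 21);
    return signExtend32<25>(tmp << 1);
  }

  int main() {
    const int32_t offsets[] = { 0, 2, -8, 1000000, -1000000 };
    for (unsigned i = 0; i != 5; ++i)
      assert(decodeThumbBL(encodeThumbBL(offsets[i])) == offsets[i]);
    // An 8-bit field shifted left by one occupies 9 significant bits, so
    // SignExtend32<8> would mis-decode the most negative conditional branch:
    assert(signExtend32<9>(0x80 << 1) == -256);
    assert(signExtend32<8>(0x80 << 1) == 0);  // the old, wrong result
    return 0;
  }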
unsigned Offset = Fixup.getOffset(); diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index aa649ba..7d6acbc 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -178,9 +178,8 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, break; } break; - case ARM::fixup_arm_uncondbl: case ARM::fixup_arm_blx: - case ARM::fixup_arm_uncondbranch: + case ARM::fixup_arm_uncondbl: switch (Modifier) { case MCSymbolRefExpr::VK_ARM_PLT: Type = ELF::R_ARM_PLT32; @@ -192,6 +191,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, break; case ARM::fixup_arm_condbl: case ARM::fixup_arm_condbranch: + case ARM::fixup_arm_uncondbranch: Type = ELF::R_ARM_JUMP24; break; case ARM::fixup_arm_movt_hi16: @@ -252,10 +252,8 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, case ARM::fixup_arm_thumb_cp: case ARM::fixup_arm_thumb_br: llvm_unreachable("Unimplemented"); - case ARM::fixup_arm_uncondbranch: - Type = ELF::R_ARM_CALL; - break; case ARM::fixup_arm_condbranch: + case ARM::fixup_arm_uncondbranch: Type = ELF::R_ARM_JUMP24; break; case ARM::fixup_arm_movt_hi16: diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp index 03e8d5f..d32805e 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp @@ -22,40 +22,14 @@ EnableARMEHABI("arm-enable-ehabi", cl::Hidden, cl::init(false)); -static const char *const arm_asm_table[] = { - "{r0}", "r0", - "{r1}", "r1", - "{r2}", "r2", - "{r3}", "r3", - "{r4}", "r4", - "{r5}", "r5", - "{r6}", "r6", - "{r7}", "r7", - "{r8}", "r8", - "{r9}", "r9", - "{r10}", "r10", - "{r11}", "r11", - "{r12}", "r12", - "{r13}", "r13", - "{r14}", "r14", - "{lr}", "lr", - "{sp}", "sp", - "{ip}", "ip", - "{fp}", "fp", - "{sl}", "sl", - "{memory}", "memory", - "{cc}", "cc", - 0,0 -}; - void ARMMCAsmInfoDarwin::anchor() { } ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin() { - AsmTransCBE = arm_asm_table; Data64bitsDirective = 0; CommentString = "@"; Code16Directive = ".code\t16"; Code32Directive = ".code\t32"; + UseDataRegionDirectives = true; SupportsDebugInformation = true; diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp index 10d1c48..1964bcd 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -336,6 +336,7 @@ public: } // end anonymous namespace MCCodeEmitter *llvm::createARMMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx) { return new ARMMCCodeEmitter(MCII, STI, Ctx); @@ -861,11 +862,11 @@ ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx, // Handle :upper16: and :lower16: assembly prefixes. 
const MCExpr *E = MO.getExpr(); + MCFixupKind Kind; if (E->getKind() == MCExpr::Target) { const ARMMCExpr *ARM16Expr = cast<ARMMCExpr>(E); E = ARM16Expr->getSubExpr(); - MCFixupKind Kind; switch (ARM16Expr->getKind()) { default: llvm_unreachable("Unsupported ARMFixup"); case ARMMCExpr::VK_ARM_HI16: @@ -891,9 +892,21 @@ ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx, } Fixups.push_back(MCFixup::Create(0, E, Kind, MI.getLoc())); return 0; - }; - - llvm_unreachable("Unsupported MCExpr type in MCOperand!"); + } + // If the expression doesn't have :upper16: or :lower16: on it, + // it's just a plain immediate expression, and those evaluate to + // the lower 16 bits of the expression regardless of whether + // we have a movt or a movw. + if (!isTargetDarwin() && EvaluateAsPCRel(E)) + Kind = MCFixupKind(isThumb2() + ? ARM::fixup_t2_movw_lo16_pcrel + : ARM::fixup_arm_movw_lo16_pcrel); + else + Kind = MCFixupKind(isThumb2() + ? ARM::fixup_t2_movw_lo16 + : ARM::fixup_arm_movw_lo16); + Fixups.push_back(MCFixup::Create(0, E, Kind, MI.getLoc())); + return 0; } uint32_t ARMMCCodeEmitter:: @@ -1192,8 +1205,7 @@ getSORegImmOpValue(const MCInst &MI, unsigned OpIdx, // Encode shift_imm bit[11:7]. Binary |= SBits << 4; unsigned Offset = ARM_AM::getSORegOffset(MO1.getImm()); - assert(Offset && "Offset must be in range 1-32!"); - if (Offset == 32) Offset = 0; + assert(Offset < 32 && "Offset must be in range 0-31!"); return Binary | (Offset << 7); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index e3512cd..5df84c8 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -35,7 +35,7 @@ using namespace llvm; -std::string ARM_MC::ParseARMTriple(StringRef TT) { +std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) { // Set the boolean corresponding to the current target triple, or the default // if one cannot be determined, to true. unsigned Len = TT.size(); @@ -51,27 +51,48 @@ std::string ARM_MC::ParseARMTriple(StringRef TT) { Idx = 6; } + bool NoCPU = CPU == "generic" || CPU.empty(); std::string ARMArchFeature; if (Idx) { unsigned SubVer = TT[Idx]; if (SubVer >= '7' && SubVer <= '9') { if (Len >= Idx+2 && TT[Idx+1] == 'm') { - // v7m: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureMClass - ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+mclass"; + if (NoCPU) + // v7m: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureMClass + ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+mclass"; + else + // Use CPU to figure out the exact features. + ARMArchFeature = "+v7"; } else if (Len >= Idx+3 && TT[Idx+1] == 'e'&& TT[Idx+2] == 'm') { - // v7em: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureDSPThumb2, - // FeatureT2XtPk, FeatureMClass - ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+t2dsp,t2xtpk,+mclass"; - } else - // v7a: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk - ARMArchFeature = "+v7,+neon,+db,+t2dsp,+t2xtpk"; + if (NoCPU) + // v7em: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureDSPThumb2, + // FeatureT2XtPk, FeatureMClass + ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+t2dsp,t2xtpk,+mclass"; + else + // Use CPU to figure out the exact features. + ARMArchFeature = "+v7"; + } else { + // v7 CPUs have lots of different feature sets. If no CPU is specified, + // then assume v7a (e.g. cortex-a8) feature set. Otherwise, return + // the "minimum" feature set and use CPU string to figure out the exact + // features. 
+ if (NoCPU) + // v7a: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk + ARMArchFeature = "+v7,+neon,+db,+t2dsp,+t2xtpk"; + else + // Use CPU to figure out the exact features. + ARMArchFeature = "+v7"; + } } else if (SubVer == '6') { if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == '2') ARMArchFeature = "+v6t2"; - else if (Len >= Idx+2 && TT[Idx+1] == 'm') - // v6m: FeatureNoARM, FeatureMClass - ARMArchFeature = "+v6t2,+noarm,+mclass"; - else + else if (Len >= Idx+2 && TT[Idx+1] == 'm') { + if (NoCPU) + // v6m: FeatureNoARM, FeatureMClass + ARMArchFeature = "+v6,+noarm,+mclass"; + else + ARMArchFeature = "+v6"; + } else ARMArchFeature = "+v6"; } else if (SubVer == '5') { if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == 'e') @@ -94,7 +115,7 @@ std::string ARM_MC::ParseARMTriple(StringRef TT) { MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS) { - std::string ArchFS = ARM_MC::ParseARMTriple(TT); + std::string ArchFS = ARM_MC::ParseARMTriple(TT, CPU); if (!FS.empty()) { if (!ArchFS.empty()) ArchFS = ArchFS + "," + FS.str(); diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h index 88472d7..510302d 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h +++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h @@ -23,6 +23,7 @@ class MCCodeEmitter; class MCContext; class MCInstrInfo; class MCObjectWriter; +class MCRegisterInfo; class MCSubtargetInfo; class StringRef; class Target; @@ -31,7 +32,7 @@ class raw_ostream; extern Target TheARMTarget, TheThumbTarget; namespace ARM_MC { - std::string ParseARMTriple(StringRef TT); + std::string ParseARMTriple(StringRef TT, StringRef CPU); /// createARMMCSubtargetInfo - Create a ARM MCSubtargetInfo instance. /// This is exposed so Asm parser, etc. do not need to go through @@ -41,6 +42,7 @@ namespace ARM_MC { } MCCodeEmitter *createARMMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx); diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index 8057cb6..78faf59 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -190,7 +190,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer, // 0 - arm instructions // 1 - thumb instructions // the other half of the relocated expression is in the following pair - // relocation entry in the the low 16 bits of r_address field. + // relocation entry in the low 16 bits of r_address field. 
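The ParseARMTriple() rework above means the computed feature string now depends on both the triple and the CPU: with no CPU (or "generic") the triple alone selects the full architecture feature set, while an explicit CPU gets only the architecture baseline and is expected to supply the rest from its own feature table. A quick standalone sanity check (a sketch, assuming the patched ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) is linked in; the exact ordering of the expected strings is an assumption read off the code above):

  #include <cassert>
  #include <string>
  #include "llvm/ADT/StringRef.h"
  #include "MCTargetDesc/ARMMCTargetDesc.h"

  int main() {
    using llvm::ARM_MC::ParseARMTriple;
    // No CPU (or "generic"): the triple alone picks the full feature set.
    assert(ParseARMTriple("thumbv7m-none-eabi", "") ==
           "+v7,+noarm,+db,+hwdiv,+mclass");
    assert(ParseARMTriple("thumbv6m-none-eabi", "generic") ==
           "+v6,+noarm,+mclass");
    // With an explicit CPU, only the architecture baseline is emitted and
    // the CPU's own feature table fills in the rest.
    assert(ParseARMTriple("thumbv7m-none-eabi", "cortex-m3") == "+v7");
    assert(ParseARMTriple("thumbv6m-none-eabi", "cortex-m0") == "+v6");
    return 0;
  }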
unsigned ThumbBit = 0; unsigned MovtBit = 0; switch ((unsigned)Fixup.getKind()) { diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp index 2899836..ad60e32 100644 --- a/lib/Target/ARM/MLxExpansionPass.cpp +++ b/lib/Target/ARM/MLxExpansionPass.cpp @@ -220,7 +220,9 @@ MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, const MCInstrDesc &MCID1 = TII->get(MulOpc); const MCInstrDesc &MCID2 = TII->get(AddSubOpc); - unsigned TmpReg = MRI->createVirtualRegister(TII->getRegClass(MCID1, 0, TRI)); + const MachineFunction &MF = *MI->getParent()->getParent(); + unsigned TmpReg = MRI->createVirtualRegister( + TII->getRegClass(MCID1, 0, TRI, MF)); MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID1, TmpReg) .addReg(Src1Reg, getKillRegState(Src1Kill)) diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt index 3eddda8..57dc6cb 100644 --- a/lib/Target/ARM/README.txt +++ b/lib/Target/ARM/README.txt @@ -710,3 +710,24 @@ targets, e.g., PPC, that share this behavior, it would be best to implement this in a target-independent way: we should probably fold that (when using "undefined at zero" semantics) to set the "defined at zero" bit and have the code generator expand out the right code. + + +//===---------------------------------------------------------------------===// + +Clean up the test/MC/ARM files to have more robust register choices. + +R0 should not be used as a register operand in the assembler tests as it's then +not possible to distinguish between a correct encoding and a missing operand +encoding, as zero is the default value for the binary encoder. +e.g., + add r0, r0 // bad + add r3, r5 // good + +Register operands should be distinct. That is, when the encoding does not +require two syntactical operands to refer to the same register, two different +registers should be used in the test so as to catch errors where the +operands are swapped in the encoding. 
+e.g., + subs.w r1, r1, r1 // bad + subs.w r1, r2, r3 // good + diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp index e03e758..735b255 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -53,11 +53,11 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - assert((RC == ARM::tGPRRegisterClass || + assert((RC == &ARM::tGPRRegClass || (TargetRegisterInfo::isPhysicalRegister(SrcReg) && isARMLowRegister(SrcReg))) && "Unknown regclass!"); - if (RC == ARM::tGPRRegisterClass || + if (RC == &ARM::tGPRRegClass || (TargetRegisterInfo::isPhysicalRegister(SrcReg) && isARMLowRegister(SrcReg))) { DebugLoc DL; @@ -81,11 +81,11 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - assert((RC == ARM::tGPRRegisterClass || + assert((RC == &ARM::tGPRRegClass || (TargetRegisterInfo::isPhysicalRegister(DestReg) && isARMLowRegister(DestReg))) && "Unknown regclass!"); - if (RC == ARM::tGPRRegisterClass || + if (RC == &ARM::tGPRRegClass || (TargetRegisterInfo::isPhysicalRegister(DestReg) && isARMLowRegister(DestReg))) { DebugLoc DL; diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp index ef77bbd..a39b722 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.cpp +++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp @@ -49,13 +49,14 @@ const TargetRegisterClass* Thumb1RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const { if (ARM::tGPRRegClass.hasSubClassEq(RC)) - return ARM::tGPRRegisterClass; + return &ARM::tGPRRegClass; return ARMBaseRegisterInfo::getLargestLegalSuperClass(RC); } const TargetRegisterClass * -Thumb1RegisterInfo::getPointerRegClass(unsigned Kind) const { - return ARM::tGPRRegisterClass; +Thumb1RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) + const { + return &ARM::tGPRRegClass; } /// emitLoadConstPool - Emits a load from constpool to materialize the @@ -109,7 +110,7 @@ void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB, unsigned LdReg = DestReg; if (DestReg == ARM::SP) { assert(BaseReg == ARM::SP && "Unexpected!"); - LdReg = MF.getRegInfo().createVirtualRegister(ARM::tGPRRegisterClass); + LdReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass); } if (NumBytes <= 255 && NumBytes >= 0) @@ -693,7 +694,7 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // register. The offset is already handled in the vreg value. 
MI.getOperand(i+1).ChangeToRegister(FrameReg, false, false, false); } else if (MI.mayStore()) { - VReg = MF.getRegInfo().createVirtualRegister(ARM::tGPRRegisterClass); + VReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass); bool UseRR = false; if (Opcode == ARM::tSTRspi) { diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h index 6971842..f2e4b08 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.h +++ b/lib/Target/ARM/Thumb1RegisterInfo.h @@ -30,7 +30,8 @@ public: const TargetRegisterClass* getLargestLegalSuperClass(const TargetRegisterClass *RC) const; - const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const; + const TargetRegisterClass* + getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const; /// emitLoadConstPool - Emits a load from constpool to materialize the /// specified immediate. diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp index ecb4c2f..d54aa93 100644 --- a/lib/Target/ARM/Thumb2ITBlockPass.cpp +++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -24,8 +24,6 @@ STATISTIC(NumMovedInsts, "Number of predicated instructions moved"); namespace { class Thumb2ITBlockPass : public MachineFunctionPass { - bool PreRegAlloc; - public: static char ID; Thumb2ITBlockPass() : MachineFunctionPass(ID) {} @@ -76,16 +74,14 @@ static void TrackDefUses(MachineInstr *MI, for (unsigned i = 0, e = LocalUses.size(); i != e; ++i) { unsigned Reg = LocalUses[i]; Uses.insert(Reg); - for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) + for (MCSubRegIterator Subreg(Reg, TRI); Subreg.isValid(); ++Subreg) Uses.insert(*Subreg); } for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) { unsigned Reg = LocalDefs[i]; Defs.insert(Reg); - for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) + for (MCSubRegIterator Subreg(Reg, TRI); Subreg.isValid(); ++Subreg) Defs.insert(*Subreg); if (Reg == ARM::CPSR) continue; diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index 8ab486b..2097bb9 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -126,9 +126,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass || - RC == ARM::tcGPRRegisterClass || RC == ARM::rGPRRegisterClass || - RC == ARM::GPRnopcRegisterClass) { + if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass || + RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass || + RC == &ARM::GPRnopcRegClass) { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); @@ -153,9 +153,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass || - RC == ARM::tcGPRRegisterClass || RC == ARM::rGPRRegisterClass || - RC == ARM::GPRnopcRegisterClass) { + if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass || + RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass || + RC == &ARM::GPRnopcRegClass) { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index b5a397e..f18f491 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ 
-67,6 +67,7 @@ namespace { { ARM::t2BICrr, 0, ARM::tBIC, 0, 0, 0, 1, 0,0, 1,0 }, //FIXME: Disable CMN, as CCodes are backwards from compare expectations //{ ARM::t2CMNrr, ARM::tCMN, 0, 0, 0, 1, 0, 2,0, 0,0 }, + { ARM::t2CMNzrr, ARM::tCMNz, 0, 0, 0, 1, 0, 2,0, 0,0 }, { ARM::t2CMPri, ARM::tCMPi8, 0, 8, 0, 1, 0, 2,0, 0,0 }, { ARM::t2CMPrr, ARM::tCMPhir, 0, 0, 0, 0, 0, 2,0, 0,1 }, { ARM::t2EORrr, 0, ARM::tEOR, 0, 0, 0, 1, 0,0, 1,0 }, diff --git a/lib/Target/CellSPU/CMakeLists.txt b/lib/Target/CellSPU/CMakeLists.txt index cf4f796..1f8ca86 100644 --- a/lib/Target/CellSPU/CMakeLists.txt +++ b/lib/Target/CellSPU/CMakeLists.txt @@ -24,5 +24,7 @@ add_llvm_target(CellSPUCodeGen SPUNopFiller.cpp ) +add_dependencies(LLVMCellSPUCodeGen intrinsics_gen) + add_subdirectory(TargetInfo) add_subdirectory(MCTargetDesc) diff --git a/lib/Target/CellSPU/README.txt b/lib/Target/CellSPU/README.txt index 3e7e0b6..3bce960 100644 --- a/lib/Target/CellSPU/README.txt +++ b/lib/Target/CellSPU/README.txt @@ -37,6 +37,20 @@ to add 'spu' to configure's --enable-targets option, e.g.: --------------------------------------------------------------------------- TODO: +* In commit r142152 vector legalization was set to element promotion per + default. This breaks half vectors (e.g. v2i32) badly as they get element + promoted to much slower types (v2i64). + +* Many CellSPU specific codegen tests only grep & count the number of + instructions, not checking their place with FileCheck. There have also + been some commits that change the CellSPU checks, some of which might + have not been thoroughly scrutinized w.r.t. to the changes they cause in SPU + assembly. (especially since about the time of r142152) + +* Some of the i64 math have huge tablegen rules, which sometime cause + tablegen to run out of memory. See e.g. bug 8850. i64 arithmetics + should probably be done with libraries. + * Create a machine pass for performing dual-pipeline scheduling specifically for CellSPU, and insert branch prediction instructions as needed. diff --git a/lib/Target/CellSPU/SPUAsmPrinter.cpp b/lib/Target/CellSPU/SPUAsmPrinter.cpp index 14021fe..03d5a9a 100644 --- a/lib/Target/CellSPU/SPUAsmPrinter.cpp +++ b/lib/Target/CellSPU/SPUAsmPrinter.cpp @@ -301,7 +301,9 @@ bool SPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, if (ExtraCode[1] != 0) return true; // Unknown modifier. switch (ExtraCode[0]) { - default: return true; // Unknown modifier. + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); case 'L': // Write second word of DImode reference. // Verify that this operand has two consecutive registers. if (!MI->getOperand(OpNo).isReg() || diff --git a/lib/Target/CellSPU/SPUHazardRecognizers.cpp b/lib/Target/CellSPU/SPUHazardRecognizers.cpp index 403d7ef..67a83f1 100644 --- a/lib/Target/CellSPU/SPUHazardRecognizers.cpp +++ b/lib/Target/CellSPU/SPUHazardRecognizers.cpp @@ -30,12 +30,6 @@ using namespace llvm; // very little right now. //===----------------------------------------------------------------------===// -SPUHazardRecognizer::SPUHazardRecognizer(const TargetInstrInfo &tii) : - TII(tii), - EvenOdd(0) -{ -} - /// Return the pipeline hazard type encountered or generated by this /// instruction. Currently returns NoHazard. 
/// diff --git a/lib/Target/CellSPU/SPUHazardRecognizers.h b/lib/Target/CellSPU/SPUHazardRecognizers.h index 675632c..30acaea 100644 --- a/lib/Target/CellSPU/SPUHazardRecognizers.h +++ b/lib/Target/CellSPU/SPUHazardRecognizers.h @@ -24,12 +24,8 @@ class TargetInstrInfo; /// SPUHazardRecognizer class SPUHazardRecognizer : public ScheduleHazardRecognizer { -private: - const TargetInstrInfo &TII; - int EvenOdd; - public: - SPUHazardRecognizer(const TargetInstrInfo &TII); + SPUHazardRecognizer(const TargetInstrInfo &/*TII*/) {} virtual HazardType getHazardType(SUnit *SU, int Stalls); virtual void EmitInstruction(SUnit *SU); virtual void AdvanceCycle(); diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index 0623741..4e9fcd1 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -77,12 +77,14 @@ namespace { // Splice the libcall in wherever FindInputOutputChains tells us to. Type *RetTy = Op.getNode()->getValueType(0).getTypeForEVT(*DAG.getContext()); - std::pair<SDValue, SDValue> CallInfo = - TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false, + TargetLowering::CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, + false, false, 0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false, - /*doesNotRet=*/false, /*isReturnValueUsed=*/true, + /*doesNotRet=*/false, + /*isReturnValueUsed=*/true, Callee, Args, DAG, Op.getDebugLoc()); + std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); return CallInfo.first; } @@ -100,13 +102,13 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setLibcallName(RTLIB::DIV_F64, "__fast_divdf3"); // Set up the SPU's register classes: - addRegisterClass(MVT::i8, SPU::R8CRegisterClass); - addRegisterClass(MVT::i16, SPU::R16CRegisterClass); - addRegisterClass(MVT::i32, SPU::R32CRegisterClass); - addRegisterClass(MVT::i64, SPU::R64CRegisterClass); - addRegisterClass(MVT::f32, SPU::R32FPRegisterClass); - addRegisterClass(MVT::f64, SPU::R64FPRegisterClass); - addRegisterClass(MVT::i128, SPU::GPRCRegisterClass); + addRegisterClass(MVT::i8, &SPU::R8CRegClass); + addRegisterClass(MVT::i16, &SPU::R16CRegClass); + addRegisterClass(MVT::i32, &SPU::R32CRegClass); + addRegisterClass(MVT::i64, &SPU::R64CRegClass); + addRegisterClass(MVT::f32, &SPU::R32FPRegClass); + addRegisterClass(MVT::f64, &SPU::R64FPRegClass); + addRegisterClass(MVT::i128, &SPU::GPRCRegClass); // SPU has no sign or zero extended loads for i1, i8, i16: setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote); @@ -397,12 +399,12 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) // First set operation action for all vector types to expand. Then we // will selectively turn on ones that can be effectively codegen'd. 
- addRegisterClass(MVT::v16i8, SPU::VECREGRegisterClass); - addRegisterClass(MVT::v8i16, SPU::VECREGRegisterClass); - addRegisterClass(MVT::v4i32, SPU::VECREGRegisterClass); - addRegisterClass(MVT::v2i64, SPU::VECREGRegisterClass); - addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass); - addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass); + addRegisterClass(MVT::v16i8, &SPU::VECREGRegClass); + addRegisterClass(MVT::v8i16, &SPU::VECREGRegClass); + addRegisterClass(MVT::v4i32, &SPU::VECREGRegClass); + addRegisterClass(MVT::v2i64, &SPU::VECREGRegClass); + addRegisterClass(MVT::v4f32, &SPU::VECREGRegClass); + addRegisterClass(MVT::v2f64, &SPU::VECREGRegClass); for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { @@ -1133,7 +1135,7 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); // FIXME: allow for other calling conventions CCInfo.AnalyzeFormalArguments(Ins, CCC_SPU); @@ -1263,14 +1265,19 @@ static SDNode *isLSAAddress(SDValue Op, SelectionDAG &DAG) { } SDValue -SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, +SPUTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool isVarArg = CLI.IsVarArg; + // CellSPU target does not yet support tail call optimization. 
isTailCall = false; @@ -1280,7 +1287,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); // FIXME: allow for other calling conventions CCInfo.AnalyzeCallOperands(Outs, CCC_SPU); @@ -1441,7 +1448,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Now handle the return value(s) SmallVector<CCValAssign, 16> RVLocs; CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCRetInfo.AnalyzeCallResult(Ins, CCC_SPU); @@ -1468,7 +1475,7 @@ SPUTargetLowering::LowerReturn(SDValue Chain, SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_SPU); // If this is the first return lowered for this function, add the regs to the @@ -3139,16 +3146,16 @@ SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, case 'b': // R1-R31 case 'r': // R0-R31 if (VT == MVT::i64) - return std::make_pair(0U, SPU::R64CRegisterClass); - return std::make_pair(0U, SPU::R32CRegisterClass); + return std::make_pair(0U, &SPU::R64CRegClass); + return std::make_pair(0U, &SPU::R32CRegClass); case 'f': if (VT == MVT::f32) - return std::make_pair(0U, SPU::R32FPRegisterClass); - else if (VT == MVT::f64) - return std::make_pair(0U, SPU::R64FPRegisterClass); + return std::make_pair(0U, &SPU::R32FPRegClass); + if (VT == MVT::f64) + return std::make_pair(0U, &SPU::R64FPRegClass); break; case 'v': - return std::make_pair(0U, SPU::GPRCRegisterClass); + return std::make_pair(0U, &SPU::GPRCRegClass); } } diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h index e3db7b2..9f1599f 100644 --- a/lib/Target/CellSPU/SPUISelLowering.h +++ b/lib/Target/CellSPU/SPUISelLowering.h @@ -86,7 +86,6 @@ namespace llvm { class SPUTargetLowering : public TargetLowering { - int VarArgsFrameIndex; // FrameIndex for start of varargs area. SPUTargetMachine &SPUTM; public: @@ -159,13 +158,7 @@ namespace llvm { SmallVectorImpl<SDValue> &InVals) const; virtual SDValue - LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; virtual SDValue diff --git a/lib/Target/CellSPU/SPUInstrInfo.cpp b/lib/Target/CellSPU/SPUInstrInfo.cpp index 759923d..b25a639 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.cpp +++ b/lib/Target/CellSPU/SPUInstrInfo.cpp @@ -140,29 +140,27 @@ SPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const -{ + const TargetRegisterInfo *TRI) const { unsigned opc; bool isValidFrameIdx = (FrameIdx < SPUFrameLowering::maxFrameOffset()); - if (RC == SPU::GPRCRegisterClass) { - opc = (isValidFrameIdx ? SPU::STQDr128 : SPU::STQXr128); - } else if (RC == SPU::R64CRegisterClass) { - opc = (isValidFrameIdx ? 
SPU::STQDr64 : SPU::STQXr64); - } else if (RC == SPU::R64FPRegisterClass) { - opc = (isValidFrameIdx ? SPU::STQDr64 : SPU::STQXr64); - } else if (RC == SPU::R32CRegisterClass) { - opc = (isValidFrameIdx ? SPU::STQDr32 : SPU::STQXr32); - } else if (RC == SPU::R32FPRegisterClass) { - opc = (isValidFrameIdx ? SPU::STQDr32 : SPU::STQXr32); - } else if (RC == SPU::R16CRegisterClass) { - opc = (isValidFrameIdx ? SPU::STQDr16 : SPU::STQXr16); - } else if (RC == SPU::R8CRegisterClass) { - opc = (isValidFrameIdx ? SPU::STQDr8 : SPU::STQXr8); - } else if (RC == SPU::VECREGRegisterClass) { - opc = (isValidFrameIdx) ? SPU::STQDv16i8 : SPU::STQXv16i8; - } else { + if (RC == &SPU::GPRCRegClass) + opc = isValidFrameIdx ? SPU::STQDr128 : SPU::STQXr128; + else if (RC == &SPU::R64CRegClass) + opc = isValidFrameIdx ? SPU::STQDr64 : SPU::STQXr64; + else if (RC == &SPU::R64FPRegClass) + opc = isValidFrameIdx ? SPU::STQDr64 : SPU::STQXr64; + else if (RC == &SPU::R32CRegClass) + opc = isValidFrameIdx ? SPU::STQDr32 : SPU::STQXr32; + else if (RC == &SPU::R32FPRegClass) + opc = isValidFrameIdx ? SPU::STQDr32 : SPU::STQXr32; + else if (RC == &SPU::R16CRegClass) + opc = isValidFrameIdx ? SPU::STQDr16 : SPU::STQXr16; + else if (RC == &SPU::R8CRegClass) + opc = isValidFrameIdx ? SPU::STQDr8 : SPU::STQXr8; + else if (RC == &SPU::VECREGRegClass) + opc = isValidFrameIdx ? SPU::STQDv16i8 : SPU::STQXv16i8; + else llvm_unreachable("Unknown regclass!"); - } DebugLoc DL; if (MI != MBB.end()) DL = MI->getDebugLoc(); @@ -175,29 +173,27 @@ SPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const -{ + const TargetRegisterInfo *TRI) const { unsigned opc; bool isValidFrameIdx = (FrameIdx < SPUFrameLowering::maxFrameOffset()); - if (RC == SPU::GPRCRegisterClass) { - opc = (isValidFrameIdx ? SPU::LQDr128 : SPU::LQXr128); - } else if (RC == SPU::R64CRegisterClass) { - opc = (isValidFrameIdx ? SPU::LQDr64 : SPU::LQXr64); - } else if (RC == SPU::R64FPRegisterClass) { - opc = (isValidFrameIdx ? SPU::LQDr64 : SPU::LQXr64); - } else if (RC == SPU::R32CRegisterClass) { - opc = (isValidFrameIdx ? SPU::LQDr32 : SPU::LQXr32); - } else if (RC == SPU::R32FPRegisterClass) { - opc = (isValidFrameIdx ? SPU::LQDr32 : SPU::LQXr32); - } else if (RC == SPU::R16CRegisterClass) { - opc = (isValidFrameIdx ? SPU::LQDr16 : SPU::LQXr16); - } else if (RC == SPU::R8CRegisterClass) { - opc = (isValidFrameIdx ? SPU::LQDr8 : SPU::LQXr8); - } else if (RC == SPU::VECREGRegisterClass) { - opc = (isValidFrameIdx) ? SPU::LQDv16i8 : SPU::LQXv16i8; - } else { + if (RC == &SPU::GPRCRegClass) + opc = isValidFrameIdx ? SPU::LQDr128 : SPU::LQXr128; + else if (RC == &SPU::R64CRegClass) + opc = isValidFrameIdx ? SPU::LQDr64 : SPU::LQXr64; + else if (RC == &SPU::R64FPRegClass) + opc = isValidFrameIdx ? SPU::LQDr64 : SPU::LQXr64; + else if (RC == &SPU::R32CRegClass) + opc = isValidFrameIdx ? SPU::LQDr32 : SPU::LQXr32; + else if (RC == &SPU::R32FPRegClass) + opc = isValidFrameIdx ? SPU::LQDr32 : SPU::LQXr32; + else if (RC == &SPU::R16CRegClass) + opc = isValidFrameIdx ? SPU::LQDr16 : SPU::LQXr16; + else if (RC == &SPU::R8CRegClass) + opc = isValidFrameIdx ? SPU::LQDr8 : SPU::LQXr8; + else if (RC == &SPU::VECREGRegClass) + opc = isValidFrameIdx ? 
SPU::LQDv16i8 : SPU::LQXv16i8; + else llvm_unreachable("Unknown regclass in loadRegFromStackSlot!"); - } DebugLoc DL; if (MI != MBB.end()) DL = MI->getDebugLoc(); @@ -340,11 +336,11 @@ SPUInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { static MachineBasicBlock::iterator findHBRPosition(MachineBasicBlock &MBB) { MachineBasicBlock::iterator J = MBB.end(); - for( int i=0; i<8; i++) { - if( J == MBB.begin() ) return J; - J--; - } - return J; + for( int i=0; i<8; i++) { + if( J == MBB.begin() ) return J; + J--; + } + return J; } unsigned @@ -360,7 +356,7 @@ SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineInstrBuilder MIB; //TODO: make a more accurate algorithm. bool haveHBR = MBB.size()>8; - + removeHBR(MBB); MCSymbol *branchLabel = MBB.getParent()->getContext().CreateTempSymbol(); // Add a label just before the branch @@ -382,7 +378,7 @@ SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MIB = BuildMI( MBB, findHBRPosition(MBB), DL, get(SPU::HBRA)); MIB.addSym(branchLabel); MIB.addMBB(TBB); - } + } } else { // Conditional branch MIB = BuildMI(&MBB, DL, get(Cond[0].getImm())); @@ -392,7 +388,7 @@ SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MIB = BuildMI(MBB, findHBRPosition(MBB), DL, get(SPU::HBRA)); MIB.addSym(branchLabel); MIB.addMBB(TBB); - } + } DEBUG(errs() << "Inserted one-way cond branch: "); DEBUG((*MIB).dump()); @@ -410,7 +406,7 @@ SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MIB = BuildMI( MBB, findHBRPosition(MBB), DL, get(SPU::HBRA)); MIB.addSym(branchLabel); MIB.addMBB(FBB); - } + } DEBUG(errs() << "Inserted conditional branch: "); DEBUG((*MIB).dump()); diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td index f76ebd7..117acd7 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.td +++ b/lib/Target/CellSPU/SPUInstrInfo.td @@ -3421,14 +3421,14 @@ let isCall = 1, // Branch relative and set link: Used if we actually know that the target // is within [-32768, 32767] bytes of the target def BRSL: - BranchSetLink<0b011001100, (outs), (ins relcalltarget:$func, variable_ops), + BranchSetLink<0b011001100, (outs), (ins relcalltarget:$func), "brsl\t$$lr, $func", [(SPUcall (SPUpcrel tglobaladdr:$func, 0))]>; // Branch absolute and set link: Used if we actually know that the target // is an absolute address def BRASL: - BranchSetLink<0b011001100, (outs), (ins calltarget:$func, variable_ops), + BranchSetLink<0b011001100, (outs), (ins calltarget:$func), "brasl\t$$lr, $func", [(SPUcall (SPUaform tglobaladdr:$func, 0))]>; diff --git a/lib/Target/CellSPU/SPURegisterInfo.cpp b/lib/Target/CellSPU/SPURegisterInfo.cpp index 1b2da5f..e6c872d 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.cpp +++ b/lib/Target/CellSPU/SPURegisterInfo.cpp @@ -193,7 +193,8 @@ SPURegisterInfo::SPURegisterInfo(const SPUSubtarget &subtarget, /// getPointerRegClass - Return the register class to use to hold pointers. /// This is used for addressing modes. const TargetRegisterClass * -SPURegisterInfo::getPointerRegClass(unsigned Kind) const { +SPURegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) + const { return &SPU::R32CRegClass; } diff --git a/lib/Target/CellSPU/SPURegisterInfo.h b/lib/Target/CellSPU/SPURegisterInfo.h index e5ab224..e9f9aba 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.h +++ b/lib/Target/CellSPU/SPURegisterInfo.h @@ -46,7 +46,7 @@ namespace llvm { /// getPointerRegClass - Return the register class to use to hold pointers. 
/// This is used for addressing modes. virtual const TargetRegisterClass * - getPointerRegClass(unsigned Kind = 0) const; + getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const; /// After allocating this many registers, the allocator should feel /// register pressure. The value is a somewhat random guess, based on the @@ -63,6 +63,11 @@ namespace llvm { virtual bool requiresRegisterScavenging(const MachineFunction &MF) const { return true; } + //! Enable tracking of liveness after register allocation, since register + // scavenging is enabled. + virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const + { return true; } + //! Return the reserved registers BitVector getReservedRegs(const MachineFunction &MF) const; diff --git a/lib/Target/CellSPU/SPUTargetMachine.cpp b/lib/Target/CellSPU/SPUTargetMachine.cpp index 21f6b25..54764f1 100644 --- a/lib/Target/CellSPU/SPUTargetMachine.cpp +++ b/lib/Target/CellSPU/SPUTargetMachine.cpp @@ -72,7 +72,7 @@ TargetPassConfig *SPUTargetMachine::createPassConfig(PassManagerBase &PM) { bool SPUPassConfig::addInstSelector() { // Install an instruction selector. - PM.add(createSPUISelDag(getSPUTargetMachine())); + addPass(createSPUISelDag(getSPUTargetMachine())); return false; } @@ -85,9 +85,9 @@ bool SPUPassConfig::addPreEmitPass() { (BuilderFunc)(intptr_t)sys::DynamicLibrary::SearchForAddressOfSymbol( "createTCESchedulerPass"); if (schedulerCreator != NULL) - PM.add(schedulerCreator("cellspu")); + addPass(schedulerCreator("cellspu")); //align instructions with nops/lnops for dual issue - PM.add(createSPUNopFillerPass(getSPUTargetMachine())); + addPass(createSPUNopFillerPass(getSPUTargetMachine())); return true; } diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp index 69f0ff8..c8e757b 100644 --- a/lib/Target/CppBackend/CPPBackend.cpp +++ b/lib/Target/CppBackend/CPPBackend.cpp @@ -130,6 +130,7 @@ namespace { private: void printLinkageType(GlobalValue::LinkageTypes LT); void printVisibilityType(GlobalValue::VisibilityTypes VisTypes); + void printThreadLocalMode(GlobalVariable::ThreadLocalMode TLM); void printCallingConv(CallingConv::ID cc); void printEscapedString(const std::string& str); void printCFP(const ConstantFP* CFP); @@ -325,6 +326,26 @@ void CppWriter::printVisibilityType(GlobalValue::VisibilityTypes VisType) { } } +void CppWriter::printThreadLocalMode(GlobalVariable::ThreadLocalMode TLM) { + switch (TLM) { + case GlobalVariable::NotThreadLocal: + Out << "GlobalVariable::NotThreadLocal"; + break; + case GlobalVariable::GeneralDynamicTLSModel: + Out << "GlobalVariable::GeneralDynamicTLSModel"; + break; + case GlobalVariable::LocalDynamicTLSModel: + Out << "GlobalVariable::LocalDynamicTLSModel"; + break; + case GlobalVariable::InitialExecTLSModel: + Out << "GlobalVariable::InitialExecTLSModel"; + break; + case GlobalVariable::LocalExecTLSModel: + Out << "GlobalVariable::LocalExecTLSModel"; + break; + } +} + // printEscapedString - Print each character of the specified string, escaping // it if it is not printable or if it is an escape char. 
void CppWriter::printEscapedString(const std::string &Str) { @@ -496,7 +517,7 @@ void CppWriter::printAttributes(const AttrListPtr &PAL, Out << "Attrs.push_back(PAWI);"; nl(Out); } - Out << name << "_PAL = AttrListPtr::get(Attrs.begin(), Attrs.end());"; + Out << name << "_PAL = AttrListPtr::get(Attrs);"; nl(Out); out(); nl(Out); Out << '}'; nl(Out); @@ -996,7 +1017,9 @@ void CppWriter::printVariableHead(const GlobalVariable *GV) { } if (GV->isThreadLocal()) { printCppName(GV); - Out << "->setThreadLocal(true);"; + Out << "->setThreadLocalMode("; + printThreadLocalMode(GV->getThreadLocalMode()); + Out << ");"; nl(Out); } if (is_inline) { @@ -1105,7 +1128,7 @@ void CppWriter::printInstruction(const Instruction *I, nl(Out); for (SwitchInst::ConstCaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) { - const ConstantInt* CaseVal = i.getCaseValue(); + const IntegersSubset CaseVal = i.getCaseValueEx(); const BasicBlock *BB = i.getCaseSuccessor(); Out << iName << "->addCase(" << getOpName(CaseVal) << ", " @@ -2078,7 +2101,9 @@ char CppWriter::ID = 0; bool CPPTargetMachine::addPassesToEmitFile(PassManagerBase &PM, formatted_raw_ostream &o, CodeGenFileType FileType, - bool DisableVerify) { + bool DisableVerify, + AnalysisID StartAfter, + AnalysisID StopAfter) { if (FileType != TargetMachine::CGFT_AssemblyFile) return true; PM.add(new CppWriter(o)); return false; diff --git a/lib/Target/CppBackend/CPPTargetMachine.h b/lib/Target/CppBackend/CPPTargetMachine.h index 92bca6c..9cbe798 100644 --- a/lib/Target/CppBackend/CPPTargetMachine.h +++ b/lib/Target/CppBackend/CPPTargetMachine.h @@ -31,7 +31,9 @@ struct CPPTargetMachine : public TargetMachine { virtual bool addPassesToEmitFile(PassManagerBase &PM, formatted_raw_ostream &Out, CodeGenFileType FileType, - bool DisableVerify); + bool DisableVerify, + AnalysisID StartAfter, + AnalysisID StopAfter); virtual const TargetData *getTargetData() const { return 0; } }; diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt index af9e813..1f2d8ac 100644 --- a/lib/Target/Hexagon/CMakeLists.txt +++ b/lib/Target/Hexagon/CMakeLists.txt @@ -28,8 +28,12 @@ add_llvm_target(HexagonCodeGen HexagonSubtarget.cpp HexagonTargetMachine.cpp HexagonTargetObjectFile.cpp + HexagonVLIWPacketizer.cpp + HexagonNewValueJump.cpp ) +add_dependencies(LLVMHexagonCodeGen intrinsics_gen) + add_subdirectory(TargetInfo) add_subdirectory(InstPrinter) add_subdirectory(MCTargetDesc) diff --git a/lib/Target/Hexagon/Hexagon.h b/lib/Target/Hexagon/Hexagon.h index 0808323..45f857b 100644 --- a/lib/Target/Hexagon/Hexagon.h +++ b/lib/Target/Hexagon/Hexagon.h @@ -40,6 +40,9 @@ namespace llvm { FunctionPass *createHexagonHardwareLoops(); FunctionPass *createHexagonPeephole(); FunctionPass *createHexagonFixupHwLoops(); + FunctionPass *createHexagonPacketizer(); + FunctionPass *createHexagonNewValueJump(); + /* TODO: object output. MCCodeEmitter *createHexagonMCCodeEmitter(const Target &, @@ -47,7 +50,8 @@ namespace llvm { MCContext &Ctx); */ /* TODO: assembler input. - TargetAsmBackend *createHexagonAsmBackend(const Target &, const std::string &); + TargetAsmBackend *createHexagonAsmBackend(const Target &, + const std::string &); */ void HexagonLowerToMC(const MachineInstr *MI, MCInst &MCI, HexagonAsmPrinter &AP); @@ -67,7 +71,7 @@ namespace llvm { // Normal instruction size (in bytes). #define HEXAGON_INSTR_SIZE 4 -// Maximum number of words in a packet (in instructions). +// Maximum number of words and instructions in a packet. 
#define HEXAGON_PACKET_SIZE 4 #endif diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td index 4a50d16..451e562 100644 --- a/lib/Target/Hexagon/Hexagon.td +++ b/lib/Target/Hexagon/Hexagon.td @@ -28,6 +28,8 @@ def ArchV3 : SubtargetFeature<"v3", "HexagonArchVersion", "V3", "Hexagon v3">; def ArchV4 : SubtargetFeature<"v4", "HexagonArchVersion", "V4", "Hexagon v4">; +def ArchV5 : SubtargetFeature<"v5", "HexagonArchVersion", "V5", + "Hexagon v5">; //===----------------------------------------------------------------------===// // Register File, Calling Conv, Instruction Descriptions @@ -45,13 +47,15 @@ def HexagonInstrInfo : InstrInfo; // Hexagon processors supported. //===----------------------------------------------------------------------===// -class Proc<string Name, ProcessorItineraries Itin, +class Proc<string Name, SchedMachineModel Model, list<SubtargetFeature> Features> - : Processor<Name, Itin, Features>; + : ProcessorModel<Name, Model, Features>; + +def : Proc<"hexagonv2", HexagonModel, [ArchV2]>; +def : Proc<"hexagonv3", HexagonModel, [ArchV2, ArchV3]>; +def : Proc<"hexagonv4", HexagonModelV4, [ArchV2, ArchV3, ArchV4]>; +def : Proc<"hexagonv5", HexagonModelV4, [ArchV2, ArchV3, ArchV4, ArchV5]>; -def : Proc<"hexagonv2", HexagonItineraries, [ArchV2]>; -def : Proc<"hexagonv3", HexagonItineraries, [ArchV2, ArchV3]>; -def : Proc<"hexagonv4", HexagonItinerariesV4, [ArchV2, ArchV3, ArchV4]>; // Hexagon Uses the MC printer for assembler output, so make sure the TableGen // AsmWriter bits get associated with the correct class. diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp index 39bf45d..5fa4740 100644 --- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -13,11 +13,11 @@ // //===----------------------------------------------------------------------===// - #define DEBUG_TYPE "asm-printer" #include "Hexagon.h" #include "HexagonAsmPrinter.h" #include "HexagonMachineFunctionInfo.h" +#include "HexagonMCInst.h" #include "HexagonTargetMachine.h" #include "HexagonSubtarget.h" #include "InstPrinter/HexagonInstPrinter.h" @@ -77,8 +77,7 @@ void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, const MachineOperand &MO = MI->getOperand(OpNo); switch (MO.getType()) { - default: - assert(0 && "<unknown operand type>"); + default: llvm_unreachable ("<unknown operand type>"); case MachineOperand::MO_Register: O << HexagonInstPrinter::getRegisterName(MO.getReg()); return; @@ -134,7 +133,9 @@ bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, if (ExtraCode[1] != 0) return true; // Unknown modifier. switch (ExtraCode[0]) { - default: return true; // Unknown modifier. + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, OS); case 'c': // Don't print "$" before a global var name or constant. // Hexagon never has a prefix. printOperand(MI, OpNo, OS); @@ -196,10 +197,45 @@ void HexagonAsmPrinter::printPredicateOperand(const MachineInstr *MI, /// the current output stream. 
/// void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) { - MCInst MCI; - - HexagonLowerToMC(MI, MCI, *this); - OutStreamer.EmitInstruction(MCI); + if (MI->isBundle()) { + std::vector<const MachineInstr*> BundleMIs; + + const MachineBasicBlock *MBB = MI->getParent(); + MachineBasicBlock::const_instr_iterator MII = MI; + ++MII; + unsigned int IgnoreCount = 0; + while (MII != MBB->end() && MII->isInsideBundle()) { + const MachineInstr *MInst = MII; + if (MInst->getOpcode() == TargetOpcode::DBG_VALUE || + MInst->getOpcode() == TargetOpcode::IMPLICIT_DEF) { + IgnoreCount++; + ++MII; + continue; + } + BundleMIs.push_back(MInst); + ++MII; + } + unsigned Size = BundleMIs.size(); + assert((Size+IgnoreCount) == MI->getBundleSize() && "Corrupt Bundle!"); + for (unsigned Index = 0; Index < Size; Index++) { + HexagonMCInst MCI; + MCI.setStartPacket(Index == 0); + MCI.setEndPacket(Index == (Size-1)); + + HexagonLowerToMC(BundleMIs[Index], MCI, *this); + OutStreamer.EmitInstruction(MCI); + } + } + else { + HexagonMCInst MCI; + if (MI->getOpcode() == Hexagon::ENDLOOP0) { + MCI.setStartPacket(true); + MCI.setEndPacket(true); + } + HexagonLowerToMC(MI, MCI, *this); + OutStreamer.EmitInstruction(MCI); + } return; } @@ -241,15 +277,15 @@ void HexagonAsmPrinter::printGlobalOperand(const MachineInstr *MI, int OpNo, void HexagonAsmPrinter::printJumpTable(const MachineInstr *MI, int OpNo, raw_ostream &O) { const MachineOperand &MO = MI->getOperand(OpNo); - assert( (MO.getType() == MachineOperand::MO_JumpTableIndex) && - "Expecting jump table index"); + assert( (MO.getType() == MachineOperand::MO_JumpTableIndex) && + "Expecting jump table index"); // Hexagon_TODO: Do we need name mangling? O << *GetJTISymbol(MO.getIndex()); } void HexagonAsmPrinter::printConstantPool(const MachineInstr *MI, int OpNo, - raw_ostream &O) { + raw_ostream &O) { const MachineOperand &MO = MI->getOperand(OpNo); assert( (MO.getType() == MachineOperand::MO_ConstantPoolIndex) && "Expecting constant pool index"); diff --git a/lib/Target/Hexagon/HexagonCallingConv.td b/lib/Target/Hexagon/HexagonCallingConv.td index bd9608b..e61b2a7 100644 --- a/lib/Target/Hexagon/HexagonCallingConv.td +++ b/lib/Target/Hexagon/HexagonCallingConv.td @@ -17,8 +17,8 @@ // Hexagon 32-bit C return-value convention. def RetCC_Hexagon32 : CallingConv<[ - CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>, - CCIfType<[i64], CCAssignToReg<[D0, D1, D2]>>, + CCIfType<[i32, f32], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>, + CCIfType<[i64, f64], CCAssignToReg<[D0, D1, D2]>>, // Alternatively, they are assigned to the stack in 4-byte aligned units. CCAssignToStack<4, 4> @@ -27,8 +27,8 @@ def RetCC_Hexagon32 : CallingConv<[ // Hexagon 32-bit C Calling convention. def CC_Hexagon32 : CallingConv<[ // All arguments get passed in integer registers if there is space. - CCIfType<[i32, i16, i8], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>, - CCIfType<[i64], CCAssignToReg<[D0, D1, D2]>>, + CCIfType<[f32, i32, i16, i8], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>, + CCIfType<[f64, i64], CCAssignToReg<[D0, D1, D2]>>, // Alternatively, they are assigned to the stack in 4-byte aligned units.
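The bundle-aware EmitInstruction above is the printer half of the new packetizer: DBG_VALUE and IMPLICIT_DEF members are skipped, and the first and last remaining instructions are tagged as packet start and end, which is what produces Hexagon's { ... } packet braces in the assembly. A minimal stand-alone sketch of the same walk, with simplified stand-ins for MachineInstr and HexagonMCInst:

#include <cassert>
#include <cstdio>
#include <vector>

enum Kind { Normal, DbgValue, ImplicitDef };
struct Insn { Kind K; const char *Text; }; // stand-in for MachineInstr

void emitBundle(const std::vector<Insn> &Bundle) {
  // Collect the printable members, counting the skipped ones like the
  // IgnoreCount bookkeeping above.
  std::vector<const Insn *> Real;
  unsigned IgnoreCount = 0;
  for (const Insn &I : Bundle) {
    if (I.K == DbgValue || I.K == ImplicitDef) { ++IgnoreCount; continue; }
    Real.push_back(&I);
  }
  assert(Real.size() + IgnoreCount == Bundle.size() && "Corrupt Bundle!");
  // First member opens the packet, last member closes it.
  for (unsigned Idx = 0, Size = Real.size(); Idx < Size; ++Idx) {
    bool Start = (Idx == 0), End = (Idx == Size - 1);
    std::printf("%s%s%s\n", Start ? "{ " : "  ", Real[Idx]->Text,
                End ? " }" : "");
  }
}

int main() {
  emitBundle({{Normal, "r0 = add(r1, r2)"},
              {DbgValue, "DBG_VALUE"},
              {Normal, "memw(r29+#0) = r0"}});
}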
CCAssignToStack<4, 4> diff --git a/lib/Target/Hexagon/HexagonCallingConvLower.cpp b/lib/Target/Hexagon/HexagonCallingConvLower.cpp index 46c20e9..ba8e679 100644 --- a/lib/Target/Hexagon/HexagonCallingConvLower.cpp +++ b/lib/Target/Hexagon/HexagonCallingConvLower.cpp @@ -56,11 +56,8 @@ void Hexagon_CCState::HandleByVal(unsigned ValNo, EVT ValVT, /// MarkAllocated - Mark a register and all of its aliases as allocated. void Hexagon_CCState::MarkAllocated(unsigned Reg) { - UsedRegs[Reg/32] |= 1 << (Reg&31); - - if (const uint16_t *RegAliases = TRI.getAliasSet(Reg)) - for (; (Reg = *RegAliases); ++RegAliases) - UsedRegs[Reg/32] |= 1 << (Reg&31); + for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI) + UsedRegs[*AI/32] |= 1 << (*AI&31); } /// AnalyzeFormalArguments - Analyze an ISD::FORMAL_ARGUMENTS node, diff --git a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp index 2100474..ae2ca37 100644 --- a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp +++ b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp @@ -7,9 +7,9 @@ // //===----------------------------------------------------------------------===// // The Hexagon processor has no instructions that load or store predicate -// registers directly. So, when these registers must be spilled a general -// purpose register must be found and the value copied to/from it from/to -// the predicate register. This code currently does not use the register +// registers directly. So, when these registers must be spilled a general +// purpose register must be found and the value copied to/from it from/to +// the predicate register. This code currently does not use the register // scavenger mechanism available in the allocator. There are two registers // reserved to allow spilling/restoring predicate registers. One is used to // hold the predicate value. The other is used when stack frame offsets are @@ -84,7 +84,7 @@ bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) { int SrcReg = MI->getOperand(2).getReg(); assert(Hexagon::PredRegsRegClass.contains(SrcReg) && "Not a predicate register"); - if (!TII->isValidOffset(Hexagon::STriw, Offset)) { + if (!TII->isValidOffset(Hexagon::STriw_indexed, Offset)) { if (!TII->isValidOffset(Hexagon::ADD_ri, Offset)) { BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::CONST32_Int_Real), @@ -95,7 +95,7 @@ bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) { BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_RsPd), HEXAGON_RESERVED_REG_2).addReg(SrcReg); BuildMI(*MBB, MII, MI->getDebugLoc(), - TII->get(Hexagon::STriw)) + TII->get(Hexagon::STriw_indexed)) .addReg(HEXAGON_RESERVED_REG_1) .addImm(0).addReg(HEXAGON_RESERVED_REG_2); } else { @@ -103,7 +103,8 @@ bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) { HEXAGON_RESERVED_REG_1).addReg(FP).addImm(Offset); BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_RsPd), HEXAGON_RESERVED_REG_2).addReg(SrcReg); - BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::STriw)) + BuildMI(*MBB, MII, MI->getDebugLoc(), + TII->get(Hexagon::STriw_indexed)) .addReg(HEXAGON_RESERVED_REG_1) .addImm(0) .addReg(HEXAGON_RESERVED_REG_2); @@ -111,7 +112,8 @@ bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) { } else { BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_RsPd), HEXAGON_RESERVED_REG_2).addReg(SrcReg); - BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::STriw)). 
+ BuildMI(*MBB, MII, MI->getDebugLoc(), + TII->get(Hexagon::STriw_indexed)). addReg(FP).addImm(Offset).addReg(HEXAGON_RESERVED_REG_2); } MII = MBB->erase(MI); diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index e8a6924..cd682df 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -209,6 +209,16 @@ bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const { FuncInfo->hasClobberLR() ); } +static inline +unsigned uniqueSuperReg(unsigned Reg, const TargetRegisterInfo *TRI) { + MCSuperRegIterator SRI(Reg, TRI); + assert(SRI.isValid() && "Expected a superreg"); + unsigned SuperReg = *SRI; + ++SRI; + assert(!SRI.isValid() && "Expected exactly one superreg"); + return SuperReg; +} + bool HexagonFrameLowering::spillCalleeSavedRegisters( MachineBasicBlock &MBB, @@ -235,26 +245,21 @@ HexagonFrameLowering::spillCalleeSavedRegisters( // // Check if we can use a double-word store. // - const uint16_t* SuperReg = TRI->getSuperRegisters(Reg); - - // Assume that there is exactly one superreg. - assert(SuperReg[0] && !SuperReg[1] && "Expected exactly one superreg"); + unsigned SuperReg = uniqueSuperReg(Reg, TRI); bool CanUseDblStore = false; const TargetRegisterClass* SuperRegClass = 0; if (ContiguousRegs && (i < CSI.size()-1)) { - const uint16_t* SuperRegNext = TRI->getSuperRegisters(CSI[i+1].getReg()); - assert(SuperRegNext[0] && !SuperRegNext[1] && - "Expected exactly one superreg"); - SuperRegClass = TRI->getMinimalPhysRegClass(SuperReg[0]); - CanUseDblStore = (SuperRegNext[0] == SuperReg[0]); + unsigned SuperRegNext = uniqueSuperReg(CSI[i+1].getReg(), TRI); + SuperRegClass = TRI->getMinimalPhysRegClass(SuperReg); + CanUseDblStore = (SuperRegNext == SuperReg); } if (CanUseDblStore) { - TII.storeRegToStackSlot(MBB, MI, SuperReg[0], true, + TII.storeRegToStackSlot(MBB, MI, SuperReg, true, CSI[i+1].getFrameIdx(), SuperRegClass, TRI); - MBB.addLiveIn(SuperReg[0]); + MBB.addLiveIn(SuperReg); ++i; } else { // Cannot use a double-word store. @@ -295,25 +300,20 @@ bool HexagonFrameLowering::restoreCalleeSavedRegisters( // // Check if we can use a double-word load. // - const uint16_t* SuperReg = TRI->getSuperRegisters(Reg); + unsigned SuperReg = uniqueSuperReg(Reg, TRI); const TargetRegisterClass* SuperRegClass = 0; - - // Assume that there is exactly one superreg. - assert(SuperReg[0] && !SuperReg[1] && "Expected exactly one superreg"); bool CanUseDblLoad = false; if (ContiguousRegs && (i < CSI.size()-1)) { - const uint16_t* SuperRegNext = TRI->getSuperRegisters(CSI[i+1].getReg()); - assert(SuperRegNext[0] && !SuperRegNext[1] && - "Expected exactly one superreg"); - SuperRegClass = TRI->getMinimalPhysRegClass(SuperReg[0]); - CanUseDblLoad = (SuperRegNext[0] == SuperReg[0]); + unsigned SuperRegNext = uniqueSuperReg(CSI[i+1].getReg(), TRI); + SuperRegClass = TRI->getMinimalPhysRegClass(SuperReg); + CanUseDblLoad = (SuperRegNext == SuperReg); } if (CanUseDblLoad) { - TII.loadRegFromStackSlot(MBB, MI, SuperReg[0], CSI[i+1].getFrameIdx(), + TII.loadRegFromStackSlot(MBB, MI, SuperReg, CSI[i+1].getFrameIdx(), SuperRegClass, TRI); - MBB.addLiveIn(SuperReg[0]); + MBB.addLiveIn(SuperReg); ++i; } else { // Cannot use a double-word load. 
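uniqueSuperReg and the CanUseDblStore/CanUseDblLoad logic above implement one idea: when two adjacent callee-saved registers share their single super-register, one double-word memory operation can spill or restore both. A toy version of the pairing decision, assuming a hypothetical numbering where r2k and r2k+1 map to the super-register dk (roughly Hexagon's R/D pairing):

#include <cstdio>
#include <vector>

// Hypothetical: each 64-bit dK covers the 32-bit pair r2K and r2K+1.
unsigned superRegOf(unsigned R) { return R / 2; }

void spillCalleeSaved(const std::vector<unsigned> &CSI) {
  bool ContiguousRegs = true; // the real pass derives this from the list
  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
    // Pair with the next register only if both halves map to the same
    // super-register; the even/odd check is this sketch's stand-in for the
    // "exactly one super-register" shape that uniqueSuperReg asserts.
    bool CanUseDblStore = ContiguousRegs && i + 1 < e &&
                          superRegOf(CSI[i + 1]) == superRegOf(CSI[i]) &&
                          CSI[i] % 2 == 0;
    if (CanUseDblStore) {
      std::printf("memd(fi) = d%u    ; covers r%u and r%u\n",
                  superRegOf(CSI[i]), CSI[i], CSI[i + 1]);
      ++i; // the pair was handled by one store
    } else {
      std::printf("memw(fi) = r%u\n", CSI[i]);
    }
  }
}

int main() { spillCalleeSaved({16, 17, 18, 20}); } // d8, then r18, then r20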
diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp index 57772a5..1357cc5 100644 --- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp +++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -491,7 +491,7 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) { TII->get(Hexagon::NEG), CountReg).addReg(CountReg1); } - // Add the Loop instruction to the begining of the loop. + // Add the Loop instruction to the beginning of the loop. BuildMI(*Preheader, InsertPos, InsertPos->getDebugLoc(), TII->get(Hexagon::LOOP0_r)).addMBB(LoopStart).addReg(CountReg); } else { @@ -623,7 +623,7 @@ void HexagonFixupHwLoops::convertLoopInstr(MachineFunction &MF, const TargetInstrInfo *TII = MF.getTarget().getInstrInfo(); MachineBasicBlock *MBB = MII->getParent(); DebugLoc DL = MII->getDebugLoc(); - unsigned Scratch = RS.scavengeRegister(Hexagon::IntRegsRegisterClass, MII, 0); + unsigned Scratch = RS.scavengeRegister(&Hexagon::IntRegsRegClass, MII, 0); // First, set the LC0 with the trip count. if (MII->getOperand(1).isReg()) { diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 9df965e..5499134 100644 --- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -90,7 +90,9 @@ public: SDNode *SelectMul(SDNode *N); SDNode *SelectZeroExtend(SDNode *N); SDNode *SelectIntrinsicWOChain(SDNode *N); + SDNode *SelectIntrinsicWChain(SDNode *N); SDNode *SelectConstant(SDNode *N); + SDNode *SelectConstantFP(SDNode *N); SDNode *SelectAdd(SDNode *N); // Include the pieces autogenerated from the target description. @@ -318,7 +320,7 @@ SDNode *HexagonDAGToDAGISel::SelectBaseOffsetLoad(LoadSDNode *LD, DebugLoc dl) { else if (LoadedVT == MVT::i32) Opcode = Hexagon::LDriw_indexed; else if (LoadedVT == MVT::i16) Opcode = Hexagon::LDrih_indexed; else if (LoadedVT == MVT::i8) Opcode = Hexagon::LDrib_indexed; - else assert (0 && "unknown memory type"); + else llvm_unreachable("unknown memory type"); // Build indexed load. SDValue TargetConstOff = CurDAG->getTargetConstant(Offset, PointerTy); @@ -375,7 +377,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadSignExtend64(LoadSDNode *LD, }; ReplaceUses(Froms, Tos, 3); return Result_2; - } + } SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32); SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32); SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, @@ -516,7 +518,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, DebugLoc dl) { else Opcode = zextval ? Hexagon::LDriub : Hexagon::LDrib; } else - assert (0 && "unknown memory type"); + llvm_unreachable("unknown memory type"); // For zero ext i64 loads, we need to add combine instructions. if (LD->getValueType(0) == MVT::i64 && @@ -613,7 +615,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, DebugLoc dl) { else if (StoredVT == MVT::i32) Opcode = Hexagon::POST_STwri; else if (StoredVT == MVT::i16) Opcode = Hexagon::POST_SThri; else if (StoredVT == MVT::i8) Opcode = Hexagon::POST_STbri; - else assert (0 && "unknown memory type"); + else llvm_unreachable("unknown memory type"); // Build post increment store. SDNode* Result = CurDAG->getMachineNode(Opcode, dl, MVT::i32, @@ -636,10 +638,10 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, DebugLoc dl) { // Figure out the opcode. 
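The assert(0 && ...) to llvm_unreachable(...) conversions in these selection routines (including the opcode switch that continues below) are not cosmetic: a plain assert compiles away under -DNDEBUG, after which control falls off the end of the function, which is undefined behaviour. llvm_unreachable still traps in release builds and doubles as an optimizer hint. A self-contained sketch, with a local noreturn helper standing in for the LLVM macro:

#include <cstdio>
#include <cstdlib>

// Stand-in for llvm_unreachable: always aborts, in every build mode.
[[noreturn]] static void unreachable(const char *Msg) {
  std::fprintf(stderr, "UNREACHABLE: %s\n", Msg);
  std::abort();
}

static const char *storeOpcodeFor(unsigned MemBytes) {
  switch (MemBytes) {
  case 8: return "STrid";
  case 4: return "STriw";
  case 2: return "STrih";
  case 1: return "STrib";
  }
  // Was: assert(0 && "unknown memory type"); with NDEBUG that line vanishes
  // and the function would return garbage. This version cannot.
  unreachable("unknown memory type");
}

int main() { std::printf("%s\n", storeOpcodeFor(4)); }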
if (StoredVT == MVT::i64) Opcode = Hexagon::STrid; - else if (StoredVT == MVT::i32) Opcode = Hexagon::STriw; + else if (StoredVT == MVT::i32) Opcode = Hexagon::STriw_indexed; else if (StoredVT == MVT::i16) Opcode = Hexagon::STrih; else if (StoredVT == MVT::i8) Opcode = Hexagon::STrib; - else assert (0 && "unknown memory type"); + else llvm_unreachable("unknown memory type"); // Build regular store. SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32); @@ -693,7 +695,7 @@ SDNode *HexagonDAGToDAGISel::SelectBaseOffsetStore(StoreSDNode *ST, else if (StoredVT == MVT::i32) Opcode = Hexagon::STriw_indexed; else if (StoredVT == MVT::i16) Opcode = Hexagon::STrih_indexed; else if (StoredVT == MVT::i8) Opcode = Hexagon::STrib_indexed; - else assert (0 && "unknown memory type"); + else llvm_unreachable("unknown memory type"); SDValue Ops[] = {SDValue(NewBase,0), CurDAG->getTargetConstant(Offset,PointerTy), @@ -723,7 +725,7 @@ SDNode *HexagonDAGToDAGISel::SelectStore(SDNode *N) { if (AM != ISD::UNINDEXED) { return SelectIndexedStore(ST, dl); } - + return SelectBaseOffsetStore(ST, dl); } @@ -752,7 +754,7 @@ SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) { if (MulOp0.getOpcode() == ISD::SIGN_EXTEND) { SDValue Sext0 = MulOp0.getOperand(0); if (Sext0.getNode()->getValueType(0) != MVT::i32) { - SelectCode(N); + return SelectCode(N); } OP0 = Sext0; @@ -761,7 +763,7 @@ SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) { if (LD->getMemoryVT() != MVT::i32 || LD->getExtensionType() != ISD::SEXTLOAD || LD->getAddressingMode() != ISD::UNINDEXED) { - SelectCode(N); + return SelectCode(N); } SDValue Chain = LD->getChain(); @@ -1128,12 +1130,12 @@ SDNode *HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) { // For immediates, lower it. for (unsigned i = 1; i < N->getNumOperands(); ++i) { SDNode *Arg = N->getOperand(i).getNode(); - const TargetRegisterClass *RC = TII->getRegClass(MCID, i, TRI); + const TargetRegisterClass *RC = TII->getRegClass(MCID, i, TRI, *MF); - if (RC == Hexagon::IntRegsRegisterClass || - RC == Hexagon::DoubleRegsRegisterClass) { + if (RC == &Hexagon::IntRegsRegClass || + RC == &Hexagon::DoubleRegsRegClass) { Ops.push_back(SDValue(Arg, 0)); - } else if (RC == Hexagon::PredRegsRegisterClass) { + } else if (RC == &Hexagon::PredRegsRegClass) { // Do the transfer. SDNode *PdRs = CurDAG->getMachineNode(Hexagon::TFR_PdRs, dl, MVT::i1, SDValue(Arg, 0)); @@ -1158,6 +1160,25 @@ SDNode *HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) { return SelectCode(N); } +// +// Map floating point constant values. +// +SDNode *HexagonDAGToDAGISel::SelectConstantFP(SDNode *N) { + DebugLoc dl = N->getDebugLoc(); + ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N); + APFloat APF = CN->getValueAPF(); + if (N->getValueType(0) == MVT::f32) { + return CurDAG->getMachineNode(Hexagon::TFRI_f, dl, MVT::f32, + CurDAG->getTargetConstantFP(APF.convertToFloat(), MVT::f32)); + } + else if (N->getValueType(0) == MVT::f64) { + return CurDAG->getMachineNode(Hexagon::CONST64_Float_Real, dl, MVT::f64, + CurDAG->getTargetConstantFP(APF.convertToDouble(), MVT::f64)); + } + + return SelectCode(N); +} + // // Map predicate true (encoded as -1 in LLVM) to a XOR. @@ -1215,7 +1236,7 @@ SDNode *HexagonDAGToDAGISel::SelectAdd(SDNode *N) { // Build Rd = Rd' + asr(Rs, Rt). 
The machine constraints will ensure that // Rd and Rd' are assigned to the same register - SDNode* Result = CurDAG->getMachineNode(Hexagon::ASR_rr_acc, dl, MVT::i32, + SDNode* Result = CurDAG->getMachineNode(Hexagon::ASR_ADD_rr, dl, MVT::i32, N->getOperand(1), Src1->getOperand(0), Src1->getOperand(1)); @@ -1234,6 +1255,9 @@ SDNode *HexagonDAGToDAGISel::Select(SDNode *N) { case ISD::Constant: return SelectConstant(N); + case ISD::ConstantFP: + return SelectConstantFP(N); + case ISD::ADD: return SelectAdd(N); diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index d6da0d0..703a128 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -32,9 +32,11 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" + using namespace llvm; const unsigned Hexagon_MAX_RET_SIZE = 64; @@ -101,12 +103,12 @@ CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT, State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); return false; } - if (LocVT == MVT::i32) { + if (LocVT == MVT::i32 || LocVT == MVT::f32) { ofst = State.AllocateStack(4, 4); State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); return false; } - if (LocVT == MVT::i64) { + if (LocVT == MVT::i64 || LocVT == MVT::f64) { ofst = State.AllocateStack(8, 8); State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); return false; @@ -140,12 +142,12 @@ CC_Hexagon (unsigned ValNo, MVT ValVT, LocInfo = CCValAssign::AExt; } - if (LocVT == MVT::i32) { + if (LocVT == MVT::i32 || LocVT == MVT::f32) { if (!CC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) return false; } - if (LocVT == MVT::i64) { + if (LocVT == MVT::i64 || LocVT == MVT::f64) { if (!CC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) return false; } @@ -215,12 +217,12 @@ static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT, LocInfo = CCValAssign::AExt; } - if (LocVT == MVT::i32) { + if (LocVT == MVT::i32 || LocVT == MVT::f32) { if (!RetCC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) return false; } - if (LocVT == MVT::i64) { + if (LocVT == MVT::i64 || LocVT == MVT::f64) { if (!RetCC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) return false; } @@ -232,7 +234,7 @@ static bool RetCC_Hexagon32(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { - if (LocVT == MVT::i32) { + if (LocVT == MVT::i32 || LocVT == MVT::f32) { if (unsigned Reg = State.AllocateReg(Hexagon::R0)) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; @@ -247,7 +249,7 @@ static bool RetCC_Hexagon32(unsigned ValNo, MVT ValVT, static bool RetCC_Hexagon64(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { - if (LocVT == MVT::i64) { + if (LocVT == MVT::i64 || LocVT == MVT::f64) { if (unsigned Reg = State.AllocateReg(Hexagon::D0)) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; @@ -297,7 +299,7 @@ HexagonTargetLowering::LowerReturn(SDValue Chain, // CCState - Info about the registers and stack slot. 
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); // Analyze return values of ISD::RET CCInfo.AnalyzeReturn(Outs, RetCC_Hexagon); @@ -349,7 +351,7 @@ HexagonTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_Hexagon); @@ -368,21 +370,25 @@ HexagonTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, /// LowerCall - Functions arguments are copied from virtual regs to /// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted. SDValue -HexagonTargetLowering::LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, +HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool isVarArg = CLI.IsVarArg; bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); // Check for varargs. NumNamedVarArgParams = -1; @@ -502,7 +508,7 @@ HexagonTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Build a sequence of copy-to-reg nodes chained together with token // chain and flag operands which copy the outgoing args into registers. - // The InFlag in necessary since all emited instructions must be + // The InFlag is necessary since all emitted instructions must be // stuck together. SDValue InFlag; if (!isTailCall) { @@ -522,7 +528,7 @@ HexagonTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // than necessary, because it means that each store effectively depends // on every argument instead of just those arguments it would clobber. // - // Do not flag preceeding copytoreg stuff together with the following stuff. + // Do not flag preceding copytoreg stuff together with the following stuff. InFlag = SDValue(); for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, @@ -811,7 +817,7 @@ const { // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon); @@ -837,14 +843,15 @@ const { // 1. int, long long, ptr args that get allocated in register. // 2. Large struct that gets a register to put its address in.
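The LowerCall rewrite above is a parameter-object refactor: the long positional signature collapses into TargetLowering::CallLoweringInfo, and the body re-derives its old locals as references into the struct, so a later addition of a field (the way doesNotRet was bolted on earlier) no longer has to touch every target's override. The shape of the change, sketched with placeholder types in place of the SelectionDAG ones:

#include <string>
#include <vector>

// Placeholder for TargetLowering::CallLoweringInfo; the field names follow
// the hunk above, the types do not.
struct CallLoweringInfo {
  std::string Callee;
  std::vector<int> Outs;   // outgoing argument stand-ins
  std::vector<int> Ins;    // incoming result stand-ins
  bool IsVarArg = false;
  bool IsTailCall = true;  // callees may clear this, hence the reference below
};

// Before: lowerCall(Chain, Callee, CallConv, isVarArg, doesNotRet,
//                   isTailCall, Outs, OutVals, Ins, dl, DAG, InVals)
// After: one state object plus the output vector.
int lowerCall(CallLoweringInfo &CLI, std::vector<int> &InVals) {
  bool &IsTailCall = CLI.IsTailCall;  // unpack by reference, as above
  if (!CLI.Outs.empty())
    IsTailCall = false;               // e.g. give up on tail-calling
  InVals.assign(CLI.Ins.begin(), CLI.Ins.end());
  return 0;
}

int main() {
  CallLoweringInfo CLI;
  CLI.Callee = "memcpy";
  CLI.Ins = {1, 2};
  std::vector<int> InVals;
  return lowerCall(CLI, InVals);
}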
EVT RegVT = VA.getLocVT(); - if (RegVT == MVT::i8 || RegVT == MVT::i16 || RegVT == MVT::i32) { + if (RegVT == MVT::i8 || RegVT == MVT::i16 || + RegVT == MVT::i32 || RegVT == MVT::f32) { unsigned VReg = - RegInfo.createVirtualRegister(Hexagon::IntRegsRegisterClass); + RegInfo.createVirtualRegister(&Hexagon::IntRegsRegClass); RegInfo.addLiveIn(VA.getLocReg(), VReg); InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); } else if (RegVT == MVT::i64) { unsigned VReg = - RegInfo.createVirtualRegister(Hexagon::DoubleRegsRegisterClass); + RegInfo.createVirtualRegister(&Hexagon::DoubleRegsRegClass); RegInfo.addLiveIn(VA.getLocReg(), VReg); InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); } else { @@ -916,14 +923,33 @@ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { SDValue HexagonTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue CC = Op.getOperand(4); + SDValue TrueVal = Op.getOperand(2); + SDValue FalseVal = Op.getOperand(3); + DebugLoc dl = Op.getDebugLoc(); SDNode* OpNode = Op.getNode(); + EVT SVT = OpNode->getValueType(0); - SDValue Cond = DAG.getNode(ISD::SETCC, Op.getDebugLoc(), MVT::i1, - Op.getOperand(2), Op.getOperand(3), - Op.getOperand(4)); - return DAG.getNode(ISD::SELECT, Op.getDebugLoc(), OpNode->getValueType(0), - Cond, Op.getOperand(0), - Op.getOperand(1)); + SDValue Cond = DAG.getNode(ISD::SETCC, dl, MVT::i1, LHS, RHS, CC); + return DAG.getNode(ISD::SELECT, dl, SVT, Cond, TrueVal, FalseVal); +} + +SDValue +HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { + EVT ValTy = Op.getValueType(); + + DebugLoc dl = Op.getDebugLoc(); + ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); + SDValue Res; + if (CP->isMachineConstantPoolEntry()) + Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), ValTy, + CP->getAlignment()); + else + Res = DAG.getTargetConstantPool(CP->getConstVal(), ValTy, + CP->getAlignment()); + return DAG.getNode(HexagonISD::CONST32, dl, ValTy, Res); } SDValue @@ -1008,11 +1034,18 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine : TargetLowering(targetmachine, new HexagonTargetObjectFile()), TM(targetmachine) { + const HexagonRegisterInfo* QRI = TM.getRegisterInfo(); + // Set up the register classes. 
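A pattern running through these hunks: f32 values live in the same 32-bit IntRegs registers as i32, and f64 shares DoubleRegs with i64, both in the formal-argument code above and in the addRegisterClass calls that follow. A register class fixes a width, not an interpretation; in portable C++ the equivalent reinterpretation is a memcpy bitcast:

#include <cstdint>
#include <cstdio>
#include <cstring>

// The same 32 bits, viewed as a float or as the integer a GPR would hold.
static uint32_t asBits(float F) {
  uint32_t R;
  std::memcpy(&R, &F, sizeof R);
  return R;
}
static float asFloat(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof F);
  return F;
}

int main() {
  uint32_t InR0 = asBits(1.5f);          // what the register ends up holding
  std::printf("r0   = 0x%08x\n", InR0);  // 0x3fc00000
  std::printf("back = %g\n", asFloat(InR0));
}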
- addRegisterClass(MVT::i32, Hexagon::IntRegsRegisterClass); - addRegisterClass(MVT::i64, Hexagon::DoubleRegsRegisterClass); + addRegisterClass(MVT::i32, &Hexagon::IntRegsRegClass); + addRegisterClass(MVT::i64, &Hexagon::DoubleRegsRegClass); + + if (QRI->Subtarget.hasV5TOps()) { + addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass); + addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass); + } - addRegisterClass(MVT::i1, Hexagon::PredRegsRegisterClass); + addRegisterClass(MVT::i1, &Hexagon::PredRegsRegClass); computeRegisterProperties(); @@ -1026,32 +1059,16 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine // // Library calls for unsupported operations // - setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2"); - setLibcallName(RTLIB::SINTTOFP_I64_F64, "__hexagon_floatdidf"); setLibcallName(RTLIB::SINTTOFP_I128_F64, "__hexagon_floattidf"); setLibcallName(RTLIB::SINTTOFP_I128_F32, "__hexagon_floattisf"); - setLibcallName(RTLIB::UINTTOFP_I32_F32, "__hexagon_floatunsisf"); - setLibcallName(RTLIB::UINTTOFP_I64_F32, "__hexagon_floatundisf"); - setLibcallName(RTLIB::SINTTOFP_I64_F32, "__hexagon_floatdisf"); - setLibcallName(RTLIB::UINTTOFP_I64_F64, "__hexagon_floatundidf"); - setLibcallName(RTLIB::FPTOUINT_F32_I32, "__hexagon_fixunssfsi"); - setLibcallName(RTLIB::FPTOUINT_F32_I64, "__hexagon_fixunssfdi"); setLibcallName(RTLIB::FPTOUINT_F32_I128, "__hexagon_fixunssfti"); - - setLibcallName(RTLIB::FPTOUINT_F64_I32, "__hexagon_fixunsdfsi"); - setLibcallName(RTLIB::FPTOUINT_F64_I64, "__hexagon_fixunsdfdi"); setLibcallName(RTLIB::FPTOUINT_F64_I128, "__hexagon_fixunsdfti"); - setLibcallName(RTLIB::UINTTOFP_I32_F64, "__hexagon_floatunsidf"); - setLibcallName(RTLIB::FPTOSINT_F32_I64, "__hexagon_fixsfdi"); setLibcallName(RTLIB::FPTOSINT_F32_I128, "__hexagon_fixsfti"); - setLibcallName(RTLIB::FPTOSINT_F64_I64, "__hexagon_fixdfdi"); setLibcallName(RTLIB::FPTOSINT_F64_I128, "__hexagon_fixdfti"); - setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2"); - setLibcallName(RTLIB::SDIV_I32, "__hexagon_divsi3"); setOperationAction(ISD::SDIV, MVT::i32, Expand); setLibcallName(RTLIB::SREM_I32, "__hexagon_umodsi3"); @@ -1080,92 +1097,184 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine setLibcallName(RTLIB::DIV_F64, "__hexagon_divdf3"); setOperationAction(ISD::FDIV, MVT::f64, Expand); - setLibcallName(RTLIB::FPEXT_F32_F64, "__hexagon_extendsfdf2"); - setOperationAction(ISD::FP_EXTEND, MVT::f32, Expand); + setOperationAction(ISD::FSQRT, MVT::f32, Expand); + setOperationAction(ISD::FSQRT, MVT::f64, Expand); + setOperationAction(ISD::FSIN, MVT::f32, Expand); + setOperationAction(ISD::FSIN, MVT::f64, Expand); + + if (QRI->Subtarget.hasV5TOps()) { + // Hexagon V5 Support. 
+ setOperationAction(ISD::FADD, MVT::f32, Legal); + setOperationAction(ISD::FADD, MVT::f64, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal); + setCondCodeAction(ISD::SETOEQ, MVT::f32, Legal); + setCondCodeAction(ISD::SETOEQ, MVT::f64, Legal); + setCondCodeAction(ISD::SETUEQ, MVT::f32, Legal); + setCondCodeAction(ISD::SETUEQ, MVT::f64, Legal); + + setCondCodeAction(ISD::SETOGE, MVT::f32, Legal); + setCondCodeAction(ISD::SETOGE, MVT::f64, Legal); + setCondCodeAction(ISD::SETUGE, MVT::f32, Legal); + setCondCodeAction(ISD::SETUGE, MVT::f64, Legal); + + setCondCodeAction(ISD::SETOGT, MVT::f32, Legal); + setCondCodeAction(ISD::SETOGT, MVT::f64, Legal); + setCondCodeAction(ISD::SETUGT, MVT::f32, Legal); + setCondCodeAction(ISD::SETUGT, MVT::f64, Legal); + + setCondCodeAction(ISD::SETOLE, MVT::f32, Legal); + setCondCodeAction(ISD::SETOLE, MVT::f64, Legal); + setCondCodeAction(ISD::SETOLT, MVT::f32, Legal); + setCondCodeAction(ISD::SETOLT, MVT::f64, Legal); + + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + + setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote); + + setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote); + + setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote); + + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); + + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal); + + setOperationAction(ISD::FABS, MVT::f32, Legal); + setOperationAction(ISD::FABS, MVT::f64, Expand); + + setOperationAction(ISD::FNEG, MVT::f32, Legal); + setOperationAction(ISD::FNEG, MVT::f64, Expand); + } else { - setLibcallName(RTLIB::SINTTOFP_I32_F32, "__hexagon_floatsisf"); - setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand); + // Expand fp<->uint. 
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); - setLibcallName(RTLIB::ADD_F64, "__hexagon_adddf3"); - setOperationAction(ISD::FADD, MVT::f64, Expand); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); - setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3"); - setOperationAction(ISD::FADD, MVT::f32, Expand); + setLibcallName(RTLIB::SINTTOFP_I64_F32, "__hexagon_floatdisf"); + setLibcallName(RTLIB::UINTTOFP_I64_F32, "__hexagon_floatundisf"); - setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3"); - setOperationAction(ISD::FADD, MVT::f32, Expand); + setLibcallName(RTLIB::UINTTOFP_I32_F32, "__hexagon_floatunsisf"); + setLibcallName(RTLIB::SINTTOFP_I32_F32, "__hexagon_floatsisf"); - setLibcallName(RTLIB::OEQ_F32, "__hexagon_eqsf2"); - setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); + setLibcallName(RTLIB::SINTTOFP_I64_F64, "__hexagon_floatdidf"); + setLibcallName(RTLIB::UINTTOFP_I64_F64, "__hexagon_floatundidf"); - setLibcallName(RTLIB::FPTOSINT_F64_I32, "__hexagon_fixdfsi"); - setOperationAction(ISD::FP_TO_SINT, MVT::f64, Expand); + setLibcallName(RTLIB::UINTTOFP_I32_F64, "__hexagon_floatunsidf"); + setLibcallName(RTLIB::SINTTOFP_I32_F64, "__hexagon_floatsidf"); - setLibcallName(RTLIB::FPTOSINT_F32_I32, "__hexagon_fixsfsi"); - setOperationAction(ISD::FP_TO_SINT, MVT::f32, Expand); + setLibcallName(RTLIB::FPTOUINT_F32_I32, "__hexagon_fixunssfsi"); + setLibcallName(RTLIB::FPTOUINT_F32_I64, "__hexagon_fixunssfdi"); - setLibcallName(RTLIB::SINTTOFP_I32_F64, "__hexagon_floatsidf"); - setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand); + setLibcallName(RTLIB::FPTOSINT_F64_I64, "__hexagon_fixdfdi"); + setLibcallName(RTLIB::FPTOSINT_F32_I64, "__hexagon_fixsfdi"); - setLibcallName(RTLIB::OGE_F64, "__hexagon_gedf2"); - setCondCodeAction(ISD::SETOGE, MVT::f64, Expand); + setLibcallName(RTLIB::FPTOUINT_F64_I32, "__hexagon_fixunsdfsi"); + setLibcallName(RTLIB::FPTOUINT_F64_I64, "__hexagon_fixunsdfdi"); - setLibcallName(RTLIB::OGE_F32, "__hexagon_gesf2"); - setCondCodeAction(ISD::SETOGE, MVT::f32, Expand); + setLibcallName(RTLIB::ADD_F64, "__hexagon_adddf3"); + setOperationAction(ISD::FADD, MVT::f64, Expand); - setLibcallName(RTLIB::OGT_F32, "__hexagon_gtsf2"); - setCondCodeAction(ISD::SETOGT, MVT::f32, Expand); + setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3"); + setOperationAction(ISD::FADD, MVT::f32, Expand); - setLibcallName(RTLIB::OLE_F64, "__hexagon_ledf2"); - setCondCodeAction(ISD::SETOLE, MVT::f64, Expand); + setLibcallName(RTLIB::FPEXT_F32_F64, "__hexagon_extendsfdf2"); + setOperationAction(ISD::FP_EXTEND, MVT::f32, Expand); - setLibcallName(RTLIB::OLE_F32, "__hexagon_lesf2"); - setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); + setLibcallName(RTLIB::OEQ_F32, "__hexagon_eqsf2"); + setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); - setLibcallName(RTLIB::OLT_F64, "__hexagon_ltdf2"); - setCondCodeAction(ISD::SETOLT, MVT::f64, Expand); + setLibcallName(RTLIB::OEQ_F64, "__hexagon_eqdf2"); + setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); - setLibcallName(RTLIB::OLT_F32, "__hexagon_ltsf2"); - setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); + setLibcallName(RTLIB::OGE_F32, "__hexagon_gesf2"); + setCondCodeAction(ISD::SETOGE, MVT::f32, Expand); - setLibcallName(RTLIB::SREM_I32, "__hexagon_modsi3"); - setOperationAction(ISD::SREM, MVT::i32, Expand); + setLibcallName(RTLIB::OGE_F64, "__hexagon_gedf2"); + setCondCodeAction(ISD::SETOGE, MVT::f64, Expand); + + 
setLibcallName(RTLIB::OGT_F32, "__hexagon_gtsf2"); + setCondCodeAction(ISD::SETOGT, MVT::f32, Expand); + + setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2"); + setCondCodeAction(ISD::SETOGT, MVT::f64, Expand); - setLibcallName(RTLIB::MUL_F64, "__hexagon_muldf3"); - setOperationAction(ISD::FMUL, MVT::f64, Expand); + setLibcallName(RTLIB::FPTOSINT_F64_I32, "__hexagon_fixdfsi"); + setOperationAction(ISD::FP_TO_SINT, MVT::f64, Expand); - setLibcallName(RTLIB::MUL_F32, "__hexagon_mulsf3"); - setOperationAction(ISD::MUL, MVT::f32, Expand); + setLibcallName(RTLIB::FPTOSINT_F32_I32, "__hexagon_fixsfsi"); + setOperationAction(ISD::FP_TO_SINT, MVT::f32, Expand); - setLibcallName(RTLIB::UNE_F64, "__hexagon_nedf2"); - setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); + setLibcallName(RTLIB::OLE_F64, "__hexagon_ledf2"); + setCondCodeAction(ISD::SETOLE, MVT::f64, Expand); - setLibcallName(RTLIB::UNE_F32, "__hexagon_nesf2"); + setLibcallName(RTLIB::OLE_F32, "__hexagon_lesf2"); + setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); + setLibcallName(RTLIB::OLT_F64, "__hexagon_ltdf2"); + setCondCodeAction(ISD::SETOLT, MVT::f64, Expand); - setLibcallName(RTLIB::SUB_F64, "__hexagon_subdf3"); - setOperationAction(ISD::SUB, MVT::f64, Expand); + setLibcallName(RTLIB::OLT_F32, "__hexagon_ltsf2"); + setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); - setLibcallName(RTLIB::SUB_F32, "__hexagon_subsf3"); - setOperationAction(ISD::SUB, MVT::f32, Expand); + setLibcallName(RTLIB::MUL_F64, "__hexagon_muldf3"); + setOperationAction(ISD::FMUL, MVT::f64, Expand); - setLibcallName(RTLIB::FPROUND_F64_F32, "__hexagon_truncdfsf2"); - setOperationAction(ISD::FP_ROUND, MVT::f64, Expand); + setLibcallName(RTLIB::MUL_F32, "__hexagon_mulsf3"); + setOperationAction(ISD::MUL, MVT::f32, Expand); - setLibcallName(RTLIB::UO_F64, "__hexagon_unorddf2"); - setCondCodeAction(ISD::SETUO, MVT::f64, Expand); + setLibcallName(RTLIB::UNE_F64, "__hexagon_nedf2"); + setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); - setLibcallName(RTLIB::O_F64, "__hexagon_unorddf2"); - setCondCodeAction(ISD::SETO, MVT::f64, Expand); + setLibcallName(RTLIB::UNE_F32, "__hexagon_nesf2"); - setLibcallName(RTLIB::OEQ_F64, "__hexagon_eqdf2"); - setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); + setLibcallName(RTLIB::SUB_F64, "__hexagon_subdf3"); + setOperationAction(ISD::SUB, MVT::f64, Expand); - setLibcallName(RTLIB::O_F32, "__hexagon_unordsf2"); - setCondCodeAction(ISD::SETO, MVT::f32, Expand); + setLibcallName(RTLIB::SUB_F32, "__hexagon_subsf3"); + setOperationAction(ISD::SUB, MVT::f32, Expand); - setLibcallName(RTLIB::UO_F32, "__hexagon_unordsf2"); - setCondCodeAction(ISD::SETUO, MVT::f32, Expand); + setLibcallName(RTLIB::FPROUND_F64_F32, "__hexagon_truncdfsf2"); + setOperationAction(ISD::FP_ROUND, MVT::f64, Expand); + + setLibcallName(RTLIB::UO_F64, "__hexagon_unorddf2"); + setCondCodeAction(ISD::SETUO, MVT::f64, Expand); + + setLibcallName(RTLIB::O_F64, "__hexagon_unorddf2"); + setCondCodeAction(ISD::SETO, MVT::f64, Expand); + + setLibcallName(RTLIB::O_F32, "__hexagon_unordsf2"); + setCondCodeAction(ISD::SETO, MVT::f32, Expand); + + setLibcallName(RTLIB::UO_F32, "__hexagon_unordsf2"); + setCondCodeAction(ISD::SETUO, MVT::f32, Expand); + + setOperationAction(ISD::FABS, MVT::f32, Expand); + setOperationAction(ISD::FABS, MVT::f64, Expand); + setOperationAction(ISD::FNEG, MVT::f32, Expand); + setOperationAction(ISD::FNEG, MVT::f64, Expand); + } + + setLibcallName(RTLIB::SREM_I32, "__hexagon_modsi3"); + setOperationAction(ISD::SREM, MVT::i32, Expand); 
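Taken together, the constructor's if/else gives the legalizer one of three answers per operation/type pair: Legal where V5 has a real instruction, Promote for the i1/i8/i16 conversions, and Expand, which on pre-V5 parts becomes a call to one of the __hexagon_* soft-float routines named above. What Promote amounts to for a narrow conversion, in plain C++ (the function name is invented):

#include <cstdint>
#include <cstdio>

// FP_TO_UINT on i8 is "Promoted": the conversion is performed at the next
// legal width (i32, which V5 can select directly) and then truncated.
static uint8_t fpToU8Promoted(float F) {
  uint32_t Wide = (uint32_t)F;  // the legal i32 conversion
  return (uint8_t)Wide;         // truncate to the width that was asked for
}

int main() {
  std::printf("%d\n", fpToU8Promoted(200.0f)); // 200
  std::printf("%d\n", fpToU8Promoted(42.9f));  // 42
}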
setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal); setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal); @@ -1206,20 +1315,33 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine setOperationAction(ISD::BSWAP, MVT::i64, Expand); - // Expand fp<->uint. - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); - - // Hexagon has no select or setcc: expand to SELECT_CC. - setOperationAction(ISD::SELECT, MVT::f32, Expand); - setOperationAction(ISD::SELECT, MVT::f64, Expand); - // Lower SELECT_CC to SETCC and SELECT. setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); - // This is a workaround documented in DAGCombiner.cpp:2892 We don't - // support SELECT_CC on every type. - setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + + if (QRI->Subtarget.hasV5TOps()) { + + // Mark SELECT as Custom for f32/f64 so that we do not fall into + // the infinite legalization cycle of + // select -> setcc -> select_cc -> select. + setOperationAction(ISD::SELECT, MVT::f32, Custom); + setOperationAction(ISD::SELECT, MVT::f64, Custom); + + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + + } else { + + // Hexagon has no select or setcc: expand to SELECT_CC. + setOperationAction(ISD::SELECT, MVT::f32, Expand); + setOperationAction(ISD::SELECT, MVT::f64, Expand); + + // This is a workaround documented in DAGCombiner.cpp:2892 We don't + // support SELECT_CC on every type. + setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + + } setOperationAction(ISD::BR_CC, MVT::Other, Expand); setOperationAction(ISD::BRIND, MVT::Other, Expand); @@ -1305,22 +1427,22 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { default: return 0; - case HexagonISD::CONST32: return "HexagonISD::CONST32"; + case HexagonISD::CONST32: return "HexagonISD::CONST32"; case HexagonISD::ADJDYNALLOC: return "HexagonISD::ADJDYNALLOC"; - case HexagonISD::CMPICC: return "HexagonISD::CMPICC"; - case HexagonISD::CMPFCC: return "HexagonISD::CMPFCC"; - case HexagonISD::BRICC: return "HexagonISD::BRICC"; - case HexagonISD::BRFCC: return "HexagonISD::BRFCC"; - case HexagonISD::SELECT_ICC: return "HexagonISD::SELECT_ICC"; - case HexagonISD::SELECT_FCC: return "HexagonISD::SELECT_FCC"; - case HexagonISD::Hi: return "HexagonISD::Hi"; - case HexagonISD::Lo: return "HexagonISD::Lo"; - case HexagonISD::FTOI: return "HexagonISD::FTOI"; - case HexagonISD::ITOF: return "HexagonISD::ITOF"; - case HexagonISD::CALL: return "HexagonISD::CALL"; - case HexagonISD::RET_FLAG: return "HexagonISD::RET_FLAG"; - case HexagonISD::BR_JT: return "HexagonISD::BR_JT"; - case HexagonISD::TC_RETURN: return "HexagonISD::TC_RETURN"; + case HexagonISD::CMPICC: return "HexagonISD::CMPICC"; + case HexagonISD::CMPFCC: return "HexagonISD::CMPFCC"; + case HexagonISD::BRICC: return "HexagonISD::BRICC"; + case HexagonISD::BRFCC: return "HexagonISD::BRFCC"; + case HexagonISD::SELECT_ICC: return "HexagonISD::SELECT_ICC"; + case HexagonISD::SELECT_FCC: return "HexagonISD::SELECT_FCC"; + case HexagonISD::Hi: return "HexagonISD::Hi"; + case HexagonISD::Lo: return "HexagonISD::Lo"; + case HexagonISD::FTOI: return "HexagonISD::FTOI"; + case HexagonISD::ITOF: return "HexagonISD::ITOF"; + case HexagonISD::CALL: return "HexagonISD::CALL"; + case HexagonISD::RET_FLAG: return
"HexagonISD::RET_FLAG"; + case HexagonISD::BR_JT: return "HexagonISD::BR_JT"; + case HexagonISD::TC_RETURN: return "HexagonISD::TC_RETURN"; } } @@ -1345,9 +1467,10 @@ SDValue HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Should not custom lower this!"); + case ISD::ConstantPool: return LowerConstantPool(Op, DAG); // Frame & Return address. Currently unimplemented. - case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); - case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); + case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); + case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::GlobalTLSAddress: llvm_unreachable("TLS not implemented for Hexagon."); case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, DAG); @@ -1357,9 +1480,10 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BR_JT: return LowerBR_JT(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); - case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::SELECT: return Op; case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); - case ISD::INLINEASM: return LowerINLINEASM(Op, DAG); + case ISD::INLINEASM: return LowerINLINEASM(Op, DAG); } } @@ -1402,9 +1526,11 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(const case MVT::i32: case MVT::i16: case MVT::i8: - return std::make_pair(0U, Hexagon::IntRegsRegisterClass); + case MVT::f32: + return std::make_pair(0U, &Hexagon::IntRegsRegClass); case MVT::i64: - return std::make_pair(0U, Hexagon::DoubleRegsRegisterClass); + case MVT::f64: + return std::make_pair(0U, &Hexagon::DoubleRegsRegClass); } default: llvm_unreachable("Unknown asm register class"); @@ -1414,6 +1540,14 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(const return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); } +/// isFPImmLegal - Returns true if the target can instruction select the +/// specified FP immediate natively. If false, the legalizer will +/// materialize the FP immediate as a load from a constant pool. +bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + const HexagonRegisterInfo* QRI = TM.getRegisterInfo(); + return QRI->Subtarget.hasV5TOps(); +} + /// isLegalAddressingMode - Return true if the addressing mode represented by /// AM is legal for this target, for a load/store of the specified type. bool HexagonTargetLowering::isLegalAddressingMode(const AddrMode &AM, diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index 4208bcb..fe6c905 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -27,6 +27,7 @@ namespace llvm { CONST32, CONST32_GP, // For marking data present in GP. + FCONST32, SETCC, ADJDYNALLOC, ARGEXTEND, @@ -48,6 +49,7 @@ namespace llvm { BR_JT, // Jump table. BARRIER, // Memory barrier. 
WrapperJT, + WrapperCP, TC_RETURN }; } @@ -94,13 +96,7 @@ namespace llvm { SmallVectorImpl<SDValue> &InVals) const; SDValue LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; SDValue LowerCallResult(SDValue Chain, SDValue InFlag, @@ -128,6 +124,7 @@ namespace llvm { MachineBasicBlock *BB) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; virtual EVT getSetCCResultType(EVT VT) const { return MVT::i1; } @@ -150,6 +147,7 @@ namespace llvm { /// mode is legal for a load/store of any legal type. /// TODO: Handle pre/postinc as well. virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const; + virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const; /// isLegalICmpImmediate - Return true if the specified immediate is legal /// icmp immediate, that is the target has icmp instructions which can diff --git a/lib/Target/Hexagon/HexagonImmediates.td b/lib/Target/Hexagon/HexagonImmediates.td index e78bb79..18692c4 100644 --- a/lib/Target/Hexagon/HexagonImmediates.td +++ b/lib/Target/Hexagon/HexagonImmediates.td @@ -371,7 +371,7 @@ def s4_3ImmPred : PatLeaf<(i32 imm), [{ def u64ImmPred : PatLeaf<(i64 imm), [{ // immS16 predicate - True if the immediate fits in a 16-bit sign extended // field. - // Adding "N ||" to supress gcc unused warning. + // Adding "N ||" to suppress gcc unused warning. return (N || true); }]>; diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td index c9f16fb..e472d49 100644 --- a/lib/Target/Hexagon/HexagonInstrFormats.td +++ b/lib/Target/Hexagon/HexagonInstrFormats.td @@ -13,29 +13,48 @@ // *** Must match HexagonBaseInfo.h *** //===----------------------------------------------------------------------===// +class Type<bits<5> t> { + bits<5> Value = t; +} +def TypePSEUDO : Type<0>; +def TypeALU32 : Type<1>; +def TypeCR : Type<2>; +def TypeJR : Type<3>; +def TypeJ : Type<4>; +def TypeLD : Type<5>; +def TypeST : Type<6>; +def TypeSYSTEM : Type<7>; +def TypeXTYPE : Type<8>; +def TypeMARKER : Type<31>; //===----------------------------------------------------------------------===// // Instruction Class Declaration + //===----------------------------------------------------------------------===// class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern, - string cstr, InstrItinClass itin> : Instruction { + string cstr, InstrItinClass itin, Type type> : Instruction { field bits<32> Inst; let Namespace = "Hexagon"; dag OutOperandList = outs; dag InOperandList = ins; - let AsmString = asmstr; + let AsmString = asmstr; let Pattern = pattern; let Constraints = cstr; - let Itinerary = itin; - - // *** The code below must match HexagonBaseInfo.h *** - + let Itinerary = itin; + let Size = 4; + + // *** Must match HexagonBaseInfo.h *** + // Instruction type according to the ISA. + Type HexagonType = type; + let TSFlags{4-0} = HexagonType.Value; + // Solo instructions, i.e., those that cannot be in a packet with others. + bits<1> isHexagonSolo = 0; + let TSFlags{5} = isHexagonSolo; // Predicated instructions.
bits<1> isPredicated = 0; - let TSFlags{1} = isPredicated; + let TSFlags{6} = isPredicated; // *** The code above must match HexagonBaseInfo.h *** } @@ -47,17 +66,25 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern, // LD Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. class LDInst<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", LD> { + : InstHexagon<outs, ins, asmstr, pattern, "", LD, TypeLD> { bits<5> rd; bits<5> rs; bits<13> imm13; } +class LDInst2<dag outs, dag ins, string asmstr, list<dag> pattern> + : InstHexagon<outs, ins, asmstr, pattern, "", LD, TypeLD> { + bits<5> rd; + bits<5> rs; + bits<13> imm13; + let mayLoad = 1; +} + // LD Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. class LDInstPost<dag outs, dag ins, string asmstr, list<dag> pattern, string cstr> - : InstHexagon<outs, ins, asmstr, pattern, cstr, LD> { + : InstHexagon<outs, ins, asmstr, pattern, cstr, LD, TypeLD> { bits<5> rd; bits<5> rs; bits<5> rt; @@ -68,7 +95,24 @@ class LDInstPost<dag outs, dag ins, string asmstr, list<dag> pattern, // ST Instruction Class in V4 can take SLOT0 & SLOT1. // Definition of the instruction class CHANGED from V2/V3 to V4. class STInst<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", ST> { + : InstHexagon<outs, ins, asmstr, pattern, "", ST, TypeST> { + bits<5> rd; + bits<5> rs; + bits<13> imm13; +} + +class STInst2<dag outs, dag ins, string asmstr, list<dag> pattern> + : InstHexagon<outs, ins, asmstr, pattern, "", ST, TypeST> { + bits<5> rd; + bits<5> rs; + bits<13> imm13; + let mayStore = 1; +} + +// SYSTEM Instruction Class in V4 can take SLOT0 only +// In V2/V3 we used ST for this but in v4 ST can take SLOT0 or SLOT1. +class SYSInst<dag outs, dag ins, string asmstr, list<dag> pattern> + : InstHexagon<outs, ins, asmstr, pattern, "", SYS, TypeSYSTEM> { bits<5> rd; bits<5> rs; bits<13> imm13; @@ -79,7 +123,7 @@ class STInst<dag outs, dag ins, string asmstr, list<dag> pattern> // Definition of the instruction class CHANGED from V2/V3 to V4. class STInstPost<dag outs, dag ins, string asmstr, list<dag> pattern, string cstr> - : InstHexagon<outs, ins, asmstr, pattern, cstr, ST> { + : InstHexagon<outs, ins, asmstr, pattern, cstr, ST, TypeST> { bits<5> rd; bits<5> rs; bits<5> rt; @@ -89,7 +133,7 @@ class STInstPost<dag outs, dag ins, string asmstr, list<dag> pattern, // ALU32 Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. class ALU32Type<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", ALU32> { + : InstHexagon<outs, ins, asmstr, pattern, "", ALU32, TypeALU32> { bits<5> rd; bits<5> rs; bits<5> rt; @@ -102,7 +146,17 @@ class ALU32Type<dag outs, dag ins, string asmstr, list<dag> pattern> // Definition of the instruction class NOT CHANGED. // Name of the Instruction Class changed from ALU64 to XTYPE from V2/V3 to V4. 
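The TSFlags packing above (instruction type in bits 4-0, the solo bit at 5, the predicated bit moved from 1 to 6) only works if the C++ reader uses the very same positions, which is what the repeated "Must match HexagonBaseInfo.h" markers insist on. A stand-alone sketch of the matching C++ side, with helper names invented and enum values copied from the Type<> defs above:

#include <cassert>
#include <cstdint>

// Values mirror HexagonInstrFormats.td: TypePSEUDO=0, TypeALU32=1, ...,
// TypeMARKER=31 (five bits' worth).
enum HexagonType : unsigned { TypePSEUDO = 0, TypeALU32 = 1, TypeMARKER = 31 };

constexpr uint64_t packTSFlags(HexagonType T, bool Solo, bool Predicated) {
  return uint64_t(T) | (uint64_t(Solo) << 5) | (uint64_t(Predicated) << 6);
}

constexpr HexagonType typeOf(uint64_t TSFlags) {
  return HexagonType(TSFlags & 0x1F);           // bits 4-0
}

int main() {
  uint64_t F = packTSFlags(TypeALU32, /*Solo=*/false, /*Predicated=*/true);
  assert(typeOf(F) == TypeALU32);
  assert(((F >> 6) & 1) != 0);                  // predicated landed at bit 6
  assert(((F >> 5) & 1) == 0);                  // not solo
  return 0;
}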
class ALU64Type<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", ALU64> { + : InstHexagon<outs, ins, asmstr, pattern, "", ALU64, TypeXTYPE> { + bits<5> rd; + bits<5> rs; + bits<5> rt; + bits<16> imm16; + bits<16> imm16_2; +} + +class ALU64_acc<dag outs, dag ins, string asmstr, list<dag> pattern, + string cstr> + : InstHexagon<outs, ins, asmstr, pattern, cstr, ALU64, TypeXTYPE> { bits<5> rd; bits<5> rs; bits<5> rt; @@ -115,7 +169,7 @@ // Definition of the instruction class NOT CHANGED. // Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4. class MInst<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", M> { + : InstHexagon<outs, ins, asmstr, pattern, "", M, TypeXTYPE> { bits<5> rd; bits<5> rs; bits<5> rt; @@ -126,8 +180,8 @@ class MInst<dag outs, dag ins, string asmstr, list<dag> pattern> // Definition of the instruction class NOT CHANGED. // Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4. class MInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern, - string cstr> - : InstHexagon<outs, ins, asmstr, pattern, cstr, M> { + string cstr> + : InstHexagon<outs, ins, asmstr, pattern, cstr, M, TypeXTYPE> { bits<5> rd; bits<5> rs; bits<5> rt; @@ -138,9 +192,7 @@ class MInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern, // Definition of the instruction class NOT CHANGED. // Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4. class SInst<dag outs, dag ins, string asmstr, list<dag> pattern> -//: InstHexagon<outs, ins, asmstr, pattern, cstr, !if(V4T, XTYPE_V4, M)> { - : InstHexagon<outs, ins, asmstr, pattern, "", S> { -// : InstHexagon<outs, ins, asmstr, pattern, "", S> { + : InstHexagon<outs, ins, asmstr, pattern, "", S, TypeXTYPE> { bits<5> rd; bits<5> rs; bits<5> rt; @@ -151,8 +203,8 @@ class SInst<dag outs, dag ins, string asmstr, list<dag> pattern> // Definition of the instruction class NOT CHANGED. // Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4. class SInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern, - string cstr> - : InstHexagon<outs, ins, asmstr, pattern, cstr, S> { + string cstr> + : InstHexagon<outs, ins, asmstr, pattern, cstr, S, TypeXTYPE> { bits<5> rd; @@ -163,14 +215,14 @@ class SInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern, // J Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. class JType<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", J> { + : InstHexagon<outs, ins, asmstr, pattern, "", J, TypeJ> { bits<16> imm16; } // JR Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. class JRType<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", JR> { + : InstHexagon<outs, ins, asmstr, pattern, "", JR, TypeJR> { bits<5> rs; bits<5> pu; // Predicate register } @@ -178,15 +230,22 @@ class JRType<dag outs, dag ins, string asmstr, list<dag> pattern> // CR Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED.
class CRInst<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", CR> { + : InstHexagon<outs, ins, asmstr, pattern, "", CR, TypeCR> { bits<5> rs; bits<10> imm10; } +class Marker<dag outs, dag ins, string asmstr, list<dag> pattern> + : InstHexagon<outs, ins, asmstr, pattern, "", MARKER, TypeMARKER> { + let isCodeGenOnly = 1; + let isPseudo = 1; +} class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", PSEUDO>; - + : InstHexagon<outs, ins, asmstr, pattern, "", PSEUDO, TypePSEUDO> { + let isCodeGenOnly = 1; + let isPseudo = 1; +} //===----------------------------------------------------------------------===// // Instruction Classes Definitions - @@ -222,6 +281,11 @@ class ALU64_rr<dag outs, dag ins, string asmstr, list<dag> pattern> : ALU64Type<outs, ins, asmstr, pattern> { } +class ALU64_ri<dag outs, dag ins, string asmstr, list<dag> pattern> + : ALU64Type<outs, ins, asmstr, pattern> { + let rt{0-4} = 0; +} + // J Type Instructions. class JInst<dag outs, dag ins, string asmstr, list<dag> pattern> : JType<outs, ins, asmstr, pattern> { @@ -234,15 +298,31 @@ class JRInst<dag outs, dag ins, string asmstr, list<dag> pattern> : JRType<outs, ins, asmstr, pattern> { // Post increment ST Instruction. -class STInstPI<dag outs, dag ins, string asmstr, list<dag> pattern, string cstr> +class STInstPI<dag outs, dag ins, string asmstr, list<dag> pattern, + string cstr> + : STInstPost<outs, ins, asmstr, pattern, cstr> { + let rt{0-4} = 0; +} + +class STInst2PI<dag outs, dag ins, string asmstr, list<dag> pattern, + string cstr> : STInstPost<outs, ins, asmstr, pattern, cstr> { let rt{0-4} = 0; + let mayStore = 1; } // Post increment LD Instruction. -class LDInstPI<dag outs, dag ins, string asmstr, list<dag> pattern, string cstr> +class LDInstPI<dag outs, dag ins, string asmstr, list<dag> pattern, + string cstr> + : LDInstPost<outs, ins, asmstr, pattern, cstr> { + let rt{0-4} = 0; +} + +class LDInst2PI<dag outs, dag ins, string asmstr, list<dag> pattern, + string cstr> : LDInstPost<outs, ins, asmstr, pattern, cstr> { let rt{0-4} = 0; + let mayLoad = 1; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/lib/Target/Hexagon/HexagonInstrFormatsV4.td index bd5e449..49741a3 100644 --- a/lib/Target/Hexagon/HexagonInstrFormatsV4.td +++ b/lib/Target/Hexagon/HexagonInstrFormatsV4.td @@ -11,11 +11,25 @@ // //===----------------------------------------------------------------------===// +//----------------------------------------------------------------------------// +// Hexagon Instruction Flags + +// +// *** Must match BaseInfo.h *** +//----------------------------------------------------------------------------// + +def TypeMEMOP : Type<9>; +def TypeNV : Type<10>; +def TypePREFIX : Type<30>; + +//----------------------------------------------------------------------------// +// Instruction Classes Definitions + +//----------------------------------------------------------------------------// + // // NV type instructions. // class NVInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", NV_V4> { + : InstHexagon<outs, ins, asmstr, pattern, "", NV_V4, TypeNV> { bits<5> rd; bits<5> rs; bits<13> imm13; @@ -24,7 +38,7 @@ class NVInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern> // Definition of Post increment new value store.
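The new LDInst2/STInst2 (and LDInst2PI/STInst2PI) classes exist to state mayLoad/mayStore explicitly: those flags are what reordering passes, the new VLIW packetizer included, fall back on when they cannot prove two memory operations independent. A toy legality check in that spirit; the rule shown is an illustration, not the packetizer's actual test:

#include <cstdio>

struct Insn { const char *Text; bool MayLoad; bool MayStore; };

// Conservative memory-dependence rule: two loads may be reordered freely,
// but once a store is involved the original order must be kept unless a
// real alias analysis proves the accesses independent.
static bool canReorder(const Insn &A, const Insn &B) {
  bool BothTouchMemory =
      (A.MayLoad || A.MayStore) && (B.MayLoad || B.MayStore);
  return !BothTouchMemory || (!A.MayStore && !B.MayStore);
}

int main() {
  Insn Ld1 = {"r0 = memw(r1)", true, false};
  Insn Ld2 = {"r2 = memw(r3)", true, false};
  Insn St  = {"memw(r4) = r5", false, true};
  std::printf("%d %d\n", canReorder(Ld1, Ld2), canReorder(Ld1, St)); // 1 0
}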
class NVInstPost_V4<dag outs, dag ins, string asmstr, list<dag> pattern, string cstr> - : InstHexagon<outs, ins, asmstr, pattern, cstr, NV_V4> { + : InstHexagon<outs, ins, asmstr, pattern, cstr, NV_V4, TypeNV> { bits<5> rd; bits<5> rs; bits<5> rt; @@ -39,8 +53,15 @@ class NVInstPI_V4<dag outs, dag ins, string asmstr, list<dag> pattern, } class MEMInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", MEM_V4> { + : InstHexagon<outs, ins, asmstr, pattern, "", MEM_V4, TypeMEMOP> { bits<5> rd; bits<5> rs; bits<6> imm6; } + +class Immext<dag outs, dag ins, string asmstr, list<dag> pattern> + : InstHexagon<outs, ins, asmstr, pattern, "", PREFIX, TypePREFIX> { + let isCodeGenOnly = 1; + + bits<26> imm26; +} diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index 77b3663..c8f933d 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -11,10 +11,10 @@ // //===----------------------------------------------------------------------===// -#include "Hexagon.h" #include "HexagonInstrInfo.h" #include "HexagonRegisterInfo.h" #include "HexagonSubtarget.h" +#include "Hexagon.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/DFAPacketizer.h" @@ -34,24 +34,23 @@ using namespace llvm; /// Constants for Hexagon instructions. /// const int Hexagon_MEMW_OFFSET_MAX = 4095; -const int Hexagon_MEMW_OFFSET_MIN = 4096; +const int Hexagon_MEMW_OFFSET_MIN = -4096; const int Hexagon_MEMD_OFFSET_MAX = 8191; -const int Hexagon_MEMD_OFFSET_MIN = 8192; +const int Hexagon_MEMD_OFFSET_MIN = -8192; const int Hexagon_MEMH_OFFSET_MAX = 2047; -const int Hexagon_MEMH_OFFSET_MIN = 2048; +const int Hexagon_MEMH_OFFSET_MIN = -2048; const int Hexagon_MEMB_OFFSET_MAX = 1023; -const int Hexagon_MEMB_OFFSET_MIN = 1024; +const int Hexagon_MEMB_OFFSET_MIN = -1024; const int Hexagon_ADDI_OFFSET_MAX = 32767; -const int Hexagon_ADDI_OFFSET_MIN = 32768; +const int Hexagon_ADDI_OFFSET_MIN = -32768; const int Hexagon_MEMD_AUTOINC_MAX = 56; -const int Hexagon_MEMD_AUTOINC_MIN = 64; +const int Hexagon_MEMD_AUTOINC_MIN = -64; const int Hexagon_MEMW_AUTOINC_MAX = 28; -const int Hexagon_MEMW_AUTOINC_MIN = 32; +const int Hexagon_MEMW_AUTOINC_MIN = -32; const int Hexagon_MEMH_AUTOINC_MAX = 14; -const int Hexagon_MEMH_AUTOINC_MIN = 16; +const int Hexagon_MEMH_AUTOINC_MIN = -16; const int Hexagon_MEMB_AUTOINC_MAX = 7; -const int Hexagon_MEMB_AUTOINC_MIN = 8; - +const int Hexagon_MEMB_AUTOINC_MIN = -8; HexagonInstrInfo::HexagonInstrInfo(HexagonSubtarget &ST) @@ -70,6 +69,7 @@ unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, switch (MI->getOpcode()) { + default: break; case Hexagon::LDriw: case Hexagon::LDrid: case Hexagon::LDrih: @@ -81,11 +81,7 @@ unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, return MI->getOperand(0).getReg(); } break; - - default: - break; } - return 0; } @@ -98,21 +94,18 @@ unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const { switch (MI->getOpcode()) { + default: break; case Hexagon::STriw: case Hexagon::STrid: case Hexagon::STrih: case Hexagon::STrib: if (MI->getOperand(2).isFI() && MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) { - FrameIndex = MI->getOperand(2).getIndex(); - return MI->getOperand(0).getReg(); + FrameIndex = MI->getOperand(0).getIndex(); + return 
MI->getOperand(2).getReg(); } break; - - default: - break; } - return 0; } @@ -176,6 +169,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const { + TBB = NULL; FBB = NULL; // If the block has no terminators, it just falls into the block after it. @@ -328,7 +322,8 @@ void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB, DestReg).addReg(SrcReg).addReg(SrcReg); return; } - if (Hexagon::DoubleRegsRegClass.contains(DestReg, SrcReg)) { + if (Hexagon::DoubleRegsRegClass.contains(DestReg) && + Hexagon::IntRegsRegClass.contains(SrcReg)) { // We can have an overlap between single and double reg: r1:0 = r0. if(SrcReg == RI.getSubReg(DestReg, Hexagon::subreg_loreg)) { // r1:0 = r0 @@ -343,7 +338,8 @@ void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } return; } - if (Hexagon::CRRegsRegClass.contains(DestReg, SrcReg)) { + if (Hexagon::CRRegsRegClass.contains(DestReg) && + Hexagon::IntRegsRegClass.contains(SrcReg)) { BuildMI(MBB, I, DL, get(Hexagon::TFCR), DestReg).addReg(SrcReg); return; } @@ -370,15 +366,15 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MFI.getObjectSize(FI), Align); - if (Hexagon::IntRegsRegisterClass->hasSubClassEq(RC)) { + if (Hexagon::IntRegsRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(Hexagon::STriw)) .addFrameIndex(FI).addImm(0) .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO); - } else if (Hexagon::DoubleRegsRegisterClass->hasSubClassEq(RC)) { + } else if (Hexagon::DoubleRegsRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(Hexagon::STrid)) .addFrameIndex(FI).addImm(0) .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO); - } else if (Hexagon::PredRegsRegisterClass->hasSubClassEq(RC)) { + } else if (Hexagon::PredRegsRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(Hexagon::STriw_pred)) .addFrameIndex(FI).addImm(0) .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO); @@ -415,14 +411,13 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); - - if (RC == Hexagon::IntRegsRegisterClass) { + if (RC == &Hexagon::IntRegsRegClass) { BuildMI(MBB, I, DL, get(Hexagon::LDriw), DestReg) .addFrameIndex(FI).addImm(0).addMemOperand(MMO); - } else if (RC == Hexagon::DoubleRegsRegisterClass) { + } else if (RC == &Hexagon::DoubleRegsRegClass) { BuildMI(MBB, I, DL, get(Hexagon::LDrid), DestReg) .addFrameIndex(FI).addImm(0).addMemOperand(MMO); - } else if (RC == Hexagon::PredRegsRegisterClass) { + } else if (RC == &Hexagon::PredRegsRegClass) { BuildMI(MBB, I, DL, get(Hexagon::LDriw_pred), DestReg) .addFrameIndex(FI).addImm(0).addMemOperand(MMO); } else { @@ -453,11 +448,11 @@ unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const { MachineRegisterInfo &RegInfo = MF->getRegInfo(); const TargetRegisterClass *TRC; if (VT == MVT::i1) { - TRC = Hexagon::PredRegsRegisterClass; - } else if (VT == MVT::i32) { - TRC = Hexagon::IntRegsRegisterClass; - } else if (VT == MVT::i64) { - TRC = Hexagon::DoubleRegsRegisterClass; + TRC = &Hexagon::PredRegsRegClass; + } else if (VT == MVT::i32 || VT == MVT::f32) { + TRC = &Hexagon::IntRegsRegClass; + } else if (VT == MVT::i64 || VT == MVT::f64) { + TRC = &Hexagon::DoubleRegsRegClass; } else { llvm_unreachable("Cannot handle this register class"); } @@ -466,7 +461,852 @@ unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const { return NewReg; } +bool 
HexagonInstrInfo::isExtendable(const MachineInstr *MI) const { + switch(MI->getOpcode()) { + default: return false; + // JMP_EQri + case Hexagon::JMP_EQriPt_nv_V4: + case Hexagon::JMP_EQriPnt_nv_V4: + case Hexagon::JMP_EQriNotPt_nv_V4: + case Hexagon::JMP_EQriNotPnt_nv_V4: + + // JMP_EQri - with -1 + case Hexagon::JMP_EQriPtneg_nv_V4: + case Hexagon::JMP_EQriPntneg_nv_V4: + case Hexagon::JMP_EQriNotPtneg_nv_V4: + case Hexagon::JMP_EQriNotPntneg_nv_V4: + + // JMP_EQrr + case Hexagon::JMP_EQrrPt_nv_V4: + case Hexagon::JMP_EQrrPnt_nv_V4: + case Hexagon::JMP_EQrrNotPt_nv_V4: + case Hexagon::JMP_EQrrNotPnt_nv_V4: + + // JMP_GTri + case Hexagon::JMP_GTriPt_nv_V4: + case Hexagon::JMP_GTriPnt_nv_V4: + case Hexagon::JMP_GTriNotPt_nv_V4: + case Hexagon::JMP_GTriNotPnt_nv_V4: + + // JMP_GTri - with -1 + case Hexagon::JMP_GTriPtneg_nv_V4: + case Hexagon::JMP_GTriPntneg_nv_V4: + case Hexagon::JMP_GTriNotPtneg_nv_V4: + case Hexagon::JMP_GTriNotPntneg_nv_V4: + // JMP_GTrr + case Hexagon::JMP_GTrrPt_nv_V4: + case Hexagon::JMP_GTrrPnt_nv_V4: + case Hexagon::JMP_GTrrNotPt_nv_V4: + case Hexagon::JMP_GTrrNotPnt_nv_V4: + + // JMP_GTrrdn + case Hexagon::JMP_GTrrdnPt_nv_V4: + case Hexagon::JMP_GTrrdnPnt_nv_V4: + case Hexagon::JMP_GTrrdnNotPt_nv_V4: + case Hexagon::JMP_GTrrdnNotPnt_nv_V4: + + // JMP_GTUri + case Hexagon::JMP_GTUriPt_nv_V4: + case Hexagon::JMP_GTUriPnt_nv_V4: + case Hexagon::JMP_GTUriNotPt_nv_V4: + case Hexagon::JMP_GTUriNotPnt_nv_V4: + + // JMP_GTUrr + case Hexagon::JMP_GTUrrPt_nv_V4: + case Hexagon::JMP_GTUrrPnt_nv_V4: + case Hexagon::JMP_GTUrrNotPt_nv_V4: + case Hexagon::JMP_GTUrrNotPnt_nv_V4: + + // JMP_GTUrrdn + case Hexagon::JMP_GTUrrdnPt_nv_V4: + case Hexagon::JMP_GTUrrdnPnt_nv_V4: + case Hexagon::JMP_GTUrrdnNotPt_nv_V4: + case Hexagon::JMP_GTUrrdnNotPnt_nv_V4: + + // TFR_FI + case Hexagon::TFR_FI: + return true; + } +} + +bool HexagonInstrInfo::isExtended(const MachineInstr *MI) const { + switch(MI->getOpcode()) { + default: return false; + // JMP_EQri + case Hexagon::JMP_EQriPt_ie_nv_V4: + case Hexagon::JMP_EQriPnt_ie_nv_V4: + case Hexagon::JMP_EQriNotPt_ie_nv_V4: + case Hexagon::JMP_EQriNotPnt_ie_nv_V4: + + // JMP_EQri - with -1 + case Hexagon::JMP_EQriPtneg_ie_nv_V4: + case Hexagon::JMP_EQriPntneg_ie_nv_V4: + case Hexagon::JMP_EQriNotPtneg_ie_nv_V4: + case Hexagon::JMP_EQriNotPntneg_ie_nv_V4: + + // JMP_EQrr + case Hexagon::JMP_EQrrPt_ie_nv_V4: + case Hexagon::JMP_EQrrPnt_ie_nv_V4: + case Hexagon::JMP_EQrrNotPt_ie_nv_V4: + case Hexagon::JMP_EQrrNotPnt_ie_nv_V4: + + // JMP_GTri + case Hexagon::JMP_GTriPt_ie_nv_V4: + case Hexagon::JMP_GTriPnt_ie_nv_V4: + case Hexagon::JMP_GTriNotPt_ie_nv_V4: + case Hexagon::JMP_GTriNotPnt_ie_nv_V4: + + // JMP_GTri - with -1 + case Hexagon::JMP_GTriPtneg_ie_nv_V4: + case Hexagon::JMP_GTriPntneg_ie_nv_V4: + case Hexagon::JMP_GTriNotPtneg_ie_nv_V4: + case Hexagon::JMP_GTriNotPntneg_ie_nv_V4: + + // JMP_GTrr + case Hexagon::JMP_GTrrPt_ie_nv_V4: + case Hexagon::JMP_GTrrPnt_ie_nv_V4: + case Hexagon::JMP_GTrrNotPt_ie_nv_V4: + case Hexagon::JMP_GTrrNotPnt_ie_nv_V4: + + // JMP_GTrrdn + case Hexagon::JMP_GTrrdnPt_ie_nv_V4: + case Hexagon::JMP_GTrrdnPnt_ie_nv_V4: + case Hexagon::JMP_GTrrdnNotPt_ie_nv_V4: + case Hexagon::JMP_GTrrdnNotPnt_ie_nv_V4: + + // JMP_GTUri + case Hexagon::JMP_GTUriPt_ie_nv_V4: + case Hexagon::JMP_GTUriPnt_ie_nv_V4: + case Hexagon::JMP_GTUriNotPt_ie_nv_V4: + case Hexagon::JMP_GTUriNotPnt_ie_nv_V4: + + // JMP_GTUrr + case Hexagon::JMP_GTUrrPt_ie_nv_V4: + case Hexagon::JMP_GTUrrPnt_ie_nv_V4: + case Hexagon::JMP_GTUrrNotPt_ie_nv_V4: + case 
Hexagon::JMP_GTUrrNotPnt_ie_nv_V4: + + // JMP_GTUrrdn + case Hexagon::JMP_GTUrrdnPt_ie_nv_V4: + case Hexagon::JMP_GTUrrdnPnt_ie_nv_V4: + case Hexagon::JMP_GTUrrdnNotPt_ie_nv_V4: + case Hexagon::JMP_GTUrrdnNotPnt_ie_nv_V4: + + // V4 absolute set addressing. + case Hexagon::LDrid_abs_setimm_V4: + case Hexagon::LDriw_abs_setimm_V4: + case Hexagon::LDrih_abs_setimm_V4: + case Hexagon::LDrib_abs_setimm_V4: + case Hexagon::LDriuh_abs_setimm_V4: + case Hexagon::LDriub_abs_setimm_V4: + + case Hexagon::STrid_abs_setimm_V4: + case Hexagon::STrib_abs_setimm_V4: + case Hexagon::STrih_abs_setimm_V4: + case Hexagon::STriw_abs_setimm_V4: + + // V4 global address load. + case Hexagon::LDrid_GP_cPt_V4 : + case Hexagon::LDrid_GP_cNotPt_V4 : + case Hexagon::LDrid_GP_cdnPt_V4 : + case Hexagon::LDrid_GP_cdnNotPt_V4 : + case Hexagon::LDrib_GP_cPt_V4 : + case Hexagon::LDrib_GP_cNotPt_V4 : + case Hexagon::LDrib_GP_cdnPt_V4 : + case Hexagon::LDrib_GP_cdnNotPt_V4 : + case Hexagon::LDriub_GP_cPt_V4 : + case Hexagon::LDriub_GP_cNotPt_V4 : + case Hexagon::LDriub_GP_cdnPt_V4 : + case Hexagon::LDriub_GP_cdnNotPt_V4 : + case Hexagon::LDrih_GP_cPt_V4 : + case Hexagon::LDrih_GP_cNotPt_V4 : + case Hexagon::LDrih_GP_cdnPt_V4 : + case Hexagon::LDrih_GP_cdnNotPt_V4 : + case Hexagon::LDriuh_GP_cPt_V4 : + case Hexagon::LDriuh_GP_cNotPt_V4 : + case Hexagon::LDriuh_GP_cdnPt_V4 : + case Hexagon::LDriuh_GP_cdnNotPt_V4 : + case Hexagon::LDriw_GP_cPt_V4 : + case Hexagon::LDriw_GP_cNotPt_V4 : + case Hexagon::LDriw_GP_cdnPt_V4 : + case Hexagon::LDriw_GP_cdnNotPt_V4 : + case Hexagon::LDd_GP_cPt_V4 : + case Hexagon::LDd_GP_cNotPt_V4 : + case Hexagon::LDd_GP_cdnPt_V4 : + case Hexagon::LDd_GP_cdnNotPt_V4 : + case Hexagon::LDb_GP_cPt_V4 : + case Hexagon::LDb_GP_cNotPt_V4 : + case Hexagon::LDb_GP_cdnPt_V4 : + case Hexagon::LDb_GP_cdnNotPt_V4 : + case Hexagon::LDub_GP_cPt_V4 : + case Hexagon::LDub_GP_cNotPt_V4 : + case Hexagon::LDub_GP_cdnPt_V4 : + case Hexagon::LDub_GP_cdnNotPt_V4 : + case Hexagon::LDh_GP_cPt_V4 : + case Hexagon::LDh_GP_cNotPt_V4 : + case Hexagon::LDh_GP_cdnPt_V4 : + case Hexagon::LDh_GP_cdnNotPt_V4 : + case Hexagon::LDuh_GP_cPt_V4 : + case Hexagon::LDuh_GP_cNotPt_V4 : + case Hexagon::LDuh_GP_cdnPt_V4 : + case Hexagon::LDuh_GP_cdnNotPt_V4 : + case Hexagon::LDw_GP_cPt_V4 : + case Hexagon::LDw_GP_cNotPt_V4 : + case Hexagon::LDw_GP_cdnPt_V4 : + case Hexagon::LDw_GP_cdnNotPt_V4 : + + // V4 global address store. 
+ case Hexagon::STrid_GP_cPt_V4 : + case Hexagon::STrid_GP_cNotPt_V4 : + case Hexagon::STrid_GP_cdnPt_V4 : + case Hexagon::STrid_GP_cdnNotPt_V4 : + case Hexagon::STrib_GP_cPt_V4 : + case Hexagon::STrib_GP_cNotPt_V4 : + case Hexagon::STrib_GP_cdnPt_V4 : + case Hexagon::STrib_GP_cdnNotPt_V4 : + case Hexagon::STrih_GP_cPt_V4 : + case Hexagon::STrih_GP_cNotPt_V4 : + case Hexagon::STrih_GP_cdnPt_V4 : + case Hexagon::STrih_GP_cdnNotPt_V4 : + case Hexagon::STriw_GP_cPt_V4 : + case Hexagon::STriw_GP_cNotPt_V4 : + case Hexagon::STriw_GP_cdnPt_V4 : + case Hexagon::STriw_GP_cdnNotPt_V4 : + case Hexagon::STd_GP_cPt_V4 : + case Hexagon::STd_GP_cNotPt_V4 : + case Hexagon::STd_GP_cdnPt_V4 : + case Hexagon::STd_GP_cdnNotPt_V4 : + case Hexagon::STb_GP_cPt_V4 : + case Hexagon::STb_GP_cNotPt_V4 : + case Hexagon::STb_GP_cdnPt_V4 : + case Hexagon::STb_GP_cdnNotPt_V4 : + case Hexagon::STh_GP_cPt_V4 : + case Hexagon::STh_GP_cNotPt_V4 : + case Hexagon::STh_GP_cdnPt_V4 : + case Hexagon::STh_GP_cdnNotPt_V4 : + case Hexagon::STw_GP_cPt_V4 : + case Hexagon::STw_GP_cNotPt_V4 : + case Hexagon::STw_GP_cdnPt_V4 : + case Hexagon::STw_GP_cdnNotPt_V4 : + + // V4 predicated global address new value store. + case Hexagon::STrib_GP_cPt_nv_V4 : + case Hexagon::STrib_GP_cNotPt_nv_V4 : + case Hexagon::STrib_GP_cdnPt_nv_V4 : + case Hexagon::STrib_GP_cdnNotPt_nv_V4 : + case Hexagon::STrih_GP_cPt_nv_V4 : + case Hexagon::STrih_GP_cNotPt_nv_V4 : + case Hexagon::STrih_GP_cdnPt_nv_V4 : + case Hexagon::STrih_GP_cdnNotPt_nv_V4 : + case Hexagon::STriw_GP_cPt_nv_V4 : + case Hexagon::STriw_GP_cNotPt_nv_V4 : + case Hexagon::STriw_GP_cdnPt_nv_V4 : + case Hexagon::STriw_GP_cdnNotPt_nv_V4 : + case Hexagon::STb_GP_cPt_nv_V4 : + case Hexagon::STb_GP_cNotPt_nv_V4 : + case Hexagon::STb_GP_cdnPt_nv_V4 : + case Hexagon::STb_GP_cdnNotPt_nv_V4 : + case Hexagon::STh_GP_cPt_nv_V4 : + case Hexagon::STh_GP_cNotPt_nv_V4 : + case Hexagon::STh_GP_cdnPt_nv_V4 : + case Hexagon::STh_GP_cdnNotPt_nv_V4 : + case Hexagon::STw_GP_cPt_nv_V4 : + case Hexagon::STw_GP_cNotPt_nv_V4 : + case Hexagon::STw_GP_cdnPt_nv_V4 : + case Hexagon::STw_GP_cdnNotPt_nv_V4 : + + // TFR_FI + case Hexagon::TFR_FI_immext_V4: + + // TFRI_F + case Hexagon::TFRI_f: + case Hexagon::TFRI_cPt_f: + case Hexagon::TFRI_cNotPt_f: + case Hexagon::CONST64_Float_Real: + return true; + } +} + +bool HexagonInstrInfo::isNewValueJump(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: return false; + // JMP_EQri + case Hexagon::JMP_EQriPt_nv_V4: + case Hexagon::JMP_EQriPnt_nv_V4: + case Hexagon::JMP_EQriNotPt_nv_V4: + case Hexagon::JMP_EQriNotPnt_nv_V4: + case Hexagon::JMP_EQriPt_ie_nv_V4: + case Hexagon::JMP_EQriPnt_ie_nv_V4: + case Hexagon::JMP_EQriNotPt_ie_nv_V4: + case Hexagon::JMP_EQriNotPnt_ie_nv_V4: + + // JMP_EQri - with -1 + case Hexagon::JMP_EQriPtneg_nv_V4: + case Hexagon::JMP_EQriPntneg_nv_V4: + case Hexagon::JMP_EQriNotPtneg_nv_V4: + case Hexagon::JMP_EQriNotPntneg_nv_V4: + case Hexagon::JMP_EQriPtneg_ie_nv_V4: + case Hexagon::JMP_EQriPntneg_ie_nv_V4: + case Hexagon::JMP_EQriNotPtneg_ie_nv_V4: + case Hexagon::JMP_EQriNotPntneg_ie_nv_V4: + + // JMP_EQrr + case Hexagon::JMP_EQrrPt_nv_V4: + case Hexagon::JMP_EQrrPnt_nv_V4: + case Hexagon::JMP_EQrrNotPt_nv_V4: + case Hexagon::JMP_EQrrNotPnt_nv_V4: + case Hexagon::JMP_EQrrPt_ie_nv_V4: + case Hexagon::JMP_EQrrPnt_ie_nv_V4: + case Hexagon::JMP_EQrrNotPt_ie_nv_V4: + case Hexagon::JMP_EQrrNotPnt_ie_nv_V4: + + // JMP_GTri + case Hexagon::JMP_GTriPt_nv_V4: + case Hexagon::JMP_GTriPnt_nv_V4: + case Hexagon::JMP_GTriNotPt_nv_V4: + case 
Hexagon::JMP_GTriNotPnt_nv_V4: + case Hexagon::JMP_GTriPt_ie_nv_V4: + case Hexagon::JMP_GTriPnt_ie_nv_V4: + case Hexagon::JMP_GTriNotPt_ie_nv_V4: + case Hexagon::JMP_GTriNotPnt_ie_nv_V4: + + // JMP_GTri - with -1 + case Hexagon::JMP_GTriPtneg_nv_V4: + case Hexagon::JMP_GTriPntneg_nv_V4: + case Hexagon::JMP_GTriNotPtneg_nv_V4: + case Hexagon::JMP_GTriNotPntneg_nv_V4: + case Hexagon::JMP_GTriPtneg_ie_nv_V4: + case Hexagon::JMP_GTriPntneg_ie_nv_V4: + case Hexagon::JMP_GTriNotPtneg_ie_nv_V4: + case Hexagon::JMP_GTriNotPntneg_ie_nv_V4: + + // JMP_GTrr + case Hexagon::JMP_GTrrPt_nv_V4: + case Hexagon::JMP_GTrrPnt_nv_V4: + case Hexagon::JMP_GTrrNotPt_nv_V4: + case Hexagon::JMP_GTrrNotPnt_nv_V4: + case Hexagon::JMP_GTrrPt_ie_nv_V4: + case Hexagon::JMP_GTrrPnt_ie_nv_V4: + case Hexagon::JMP_GTrrNotPt_ie_nv_V4: + case Hexagon::JMP_GTrrNotPnt_ie_nv_V4: + + // JMP_GTrrdn + case Hexagon::JMP_GTrrdnPt_nv_V4: + case Hexagon::JMP_GTrrdnPnt_nv_V4: + case Hexagon::JMP_GTrrdnNotPt_nv_V4: + case Hexagon::JMP_GTrrdnNotPnt_nv_V4: + case Hexagon::JMP_GTrrdnPt_ie_nv_V4: + case Hexagon::JMP_GTrrdnPnt_ie_nv_V4: + case Hexagon::JMP_GTrrdnNotPt_ie_nv_V4: + case Hexagon::JMP_GTrrdnNotPnt_ie_nv_V4: + + // JMP_GTUri + case Hexagon::JMP_GTUriPt_nv_V4: + case Hexagon::JMP_GTUriPnt_nv_V4: + case Hexagon::JMP_GTUriNotPt_nv_V4: + case Hexagon::JMP_GTUriNotPnt_nv_V4: + case Hexagon::JMP_GTUriPt_ie_nv_V4: + case Hexagon::JMP_GTUriPnt_ie_nv_V4: + case Hexagon::JMP_GTUriNotPt_ie_nv_V4: + case Hexagon::JMP_GTUriNotPnt_ie_nv_V4: + + // JMP_GTUrr + case Hexagon::JMP_GTUrrPt_nv_V4: + case Hexagon::JMP_GTUrrPnt_nv_V4: + case Hexagon::JMP_GTUrrNotPt_nv_V4: + case Hexagon::JMP_GTUrrNotPnt_nv_V4: + case Hexagon::JMP_GTUrrPt_ie_nv_V4: + case Hexagon::JMP_GTUrrPnt_ie_nv_V4: + case Hexagon::JMP_GTUrrNotPt_ie_nv_V4: + case Hexagon::JMP_GTUrrNotPnt_ie_nv_V4: + + // JMP_GTUrrdn + case Hexagon::JMP_GTUrrdnPt_nv_V4: + case Hexagon::JMP_GTUrrdnPnt_nv_V4: + case Hexagon::JMP_GTUrrdnNotPt_nv_V4: + case Hexagon::JMP_GTUrrdnNotPnt_nv_V4: + case Hexagon::JMP_GTUrrdnPt_ie_nv_V4: + case Hexagon::JMP_GTUrrdnPnt_ie_nv_V4: + case Hexagon::JMP_GTUrrdnNotPt_ie_nv_V4: + case Hexagon::JMP_GTUrrdnNotPnt_ie_nv_V4: + return true; + } +} + +unsigned HexagonInstrInfo::getImmExtForm(const MachineInstr* MI) const { + switch(MI->getOpcode()) { + default: llvm_unreachable("Unknown type of instruction."); + // JMP_EQri + case Hexagon::JMP_EQriPt_nv_V4: + return Hexagon::JMP_EQriPt_ie_nv_V4; + case Hexagon::JMP_EQriNotPt_nv_V4: + return Hexagon::JMP_EQriNotPt_ie_nv_V4; + case Hexagon::JMP_EQriPnt_nv_V4: + return Hexagon::JMP_EQriPnt_ie_nv_V4; + case Hexagon::JMP_EQriNotPnt_nv_V4: + return Hexagon::JMP_EQriNotPnt_ie_nv_V4; + + // JMP_EQri -- with -1 + case Hexagon::JMP_EQriPtneg_nv_V4: + return Hexagon::JMP_EQriPtneg_ie_nv_V4; + case Hexagon::JMP_EQriNotPtneg_nv_V4: + return Hexagon::JMP_EQriNotPtneg_ie_nv_V4; + case Hexagon::JMP_EQriPntneg_nv_V4: + return Hexagon::JMP_EQriPntneg_ie_nv_V4; + case Hexagon::JMP_EQriNotPntneg_nv_V4: + return Hexagon::JMP_EQriNotPntneg_ie_nv_V4; + + // JMP_EQrr + case Hexagon::JMP_EQrrPt_nv_V4: + return Hexagon::JMP_EQrrPt_ie_nv_V4; + case Hexagon::JMP_EQrrNotPt_nv_V4: + return Hexagon::JMP_EQrrNotPt_ie_nv_V4; + case Hexagon::JMP_EQrrPnt_nv_V4: + return Hexagon::JMP_EQrrPnt_ie_nv_V4; + case Hexagon::JMP_EQrrNotPnt_nv_V4: + return Hexagon::JMP_EQrrNotPnt_ie_nv_V4; + + // JMP_GTri + case Hexagon::JMP_GTriPt_nv_V4: + return Hexagon::JMP_GTriPt_ie_nv_V4; + case Hexagon::JMP_GTriNotPt_nv_V4: + return Hexagon::JMP_GTriNotPt_ie_nv_V4; + case 
Hexagon::JMP_GTriPnt_nv_V4: + return Hexagon::JMP_GTriPnt_ie_nv_V4; + case Hexagon::JMP_GTriNotPnt_nv_V4: + return Hexagon::JMP_GTriNotPnt_ie_nv_V4; + + // JMP_GTri -- with -1 + case Hexagon::JMP_GTriPtneg_nv_V4: + return Hexagon::JMP_GTriPtneg_ie_nv_V4; + case Hexagon::JMP_GTriNotPtneg_nv_V4: + return Hexagon::JMP_GTriNotPtneg_ie_nv_V4; + case Hexagon::JMP_GTriPntneg_nv_V4: + return Hexagon::JMP_GTriPntneg_ie_nv_V4; + case Hexagon::JMP_GTriNotPntneg_nv_V4: + return Hexagon::JMP_GTriNotPntneg_ie_nv_V4; + + // JMP_GTrr + case Hexagon::JMP_GTrrPt_nv_V4: + return Hexagon::JMP_GTrrPt_ie_nv_V4; + case Hexagon::JMP_GTrrNotPt_nv_V4: + return Hexagon::JMP_GTrrNotPt_ie_nv_V4; + case Hexagon::JMP_GTrrPnt_nv_V4: + return Hexagon::JMP_GTrrPnt_ie_nv_V4; + case Hexagon::JMP_GTrrNotPnt_nv_V4: + return Hexagon::JMP_GTrrNotPnt_ie_nv_V4; + + // JMP_GTrrdn + case Hexagon::JMP_GTrrdnPt_nv_V4: + return Hexagon::JMP_GTrrdnPt_ie_nv_V4; + case Hexagon::JMP_GTrrdnNotPt_nv_V4: + return Hexagon::JMP_GTrrdnNotPt_ie_nv_V4; + case Hexagon::JMP_GTrrdnPnt_nv_V4: + return Hexagon::JMP_GTrrdnPnt_ie_nv_V4; + case Hexagon::JMP_GTrrdnNotPnt_nv_V4: + return Hexagon::JMP_GTrrdnNotPnt_ie_nv_V4; + + // JMP_GTUri + case Hexagon::JMP_GTUriPt_nv_V4: + return Hexagon::JMP_GTUriPt_ie_nv_V4; + case Hexagon::JMP_GTUriNotPt_nv_V4: + return Hexagon::JMP_GTUriNotPt_ie_nv_V4; + case Hexagon::JMP_GTUriPnt_nv_V4: + return Hexagon::JMP_GTUriPnt_ie_nv_V4; + case Hexagon::JMP_GTUriNotPnt_nv_V4: + return Hexagon::JMP_GTUriNotPnt_ie_nv_V4; + + // JMP_GTUrr + case Hexagon::JMP_GTUrrPt_nv_V4: + return Hexagon::JMP_GTUrrPt_ie_nv_V4; + case Hexagon::JMP_GTUrrNotPt_nv_V4: + return Hexagon::JMP_GTUrrNotPt_ie_nv_V4; + case Hexagon::JMP_GTUrrPnt_nv_V4: + return Hexagon::JMP_GTUrrPnt_ie_nv_V4; + case Hexagon::JMP_GTUrrNotPnt_nv_V4: + return Hexagon::JMP_GTUrrNotPnt_ie_nv_V4; + + // JMP_GTUrrdn + case Hexagon::JMP_GTUrrdnPt_nv_V4: + return Hexagon::JMP_GTUrrdnPt_ie_nv_V4; + case Hexagon::JMP_GTUrrdnNotPt_nv_V4: + return Hexagon::JMP_GTUrrdnNotPt_ie_nv_V4; + case Hexagon::JMP_GTUrrdnPnt_nv_V4: + return Hexagon::JMP_GTUrrdnPnt_ie_nv_V4; + case Hexagon::JMP_GTUrrdnNotPnt_nv_V4: + return Hexagon::JMP_GTUrrdnNotPnt_ie_nv_V4; + + case Hexagon::TFR_FI: + return Hexagon::TFR_FI_immext_V4; + + case Hexagon::MEMw_ADDSUBi_indexed_MEM_V4 : + case Hexagon::MEMw_ADDi_indexed_MEM_V4 : + case Hexagon::MEMw_SUBi_indexed_MEM_V4 : + case Hexagon::MEMw_ADDr_indexed_MEM_V4 : + case Hexagon::MEMw_SUBr_indexed_MEM_V4 : + case Hexagon::MEMw_ANDr_indexed_MEM_V4 : + case Hexagon::MEMw_ORr_indexed_MEM_V4 : + case Hexagon::MEMw_ADDSUBi_MEM_V4 : + case Hexagon::MEMw_ADDi_MEM_V4 : + case Hexagon::MEMw_SUBi_MEM_V4 : + case Hexagon::MEMw_ADDr_MEM_V4 : + case Hexagon::MEMw_SUBr_MEM_V4 : + case Hexagon::MEMw_ANDr_MEM_V4 : + case Hexagon::MEMw_ORr_MEM_V4 : + case Hexagon::MEMh_ADDSUBi_indexed_MEM_V4 : + case Hexagon::MEMh_ADDi_indexed_MEM_V4 : + case Hexagon::MEMh_SUBi_indexed_MEM_V4 : + case Hexagon::MEMh_ADDr_indexed_MEM_V4 : + case Hexagon::MEMh_SUBr_indexed_MEM_V4 : + case Hexagon::MEMh_ANDr_indexed_MEM_V4 : + case Hexagon::MEMh_ORr_indexed_MEM_V4 : + case Hexagon::MEMh_ADDSUBi_MEM_V4 : + case Hexagon::MEMh_ADDi_MEM_V4 : + case Hexagon::MEMh_SUBi_MEM_V4 : + case Hexagon::MEMh_ADDr_MEM_V4 : + case Hexagon::MEMh_SUBr_MEM_V4 : + case Hexagon::MEMh_ANDr_MEM_V4 : + case Hexagon::MEMh_ORr_MEM_V4 : + case Hexagon::MEMb_ADDSUBi_indexed_MEM_V4 : + case Hexagon::MEMb_ADDi_indexed_MEM_V4 : + case Hexagon::MEMb_SUBi_indexed_MEM_V4 : + case Hexagon::MEMb_ADDr_indexed_MEM_V4 : + case 
Hexagon::MEMb_SUBr_indexed_MEM_V4 : + case Hexagon::MEMb_ANDr_indexed_MEM_V4 : + case Hexagon::MEMb_ORr_indexed_MEM_V4 : + case Hexagon::MEMb_ADDSUBi_MEM_V4 : + case Hexagon::MEMb_ADDi_MEM_V4 : + case Hexagon::MEMb_SUBi_MEM_V4 : + case Hexagon::MEMb_ADDr_MEM_V4 : + case Hexagon::MEMb_SUBr_MEM_V4 : + case Hexagon::MEMb_ANDr_MEM_V4 : + case Hexagon::MEMb_ORr_MEM_V4 : + llvm_unreachable("Needs implementing."); + } +} + +unsigned HexagonInstrInfo::getNormalBranchForm(const MachineInstr* MI) const { + switch(MI->getOpcode()) { + default: llvm_unreachable("Unknown type of jump instruction."); + // JMP_EQri + case Hexagon::JMP_EQriPt_ie_nv_V4: + return Hexagon::JMP_EQriPt_nv_V4; + case Hexagon::JMP_EQriNotPt_ie_nv_V4: + return Hexagon::JMP_EQriNotPt_nv_V4; + case Hexagon::JMP_EQriPnt_ie_nv_V4: + return Hexagon::JMP_EQriPnt_nv_V4; + case Hexagon::JMP_EQriNotPnt_ie_nv_V4: + return Hexagon::JMP_EQriNotPnt_nv_V4; + + // JMP_EQri -- with -1 + case Hexagon::JMP_EQriPtneg_ie_nv_V4: + return Hexagon::JMP_EQriPtneg_nv_V4; + case Hexagon::JMP_EQriNotPtneg_ie_nv_V4: + return Hexagon::JMP_EQriNotPtneg_nv_V4; + case Hexagon::JMP_EQriPntneg_ie_nv_V4: + return Hexagon::JMP_EQriPntneg_nv_V4; + case Hexagon::JMP_EQriNotPntneg_ie_nv_V4: + return Hexagon::JMP_EQriNotPntneg_nv_V4; + + // JMP_EQrr + case Hexagon::JMP_EQrrPt_ie_nv_V4: + return Hexagon::JMP_EQrrPt_nv_V4; + case Hexagon::JMP_EQrrNotPt_ie_nv_V4: + return Hexagon::JMP_EQrrNotPt_nv_V4; + case Hexagon::JMP_EQrrPnt_ie_nv_V4: + return Hexagon::JMP_EQrrPnt_nv_V4; + case Hexagon::JMP_EQrrNotPnt_ie_nv_V4: + return Hexagon::JMP_EQrrNotPnt_nv_V4; + + // JMP_GTri + case Hexagon::JMP_GTriPt_ie_nv_V4: + return Hexagon::JMP_GTriPt_nv_V4; + case Hexagon::JMP_GTriNotPt_ie_nv_V4: + return Hexagon::JMP_GTriNotPt_nv_V4; + case Hexagon::JMP_GTriPnt_ie_nv_V4: + return Hexagon::JMP_GTriPnt_nv_V4; + case Hexagon::JMP_GTriNotPnt_ie_nv_V4: + return Hexagon::JMP_GTriNotPnt_nv_V4; + + // JMP_GTri -- with -1 + case Hexagon::JMP_GTriPtneg_ie_nv_V4: + return Hexagon::JMP_GTriPtneg_nv_V4; + case Hexagon::JMP_GTriNotPtneg_ie_nv_V4: + return Hexagon::JMP_GTriNotPtneg_nv_V4; + case Hexagon::JMP_GTriPntneg_ie_nv_V4: + return Hexagon::JMP_GTriPntneg_nv_V4; + case Hexagon::JMP_GTriNotPntneg_ie_nv_V4: + return Hexagon::JMP_GTriNotPntneg_nv_V4; + + // JMP_GTrr + case Hexagon::JMP_GTrrPt_ie_nv_V4: + return Hexagon::JMP_GTrrPt_nv_V4; + case Hexagon::JMP_GTrrNotPt_ie_nv_V4: + return Hexagon::JMP_GTrrNotPt_nv_V4; + case Hexagon::JMP_GTrrPnt_ie_nv_V4: + return Hexagon::JMP_GTrrPnt_nv_V4; + case Hexagon::JMP_GTrrNotPnt_ie_nv_V4: + return Hexagon::JMP_GTrrNotPnt_nv_V4; + + // JMP_GTrrdn + case Hexagon::JMP_GTrrdnPt_ie_nv_V4: + return Hexagon::JMP_GTrrdnPt_nv_V4; + case Hexagon::JMP_GTrrdnNotPt_ie_nv_V4: + return Hexagon::JMP_GTrrdnNotPt_nv_V4; + case Hexagon::JMP_GTrrdnPnt_ie_nv_V4: + return Hexagon::JMP_GTrrdnPnt_nv_V4; + case Hexagon::JMP_GTrrdnNotPnt_ie_nv_V4: + return Hexagon::JMP_GTrrdnNotPnt_nv_V4; + + // JMP_GTUri + case Hexagon::JMP_GTUriPt_ie_nv_V4: + return Hexagon::JMP_GTUriPt_nv_V4; + case Hexagon::JMP_GTUriNotPt_ie_nv_V4: + return Hexagon::JMP_GTUriNotPt_nv_V4; + case Hexagon::JMP_GTUriPnt_ie_nv_V4: + return Hexagon::JMP_GTUriPnt_nv_V4; + case Hexagon::JMP_GTUriNotPnt_ie_nv_V4: + return Hexagon::JMP_GTUriNotPnt_nv_V4; + + // JMP_GTUrr + case Hexagon::JMP_GTUrrPt_ie_nv_V4: + return Hexagon::JMP_GTUrrPt_nv_V4; + case Hexagon::JMP_GTUrrNotPt_ie_nv_V4: + return Hexagon::JMP_GTUrrNotPt_nv_V4; + case Hexagon::JMP_GTUrrPnt_ie_nv_V4: + return Hexagon::JMP_GTUrrPnt_nv_V4; + case 
Hexagon::JMP_GTUrrNotPnt_ie_nv_V4: + return Hexagon::JMP_GTUrrNotPnt_nv_V4; + + // JMP_GTUrrdn + case Hexagon::JMP_GTUrrdnPt_ie_nv_V4: + return Hexagon::JMP_GTUrrdnPt_nv_V4; + case Hexagon::JMP_GTUrrdnNotPt_ie_nv_V4: + return Hexagon::JMP_GTUrrdnNotPt_nv_V4; + case Hexagon::JMP_GTUrrdnPnt_ie_nv_V4: + return Hexagon::JMP_GTUrrdnPnt_nv_V4; + case Hexagon::JMP_GTUrrdnNotPnt_ie_nv_V4: + return Hexagon::JMP_GTUrrdnNotPnt_nv_V4; + } +} + + +bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: return false; + // Store Byte + case Hexagon::STrib_nv_V4: + case Hexagon::STrib_indexed_nv_V4: + case Hexagon::STrib_indexed_shl_nv_V4: + case Hexagon::STrib_shl_nv_V4: + case Hexagon::STrib_GP_nv_V4: + case Hexagon::STb_GP_nv_V4: + case Hexagon::POST_STbri_nv_V4: + case Hexagon::STrib_cPt_nv_V4: + case Hexagon::STrib_cdnPt_nv_V4: + case Hexagon::STrib_cNotPt_nv_V4: + case Hexagon::STrib_cdnNotPt_nv_V4: + case Hexagon::STrib_indexed_cPt_nv_V4: + case Hexagon::STrib_indexed_cdnPt_nv_V4: + case Hexagon::STrib_indexed_cNotPt_nv_V4: + case Hexagon::STrib_indexed_cdnNotPt_nv_V4: + case Hexagon::STrib_indexed_shl_cPt_nv_V4: + case Hexagon::STrib_indexed_shl_cdnPt_nv_V4: + case Hexagon::STrib_indexed_shl_cNotPt_nv_V4: + case Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4: + case Hexagon::POST_STbri_cPt_nv_V4: + case Hexagon::POST_STbri_cdnPt_nv_V4: + case Hexagon::POST_STbri_cNotPt_nv_V4: + case Hexagon::POST_STbri_cdnNotPt_nv_V4: + case Hexagon::STb_GP_cPt_nv_V4: + case Hexagon::STb_GP_cNotPt_nv_V4: + case Hexagon::STb_GP_cdnPt_nv_V4: + case Hexagon::STb_GP_cdnNotPt_nv_V4: + case Hexagon::STrib_GP_cPt_nv_V4: + case Hexagon::STrib_GP_cNotPt_nv_V4: + case Hexagon::STrib_GP_cdnPt_nv_V4: + case Hexagon::STrib_GP_cdnNotPt_nv_V4: + case Hexagon::STrib_abs_nv_V4: + case Hexagon::STrib_abs_cPt_nv_V4: + case Hexagon::STrib_abs_cdnPt_nv_V4: + case Hexagon::STrib_abs_cNotPt_nv_V4: + case Hexagon::STrib_abs_cdnNotPt_nv_V4: + case Hexagon::STrib_imm_abs_nv_V4: + case Hexagon::STrib_imm_abs_cPt_nv_V4: + case Hexagon::STrib_imm_abs_cdnPt_nv_V4: + case Hexagon::STrib_imm_abs_cNotPt_nv_V4: + case Hexagon::STrib_imm_abs_cdnNotPt_nv_V4: + + // Store Halfword + case Hexagon::STrih_nv_V4: + case Hexagon::STrih_indexed_nv_V4: + case Hexagon::STrih_indexed_shl_nv_V4: + case Hexagon::STrih_shl_nv_V4: + case Hexagon::STrih_GP_nv_V4: + case Hexagon::STh_GP_nv_V4: + case Hexagon::POST_SThri_nv_V4: + case Hexagon::STrih_cPt_nv_V4: + case Hexagon::STrih_cdnPt_nv_V4: + case Hexagon::STrih_cNotPt_nv_V4: + case Hexagon::STrih_cdnNotPt_nv_V4: + case Hexagon::STrih_indexed_cPt_nv_V4: + case Hexagon::STrih_indexed_cdnPt_nv_V4: + case Hexagon::STrih_indexed_cNotPt_nv_V4: + case Hexagon::STrih_indexed_cdnNotPt_nv_V4: + case Hexagon::STrih_indexed_shl_cPt_nv_V4: + case Hexagon::STrih_indexed_shl_cdnPt_nv_V4: + case Hexagon::STrih_indexed_shl_cNotPt_nv_V4: + case Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4: + case Hexagon::POST_SThri_cPt_nv_V4: + case Hexagon::POST_SThri_cdnPt_nv_V4: + case Hexagon::POST_SThri_cNotPt_nv_V4: + case Hexagon::POST_SThri_cdnNotPt_nv_V4: + case Hexagon::STh_GP_cPt_nv_V4: + case Hexagon::STh_GP_cNotPt_nv_V4: + case Hexagon::STh_GP_cdnPt_nv_V4: + case Hexagon::STh_GP_cdnNotPt_nv_V4: + case Hexagon::STrih_GP_cPt_nv_V4: + case Hexagon::STrih_GP_cNotPt_nv_V4: + case Hexagon::STrih_GP_cdnPt_nv_V4: + case Hexagon::STrih_GP_cdnNotPt_nv_V4: + case Hexagon::STrih_abs_nv_V4: + case Hexagon::STrih_abs_cPt_nv_V4: + case Hexagon::STrih_abs_cdnPt_nv_V4: + case 
Hexagon::STrih_abs_cNotPt_nv_V4: + case Hexagon::STrih_abs_cdnNotPt_nv_V4: + case Hexagon::STrih_imm_abs_nv_V4: + case Hexagon::STrih_imm_abs_cPt_nv_V4: + case Hexagon::STrih_imm_abs_cdnPt_nv_V4: + case Hexagon::STrih_imm_abs_cNotPt_nv_V4: + case Hexagon::STrih_imm_abs_cdnNotPt_nv_V4: + + // Store Word + case Hexagon::STriw_nv_V4: + case Hexagon::STriw_indexed_nv_V4: + case Hexagon::STriw_indexed_shl_nv_V4: + case Hexagon::STriw_shl_nv_V4: + case Hexagon::STriw_GP_nv_V4: + case Hexagon::STw_GP_nv_V4: + case Hexagon::POST_STwri_nv_V4: + case Hexagon::STriw_cPt_nv_V4: + case Hexagon::STriw_cdnPt_nv_V4: + case Hexagon::STriw_cNotPt_nv_V4: + case Hexagon::STriw_cdnNotPt_nv_V4: + case Hexagon::STriw_indexed_cPt_nv_V4: + case Hexagon::STriw_indexed_cdnPt_nv_V4: + case Hexagon::STriw_indexed_cNotPt_nv_V4: + case Hexagon::STriw_indexed_cdnNotPt_nv_V4: + case Hexagon::STriw_indexed_shl_cPt_nv_V4: + case Hexagon::STriw_indexed_shl_cdnPt_nv_V4: + case Hexagon::STriw_indexed_shl_cNotPt_nv_V4: + case Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4: + case Hexagon::POST_STwri_cPt_nv_V4: + case Hexagon::POST_STwri_cdnPt_nv_V4: + case Hexagon::POST_STwri_cNotPt_nv_V4: + case Hexagon::POST_STwri_cdnNotPt_nv_V4: + case Hexagon::STw_GP_cPt_nv_V4: + case Hexagon::STw_GP_cNotPt_nv_V4: + case Hexagon::STw_GP_cdnPt_nv_V4: + case Hexagon::STw_GP_cdnNotPt_nv_V4: + case Hexagon::STriw_GP_cPt_nv_V4: + case Hexagon::STriw_GP_cNotPt_nv_V4: + case Hexagon::STriw_GP_cdnPt_nv_V4: + case Hexagon::STriw_GP_cdnNotPt_nv_V4: + case Hexagon::STriw_abs_nv_V4: + case Hexagon::STriw_abs_cPt_nv_V4: + case Hexagon::STriw_abs_cdnPt_nv_V4: + case Hexagon::STriw_abs_cNotPt_nv_V4: + case Hexagon::STriw_abs_cdnNotPt_nv_V4: + case Hexagon::STriw_imm_abs_nv_V4: + case Hexagon::STriw_imm_abs_cPt_nv_V4: + case Hexagon::STriw_imm_abs_cdnPt_nv_V4: + case Hexagon::STriw_imm_abs_cNotPt_nv_V4: + case Hexagon::STriw_imm_abs_cdnNotPt_nv_V4: + return true; + } +} + +bool HexagonInstrInfo::isPostIncrement (const MachineInstr* MI) const { + switch (MI->getOpcode()) + { + default: return false; + // Load Byte + case Hexagon::POST_LDrib: + case Hexagon::POST_LDrib_cPt: + case Hexagon::POST_LDrib_cNotPt: + case Hexagon::POST_LDrib_cdnPt_V4: + case Hexagon::POST_LDrib_cdnNotPt_V4: + + // Load unsigned byte + case Hexagon::POST_LDriub: + case Hexagon::POST_LDriub_cPt: + case Hexagon::POST_LDriub_cNotPt: + case Hexagon::POST_LDriub_cdnPt_V4: + case Hexagon::POST_LDriub_cdnNotPt_V4: + + // Load halfword + case Hexagon::POST_LDrih: + case Hexagon::POST_LDrih_cPt: + case Hexagon::POST_LDrih_cNotPt: + case Hexagon::POST_LDrih_cdnPt_V4: + case Hexagon::POST_LDrih_cdnNotPt_V4: + + // Load unsigned halfword + case Hexagon::POST_LDriuh: + case Hexagon::POST_LDriuh_cPt: + case Hexagon::POST_LDriuh_cNotPt: + case Hexagon::POST_LDriuh_cdnPt_V4: + case Hexagon::POST_LDriuh_cdnNotPt_V4: + + // Load word + case Hexagon::POST_LDriw: + case Hexagon::POST_LDriw_cPt: + case Hexagon::POST_LDriw_cNotPt: + case Hexagon::POST_LDriw_cdnPt_V4: + case Hexagon::POST_LDriw_cdnNotPt_V4: + + // Load double word + case Hexagon::POST_LDrid: + case Hexagon::POST_LDrid_cPt: + case Hexagon::POST_LDrid_cNotPt: + case Hexagon::POST_LDrid_cdnPt_V4: + case Hexagon::POST_LDrid_cdnNotPt_V4: + + // Store byte + case Hexagon::POST_STbri: + case Hexagon::POST_STbri_cPt: + case Hexagon::POST_STbri_cNotPt: + case Hexagon::POST_STbri_cdnPt_V4: + case Hexagon::POST_STbri_cdnNotPt_V4: + + // Store halfword + case Hexagon::POST_SThri: + case Hexagon::POST_SThri_cPt: + case Hexagon::POST_SThri_cNotPt: + case 
Hexagon::POST_SThri_cdnPt_V4: + case Hexagon::POST_SThri_cdnNotPt_V4: + + // Store word + case Hexagon::POST_STwri: + case Hexagon::POST_STwri_cPt: + case Hexagon::POST_STwri_cNotPt: + case Hexagon::POST_STwri_cdnPt_V4: + case Hexagon::POST_STwri_cdnNotPt_V4: + + // Store double word + case Hexagon::POST_STdri: + case Hexagon::POST_STdri_cPt: + case Hexagon::POST_STdri_cNotPt: + case Hexagon::POST_STdri_cdnPt_V4: + case Hexagon::POST_STdri_cdnNotPt_V4: + return true; + } +} + +bool HexagonInstrInfo::isSaveCalleeSavedRegsCall(const MachineInstr *MI) const { + return MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4; +} bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const { bool isPred = MI->getDesc().isPredicable(); @@ -548,7 +1388,7 @@ bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const { case Hexagon::SXTH: case Hexagon::ZXTB: case Hexagon::ZXTH: - return Subtarget.getHexagonArchVersion() == HexagonSubtarget::V4; + return Subtarget.hasV4TOps(); case Hexagon::JMPR: return false; @@ -557,8 +1397,27 @@ bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const { return true; } +// This function performs the following inversions: +// +// cPt ---> cNotPt +// cNotPt ---> cPt +// +// However, these inversions are NOT included: +// +// cdnPt -X-> cdnNotPt +// cdnNotPt -X-> cdnPt +// cPt_nv -X-> cNotPt_nv (new value stores) +// cNotPt_nv -X-> cPt_nv (new value stores) +// +// because only the following transformations are allowed: +// +// cNotPt ---> cdnNotPt +// cPt ---> cdnPt +// cNotPt ---> cNotPt_nv +// cPt ---> cPt_nv unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const { switch(Opc) { + default: llvm_unreachable("Unexpected predicated instruction"); case Hexagon::TFR_cPt: return Hexagon::TFR_cNotPt; case Hexagon::TFR_cNotPt: @@ -805,6 +1664,47 @@ unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const { case Hexagon::STrid_indexed_shl_cNotPt_V4: return Hexagon::STrid_indexed_shl_cPt_V4; + // V4 Store to global address. + case Hexagon::STd_GP_cPt_V4: + return Hexagon::STd_GP_cNotPt_V4; + case Hexagon::STd_GP_cNotPt_V4: + return Hexagon::STd_GP_cPt_V4; + + case Hexagon::STb_GP_cPt_V4: + return Hexagon::STb_GP_cNotPt_V4; + case Hexagon::STb_GP_cNotPt_V4: + return Hexagon::STb_GP_cPt_V4; + + case Hexagon::STh_GP_cPt_V4: + return Hexagon::STh_GP_cNotPt_V4; + case Hexagon::STh_GP_cNotPt_V4: + return Hexagon::STh_GP_cPt_V4; + + case Hexagon::STw_GP_cPt_V4: + return Hexagon::STw_GP_cNotPt_V4; + case Hexagon::STw_GP_cNotPt_V4: + return Hexagon::STw_GP_cPt_V4; + + case Hexagon::STrid_GP_cPt_V4: + return Hexagon::STrid_GP_cNotPt_V4; + case Hexagon::STrid_GP_cNotPt_V4: + return Hexagon::STrid_GP_cPt_V4; + + case Hexagon::STrib_GP_cPt_V4: + return Hexagon::STrib_GP_cNotPt_V4; + case Hexagon::STrib_GP_cNotPt_V4: + return Hexagon::STrib_GP_cPt_V4; + + case Hexagon::STrih_GP_cPt_V4: + return Hexagon::STrih_GP_cNotPt_V4; + case Hexagon::STrih_GP_cNotPt_V4: + return Hexagon::STrih_GP_cPt_V4; + + case Hexagon::STriw_GP_cPt_V4: + return Hexagon::STriw_GP_cNotPt_V4; + case Hexagon::STriw_GP_cNotPt_V4: + return Hexagon::STriw_GP_cPt_V4; + // Load.
case Hexagon::LDrid_cPt: return Hexagon::LDrid_cNotPt; @@ -1009,9 +1909,6 @@ unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const { return Hexagon::JMP_GTUrrdnNotPnt_nv_V4; case Hexagon::JMP_GTUrrdnNotPnt_nv_V4: return Hexagon::JMP_GTUrrdnPnt_nv_V4; - - default: - llvm_unreachable("Unexpected predicated instruction"); } } @@ -1022,12 +1919,21 @@ getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const { case Hexagon::TFR: return !invertPredicate ? Hexagon::TFR_cPt : Hexagon::TFR_cNotPt; + case Hexagon::TFRI_f: + return !invertPredicate ? Hexagon::TFRI_cPt_f : + Hexagon::TFRI_cNotPt_f; case Hexagon::TFRI: return !invertPredicate ? Hexagon::TFRI_cPt : Hexagon::TFRI_cNotPt; case Hexagon::JMP: return !invertPredicate ? Hexagon::JMP_c : Hexagon::JMP_cNot; + case Hexagon::JMP_EQrrPt_nv_V4: + return !invertPredicate ? Hexagon::JMP_EQrrPt_nv_V4 : + Hexagon::JMP_EQrrNotPt_nv_V4; + case Hexagon::JMP_EQriPt_nv_V4: + return !invertPredicate ? Hexagon::JMP_EQriPt_nv_V4 : + Hexagon::JMP_EQriNotPt_nv_V4; case Hexagon::ADD_ri: return !invertPredicate ? Hexagon::ADD_ri_cPt : Hexagon::ADD_ri_cNotPt; @@ -1121,6 +2027,46 @@ getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const { case Hexagon::LDriw_indexed_shl_V4: return !invertPredicate ? Hexagon::LDriw_indexed_shl_cPt_V4 : Hexagon::LDriw_indexed_shl_cNotPt_V4; + + // V4 Load from global address + case Hexagon::LDrid_GP_V4: + return !invertPredicate ? Hexagon::LDrid_GP_cPt_V4 : + Hexagon::LDrid_GP_cNotPt_V4; + case Hexagon::LDrib_GP_V4: + return !invertPredicate ? Hexagon::LDrib_GP_cPt_V4 : + Hexagon::LDrib_GP_cNotPt_V4; + case Hexagon::LDriub_GP_V4: + return !invertPredicate ? Hexagon::LDriub_GP_cPt_V4 : + Hexagon::LDriub_GP_cNotPt_V4; + case Hexagon::LDrih_GP_V4: + return !invertPredicate ? Hexagon::LDrih_GP_cPt_V4 : + Hexagon::LDrih_GP_cNotPt_V4; + case Hexagon::LDriuh_GP_V4: + return !invertPredicate ? Hexagon::LDriuh_GP_cPt_V4 : + Hexagon::LDriuh_GP_cNotPt_V4; + case Hexagon::LDriw_GP_V4: + return !invertPredicate ? Hexagon::LDriw_GP_cPt_V4 : + Hexagon::LDriw_GP_cNotPt_V4; + + case Hexagon::LDd_GP_V4: + return !invertPredicate ? Hexagon::LDd_GP_cPt_V4 : + Hexagon::LDd_GP_cNotPt_V4; + case Hexagon::LDb_GP_V4: + return !invertPredicate ? Hexagon::LDb_GP_cPt_V4 : + Hexagon::LDb_GP_cNotPt_V4; + case Hexagon::LDub_GP_V4: + return !invertPredicate ? Hexagon::LDub_GP_cPt_V4 : + Hexagon::LDub_GP_cNotPt_V4; + case Hexagon::LDh_GP_V4: + return !invertPredicate ? Hexagon::LDh_GP_cPt_V4 : + Hexagon::LDh_GP_cNotPt_V4; + case Hexagon::LDuh_GP_V4: + return !invertPredicate ? Hexagon::LDuh_GP_cPt_V4 : + Hexagon::LDuh_GP_cNotPt_V4; + case Hexagon::LDw_GP_V4: + return !invertPredicate ? Hexagon::LDw_GP_cPt_V4 : + Hexagon::LDw_GP_cNotPt_V4; + // Byte. case Hexagon::POST_STbri: return !invertPredicate ? Hexagon::POST_STbri_cPt : @@ -1182,6 +2128,34 @@ getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const { case Hexagon::STrid_indexed_shl_V4: return !invertPredicate ? Hexagon::STrid_indexed_shl_cPt_V4 : Hexagon::STrid_indexed_shl_cNotPt_V4; + + // V4 Store to global address + case Hexagon::STrid_GP_V4: + return !invertPredicate ? Hexagon::STrid_GP_cPt_V4 : + Hexagon::STrid_GP_cNotPt_V4; + case Hexagon::STrib_GP_V4: + return !invertPredicate ? Hexagon::STrib_GP_cPt_V4 : + Hexagon::STrib_GP_cNotPt_V4; + case Hexagon::STrih_GP_V4: + return !invertPredicate ? Hexagon::STrih_GP_cPt_V4 : + Hexagon::STrih_GP_cNotPt_V4; + case Hexagon::STriw_GP_V4: + return !invertPredicate ? 
Hexagon::STriw_GP_cPt_V4 : + Hexagon::STriw_GP_cNotPt_V4; + + case Hexagon::STd_GP_V4: + return !invertPredicate ? Hexagon::STd_GP_cPt_V4 : + Hexagon::STd_GP_cNotPt_V4; + case Hexagon::STb_GP_V4: + return !invertPredicate ? Hexagon::STb_GP_cPt_V4 : + Hexagon::STb_GP_cNotPt_V4; + case Hexagon::STh_GP_V4: + return !invertPredicate ? Hexagon::STh_GP_cPt_V4 : + Hexagon::STh_GP_cNotPt_V4; + case Hexagon::STw_GP_V4: + return !invertPredicate ? Hexagon::STw_GP_cPt_V4 : + Hexagon::STw_GP_cNotPt_V4; + // Load. case Hexagon::LDrid: return !invertPredicate ? Hexagon::LDrid_cPt : @@ -1201,9 +2175,6 @@ getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const { case Hexagon::LDriub: return !invertPredicate ? Hexagon::LDriub_cPt : Hexagon::LDriub_cNotPt; - case Hexagon::LDriubit: - return !invertPredicate ? Hexagon::LDriub_cPt : - Hexagon::LDriub_cNotPt; // Load Indexed. case Hexagon::LDrid_indexed: return !invertPredicate ? Hexagon::LDrid_indexed_cPt : @@ -1297,7 +2268,7 @@ PredicateInstruction(MachineInstr *MI, bool HexagonInstrInfo:: isProfitableToIfCvt(MachineBasicBlock &MBB, - unsigned NumCyles, + unsigned NumCycles, unsigned ExtraPredCycles, const BranchProbability &Probability) const { return true; @@ -1323,7 +2294,6 @@ bool HexagonInstrInfo::isPredicated(const MachineInstr *MI) const { return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask); } - bool HexagonInstrInfo::DefinesPredicate(MachineInstr *MI, std::vector<MachineOperand> &Pred) const { @@ -1331,7 +2301,7 @@ HexagonInstrInfo::DefinesPredicate(MachineInstr *MI, MachineOperand MO = MI->getOperand(oper); if (MO.isReg() && MO.isDef()) { const TargetRegisterClass* RC = RI.getMinimalPhysRegClass(MO.getReg()); - if (RC == Hexagon::PredRegsRegisterClass) { + if (RC == &Hexagon::PredRegsRegClass) { Pred.push_back(MO); return true; } @@ -1373,6 +2343,7 @@ isProfitableToDupForIfCvt(MachineBasicBlock &MBB,unsigned NumInstrs, bool HexagonInstrInfo::isDeallocRet(const MachineInstr *MI) const { switch (MI->getOpcode()) { + default: return false; case Hexagon::DEALLOC_RET_V4 : case Hexagon::DEALLOC_RET_cPt_V4 : case Hexagon::DEALLOC_RET_cNotPt_V4 : @@ -1382,7 +2353,6 @@ bool HexagonInstrInfo::isDeallocRet(const MachineInstr *MI) const { case Hexagon::DEALLOC_RET_cNotdnPt_V4 : return true; } - return false; } @@ -1396,13 +2366,17 @@ isValidOffset(const int Opcode, const int Offset) const { switch(Opcode) { case Hexagon::LDriw: + case Hexagon::LDriw_f: case Hexagon::STriw: + case Hexagon::STriw_f: assert((Offset % 4 == 0) && "Offset has incorrect alignment"); return (Offset >= Hexagon_MEMW_OFFSET_MIN) && (Offset <= Hexagon_MEMW_OFFSET_MAX); case Hexagon::LDrid: + case Hexagon::LDrid_f: case Hexagon::STrid: + case Hexagon::STrid_f: assert((Offset % 8 == 0) && "Offset has incorrect alignment"); return (Offset >= Hexagon_MEMD_OFFSET_MIN) && (Offset <= Hexagon_MEMD_OFFSET_MAX); @@ -1410,7 +2384,6 @@ isValidOffset(const int Opcode, const int Offset) const { case Hexagon::LDrih: case Hexagon::LDriuh: case Hexagon::STrih: - case Hexagon::LDrih_ae: assert((Offset % 2 == 0) && "Offset has incorrect alignment"); return (Offset >= Hexagon_MEMH_OFFSET_MIN) && (Offset <= Hexagon_MEMH_OFFSET_MAX); @@ -1418,9 +2391,6 @@ isValidOffset(const int Opcode, const int Offset) const { case Hexagon::LDrib: case Hexagon::STrib: case Hexagon::LDriub: - case Hexagon::LDriubit: - case Hexagon::LDrib_ae: - case Hexagon::LDriub_ae: return (Offset >= Hexagon_MEMB_OFFSET_MIN) && (Offset <= Hexagon_MEMB_OFFSET_MAX); @@ -1528,6 +2498,7 @@ bool HexagonInstrInfo:: 
isMemOp(const MachineInstr *MI) const { switch (MI->getOpcode()) { + default: return false; case Hexagon::MEMw_ADDSUBi_indexed_MEM_V4 : case Hexagon::MEMw_ADDi_indexed_MEM_V4 : case Hexagon::MEMw_SUBi_indexed_MEM_V4 : @@ -1570,28 +2541,59 @@ isMemOp(const MachineInstr *MI) const { case Hexagon::MEMb_SUBr_MEM_V4 : case Hexagon::MEMb_ANDr_MEM_V4 : case Hexagon::MEMb_ORr_MEM_V4 : - return true; + return true; } - return false; } bool HexagonInstrInfo:: isSpillPredRegOp(const MachineInstr *MI) const { - switch (MI->getOpcode()) - { + switch (MI->getOpcode()) { + default: return false; case Hexagon::STriw_pred : case Hexagon::LDriw_pred : - return true; + return true; + } +} + +bool HexagonInstrInfo::isNewValueJumpCandidate(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: return false; + case Hexagon::CMPEQrr: + case Hexagon::CMPEQri: + case Hexagon::CMPLTrr: + case Hexagon::CMPGTrr: + case Hexagon::CMPGTri: + case Hexagon::CMPLTUrr: + case Hexagon::CMPGTUrr: + case Hexagon::CMPGTUri: + case Hexagon::CMPGEri: + case Hexagon::CMPGEUri: + return true; } - return false; } +bool HexagonInstrInfo:: +isConditionalTransfer (const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: return false; + case Hexagon::TFR_cPt: + case Hexagon::TFR_cNotPt: + case Hexagon::TFRI_cPt: + case Hexagon::TFRI_cNotPt: + case Hexagon::TFR_cdnPt: + case Hexagon::TFR_cdnNotPt: + case Hexagon::TFRI_cdnPt: + case Hexagon::TFRI_cdnNotPt: + return true; + } +} bool HexagonInstrInfo::isConditionalALU32 (const MachineInstr* MI) const { const HexagonRegisterInfo& QRI = getRegisterInfo(); switch (MI->getOpcode()) { + default: return false; case Hexagon::ADD_ri_cPt: case Hexagon::ADD_ri_cNotPt: case Hexagon::ADD_rr_cPt: @@ -1619,19 +2621,16 @@ bool HexagonInstrInfo::isConditionalALU32 (const MachineInstr* MI) const { case Hexagon::ZXTB_cNotPt_V4: case Hexagon::ZXTH_cPt_V4: case Hexagon::ZXTH_cNotPt_V4: - return QRI.Subtarget.getHexagonArchVersion() == HexagonSubtarget::V4; - - default: - return false; + return QRI.Subtarget.hasV4TOps(); } } - bool HexagonInstrInfo:: isConditionalLoad (const MachineInstr* MI) const { const HexagonRegisterInfo& QRI = getRegisterInfo(); switch (MI->getOpcode()) { + default: return false; case Hexagon::LDrid_cPt : case Hexagon::LDrid_cNotPt : case Hexagon::LDrid_indexed_cPt : @@ -1669,7 +2668,7 @@ isConditionalLoad (const MachineInstr* MI) const { case Hexagon::POST_LDriuh_cNotPt : case Hexagon::POST_LDriub_cPt : case Hexagon::POST_LDriub_cNotPt : - return QRI.Subtarget.getHexagonArchVersion() == HexagonSubtarget::V4; + return QRI.Subtarget.hasV4TOps(); case Hexagon::LDrid_indexed_cPt_V4 : case Hexagon::LDrid_indexed_cNotPt_V4 : case Hexagon::LDrid_indexed_shl_cPt_V4 : case Hexagon::LDrid_indexed_shl_cNotPt_V4 : @@ -1694,12 +2693,136 @@ isConditionalLoad (const MachineInstr* MI) const { case Hexagon::LDriw_indexed_cNotPt_V4 : case Hexagon::LDriw_indexed_shl_cPt_V4 : case Hexagon::LDriw_indexed_shl_cNotPt_V4 : - return QRI.Subtarget.getHexagonArchVersion() == HexagonSubtarget::V4; - default: - return false; + return QRI.Subtarget.hasV4TOps(); } }
+// Returns true if an instruction is a conditional store.
+//
+// Note: It doesn't include conditional new-value stores as they can't be
+// converted to .new predicate.
+//
+//          p.new NV store [ if(p0.new)memw(R0+#0)=R2.new ]
+//                ^           ^
+//               /             \ (not OK. it will cause new-value store to be
+//              /               X conditional on p0.new while R2 producer is
+//             /                 \ on p0)
+//            /                   \
+//     p.new store                 p.old NV store
+// [if(p0.new)memw(R0+#0)=R2]  [if(p0)memw(R0+#0)=R2.new]
+//            ^                  ^
+//             \                /
+//              \              /
+//               \            /
+//                 p.old store
+//             [if (p0)memw(R0+#0)=R2]
+//
+// The above diagram shows the steps involved in the conversion of a
+// predicated store instruction to its .new predicated new-value form.
+//
+// The following set of instructions further explains the scenario where a
+// conditional new-value store becomes invalid when promoted to .new predicate
+// form.
+//
+// { 1) if (p0) r0 = add(r1, r2)
+//   2) p0 = cmp.eq(r3, #0) }
+//
+// 3) if (p0) memb(r1+#0) = r0 --> this instruction can't be grouped with
+// the first two instructions because in instr 1, r0 is conditional on the old
+// value of p0, but its use in instr 3 is conditional on p0 modified by
+// instr 2, which is not valid for new-value stores.
+bool HexagonInstrInfo:: +isConditionalStore (const MachineInstr* MI) const { + const HexagonRegisterInfo& QRI = getRegisterInfo(); + switch (MI->getOpcode()) + { + default: return false; + case Hexagon::STrib_imm_cPt_V4 : + case Hexagon::STrib_imm_cNotPt_V4 : + case Hexagon::STrib_indexed_shl_cPt_V4 : + case Hexagon::STrib_indexed_shl_cNotPt_V4 : + case Hexagon::STrib_cPt : + case Hexagon::STrib_cNotPt : + case Hexagon::POST_STbri_cPt : + case Hexagon::POST_STbri_cNotPt : + case Hexagon::STrid_indexed_cPt : + case Hexagon::STrid_indexed_cNotPt : + case Hexagon::STrid_indexed_shl_cPt_V4 : + case Hexagon::POST_STdri_cPt : + case Hexagon::POST_STdri_cNotPt : + case Hexagon::STrih_cPt : + case Hexagon::STrih_cNotPt : + case Hexagon::STrih_indexed_cPt : + case Hexagon::STrih_indexed_cNotPt : + case Hexagon::STrih_imm_cPt_V4 : + case Hexagon::STrih_imm_cNotPt_V4 : + case Hexagon::STrih_indexed_shl_cPt_V4 : + case Hexagon::STrih_indexed_shl_cNotPt_V4 : + case Hexagon::POST_SThri_cPt : + case Hexagon::POST_SThri_cNotPt : + case Hexagon::STriw_cPt : + case Hexagon::STriw_cNotPt : + case Hexagon::STriw_indexed_cPt : + case Hexagon::STriw_indexed_cNotPt : + case Hexagon::STriw_imm_cPt_V4 : + case Hexagon::STriw_imm_cNotPt_V4 : + case Hexagon::STriw_indexed_shl_cPt_V4 : + case Hexagon::STriw_indexed_shl_cNotPt_V4 : + case Hexagon::POST_STwri_cPt : + case Hexagon::POST_STwri_cNotPt : + return QRI.Subtarget.hasV4TOps(); + + // V4 global address store before promoting to dot-new. + case Hexagon::STrid_GP_cPt_V4 : + case Hexagon::STrid_GP_cNotPt_V4 : + case Hexagon::STrib_GP_cPt_V4 : + case Hexagon::STrib_GP_cNotPt_V4 : + case Hexagon::STrih_GP_cPt_V4 : + case Hexagon::STrih_GP_cNotPt_V4 : + case Hexagon::STriw_GP_cPt_V4 : + case Hexagon::STriw_GP_cNotPt_V4 : + case Hexagon::STd_GP_cPt_V4 : + case Hexagon::STd_GP_cNotPt_V4 : + case Hexagon::STb_GP_cPt_V4 : + case Hexagon::STb_GP_cNotPt_V4 : + case Hexagon::STh_GP_cPt_V4 : + case Hexagon::STh_GP_cNotPt_V4 : + case Hexagon::STw_GP_cPt_V4 : + case Hexagon::STw_GP_cNotPt_V4 : + return QRI.Subtarget.hasV4TOps(); + + // Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded + // from the "Conditional Store" list because a predicated new-value store + // would NOT be promoted to a double dot-new store; see the diagram below. + // This function returns true for those stores that are predicated but not + // yet promoted to predicate dot-new instructions.
+  //
+  //                          +---------------------+
+  //                    /-----| if (p0) memw(..)=r0 |---------\
+  //                   ||     +---------------------+        ||
+  //          promote  ||       /\       /\                  ||  promote
+  //                   ||      /||\     /||\                 ||
+  //                  \||/    demote    ||                  \||/
+  //                   \/       ||      ||                   \/
+  //       +-------------------------+  ||  +-------------------------+
+  //       | if (p0.new) memw(..)=r0 |  ||  | if (p0) memw(..)=r0.new |
+  //       +-------------------------+  ||  +-------------------------+
+  //                        ||          ||         ||
+  //                        ||        demote      \||/
+  //                      promote       ||         \/   NOT possible
+  //                        ||          ||        /\
+  //                       \||/         ||       /||\
+  //                        \/          ||        ||
+  //                      +-----------------------------+
+  //                      | if (p0.new) memw(..)=r0.new |
+  //                      +-----------------------------+
+  //                           Double Dot New Store
+  //
+  }
+}
+
+
+
 DFAPacketizer *HexagonInstrInfo:: CreateTargetScheduleState(const TargetMachine *TM, const ScheduleDAG *DAG) const { diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h index 7306870..2bb53f8 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/lib/Target/Hexagon/HexagonInstrInfo.h @@ -112,7 +112,7 @@ public: PredicateInstruction(MachineInstr *MI, const SmallVectorImpl<MachineOperand> &Cond) const; - virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, + virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, unsigned ExtraPredCycles, const BranchProbability &Probability) const; @@ -160,10 +160,21 @@ public: bool isS8_Immediate(const int value) const; bool isS6_Immediate(const int value) const; + bool isSaveCalleeSavedRegsCall(const MachineInstr* MI) const; + bool isConditionalTransfer(const MachineInstr* MI) const; bool isConditionalALU32 (const MachineInstr* MI) const; bool isConditionalLoad (const MachineInstr* MI) const; + bool isConditionalStore(const MachineInstr* MI) const; bool isDeallocRet(const MachineInstr *MI) const; unsigned getInvertedPredicatedOpcode(const int Opc) const; + bool isExtendable(const MachineInstr* MI) const; + bool isExtended(const MachineInstr* MI) const; + bool isPostIncrement(const MachineInstr* MI) const; + bool isNewValueStore(const MachineInstr* MI) const; + bool isNewValueJump(const MachineInstr* MI) const; + bool isNewValueJumpCandidate(const MachineInstr *MI) const; + unsigned getImmExtForm(const MachineInstr* MI) const; + unsigned getNormalBranchForm(const MachineInstr* MI) const; private: int getMatchingCondBranchOpcode(int Opc, bool sense) const; diff --git a/lib/Target/Hexagon/HexagonInstrInfo.td b/lib/Target/Hexagon/HexagonInstrInfo.td index b563ac3..c7be5ce 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.td +++ b/lib/Target/Hexagon/HexagonInstrInfo.td @@ -25,7 +25,10 @@ def HasV3TOnly : Predicate<"Subtarget.hasV3TOpsOnly()">; def NoV3T : Predicate<"!Subtarget.hasV3TOps()">; def HasV4T : Predicate<"Subtarget.hasV4TOps()">; def NoV4T : Predicate<"!Subtarget.hasV4TOps()">; +def HasV5T : Predicate<"Subtarget.hasV5TOps()">; +def NoV5T : Predicate<"!Subtarget.hasV5TOps()">; def UseMEMOP : Predicate<"Subtarget.useMemOps()">; +def IEEERndNearV5T : Predicate<"Subtarget.modeIEEERndNear()">; // Addressing modes.
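The batch of predicates added to HexagonInstrInfo.h above (isExtendable, isExtended, isNewValueStore, isNewValueJump, isNewValueJumpCandidate, isConditionalTransfer, and so on) are classification hooks for later VLIW passes such as the packetizer. A hedged sketch of one way a consumer might combine them follows; the helper name and its policy are assumptions, and only the HexagonInstrInfo queries come from the header hunk above.

    #include "HexagonInstrInfo.h"
    #include "llvm/CodeGen/MachineInstr.h"

    using namespace llvm;

    // Hypothetical helper: could this compare/branch pair be fused into a
    // new-value jump? The policy shown is illustrative only.
    static bool mayFormNewValueJump(const HexagonInstrInfo *TII,
                                    const MachineInstr *Cmp,
                                    const MachineInstr *Br) {
      // The compare must be one of the fusable opcodes (CMPEQrr etc.).
      if (!TII->isNewValueJumpCandidate(Cmp))
        return false;
      // Skip branches that are already immediate-extended new-value jumps;
      // those carry a constant extender and need separate handling.
      if (TII->isNewValueJump(Br) && TII->isExtended(Br))
        return false;
      return true;
    }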
def ADDRrr : ComplexPattern<i32, 2, "SelectADDRrr", [], []>; @@ -84,10 +87,12 @@ def symbolLo32 : Operand<i32> { multiclass ALU32_rr_ri<string OpcStr, SDNode OpNode> { def rr : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c), !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")), - [(set IntRegs:$dst, (OpNode IntRegs:$b, IntRegs:$c))]>; + [(set (i32 IntRegs:$dst), (OpNode (i32 IntRegs:$b), + (i32 IntRegs:$c)))]>; def ri : ALU32_ri<(outs IntRegs:$dst), (ins s10Imm:$b, IntRegs:$c), !strconcat("$dst = ", !strconcat(OpcStr, "(#$b, $c)")), - [(set IntRegs:$dst, (OpNode s10Imm:$b, IntRegs:$c))]>; + [(set (i32 IntRegs:$dst), (OpNode s10Imm:$b, + (i32 IntRegs:$c)))]>; } // Multi-class for compare ops. @@ -95,111 +100,114 @@ let isCompare = 1 in { multiclass CMP64_rr<string OpcStr, PatFrag OpNode> { def rr : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$b, DoubleRegs:$c), !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")), - [(set PredRegs:$dst, (OpNode DoubleRegs:$b, DoubleRegs:$c))]>; + [(set (i1 PredRegs:$dst), + (OpNode (i64 DoubleRegs:$b), (i64 DoubleRegs:$c)))]>; } multiclass CMP32_rr<string OpcStr, PatFrag OpNode> { def rr : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c), !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")), - [(set PredRegs:$dst, (OpNode IntRegs:$b, IntRegs:$c))]>; + [(set (i1 PredRegs:$dst), + (OpNode (i32 IntRegs:$b), (i32 IntRegs:$c)))]>; } multiclass CMP32_rr_ri_s10<string OpcStr, PatFrag OpNode> { def rr : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c), !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")), - [(set PredRegs:$dst, (OpNode IntRegs:$b, IntRegs:$c))]>; + [(set (i1 PredRegs:$dst), + (OpNode (i32 IntRegs:$b), (i32 IntRegs:$c)))]>; def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, s10Imm:$c), !strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")), - [(set PredRegs:$dst, (OpNode IntRegs:$b, s10ImmPred:$c))]>; + [(set (i1 PredRegs:$dst), + (OpNode (i32 IntRegs:$b), s10ImmPred:$c))]>; } multiclass CMP32_rr_ri_u9<string OpcStr, PatFrag OpNode> { def rr : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c), !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")), - [(set PredRegs:$dst, (OpNode IntRegs:$b, IntRegs:$c))]>; + [(set (i1 PredRegs:$dst), + (OpNode (i32 IntRegs:$b), (i32 IntRegs:$c)))]>; def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, u9Imm:$c), !strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")), - [(set PredRegs:$dst, (OpNode IntRegs:$b, u9ImmPred:$c))]>; + [(set (i1 PredRegs:$dst), + (OpNode (i32 IntRegs:$b), u9ImmPred:$c))]>; } -multiclass CMP32_ri_u9<string OpcStr, PatFrag OpNode> { - def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, u9Imm:$c), +multiclass CMP32_ri_u8<string OpcStr, PatFrag OpNode> { + def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, u8Imm:$c), !strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")), - [(set PredRegs:$dst, (OpNode IntRegs:$b, u9ImmPred:$c))]>; + [(set (i1 PredRegs:$dst), (OpNode (i32 IntRegs:$b), + u8ImmPred:$c))]>; } multiclass CMP32_ri_s8<string OpcStr, PatFrag OpNode> { def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, s8Imm:$c), !strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")), - [(set PredRegs:$dst, (OpNode IntRegs:$b, s8ImmPred:$c))]>; + [(set (i1 PredRegs:$dst), (OpNode (i32 IntRegs:$b), + s8ImmPred:$c))]>; } } //===----------------------------------------------------------------------===// -// Instructions -//===----------------------------------------------------------------------===// - 
-//===----------------------------------------------------------------------===// -// http://qualnet.qualcomm.com/~erich/v1/htmldocs/index.html -// http://qualnet.qualcomm.com/~erich/v2/htmldocs/index.html -// http://qualnet.qualcomm.com/~erich/v3/htmldocs/index.html -// http://qualnet.qualcomm.com/~erich/v4/htmldocs/index.html -// http://qualnet.qualcomm.com/~erich/v5/htmldocs/index.html -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// // ALU32/ALU + //===----------------------------------------------------------------------===// // Add. -let isPredicable = 1 in +let isCommutable = 1, isPredicable = 1 in def ADD_rr : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = add($src1, $src2)", - [(set IntRegs:$dst, (add IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), (add (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; let isPredicable = 1 in def ADD_ri : ALU32_ri<(outs IntRegs:$dst), (ins IntRegs:$src1, s16Imm:$src2), "$dst = add($src1, #$src2)", - [(set IntRegs:$dst, (add IntRegs:$src1, s16ImmPred:$src2))]>; + [(set (i32 IntRegs:$dst), (add (i32 IntRegs:$src1), + s16ImmPred:$src2))]>; // Logical operations. let isPredicable = 1 in def XOR_rr : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = xor($src1, $src2)", - [(set IntRegs:$dst, (xor IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), (xor (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; -let isPredicable = 1 in +let isCommutable = 1, isPredicable = 1 in def AND_rr : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = and($src1, $src2)", - [(set IntRegs:$dst, (and IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), (and (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; def OR_ri : ALU32_ri<(outs IntRegs:$dst), - (ins IntRegs:$src1, s8Imm:$src2), + (ins IntRegs:$src1, s10Imm:$src2), "$dst = or($src1, #$src2)", - [(set IntRegs:$dst, (or IntRegs:$src1, s8ImmPred:$src2))]>; + [(set (i32 IntRegs:$dst), (or (i32 IntRegs:$src1), + s10ImmPred:$src2))]>; def NOT_rr : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1), "$dst = not($src1)", - [(set IntRegs:$dst, (not IntRegs:$src1))]>; + [(set (i32 IntRegs:$dst), (not (i32 IntRegs:$src1)))]>; def AND_ri : ALU32_ri<(outs IntRegs:$dst), (ins IntRegs:$src1, s10Imm:$src2), "$dst = and($src1, #$src2)", - [(set IntRegs:$dst, (and IntRegs:$src1, s10ImmPred:$src2))]>; + [(set (i32 IntRegs:$dst), (and (i32 IntRegs:$src1), + s10ImmPred:$src2))]>; -let isPredicable = 1 in +let isCommutable = 1, isPredicable = 1 in def OR_rr : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = or($src1, $src2)", - [(set IntRegs:$dst, (or IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), (or (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; // Negate. def NEG : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1), "$dst = neg($src1)", - [(set IntRegs:$dst, (ineg IntRegs:$src1))]>; + [(set (i32 IntRegs:$dst), (ineg (i32 IntRegs:$src1)))]>; // Nop. 
let neverHasSideEffects = 1 in def NOP : ALU32_rr<(outs), (ins), @@ -211,13 +219,20 @@ let isPredicable = 1 in def SUB_rr : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = sub($src1, $src2)", - [(set IntRegs:$dst, (sub IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), (sub (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; + +// Rd32=sub(#s10,Rs32) +def SUB_ri : ALU32_ri<(outs IntRegs:$dst), + (ins s10Imm:$src1, IntRegs:$src2), + "$dst = sub(#$src1, $src2)", + [(set IntRegs:$dst, (sub s10ImmPred:$src1, IntRegs:$src2))]>; // Transfer immediate. -let isReMaterializable = 1, isPredicable = 1 in +let isMoveImm = 1, isReMaterializable = 1, isPredicable = 1 in def TFRI : ALU32_ri<(outs IntRegs:$dst), (ins s16Imm:$src1), "$dst = #$src1", - [(set IntRegs:$dst, s16ImmPred:$src1)]>; + [(set (i32 IntRegs:$dst), s16ImmPred:$src1)]>; // Transfer register. let neverHasSideEffects = 1, isPredicable = 1 in @@ -225,6 +240,11 @@ def TFR : ALU32_ri<(outs IntRegs:$dst), (ins IntRegs:$src1), "$dst = $src1", []>; +let neverHasSideEffects = 1, isPredicable = 1 in +def TFR64 : ALU32_ri<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1), + "$dst = $src1", + []>; + // Transfer control register. let neverHasSideEffects = 1 in def TFCR : CRInst<(outs CRRegs:$dst), (ins IntRegs:$src1), @@ -246,6 +266,12 @@ def COMBINE_rr : ALU32_rr<(outs DoubleRegs:$dst), "$dst = combine($src1, $src2)", []>; +let neverHasSideEffects = 1 in +def COMBINE_ii : ALU32_ii<(outs DoubleRegs:$dst), + (ins s8Imm:$src1, s8Imm:$src2), + "$dst = combine(#$src1, #$src2)", + []>; + // Mux. def VMUX_prr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1, DoubleRegs:$src2, @@ -256,48 +282,52 @@ def VMUX_prr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1, def MUX_rr : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "$dst = mux($src1, $src2, $src3)", - [(set IntRegs:$dst, (select PredRegs:$src1, IntRegs:$src2, - IntRegs:$src3))]>; + [(set (i32 IntRegs:$dst), (i32 (select (i1 PredRegs:$src1), + (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))]>; def MUX_ir : ALU32_ir<(outs IntRegs:$dst), (ins PredRegs:$src1, s8Imm:$src2, IntRegs:$src3), "$dst = mux($src1, #$src2, $src3)", - [(set IntRegs:$dst, (select PredRegs:$src1, - s8ImmPred:$src2, IntRegs:$src3))]>; + [(set (i32 IntRegs:$dst), (i32 (select (i1 PredRegs:$src1), + s8ImmPred:$src2, + (i32 IntRegs:$src3))))]>; def MUX_ri : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, s8Imm:$src3), "$dst = mux($src1, $src2, #$src3)", - [(set IntRegs:$dst, (select PredRegs:$src1, IntRegs:$src2, - s8ImmPred:$src3))]>; + [(set (i32 IntRegs:$dst), (i32 (select (i1 PredRegs:$src1), + (i32 IntRegs:$src2), + s8ImmPred:$src3)))]>; def MUX_ii : ALU32_ii<(outs IntRegs:$dst), (ins PredRegs:$src1, s8Imm:$src2, s8Imm:$src3), "$dst = mux($src1, #$src2, #$src3)", - [(set IntRegs:$dst, (select PredRegs:$src1, s8ImmPred:$src2, - s8ImmPred:$src3))]>; + [(set (i32 IntRegs:$dst), (i32 (select (i1 PredRegs:$src1), + s8ImmPred:$src2, + s8ImmPred:$src3)))]>; // Shift halfword. let isPredicable = 1 in def ASLH : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1), "$dst = aslh($src1)", - [(set IntRegs:$dst, (shl 16, IntRegs:$src1))]>; + [(set (i32 IntRegs:$dst), (shl 16, (i32 IntRegs:$src1)))]>; let isPredicable = 1 in def ASRH : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1), "$dst = asrh($src1)", - [(set IntRegs:$dst, (sra 16, IntRegs:$src1))]>; + [(set (i32 IntRegs:$dst), (sra 16, (i32 IntRegs:$src1)))]>; // Sign extend. 
let isPredicable = 1 in def SXTB : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1), "$dst = sxtb($src1)", - [(set IntRegs:$dst, (sext_inreg IntRegs:$src1, i8))]>; + [(set (i32 IntRegs:$dst), (sext_inreg (i32 IntRegs:$src1), i8))]>; let isPredicable = 1 in def SXTH : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1), "$dst = sxth($src1)", - [(set IntRegs:$dst, (sext_inreg IntRegs:$src1, i16))]>; + [(set (i32 IntRegs:$dst), (sext_inreg (i32 IntRegs:$src1), i16))]>; // Zero extend. let isPredicable = 1, neverHasSideEffects = 1 in @@ -321,25 +351,25 @@ def ZXTH : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1), // Conditional add. let neverHasSideEffects = 1, isPredicated = 1 in def ADD_ri_cPt : ALU32_ri<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, s16Imm:$src3), + (ins PredRegs:$src1, IntRegs:$src2, s8Imm:$src3), "if ($src1) $dst = add($src2, #$src3)", []>; let neverHasSideEffects = 1, isPredicated = 1 in def ADD_ri_cNotPt : ALU32_ri<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, s16Imm:$src3), + (ins PredRegs:$src1, IntRegs:$src2, s8Imm:$src3), "if (!$src1) $dst = add($src2, #$src3)", []>; let neverHasSideEffects = 1, isPredicated = 1 in def ADD_ri_cdnPt : ALU32_ri<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, s16Imm:$src3), + (ins PredRegs:$src1, IntRegs:$src2, s8Imm:$src3), "if ($src1.new) $dst = add($src2, #$src3)", []>; let neverHasSideEffects = 1, isPredicated = 1 in def ADD_ri_cdnNotPt : ALU32_ri<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, s16Imm:$src3), + (ins PredRegs:$src1, IntRegs:$src2, s8Imm:$src3), "if (!$src1.new) $dst = add($src2, #$src3)", []>; @@ -497,7 +527,6 @@ def SUB_rr_cdnNotPt : ALU32_rr<(outs IntRegs:$dst), // Conditional transfer. - let neverHasSideEffects = 1, isPredicated = 1 in def TFR_cPt : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2), "if ($src1) $dst = $src2", @@ -510,6 +539,18 @@ def TFR_cNotPt : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, []>; let neverHasSideEffects = 1, isPredicated = 1 in +def TFR64_cPt : ALU32_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1, + DoubleRegs:$src2), + "if ($src1) $dst = $src2", + []>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def TFR64_cNotPt : ALU32_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1, + DoubleRegs:$src2), + "if (!$src1) $dst = $src2", + []>; + +let neverHasSideEffects = 1, isPredicated = 1 in def TFRI_cPt : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1, s12Imm:$src2), "if ($src1) $dst = #$src2", []>; @@ -548,25 +589,14 @@ def TFRI_cdnNotPt : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1, defm CMPGTU : CMP32_rr_ri_u9<"cmp.gtu", setugt>; defm CMPGT : CMP32_rr_ri_s10<"cmp.gt", setgt>; defm CMPLT : CMP32_rr<"cmp.lt", setlt>; +defm CMPLTU : CMP32_rr<"cmp.ltu", setult>; defm CMPEQ : CMP32_rr_ri_s10<"cmp.eq", seteq>; defm CMPGE : CMP32_ri_s8<"cmp.ge", setge>; -defm CMPGEU : CMP32_ri_u9<"cmp.geu", setuge>; +defm CMPGEU : CMP32_ri_u8<"cmp.geu", setuge>; //===----------------------------------------------------------------------===// // ALU32/PRED - //===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// ALU32/VH + -//===----------------------------------------------------------------------===// -// Vector add halfwords - -// Vector averagehalfwords - -// Vector subtract halfwords -//===----------------------------------------------------------------------===// -// ALU32/VH - 
-//===----------------------------------------------------------------------===// - //===----------------------------------------------------------------------===// // ALU64/ALU + @@ -575,8 +605,8 @@ defm CMPGEU : CMP32_ri_u9<"cmp.geu", setuge>; def ADD64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), "$dst = add($src1, $src2)", - [(set DoubleRegs:$dst, (add DoubleRegs:$src1, - DoubleRegs:$src2))]>; + [(set (i64 DoubleRegs:$dst), (add (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2)))]>; // Add halfword. @@ -589,40 +619,93 @@ defm CMPGTU64 : CMP64_rr<"cmp.gtu", setugt>; def AND_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), "$dst = and($src1, $src2)", - [(set DoubleRegs:$dst, (and DoubleRegs:$src1, - DoubleRegs:$src2))]>; + [(set (i64 DoubleRegs:$dst), (and (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2)))]>; def OR_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), "$dst = or($src1, $src2)", - [(set DoubleRegs:$dst, (or DoubleRegs:$src1, DoubleRegs:$src2))]>; + [(set (i64 DoubleRegs:$dst), (or (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2)))]>; def XOR_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), "$dst = xor($src1, $src2)", - [(set DoubleRegs:$dst, (xor DoubleRegs:$src1, - DoubleRegs:$src2))]>; + [(set (i64 DoubleRegs:$dst), (xor (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2)))]>; // Maximum. def MAXw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = max($src2, $src1)", - [(set IntRegs:$dst, (select (i1 (setlt IntRegs:$src2, - IntRegs:$src1)), - IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), + (i32 (select (i1 (setlt (i32 IntRegs:$src2), + (i32 IntRegs:$src1))), + (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>; + +def MAXUw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + "$dst = maxu($src2, $src1)", + [(set (i32 IntRegs:$dst), + (i32 (select (i1 (setult (i32 IntRegs:$src2), + (i32 IntRegs:$src1))), + (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>; + +def MAXd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2), + "$dst = max($src2, $src1)", + [(set (i64 DoubleRegs:$dst), + (i64 (select (i1 (setlt (i64 DoubleRegs:$src2), + (i64 DoubleRegs:$src1))), + (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2))))]>; + +def MAXUd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2), + "$dst = maxu($src2, $src1)", + [(set (i64 DoubleRegs:$dst), + (i64 (select (i1 (setult (i64 DoubleRegs:$src2), + (i64 DoubleRegs:$src1))), + (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2))))]>; // Minimum. 
def MINw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = min($src2, $src1)", - [(set IntRegs:$dst, (select (i1 (setgt IntRegs:$src2, - IntRegs:$src1)), - IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), + (i32 (select (i1 (setgt (i32 IntRegs:$src2), + (i32 IntRegs:$src1))), + (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>; + +def MINUw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + "$dst = minu($src2, $src1)", + [(set (i32 IntRegs:$dst), + (i32 (select (i1 (setugt (i32 IntRegs:$src2), + (i32 IntRegs:$src1))), + (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>; + +def MINd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2), + "$dst = min($src2, $src1)", + [(set (i64 DoubleRegs:$dst), + (i64 (select (i1 (setgt (i64 DoubleRegs:$src2), + (i64 DoubleRegs:$src1))), + (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2))))]>; + +def MINUd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2), + "$dst = minu($src2, $src1)", + [(set (i64 DoubleRegs:$dst), + (i64 (select (i1 (setugt (i64 DoubleRegs:$src2), + (i64 DoubleRegs:$src1))), + (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2))))]>; // Subtract. def SUB64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), "$dst = sub($src1, $src2)", - [(set DoubleRegs:$dst, (sub DoubleRegs:$src1, - DoubleRegs:$src2))]>; + [(set (i64 DoubleRegs:$dst), (sub (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2)))]>; // Subtract halfword. @@ -652,30 +735,6 @@ def TFR_64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1), //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// ALU64/VB + -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// -// ALU64/VB - -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// ALU64/VH + -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// -// ALU64/VH - -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// ALU64/VW + -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// -// ALU64/VW - -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// // CR + //===----------------------------------------------------------------------===// // Logical reductions on predicates. @@ -687,7 +746,8 @@ def TFR_64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1), // Logical operations on predicates. 
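[The MAX*/MIN* definitions above lower max/min to a compare-plus-select, with the operand order swapped relative to the assembly string. A small C++ sketch of that equivalence, not taken from the patch:]

    #include <cstdint>

    // MAXw_rr's pattern: select(setlt($src2, $src1), $src1, $src2).
    static int32_t max_like_MAXw_rr(int32_t src1, int32_t src2) {
      return (src2 < src1) ? src1 : src2;  // == max(src1, src2)
    }

    // MINw_rr's pattern: select(setgt($src2, $src1), $src1, $src2).
    static int32_t min_like_MINw_rr(int32_t src1, int32_t src2) {
      return (src2 > src1) ? src1 : src2;  // == min(src1, src2)
    }

The unsigned variants (MAXUw_rr, MINUw_rr) and the 64-bit forms are the same shape with setult/setugt and unsigned or wider types.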
def AND_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, PredRegs:$src2), "$dst = and($src1, $src2)", - [(set PredRegs:$dst, (and PredRegs:$src1, PredRegs:$src2))]>; + [(set (i1 PredRegs:$dst), (and (i1 PredRegs:$src1), + (i1 PredRegs:$src2)))]>; let neverHasSideEffects = 1 in def AND_pnotp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, @@ -726,15 +786,17 @@ def MASK_p : SInst<(outs DoubleRegs:$dst), (ins PredRegs:$src1), def NOT_p : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1), "$dst = not($src1)", - [(set PredRegs:$dst, (not PredRegs:$src1))]>; + [(set (i1 PredRegs:$dst), (not (i1 PredRegs:$src1)))]>; def OR_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, PredRegs:$src2), "$dst = or($src1, $src2)", - [(set PredRegs:$dst, (or PredRegs:$src1, PredRegs:$src2))]>; + [(set (i1 PredRegs:$dst), (or (i1 PredRegs:$src1), + (i1 PredRegs:$src2)))]>; def XOR_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, PredRegs:$src2), "$dst = xor($src1, $src2)", - [(set PredRegs:$dst, (xor PredRegs:$src1, PredRegs:$src2))]>; + [(set (i1 PredRegs:$dst), (xor (i1 PredRegs:$src1), + (i1 PredRegs:$src2)))]>; // User control register transfer. @@ -760,7 +822,7 @@ let isBranch = 1, isTerminator=1, Defs = [PC], def JMP_c : JInst< (outs), (ins PredRegs:$src, brtarget:$offset), "if ($src) jump $offset", - [(brcond PredRegs:$src, bb:$offset)]>; + [(brcond (i1 PredRegs:$src), bb:$offset)]>; } // if (!p0) jump @@ -826,7 +888,7 @@ def retflag : SDNode<"HexagonISD::RET_FLAG", SDTNone, [SDNPHasChain, SDNPOptInGlue]>; // Jump to address from register. -let isReturn = 1, isTerminator = 1, isBarrier = 1, +let isPredicable =1, isReturn = 1, isTerminator = 1, isBarrier = 1, Defs = [PC], Uses = [R31] in { def JMPR: JRInst<(outs), (ins), "jumpr r31", @@ -834,7 +896,7 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, } // Jump to address from register. -let isReturn = 1, isTerminator = 1, isBarrier = 1, +let isReturn = 1, isTerminator = 1, isBarrier = 1, isPredicated = 1, Defs = [PC], Uses = [R31] in { def JMPR_cPt: JRInst<(outs), (ins PredRegs:$src1), "if ($src1) jumpr r31", @@ -842,7 +904,7 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, } // Jump to address from register. 
-let isReturn = 1, isTerminator = 1, isBarrier = 1, +let isReturn = 1, isTerminator = 1, isBarrier = 1, isPredicated = 1, Defs = [PC], Uses = [R31] in { def JMPR_cNotPt: JRInst<(outs), (ins PredRegs:$src1), "if (!$src1) jumpr r31", @@ -865,96 +927,99 @@ let isPredicable = 1 in def LDrid : LDInst<(outs DoubleRegs:$dst), (ins MEMri:$addr), "$dst = memd($addr)", - [(set DoubleRegs:$dst, (load ADDRriS11_3:$addr))]>; + [(set (i64 DoubleRegs:$dst), (i64 (load ADDRriS11_3:$addr)))]>; let isPredicable = 1, AddedComplexity = 20 in def LDrid_indexed : LDInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, s11_3Imm:$offset), - "$dst=memd($src1+#$offset)", - [(set DoubleRegs:$dst, (load (add IntRegs:$src1, - s11_3ImmPred:$offset)))]>; + "$dst = memd($src1+#$offset)", + [(set (i64 DoubleRegs:$dst), + (i64 (load (add (i32 IntRegs:$src1), + s11_3ImmPred:$offset))))]>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrid_GP : LDInst<(outs DoubleRegs:$dst), +let neverHasSideEffects = 1 in +def LDrid_GP : LDInst2<(outs DoubleRegs:$dst), (ins globaladdress:$global, u16Imm:$offset), - "$dst=memd(#$global+$offset)", - []>; + "$dst = memd(#$global+$offset)", + []>, + Requires<[NoV4T]>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDd_GP : LDInst<(outs DoubleRegs:$dst), +let neverHasSideEffects = 1 in +def LDd_GP : LDInst2<(outs DoubleRegs:$dst), (ins globaladdress:$global), - "$dst=memd(#$global)", - []>; + "$dst = memd(#$global)", + []>, + Requires<[NoV4T]>; -let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDrid : LDInstPI<(outs DoubleRegs:$dst, IntRegs:$dst2), +let isPredicable = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDrid : LDInst2PI<(outs DoubleRegs:$dst, IntRegs:$dst2), (ins IntRegs:$src1, s4Imm:$offset), "$dst = memd($src1++#$offset)", [], "$src1 = $dst2">; // Load doubleword conditionally. 
-let mayLoad = 1, neverHasSideEffects = 1 in -def LDrid_cPt : LDInst<(outs DoubleRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrid_cPt : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1) $dst = memd($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrid_cNotPt : LDInst<(outs DoubleRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrid_cNotPt : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1) $dst = memd($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrid_indexed_cPt : LDInst<(outs DoubleRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrid_indexed_cPt : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3), - "if ($src1) $dst=memd($src2+#$src3)", + "if ($src1) $dst = memd($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrid_indexed_cNotPt : LDInst<(outs DoubleRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrid_indexed_cNotPt : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3), - "if (!$src1) $dst=memd($src2+#$src3)", + "if (!$src1) $dst = memd($src2+#$src3)", []>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDrid_cPt : LDInstPI<(outs DoubleRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrid_cPt : LDInst2PI<(outs DoubleRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_3Imm:$src3), "if ($src1) $dst1 = memd($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDrid_cNotPt : LDInstPI<(outs DoubleRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrid_cNotPt : LDInst2PI<(outs DoubleRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_3Imm:$src3), "if (!$src1) $dst1 = memd($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrid_cdnPt : LDInst<(outs DoubleRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrid_cdnPt : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1.new) $dst = memd($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrid_cdnNotPt : LDInst<(outs DoubleRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrid_cdnNotPt : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1.new) $dst = memd($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrid_indexed_cdnPt : LDInst<(outs DoubleRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrid_indexed_cdnPt : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3), - "if ($src1.new) $dst=memd($src2+#$src3)", + "if ($src1.new) $dst = memd($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrid_indexed_cdnNotPt : LDInst<(outs DoubleRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrid_indexed_cdnNotPt : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3), - "if (!$src1.new) $dst=memd($src2+#$src3)", + "if (!$src1.new) $dst = memd($src2+#$src3)", []>; @@ -963,114 +1028,113 @@ let isPredicable = 1 in def LDrib : LDInst<(outs IntRegs:$dst), (ins MEMri:$addr), "$dst = memb($addr)", - [(set IntRegs:$dst, (sextloadi8 ADDRriS11_0:$addr))]>; + [(set (i32 IntRegs:$dst), (i32 (sextloadi8 ADDRriS11_0:$addr)))]>; 
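[The hunk below drops the dedicated any-extend load instructions (LDrib_ae and friends) in favor of "def : Pat" mappings onto the ordinary extending loads. A short C++ sketch, not from the patch, of why that is sound:]

    #include <cstdint>

    // An any-extending i8 load leaves bits 8..31 unspecified, so the
    // sign-extended value that LDrib produces is one legal realization.
    static int32_t anyext_load_i8(const int8_t *p) {
      return static_cast<int32_t>(*p);  // low 8 bits exact; upper bits free
    }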
-def LDrib_ae : LDInst<(outs IntRegs:$dst), - (ins MEMri:$addr), - "$dst = memb($addr)", - [(set IntRegs:$dst, (extloadi8 ADDRriS11_0:$addr))]>; +// Load byte any-extend. +def : Pat < (i32 (extloadi8 ADDRriS11_0:$addr)), + (i32 (LDrib ADDRriS11_0:$addr)) >; // Indexed load byte. let isPredicable = 1, AddedComplexity = 20 in def LDrib_indexed : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s11_0Imm:$offset), - "$dst=memb($src1+#$offset)", - [(set IntRegs:$dst, (sextloadi8 (add IntRegs:$src1, - s11_0ImmPred:$offset)))]>; - + "$dst = memb($src1+#$offset)", + [(set (i32 IntRegs:$dst), + (i32 (sextloadi8 (add (i32 IntRegs:$src1), + s11_0ImmPred:$offset))))]>; // Indexed load byte any-extend. let AddedComplexity = 20 in -def LDrib_ae_indexed : LDInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, s11_0Imm:$offset), - "$dst=memb($src1+#$offset)", - [(set IntRegs:$dst, (extloadi8 (add IntRegs:$src1, - s11_0ImmPred:$offset)))]>; +def : Pat < (i32 (extloadi8 (add IntRegs:$src1, s11_0ImmPred:$offset))), + (i32 (LDrib_indexed IntRegs:$src1, s11_0ImmPred:$offset)) >; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrib_GP : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1 in +def LDrib_GP : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global, u16Imm:$offset), - "$dst=memb(#$global+$offset)", - []>; + "$dst = memb(#$global+$offset)", + []>, + Requires<[NoV4T]>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDb_GP : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1 in +def LDb_GP : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global), - "$dst=memb(#$global)", - []>; + "$dst = memb(#$global)", + []>, + Requires<[NoV4T]>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDub_GP : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1 in +def LDub_GP : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global), - "$dst=memub(#$global)", - []>; + "$dst = memub(#$global)", + []>, + Requires<[NoV4T]>; -let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDrib : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2), +let isPredicable = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDrib : LDInst2PI<(outs IntRegs:$dst, IntRegs:$dst2), (ins IntRegs:$src1, s4Imm:$offset), "$dst = memb($src1++#$offset)", [], "$src1 = $dst2">; // Load byte conditionally. 
-let mayLoad = 1, neverHasSideEffects = 1 in -def LDrib_cPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_cPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1) $dst = memb($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrib_cNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_cNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1) $dst = memb($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrib_indexed_cPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_indexed_cPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), "if ($src1) $dst = memb($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrib_indexed_cNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_indexed_cNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), "if (!$src1) $dst = memb($src2+#$src3)", []>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDrib_cPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrib_cPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), "if ($src1) $dst1 = memb($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDrib_cNotPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrib_cNotPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), "if (!$src1) $dst1 = memb($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrib_cdnPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_cdnPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1.new) $dst = memb($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrib_cdnNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_cdnNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1.new) $dst = memb($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrib_indexed_cdnPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_indexed_cdnPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), "if ($src1.new) $dst = memb($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrib_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_indexed_cdnNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), "if (!$src1.new) $dst = memb($src2+#$src3)", []>; @@ -1081,112 +1145,110 @@ let isPredicable = 1 in def LDrih : LDInst<(outs IntRegs:$dst), (ins MEMri:$addr), "$dst = memh($addr)", - [(set IntRegs:$dst, (sextloadi16 ADDRriS11_1:$addr))]>; + [(set (i32 IntRegs:$dst), (i32 (sextloadi16 ADDRriS11_1:$addr)))]>; let isPredicable = 1, AddedComplexity = 20 in def LDrih_indexed : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s11_1Imm:$offset), - "$dst=memh($src1+#$offset)", - [(set IntRegs:$dst, (sextloadi16 (add IntRegs:$src1, - 
s11_1ImmPred:$offset)))] >; + "$dst = memh($src1+#$offset)", + [(set (i32 IntRegs:$dst), + (i32 (sextloadi16 (add (i32 IntRegs:$src1), + s11_1ImmPred:$offset))))]>; -def LDrih_ae : LDInst<(outs IntRegs:$dst), - (ins MEMri:$addr), - "$dst = memh($addr)", - [(set IntRegs:$dst, (extloadi16 ADDRriS11_1:$addr))]>; +def : Pat < (i32 (extloadi16 ADDRriS11_1:$addr)), + (i32 (LDrih ADDRriS11_1:$addr))>; let AddedComplexity = 20 in -def LDrih_ae_indexed : LDInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, s11_1Imm:$offset), - "$dst=memh($src1+#$offset)", - [(set IntRegs:$dst, (extloadi16 (add IntRegs:$src1, - s11_1ImmPred:$offset)))] >; +def : Pat < (i32 (extloadi16 (add IntRegs:$src1, s11_1ImmPred:$offset))), + (i32 (LDrih_indexed IntRegs:$src1, s11_1ImmPred:$offset)) >; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrih_GP : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1 in +def LDrih_GP : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global, u16Imm:$offset), - "$dst=memh(#$global+$offset)", - []>; + "$dst = memh(#$global+$offset)", + []>, + Requires<[NoV4T]>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDh_GP : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1 in +def LDh_GP : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global), - "$dst=memh(#$global)", - []>; + "$dst = memh(#$global)", + []>, + Requires<[NoV4T]>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDuh_GP : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1 in +def LDuh_GP : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global), - "$dst=memuh(#$global)", - []>; - + "$dst = memuh(#$global)", + []>, + Requires<[NoV4T]>; -let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDrih : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2), +let isPredicable = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDrih : LDInst2PI<(outs IntRegs:$dst, IntRegs:$dst2), (ins IntRegs:$src1, s4Imm:$offset), "$dst = memh($src1++#$offset)", [], "$src1 = $dst2">; // Load halfword conditionally. 
-let mayLoad = 1, neverHasSideEffects = 1 in -def LDrih_cPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_cPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1) $dst = memh($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrih_cNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_cNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1) $dst = memh($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrih_indexed_cPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_indexed_cPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), "if ($src1) $dst = memh($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrih_indexed_cNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_indexed_cNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), "if (!$src1) $dst = memh($src2+#$src3)", []>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDrih_cPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrih_cPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), "if ($src1) $dst1 = memh($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDrih_cNotPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrih_cNotPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), "if (!$src1) $dst1 = memh($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrih_cdnPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_cdnPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1.new) $dst = memh($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrih_cdnNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_cdnNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1.new) $dst = memh($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrih_indexed_cdnPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_indexed_cdnPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), "if ($src1.new) $dst = memh($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrih_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_indexed_cdnNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), "if (!$src1.new) $dst = memh($src2+#$src3)", []>; @@ -1196,113 +1258,96 @@ let isPredicable = 1 in def LDriub : LDInst<(outs IntRegs:$dst), (ins MEMri:$addr), "$dst = memub($addr)", - [(set IntRegs:$dst, (zextloadi8 ADDRriS11_0:$addr))]>; + [(set (i32 IntRegs:$dst), (i32 (zextloadi8 ADDRriS11_0:$addr)))]>; -let isPredicable = 1 in -def LDriubit : LDInst<(outs IntRegs:$dst), - (ins MEMri:$addr), - "$dst = memub($addr)", - [(set IntRegs:$dst, (zextloadi1 ADDRriS11_0:$addr))]>; +def : Pat < (i32 (zextloadi1 ADDRriS11_0:$addr)), + (i32 (LDriub 
ADDRriS11_0:$addr))>; let isPredicable = 1, AddedComplexity = 20 in def LDriub_indexed : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s11_0Imm:$offset), - "$dst=memub($src1+#$offset)", - [(set IntRegs:$dst, (zextloadi8 (add IntRegs:$src1, - s11_0ImmPred:$offset)))]>; - -let AddedComplexity = 20 in -def LDriubit_indexed : LDInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, s11_0Imm:$offset), - "$dst=memub($src1+#$offset)", - [(set IntRegs:$dst, (zextloadi1 (add IntRegs:$src1, - s11_0ImmPred:$offset)))]>; - -def LDriub_ae : LDInst<(outs IntRegs:$dst), - (ins MEMri:$addr), - "$dst = memub($addr)", - [(set IntRegs:$dst, (extloadi8 ADDRriS11_0:$addr))]>; - + "$dst = memub($src1+#$offset)", + [(set (i32 IntRegs:$dst), + (i32 (zextloadi8 (add (i32 IntRegs:$src1), + s11_0ImmPred:$offset))))]>; let AddedComplexity = 20 in -def LDriub_ae_indexed : LDInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, s11_0Imm:$offset), - "$dst=memub($src1+#$offset)", - [(set IntRegs:$dst, (extloadi8 (add IntRegs:$src1, - s11_0ImmPred:$offset)))]>; +def : Pat < (i32 (zextloadi1 (add IntRegs:$src1, s11_0ImmPred:$offset))), + (i32 (LDriub_indexed IntRegs:$src1, s11_0ImmPred:$offset))>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriub_GP : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1 in +def LDriub_GP : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global, u16Imm:$offset), - "$dst=memub(#$global+$offset)", - []>; + "$dst = memub(#$global+$offset)", + []>, + Requires<[NoV4T]>; -let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDriub : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2), +let isPredicable = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDriub : LDInst2PI<(outs IntRegs:$dst, IntRegs:$dst2), (ins IntRegs:$src1, s4Imm:$offset), "$dst = memub($src1++#$offset)", [], "$src1 = $dst2">; // Load unsigned byte conditionally. 
-let mayLoad = 1, neverHasSideEffects = 1 in -def LDriub_cPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_cPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1) $dst = memub($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriub_cNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_cNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1) $dst = memub($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriub_indexed_cPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_indexed_cPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), "if ($src1) $dst = memub($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriub_indexed_cNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_indexed_cNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), "if (!$src1) $dst = memub($src2+#$src3)", []>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDriub_cPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriub_cPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), "if ($src1) $dst1 = memub($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDriub_cNotPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriub_cNotPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), "if (!$src1) $dst1 = memub($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriub_cdnPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_cdnPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1.new) $dst = memub($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriub_cdnNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_cdnNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1.new) $dst = memub($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriub_indexed_cdnPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_indexed_cdnPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), "if ($src1.new) $dst = memub($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriub_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_indexed_cdnNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), "if (!$src1.new) $dst = memub($src2+#$src3)", []>; @@ -1312,102 +1357,90 @@ let isPredicable = 1 in def LDriuh : LDInst<(outs IntRegs:$dst), (ins MEMri:$addr), "$dst = memuh($addr)", - [(set IntRegs:$dst, (zextloadi16 ADDRriS11_1:$addr))]>; + [(set (i32 IntRegs:$dst), (i32 (zextloadi16 ADDRriS11_1:$addr)))]>; // Indexed load unsigned halfword. 
let isPredicable = 1, AddedComplexity = 20 in def LDriuh_indexed : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s11_1Imm:$offset), - "$dst=memuh($src1+#$offset)", - [(set IntRegs:$dst, (zextloadi16 (add IntRegs:$src1, - s11_1ImmPred:$offset)))]>; + "$dst = memuh($src1+#$offset)", + [(set (i32 IntRegs:$dst), + (i32 (zextloadi16 (add (i32 IntRegs:$src1), + s11_1ImmPred:$offset))))]>; -def LDriuh_ae : LDInst<(outs IntRegs:$dst), - (ins MEMri:$addr), - "$dst = memuh($addr)", - [(set IntRegs:$dst, (extloadi16 ADDRriS11_1:$addr))]>; - - -// Indexed load unsigned halfword any-extend. -let AddedComplexity = 20 in -def LDriuh_ae_indexed : LDInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, s11_1Imm:$offset), - "$dst=memuh($src1+#$offset)", - [(set IntRegs:$dst, (extloadi16 (add IntRegs:$src1, - s11_1ImmPred:$offset)))] >; - -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriuh_GP : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1 in +def LDriuh_GP : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global, u16Imm:$offset), - "$dst=memuh(#$global+$offset)", - []>; + "$dst = memuh(#$global+$offset)", + []>, + Requires<[NoV4T]>; -let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDriuh : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2), +let isPredicable = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDriuh : LDInst2PI<(outs IntRegs:$dst, IntRegs:$dst2), (ins IntRegs:$src1, s4Imm:$offset), "$dst = memuh($src1++#$offset)", [], "$src1 = $dst2">; // Load unsigned halfword conditionally. -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriuh_cPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_cPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1) $dst = memuh($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriuh_cNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_cNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1) $dst = memuh($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriuh_indexed_cPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_indexed_cPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), "if ($src1) $dst = memuh($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriuh_indexed_cNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_indexed_cNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), "if (!$src1) $dst = memuh($src2+#$src3)", []>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDriuh_cPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriuh_cPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), "if ($src1) $dst1 = memuh($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDriuh_cNotPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriuh_cNotPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), "if (!$src1) $dst1 = memuh($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriuh_cdnPt : LDInst<(outs 
IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_cdnPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1.new) $dst = memuh($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriuh_cdnNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_cdnNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1.new) $dst = memuh($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriuh_indexed_cdnPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_indexed_cdnPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), "if ($src1.new) $dst = memuh($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriuh_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_indexed_cdnNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), "if (!$src1.new) $dst = memuh($src2+#$src3)", []>; @@ -1417,10 +1450,10 @@ def LDriuh_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst), let isPredicable = 1 in def LDriw : LDInst<(outs IntRegs:$dst), (ins MEMri:$addr), "$dst = memw($addr)", - [(set IntRegs:$dst, (load ADDRriS11_2:$addr))]>; + [(set IntRegs:$dst, (i32 (load ADDRriS11_2:$addr)))]>; // Load predicate. -let mayLoad = 1, Defs = [R10,R11] in +let Defs = [R10,R11,D5], neverHasSideEffects = 1 in def LDriw_pred : LDInst<(outs PredRegs:$dst), (ins MEMri:$addr), "Error; should not emit", @@ -1430,24 +1463,26 @@ def LDriw_pred : LDInst<(outs PredRegs:$dst), let isPredicable = 1, AddedComplexity = 20 in def LDriw_indexed : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s11_2Imm:$offset), - "$dst=memw($src1+#$offset)", - [(set IntRegs:$dst, (load (add IntRegs:$src1, - s11_2ImmPred:$offset)))]>; + "$dst = memw($src1+#$offset)", + [(set IntRegs:$dst, (i32 (load (add IntRegs:$src1, + s11_2ImmPred:$offset))))]>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriw_GP : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1 in +def LDriw_GP : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global, u16Imm:$offset), - "$dst=memw(#$global+$offset)", - []>; + "$dst = memw(#$global+$offset)", + []>, + Requires<[NoV4T]>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDw_GP : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1 in +def LDw_GP : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global), - "$dst=memw(#$global)", - []>; + "$dst = memw(#$global)", + []>, + Requires<[NoV4T]>; -let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDriw : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2), +let isPredicable = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDriw : LDInst2PI<(outs IntRegs:$dst, IntRegs:$dst2), (ins IntRegs:$src1, s4Imm:$offset), "$dst = memw($src1++#$offset)", [], @@ -1455,71 +1490,71 @@ def POST_LDriw : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2), // Load word conditionally. 
-let mayLoad = 1, neverHasSideEffects = 1 in -def LDriw_cPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_cPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1) $dst = memw($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriw_cNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_cNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1) $dst = memw($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriw_indexed_cPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_indexed_cPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3), - "if ($src1) $dst=memw($src2+#$src3)", + "if ($src1) $dst = memw($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriw_indexed_cNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_indexed_cNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3), - "if (!$src1) $dst=memw($src2+#$src3)", + "if (!$src1) $dst = memw($src2+#$src3)", []>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDriw_cPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriw_cPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_2Imm:$src3), "if ($src1) $dst1 = memw($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDriw_cNotPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriw_cNotPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_2Imm:$src3), "if (!$src1) $dst1 = memw($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriw_cdnPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_cdnPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1.new) $dst = memw($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriw_cdnNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_cdnNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1.new) $dst = memw($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriw_indexed_cdnPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_indexed_cdnPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3), - "if ($src1.new) $dst=memw($src2+#$src3)", + "if ($src1.new) $dst = memw($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriw_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_indexed_cdnNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3), - "if (!$src1.new) $dst=memw($src2+#$src3)", + "if (!$src1.new) $dst = memw($src2+#$src3)", []>; // Deallocate stack frame. 
let Defs = [R29, R30, R31], Uses = [R29], neverHasSideEffects = 1 in { - def DEALLOCFRAME : LDInst<(outs), (ins i32imm:$amt1), + def DEALLOCFRAME : LDInst2<(outs), (ins i32imm:$amt1), "deallocframe", []>; } @@ -1550,13 +1585,14 @@ let Defs = [R29, R30, R31], Uses = [R29], neverHasSideEffects = 1 in { // Rd=+mpyi(Rs,#u8) def MPYI_riu : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u8Imm:$src2), "$dst =+ mpyi($src1, #$src2)", - [(set IntRegs:$dst, (mul IntRegs:$src1, u8ImmPred:$src2))]>; + [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1), + u8ImmPred:$src2))]>; // Rd=-mpyi(Rs,#u8) def MPYI_rin : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, n8Imm:$src2), "$dst =- mpyi($src1, #$src2)", - [(set IntRegs:$dst, - (mul IntRegs:$src1, n8ImmPred:$src2))]>; + [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1), + n8ImmPred:$src2))]>; // Rd=mpyi(Rs,#m9) // s9 is NOT the same as m9 - but it works.. so far. // Assembler maps to either Rd=+mpyi(Rs,#u8) or Rd=-mpyi(Rs,#u8) // depending on the value of m9. See Arch Spec. def MPYI_ri : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s9Imm:$src2), "$dst = mpyi($src1, #$src2)", - [(set IntRegs:$dst, (mul IntRegs:$src1, s9ImmPred:$src2))]>; + [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1), + s9ImmPred:$src2))]>; // Rd=mpyi(Rs,Rt) def MPYI : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = mpyi($src1, $src2)", - [(set IntRegs:$dst, (mul IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; // Rx+=mpyi(Rs,#u8) def MPYI_acc_ri : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u8Imm:$src3), "$dst += mpyi($src2, #$src3)", - [(set IntRegs:$dst, - (add (mul IntRegs:$src2, u8ImmPred:$src3), IntRegs:$src1))], + [(set (i32 IntRegs:$dst), + (add (mul (i32 IntRegs:$src2), u8ImmPred:$src3), + (i32 IntRegs:$src1)))], "$src1 = $dst">; // Rx+=mpyi(Rs,Rt) def MPYI_acc_rr : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), "$dst += mpyi($src2, $src3)", - [(set IntRegs:$dst, - (add (mul IntRegs:$src2, IntRegs:$src3), IntRegs:$src1))], + [(set (i32 IntRegs:$dst), + (add (mul (i32 IntRegs:$src2), (i32 IntRegs:$src3)), + (i32 IntRegs:$src1)))], "$src1 = $dst">; // Rx-=mpyi(Rs,#u8) def MPYI_sub_ri : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u8Imm:$src3), "$dst -= mpyi($src2, #$src3)", - [(set IntRegs:$dst, - (sub IntRegs:$src1, (mul IntRegs:$src2, u8ImmPred:$src3)))], + [(set (i32 IntRegs:$dst), + (sub (i32 IntRegs:$src1), (mul (i32 IntRegs:$src2), + u8ImmPred:$src3)))], "$src1 = $dst">; // Multiply and use upper result. @@ -1601,27 +1642,30 @@ def MPYI_sub_ri : MInst_acc<(outs IntRegs:$dst), // Rd=mpy(Rs,Rt) def MPY : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = mpy($src1, $src2)", - [(set IntRegs:$dst, (mulhs IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), (mulhs (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; // Rd=mpy(Rs,Rt):rnd // Rd=mpyu(Rs,Rt) def MPYU : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = mpyu($src1, $src2)", - [(set IntRegs:$dst, (mulhu IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), (mulhu (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; // Multiply and use full result.
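[A hypothetical C++ sketch, with invented names rather than LLVM API, of the assembler mapping described by the Rd=mpyi(Rs,#m9) comment above: the signed 9-bit immediate is printed as the u8 form of +mpyi or -mpyi, chosen by its sign.]

    #include <cassert>
    #include <string>

    static std::string mpyiAsmForm(int m9) {
      assert(m9 >= -256 && m9 <= 255 && "m9 is a signed 9-bit value");
      if (m9 < 0)  // maps to MPYI_rin: "$dst =- mpyi($src1, #u8)"
        return "$dst =- mpyi($src1, #" + std::to_string(-m9) + ")";
      // maps to MPYI_riu: "$dst =+ mpyi($src1, #u8)"
      return "$dst =+ mpyi($src1, #" + std::to_string(m9) + ")";
    }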
// Rdd=mpyu(Rs,Rt) def MPYU64 : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = mpyu($src1, $src2)", - [(set DoubleRegs:$dst, (mul (i64 (anyext IntRegs:$src1)), - (i64 (anyext IntRegs:$src2))))]>; + [(set (i64 DoubleRegs:$dst), + (mul (i64 (anyext (i32 IntRegs:$src1))), + (i64 (anyext (i32 IntRegs:$src2)))))]>; // Rdd=mpy(Rs,Rt) def MPY64 : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = mpy($src1, $src2)", - [(set DoubleRegs:$dst, (mul (i64 (sext IntRegs:$src1)), - (i64 (sext IntRegs:$src2))))]>; - + [(set (i64 DoubleRegs:$dst), + (mul (i64 (sext (i32 IntRegs:$src1))), + (i64 (sext (i32 IntRegs:$src2)))))]>; // Multiply and accumulate, use full result. // Rxx[+-]=mpy(Rs,Rt) @@ -1629,18 +1673,20 @@ def MPY64 : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), def MPY64_acc : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), "$dst += mpy($src2, $src3)", - [(set DoubleRegs:$dst, - (add (mul (i64 (sext IntRegs:$src2)), (i64 (sext IntRegs:$src3))), - DoubleRegs:$src1))], + [(set (i64 DoubleRegs:$dst), + (add (mul (i64 (sext (i32 IntRegs:$src2))), + (i64 (sext (i32 IntRegs:$src3)))), + (i64 DoubleRegs:$src1)))], "$src1 = $dst">; // Rxx-=mpy(Rs,Rt) def MPY64_sub : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), "$dst -= mpy($src2, $src3)", - [(set DoubleRegs:$dst, - (sub DoubleRegs:$src1, - (mul (i64 (sext IntRegs:$src2)), (i64 (sext IntRegs:$src3)))))], + [(set (i64 DoubleRegs:$dst), + (sub (i64 DoubleRegs:$src1), + (mul (i64 (sext (i32 IntRegs:$src2))), + (i64 (sext (i32 IntRegs:$src3))))))], "$src1 = $dst">; // Rxx[+-]=mpyu(Rs,Rt) @@ -1648,47 +1694,52 @@ def MPY64_sub : MInst_acc<(outs DoubleRegs:$dst), def MPYU64_acc : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), "$dst += mpyu($src2, $src3)", - [(set DoubleRegs:$dst, (add (mul (i64 (anyext IntRegs:$src2)), - (i64 (anyext IntRegs:$src3))), - DoubleRegs:$src1))],"$src1 = $dst">; + [(set (i64 DoubleRegs:$dst), + (add (mul (i64 (anyext (i32 IntRegs:$src2))), + (i64 (anyext (i32 IntRegs:$src3)))), + (i64 DoubleRegs:$src1)))], "$src1 = $dst">; // Rxx-=mpyu(Rs,Rt) def MPYU64_sub : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), "$dst -= mpyu($src2, $src3)", - [(set DoubleRegs:$dst, - (sub DoubleRegs:$src1, - (mul (i64 (anyext IntRegs:$src2)), - (i64 (anyext IntRegs:$src3)))))], + [(set (i64 DoubleRegs:$dst), + (sub (i64 DoubleRegs:$src1), + (mul (i64 (anyext (i32 IntRegs:$src2))), + (i64 (anyext (i32 IntRegs:$src3))))))], "$src1 = $dst">; def ADDrr_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), "$dst += add($src2, $src3)", - [(set IntRegs:$dst, (add (add IntRegs:$src2, IntRegs:$src3), - IntRegs:$src1))], + [(set (i32 IntRegs:$dst), (add (add (i32 IntRegs:$src2), + (i32 IntRegs:$src3)), + (i32 IntRegs:$src1)))], "$src1 = $dst">; def ADDri_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1, IntRegs:$src2, s8Imm:$src3), "$dst += add($src2, #$src3)", - [(set IntRegs:$dst, (add (add IntRegs:$src2, s8ImmPred:$src3), - IntRegs:$src1))], + [(set (i32 IntRegs:$dst), (add (add (i32 IntRegs:$src2), + s8ImmPred:$src3), + (i32 IntRegs:$src1)))], "$src1 = $dst">; def SUBrr_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), "$dst -= add($src2, $src3)", - [(set IntRegs:$dst, (sub IntRegs:$src1, (add IntRegs:$src2, - IntRegs:$src3)))], + [(set (i32
IntRegs:$dst), + (sub (i32 IntRegs:$src1), (add (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">; def SUBri_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1, IntRegs:$src2, s8Imm:$src3), "$dst -= add($src2, #$src3)", - [(set IntRegs:$dst, (sub IntRegs:$src1, - (add IntRegs:$src2, s8ImmPred:$src3)))], + [(set (i32 IntRegs:$dst), (sub (i32 IntRegs:$src1), + (add (i32 IntRegs:$src2), + s8ImmPred:$src3)))], "$src1 = $dst">; //===----------------------------------------------------------------------===// @@ -1731,57 +1782,70 @@ let isPredicable = 1 in def STrid : STInst<(outs), (ins MEMri:$addr, DoubleRegs:$src1), "memd($addr) = $src1", - [(store DoubleRegs:$src1, ADDRriS11_3:$addr)]>; + [(store (i64 DoubleRegs:$src1), ADDRriS11_3:$addr)]>; // Indexed store double word. let AddedComplexity = 10, isPredicable = 1 in def STrid_indexed : STInst<(outs), (ins IntRegs:$src1, s11_3Imm:$src2, DoubleRegs:$src3), "memd($src1+#$src2) = $src3", - [(store DoubleRegs:$src3, - (add IntRegs:$src1, s11_3ImmPred:$src2))]>; + [(store (i64 DoubleRegs:$src3), + (add (i32 IntRegs:$src1), s11_3ImmPred:$src2))]>; -let mayStore = 1, neverHasSideEffects = 1 in -def STrid_GP : STInst<(outs), +let neverHasSideEffects = 1 in +def STrid_GP : STInst2<(outs), (ins globaladdress:$global, u16Imm:$offset, DoubleRegs:$src), "memd(#$global+$offset) = $src", - []>; + []>, + Requires<[NoV4T]>; + +let neverHasSideEffects = 1 in +def STd_GP : STInst2<(outs), + (ins globaladdress:$global, DoubleRegs:$src), + "memd(#$global) = $src", + []>, + Requires<[NoV4T]>; let hasCtrlDep = 1, isPredicable = 1 in def POST_STdri : STInstPI<(outs IntRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2, s4Imm:$offset), "memd($src2++#$offset) = $src1", [(set IntRegs:$dst, - (post_store DoubleRegs:$src1, IntRegs:$src2, s4_3ImmPred:$offset))], + (post_store (i64 DoubleRegs:$src1), (i32 IntRegs:$src2), + s4_3ImmPred:$offset))], "$src2 = $dst">; // Store doubleword conditionally. 
// if ([!]Pv) memd(Rs+#u6:3)=Rtt // if (Pv) memd(Rs+#u6:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_cPt : STInst<(outs), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_cPt : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, DoubleRegs:$src2), "if ($src1) memd($addr) = $src2", []>; // if (!Pv) memd(Rs+#u6:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_cNotPt : STInst<(outs), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_cNotPt : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, DoubleRegs:$src2), "if (!$src1) memd($addr) = $src2", []>; // if (Pv) memd(Rs+#u6:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_indexed_cPt : STInst<(outs), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_indexed_cPt : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3, DoubleRegs:$src4), "if ($src1) memd($src2+#$src3) = $src4", []>; // if (!Pv) memd(Rs+#u6:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_indexed_cNotPt : STInst<(outs), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_indexed_cNotPt : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3, DoubleRegs:$src4), "if (!$src1) memd($src2+#$src3) = $src4", @@ -1789,8 +1853,9 @@ def STrid_indexed_cNotPt : STInst<(outs), // if ([!]Pv) memd(Rx++#s4:3)=Rtt // if (Pv) memd(Rx++#s4:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def POST_STdri_cPt : STInstPI<(outs IntRegs:$dst), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def POST_STdri_cPt : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, s4_3Imm:$offset), "if ($src1) memd($src3++#$offset) = $src2", @@ -1798,9 +1863,9 @@ def POST_STdri_cPt : STInstPI<(outs IntRegs:$dst), "$src3 = $dst">; // if (!Pv) memd(Rx++#s4:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1, +let AddedComplexity = 10, neverHasSideEffects = 1, isPredicated = 1 in -def POST_STdri_cNotPt : STInstPI<(outs IntRegs:$dst), +def POST_STdri_cNotPt : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, s4_3Imm:$offset), "if (!$src1) memd($src3++#$offset) = $src2", @@ -1814,27 +1879,30 @@ let isPredicable = 1 in def STrib : STInst<(outs), (ins MEMri:$addr, IntRegs:$src1), "memb($addr) = $src1", - [(truncstorei8 IntRegs:$src1, ADDRriS11_0:$addr)]>; + [(truncstorei8 (i32 IntRegs:$src1), ADDRriS11_0:$addr)]>; let AddedComplexity = 10, isPredicable = 1 in def STrib_indexed : STInst<(outs), (ins IntRegs:$src1, s11_0Imm:$src2, IntRegs:$src3), "memb($src1+#$src2) = $src3", - [(truncstorei8 IntRegs:$src3, (add IntRegs:$src1, - s11_0ImmPred:$src2))]>; + [(truncstorei8 (i32 IntRegs:$src3), (add (i32 IntRegs:$src1), + s11_0ImmPred:$src2))]>; // memb(gp+#u16:0)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_GP : STInst<(outs), +let neverHasSideEffects = 1 in +def STrib_GP : STInst2<(outs), (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), "memb(#$global+$offset) = $src", - []>; + []>, + Requires<[NoV4T]>; -let mayStore = 1, neverHasSideEffects = 1 in -def STb_GP : STInst<(outs), +// memb(#global)=Rt +let neverHasSideEffects = 1 in +def STb_GP : STInst2<(outs), (ins globaladdress:$global, IntRegs:$src), "memb(#$global) = $src", - []>; + 
Requires<[NoV4T]>; // memb(Rx++#s4:0)=Rt let hasCtrlDep = 1, isPredicable = 1 in @@ -1843,51 +1911,51 @@ def POST_STbri : STInstPI<(outs IntRegs:$dst), (ins IntRegs:$src1, s4Imm:$offset), "memb($src2++#$offset) = $src1", [(set IntRegs:$dst, - (post_truncsti8 IntRegs:$src1, IntRegs:$src2, + (post_truncsti8 (i32 IntRegs:$src1), (i32 IntRegs:$src2), s4_0ImmPred:$offset))], "$src2 = $dst">; // Store byte conditionally. // if ([!]Pv) memb(Rs+#u6:0)=Rt // if (Pv) memb(Rs+#u6:0)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_cPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STrib_cPt : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1) memb($addr) = $src2", []>; // if (!Pv) memb(Rs+#u6:0)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_cNotPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STrib_cNotPt : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1) memb($addr) = $src2", []>; // if (Pv) memb(Rs+#u6:0)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_indexed_cPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STrib_indexed_cPt : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), "if ($src1) memb($src2+#$src3) = $src4", []>; // if (!Pv) memb(Rs+#u6:0)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_indexed_cNotPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STrib_indexed_cNotPt : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), "if (!$src1) memb($src2+#$src3) = $src4", []>; // if ([!]Pv) memb(Rx++#s4:0)=Rt // if (Pv) memb(Rx++#s4:0)=Rt -let mayStore = 1, hasCtrlDep = 1, isPredicated = 1 in -def POST_STbri_cPt : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, isPredicated = 1 in +def POST_STbri_cPt : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), "if ($src1) memb($src3++#$offset) = $src2", [],"$src3 = $dst">; // if (!Pv) memb(Rx++#s4:0)=Rt -let mayStore = 1, hasCtrlDep = 1, isPredicated = 1 in -def POST_STbri_cNotPt : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, isPredicated = 1 in +def POST_STbri_cNotPt : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), "if (!$src1) memb($src3++#$offset) = $src2", [],"$src3 = $dst">; @@ -1899,27 +1967,29 @@ let isPredicable = 1 in def STrih : STInst<(outs), (ins MEMri:$addr, IntRegs:$src1), "memh($addr) = $src1", - [(truncstorei16 IntRegs:$src1, ADDRriS11_1:$addr)]>; + [(truncstorei16 (i32 IntRegs:$src1), ADDRriS11_1:$addr)]>; let AddedComplexity = 10, isPredicable = 1 in def STrih_indexed : STInst<(outs), (ins IntRegs:$src1, s11_1Imm:$src2, IntRegs:$src3), "memh($src1+#$src2) = $src3", - [(truncstorei16 IntRegs:$src3, (add IntRegs:$src1, - s11_1ImmPred:$src2))]>; + [(truncstorei16 (i32 IntRegs:$src3), (add (i32 IntRegs:$src1), + s11_1ImmPred:$src2))]>; -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_GP : STInst<(outs), +let neverHasSideEffects = 1 in +def STrih_GP : STInst2<(outs), (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), "memh(#$global+$offset) = $src", - []>; + []>, + Requires<[NoV4T]>; -let mayStore = 1, neverHasSideEffects = 1 in -def STh_GP : STInst<(outs), +let neverHasSideEffects = 1 in +def STh_GP : STInst2<(outs), (ins globaladdress:$global, IntRegs:$src), "memh(#$global) = $src", - []>; + []>, + Requires<[NoV4T]>; // memh(Rx++#s4:1)=Rt.H // 
memh(Rx++#s4:1)=Rt @@ -1928,51 +1998,51 @@ def POST_SThri : STInstPI<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, s4Imm:$offset), "memh($src2++#$offset) = $src1", [(set IntRegs:$dst, - (post_truncsti16 IntRegs:$src1, IntRegs:$src2, + (post_truncsti16 (i32 IntRegs:$src1), (i32 IntRegs:$src2), s4_1ImmPred:$offset))], "$src2 = $dst">; // Store halfword conditionally. // if ([!]Pv) memh(Rs+#u6:1)=Rt // if (Pv) memh(Rs+#u6:1)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_cPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STrih_cPt : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1) memh($addr) = $src2", []>; // if (!Pv) memh(Rs+#u6:1)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_cNotPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STrih_cNotPt : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1) memh($addr) = $src2", []>; // if (Pv) memh(Rs+#u6:1)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_indexed_cPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STrih_indexed_cPt : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), "if ($src1) memh($src2+#$src3) = $src4", []>; // if (!Pv) memh(Rs+#u6:1)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_indexed_cNotPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STrih_indexed_cNotPt : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), "if (!$src1) memh($src2+#$src3) = $src4", []>; // if ([!]Pv) memh(Rx++#s4:1)=Rt // if (Pv) memh(Rx++#s4:1)=Rt -let mayStore = 1, hasCtrlDep = 1, isPredicated = 1 in -def POST_SThri_cPt : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, isPredicated = 1 in +def POST_SThri_cPt : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), "if ($src1) memh($src3++#$offset) = $src2", [],"$src3 = $dst">; // if (!Pv) memh(Rx++#s4:1)=Rt -let mayStore = 1, hasCtrlDep = 1, isPredicated = 1 in -def POST_SThri_cNotPt : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, isPredicated = 1 in +def POST_SThri_cNotPt : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), "if (!$src1) memh($src3++#$offset) = $src2", [],"$src3 = $dst">; @@ -1980,8 +2050,8 @@ def POST_SThri_cNotPt : STInstPI<(outs IntRegs:$dst), // Store word. // Store predicate. 
-let Defs = [R10,R11] in -def STriw_pred : STInst<(outs), +let Defs = [R10,R11,D5], neverHasSideEffects = 1 in +def STriw_pred : STInst2<(outs), (ins MEMri:$addr, PredRegs:$src1), "Error; should not emit", []>; @@ -1991,69 +2061,79 @@ let isPredicable = 1 in def STriw : STInst<(outs), (ins MEMri:$addr, IntRegs:$src1), "memw($addr) = $src1", - [(store IntRegs:$src1, ADDRriS11_2:$addr)]>; + [(store (i32 IntRegs:$src1), ADDRriS11_2:$addr)]>; let AddedComplexity = 10, isPredicable = 1 in def STriw_indexed : STInst<(outs), (ins IntRegs:$src1, s11_2Imm:$src2, IntRegs:$src3), "memw($src1+#$src2) = $src3", - [(store IntRegs:$src3, (add IntRegs:$src1, s11_2ImmPred:$src2))]>; + [(store (i32 IntRegs:$src3), + (add (i32 IntRegs:$src1), s11_2ImmPred:$src2))]>; -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_GP : STInst<(outs), +let neverHasSideEffects = 1 in +def STriw_GP : STInst2<(outs), (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), "memw(#$global+$offset) = $src", - []>; + []>, + Requires<[NoV4T]>; + +let neverHasSideEffects = 1 in +def STw_GP : STInst2<(outs), + (ins globaladdress:$global, IntRegs:$src), + "memw(#$global) = $src", + []>, + Requires<[NoV4T]>; let hasCtrlDep = 1, isPredicable = 1 in def POST_STwri : STInstPI<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, s4Imm:$offset), "memw($src2++#$offset) = $src1", [(set IntRegs:$dst, - (post_store IntRegs:$src1, IntRegs:$src2, s4_2ImmPred:$offset))], + (post_store (i32 IntRegs:$src1), (i32 IntRegs:$src2), + s4_2ImmPred:$offset))], "$src2 = $dst">; // Store word conditionally. // if ([!]Pv) memw(Rs+#u6:2)=Rt // if (Pv) memw(Rs+#u6:2)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_cPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STriw_cPt : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1) memw($addr) = $src2", []>; // if (!Pv) memw(Rs+#u6:2)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_cNotPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STriw_cNotPt : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1) memw($addr) = $src2", []>; // if (Pv) memw(Rs+#u6:2)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_indexed_cPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STriw_indexed_cPt : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), "if ($src1) memw($src2+#$src3) = $src4", []>; // if (!Pv) memw(Rs+#u6:2)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_indexed_cNotPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STriw_indexed_cNotPt : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), "if (!$src1) memw($src2+#$src3) = $src4", []>; // if ([!]Pv) memw(Rx++#s4:2)=Rt // if (Pv) memw(Rx++#s4:2)=Rt -let mayStore = 1, hasCtrlDep = 1, isPredicated = 1 in -def POST_STwri_cPt : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, isPredicated = 1 in +def POST_STwri_cPt : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), "if ($src1) memw($src3++#$offset) = $src2", [],"$src3 = $dst">; // if (!Pv) memw(Rx++#s4:2)=Rt -let mayStore = 1, hasCtrlDep = 1, isPredicated = 1 in -def POST_STwri_cNotPt : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, isPredicated = 1 in +def POST_STwri_cNotPt : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), "if (!$src1) 
memw($src3++#$offset) = $src2", [],"$src3 = $dst">; @@ -2062,7 +2142,7 @@ def POST_STwri_cNotPt : STInstPI<(outs IntRegs:$dst), // Allocate stack frame. let Defs = [R29, R30], Uses = [R31, R30], neverHasSideEffects = 1 in { - def ALLOCFRAME : STInst<(outs), + def ALLOCFRAME : STInst2<(outs), (ins i32imm:$amt), "allocframe(#$amt)", []>; @@ -2077,13 +2157,13 @@ let Defs = [R29, R30], Uses = [R31, R30], neverHasSideEffects = 1 in { // Logical NOT. def NOT_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1), "$dst = not($src1)", - [(set DoubleRegs:$dst, (not DoubleRegs:$src1))]>; + [(set (i64 DoubleRegs:$dst), (not (i64 DoubleRegs:$src1)))]>; // Sign extend word to doubleword. def SXTW : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1), "$dst = sxtw($src1)", - [(set DoubleRegs:$dst, (sext IntRegs:$src1))]>; + [(set (i64 DoubleRegs:$dst), (sext (i32 IntRegs:$src1)))]>; //===----------------------------------------------------------------------===// // STYPE/ALU - //===----------------------------------------------------------------------===// @@ -2091,37 +2171,58 @@ def SXTW : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1), //===----------------------------------------------------------------------===// // STYPE/BIT + //===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// STYPE/BIT - -//===----------------------------------------------------------------------===// +// clrbit. +def CLRBIT : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), + "$dst = clrbit($src1, #$src2)", + [(set (i32 IntRegs:$dst), (and (i32 IntRegs:$src1), + (not + (shl 1, u5ImmPred:$src2))))]>; +def CLRBIT_31 : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), + "$dst = clrbit($src1, #$src2)", + []>; -//===----------------------------------------------------------------------===// -// STYPE/COMPLEX + -//===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// STYPE/COMPLEX - -//===----------------------------------------------------------------------===// +// Map from r0 = and(r1, 2147483647) to r0 = clrbit(r1, #31). +def : Pat <(and (i32 IntRegs:$src1), 2147483647), + (CLRBIT_31 (i32 IntRegs:$src1), 31)>; -//===----------------------------------------------------------------------===// -// STYPE/PERM + -//===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// STYPE/PERM - -//===----------------------------------------------------------------------===// +// setbit. +def SETBIT : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), + "$dst = setbit($src1, #$src2)", + [(set (i32 IntRegs:$dst), (or (i32 IntRegs:$src1), + (shl 1, u5ImmPred:$src2)))]>; + +// Map from r0 = or(r1, -2147483648) to r0 = setbit(r1, #31). +def SETBIT_31 : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), + "$dst = setbit($src1, #$src2)", + []>; + +def : Pat <(or (i32 IntRegs:$src1), -2147483648), + (SETBIT_31 (i32 IntRegs:$src1), 31)>; + +// togglebit. +def TOGBIT : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), + "$dst = togglebit($src1, #$src2)", + [(set (i32 IntRegs:$dst), (xor (i32 IntRegs:$src1), + (shl 1, u5ImmPred:$src2)))]>; + +// Map from r0 = xor(r1, -2147483648) to r0 = togglebit(r1, #31). 
+def TOGBIT_31 : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), + "$dst = togglebit($src1, #$src2)", + []>; + +def : Pat <(xor (i32 IntRegs:$src1), -2147483648), + (TOGBIT_31 (i32 IntRegs:$src1), 31)>; -//===----------------------------------------------------------------------===// -// STYPE/PRED + -//===----------------------------------------------------------------------===// // Predicate transfer. let neverHasSideEffects = 1 in def TFR_RsPd : SInst<(outs IntRegs:$dst), (ins PredRegs:$src1), - "$dst = $src1 // Should almost never emit this", + "$dst = $src1 /* Should almost never emit this. */", []>; def TFR_PdRs : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1), - "$dst = $src1 // Should almost never emit!", - [(set PredRegs:$dst, (trunc IntRegs:$src1))]>; + "$dst = $src1 /* Should almost never emit this. */", + [(set (i1 PredRegs:$dst), (trunc (i32 IntRegs:$src1)))]>; //===----------------------------------------------------------------------===// // STYPE/PRED - //===----------------------------------------------------------------------===// @@ -2132,75 +2233,85 @@ def TFR_PdRs : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1), // Shift by immediate. def ASR_ri : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), "$dst = asr($src1, #$src2)", - [(set IntRegs:$dst, (sra IntRegs:$src1, u5ImmPred:$src2))]>; + [(set (i32 IntRegs:$dst), (sra (i32 IntRegs:$src1), + u5ImmPred:$src2))]>; def ASRd_ri : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2), "$dst = asr($src1, #$src2)", - [(set DoubleRegs:$dst, (sra DoubleRegs:$src1, u6ImmPred:$src2))]>; + [(set (i64 DoubleRegs:$dst), (sra (i64 DoubleRegs:$src1), + u6ImmPred:$src2))]>; def ASL : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), "$dst = asl($src1, #$src2)", - [(set IntRegs:$dst, (shl IntRegs:$src1, u5ImmPred:$src2))]>; + [(set (i32 IntRegs:$dst), (shl (i32 IntRegs:$src1), + u5ImmPred:$src2))]>; + +def ASLd_ri : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2), + "$dst = asl($src1, #$src2)", + [(set (i64 DoubleRegs:$dst), (shl (i64 DoubleRegs:$src1), + u6ImmPred:$src2))]>; def LSR_ri : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), "$dst = lsr($src1, #$src2)", - [(set IntRegs:$dst, (srl IntRegs:$src1, u5ImmPred:$src2))]>; + [(set (i32 IntRegs:$dst), (srl (i32 IntRegs:$src1), + u5ImmPred:$src2))]>; def LSRd_ri : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2), "$dst = lsr($src1, #$src2)", - [(set DoubleRegs:$dst, (srl DoubleRegs:$src1, u6ImmPred:$src2))]>; - -def LSRd_ri_acc : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, - DoubleRegs:$src2, - u6Imm:$src3), - "$dst += lsr($src2, #$src3)", - [(set DoubleRegs:$dst, (add DoubleRegs:$src1, - (srl DoubleRegs:$src2, - u6ImmPred:$src3)))], - "$src1 = $dst">; - -// Shift by immediate and accumulate. -def ASR_rr_acc : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, - IntRegs:$src2, - IntRegs:$src3), - "$dst += asr($src2, $src3)", - [], "$src1 = $dst">; + [(set (i64 DoubleRegs:$dst), (srl (i64 DoubleRegs:$src1), + u6ImmPred:$src2))]>; // Shift by immediate and add. +let AddedComplexity = 100 in def ADDASL : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u3Imm:$src3), "$dst = addasl($src1, $src2, #$src3)", - [(set IntRegs:$dst, (add IntRegs:$src1, - (shl IntRegs:$src2, - u3ImmPred:$src3)))]>; + [(set (i32 IntRegs:$dst), (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u3ImmPred:$src3)))]>; // Shift by register. 
def ASL_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = asl($src1, $src2)", - [(set IntRegs:$dst, (shl IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), (shl (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; def ASR_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = asr($src1, $src2)", - [(set IntRegs:$dst, (sra IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), (sra (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; +def LSL_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + "$dst = lsl($src1, $src2)", + [(set (i32 IntRegs:$dst), (shl (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; def LSR_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = lsr($src1, $src2)", - [(set IntRegs:$dst, (srl IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), (srl (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; + +def ASLd : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2), + "$dst = asl($src1, $src2)", + [(set (i64 DoubleRegs:$dst), (shl (i64 DoubleRegs:$src1), + (i32 IntRegs:$src2)))]>; def LSLd : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2), "$dst = lsl($src1, $src2)", - [(set DoubleRegs:$dst, (shl DoubleRegs:$src1, IntRegs:$src2))]>; + [(set (i64 DoubleRegs:$dst), (shl (i64 DoubleRegs:$src1), + (i32 IntRegs:$src2)))]>; def ASRd_rr : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2), "$dst = asr($src1, $src2)", - [(set DoubleRegs:$dst, (sra DoubleRegs:$src1, IntRegs:$src2))]>; + [(set (i64 DoubleRegs:$dst), (sra (i64 DoubleRegs:$src1), + (i32 IntRegs:$src2)))]>; def LSRd_rr : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2), "$dst = lsr($src1, $src2)", - [(set DoubleRegs:$dst, (srl DoubleRegs:$src1, IntRegs:$src2))]>; + [(set (i64 DoubleRegs:$dst), (srl (i64 DoubleRegs:$src1), + (i32 IntRegs:$src2)))]>; //===----------------------------------------------------------------------===// // STYPE/SHIFT - @@ -2231,8 +2342,8 @@ def SDHexagonBARRIER: SDTypeProfile<0, 0, []>; def HexagonBARRIER: SDNode<"HexagonISD::BARRIER", SDHexagonBARRIER, [SDNPHasChain]>; -let hasSideEffects = 1 in -def BARRIER : STInst<(outs), (ins), +let hasSideEffects = 1, isHexagonSolo = 1 in +def BARRIER : SYSInst<(outs), (ins), "barrier", [(HexagonBARRIER)]>; @@ -2244,47 +2355,50 @@ def BARRIER : STInst<(outs), (ins), let isReMaterializable = 1 in def TFRI64 : ALU64_rr<(outs DoubleRegs:$dst), (ins s8Imm64:$src1), "$dst = #$src1", - [(set DoubleRegs:$dst, s8Imm64Pred:$src1)]>; + [(set (i64 DoubleRegs:$dst), s8Imm64Pred:$src1)]>; // Pseudo instruction to encode a set of conditional transfers. // This instruction is used instead of a mux and trades-off codesize // for performance. We conduct this transformation optimistically in // the hope that these instructions get promoted to dot-new transfers. 
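// An illustrative sketch of the promotion this hopes for (hand-written, not compiler output): a selected
//   r0 = mux(p0, r1, r2)
// can be emitted as the conditional-transfer pair
//   if (p0) r0 = r1
//   if (!p0) r0 = r2
// which the packetizer may then fold into the packet that produces p0, using dot-new predicates:
//   { p0 = cmp.eq(r3, #0)
//     if (p0.new) r0 = r1
//     if (!p0.new) r0 = r2 }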
-let AddedComplexity = 100 in +let AddedComplexity = 100, isPredicated = 1 in def TFR_condset_rr : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "Error; should not emit", - [(set IntRegs:$dst, (select PredRegs:$src1, IntRegs:$src2, - IntRegs:$src3))]>; - -let AddedComplexity = 100 in + [(set (i32 IntRegs:$dst), + (i32 (select (i1 PredRegs:$src1), + (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))]>; +let AddedComplexity = 100, isPredicated = 1 in def TFR_condset_ri : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, s12Imm:$src3), "Error; should not emit", - [(set IntRegs:$dst, - (select PredRegs:$src1, IntRegs:$src2, s12ImmPred:$src3))]>; + [(set (i32 IntRegs:$dst), + (i32 (select (i1 PredRegs:$src1), (i32 IntRegs:$src2), + s12ImmPred:$src3)))]>; -let AddedComplexity = 100 in +let AddedComplexity = 100, isPredicated = 1 in def TFR_condset_ir : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, s12Imm:$src2, IntRegs:$src3), "Error; should not emit", - [(set IntRegs:$dst, - (select PredRegs:$src1, s12ImmPred:$src2, IntRegs:$src3))]>; + [(set (i32 IntRegs:$dst), + (i32 (select (i1 PredRegs:$src1), s12ImmPred:$src2, + (i32 IntRegs:$src3))))]>; -let AddedComplexity = 100 in +let AddedComplexity = 100, isPredicated = 1 in def TFR_condset_ii : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, s12Imm:$src2, s12Imm:$src3), "Error; should not emit", - [(set IntRegs:$dst, (select PredRegs:$src1, - s12ImmPred:$src2, - s12ImmPred:$src3))]>; + [(set (i32 IntRegs:$dst), + (i32 (select (i1 PredRegs:$src1), s12ImmPred:$src2, + s12ImmPred:$src3)))]>; // Generate frameindex addresses. let isReMaterializable = 1 in def TFR_FI : ALU32_ri<(outs IntRegs:$dst), (ins FrameIndex:$src1), "$dst = add($src1)", - [(set IntRegs:$dst, ADDRri:$src1)]>; + [(set (i32 IntRegs:$dst), ADDRri:$src1)]>; // // CR - Type. @@ -2303,69 +2417,116 @@ def LOOP0_r : CRInst<(outs), (ins brtarget:$offset, IntRegs:$src2), let isBranch = 1, isTerminator = 1, neverHasSideEffects = 1, Defs = [PC, LC0], Uses = [SA0, LC0] in { -def ENDLOOP0 : CRInst<(outs), (ins brtarget:$offset), +def ENDLOOP0 : Marker<(outs), (ins brtarget:$offset), ":endloop0", []>; } // Support for generating global address. // Taken from X86InstrInfo.td. 
-def SDTHexagonCONST32 : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, - SDTCisPtrTy<0>]>; +def SDTHexagonCONST32 : SDTypeProfile<1, 1, [ + SDTCisVT<0, i32>, + SDTCisVT<1, i32>, + SDTCisPtrTy<0>]>; def HexagonCONST32 : SDNode<"HexagonISD::CONST32", SDTHexagonCONST32>; def HexagonCONST32_GP : SDNode<"HexagonISD::CONST32_GP", SDTHexagonCONST32>; +// HI/LO Instructions +let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in +def LO : ALU32_ri<(outs IntRegs:$dst), (ins globaladdress:$global), + "$dst.l = #LO($global)", + []>; + +let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in +def HI : ALU32_ri<(outs IntRegs:$dst), (ins globaladdress:$global), + "$dst.h = #HI($global)", + []>; + +let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in +def LOi : ALU32_ri<(outs IntRegs:$dst), (ins i32imm:$imm_value), + "$dst.l = #LO($imm_value)", + []>; + + +let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in +def HIi : ALU32_ri<(outs IntRegs:$dst), (ins i32imm:$imm_value), + "$dst.h = #HI($imm_value)", + []>; + +let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in +def LO_jt : ALU32_ri<(outs IntRegs:$dst), (ins jumptablebase:$jt), + "$dst.l = #LO($jt)", + []>; + +let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in +def HI_jt : ALU32_ri<(outs IntRegs:$dst), (ins jumptablebase:$jt), + "$dst.h = #HI($jt)", + []>; + + +let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in +def LO_label : ALU32_ri<(outs IntRegs:$dst), (ins bblabel:$label), + "$dst.l = #LO($label)", + []>; + +let isReMaterializable = 1, isMoveImm = 1 , neverHasSideEffects = 1 in +def HI_label : ALU32_ri<(outs IntRegs:$dst), (ins bblabel:$label), + "$dst.h = #HI($label)", + []>; + // This pattern is incorrect. When we add small data, we should change // this pattern to use memw(#foo). +// This is for sdata. let isMoveImm = 1 in def CONST32 : LDInst<(outs IntRegs:$dst), (ins globaladdress:$global), "$dst = CONST32(#$global)", - [(set IntRegs:$dst, - (load (HexagonCONST32 tglobaltlsaddr:$global)))]>; + [(set (i32 IntRegs:$dst), + (load (HexagonCONST32 tglobaltlsaddr:$global)))]>; +// This is for non-sdata. 
let isReMaterializable = 1, isMoveImm = 1 in -def CONST32_set : LDInst<(outs IntRegs:$dst), (ins globaladdress:$global), +def CONST32_set : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global), "$dst = CONST32(#$global)", - [(set IntRegs:$dst, - (HexagonCONST32 tglobaladdr:$global))]>; + [(set (i32 IntRegs:$dst), + (HexagonCONST32 tglobaladdr:$global))]>; let isReMaterializable = 1, isMoveImm = 1 in -def CONST32_set_jt : LDInst<(outs IntRegs:$dst), (ins jumptablebase:$jt), +def CONST32_set_jt : LDInst2<(outs IntRegs:$dst), (ins jumptablebase:$jt), "$dst = CONST32(#$jt)", - [(set IntRegs:$dst, - (HexagonCONST32 tjumptable:$jt))]>; + [(set (i32 IntRegs:$dst), + (HexagonCONST32 tjumptable:$jt))]>; let isReMaterializable = 1, isMoveImm = 1 in -def CONST32GP_set : LDInst<(outs IntRegs:$dst), (ins globaladdress:$global), +def CONST32GP_set : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global), "$dst = CONST32(#$global)", - [(set IntRegs:$dst, - (HexagonCONST32_GP tglobaladdr:$global))]>; + [(set (i32 IntRegs:$dst), + (HexagonCONST32_GP tglobaladdr:$global))]>; let isReMaterializable = 1, isMoveImm = 1 in -def CONST32_Int_Real : LDInst<(outs IntRegs:$dst), (ins i32imm:$global), +def CONST32_Int_Real : LDInst2<(outs IntRegs:$dst), (ins i32imm:$global), "$dst = CONST32(#$global)", - [(set IntRegs:$dst, imm:$global) ]>; + [(set (i32 IntRegs:$dst), imm:$global) ]>; let isReMaterializable = 1, isMoveImm = 1 in -def CONST32_Label : LDInst<(outs IntRegs:$dst), (ins bblabel:$label), +def CONST32_Label : LDInst2<(outs IntRegs:$dst), (ins bblabel:$label), "$dst = CONST32($label)", - [(set IntRegs:$dst, (HexagonCONST32 bbl:$label))]>; + [(set (i32 IntRegs:$dst), (HexagonCONST32 bbl:$label))]>; let isReMaterializable = 1, isMoveImm = 1 in -def CONST64_Int_Real : LDInst<(outs DoubleRegs:$dst), (ins i64imm:$global), +def CONST64_Int_Real : LDInst2<(outs DoubleRegs:$dst), (ins i64imm:$global), "$dst = CONST64(#$global)", - [(set DoubleRegs:$dst, imm:$global) ]>; + [(set (i64 DoubleRegs:$dst), imm:$global) ]>; def TFR_PdFalse : SInst<(outs PredRegs:$dst), (ins), "$dst = xor($dst, $dst)", - [(set PredRegs:$dst, 0)]>; + [(set (i1 PredRegs:$dst), 0)]>; def MPY_trsext : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), - "$dst = mpy($src1, $src2)", - [(set IntRegs:$dst, - (trunc (i64 (srl (i64 (mul (i64 (sext IntRegs:$src1)), - (i64 (sext IntRegs:$src2)))), - (i32 32)))))]>; + "$dst = mpy($src1, $src2)", + [(set (i32 IntRegs:$dst), + (trunc (i64 (srl (i64 (mul (i64 (sext (i32 IntRegs:$src1))), + (i64 (sext (i32 IntRegs:$src2))))), + (i32 32)))))]>; // Pseudo instructions. 
def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; @@ -2405,7 +2566,7 @@ let Defs = [R29, R30, R31], Uses = [R29] in { let isCall = 1, neverHasSideEffects = 1, Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in { - def CALL : JInst<(outs), (ins calltarget:$dst, variable_ops), + def CALL : JInst<(outs), (ins calltarget:$dst), "call $dst", []>; } @@ -2413,7 +2574,7 @@ let isCall = 1, neverHasSideEffects = 1, let isCall = 1, neverHasSideEffects = 1, Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in { - def CALLR : JRInst<(outs), (ins IntRegs:$dst, variable_ops), + def CALLR : JRInst<(outs), (ins IntRegs:$dst), "callr $dst", []>; } @@ -2422,25 +2583,25 @@ let isCall = 1, neverHasSideEffects = 1, let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1, Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in { - def TCRETURNtg : JInst<(outs), (ins calltarget:$dst, variable_ops), + def TCRETURNtg : JInst<(outs), (ins calltarget:$dst), "jump $dst // TAILCALL", []>; } let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1, Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in { - def TCRETURNtext : JInst<(outs), (ins calltarget:$dst, variable_ops), + def TCRETURNtext : JInst<(outs), (ins calltarget:$dst), "jump $dst // TAILCALL", []>; } let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1, Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in { - def TCRETURNR : JInst<(outs), (ins IntRegs:$dst, variable_ops), + def TCRETURNR : JInst<(outs), (ins IntRegs:$dst), "jumpr $dst // TAILCALL", []>; } // Map call instruction. 
-def : Pat<(call IntRegs:$dst), - (CALLR IntRegs:$dst)>, Requires<[HasV2TOnly]>; +def : Pat<(call (i32 IntRegs:$dst)), + (CALLR (i32 IntRegs:$dst))>, Requires<[HasV2TOnly]>; def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>, Requires<[HasV2TOnly]>; def : Pat<(call texternalsym:$dst), @@ -2450,309 +2611,516 @@ def : Pat<(HexagonTCRet tglobaladdr:$dst), (TCRETURNtg tglobaladdr:$dst)>; def : Pat<(HexagonTCRet texternalsym:$dst), (TCRETURNtext texternalsym:$dst)>; -def : Pat<(HexagonTCRet IntRegs:$dst), - (TCRETURNR IntRegs:$dst)>; +def : Pat<(HexagonTCRet (i32 IntRegs:$dst)), + (TCRETURNR (i32 IntRegs:$dst))>; + +// Atomic load and store support +// 8 bit atomic load +def : Pat<(atomic_load_8 (HexagonCONST32_GP tglobaladdr:$global)), + (i32 (LDub_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_load_8 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (i32 (LDriub_GP tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_load_8 ADDRriS11_0:$src1), + (i32 (LDriub ADDRriS11_0:$src1))>; + +def : Pat<(atomic_load_8 (add (i32 IntRegs:$src1), s11_0ImmPred:$offset)), + (i32 (LDriub_indexed (i32 IntRegs:$src1), s11_0ImmPred:$offset))>; + + + +// 16 bit atomic load +def : Pat<(atomic_load_16 (HexagonCONST32_GP tglobaladdr:$global)), + (i32 (LDuh_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_load_16 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (i32 (LDriuh_GP tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_load_16 ADDRriS11_1:$src1), + (i32 (LDriuh ADDRriS11_1:$src1))>; + +def : Pat<(atomic_load_16 (add (i32 IntRegs:$src1), s11_1ImmPred:$offset)), + (i32 (LDriuh_indexed (i32 IntRegs:$src1), s11_1ImmPred:$offset))>; + + + +// 32 bit atomic load +def : Pat<(atomic_load_32 (HexagonCONST32_GP tglobaladdr:$global)), + (i32 (LDw_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_load_32 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (i32 (LDriw_GP tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_load_32 ADDRriS11_2:$src1), + (i32 (LDriw ADDRriS11_2:$src1))>; + +def : Pat<(atomic_load_32 (add (i32 IntRegs:$src1), s11_2ImmPred:$offset)), + (i32 (LDriw_indexed (i32 IntRegs:$src1), s11_2ImmPred:$offset))>; + + +// 64 bit atomic load +def : Pat<(atomic_load_64 (HexagonCONST32_GP tglobaladdr:$global)), + (i64 (LDd_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_load_64 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (i64 (LDrid_GP tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_load_64 ADDRriS11_3:$src1), + (i64 (LDrid ADDRriS11_3:$src1))>; + +def : Pat<(atomic_load_64 (add (i32 IntRegs:$src1), s11_3ImmPred:$offset)), + (i64 (LDrid_indexed (i32 IntRegs:$src1), s11_3ImmPred:$offset))>; -// Map from r0 = and(r1, 65535) to r0 = zxth(r1). 
-def : Pat <(and IntRegs:$src1, 65535), - (ZXTH IntRegs:$src1)>; + +// 64 bit atomic store +def : Pat<(atomic_store_64 (HexagonCONST32_GP tglobaladdr:$global), + (i64 DoubleRegs:$src1)), + (STd_GP tglobaladdr:$global, (i64 DoubleRegs:$src1))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_store_64 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset), + (i64 DoubleRegs:$src1)), + (STrid_GP tglobaladdr:$global, u16ImmPred:$offset, + (i64 DoubleRegs:$src1))>, Requires<[NoV4T]>; + +// 8 bit atomic store +def : Pat<(atomic_store_8 (HexagonCONST32_GP tglobaladdr:$global), + (i32 IntRegs:$src1)), + (STb_GP tglobaladdr:$global, (i32 IntRegs:$src1))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_store_8 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset), + (i32 IntRegs:$src1)), + (STrib_GP tglobaladdr:$global, u16ImmPred:$offset, + (i32 IntRegs:$src1))>, Requires<[NoV4T]>; + +def : Pat<(atomic_store_8 ADDRriS11_0:$src2, (i32 IntRegs:$src1)), + (STrib ADDRriS11_0:$src2, (i32 IntRegs:$src1))>; + +def : Pat<(atomic_store_8 (add (i32 IntRegs:$src2), s11_0ImmPred:$offset), + (i32 IntRegs:$src1)), + (STrib_indexed (i32 IntRegs:$src2), s11_0ImmPred:$offset, + (i32 IntRegs:$src1))>; + + +// 16 bit atomic store +def : Pat<(atomic_store_16 (HexagonCONST32_GP tglobaladdr:$global), + (i32 IntRegs:$src1)), + (STh_GP tglobaladdr:$global, (i32 IntRegs:$src1))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_store_16 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset), + (i32 IntRegs:$src1)), + (STrih_GP tglobaladdr:$global, u16ImmPred:$offset, + (i32 IntRegs:$src1))>, Requires<[NoV4T]>; + +def : Pat<(atomic_store_16 ADDRriS11_1:$src2, (i32 IntRegs:$src1)), + (STrih ADDRriS11_1:$src2, (i32 IntRegs:$src1))>; + +def : Pat<(atomic_store_16 (add (i32 IntRegs:$src2), s11_1ImmPred:$offset), + (i32 IntRegs:$src1)), + (STrih_indexed (i32 IntRegs:$src2), s11_1ImmPred:$offset, + (i32 IntRegs:$src1))>; + + +// 32 bit atomic store +def : Pat<(atomic_store_32 (HexagonCONST32_GP tglobaladdr:$global), + (i32 IntRegs:$src1)), + (STw_GP tglobaladdr:$global, (i32 IntRegs:$src1))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_store_32 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset), + (i32 IntRegs:$src1)), + (STriw_GP tglobaladdr:$global, u16ImmPred:$offset, + (i32 IntRegs:$src1))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_store_32 ADDRriS11_2:$src2, (i32 IntRegs:$src1)), + (STriw ADDRriS11_2:$src2, (i32 IntRegs:$src1))>; + +def : Pat<(atomic_store_32 (add (i32 IntRegs:$src2), s11_2ImmPred:$offset), + (i32 IntRegs:$src1)), + (STriw_indexed (i32 IntRegs:$src2), s11_2ImmPred:$offset, + (i32 IntRegs:$src1))>; + + + + +def : Pat<(atomic_store_64 ADDRriS11_3:$src2, (i64 DoubleRegs:$src1)), + (STrid ADDRriS11_3:$src2, (i64 DoubleRegs:$src1))>; + +def : Pat<(atomic_store_64 (add (i32 IntRegs:$src2), s11_3ImmPred:$offset), + (i64 DoubleRegs:$src1)), + (STrid_indexed (i32 IntRegs:$src2), s11_3ImmPred:$offset, + (i64 DoubleRegs:$src1))>; + +// Map from r0 = and(r1, 65535) to r0 = zxth(r1). +def : Pat <(and (i32 IntRegs:$src1), 65535), + (ZXTH (i32 IntRegs:$src1))>; // Map from r0 = and(r1, 255) to r0 = zxtb(r1). -def : Pat <(and IntRegs:$src1, 255), - (ZXTB IntRegs:$src1)>; +def : Pat <(and (i32 IntRegs:$src1), 255), + (ZXTB (i32 IntRegs:$src1))>; // Map Add(p1, true) to p1 = not(p1). // Add(p1, false) should never be produced; // if it is, it must be mapped to a NOOP.
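// Worked example of the identity used below (1-bit two's complement): true is the all-ones
// value -1, so add(p1, -1) == p1 + 1 (mod 2) == xor(p1, 1) == not(p1), which is exactly the
// NOT_p mapping.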
-def : Pat <(add PredRegs:$src1, -1), - (NOT_p PredRegs:$src1)>; +def : Pat <(add (i1 PredRegs:$src1), -1), + (NOT_p (i1 PredRegs:$src1))>; // Map from p0 = setlt(r0, r1) r2 = mux(p0, r3, r4) => // p0 = cmp.lt(r0, r1), r0 = mux(p0, r2, r1). -def : Pat <(select (i1 (setlt IntRegs:$src1, IntRegs:$src2)), IntRegs:$src3, - IntRegs:$src4), - (TFR_condset_rr (CMPLTrr IntRegs:$src1, IntRegs:$src2), IntRegs:$src4, - IntRegs:$src3)>, Requires<[HasV2TOnly]>; +def : Pat <(select (i1 (setlt (i32 IntRegs:$src1), (i32 IntRegs:$src2))), + (i32 IntRegs:$src3), + (i32 IntRegs:$src4)), + (i32 (TFR_condset_rr (CMPLTrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)), + (i32 IntRegs:$src4), (i32 IntRegs:$src3)))>, + Requires<[HasV2TOnly]>; // Map from p0 = pnot(p0); r0 = mux(p0, #i, #j) => r0 = mux(p0, #j, #i). -def : Pat <(select (not PredRegs:$src1), s8ImmPred:$src2, s8ImmPred:$src3), - (TFR_condset_ii PredRegs:$src1, s8ImmPred:$src3, s8ImmPred:$src2)>; +def : Pat <(select (not (i1 PredRegs:$src1)), s8ImmPred:$src2, s8ImmPred:$src3), + (i32 (TFR_condset_ii (i1 PredRegs:$src1), s8ImmPred:$src3, + s8ImmPred:$src2))>; + +// Map from p0 = pnot(p0); r0 = select(p0, #i, r1) +// => r0 = TFR_condset_ri(p0, r1, #i) +def : Pat <(select (not (i1 PredRegs:$src1)), s12ImmPred:$src2, + (i32 IntRegs:$src3)), + (i32 (TFR_condset_ri (i1 PredRegs:$src1), (i32 IntRegs:$src3), + s12ImmPred:$src2))>; + +// Map from p0 = pnot(p0); r0 = mux(p0, r1, #i) +// => r0 = TFR_condset_ir(p0, #i, r1) +def : Pat <(select (not PredRegs:$src1), IntRegs:$src2, s12ImmPred:$src3), + (i32 (TFR_condset_ir (i1 PredRegs:$src1), s12ImmPred:$src3, + (i32 IntRegs:$src2)))>; // Map from p0 = pnot(p0); if (p0) jump => if (!p0) jump. def : Pat <(brcond (not PredRegs:$src1), bb:$offset), - (JMP_cNot PredRegs:$src1, bb:$offset)>; + (JMP_cNot (i1 PredRegs:$src1), bb:$offset)>; // Map from p2 = pnot(p2); p1 = and(p0, p2) => p1 = and(p0, !p2). def : Pat <(and PredRegs:$src1, (not PredRegs:$src2)), - (AND_pnotp PredRegs:$src1, PredRegs:$src2)>; + (i1 (AND_pnotp (i1 PredRegs:$src1), (i1 PredRegs:$src2)))>; // Map from store(globaladdress + x) -> memd(#foo + x). let AddedComplexity = 100 in -def : Pat <(store DoubleRegs:$src1, +def : Pat <(store (i64 DoubleRegs:$src1), (add (HexagonCONST32_GP tglobaladdr:$global), u16ImmPred:$offset)), - (STrid_GP tglobaladdr:$global, u16ImmPred:$offset, DoubleRegs:$src1)>; + (STrid_GP tglobaladdr:$global, u16ImmPred:$offset, + (i64 DoubleRegs:$src1))>, Requires<[NoV4T]>; -// Map from store(globaladdress) -> memd(#foo + 0). +// Map from store(globaladdress) -> memd(#foo). let AddedComplexity = 100 in -def : Pat <(store DoubleRegs:$src1, (HexagonCONST32_GP tglobaladdr:$global)), - (STrid_GP tglobaladdr:$global, 0, DoubleRegs:$src1)>; +def : Pat <(store (i64 DoubleRegs:$src1), + (HexagonCONST32_GP tglobaladdr:$global)), + (STd_GP tglobaladdr:$global, (i64 DoubleRegs:$src1))>, + Requires<[NoV4T]>; // Map from store(globaladdress + x) -> memw(#foo + x). let AddedComplexity = 100 in -def : Pat <(store IntRegs:$src1, (add (HexagonCONST32_GP tglobaladdr:$global), +def : Pat <(store (i32 IntRegs:$src1), + (add (HexagonCONST32_GP tglobaladdr:$global), u16ImmPred:$offset)), - (STriw_GP tglobaladdr:$global, u16ImmPred:$offset, IntRegs:$src1)>; + (STriw_GP tglobaladdr:$global, u16ImmPred:$offset, (i32 IntRegs:$src1))>, + Requires<[NoV4T]>; // Map from store(globaladdress) -> memw(#foo + 0). 
let AddedComplexity = 100 in -def : Pat <(store IntRegs:$src1, (HexagonCONST32_GP tglobaladdr:$global)), - (STriw_GP tglobaladdr:$global, 0, IntRegs:$src1)>; +def : Pat <(store (i32 IntRegs:$src1), (HexagonCONST32_GP tglobaladdr:$global)), + (STriw_GP tglobaladdr:$global, 0, (i32 IntRegs:$src1))>; -// Map from store(globaladdress) -> memw(#foo + 0). +// Map from store(globaladdress) -> memw(#foo). let AddedComplexity = 100 in -def : Pat <(store IntRegs:$src1, (HexagonCONST32_GP tglobaladdr:$global)), - (STriw_GP tglobaladdr:$global, 0, IntRegs:$src1)>; +def : Pat <(store (i32 IntRegs:$src1), (HexagonCONST32_GP tglobaladdr:$global)), + (STriw_GP tglobaladdr:$global, 0, (i32 IntRegs:$src1))>, + Requires<[NoV4T]>; // Map from store(globaladdress + x) -> memh(#foo + x). let AddedComplexity = 100 in -def : Pat <(truncstorei16 IntRegs:$src1, +def : Pat <(truncstorei16 (i32 IntRegs:$src1), (add (HexagonCONST32_GP tglobaladdr:$global), u16ImmPred:$offset)), - (STrih_GP tglobaladdr:$global, u16ImmPred:$offset, IntRegs:$src1)>; + (STrih_GP tglobaladdr:$global, u16ImmPred:$offset, (i32 IntRegs:$src1))>, + Requires<[NoV4T]>; // Map from store(globaladdress) -> memh(#foo). let AddedComplexity = 100 in -def : Pat <(truncstorei16 IntRegs:$src1, +def : Pat <(truncstorei16 (i32 IntRegs:$src1), (HexagonCONST32_GP tglobaladdr:$global)), - (STh_GP tglobaladdr:$global, IntRegs:$src1)>; + (STh_GP tglobaladdr:$global, (i32 IntRegs:$src1))>, + Requires<[NoV4T]>; // Map from store(globaladdress + x) -> memb(#foo + x). let AddedComplexity = 100 in -def : Pat <(truncstorei8 IntRegs:$src1, +def : Pat <(truncstorei8 (i32 IntRegs:$src1), (add (HexagonCONST32_GP tglobaladdr:$global), u16ImmPred:$offset)), - (STrib_GP tglobaladdr:$global, u16ImmPred:$offset, IntRegs:$src1)>; + (STrib_GP tglobaladdr:$global, u16ImmPred:$offset, (i32 IntRegs:$src1))>, + Requires<[NoV4T]>; // Map from store(globaladdress) -> memb(#foo). let AddedComplexity = 100 in -def : Pat <(truncstorei8 IntRegs:$src1, +def : Pat <(truncstorei8 (i32 IntRegs:$src1), (HexagonCONST32_GP tglobaladdr:$global)), - (STb_GP tglobaladdr:$global, IntRegs:$src1)>; + (STb_GP tglobaladdr:$global, (i32 IntRegs:$src1))>, + Requires<[NoV4T]>; // Map from load(globaladdress + x) -> memw(#foo + x). let AddedComplexity = 100 in -def : Pat <(load (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset)), - (LDriw_GP tglobaladdr:$global, u16ImmPred:$offset)>; +def : Pat <(i32 (load (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDriw_GP tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[NoV4T]>; -// Map from load(globaladdress) -> memw(#foo + 0). +// Map from load(globaladdress) -> memw(#foo). let AddedComplexity = 100 in -def : Pat <(load (HexagonCONST32_GP tglobaladdr:$global)), - (LDw_GP tglobaladdr:$global)>; +def : Pat <(i32 (load (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDw_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; // Map from load(globaladdress + x) -> memd(#foo + x). let AddedComplexity = 100 in def : Pat <(i64 (load (add (HexagonCONST32_GP tglobaladdr:$global), u16ImmPred:$offset))), - (LDrid_GP tglobaladdr:$global, u16ImmPred:$offset)>; + (i64 (LDrid_GP tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[NoV4T]>; // Map from load(globaladdress) -> memw(#foo + 0). 
let AddedComplexity = 100 in def : Pat <(i64 (load (HexagonCONST32_GP tglobaladdr:$global))), - (LDd_GP tglobaladdr:$global)>; - + (i64 (LDd_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; -// Map from Pd = load(globaladdress) -> Rd = memb(globaladdress + 0), Pd = Rd. +// Map from Pd = load(globaladdress) -> Rd = memb(globaladdress), Pd = Rd. let AddedComplexity = 100 in def : Pat <(i1 (load (HexagonCONST32_GP tglobaladdr:$global))), - (TFR_PdRs (LDrib_GP tglobaladdr:$global, 0))>; + (i1 (TFR_PdRs (i32 (LDb_GP tglobaladdr:$global))))>, + Requires<[NoV4T]>; // Map from load(globaladdress + x) -> memh(#foo + x). let AddedComplexity = 100 in -def : Pat <(sextloadi16 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset)), - (LDrih_GP tglobaladdr:$global, u16ImmPred:$offset)>; +def : Pat <(i32 (extloadi16 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDrih_GP tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[NoV4T]>; -// Map from load(globaladdress) -> memh(#foo + 0). +// Map from load(globaladdress) -> memh(#foo). let AddedComplexity = 100 in -def : Pat <(sextloadi16 (HexagonCONST32_GP tglobaladdr:$global)), - (LDrih_GP tglobaladdr:$global, 0)>; +def : Pat <(i32 (sextloadi16 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDrih_GP tglobaladdr:$global, 0))>, + Requires<[NoV4T]>; // Map from load(globaladdress + x) -> memuh(#foo + x). let AddedComplexity = 100 in -def : Pat <(zextloadi16 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset)), - (LDriuh_GP tglobaladdr:$global, u16ImmPred:$offset)>; +def : Pat <(i32 (zextloadi16 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDriuh_GP tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[NoV4T]>; -// Map from load(globaladdress) -> memuh(#foo + 0). +// Map from load(globaladdress) -> memuh(#foo). let AddedComplexity = 100 in -def : Pat <(zextloadi16 (HexagonCONST32_GP tglobaladdr:$global)), - (LDriuh_GP tglobaladdr:$global, 0)>; +def : Pat <(i32 (zextloadi16 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDriuh_GP tglobaladdr:$global, 0))>, + Requires<[NoV4T]>; -// Map from load(globaladdress + x) -> memuh(#foo + x). +// Map from load(globaladdress) -> memh(#foo). let AddedComplexity = 100 in -def : Pat <(extloadi16 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset)), - (LDriuh_GP tglobaladdr:$global, u16ImmPred:$offset)>; +def : Pat <(i32 (sextloadi16 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDh_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; -// Map from load(globaladdress) -> memuh(#foo + 0). -let AddedComplexity = 100 in -def : Pat <(extloadi16 (HexagonCONST32_GP tglobaladdr:$global)), - (LDriuh_GP tglobaladdr:$global, 0)>; -// Map from load(globaladdress + x) -> memub(#foo + x). +// Map from load(globaladdress) -> memuh(#foo). let AddedComplexity = 100 in -def : Pat <(zextloadi8 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset)), - (LDriub_GP tglobaladdr:$global, u16ImmPred:$offset)>; +def : Pat <(i32 (zextloadi16 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDuh_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; -// Map from load(globaladdress) -> memuh(#foo + 0). +// Map from load(globaladdress + x) -> memb(#foo + x). 
let AddedComplexity = 100 in -def : Pat <(zextloadi8 (HexagonCONST32_GP tglobaladdr:$global)), - (LDriub_GP tglobaladdr:$global, 0)>; +def : Pat <(i32 (extloadi8 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDrib_GP tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[NoV4T]>; // Map from load(globaladdress + x) -> memb(#foo + x). let AddedComplexity = 100 in -def : Pat <(sextloadi8 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset)), - (LDrib_GP tglobaladdr:$global, u16ImmPred:$offset)>; +def : Pat <(i32 (sextloadi8 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDrib_GP tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[NoV4T]>; + +// Map from load(globaladdress + x) -> memub(#foo + x). +let AddedComplexity = 100 in +def : Pat <(i32 (zextloadi8 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDriub_GP tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[NoV4T]>; // Map from load(globaladdress) -> memb(#foo). let AddedComplexity = 100 in -def : Pat <(extloadi8 (HexagonCONST32_GP tglobaladdr:$global)), - (LDb_GP tglobaladdr:$global)>; +def : Pat <(i32 (extloadi8 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDb_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; // Map from load(globaladdress) -> memb(#foo). let AddedComplexity = 100 in -def : Pat <(sextloadi8 (HexagonCONST32_GP tglobaladdr:$global)), - (LDb_GP tglobaladdr:$global)>; +def : Pat <(i32 (sextloadi8 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDb_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; // Map from load(globaladdress) -> memub(#foo). let AddedComplexity = 100 in -def : Pat <(zextloadi8 (HexagonCONST32_GP tglobaladdr:$global)), - (LDub_GP tglobaladdr:$global)>; +def : Pat <(i32 (zextloadi8 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDub_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; // When the Interprocedural Global Variable optimizer realizes that a // certain global variable takes only two constant values, it shrinks the // global to a boolean. Catch those loads here in the following 3 patterns. let AddedComplexity = 100 in -def : Pat <(extloadi1 (HexagonCONST32_GP tglobaladdr:$global)), - (LDb_GP tglobaladdr:$global)>; - -let AddedComplexity = 100 in -def : Pat <(sextloadi1 (HexagonCONST32_GP tglobaladdr:$global)), - (LDb_GP tglobaladdr:$global)>; +def : Pat <(i32 (extloadi1 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDb_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; let AddedComplexity = 100 in -def : Pat <(zextloadi1 (HexagonCONST32_GP tglobaladdr:$global)), - (LDub_GP tglobaladdr:$global)>; +def : Pat <(i32 (sextloadi1 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDb_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; -// Map from load(globaladdress) -> memh(#foo). -let AddedComplexity = 100 in -def : Pat <(extloadi16 (HexagonCONST32_GP tglobaladdr:$global)), - (LDh_GP tglobaladdr:$global)>; - -// Map from load(globaladdress) -> memh(#foo). -let AddedComplexity = 100 in -def : Pat <(sextloadi16 (HexagonCONST32_GP tglobaladdr:$global)), - (LDh_GP tglobaladdr:$global)>; - -// Map from load(globaladdress) -> memuh(#foo). let AddedComplexity = 100 in -def : Pat <(zextloadi16 (HexagonCONST32_GP tglobaladdr:$global)), - (LDuh_GP tglobaladdr:$global)>; +def : Pat <(i32 (zextloadi1 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDub_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; // Map from i1 loads to 32 bits. This assumes that the i1* is byte aligned. 
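// For example (illustrative memory contents): if the byte at the i1* holds 0x01, memb
// sign-extends it to 0x00000001 and the and(#1) below leaves 1; even if some other code
// stored a wider nonzero byte such as 0xFF (loaded as 0xFFFFFFFF), the and(#1) still
// reduces it to the single low bit.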
def : Pat <(i32 (zextloadi1 ADDRriS11_0:$addr)), - (AND_rr (LDrib ADDRriS11_0:$addr), (TFRI 0x1))>; + (i32 (AND_rr (i32 (LDrib ADDRriS11_0:$addr)), (TFRI 0x1)))>; // Map from Rdd = sign_extend_inreg(Rss, i32) -> Rdd = SXTW(Rss.lo). -def : Pat <(i64 (sext_inreg DoubleRegs:$src1, i32)), - (i64 (SXTW (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg)))>; +def : Pat <(i64 (sext_inreg (i64 DoubleRegs:$src1), i32)), + (i64 (SXTW (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg))))>; // Map from Rdd = sign_extend_inreg(Rss, i16) -> Rdd = SXTW(SXTH(Rss.lo)). -def : Pat <(i64 (sext_inreg DoubleRegs:$src1, i16)), - (i64 (SXTW (SXTH (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg))))>; +def : Pat <(i64 (sext_inreg (i64 DoubleRegs:$src1), i16)), + (i64 (SXTW (i32 (SXTH (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), + subreg_loreg))))))>; // Map from Rdd = sign_extend_inreg(Rss, i8) -> Rdd = SXTW(SXTB(Rss.lo)). -def : Pat <(i64 (sext_inreg DoubleRegs:$src1, i8)), - (i64 (SXTW (SXTB (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg))))>; +def : Pat <(i64 (sext_inreg (i64 DoubleRegs:$src1), i8)), + (i64 (SXTW (i32 (SXTB (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), + subreg_loreg))))))>; -// We want to prevent emiting pnot's as much as possible. +// We want to prevent emitting pnot's as much as possible. // Map brcond with an unsupported setcc to a JMP_cNot. -def : Pat <(brcond (i1 (setne IntRegs:$src1, IntRegs:$src2)), bb:$offset), - (JMP_cNot (CMPEQrr IntRegs:$src1, IntRegs:$src2), bb:$offset)>; +def : Pat <(brcond (i1 (setne (i32 IntRegs:$src1), (i32 IntRegs:$src2))), + bb:$offset), + (JMP_cNot (CMPEQrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)), + bb:$offset)>; -def : Pat <(brcond (i1 (setne IntRegs:$src1, s10ImmPred:$src2)), bb:$offset), - (JMP_cNot (CMPEQri IntRegs:$src1, s10ImmPred:$src2), bb:$offset)>; +def : Pat <(brcond (i1 (setne (i32 IntRegs:$src1), s10ImmPred:$src2)), + bb:$offset), + (JMP_cNot (CMPEQri (i32 IntRegs:$src1), s10ImmPred:$src2), bb:$offset)>; -def : Pat <(brcond (i1 (setne PredRegs:$src1, (i1 -1))), bb:$offset), - (JMP_cNot PredRegs:$src1, bb:$offset)>; +def : Pat <(brcond (i1 (setne (i1 PredRegs:$src1), (i1 -1))), bb:$offset), + (JMP_cNot (i1 PredRegs:$src1), bb:$offset)>; -def : Pat <(brcond (i1 (setne PredRegs:$src1, (i1 0))), bb:$offset), - (JMP_c PredRegs:$src1, bb:$offset)>; +def : Pat <(brcond (i1 (setne (i1 PredRegs:$src1), (i1 0))), bb:$offset), + (JMP_c (i1 PredRegs:$src1), bb:$offset)>; -def : Pat <(brcond (i1 (setlt IntRegs:$src1, s8ImmPred:$src2)), bb:$offset), - (JMP_cNot (CMPGEri IntRegs:$src1, s8ImmPred:$src2), bb:$offset)>; +def : Pat <(brcond (i1 (setlt (i32 IntRegs:$src1), s8ImmPred:$src2)), + bb:$offset), + (JMP_cNot (CMPGEri (i32 IntRegs:$src1), s8ImmPred:$src2), bb:$offset)>; -def : Pat <(brcond (i1 (setlt IntRegs:$src1, IntRegs:$src2)), bb:$offset), - (JMP_c (CMPLTrr IntRegs:$src1, IntRegs:$src2), bb:$offset)>; +def : Pat <(brcond (i1 (setlt (i32 IntRegs:$src1), (i32 IntRegs:$src2))), + bb:$offset), + (JMP_c (CMPLTrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)), bb:$offset)>; -def : Pat <(brcond (i1 (setuge DoubleRegs:$src1, DoubleRegs:$src2)), +def : Pat <(brcond (i1 (setuge (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))), bb:$offset), - (JMP_cNot (CMPGTU64rr DoubleRegs:$src2, DoubleRegs:$src1), + (JMP_cNot (CMPGTU64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1)), bb:$offset)>; -def : Pat <(brcond (i1 (setule IntRegs:$src1, IntRegs:$src2)), bb:$offset), - (JMP_cNot (CMPGTUrr IntRegs:$src1, IntRegs:$src2), bb:$offset)>; +def : Pat <(brcond (i1 (setule (i32 
IntRegs:$src1), (i32 IntRegs:$src2))), + bb:$offset), + (JMP_cNot (CMPGTUrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)), + bb:$offset)>; -def : Pat <(brcond (i1 (setule DoubleRegs:$src1, DoubleRegs:$src2)), +def : Pat <(brcond (i1 (setule (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))), bb:$offset), - (JMP_cNot (CMPGTU64rr DoubleRegs:$src1, DoubleRegs:$src2), - bb:$offset)>; + (JMP_cNot (CMPGTU64rr (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)), + bb:$offset)>; // Map from a 64-bit select to an emulated 64-bit mux. // Hexagon does not support 64-bit MUXes; so emulate with combines. -def : Pat <(select PredRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), - (COMBINE_rr - (MUX_rr PredRegs:$src1, - (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg), - (EXTRACT_SUBREG DoubleRegs:$src3, subreg_hireg)), - (MUX_rr PredRegs:$src1, - (EXTRACT_SUBREG DoubleRegs:$src2, subreg_loreg), - (EXTRACT_SUBREG DoubleRegs:$src3, subreg_loreg)))>; +def : Pat <(select (i1 PredRegs:$src1), (i64 DoubleRegs:$src2), + (i64 DoubleRegs:$src3)), + (i64 (COMBINE_rr (i32 (MUX_rr (i1 PredRegs:$src1), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), + subreg_hireg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src3), + subreg_hireg)))), + (i32 (MUX_rr (i1 PredRegs:$src1), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), + subreg_loreg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src3), + subreg_loreg))))))>; // Map from a 1-bit select to logical ops. // From LegalizeDAG.cpp: (B1 ? B2 : B3) <=> (B1 & B2)|(!B1&B3). -def : Pat <(select PredRegs:$src1, PredRegs:$src2, PredRegs:$src3), - (OR_pp (AND_pp PredRegs:$src1, PredRegs:$src2), - (AND_pp (NOT_p PredRegs:$src1), PredRegs:$src3))>; +def : Pat <(select (i1 PredRegs:$src1), (i1 PredRegs:$src2), + (i1 PredRegs:$src3)), + (OR_pp (AND_pp (i1 PredRegs:$src1), (i1 PredRegs:$src2)), + (AND_pp (NOT_p (i1 PredRegs:$src1)), (i1 PredRegs:$src3)))>; // Map Pd = load(addr) -> Rs = load(addr); Pd = Rs. def : Pat<(i1 (load ADDRriS11_2:$addr)), (i1 (TFR_PdRs (i32 (LDrib ADDRriS11_2:$addr))))>; // Map for truncating from 64 immediates to 32 bit immediates. -def : Pat<(i32 (trunc DoubleRegs:$src)), - (i32 (EXTRACT_SUBREG DoubleRegs:$src, subreg_loreg))>; +def : Pat<(i32 (trunc (i64 DoubleRegs:$src))), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src), subreg_loreg))>; // Map for truncating from i64 immediates to i1 bit immediates. -def : Pat<(i1 (trunc DoubleRegs:$src)), - (i1 (TFR_PdRs (i32(EXTRACT_SUBREG DoubleRegs:$src, subreg_loreg))))>; +def : Pat<(i1 (trunc (i64 DoubleRegs:$src))), + (i1 (TFR_PdRs (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src), + subreg_loreg))))>; // Map memb(Rs) = Rdd -> memb(Rs) = Rt. -def : Pat<(truncstorei8 DoubleRegs:$src, ADDRriS11_0:$addr), - (STrib ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG DoubleRegs:$src, +def : Pat<(truncstorei8 (i64 DoubleRegs:$src), ADDRriS11_0:$addr), + (STrib ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src), subreg_loreg)))>; // Map memh(Rs) = Rdd -> memh(Rs) = Rt. -def : Pat<(truncstorei16 DoubleRegs:$src, ADDRriS11_0:$addr), - (STrih ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG DoubleRegs:$src, +def : Pat<(truncstorei16 (i64 DoubleRegs:$src), ADDRriS11_0:$addr), + (STrih ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src), + subreg_loreg)))>; +// Map memw(Rs) = Rdd -> memw(Rs) = Rt +def : Pat<(truncstorei32 (i64 DoubleRegs:$src), ADDRriS11_0:$addr), + (STriw ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src), subreg_loreg)))>; // Map memw(Rs) = Rdd -> memw(Rs) = Rt. 
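[Editorial aside] The truncating-store mappings here all make the same move: storing the low 8/16/32 bits of a 64-bit pair is exactly a store of the loreg subregister, which is why every pattern funnels through (EXTRACT_SUBREG ..., subreg_loreg). A minimal C++ model of the i32 case (the function name is illustrative, not from the source):

    #include <cstdint>
    #include <cstring>

    // "memw(Rs) = Rdd": truncstorei32 keeps only the low 32 bits,
    // i.e. the loreg half of the 64-bit register pair.
    void truncstore_i32(uint64_t rdd, uint32_t *addr) {
      uint32_t lo = (uint32_t)rdd;        // EXTRACT_SUBREG ..., subreg_loreg
      std::memcpy(addr, &lo, sizeof lo);
    }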
-def : Pat<(truncstorei32 DoubleRegs:$src, ADDRriS11_0:$addr), - (STriw ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG DoubleRegs:$src, +def : Pat<(truncstorei32 (i64 DoubleRegs:$src), ADDRriS11_0:$addr), + (STriw ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src), subreg_loreg)))>; // Map from i1 = constant<-1>; memw(addr) = i1 -> r0 = 1; memw(addr) = r0. @@ -2763,118 +3131,134 @@ let AddedComplexity = 100 in // Map from i1 = constant<-1>; memw(CONST32(#foo)) = i1 -> r0 = 1; // memw(#foo) = r0 def : Pat<(store (i1 -1), (HexagonCONST32_GP tglobaladdr:$global)), - (STb_GP tglobaladdr:$global, (TFRI 1))>; - + (STb_GP tglobaladdr:$global, (TFRI 1))>, + Requires<[NoV4T]>; // Map from i1 = constant<-1>; store i1 -> r0 = 1; store r0. def : Pat<(store (i1 -1), ADDRriS11_2:$addr), (STrib ADDRriS11_2:$addr, (TFRI 1))>; // Map from memb(Rs) = Pd -> Rt = mux(Pd, #0, #1); store Rt. -def : Pat<(store PredRegs:$src1, ADDRriS11_2:$addr), - (STrib ADDRriS11_2:$addr, (i32 (MUX_ii PredRegs:$src1, 1, 0)) )>; +def : Pat<(store (i1 PredRegs:$src1), ADDRriS11_2:$addr), + (STrib ADDRriS11_2:$addr, (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0)) )>; // Map Rdd = anyext(Rs) -> Rdd = sxtw(Rs). // Hexagon_TODO: We can probably use combine but that will cost 2 instructions. // Better way to do this? -def : Pat<(i64 (anyext IntRegs:$src1)), - (i64 (SXTW IntRegs:$src1))>; +def : Pat<(i64 (anyext (i32 IntRegs:$src1))), + (i64 (SXTW (i32 IntRegs:$src1)))>; // Map cmple -> cmpgt. // rs <= rt -> !(rs > rt). -def : Pat<(i1 (setle IntRegs:$src1, s10ImmPred:$src2)), - (i1 (NOT_p (CMPGTri IntRegs:$src1, s10ImmPred:$src2)))>; +def : Pat<(i1 (setle (i32 IntRegs:$src1), s10ImmPred:$src2)), + (i1 (NOT_p (CMPGTri (i32 IntRegs:$src1), s10ImmPred:$src2)))>; // rs <= rt -> !(rs > rt). -def : Pat<(i1 (setle IntRegs:$src1, IntRegs:$src2)), - (i1 (NOT_p (CMPGTrr IntRegs:$src1, IntRegs:$src2)))>; +def : Pat<(i1 (setle (i32 IntRegs:$src1), (i32 IntRegs:$src2))), + (i1 (NOT_p (CMPGTrr (i32 IntRegs:$src1), (i32 IntRegs:$src2))))>; // Rss <= Rtt -> !(Rss > Rtt). -def : Pat<(i1 (setle DoubleRegs:$src1, DoubleRegs:$src2)), - (i1 (NOT_p (CMPGT64rr DoubleRegs:$src1, DoubleRegs:$src2)))>; +def : Pat<(i1 (setle (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))), + (i1 (NOT_p (CMPGT64rr (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))))>; // Map cmpne -> cmpeq. // Hexagon_TODO: We should improve on this. // rs != rt -> !(rs == rt). -def : Pat <(i1 (setne IntRegs:$src1, s10ImmPred:$src2)), - (i1 (NOT_p(i1 (CMPEQri IntRegs:$src1, s10ImmPred:$src2))))>; +def : Pat <(i1 (setne (i32 IntRegs:$src1), s10ImmPred:$src2)), + (i1 (NOT_p(i1 (CMPEQri (i32 IntRegs:$src1), s10ImmPred:$src2))))>; // Map cmpne(Rs) -> !cmpeqe(Rs). // rs != rt -> !(rs == rt). -def : Pat <(i1 (setne IntRegs:$src1, IntRegs:$src2)), - (i1 (NOT_p(i1 (CMPEQrr IntRegs:$src1, IntRegs:$src2))))>; +def : Pat <(i1 (setne (i32 IntRegs:$src1), (i32 IntRegs:$src2))), + (i1 (NOT_p (i1 (CMPEQrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)))))>; // Convert setne back to xor for hexagon since we compute w/ pred registers. -def : Pat <(i1 (setne PredRegs:$src1, PredRegs:$src2)), - (i1 (XOR_pp PredRegs:$src1, PredRegs:$src2))>; +def : Pat <(i1 (setne (i1 PredRegs:$src1), (i1 PredRegs:$src2))), + (i1 (XOR_pp (i1 PredRegs:$src1), (i1 PredRegs:$src2)))>; // Map cmpne(Rss) -> !cmpew(Rss). // rs != rt -> !(rs == rt). 
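[Editorial aside] The cmple and cmpne rewrites above all rest on two identities, x <= y iff !(x > y) and x != y iff !(x == y), which is what lets the selector get by with CMPGT*/CMPEQ* plus a NOT_p on the predicate; for i1 operands, setne degenerates to XOR. A quick self-contained check (illustrative only):

    #include <cassert>

    bool le_via_gt(long long x, long long y) { return !(x > y); }   // setle
    bool ne_via_eq(long long x, long long y) { return !(x == y); }  // setne

    int main() {
      for (long long x = -2; x <= 2; ++x)
        for (long long y = -2; y <= 2; ++y) {
          assert(le_via_gt(x, y) == (x <= y));
          assert(ne_via_eq(x, y) == (x != y));
        }
      // setne on i1 is XOR_pp: p1 != p2 iff p1 ^ p2.
      for (int a = 0; a <= 1; ++a)
        for (int b = 0; b <= 1; ++b)
          assert(((a ^ b) != 0) == (a != b));
    }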
-def : Pat <(i1 (setne DoubleRegs:$src1, DoubleRegs:$src2)), - (i1 (NOT_p(i1 (CMPEHexagon4rr DoubleRegs:$src1, DoubleRegs:$src2))))>; +def : Pat <(i1 (setne (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))), + (i1 (NOT_p (i1 (CMPEHexagon4rr (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2)))))>; // Map cmpge(Rs, Rt) -> !(cmpgt(Rs, Rt). // rs >= rt -> !(rt > rs). -def : Pat <(i1 (setge IntRegs:$src1, IntRegs:$src2)), - (i1 (NOT_p(i1 (CMPGTrr IntRegs:$src2, IntRegs:$src1))))>; +def : Pat <(i1 (setge (i32 IntRegs:$src1), (i32 IntRegs:$src2))), + (i1 (NOT_p (i1 (CMPGTrr (i32 IntRegs:$src2), (i32 IntRegs:$src1)))))>; -def : Pat <(i1 (setge IntRegs:$src1, s8ImmPred:$src2)), - (i1 (CMPGEri IntRegs:$src1, s8ImmPred:$src2))>; +def : Pat <(i1 (setge (i32 IntRegs:$src1), s8ImmPred:$src2)), + (i1 (CMPGEri (i32 IntRegs:$src1), s8ImmPred:$src2))>; // Map cmpge(Rss, Rtt) -> !cmpgt(Rtt, Rss). // rss >= rtt -> !(rtt > rss). -def : Pat <(i1 (setge DoubleRegs:$src1, DoubleRegs:$src2)), - (i1 (NOT_p(i1 (CMPGT64rr DoubleRegs:$src2, DoubleRegs:$src1))))>; +def : Pat <(i1 (setge (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))), + (i1 (NOT_p (i1 (CMPGT64rr (i64 DoubleRegs:$src2), + (i64 DoubleRegs:$src1)))))>; // Map cmplt(Rs, Imm) -> !cmpge(Rs, Imm). // rs < rt -> !(rs >= rt). -def : Pat <(i1 (setlt IntRegs:$src1, s8ImmPred:$src2)), - (i1 (NOT_p (CMPGEri IntRegs:$src1, s8ImmPred:$src2)))>; +def : Pat <(i1 (setlt (i32 IntRegs:$src1), s8ImmPred:$src2)), + (i1 (NOT_p (CMPGEri (i32 IntRegs:$src1), s8ImmPred:$src2)))>; -// Map cmplt(Rs, Rt) -> cmplt(Rs, Rt). -// rs < rt -> rs < rt. Let assembler map it. -def : Pat <(i1 (setlt IntRegs:$src1, IntRegs:$src2)), - (i1 (CMPLTrr IntRegs:$src2, IntRegs:$src1))>; +// Map cmplt(Rs, Rt) -> cmpgt(Rt, Rs). +// rs < rt -> rt > rs. +// We can let assembler map it, or we can do in the compiler itself. +def : Pat <(i1 (setlt (i32 IntRegs:$src1), (i32 IntRegs:$src2))), + (i1 (CMPGTrr (i32 IntRegs:$src2), (i32 IntRegs:$src1)))>; // Map cmplt(Rss, Rtt) -> cmpgt(Rtt, Rss). // rss < rtt -> (rtt > rss). -def : Pat <(i1 (setlt DoubleRegs:$src1, DoubleRegs:$src2)), - (i1 (CMPGT64rr DoubleRegs:$src2, DoubleRegs:$src1))>; +def : Pat <(i1 (setlt (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))), + (i1 (CMPGT64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1)))>; -// Map from cmpltu(Rs, Rd) -> !cmpgtu(Rs, Rd - 1). +// Map from cmpltu(Rs, Rd) -> cmpgtu(Rd, Rs) // rs < rt -> rt > rs. -def : Pat <(i1 (setult IntRegs:$src1, IntRegs:$src2)), - (i1 (CMPGTUrr IntRegs:$src2, IntRegs:$src1))>; +// We can let assembler map it, or we can do in the compiler itself. +def : Pat <(i1 (setult (i32 IntRegs:$src1), (i32 IntRegs:$src2))), + (i1 (CMPGTUrr (i32 IntRegs:$src2), (i32 IntRegs:$src1)))>; -// Map from cmpltu(Rss, Rdd) -> !cmpgtu(Rss, Rdd - 1). +// Map from cmpltu(Rss, Rdd) -> cmpgtu(Rdd, Rss). // rs < rt -> rt > rs. -def : Pat <(i1 (setult DoubleRegs:$src1, DoubleRegs:$src2)), - (i1 (CMPGTU64rr DoubleRegs:$src2, DoubleRegs:$src1))>; +def : Pat <(i1 (setult (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))), + (i1 (CMPGTU64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1)))>; + +// Generate cmpgeu(Rs, #u8) +def : Pat <(i1 (setuge (i32 IntRegs:$src1), u8ImmPred:$src2)), + (i1 (CMPGEUri (i32 IntRegs:$src1), u8ImmPred:$src2))>; + +// Generate cmpgtu(Rs, #u9) +def : Pat <(i1 (setugt (i32 IntRegs:$src1), u9ImmPred:$src2)), + (i1 (CMPGTUri (i32 IntRegs:$src1), u9ImmPred:$src2))>; // Map from Rs >= Rt -> !(Rt > Rs). // rs >= rt -> !(rt > rs). 
-def : Pat <(i1 (setuge IntRegs:$src1, IntRegs:$src2)), - (i1 (NOT_p (CMPGTUrr IntRegs:$src2, IntRegs:$src1)))>; +def : Pat <(i1 (setuge (i32 IntRegs:$src1), (i32 IntRegs:$src2))), + (i1 (NOT_p (CMPGTUrr (i32 IntRegs:$src2), (i32 IntRegs:$src1))))>; // Map from Rs >= Rt -> !(Rt > Rs). // rs >= rt -> !(rt > rs). -def : Pat <(i1 (setuge DoubleRegs:$src1, DoubleRegs:$src2)), - (i1 (NOT_p (CMPGTU64rr DoubleRegs:$src2, DoubleRegs:$src1)))>; +def : Pat <(i1 (setuge (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))), + (i1 (NOT_p (CMPGTU64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1))))>; // Map from cmpleu(Rs, Rs) -> !cmpgtu(Rs, Rs). // Map from (Rs <= Rt) -> !(Rs > Rt). -def : Pat <(i1 (setule IntRegs:$src1, IntRegs:$src2)), - (i1 (NOT_p (CMPGTUrr IntRegs:$src1, IntRegs:$src2)))>; +def : Pat <(i1 (setule (i32 IntRegs:$src1), (i32 IntRegs:$src2))), + (i1 (NOT_p (CMPGTUrr (i32 IntRegs:$src1), (i32 IntRegs:$src2))))>; // Map from cmpleu(Rss, Rtt) -> !cmpgtu(Rss, Rtt-1). // Map from (Rs <= Rt) -> !(Rs > Rt). -def : Pat <(i1 (setule DoubleRegs:$src1, DoubleRegs:$src2)), - (i1 (NOT_p (CMPGTU64rr DoubleRegs:$src1, DoubleRegs:$src2)))>; +def : Pat <(i1 (setule (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))), + (i1 (NOT_p (CMPGTU64rr (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))))>; // Sign extends. // i1 -> i32 -def : Pat <(i32 (sext PredRegs:$src1)), - (i32 (MUX_ii PredRegs:$src1, -1, 0))>; +def : Pat <(i32 (sext (i1 PredRegs:$src1))), + (i32 (MUX_ii (i1 PredRegs:$src1), -1, 0))>; + +// i1 -> i64 +def : Pat <(i64 (sext (i1 PredRegs:$src1))), + (i64 (COMBINE_rr (TFRI -1), (MUX_ii (i1 PredRegs:$src1), -1, 0)))>; // Convert sign-extended load back to load and sign extend. // i8 -> i64 @@ -2899,16 +3283,16 @@ def: Pat <(i64 (sextloadi32 ADDRriS11_2:$src1)), // Zero extends. // i1 -> i32 -def : Pat <(i32 (zext PredRegs:$src1)), - (i32 (MUX_ii PredRegs:$src1, 1, 0))>; +def : Pat <(i32 (zext (i1 PredRegs:$src1))), + (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))>; // i1 -> i64 -def : Pat <(i64 (zext PredRegs:$src1)), - (i64 (COMBINE_rr (TFRI 0), (MUX_ii PredRegs:$src1, 1, 0)))>; +def : Pat <(i64 (zext (i1 PredRegs:$src1))), + (i64 (COMBINE_rr (TFRI 0), (MUX_ii (i1 PredRegs:$src1), 1, 0)))>; // i32 -> i64 -def : Pat <(i64 (zext IntRegs:$src1)), - (i64 (COMBINE_rr (TFRI 0), IntRegs:$src1))>; +def : Pat <(i64 (zext (i32 IntRegs:$src1))), + (i64 (COMBINE_rr (TFRI 0), (i32 IntRegs:$src1)))>; // i8 -> i64 def: Pat <(i64 (zextloadi8 ADDRriS11_0:$src1)), @@ -2926,16 +3310,16 @@ def: Pat <(i32 (zextloadi1 ADDRriS11_0:$src1)), (i32 (LDriw ADDRriS11_0:$src1))>; // Map from Rs = Pd to Pd = mux(Pd, #1, #0) -def : Pat <(i32 (zext PredRegs:$src1)), - (i32 (MUX_ii PredRegs:$src1, 1, 0))>; +def : Pat <(i32 (zext (i1 PredRegs:$src1))), + (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))>; // Map from Rs = Pd to Pd = mux(Pd, #1, #0) -def : Pat <(i32 (anyext PredRegs:$src1)), - (i32 (MUX_ii PredRegs:$src1, 1, 0))>; +def : Pat <(i32 (anyext (i1 PredRegs:$src1))), + (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))>; // Map from Rss = Pd to Rdd = sxtw (mux(Pd, #1, #0)) -def : Pat <(i64 (anyext PredRegs:$src1)), - (i64 (SXTW (i32 (MUX_ii PredRegs:$src1, 1, 0))))>; +def : Pat <(i64 (anyext (i1 PredRegs:$src1))), + (i64 (SXTW (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))))>; // Any extended 64-bit load. @@ -2948,75 +3332,103 @@ def: Pat <(i64 (extloadi16 ADDRriS11_2:$src1)), (i64 (COMBINE_rr (TFRI 0), (LDrih ADDRriS11_2:$src1)))>; // Map from Rdd = zxtw(Rs) -> Rdd = combine(0, Rs). 
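[Editorial aside] combine(0, Rs) builds the 64-bit pair from a zero high word and the 32-bit source, which is precisely zero extension; the i1 cases instead materialize -1/0 (sext) or 1/0 (zext) through a mux. The semantics in plain C++ (a sketch, not target code):

    #include <cstdint>

    // zxtw: Rdd = combine(0, Rs) -- high word zero, low word Rs.
    uint64_t zxtw(uint32_t rs) { return ((uint64_t)0 << 32) | rs; }

    // i1 extension via mux: sext yields all-ones for true, zext yields 1.
    int32_t sext_i1(bool p) { return p ? -1 : 0; }   // MUX_ii(p, -1, 0)
    int32_t zext_i1(bool p) { return p ?  1 : 0; }   // MUX_ii(p,  1, 0)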
-def : Pat<(i64 (zext IntRegs:$src1)), - (i64 (COMBINE_rr (TFRI 0), IntRegs:$src1))>; +def : Pat<(i64 (zext (i32 IntRegs:$src1))), + (i64 (COMBINE_rr (TFRI 0), (i32 IntRegs:$src1)))>; // Multiply 64-bit unsigned and use upper result. -def : Pat <(mulhu DoubleRegs:$src1, DoubleRegs:$src2), - (MPYU64_acc(COMBINE_rr (TFRI 0), - (EXTRACT_SUBREG - (LSRd_ri(MPYU64_acc(MPYU64_acc(COMBINE_rr (TFRI 0), - (EXTRACT_SUBREG (LSRd_ri(MPYU64 - (EXTRACT_SUBREG DoubleRegs:$src1, - subreg_loreg), - (EXTRACT_SUBREG DoubleRegs:$src2, - subreg_loreg)), - 32) ,subreg_loreg)), - (EXTRACT_SUBREG DoubleRegs:$src1, - subreg_hireg), - (EXTRACT_SUBREG DoubleRegs:$src2, - subreg_loreg)), - (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg), - (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg)), - 32),subreg_loreg)), - (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg), - (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg) - )>; +def : Pat <(mulhu (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)), + (i64 + (MPYU64_acc + (i64 + (COMBINE_rr + (TFRI 0), + (i32 + (EXTRACT_SUBREG + (i64 + (LSRd_ri + (i64 + (MPYU64_acc + (i64 + (MPYU64_acc + (i64 + (COMBINE_rr (TFRI 0), + (i32 + (EXTRACT_SUBREG + (i64 + (LSRd_ri + (i64 + (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), + subreg_loreg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), + subreg_loreg)))), 32)), + subreg_loreg)))), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_loreg)))), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg)))), + 32)), subreg_loreg)))), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg))))>; // Multiply 64-bit signed and use upper result. -def : Pat <(mulhs DoubleRegs:$src1, DoubleRegs:$src2), - (MPY64_acc(COMBINE_rr (TFRI 0), - (EXTRACT_SUBREG - (LSRd_ri(MPY64_acc(MPY64_acc(COMBINE_rr (TFRI 0), - (EXTRACT_SUBREG (LSRd_ri(MPYU64 - (EXTRACT_SUBREG DoubleRegs:$src1, - subreg_loreg), - (EXTRACT_SUBREG DoubleRegs:$src2, - subreg_loreg)), - 32) ,subreg_loreg)), - (EXTRACT_SUBREG DoubleRegs:$src1, - subreg_hireg), - (EXTRACT_SUBREG DoubleRegs:$src2, - subreg_loreg)), - (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg), - (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg)), - 32),subreg_loreg)), - (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg), - (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg) - )>; +def : Pat <(mulhs (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)), + (i64 + (MPY64_acc + (i64 + (COMBINE_rr (TFRI 0), + (i32 + (EXTRACT_SUBREG + (i64 + (LSRd_ri + (i64 + (MPY64_acc + (i64 + (MPY64_acc + (i64 + (COMBINE_rr (TFRI 0), + (i32 + (EXTRACT_SUBREG + (i64 + (LSRd_ri + (i64 + (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), + subreg_loreg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), + subreg_loreg)))), 32)), + subreg_loreg)))), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_loreg)))), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg)))), + 32)), subreg_loreg)))), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg))))>; // Hexagon specific ISD nodes. 
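[Editorial aside] Before the target-specific nodes, the mulhu/mulhs expansions above are worth unpacking: they rebuild the upper half of a 128-bit product out of 32-bit partial products (MPYU64 on the lo/hi halves, shifted by 32 and accumulated). Below is a portable model of the same schoolbook decomposition with the carries handled explicitly; it is the standard algorithm, not a literal transcription of the pattern:

    #include <cassert>
    #include <cstdint>

    // High 64 bits of a 64x64 unsigned multiply, from 32-bit halves.
    uint64_t mulhu64(uint64_t a, uint64_t b) {
      uint64_t a0 = (uint32_t)a, a1 = a >> 32;
      uint64_t b0 = (uint32_t)b, b1 = b >> 32;
      uint64_t p00  = a0 * b0;
      uint64_t mid1 = a1 * b0 + (p00 >> 32);     // cannot overflow 64 bits
      uint64_t mid2 = a0 * b1 + (uint32_t)mid1;  // cannot overflow 64 bits
      return a1 * b1 + (mid1 >> 32) + (mid2 >> 32);
    }

    int main() {
      uint64_t a = 0xDEADBEEFCAFEBABEull, b = ~0ull;
      assert(mulhu64(a, b) ==
             (uint64_t)(((unsigned __int128)a * b) >> 64));  // GCC/Clang only
    }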
-def SDTHexagonADJDYNALLOC : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>]>; +//def SDTHexagonADJDYNALLOC : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>]>; +def SDTHexagonADJDYNALLOC : SDTypeProfile<1, 2, + [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; def Hexagon_ADJDYNALLOC : SDNode<"HexagonISD::ADJDYNALLOC", - SDTHexagonADJDYNALLOC>; + SDTHexagonADJDYNALLOC>; // Needed to tag these instructions for stack layout. let usesCustomInserter = 1 in def ADJDYNALLOC : ALU32_ri<(outs IntRegs:$dst), (ins IntRegs:$src1, s16Imm:$src2), "$dst = add($src1, #$src2)", - [(set IntRegs:$dst, (Hexagon_ADJDYNALLOC IntRegs:$src1, - s16ImmPred:$src2))]>; + [(set (i32 IntRegs:$dst), + (Hexagon_ADJDYNALLOC (i32 IntRegs:$src1), + s16ImmPred:$src2))]>; -def SDTHexagonARGEXTEND : SDTypeProfile<1, 1, []>; +def SDTHexagonARGEXTEND : SDTypeProfile<1, 1, [SDTCisVT<0, i32>]>; def Hexagon_ARGEXTEND : SDNode<"HexagonISD::ARGEXTEND", SDTHexagonARGEXTEND>; def ARGEXTEND : ALU32_rr <(outs IntRegs:$dst), (ins IntRegs:$src1), "$dst = $src1", - [(set IntRegs:$dst, (Hexagon_ARGEXTEND IntRegs:$src1))]>; + [(set (i32 IntRegs:$dst), + (Hexagon_ARGEXTEND (i32 IntRegs:$src1)))]>; let AddedComplexity = 100 in -def : Pat<(i32 (sext_inreg (Hexagon_ARGEXTEND IntRegs:$src1), i16)), - (TFR IntRegs:$src1)>; - +def : Pat<(i32 (sext_inreg (Hexagon_ARGEXTEND (i32 IntRegs:$src1)), i16)), + (COPY (i32 IntRegs:$src1))>; def SDHexagonBR_JT: SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; def HexagonBR_JT: SDNode<"HexagonISD::BR_JT", SDHexagonBR_JT, [SDNPHasChain]>; @@ -3024,12 +3436,91 @@ def HexagonBR_JT: SDNode<"HexagonISD::BR_JT", SDHexagonBR_JT, [SDNPHasChain]>; let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in def BR_JT : JRInst<(outs), (ins IntRegs:$src), "jumpr $src", - [(HexagonBR_JT IntRegs:$src)]>; + [(HexagonBR_JT (i32 IntRegs:$src))]>; + def HexagonWrapperJT: SDNode<"HexagonISD::WrapperJT", SDTIntUnaryOp>; def : Pat<(HexagonWrapperJT tjumptable:$dst), - (CONST32_set_jt tjumptable:$dst)>; + (i32 (CONST32_set_jt tjumptable:$dst))>; + +// XTYPE/SHIFT + +// Multi-class for logical operators : +// Shift by immediate/register and accumulate/logical +multiclass xtype_imm<string OpcStr, SDNode OpNode1, SDNode OpNode2> { + def _ri : SInst_acc<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, u5Imm:$src3), + !strconcat("$dst ", !strconcat(OpcStr, "($src2, #$src3)")), + [(set (i32 IntRegs:$dst), + (OpNode2 (i32 IntRegs:$src1), + (OpNode1 (i32 IntRegs:$src2), + u5ImmPred:$src3)))], + "$src1 = $dst">; + def d_ri : SInst_acc<(outs DoubleRegs:$dst), + (ins DoubleRegs:$src1, DoubleRegs:$src2, u6Imm:$src3), + !strconcat("$dst ", !strconcat(OpcStr, "($src2, #$src3)")), + [(set (i64 DoubleRegs:$dst), (OpNode2 (i64 DoubleRegs:$src1), + (OpNode1 (i64 DoubleRegs:$src2), u6ImmPred:$src3)))], + "$src1 = $dst">; +} + +// Multi-class for logical operators : +// Shift by register and accumulate/logical (32/64 bits) +multiclass xtype_reg<string OpcStr, SDNode OpNode1, SDNode OpNode2> { + def _rr : SInst_acc<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + !strconcat("$dst ", !strconcat(OpcStr, "($src2, $src3)")), + [(set (i32 IntRegs:$dst), + (OpNode2 (i32 IntRegs:$src1), + (OpNode1 (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], + "$src1 = $dst">; + + def d_rr : SInst_acc<(outs DoubleRegs:$dst), + (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + !strconcat("$dst ", !strconcat(OpcStr, "($src2, $src3)")), + [(set (i64 DoubleRegs:$dst), + (OpNode2 (i64 DoubleRegs:$src1), + (OpNode1 (i64 DoubleRegs:$src2), + (i32 
IntRegs:$src3))))], + "$src1 = $dst">; + +} + +multiclass basic_xtype_imm<string OpcStr, SDNode OpNode> { +let AddedComplexity = 100 in + defm _ADD : xtype_imm< !strconcat("+= ", OpcStr), OpNode, add>; + defm _SUB : xtype_imm< !strconcat("-= ", OpcStr), OpNode, sub>; + defm _AND : xtype_imm< !strconcat("&= ", OpcStr), OpNode, and>; + defm _OR : xtype_imm< !strconcat("|= ", OpcStr), OpNode, or>; +} + +multiclass basic_xtype_reg<string OpcStr, SDNode OpNode> { +let AddedComplexity = 100 in + defm _ADD : xtype_reg< !strconcat("+= ", OpcStr), OpNode, add>; + defm _SUB : xtype_reg< !strconcat("-= ", OpcStr), OpNode, sub>; + defm _AND : xtype_reg< !strconcat("&= ", OpcStr), OpNode, and>; + defm _OR : xtype_reg< !strconcat("|= ", OpcStr), OpNode, or>; +} + +multiclass xtype_xor_imm<string OpcStr, SDNode OpNode> { +let AddedComplexity = 100 in + defm _XOR : xtype_imm< !strconcat("^= ", OpcStr), OpNode, xor>; +} + +defm ASL : basic_xtype_imm<"asl", shl>, basic_xtype_reg<"asl", shl>, + xtype_xor_imm<"asl", shl>; + +defm LSR : basic_xtype_imm<"lsr", srl>, basic_xtype_reg<"lsr", srl>, + xtype_xor_imm<"lsr", srl>; + +defm ASR : basic_xtype_imm<"asr", sra>, basic_xtype_reg<"asr", sra>; +defm LSL : basic_xtype_reg<"lsl", shl>; + +// Change the sign of the immediate for Rd=-mpyi(Rs,#u8) +def : Pat <(mul (i32 IntRegs:$src1), (ineg n8ImmPred:$src2)), + (i32 (MPYI_rin (i32 IntRegs:$src1), u8ImmPred:$src2))>; //===----------------------------------------------------------------------===// // V3 Instructions + @@ -3046,3 +3537,19 @@ include "HexagonInstrInfoV3.td" //===----------------------------------------------------------------------===// include "HexagonInstrInfoV4.td" + +//===----------------------------------------------------------------------===// +// V4 Instructions - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// V5 Instructions + +//===----------------------------------------------------------------------===// + +include "HexagonInstrInfoV5.td" + +//===----------------------------------------------------------------------===// +// V5 Instructions - +//===----------------------------------------------------------------------===// + + diff --git a/lib/Target/Hexagon/HexagonInstrInfoV3.td b/lib/Target/Hexagon/HexagonInstrInfoV3.td index a73897e..157ab3d 100644 --- a/lib/Target/Hexagon/HexagonInstrInfoV3.td +++ b/lib/Target/Hexagon/HexagonInstrInfoV3.td @@ -19,7 +19,7 @@ let isCall = 1, neverHasSideEffects = 1, Defs = [D0, D1, D2, D3, D4, D5, D6, D7, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in { - def CALLv3 : JInst<(outs), (ins calltarget:$dst, variable_ops), + def CALLv3 : JInst<(outs), (ins calltarget:$dst), "call $dst", []>, Requires<[HasV3T]>; } @@ -35,16 +35,17 @@ let isCall = 1, neverHasSideEffects = 1, let isCall = 1, neverHasSideEffects = 1, Defs = [D0, D1, D2, D3, D4, D5, D6, D7, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in { - def CALLRv3 : JRInst<(outs), (ins IntRegs:$dst, variable_ops), + def CALLRv3 : JRInst<(outs), (ins IntRegs:$dst), "callr $dst", []>, Requires<[HasV3TOnly]>; } +// Jump to address from register // if(p?.new) jumpr:t r? 
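[Editorial aside] The renames that follow bring these V3 return-jumps in line with the cdn naming used elsewhere in this change: the cdn*/cdnNot* suffixes mark forms predicated on a dot-new predicate value (the "$src1.new" in the asm strings), and the :t/:nt suffixes carry the taken/not-taken hint. Value-wise the instruction is just a guarded indirect branch; a hedged C++ sketch (names are illustrative):

    #include <cstdint>

    // "if ($src1.new) jumpr:t $src2": transfer control to the register
    // target only when the just-produced predicate holds.
    uintptr_t jumpr_cdnPt(bool p_new, uintptr_t rs2, uintptr_t fallthrough) {
      return p_new ? rs2 : fallthrough;
    }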
let isReturn = 1, isTerminator = 1, isBarrier = 1, Defs = [PC], Uses = [R31] in { - def JMPR_cPnewt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2), + def JMPR_cdnPt_V3: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2), "if ($src1.new) jumpr:t $src2", []>, Requires<[HasV3T]>; } @@ -52,7 +53,7 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, // if (!p?.new) jumpr:t r? let isReturn = 1, isTerminator = 1, isBarrier = 1, Defs = [PC], Uses = [R31] in { - def JMPR_cNotPnewt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2), + def JMPR_cdnNotPt_V3: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2), "if (!$src1.new) jumpr:t $src2", []>, Requires<[HasV3T]>; } @@ -61,7 +62,7 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, // if(p?.new) jumpr:nt r? let isReturn = 1, isTerminator = 1, isBarrier = 1, Defs = [PC], Uses = [R31] in { - def JMPR_cPnewNt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2), + def JMPR_cdnPnt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2), "if ($src1.new) jumpr:nt $src2", []>, Requires<[HasV3T]>; } @@ -69,7 +70,7 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, // if (!p?.new) jumpr:nt r? let isReturn = 1, isTerminator = 1, isBarrier = 1, Defs = [PC], Uses = [R31] in { - def JMPR_cNotPnewNt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2), + def JMPR_cdnNotPnt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2), "if (!$src1.new) jumpr:nt $src2", []>, Requires<[HasV3T]>; } @@ -86,20 +87,22 @@ let AddedComplexity = 200 in def MAXw_dd : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), "$dst = max($src2, $src1)", - [(set DoubleRegs:$dst, (select (i1 (setlt DoubleRegs:$src2, - DoubleRegs:$src1)), - DoubleRegs:$src1, - DoubleRegs:$src2))]>, + [(set (i64 DoubleRegs:$dst), + (i64 (select (i1 (setlt (i64 DoubleRegs:$src2), + (i64 DoubleRegs:$src1))), + (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2))))]>, Requires<[HasV3T]>; let AddedComplexity = 200 in def MINw_dd : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), "$dst = min($src2, $src1)", - [(set DoubleRegs:$dst, (select (i1 (setgt DoubleRegs:$src2, - DoubleRegs:$src1)), - DoubleRegs:$src1, - DoubleRegs:$src2))]>, + [(set (i64 DoubleRegs:$dst), + (i64 (select (i1 (setgt (i64 DoubleRegs:$src2), + (i64 DoubleRegs:$src1))), + (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2))))]>, Requires<[HasV3T]>; //===----------------------------------------------------------------------===// @@ -109,25 +112,25 @@ Requires<[HasV3T]>; -//def : Pat <(brcond (i1 (seteq IntRegs:$src1, 0)), bb:$offset), -// (JMP_RegEzt IntRegs:$src1, bb:$offset)>, Requires<[HasV3T]>; +//def : Pat <(brcond (i1 (seteq (i32 IntRegs:$src1), 0)), bb:$offset), +// (JMP_RegEzt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>; -//def : Pat <(brcond (i1 (setne IntRegs:$src1, 0)), bb:$offset), -// (JMP_RegNzt IntRegs:$src1, bb:$offset)>, Requires<[HasV3T]>; +//def : Pat <(brcond (i1 (setne (i32 IntRegs:$src1), 0)), bb:$offset), +// (JMP_RegNzt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>; -//def : Pat <(brcond (i1 (setle IntRegs:$src1, 0)), bb:$offset), -// (JMP_RegLezt IntRegs:$src1, bb:$offset)>, Requires<[HasV3T]>; +//def : Pat <(brcond (i1 (setle (i32 IntRegs:$src1), 0)), bb:$offset), +// (JMP_RegLezt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>; -//def : Pat <(brcond (i1 (setge IntRegs:$src1, 0)), bb:$offset), -// (JMP_RegGezt IntRegs:$src1, bb:$offset)>, Requires<[HasV3T]>; +//def : Pat <(brcond (i1 (setge (i32 IntRegs:$src1), 0)), bb:$offset), +// (JMP_RegGezt (i32 
IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>; -//def : Pat <(brcond (i1 (setgt IntRegs:$src1, -1)), bb:$offset), -// (JMP_RegGezt IntRegs:$src1, bb:$offset)>, Requires<[HasV3T]>; +//def : Pat <(brcond (i1 (setgt (i32 IntRegs:$src1), -1)), bb:$offset), +// (JMP_RegGezt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>; // Map call instruction -def : Pat<(call IntRegs:$dst), - (CALLRv3 IntRegs:$dst)>, Requires<[HasV3T]>; +def : Pat<(call (i32 IntRegs:$dst)), + (CALLRv3 (i32 IntRegs:$dst))>, Requires<[HasV3T]>; def : Pat<(call tglobaladdr:$dst), (CALLv3 tglobaladdr:$dst)>, Requires<[HasV3T]>; def : Pat<(call texternalsym:$dst), diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td index 9e60cf2..70448fc 100644 --- a/lib/Target/Hexagon/HexagonInstrInfoV4.td +++ b/lib/Target/Hexagon/HexagonInstrInfoV4.td @@ -11,6 +11,12 @@ // //===----------------------------------------------------------------------===// +let neverHasSideEffects = 1 in +def IMMEXT : Immext<(outs), (ins), + "/* immext #... */", + []>, + Requires<[HasV4T]>; + // Hexagon V4 Architecture spec defines 8 instruction classes: // LD ST ALU32 XTYPE J JR MEMOP NV CR SYSTEM(system is not implemented in the // compiler) @@ -250,23 +256,151 @@ def ZXTH_cdnNotPt_V4 : ALU32_rr<(outs IntRegs:$dst), []>, Requires<[HasV4T]>; +// Generate frame index addresses. +let neverHasSideEffects = 1, isReMaterializable = 1 in +def TFR_FI_immext_V4 : ALU32_ri<(outs IntRegs:$dst), + (ins IntRegs:$src1, s32Imm:$offset), + "$dst = add($src1, ##$offset)", + []>, + Requires<[HasV4T]>; + //===----------------------------------------------------------------------===// // ALU32 - //===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// ALU32/PERM + +//===----------------------------------------------------------------------===// + +// Combine +// Rdd=combine(Rs, #s8) +let neverHasSideEffects = 1 in +def COMBINE_ri_V4 : ALU32_ri<(outs DoubleRegs:$dst), + (ins IntRegs:$src1, s8Imm:$src2), + "$dst = combine($src1, #$src2)", + []>, + Requires<[HasV4T]>; +// Rdd=combine(#s8, Rs) +let neverHasSideEffects = 1 in +def COMBINE_ir_V4 : ALU32_ir<(outs DoubleRegs:$dst), + (ins s8Imm:$src1, IntRegs:$src2), + "$dst = combine(#$src1, $src2)", + []>, + Requires<[HasV4T]>; +//===----------------------------------------------------------------------===// +// ALU32/PERM + +//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // LD + //===----------------------------------------------------------------------===// -/// -/// Make sure that in post increment load, the first operand is always the post -/// increment operand. -/// -//// Load doubleword. -// Rdd=memd(Re=#U6) +// +// These absolute set addressing mode instructions accept immediate as +// an operand. We have duplicated these patterns to take global address. 
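[Editorial aside] The asm strings make the contract visible: each of these loads produces two results, the loaded value in $dst1 and the absolute effective address latched into $dst2. A hedged C++ model of "Rd = memw(Re = #addr)" (struct and function names are illustrative):

    #include <cstdint>

    struct AbsSetLoadW {
      uint32_t rd;         // the loaded word
      const uint32_t *re;  // the address captured by the addressing mode
    };

    // "Rd = memw(Re = #addr)": load from an absolute address and also
    // leave that address behind in a register for later use.
    AbsSetLoadW memw_abs_set(const uint32_t *abs_addr) {
      return { *abs_addr, abs_addr };
    }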
+ +let neverHasSideEffects = 1 in +def LDrid_abs_setimm_V4 : LDInst2<(outs DoubleRegs:$dst1, IntRegs:$dst2), + (ins u6Imm:$addr), + "$dst1 = memd($dst2=#$addr)", + []>, + Requires<[HasV4T]>; + +// Rd=memb(Re=#U6) +let neverHasSideEffects = 1 in +def LDrib_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins u6Imm:$addr), + "$dst1 = memb($dst2=#$addr)", + []>, + Requires<[HasV4T]>; + +// Rd=memh(Re=#U6) +let neverHasSideEffects = 1 in +def LDrih_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins u6Imm:$addr), + "$dst1 = memh($dst2=#$addr)", + []>, + Requires<[HasV4T]>; + +// Rd=memub(Re=#U6) +let neverHasSideEffects = 1 in +def LDriub_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins u6Imm:$addr), + "$dst1 = memub($dst2=#$addr)", + []>, + Requires<[HasV4T]>; + +// Rd=memuh(Re=#U6) +let neverHasSideEffects = 1 in +def LDriuh_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins u6Imm:$addr), + "$dst1 = memuh($dst2=#$addr)", + []>, + Requires<[HasV4T]>; + +// Rd=memw(Re=#U6) +let neverHasSideEffects = 1 in +def LDriw_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins u6Imm:$addr), + "$dst1 = memw($dst2=#$addr)", + []>, + Requires<[HasV4T]>; + +// Following patterns are defined for absolute set addressing mode +// instruction which take global address as operand. +let neverHasSideEffects = 1 in +def LDrid_abs_set_V4 : LDInst2<(outs DoubleRegs:$dst1, IntRegs:$dst2), + (ins globaladdress:$addr), + "$dst1 = memd($dst2=##$addr)", + []>, + Requires<[HasV4T]>; + +// Rd=memb(Re=#U6) +let neverHasSideEffects = 1 in +def LDrib_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins globaladdress:$addr), + "$dst1 = memb($dst2=##$addr)", + []>, + Requires<[HasV4T]>; +// Rd=memh(Re=#U6) +let neverHasSideEffects = 1 in +def LDrih_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins globaladdress:$addr), + "$dst1 = memh($dst2=##$addr)", + []>, + Requires<[HasV4T]>; + +// Rd=memub(Re=#U6) +let neverHasSideEffects = 1 in +def LDriub_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins globaladdress:$addr), + "$dst1 = memub($dst2=##$addr)", + []>, + Requires<[HasV4T]>; + +// Rd=memuh(Re=#U6) +let neverHasSideEffects = 1 in +def LDriuh_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins globaladdress:$addr), + "$dst1 = memuh($dst2=##$addr)", + []>, + Requires<[HasV4T]>; + +// Rd=memw(Re=#U6) +let neverHasSideEffects = 1 in +def LDriw_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins globaladdress:$addr), + "$dst1 = memw($dst2=##$addr)", + []>, + Requires<[HasV4T]>; + +// Load doubleword. +// +// Make sure that in post increment load, the first operand is always the post +// increment operand. +// // Rdd=memd(Rs+Rt<<#u2) // Special case pattern for indexed load without offset which is easier to // match. 
AddedComplexity of this pattern should be lower than base+offset load @@ -276,56 +410,58 @@ let AddedComplexity = 10, isPredicable = 1 in def LDrid_indexed_V4 : LDInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst=memd($src1+$src2<<#0)", - [(set DoubleRegs:$dst, (load (add IntRegs:$src1, - IntRegs:$src2)))]>, + [(set (i64 DoubleRegs:$dst), + (i64 (load (add (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))))]>, Requires<[HasV4T]>; let AddedComplexity = 40, isPredicable = 1 in def LDrid_indexed_shl_V4 : LDInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), "$dst=memd($src1+$src2<<#$offset)", - [(set DoubleRegs:$dst, (load (add IntRegs:$src1, - (shl IntRegs:$src2, - u2ImmPred:$offset))))]>, + [(set (i64 DoubleRegs:$dst), + (i64 (load (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u2ImmPred:$offset)))))]>, Requires<[HasV4T]>; //// Load doubleword conditionally. // if ([!]Pv[.new]) Rd=memd(Rs+Rt<<#u2) // if (Pv) Rd=memd(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrid_indexed_cPt_V4 : LDInst<(outs DoubleRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrid_indexed_cPt_V4 : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1) $dst=memd($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv.new) Rd=memd(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrid_indexed_cdnPt_V4 : LDInst<(outs DoubleRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrid_indexed_cdnPt_V4 : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1.new) $dst=memd($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv) Rd=memd(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrid_indexed_cNotPt_V4 : LDInst<(outs DoubleRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrid_indexed_cNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1) $dst=memd($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv.new) Rd=memd(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrid_indexed_cdnNotPt_V4 : LDInst<(outs DoubleRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrid_indexed_cdnNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1.new) $dst=memd($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv) Rd=memd(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDrid_indexed_shl_cPt_V4 : LDInst<(outs DoubleRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrid_indexed_shl_cPt_V4 : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1) $dst=memd($src2+$src3<<#$offset)", @@ -333,8 +469,8 @@ def LDrid_indexed_shl_cPt_V4 : LDInst<(outs DoubleRegs:$dst), Requires<[HasV4T]>; // if (Pv.new) Rd=memd(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDrid_indexed_shl_cdnPt_V4 : LDInst<(outs DoubleRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrid_indexed_shl_cdnPt_V4 : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1.new) $dst=memd($src2+$src3<<#$offset)", @@ -342,8 +478,8 @@ def LDrid_indexed_shl_cdnPt_V4 : LDInst<(outs DoubleRegs:$dst), Requires<[HasV4T]>; // if (!Pv) Rd=memd(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, 
isPredicated = 1 in -def LDrid_indexed_shl_cNotPt_V4 : LDInst<(outs DoubleRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrid_indexed_shl_cNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1) $dst=memd($src2+$src3<<#$offset)", @@ -351,8 +487,8 @@ def LDrid_indexed_shl_cNotPt_V4 : LDInst<(outs DoubleRegs:$dst), Requires<[HasV4T]>; // if (!Pv.new) Rd=memd(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDrid_indexed_shl_cdnNotPt_V4 : LDInst<(outs DoubleRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrid_indexed_shl_cdnNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1.new) $dst=memd($src2+$src3<<#$offset)", @@ -362,99 +498,101 @@ def LDrid_indexed_shl_cdnNotPt_V4 : LDInst<(outs DoubleRegs:$dst), // Rdd=memd(Rt<<#u2+#U6) //// Load byte. -// Rd=memb(Re=#U6) - // Rd=memb(Rs+Rt<<#u2) let AddedComplexity = 10, isPredicable = 1 in def LDrib_indexed_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst=memb($src1+$src2<<#0)", - [(set IntRegs:$dst, (sextloadi8 (add IntRegs:$src1, - IntRegs:$src2)))]>, + [(set (i32 IntRegs:$dst), + (i32 (sextloadi8 (add (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))))]>, Requires<[HasV4T]>; let AddedComplexity = 10, isPredicable = 1 in def LDriub_indexed_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst=memub($src1+$src2<<#0)", - [(set IntRegs:$dst, (zextloadi8 (add IntRegs:$src1, - IntRegs:$src2)))]>, + [(set (i32 IntRegs:$dst), + (i32 (zextloadi8 (add (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))))]>, Requires<[HasV4T]>; let AddedComplexity = 10, isPredicable = 1 in def LDriub_ae_indexed_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst=memub($src1+$src2<<#0)", - [(set IntRegs:$dst, (extloadi8 (add IntRegs:$src1, - IntRegs:$src2)))]>, + [(set (i32 IntRegs:$dst), + (i32 (extloadi8 (add (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))))]>, Requires<[HasV4T]>; let AddedComplexity = 40, isPredicable = 1 in def LDrib_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), "$dst=memb($src1+$src2<<#$offset)", - [(set IntRegs:$dst, - (sextloadi8 (add IntRegs:$src1, - (shl IntRegs:$src2, - u2ImmPred:$offset))))]>, + [(set (i32 IntRegs:$dst), + (i32 (sextloadi8 (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u2ImmPred:$offset)))))]>, Requires<[HasV4T]>; let AddedComplexity = 40, isPredicable = 1 in def LDriub_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), "$dst=memub($src1+$src2<<#$offset)", - [(set IntRegs:$dst, - (zextloadi8 (add IntRegs:$src1, - (shl IntRegs:$src2, - u2ImmPred:$offset))))]>, + [(set (i32 IntRegs:$dst), + (i32 (zextloadi8 (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u2ImmPred:$offset)))))]>, Requires<[HasV4T]>; let AddedComplexity = 40, isPredicable = 1 in def LDriub_ae_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), "$dst=memub($src1+$src2<<#$offset)", - [(set IntRegs:$dst, (extloadi8 (add IntRegs:$src1, - (shl IntRegs:$src2, - u2ImmPred:$offset))))]>, + [(set (i32 IntRegs:$dst), + (i32 (extloadi8 (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u2ImmPred:$offset)))))]>, Requires<[HasV4T]>; //// Load byte conditionally. 
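[Editorial aside] Throughout these conditional forms the destination is written only when the predicate (or its dot-new value) holds, so the register keeps its prior contents on a false predicate; note also that this revision drops the explicit mayLoad = 1 and moves the defs from LDInst to LDInst2. The intended semantics, modeled in plain C++ (names illustrative):

    #include <cstdint>

    // "if ($src1) $dst = memb($src2+$src3<<#0)": no write-back when the
    // predicate is false, so the old $dst value survives.
    int32_t ldrib_indexed_cPt(bool pv, int32_t old_dst,
                              const int8_t *rs2, int32_t rs3) {
      return pv ? (int32_t)rs2[rs3] : old_dst;
    }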
// if ([!]Pv[.new]) Rd=memb(Rs+Rt<<#u2) // if (Pv) Rd=memb(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrib_indexed_cPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrib_indexed_cPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1) $dst=memb($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv.new) Rd=memb(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrib_indexed_cdnPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrib_indexed_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1.new) $dst=memb($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv) Rd=memb(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrib_indexed_cNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrib_indexed_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1) $dst=memb($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv.new) Rd=memb(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrib_indexed_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrib_indexed_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1.new) $dst=memb($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv) Rd=memb(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDrib_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrib_indexed_shl_cPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1) $dst=memb($src2+$src3<<#$offset)", @@ -462,8 +600,8 @@ def LDrib_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (Pv.new) Rd=memb(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDrib_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrib_indexed_shl_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1.new) $dst=memb($src2+$src3<<#$offset)", @@ -471,8 +609,8 @@ def LDrib_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv) Rd=memb(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDrib_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrib_indexed_shl_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1) $dst=memb($src2+$src3<<#$offset)", @@ -480,8 +618,8 @@ def LDrib_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv.new) Rd=memb(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDrib_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrib_indexed_shl_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1.new) $dst=memb($src2+$src3<<#$offset)", @@ -491,40 +629,40 @@ def LDrib_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), //// Load unsigned byte conditionally. 
// if ([!]Pv[.new]) Rd=memub(Rs+Rt<<#u2) // if (Pv) Rd=memub(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriub_indexed_cPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriub_indexed_cPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1) $dst=memub($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv.new) Rd=memub(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriub_indexed_cdnPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriub_indexed_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1.new) $dst=memub($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv) Rd=memub(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriub_indexed_cNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriub_indexed_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1) $dst=memub($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv.new) Rd=memub(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriub_indexed_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriub_indexed_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1.new) $dst=memub($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv) Rd=memub(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriub_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriub_indexed_shl_cPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1) $dst=memub($src2+$src3<<#$offset)", @@ -532,8 +670,8 @@ def LDriub_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (Pv.new) Rd=memub(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriub_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriub_indexed_shl_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1.new) $dst=memub($src2+$src3<<#$offset)", @@ -541,8 +679,8 @@ def LDriub_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv) Rd=memub(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriub_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriub_indexed_shl_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1) $dst=memub($src2+$src3<<#$offset)", @@ -550,8 +688,8 @@ def LDriub_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv.new) Rd=memub(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriub_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriub_indexed_shl_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1.new) $dst=memub($src2+$src3<<#$offset)", @@ -561,31 +699,32 @@ def LDriub_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), // Rd=memb(Rt<<#u2+#U6) //// Load 
halfword -// Rd=memh(Re=#U6) - // Rd=memh(Rs+Rt<<#u2) let AddedComplexity = 10, isPredicable = 1 in def LDrih_indexed_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst=memh($src1+$src2<<#0)", - [(set IntRegs:$dst, (sextloadi16 (add IntRegs:$src1, - IntRegs:$src2)))]>, + [(set (i32 IntRegs:$dst), + (i32 (sextloadi16 (add (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))))]>, Requires<[HasV4T]>; let AddedComplexity = 10, isPredicable = 1 in def LDriuh_indexed_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst=memuh($src1+$src2<<#0)", - [(set IntRegs:$dst, (zextloadi16 (add IntRegs:$src1, - IntRegs:$src2)))]>, + [(set (i32 IntRegs:$dst), + (i32 (zextloadi16 (add (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))))]>, Requires<[HasV4T]>; let AddedComplexity = 10, isPredicable = 1 in def LDriuh_ae_indexed_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst=memuh($src1+$src2<<#0)", - [(set IntRegs:$dst, (extloadi16 (add IntRegs:$src1, - IntRegs:$src2)))]>, + [(set (i32 IntRegs:$dst), + (i32 (extloadi16 (add (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))))]>, Requires<[HasV4T]>; // Rd=memh(Rs+Rt<<#u2) @@ -593,69 +732,69 @@ let AddedComplexity = 40, isPredicable = 1 in def LDrih_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), "$dst=memh($src1+$src2<<#$offset)", - [(set IntRegs:$dst, - (sextloadi16 (add IntRegs:$src1, - (shl IntRegs:$src2, - u2ImmPred:$offset))))]>, + [(set (i32 IntRegs:$dst), + (i32 (sextloadi16 (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u2ImmPred:$offset)))))]>, Requires<[HasV4T]>; let AddedComplexity = 40, isPredicable = 1 in def LDriuh_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), "$dst=memuh($src1+$src2<<#$offset)", - [(set IntRegs:$dst, - (zextloadi16 (add IntRegs:$src1, - (shl IntRegs:$src2, - u2ImmPred:$offset))))]>, + [(set (i32 IntRegs:$dst), + (i32 (zextloadi16 (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u2ImmPred:$offset)))))]>, Requires<[HasV4T]>; let AddedComplexity = 40, isPredicable = 1 in def LDriuh_ae_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), "$dst=memuh($src1+$src2<<#$offset)", - [(set IntRegs:$dst, - (extloadi16 (add IntRegs:$src1, - (shl IntRegs:$src2, - u2ImmPred:$offset))))]>, + [(set (i32 IntRegs:$dst), + (i32 (extloadi16 (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u2ImmPred:$offset)))))]>, Requires<[HasV4T]>; //// Load halfword conditionally. 
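[Editorial aside] Each halfword access above comes in three pattern flavors: memh matches sextloadi16, memuh matches zextloadi16, and the _ae variant reuses memuh for extloadi16, where the upper bits are unspecified so the zero-extending form is a valid choice. In C++ terms (a sketch):

    #include <cstdint>

    // memh: sign-extending halfword load (sextloadi16).
    int32_t ldrih(const int16_t *p)   { return (int32_t)*p; }

    // memuh: zero-extending halfword load (zextloadi16, also extloadi16).
    int32_t ldriuh(const uint16_t *p) { return (int32_t)*p; }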
// if ([!]Pv[.new]) Rd=memh(Rs+Rt<<#u2) // if (Pv) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrih_indexed_cPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrih_indexed_cPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1) $dst=memh($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv.new) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrih_indexed_cdnPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrih_indexed_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1.new) $dst=memh($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrih_indexed_cNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrih_indexed_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1) $dst=memh($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv.new) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrih_indexed_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrih_indexed_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1.new) $dst=memh($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDrih_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrih_indexed_shl_cPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1) $dst=memh($src2+$src3<<#$offset)", @@ -663,8 +802,8 @@ def LDrih_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (Pv.new) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDrih_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrih_indexed_shl_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1.new) $dst=memh($src2+$src3<<#$offset)", @@ -672,8 +811,8 @@ def LDrih_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDrih_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrih_indexed_shl_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1) $dst=memh($src2+$src3<<#$offset)", @@ -681,8 +820,8 @@ def LDrih_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv.new) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDrih_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrih_indexed_shl_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1.new) $dst=memh($src2+$src3<<#$offset)", @@ -692,40 +831,40 @@ def LDrih_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), //// Load unsigned halfword conditionally. 
// if ([!]Pv[.new]) Rd=memuh(Rs+Rt<<#u2) // if (Pv) Rd=memuh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriuh_indexed_cPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriuh_indexed_cPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1) $dst=memuh($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv.new) Rd=memuh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriuh_indexed_cdnPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriuh_indexed_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1.new) $dst=memuh($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv) Rd=memuh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriuh_indexed_cNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriuh_indexed_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1) $dst=memuh($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv.new) Rd=memuh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriuh_indexed_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriuh_indexed_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1.new) $dst=memuh($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv) Rd=memuh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriuh_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriuh_indexed_shl_cPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1) $dst=memuh($src2+$src3<<#$offset)", @@ -733,8 +872,8 @@ def LDriuh_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (Pv.new) Rd=memuh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriuh_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriuh_indexed_shl_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1.new) $dst=memuh($src2+$src3<<#$offset)", @@ -742,8 +881,8 @@ def LDriuh_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv) Rd=memuh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriuh_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriuh_indexed_shl_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1) $dst=memuh($src2+$src3<<#$offset)", @@ -751,8 +890,8 @@ def LDriuh_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv.new) Rd=memuh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriuh_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriuh_indexed_shl_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1.new) $dst=memuh($src2+$src3<<#$offset)", @@ -762,6 +901,14 @@ def LDriuh_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), // Rd=memh(Rt<<#u2+#U6) //// Load word. 
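[Editorial aside] The indexed forms compute the effective address as Rs + (Rt << #u2); the AddedComplexity tiers used throughout this section (10 for the <<#0 case, 40 when a shift is folded in, 15/45 for the predicated variants) steer selection toward the more specific shifted pattern when it applies. The addressing arithmetic in plain C++ (the helper name is hypothetical):

    #include <cstdint>
    #include <cstring>

    // "Rd = memw(Rs + Rt<<#u2)": base register plus a scaled index.
    int32_t ldriw_indexed_shl(const uint8_t *rs, uint32_t rt, unsigned u2) {
      int32_t rd;
      std::memcpy(&rd, rs + ((size_t)rt << u2), sizeof rd);
      return rd;
    }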
+// Load predicate: Fix for bug 5279. +let neverHasSideEffects = 1 in +def LDriw_pred_V4 : LDInst2<(outs PredRegs:$dst), + (ins MEMri:$addr), + "Error; should not emit", + []>, + Requires<[HasV4T]>; + // Rd=memw(Re=#U6) // Rd=memw(Rs+Rt<<#u2) @@ -769,8 +916,9 @@ let AddedComplexity = 10, isPredicable = 1 in def LDriw_indexed_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst=memw($src1+$src2<<#0)", - [(set IntRegs:$dst, (load (add IntRegs:$src1, - IntRegs:$src2)))]>, + [(set (i32 IntRegs:$dst), + (i32 (load (add (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))))]>, Requires<[HasV4T]>; // Rd=memw(Rs+Rt<<#u2) @@ -778,48 +926,49 @@ let AddedComplexity = 40, isPredicable = 1 in def LDriw_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), "$dst=memw($src1+$src2<<#$offset)", - [(set IntRegs:$dst, (load (add IntRegs:$src1, - (shl IntRegs:$src2, - u2ImmPred:$offset))))]>, + [(set (i32 IntRegs:$dst), + (i32 (load (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u2ImmPred:$offset)))))]>, Requires<[HasV4T]>; //// Load word conditionally. // if ([!]Pv[.new]) Rd=memw(Rs+Rt<<#u2) // if (Pv) Rd=memw(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriw_indexed_cPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriw_indexed_cPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1) $dst=memw($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv.new) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriw_indexed_cdnPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriw_indexed_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1.new) $dst=memw($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriw_indexed_cNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriw_indexed_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1) $dst=memw($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv.new) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriw_indexed_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriw_indexed_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1.new) $dst=memw($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriw_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriw_indexed_shl_cPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1) $dst=memw($src2+$src3<<#$offset)", @@ -827,8 +976,8 @@ def LDriw_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (Pv.new) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriw_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriw_indexed_shl_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1.new) $dst=memw($src2+$src3<<#$offset)", @@ -836,8 +985,8 @@ def 
LDriw_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriw_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriw_indexed_shl_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1) $dst=memw($src2+$src3<<#$offset)", @@ -845,8 +994,8 @@ def LDriw_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv.new) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriw_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriw_indexed_shl_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1.new) $dst=memw($src2+$src3<<#$offset)", @@ -859,102 +1008,729 @@ def LDriw_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), // Post-inc Load, Predicated, Dot new -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDrid_cdnPt_V4 : LDInstPI<(outs DoubleRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrid_cdnPt_V4 : LDInst2PI<(outs DoubleRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_3Imm:$src3), "if ($src1.new) $dst1 = memd($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDrid_cdnNotPt_V4 : LDInstPI<(outs DoubleRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrid_cdnNotPt_V4 : LDInst2PI<(outs DoubleRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_3Imm:$src3), "if (!$src1.new) $dst1 = memd($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDrib_cdnPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrib_cdnPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), "if ($src1.new) $dst1 = memb($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDrib_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrib_cdnNotPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), "if (!$src1.new) $dst1 = memb($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDrih_cdnPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrih_cdnPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), "if ($src1.new) $dst1 = memh($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDrih_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrih_cdnNotPt_V4 : LDInst2PI<(outs IntRegs:$dst1, 
IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), "if (!$src1.new) $dst1 = memh($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDriub_cdnPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriub_cdnPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), "if ($src1.new) $dst1 = memub($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDriub_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriub_cdnNotPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), "if (!$src1.new) $dst1 = memub($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDriuh_cdnPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriuh_cdnPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), "if ($src1.new) $dst1 = memuh($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDriuh_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriuh_cdnNotPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), "if (!$src1.new) $dst1 = memuh($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDriw_cdnPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriw_cdnPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_2Imm:$src3), "if ($src1.new) $dst1 = memw($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDriw_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriw_cdnNotPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_2Imm:$src3), "if (!$src1.new) $dst1 = memw($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; +/// Load from global offset + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDrid_GP_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins globaladdress:$global, u16Imm:$offset), + "$dst=memd(#$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrid_GP_cPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1) $dst=memd(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrid_GP_cNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1) $dst=memd(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, 
isPredicated = 1 in +def LDrid_GP_cdnPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1.new) $dst=memd(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrid_GP_cdnNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1.new) $dst=memd(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDrib_GP_V4 : LDInst2<(outs IntRegs:$dst), + (ins globaladdress:$global, u16Imm:$offset), + "$dst=memb(#$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1) $dst=memb(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1) $dst=memb(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1.new) $dst=memb(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1.new) $dst=memb(##$global+$offset)", + []>, + Requires<[HasV4T]>; + + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDriub_GP_V4 : LDInst2<(outs IntRegs:$dst), + (ins globaladdress:$global, u16Imm:$offset), + "$dst=memub(#$global+$offset)", + []>, + Requires<[HasV4T]>; + + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1) $dst=memub(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1) $dst=memub(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1.new) $dst=memub(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1.new) $dst=memub(##$global+$offset)", + []>, + Requires<[HasV4T]>; + + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDrih_GP_V4 : LDInst2<(outs IntRegs:$dst), + (ins globaladdress:$global, u16Imm:$offset), + "$dst=memh(#$global+$offset)", + []>, + Requires<[HasV4T]>; + + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1) $dst=memh(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1) $dst=memh(##$global+$offset)", + []>, + 
Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1.new) $dst=memh(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1.new) $dst=memh(##$global+$offset)", + []>, + Requires<[HasV4T]>; + + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDriuh_GP_V4 : LDInst2<(outs IntRegs:$dst), + (ins globaladdress:$global, u16Imm:$offset), + "$dst=memuh(#$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1) $dst=memuh(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1) $dst=memuh(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1.new) $dst=memuh(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1.new) $dst=memuh(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDriw_GP_V4 : LDInst2<(outs IntRegs:$dst), + (ins globaladdress:$global, u16Imm:$offset), + "$dst=memw(#$global+$offset)", + []>, + Requires<[HasV4T]>; + + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1) $dst=memw(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1) $dst=memw(##$global+$offset)", + []>, + Requires<[HasV4T]>; + + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1.new) $dst=memw(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1.new) $dst=memw(##$global+$offset)", + []>, + Requires<[HasV4T]>; + + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDd_GP_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins globaladdress:$global), + "$dst=memd(#$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rtt=memd(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDd_GP_cPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1) $dst=memd(##$global)", + []>, + Requires<[HasV4T]>; + + +// if (!Pv) Rtt=memd(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDd_GP_cNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if 
(!$src1) $dst=memd(##$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rtt=memd(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDd_GP_cdnPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1.new) $dst=memd(##$global)", + []>, + Requires<[HasV4T]>; + + +// if (!Pv) Rtt=memd(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDd_GP_cdnNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if (!$src1.new) $dst=memd(##$global)", + []>, + Requires<[HasV4T]>; + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDb_GP_V4 : LDInst2<(outs IntRegs:$dst), + (ins globaladdress:$global), + "$dst=memb(#$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rt=memb(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDb_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1) $dst=memb(##$global)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rt=memb(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDb_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if (!$src1) $dst=memb(##$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rt=memb(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDb_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1.new) $dst=memb(##$global)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rt=memb(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDb_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if (!$src1.new) $dst=memb(##$global)", + []>, + Requires<[HasV4T]>; + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDub_GP_V4 : LDInst2<(outs IntRegs:$dst), + (ins globaladdress:$global), + "$dst=memub(#$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rt=memub(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDub_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1) $dst=memub(##$global)", + []>, + Requires<[HasV4T]>; + + +// if (!Pv) Rt=memub(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDub_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if (!$src1) $dst=memub(##$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rt=memub(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDub_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1.new) $dst=memub(##$global)", + []>, + Requires<[HasV4T]>; + + +// if (!Pv) Rt=memub(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDub_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if (!$src1.new) $dst=memub(##$global)", + []>, + Requires<[HasV4T]>; + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDh_GP_V4 : LDInst2<(outs IntRegs:$dst), + (ins globaladdress:$global), + "$dst=memh(#$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rt=memh(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDh_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1) $dst=memh(##$global)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rt=memh(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDh_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, 
globaladdress:$global), + "if (!$src1) $dst=memh(##$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rt=memh(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDh_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1.new) $dst=memh(##$global)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rt=memh(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDh_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if (!$src1.new) $dst=memh(##$global)", + []>, + Requires<[HasV4T]>; + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDuh_GP_V4 : LDInst2<(outs IntRegs:$dst), + (ins globaladdress:$global), + "$dst=memuh(#$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rt=memuh(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDuh_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1) $dst=memuh(##$global)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rt=memuh(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDuh_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if (!$src1) $dst=memuh(##$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rt=memuh(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDuh_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1.new) $dst=memuh(##$global)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rt=memuh(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDuh_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if (!$src1.new) $dst=memuh(##$global)", + []>, + Requires<[HasV4T]>; + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDw_GP_V4 : LDInst2<(outs IntRegs:$dst), + (ins globaladdress:$global), + "$dst=memw(#$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rt=memw(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDw_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1) $dst=memw(##$global)", + []>, + Requires<[HasV4T]>; + + +// if (!Pv) Rt=memw(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDw_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if (!$src1) $dst=memw(##$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rt=memw(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDw_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1.new) $dst=memw(##$global)", + []>, + Requires<[HasV4T]>; + + +// if (!Pv) Rt=memw(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDw_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if (!$src1.new) $dst=memw(##$global)", + []>, + Requires<[HasV4T]>; + + + +def : Pat <(atomic_load_64 (HexagonCONST32_GP tglobaladdr:$global)), + (i64 (LDd_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +def : Pat <(atomic_load_32 (HexagonCONST32_GP tglobaladdr:$global)), + (i32 (LDw_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +def : Pat <(atomic_load_16 (HexagonCONST32_GP tglobaladdr:$global)), + (i32 (LDuh_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +def : Pat <(atomic_load_8 (HexagonCONST32_GP tglobaladdr:$global)), + (i32 (LDub_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +// Map from 
load(globaladdress) -> memw(#foo + 0) +let AddedComplexity = 100 in +def : Pat <(i64 (load (HexagonCONST32_GP tglobaladdr:$global))), + (i64 (LDd_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +// Map from Pd = load(globaladdress) -> Rd = memb(globaladdress), Pd = Rd +let AddedComplexity = 100 in +def : Pat <(i1 (load (HexagonCONST32_GP tglobaladdr:$global))), + (i1 (TFR_PdRs (i32 (LDb_GP_V4 tglobaladdr:$global))))>, + Requires<[HasV4T]>; + +// When the Interprocedural Global Variable optimizer realizes that a certain +// global variable takes only two constant values, it shrinks the global to +// a boolean. Catch those loads here in the following 3 patterns. +let AddedComplexity = 100 in +def : Pat <(i32 (extloadi1 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDb_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +let AddedComplexity = 100 in +def : Pat <(i32 (sextloadi1 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDb_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress) -> memb(#foo) +let AddedComplexity = 100 in +def : Pat <(i32 (extloadi8 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDb_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress) -> memb(#foo) +let AddedComplexity = 100 in +def : Pat <(i32 (sextloadi8 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDb_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +let AddedComplexity = 100 in +def : Pat <(i32 (zextloadi1 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDub_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress) -> memub(#foo) +let AddedComplexity = 100 in +def : Pat <(i32 (zextloadi8 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDub_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress) -> memh(#foo) +let AddedComplexity = 100 in +def : Pat <(i32 (extloadi16 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDh_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress) -> memh(#foo) +let AddedComplexity = 100 in +def : Pat <(i32 (sextloadi16 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDh_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress) -> memuh(#foo) +let AddedComplexity = 100 in +def : Pat <(i32 (zextloadi16 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDuh_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress) -> memw(#foo) +let AddedComplexity = 100 in +def : Pat <(i32 (load (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDw_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +def : Pat <(atomic_load_64 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (i64 (LDrid_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + +def : Pat <(atomic_load_32 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (i32 (LDriw_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + +def : Pat <(atomic_load_16 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (i32 (LDriuh_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + +def : Pat <(atomic_load_8 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (i32 (LDriub_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress + x) -> memd(#foo + x) +let AddedComplexity = 100 in +def : Pat <(i64 (load (add 
(HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i64 (LDrid_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress + x) -> memb(#foo + x) +let AddedComplexity = 100 in +def : Pat <(i32 (extloadi8 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDrib_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress + x) -> memb(#foo + x) +let AddedComplexity = 100 in +def : Pat <(i32 (sextloadi8 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDrib_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress + x) -> memub(#foo + x) +let AddedComplexity = 100 in +def : Pat <(i32 (zextloadi8 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDriub_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress + x) -> memuh(#foo + x) +let AddedComplexity = 100 in +def : Pat <(i32 (extloadi16 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDrih_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress + x) -> memh(#foo + x) +let AddedComplexity = 100 in +def : Pat <(i32 (sextloadi16 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDrih_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + + +// Map from load(globaladdress + x) -> memuh(#foo + x) +let AddedComplexity = 100 in +def : Pat <(i32 (zextloadi16 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDriuh_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress + x) -> memw(#foo + x) +let AddedComplexity = 100 in +def : Pat <(i32 (load (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDriw_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + //===----------------------------------------------------------------------===// // LD - @@ -971,18 +1747,70 @@ def POST_LDriw_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), /// last operand. /// -// Store doubleword. 
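Stepping back to the load-from-global block that closed the LD section just above: the *_GP_V4 instruction defs all carry empty pattern lists, and instruction selection is supplied separately by the standalone Pat<> records at AddedComplexity = 100 (the bonus makes the direct #global form win whenever it and a generic register+offset pattern both match). Keeping the patterns apart from the defs lets one machine instruction serve several source shapes (plain, sign- and zero-extending loads), while the predicated variants need no patterns at all, since they are produced by later predication passes rather than by ISel. Folded into the def, the plain-word case would look like the sketch below; the name LDw_GP_fused is hypothetical, and the patch deliberately does not take this form:

// Illustrative fused form of LDw_GP_V4 plus its word-load pattern. The patch
// keeps these separate; this is only to make the division of labor concrete.
let isPredicable = 1 in
def LDw_GP_fused : LDInst2<(outs IntRegs:$dst),
      (ins globaladdress:$global),
      "$dst=memw(#$global)",
      [(set (i32 IntRegs:$dst),
            (i32 (load (HexagonCONST32_GP tglobaladdr:$global))))]>,
      Requires<[HasV4T]>;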
// memd(Re=#U6)=Rtt -// TODO: needs to be implemented +def STrid_abs_setimm_V4 : STInst2<(outs IntRegs:$dst1), + (ins DoubleRegs:$src1, u6Imm:$src2), + "memd($dst1=#$src2) = $src1", + []>, + Requires<[HasV4T]>; + +// memb(Re=#U6)=Rs +def STrib_abs_setimm_V4 : STInst2<(outs IntRegs:$dst1), + (ins IntRegs:$src1, u6Imm:$src2), + "memb($dst1=#$src2) = $src1", + []>, + Requires<[HasV4T]>; + +// memh(Re=#U6)=Rs +def STrih_abs_setimm_V4 : STInst2<(outs IntRegs:$dst1), + (ins IntRegs:$src1, u6Imm:$src2), + "memh($dst1=#$src2) = $src1", + []>, + Requires<[HasV4T]>; + +// memw(Re=#U6)=Rs +def STriw_abs_setimm_V4 : STInst2<(outs IntRegs:$dst1), + (ins IntRegs:$src1, u6Imm:$src2), + "memw($dst1=#$src2) = $src1", + []>, + Requires<[HasV4T]>; + +// memd(Re=#U6)=Rtt +def STrid_abs_set_V4 : STInst2<(outs IntRegs:$dst1), + (ins DoubleRegs:$src1, globaladdress:$src2), + "memd($dst1=##$src2) = $src1", + []>, + Requires<[HasV4T]>; + +// memb(Re=#U6)=Rs +def STrib_abs_set_V4 : STInst2<(outs IntRegs:$dst1), + (ins IntRegs:$src1, globaladdress:$src2), + "memb($dst1=##$src2) = $src1", + []>, + Requires<[HasV4T]>; + +// memh(Re=#U6)=Rs +def STrih_abs_set_V4 : STInst2<(outs IntRegs:$dst1), + (ins IntRegs:$src1, globaladdress:$src2), + "memh($dst1=##$src2) = $src1", + []>, + Requires<[HasV4T]>; + +// memw(Re=#U6)=Rs +def STriw_abs_set_V4 : STInst2<(outs IntRegs:$dst1), + (ins IntRegs:$src1, globaladdress:$src2), + "memw($dst1=##$src2) = $src1", + []>, + Requires<[HasV4T]>; -// memd(Rs+#s11:3)=Rtt // memd(Rs+Ru<<#u2)=Rtt let AddedComplexity = 10, isPredicable = 1 in def STrid_indexed_shl_V4 : STInst<(outs), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, DoubleRegs:$src4), "memd($src1+$src2<<#$src3) = $src4", - [(store DoubleRegs:$src4, (add IntRegs:$src1, - (shl IntRegs:$src2, u2ImmPred:$src3)))]>, + [(store (i64 DoubleRegs:$src4), + (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), u2ImmPred:$src3)))]>, Requires<[HasV4T]>; // memd(Ru<<#u2+#U6)=Rtt @@ -990,9 +1818,9 @@ let AddedComplexity = 10 in def STrid_shl_V4 : STInst<(outs), (ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, DoubleRegs:$src4), "memd($src1<<#$src2+#$src3) = $src4", - [(store DoubleRegs:$src4, (shl IntRegs:$src1, - (add u2ImmPred:$src2, - u6ImmPred:$src3)))]>, + [(store (i64 DoubleRegs:$src4), + (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2), + u6ImmPred:$src3))]>, Requires<[HasV4T]>; // memd(Rx++#s4:3)=Rtt @@ -1009,8 +1837,9 @@ def STrid_shl_V4 : STInst<(outs), // if ([!]Pv[.new]) memd(Rs+#u6:3)=Rtt // if (Pv) memd(Rs+#u6:3)=Rtt // if (Pv.new) memd(Rs+#u6:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_cdnPt_V4 : STInst<(outs), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, DoubleRegs:$src2), "if ($src1.new) memd($addr) = $src2", []>, @@ -1018,8 +1847,9 @@ def STrid_cdnPt_V4 : STInst<(outs), // if (!Pv) memd(Rs+#u6:3)=Rtt // if (!Pv.new) memd(Rs+#u6:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_cdnNotPt_V4 : STInst<(outs), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, DoubleRegs:$src2), "if (!$src1.new) memd($addr) = $src2", []>, @@ -1027,8 +1857,9 @@ def STrid_cdnNotPt_V4 : STInst<(outs), // if (Pv) memd(Rs+#u6:3)=Rtt // if (Pv.new) memd(Rs+#u6:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_indexed_cdnPt_V4 : STInst<(outs), +let 
AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_indexed_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3, DoubleRegs:$src4), "if ($src1.new) memd($src2+#$src3) = $src4", @@ -1037,8 +1868,9 @@ def STrid_indexed_cdnPt_V4 : STInst<(outs), // if (!Pv) memd(Rs+#u6:3)=Rtt // if (!Pv.new) memd(Rs+#u6:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_indexed_cdnNotPt_V4 : STInst<(outs), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_indexed_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3, DoubleRegs:$src4), "if (!$src1.new) memd($src2+#$src3) = $src4", @@ -1047,8 +1879,9 @@ def STrid_indexed_cdnNotPt_V4 : STInst<(outs), // if ([!]Pv[.new]) memd(Rs+Ru<<#u2)=Rtt // if (Pv) memd(Rs+Ru<<#u2)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_indexed_shl_cPt_V4 : STInst<(outs), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_indexed_shl_cPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, DoubleRegs:$src5), "if ($src1) memd($src2+$src3<<#$src4) = $src5", @@ -1056,24 +1889,27 @@ def STrid_indexed_shl_cPt_V4 : STInst<(outs), Requires<[HasV4T]>; // if (Pv.new) memd(Rs+Ru<<#u2)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_indexed_shl_cdnPt_V4 : STInst<(outs), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_indexed_shl_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, DoubleRegs:$src5), - "if ($src1) memd($src2+$src3<<#$src4) = $src5", + "if ($src1.new) memd($src2+$src3<<#$src4) = $src5", []>, Requires<[HasV4T]>; // if (!Pv) memd(Rs+Ru<<#u2)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_indexed_shl_cNotPt_V4 : STInst<(outs), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_indexed_shl_cNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, DoubleRegs:$src5), "if (!$src1) memd($src2+$src3<<#$src4) = $src5", []>, Requires<[HasV4T]>; // if (!Pv.new) memd(Rs+Ru<<#u2)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_indexed_shl_cdnNotPt_V4 : STInst<(outs), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_indexed_shl_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, DoubleRegs:$src5), "if (!$src1.new) memd($src2+$src3<<#$src4) = $src5", @@ -1083,8 +1919,9 @@ def STrid_indexed_shl_cdnNotPt_V4 : STInst<(outs), // if ([!]Pv[.new]) memd(Rx++#s4:3)=Rtt // if (Pv) memd(Rx++#s4:3)=Rtt // if (Pv.new) memd(Rx++#s4:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def POST_STdri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def POST_STdri_cdnPt_V4 : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, s4_3Imm:$offset), "if ($src1.new) memd($src3++#$offset) = $src2", @@ -1094,8 +1931,9 @@ def POST_STdri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst), // if (!Pv) memd(Rx++#s4:3)=Rtt // if (!Pv.new) memd(Rx++#s4:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def POST_STdri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst), +let AddedComplexity = 10, neverHasSideEffects = 1, + 
isPredicated = 1 in +def POST_STdri_cdnNotPt_V4 : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, s4_3Imm:$offset), "if (!$src1.new) memd($src3++#$offset) = $src2", @@ -1105,15 +1943,12 @@ def POST_STdri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst), // Store byte. -// memb(Re=#U6)=Rt -// TODO: needs to be implemented. -// memb(Rs+#s11:0)=Rt // memb(Rs+#u6:0)=#S8 let AddedComplexity = 10, isPredicable = 1 in def STrib_imm_V4 : STInst<(outs), (ins IntRegs:$src1, u6_0Imm:$src2, s8Imm:$src3), "memb($src1+#$src2) = #$src3", - [(truncstorei8 s8ImmPred:$src3, (add IntRegs:$src1, + [(truncstorei8 s8ImmPred:$src3, (add (i32 IntRegs:$src1), u6_0ImmPred:$src2))]>, Requires<[HasV4T]>; @@ -1122,9 +1957,10 @@ let AddedComplexity = 10, isPredicable = 1 in def STrib_indexed_shl_V4 : STInst<(outs), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, IntRegs:$src4), "memb($src1+$src2<<#$src3) = $src4", - [(truncstorei8 IntRegs:$src4, (add IntRegs:$src1, - (shl IntRegs:$src2, - u2ImmPred:$src3)))]>, + [(truncstorei8 (i32 IntRegs:$src4), + (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u2ImmPred:$src3)))]>, Requires<[HasV4T]>; // memb(Ru<<#u2+#U6)=Rt @@ -1132,9 +1968,9 @@ let AddedComplexity = 10 in def STrib_shl_V4 : STInst<(outs), (ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, IntRegs:$src4), "memb($src1<<#$src2+#$src3) = $src4", - [(truncstorei8 IntRegs:$src4, (shl IntRegs:$src1, - (add u2ImmPred:$src2, - u6ImmPred:$src3)))]>, + [(truncstorei8 (i32 IntRegs:$src4), + (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2), + u6ImmPred:$src3))]>, Requires<[HasV4T]>; // memb(Rx++#s4:0:circ(Mu))=Rt @@ -1148,32 +1984,36 @@ def STrib_shl_V4 : STInst<(outs), // if ([!]Pv[.new]) memb(#u6)=Rt // if ([!]Pv[.new]) memb(Rs+#u6:0)=#S6 // if (Pv) memb(Rs+#u6:0)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_imm_cPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrib_imm_cPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, s6Imm:$src4), "if ($src1) memb($src2+#$src3) = #$src4", []>, Requires<[HasV4T]>; // if (Pv.new) memb(Rs+#u6:0)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_imm_cdnPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrib_imm_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, s6Imm:$src4), "if ($src1.new) memb($src2+#$src3) = #$src4", []>, Requires<[HasV4T]>; // if (!Pv) memb(Rs+#u6:0)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_imm_cNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrib_imm_cNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, s6Imm:$src4), "if (!$src1) memb($src2+#$src3) = #$src4", []>, Requires<[HasV4T]>; // if (!Pv.new) memb(Rs+#u6:0)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_imm_cdnNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrib_imm_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, s6Imm:$src4), "if (!$src1.new) memb($src2+#$src3) = #$src4", []>, @@ -1182,8 +2022,9 @@ def STrib_imm_cdnNotPt_V4 : STInst<(outs), // if ([!]Pv[.new]) memb(Rs+#u6:0)=Rt // if (Pv) memb(Rs+#u6:0)=Rt // if (Pv.new) memb(Rs+#u6:0)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_cdnPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrib_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1.new) 
memb($addr) = $src2", []>, @@ -1191,8 +2032,9 @@ def STrib_cdnPt_V4 : STInst<(outs), // if (!Pv) memb(Rs+#u6:0)=Rt // if (!Pv.new) memb(Rs+#u6:0)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_cdnNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrib_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1.new) memb($addr) = $src2", []>, @@ -1201,16 +2043,18 @@ def STrib_cdnNotPt_V4 : STInst<(outs), // if (Pv) memb(Rs+#u6:0)=Rt // if (!Pv) memb(Rs+#u6:0)=Rt // if (Pv.new) memb(Rs+#u6:0)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_indexed_cdnPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrib_indexed_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), "if ($src1.new) memb($src2+#$src3) = $src4", []>, Requires<[HasV4T]>; // if (!Pv.new) memb(Rs+#u6:0)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_indexed_cdnNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrib_indexed_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), "if (!$src1.new) memb($src2+#$src3) = $src4", []>, @@ -1218,8 +2062,9 @@ def STrib_indexed_cdnNotPt_V4 : STInst<(outs), // if ([!]Pv[.new]) memb(Rs+Ru<<#u2)=Rt // if (Pv) memb(Rs+Ru<<#u2)=Rt -let mayStore = 1, AddedComplexity = 10 in -def STrib_indexed_shl_cPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STrib_indexed_shl_cPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if ($src1) memb($src2+$src3<<#$src4) = $src5", @@ -1227,8 +2072,9 @@ def STrib_indexed_shl_cPt_V4 : STInst<(outs), Requires<[HasV4T]>; // if (Pv.new) memb(Rs+Ru<<#u2)=Rt -let mayStore = 1, AddedComplexity = 10 in -def STrib_indexed_shl_cdnPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STrib_indexed_shl_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if ($src1.new) memb($src2+$src3<<#$src4) = $src5", @@ -1236,8 +2082,9 @@ def STrib_indexed_shl_cdnPt_V4 : STInst<(outs), Requires<[HasV4T]>; // if (!Pv) memb(Rs+Ru<<#u2)=Rt -let mayStore = 1, AddedComplexity = 10 in -def STrib_indexed_shl_cNotPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STrib_indexed_shl_cNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if (!$src1) memb($src2+$src3<<#$src4) = $src5", @@ -1245,8 +2092,9 @@ def STrib_indexed_shl_cNotPt_V4 : STInst<(outs), Requires<[HasV4T]>; // if (!Pv.new) memb(Rs+Ru<<#u2)=Rt -let mayStore = 1, AddedComplexity = 10 in -def STrib_indexed_shl_cdnNotPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STrib_indexed_shl_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if (!$src1.new) memb($src2+$src3<<#$src4) = $src5", @@ -1256,8 +2104,9 @@ def STrib_indexed_shl_cdnNotPt_V4 : STInst<(outs), // if ([!]Pv[.new]) memb(Rx++#s4:0)=Rt // if (Pv) memb(Rx++#s4:0)=Rt // if (Pv.new) memb(Rx++#s4:0)=Rt -let mayStore = 1, hasCtrlDep = 1 in -def POST_STbri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, + isPredicated = 1 in +def POST_STbri_cdnPt_V4 : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), "if ($src1.new) memb($src3++#$offset) = $src2", [],"$src3 = 
$dst">, @@ -1265,8 +2114,9 @@ def POST_STbri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst), // if (!Pv) memb(Rx++#s4:0)=Rt // if (!Pv.new) memb(Rx++#s4:0)=Rt -let mayStore = 1, hasCtrlDep = 1 in -def POST_STbri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, + isPredicated = 1 in +def POST_STbri_cdnNotPt_V4 : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), "if (!$src1.new) memb($src3++#$offset) = $src2", [],"$src3 = $dst">, @@ -1274,20 +2124,15 @@ def POST_STbri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst), // Store halfword. -// memh(Re=#U6)=Rt.H -// TODO: needs to be implemented - -// memh(Re=#U6)=Rt // TODO: needs to be implemented - +// memh(Re=#U6)=Rt.H // memh(Rs+#s11:1)=Rt.H -// memh(Rs+#s11:1)=Rt // memh(Rs+#u6:1)=#S8 let AddedComplexity = 10, isPredicable = 1 in def STrih_imm_V4 : STInst<(outs), (ins IntRegs:$src1, u6_1Imm:$src2, s8Imm:$src3), "memh($src1+#$src2) = #$src3", - [(truncstorei16 s8ImmPred:$src3, (add IntRegs:$src1, + [(truncstorei16 s8ImmPred:$src3, (add (i32 IntRegs:$src1), u6_1ImmPred:$src2))]>, Requires<[HasV4T]>; @@ -1299,9 +2144,10 @@ let AddedComplexity = 10, isPredicable = 1 in def STrih_indexed_shl_V4 : STInst<(outs), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, IntRegs:$src4), "memh($src1+$src2<<#$src3) = $src4", - [(truncstorei16 IntRegs:$src4, (add IntRegs:$src1, - (shl IntRegs:$src2, - u2ImmPred:$src3)))]>, + [(truncstorei16 (i32 IntRegs:$src4), + (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u2ImmPred:$src3)))]>, Requires<[HasV4T]>; // memh(Ru<<#u2+#U6)=Rt.H @@ -1310,9 +2156,9 @@ let AddedComplexity = 10 in def STrih_shl_V4 : STInst<(outs), (ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, IntRegs:$src4), "memh($src1<<#$src2+#$src3) = $src4", - [(truncstorei16 IntRegs:$src4, (shl IntRegs:$src1, - (add u2ImmPred:$src2, - u6ImmPred:$src3)))]>, + [(truncstorei16 (i32 IntRegs:$src4), + (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2), + u6ImmPred:$src3))]>, Requires<[HasV4T]>; // memh(Rx++#s4:1:circ(Mu))=Rt.H @@ -1323,42 +2169,42 @@ def STrih_shl_V4 : STInst<(outs), // memh(Rx++Mu)=Rt // memh(Rx++Mu:brev)=Rt.H // memh(Rx++Mu:brev)=Rt -// memh(gp+#u16:1)=Rt.H // memh(gp+#u16:1)=Rt - - -// Store halfword conditionally. 
// if ([!]Pv[.new]) memh(#u6)=Rt.H // if ([!]Pv[.new]) memh(#u6)=Rt // if ([!]Pv[.new]) memh(Rs+#u6:1)=#S6 // if (Pv) memh(Rs+#u6:1)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_imm_cPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrih_imm_cPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, s6Imm:$src4), "if ($src1) memh($src2+#$src3) = #$src4", []>, Requires<[HasV4T]>; // if (Pv.new) memh(Rs+#u6:1)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_imm_cdnPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrih_imm_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, s6Imm:$src4), "if ($src1.new) memh($src2+#$src3) = #$src4", []>, Requires<[HasV4T]>; // if (!Pv) memh(Rs+#u6:1)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_imm_cNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrih_imm_cNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, s6Imm:$src4), "if (!$src1) memh($src2+#$src3) = #$src4", []>, Requires<[HasV4T]>; // if (!Pv.new) memh(Rs+#u6:1)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_imm_cdnNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrih_imm_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, s6Imm:$src4), "if (!$src1.new) memh($src2+#$src3) = #$src4", []>, @@ -1370,8 +2216,9 @@ def STrih_imm_cdnNotPt_V4 : STInst<(outs), // if ([!]Pv[.new]) memh(Rs+#u6:1)=Rt // if (Pv) memh(Rs+#u6:1)=Rt // if (Pv.new) memh(Rs+#u6:1)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_cdnPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrih_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1.new) memh($addr) = $src2", []>, @@ -1379,24 +2226,27 @@ def STrih_cdnPt_V4 : STInst<(outs), // if (!Pv) memh(Rs+#u6:1)=Rt // if (!Pv.new) memh(Rs+#u6:1)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_cdnNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrih_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1.new) memh($addr) = $src2", []>, Requires<[HasV4T]>; // if (Pv.new) memh(Rs+#u6:1)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_indexed_cdnPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrih_indexed_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), "if ($src1.new) memh($src2+#$src3) = $src4", []>, Requires<[HasV4T]>; // if (!Pv.new) memh(Rs+#u6:1)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_indexed_cdnNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrih_indexed_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), "if (!$src1.new) memh($src2+#$src3) = $src4", []>, @@ -1405,8 +2255,9 @@ def STrih_indexed_cdnNotPt_V4 : STInst<(outs), // if ([!]Pv[.new]) memh(Rs+Ru<<#u2)=Rt.H // if ([!]Pv[.new]) memh(Rs+Ru<<#u2)=Rt // if (Pv) memh(Rs+Ru<<#u2)=Rt -let mayStore = 1, AddedComplexity = 10 in -def STrih_indexed_shl_cPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STrih_indexed_shl_cPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if ($src1) memh($src2+$src3<<#$src4) = $src5", @@ -1414,7 +2265,9 
@@ def STrih_indexed_shl_cPt_V4 : STInst<(outs), Requires<[HasV4T]>; // if (Pv.new) memh(Rs+Ru<<#u2)=Rt -def STrih_indexed_shl_cdnPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STrih_indexed_shl_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if ($src1.new) memh($src2+$src3<<#$src4) = $src5", @@ -1422,8 +2275,9 @@ def STrih_indexed_shl_cdnPt_V4 : STInst<(outs), Requires<[HasV4T]>; // if (!Pv) memh(Rs+Ru<<#u2)=Rt -let mayStore = 1, AddedComplexity = 10 in -def STrih_indexed_shl_cNotPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STrih_indexed_shl_cNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if (!$src1) memh($src2+$src3<<#$src4) = $src5", @@ -1431,8 +2285,9 @@ def STrih_indexed_shl_cNotPt_V4 : STInst<(outs), Requires<[HasV4T]>; // if (!Pv.new) memh(Rs+Ru<<#u2)=Rt -let mayStore = 1, AddedComplexity = 10 in -def STrih_indexed_shl_cdnNotPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STrih_indexed_shl_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if (!$src1.new) memh($src2+$src3<<#$src4) = $src5", @@ -1445,8 +2300,9 @@ def STrih_indexed_shl_cdnNotPt_V4 : STInst<(outs), // if ([!]Pv[.new]) memh(Rx++#s4:1)=Rt // if (Pv) memh(Rx++#s4:1)=Rt // if (Pv.new) memh(Rx++#s4:1)=Rt -let mayStore = 1, hasCtrlDep = 1 in -def POST_SThri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, + isPredicated = 1 in +def POST_SThri_cdnPt_V4 : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), "if ($src1.new) memh($src3++#$offset) = $src2", [],"$src3 = $dst">, @@ -1454,8 +2310,9 @@ def POST_SThri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst), // if (!Pv) memh(Rx++#s4:1)=Rt // if (!Pv.new) memh(Rx++#s4:1)=Rt -let mayStore = 1, hasCtrlDep = 1 in -def POST_SThri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, + isPredicated = 1 in +def POST_SThri_cdnNotPt_V4 : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), "if (!$src1.new) memh($src3++#$offset) = $src2", [],"$src3 = $dst">, @@ -1466,13 +2323,22 @@ def POST_SThri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst), // memw(Re=#U6)=Rt // TODO: Needs to be implemented. 
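The "Store predicate" pseudo defined just below, STriw_pred_V4, pairs with LDriw_pred_V4 from the load section above (the "Fix for bug 5279"). Together they give the register allocator a way to spill and reload PredRegs through memory, and the deliberately invalid asm string guards against either pseudo ever reaching the assembly printer; presumably both are expanded into a predicate/register transfer plus an ordinary memw access before emission. Shown side by side for reference, with the same shape as the defs in this patch (only the _sketch names are this note's own):

// The predicate spill/reload pair as this patch defines it (names shortened).
// Both are side-effect free and must be eliminated before instruction
// emission, hence the poison asm string.
let neverHasSideEffects = 1 in {
  def LDriw_pred_sketch : LDInst2<(outs PredRegs:$dst),
        (ins MEMri:$addr),
        "Error; should not emit", []>,
        Requires<[HasV4T]>;
  def STriw_pred_sketch : STInst2<(outs),
        (ins MEMri:$addr, PredRegs:$src1),
        "Error; should not emit", []>,
        Requires<[HasV4T]>;
}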
-// memw(Rs+#s11:2)=Rt +// Store predicate: +let neverHasSideEffects = 1 in +def STriw_pred_V4 : STInst2<(outs), + (ins MEMri:$addr, PredRegs:$src1), + "Error; should not emit", + []>, + Requires<[HasV4T]>; + + // memw(Rs+#u6:2)=#S8 let AddedComplexity = 10, isPredicable = 1 in def STriw_imm_V4 : STInst<(outs), (ins IntRegs:$src1, u6_2Imm:$src2, s8Imm:$src3), "memw($src1+#$src2) = #$src3", - [(store s8ImmPred:$src3, (add IntRegs:$src1, u6_2ImmPred:$src2))]>, + [(store s8ImmPred:$src3, (add (i32 IntRegs:$src1), + u6_2ImmPred:$src2))]>, Requires<[HasV4T]>; // memw(Rs+Ru<<#u2)=Rt @@ -1480,8 +2346,9 @@ let AddedComplexity = 10, isPredicable = 1 in def STriw_indexed_shl_V4 : STInst<(outs), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, IntRegs:$src4), "memw($src1+$src2<<#$src3) = $src4", - [(store IntRegs:$src4, (add IntRegs:$src1, - (shl IntRegs:$src2, u2ImmPred:$src3)))]>, + [(store (i32 IntRegs:$src4), (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u2ImmPred:$src3)))]>, Requires<[HasV4T]>; // memw(Ru<<#u2+#U6)=Rt @@ -1489,8 +2356,9 @@ let AddedComplexity = 10 in def STriw_shl_V4 : STInst<(outs), (ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, IntRegs:$src4), "memw($src1<<#$src2+#$src3) = $src4", - [(store IntRegs:$src4, (shl IntRegs:$src1, - (add u2ImmPred:$src2, u6ImmPred:$src3)))]>, + [(store (i32 IntRegs:$src4), + (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2), + u6ImmPred:$src3))]>, Requires<[HasV4T]>; // memw(Rx++#s4:2)=Rt @@ -1502,37 +2370,39 @@ def STriw_shl_V4 : STInst<(outs), // Store word conditionally. -// if ([!]Pv[.new]) memw(#u6)=Rt -// TODO: Needs to be implemented. // if ([!]Pv[.new]) memw(Rs+#u6:2)=#S6 // if (Pv) memw(Rs+#u6:2)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_imm_cPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STriw_imm_cPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, s6Imm:$src4), "if ($src1) memw($src2+#$src3) = #$src4", []>, Requires<[HasV4T]>; // if (Pv.new) memw(Rs+#u6:2)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_imm_cdnPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STriw_imm_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, s6Imm:$src4), "if ($src1.new) memw($src2+#$src3) = #$src4", []>, Requires<[HasV4T]>; // if (!Pv) memw(Rs+#u6:2)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_imm_cNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STriw_imm_cNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, s6Imm:$src4), "if (!$src1) memw($src2+#$src3) = #$src4", []>, Requires<[HasV4T]>; // if (!Pv.new) memw(Rs+#u6:2)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_imm_cdnNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STriw_imm_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, s6Imm:$src4), "if (!$src1.new) memw($src2+#$src3) = #$src4", []>, @@ -1541,8 +2411,9 @@ def STriw_imm_cdnNotPt_V4 : STInst<(outs), // if ([!]Pv[.new]) memw(Rs+#u6:2)=Rt // if (Pv) memw(Rs+#u6:2)=Rt // if (Pv.new) memw(Rs+#u6:2)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_cdnPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STriw_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1.new) memw($addr) = $src2", []>, @@ -1550,8 +2421,9 @@ def STriw_cdnPt_V4 : STInst<(outs), // if (!Pv) memw(Rs+#u6:2)=Rt // if 
(!Pv.new) memw(Rs+#u6:2)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_cdnNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STriw_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1.new) memw($addr) = $src2", []>, @@ -1560,16 +2432,18 @@ def STriw_cdnNotPt_V4 : STInst<(outs), // if (Pv) memw(Rs+#u6:2)=Rt // if (!Pv) memw(Rs+#u6:2)=Rt // if (Pv.new) memw(Rs+#u6:2)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_indexed_cdnPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STriw_indexed_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), "if ($src1.new) memw($src2+#$src3) = $src4", []>, Requires<[HasV4T]>; // if (!Pv.new) memw(Rs+#u6:2)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_indexed_cdnNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STriw_indexed_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), "if (!$src1.new) memw($src2+#$src3) = $src4", []>, @@ -1577,8 +2451,9 @@ def STriw_indexed_cdnNotPt_V4 : STInst<(outs), // if ([!]Pv[.new]) memw(Rs+Ru<<#u2)=Rt // if (Pv) memw(Rs+Ru<<#u2)=Rt -let mayStore = 1, AddedComplexity = 10 in -def STriw_indexed_shl_cPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STriw_indexed_shl_cPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if ($src1) memw($src2+$src3<<#$src4) = $src5", @@ -1586,8 +2461,9 @@ def STriw_indexed_shl_cPt_V4 : STInst<(outs), Requires<[HasV4T]>; // if (Pv.new) memw(Rs+Ru<<#u2)=Rt -let mayStore = 1, AddedComplexity = 10 in -def STriw_indexed_shl_cdnPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STriw_indexed_shl_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if ($src1.new) memw($src2+$src3<<#$src4) = $src5", @@ -1595,8 +2471,9 @@ def STriw_indexed_shl_cdnPt_V4 : STInst<(outs), Requires<[HasV4T]>; // if (!Pv) memw(Rs+Ru<<#u2)=Rt -let mayStore = 1, AddedComplexity = 10 in -def STriw_indexed_shl_cNotPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STriw_indexed_shl_cNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if (!$src1) memw($src2+$src3<<#$src4) = $src5", @@ -1604,8 +2481,9 @@ def STriw_indexed_shl_cNotPt_V4 : STInst<(outs), Requires<[HasV4T]>; // if (!Pv.new) memw(Rs+Ru<<#u2)=Rt -let mayStore = 1, AddedComplexity = 10 in -def STriw_indexed_shl_cdnNotPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STriw_indexed_shl_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if (!$src1.new) memw($src2+$src3<<#$src4) = $src5", @@ -1615,8 +2493,9 @@ def STriw_indexed_shl_cdnNotPt_V4 : STInst<(outs), // if ([!]Pv[.new]) memw(Rx++#s4:2)=Rt // if (Pv) memw(Rx++#s4:2)=Rt // if (Pv.new) memw(Rx++#s4:2)=Rt -let mayStore = 1, hasCtrlDep = 1 in -def POST_STwri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, + isPredicated = 1 in +def POST_STwri_cdnPt_V4 : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), "if ($src1.new) memw($src3++#$offset) = $src2", [],"$src3 = $dst">, @@ -1624,14 +2503,456 @@ def POST_STwri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst), // if (!Pv) memw(Rx++#s4:2)=Rt // 
if (!Pv.new) memw(Rx++#s4:2)=Rt -let mayStore = 1, hasCtrlDep = 1 in -def POST_STwri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, + isPredicated = 1 in +def POST_STwri_cdnNotPt_V4 : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), "if (!$src1.new) memw($src3++#$offset) = $src2", [],"$src3 = $dst">, Requires<[HasV4T]>; +/// store to global address + +let isPredicable = 1, neverHasSideEffects = 1 in +def STrid_GP_V4 : STInst2<(outs), + (ins globaladdress:$global, u16Imm:$offset, DoubleRegs:$src), + "memd(#$global+$offset) = $src", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def STrid_GP_cPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + DoubleRegs:$src2), + "if ($src1) memd(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def STrid_GP_cNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + DoubleRegs:$src2), + "if (!$src1) memd(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def STrid_GP_cdnPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + DoubleRegs:$src2), + "if ($src1.new) memd(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def STrid_GP_cdnNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + DoubleRegs:$src2), + "if (!$src1.new) memd(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; + +let isPredicable = 1, neverHasSideEffects = 1 in +def STrib_GP_V4 : STInst2<(outs), + (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), + "memb(#$global+$offset) = $src", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def STrib_GP_cPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1) memb(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def STrib_GP_cNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1) memb(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def STrib_GP_cdnPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1.new) memb(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def STrib_GP_cdnNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1.new) memb(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; + +let isPredicable = 1, neverHasSideEffects = 1 in +def STrih_GP_V4 : STInst2<(outs), + (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), + "memh(#$global+$offset) = $src", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def STrih_GP_cPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1) memh(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def STrih_GP_cNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1) 
memh(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; +
+let neverHasSideEffects = 1, isPredicated = 1 in +def STrih_GP_cdnPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1.new) memh(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; +
+let neverHasSideEffects = 1, isPredicated = 1 in +def STrih_GP_cdnNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1.new) memh(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; +
+let isPredicable = 1, neverHasSideEffects = 1 in +def STriw_GP_V4 : STInst2<(outs), + (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), + "memw(#$global+$offset) = $src", + []>, + Requires<[HasV4T]>; +
+let neverHasSideEffects = 1, isPredicated = 1 in +def STriw_GP_cPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1) memw(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; +
+let neverHasSideEffects = 1, isPredicated = 1 in +def STriw_GP_cNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1) memw(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; +
+let neverHasSideEffects = 1, isPredicated = 1 in +def STriw_GP_cdnPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1.new) memw(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; +
+let neverHasSideEffects = 1, isPredicated = 1 in +def STriw_GP_cdnNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1.new) memw(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; +
+// memd(#global)=Rtt +let isPredicable = 1, neverHasSideEffects = 1 in +def STd_GP_V4 : STInst2<(outs), + (ins globaladdress:$global, DoubleRegs:$src), + "memd(#$global) = $src", + []>, + Requires<[HasV4T]>; +
+// if (Pv) memd(##global) = Rtt +let neverHasSideEffects = 1, isPredicated = 1 in +def STd_GP_cPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, DoubleRegs:$src2), + "if ($src1) memd(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (!Pv) memd(##global) = Rtt +let neverHasSideEffects = 1, isPredicated = 1 in +def STd_GP_cNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, DoubleRegs:$src2), + "if (!$src1) memd(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (Pv.new) memd(##global) = Rtt +let neverHasSideEffects = 1, isPredicated = 1 in +def STd_GP_cdnPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, DoubleRegs:$src2), + "if ($src1.new) memd(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (!Pv.new) memd(##global) = Rtt +let neverHasSideEffects = 1, isPredicated = 1 in +def STd_GP_cdnNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, DoubleRegs:$src2), + "if (!$src1.new) memd(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// memb(#global)=Rt +let isPredicable = 1, neverHasSideEffects = 1 in +def STb_GP_V4 : STInst2<(outs), + (ins globaladdress:$global, IntRegs:$src), + "memb(#$global) = $src", + []>, + Requires<[HasV4T]>; +
+// if (Pv) memb(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STb_GP_cPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1) memb(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (!Pv) memb(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STb_GP_cNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1) memb(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (Pv.new) memb(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STb_GP_cdnPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1.new) memb(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (!Pv.new) memb(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STb_GP_cdnNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1.new) memb(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// memh(#global)=Rt +let isPredicable = 1, neverHasSideEffects = 1 in +def STh_GP_V4 : STInst2<(outs), + (ins globaladdress:$global, IntRegs:$src), + "memh(#$global) = $src", + []>, + Requires<[HasV4T]>; +
+// if (Pv) memh(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STh_GP_cPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1) memh(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (!Pv) memh(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STh_GP_cNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1) memh(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (Pv.new) memh(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STh_GP_cdnPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1.new) memh(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (!Pv.new) memh(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STh_GP_cdnNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1.new) memh(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// memw(#global)=Rt +let isPredicable = 1, neverHasSideEffects = 1 in +def STw_GP_V4 : STInst2<(outs), + (ins globaladdress:$global, IntRegs:$src), + "memw(#$global) = $src", + []>, + Requires<[HasV4T]>; +
+// if (Pv) memw(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STw_GP_cPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1) memw(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (!Pv) memw(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STw_GP_cNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1) memw(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (Pv.new) memw(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STw_GP_cdnPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1.new) memw(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (!Pv.new) memw(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STw_GP_cdnNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1.new) memw(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// 64 bit atomic store +def : Pat <(atomic_store_64 (HexagonCONST32_GP tglobaladdr:$global), + (i64 DoubleRegs:$src1)), + (STd_GP_V4 tglobaladdr:$global, (i64 DoubleRegs:$src1))>, + Requires<[HasV4T]>; +
+// Map from store(globaladdress) -> memd(#foo) +let AddedComplexity = 100 in +def : Pat <(store (i64 DoubleRegs:$src1), +
(HexagonCONST32_GP tglobaladdr:$global)), + (STd_GP_V4 tglobaladdr:$global, (i64 DoubleRegs:$src1))>, + Requires<[HasV4T]>; + +// 8 bit atomic store +def : Pat < (atomic_store_8 (HexagonCONST32_GP tglobaladdr:$global), + (i32 IntRegs:$src1)), + (STb_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +// Map from store(globaladdress) -> memb(#foo) +let AddedComplexity = 100 in +def : Pat<(truncstorei8 (i32 IntRegs:$src1), + (HexagonCONST32_GP tglobaladdr:$global)), + (STb_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +// Map from "i1 = constant<-1>; memw(CONST32(#foo)) = i1" +// to "r0 = 1; memw(#foo) = r0" +let AddedComplexity = 100 in +def : Pat<(store (i1 -1), (HexagonCONST32_GP tglobaladdr:$global)), + (STb_GP_V4 tglobaladdr:$global, (TFRI 1))>, + Requires<[HasV4T]>; + +def : Pat<(atomic_store_16 (HexagonCONST32_GP tglobaladdr:$global), + (i32 IntRegs:$src1)), + (STh_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +// Map from store(globaladdress) -> memh(#foo) +let AddedComplexity = 100 in +def : Pat<(truncstorei16 (i32 IntRegs:$src1), + (HexagonCONST32_GP tglobaladdr:$global)), + (STh_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +// 32 bit atomic store +def : Pat<(atomic_store_32 (HexagonCONST32_GP tglobaladdr:$global), + (i32 IntRegs:$src1)), + (STw_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +// Map from store(globaladdress) -> memw(#foo) +let AddedComplexity = 100 in +def : Pat<(store (i32 IntRegs:$src1), (HexagonCONST32_GP tglobaladdr:$global)), + (STw_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +def : Pat<(atomic_store_64 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset), + (i64 DoubleRegs:$src1)), + (STrid_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, + (i64 DoubleRegs:$src1))>, + Requires<[HasV4T]>; + +def : Pat<(atomic_store_32 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset), + (i32 IntRegs:$src1)), + (STriw_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, + (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +def : Pat<(atomic_store_16 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset), + (i32 IntRegs:$src1)), + (STrih_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, + (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +def : Pat<(atomic_store_8 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset), + (i32 IntRegs:$src1)), + (STrib_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, + (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +// Map from store(globaladdress + x) -> memd(#foo + x) +let AddedComplexity = 100 in +def : Pat<(store (i64 DoubleRegs:$src1), + (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (STrid_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, + (i64 DoubleRegs:$src1))>, + Requires<[HasV4T]>; + +// Map from store(globaladdress + x) -> memb(#foo + x) +let AddedComplexity = 100 in +def : Pat<(truncstorei8 (i32 IntRegs:$src1), + (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (STrib_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, + (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +// Map from store(globaladdress + x) -> memh(#foo + x) +let AddedComplexity = 100 in +def : Pat<(truncstorei16 (i32 IntRegs:$src1), + (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (STrih_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, + (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +// Map from 
store(globaladdress + x) -> memw(#foo + x) +let AddedComplexity = 100 in +def : Pat<(store (i32 IntRegs:$src1), + (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (STriw_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, + (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + + + //===----------------------------------------------------------------------=== // ST - //===----------------------------------------------------------------------=== @@ -1696,11 +3017,19 @@ def STrib_GP_nv_V4 : NVInst_V4<(outs), []>, Requires<[HasV4T]>; +// memb(#global)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STb_GP_nv_V4 : NVInst_V4<(outs), + (ins globaladdress:$global, IntRegs:$src), + "memb(#$global) = $src.new", + []>, + Requires<[HasV4T]>; // Store new-value byte conditionally. // if ([!]Pv[.new]) memb(#u6)=Nt.new // if (Pv) memb(Rs+#u6:0)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrib_cPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1) memb($addr) = $src2.new", @@ -1708,7 +3037,8 @@ def STrib_cPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv.new) memb(Rs+#u6:0)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrib_cdnPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1.new) memb($addr) = $src2.new", @@ -1716,7 +3046,8 @@ def STrib_cdnPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memb(Rs+#u6:0)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrib_cNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1) memb($addr) = $src2.new", @@ -1724,7 +3055,8 @@ def STrib_cNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv.new) memb(Rs+#u6:0)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrib_cdnNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1.new) memb($addr) = $src2.new", @@ -1732,7 +3064,8 @@ def STrib_cdnNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv) memb(Rs+#u6:0)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrib_indexed_cPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), "if ($src1) memb($src2+#$src3) = $src4.new", @@ -1740,7 +3073,8 @@ def STrib_indexed_cPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv.new) memb(Rs+#u6:0)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrib_indexed_cdnPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), "if ($src1.new) memb($src2+#$src3) = $src4.new", @@ -1748,7 +3082,8 @@ def STrib_indexed_cdnPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memb(Rs+#u6:0)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrib_indexed_cNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), "if (!$src1) memb($src2+#$src3) = $src4.new", @@ -1756,7 +3091,8 @@ def STrib_indexed_cNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv.new) memb(Rs+#u6:0)=Nt.new -let mayStore 
= 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrib_indexed_cdnNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), "if (!$src1.new) memb($src2+#$src3) = $src4.new", @@ -1766,7 +3102,8 @@ def STrib_indexed_cdnNotPt_nv_V4 : NVInst_V4<(outs), // if ([!]Pv[.new]) memb(Rs+Ru<<#u2)=Nt.new // if (Pv) memb(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STrib_indexed_shl_cPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -1775,7 +3112,8 @@ def STrib_indexed_shl_cPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv.new) memb(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STrib_indexed_shl_cdnPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -1784,7 +3122,8 @@ def STrib_indexed_shl_cdnPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memb(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STrib_indexed_shl_cNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -1793,7 +3132,8 @@ def STrib_indexed_shl_cNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv.new) memb(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STrib_indexed_shl_cdnNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -1803,7 +3143,8 @@ def STrib_indexed_shl_cdnNotPt_nv_V4 : NVInst_V4<(outs), // if ([!]Pv[.new]) memb(Rx++#s4:0)=Nt.new // if (Pv) memb(Rx++#s4:0)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_STbri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), "if ($src1) memb($src3++#$offset) = $src2.new", @@ -1811,7 +3152,8 @@ def POST_STbri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (Pv.new) memb(Rx++#s4:0)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_STbri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), "if ($src1.new) memb($src3++#$offset) = $src2.new", @@ -1819,7 +3161,8 @@ def POST_STbri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv) memb(Rx++#s4:0)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_STbri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), "if (!$src1) memb($src3++#$offset) = $src2.new", @@ -1827,7 +3170,8 @@ def POST_STbri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv.new) memb(Rx++#s4:0)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_STbri_cdnNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), "if (!$src1.new) memb($src3++#$offset) = $src2.new", @@ -1889,6 +3233,14 @@ def STrih_GP_nv_V4 : NVInst_V4<(outs), []>, 
Requires<[HasV4T]>; +// memh(#global)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STh_GP_nv_V4 : NVInst_V4<(outs), + (ins globaladdress:$global, IntRegs:$src), + "memh(#$global) = $src.new", + []>, + Requires<[HasV4T]>; + // Store new-value halfword conditionally. @@ -1896,7 +3248,8 @@ def STrih_GP_nv_V4 : NVInst_V4<(outs), // if ([!]Pv[.new]) memh(Rs+#u6:1)=Nt.new // if (Pv) memh(Rs+#u6:1)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrih_cPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1) memh($addr) = $src2.new", @@ -1904,7 +3257,8 @@ def STrih_cPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv.new) memh(Rs+#u6:1)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrih_cdnPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1.new) memh($addr) = $src2.new", @@ -1912,7 +3266,8 @@ def STrih_cdnPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memh(Rs+#u6:1)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrih_cNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1) memh($addr) = $src2.new", @@ -1920,7 +3275,8 @@ def STrih_cNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv.new) memh(Rs+#u6:1)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrih_cdnNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1.new) memh($addr) = $src2.new", @@ -1928,7 +3284,8 @@ def STrih_cdnNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv) memh(Rs+#u6:1)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrih_indexed_cPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), "if ($src1) memh($src2+#$src3) = $src4.new", @@ -1936,7 +3293,8 @@ def STrih_indexed_cPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv.new) memh(Rs+#u6:1)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrih_indexed_cdnPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), "if ($src1.new) memh($src2+#$src3) = $src4.new", @@ -1944,7 +3302,8 @@ def STrih_indexed_cdnPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memh(Rs+#u6:1)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrih_indexed_cNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), "if (!$src1) memh($src2+#$src3) = $src4.new", @@ -1952,7 +3311,8 @@ def STrih_indexed_cNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv.new) memh(Rs+#u6:1)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrih_indexed_cdnNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), "if (!$src1.new) memh($src2+#$src3) = $src4.new", @@ -1961,7 +3321,8 @@ def STrih_indexed_cdnNotPt_nv_V4 : NVInst_V4<(outs), // if ([!]Pv[.new]) memh(Rs+Ru<<#u2)=Nt.new // if (Pv) memh(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, 
AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STrih_indexed_shl_cPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -1970,7 +3331,8 @@ def STrih_indexed_shl_cPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv.new) memh(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STrih_indexed_shl_cdnPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -1979,7 +3341,8 @@ def STrih_indexed_shl_cdnPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memh(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STrih_indexed_shl_cNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -1988,7 +3351,8 @@ def STrih_indexed_shl_cNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv.new) memh(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STrih_indexed_shl_cdnNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -1998,7 +3362,8 @@ def STrih_indexed_shl_cdnNotPt_nv_V4 : NVInst_V4<(outs), // if ([!]Pv[.new]) memh(Rx++#s4:1)=Nt.new // if (Pv) memh(Rx++#s4:1)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_SThri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), "if ($src1) memh($src3++#$offset) = $src2.new", @@ -2006,7 +3371,8 @@ def POST_SThri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (Pv.new) memh(Rx++#s4:1)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_SThri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), "if ($src1.new) memh($src3++#$offset) = $src2.new", @@ -2014,7 +3380,8 @@ def POST_SThri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv) memh(Rx++#s4:1)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_SThri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), "if (!$src1) memh($src3++#$offset) = $src2.new", @@ -2022,7 +3389,8 @@ def POST_SThri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv.new) memh(Rx++#s4:1)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_SThri_cdnNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), "if (!$src1.new) memh($src3++#$offset) = $src2.new", @@ -2085,6 +3453,12 @@ def STriw_GP_nv_V4 : NVInst_V4<(outs), []>, Requires<[HasV4T]>; +let mayStore = 1, neverHasSideEffects = 1 in +def STw_GP_nv_V4 : NVInst_V4<(outs), + (ins globaladdress:$global, IntRegs:$src), + "memw(#$global) = $src.new", + []>, + Requires<[HasV4T]>; // Store new-value word conditionally. 
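The change repeated across these hunks is mechanical: every conditional store (and conditional new-value store) exists in four variants, distinguished by the predicate sense (cPt vs. cNotPt) and by whether the predicate register is read as a .new value (cdnPt, cdnNotPt), and each variant now carries isPredicated = 1 so the target-independent predication hooks can recognize it. A minimal sketch of that four-way scheme, assuming a hypothetical multiclass named ST_MEMri_Pred (the file itself, as above, writes every variant out longhand):

// Sketch only: ST_MEMri_Pred is not defined in this patch; it illustrates
// the cPt/cNotPt/cdnPt/cdnNotPt naming and flag scheme used above.
multiclass ST_MEMri_Pred<string MemOp> {
  let neverHasSideEffects = 1, isPredicated = 1 in {
    def _cPt_V4 : STInst2<(outs),
          (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
          !strconcat("if ($src1) ", !strconcat(MemOp, "($addr) = $src2")),
          []>, Requires<[HasV4T]>;
    def _cNotPt_V4 : STInst2<(outs),
          (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
          !strconcat("if (!$src1) ", !strconcat(MemOp, "($addr) = $src2")),
          []>, Requires<[HasV4T]>;
    def _cdnPt_V4 : STInst2<(outs),
          (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
          !strconcat("if ($src1.new) ", !strconcat(MemOp, "($addr) = $src2")),
          []>, Requires<[HasV4T]>;
    def _cdnNotPt_V4 : STInst2<(outs),
          (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
          !strconcat("if (!$src1.new) ", !strconcat(MemOp, "($addr) = $src2")),
          []>, Requires<[HasV4T]>;
  }
}
// A hypothetical use: defm STriw_sketch : ST_MEMri_Pred<"memw">; would
// expand to the four predicated forms of a word store.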
@@ -2092,7 +3466,8 @@ def STriw_GP_nv_V4 : NVInst_V4<(outs), // if ([!]Pv[.new]) memw(Rs+#u6:2)=Nt.new // if (Pv) memw(Rs+#u6:2)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STriw_cPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1) memw($addr) = $src2.new", @@ -2100,7 +3475,8 @@ def STriw_cPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv.new) memw(Rs+#u6:2)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STriw_cdnPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1.new) memw($addr) = $src2.new", @@ -2108,7 +3484,8 @@ def STriw_cdnPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memw(Rs+#u6:2)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STriw_cNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1) memw($addr) = $src2.new", @@ -2116,7 +3493,8 @@ def STriw_cNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv.new) memw(Rs+#u6:2)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STriw_cdnNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1.new) memw($addr) = $src2.new", @@ -2124,7 +3502,8 @@ def STriw_cdnNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv) memw(Rs+#u6:2)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STriw_indexed_cPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), "if ($src1) memw($src2+#$src3) = $src4.new", @@ -2132,7 +3511,8 @@ def STriw_indexed_cPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv.new) memw(Rs+#u6:2)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STriw_indexed_cdnPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), "if ($src1.new) memw($src2+#$src3) = $src4.new", @@ -2140,7 +3520,8 @@ def STriw_indexed_cdnPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memw(Rs+#u6:2)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STriw_indexed_cNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), "if (!$src1) memw($src2+#$src3) = $src4.new", @@ -2148,7 +3529,8 @@ def STriw_indexed_cNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv.new) memw(Rs+#u6:2)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STriw_indexed_cdnNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), "if (!$src1.new) memw($src2+#$src3) = $src4.new", @@ -2158,7 +3540,8 @@ def STriw_indexed_cdnNotPt_nv_V4 : NVInst_V4<(outs), // if ([!]Pv[.new]) memw(Rs+Ru<<#u2)=Nt.new // if (Pv) memw(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STriw_indexed_shl_cPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -2167,7 +3550,8 @@ def STriw_indexed_shl_cPt_nv_V4 : NVInst_V4<(outs), 
Requires<[HasV4T]>; // if (Pv.new) memw(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STriw_indexed_shl_cdnPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -2176,7 +3560,8 @@ def STriw_indexed_shl_cdnPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memw(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STriw_indexed_shl_cNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -2185,7 +3570,8 @@ def STriw_indexed_shl_cNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv.new) memw(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STriw_indexed_shl_cdnNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -2195,7 +3581,8 @@ def STriw_indexed_shl_cdnNotPt_nv_V4 : NVInst_V4<(outs), // if ([!]Pv[.new]) memw(Rx++#s4:2)=Nt.new // if (Pv) memw(Rx++#s4:2)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_STwri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), "if ($src1) memw($src3++#$offset) = $src2.new", @@ -2203,7 +3590,8 @@ def POST_STwri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (Pv.new) memw(Rx++#s4:2)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_STwri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), "if ($src1.new) memw($src3++#$offset) = $src2.new", @@ -2211,7 +3599,8 @@ def POST_STwri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv) memw(Rx++#s4:2)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_STwri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), "if (!$src1) memw($src3++#$offset) = $src2.new", @@ -2219,7 +3608,8 @@ def POST_STwri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv.new) memw(Rx++#s4:2)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_STwri_cdnNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), "if (!$src1.new) memw($src3++#$offset) = $src2.new", @@ -2227,6 +3617,199 @@ def POST_STwri_cdnNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), Requires<[HasV4T]>; +
+// if (Pv) memb(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STb_GP_cPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1) memb(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+// if (!Pv) memb(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STb_GP_cNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1) memb(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+// if (Pv.new) memb(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STb_GP_cdnPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1.new) memb(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+// if (!Pv.new) memb(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STb_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1.new) memb(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+// if (Pv) memh(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STh_GP_cPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1) memh(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+// if (!Pv) memh(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STh_GP_cNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1) memh(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+// if (Pv.new) memh(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STh_GP_cdnPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1.new) memh(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+// if (!Pv.new) memh(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STh_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1.new) memh(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+// if (Pv) memw(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STw_GP_cPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1) memw(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+// if (!Pv) memw(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STw_GP_cNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1) memw(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+// if (Pv.new) memw(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STw_GP_cdnPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1.new) memw(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+// if (!Pv.new) memw(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STw_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1.new) memw(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+let mayStore = 1, neverHasSideEffects = 1 in +def STrib_GP_cPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1) memb(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; +
+let mayStore = 1, neverHasSideEffects = 1 in +def STrib_GP_cNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1) memb(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; +
+let mayStore = 1, neverHasSideEffects = 1 in +def STrib_GP_cdnPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1.new) memb(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; +
+let mayStore = 1, neverHasSideEffects = 1 in +def STrib_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1.new) memb(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; +
+let mayStore = 1, neverHasSideEffects = 1 in +def STrih_GP_cPt_nv_V4 :
NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1) memh(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; + +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_GP_cNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1) memh(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; + +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_GP_cdnPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1.new) memh(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; + +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1.new) memh(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; + +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_GP_cPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1) memw(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; + +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_GP_cNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1) memw(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; + +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_GP_cdnPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1.new) memw(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; + +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1.new) memw(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; + //===----------------------------------------------------------------------===// // NV/ST - //===----------------------------------------------------------------------===// @@ -2253,7 +3836,8 @@ multiclass NVJ_type_basic_reg<string NotStr, string OpcStr, string TakenStr> { Requires<[HasV4T]>; } -multiclass NVJ_type_basic_2ndDotNew<string NotStr, string OpcStr, string TakenStr> { +multiclass NVJ_type_basic_2ndDotNew<string NotStr, string OpcStr, + string TakenStr> { def _ie_nv_V4 : NVInst_V4<(outs), (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset), !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr, @@ -2307,7 +3891,8 @@ multiclass NVJ_type_basic_neg<string NotStr, string OpcStr, string TakenStr> { Requires<[HasV4T]>; } -multiclass NVJ_type_basic_tstbit<string NotStr, string OpcStr, string TakenStr> { +multiclass NVJ_type_basic_tstbit<string NotStr, string OpcStr, + string TakenStr> { def _ie_nv_V4 : NVInst_V4<(outs), (ins IntRegs:$src1, u1Imm:$src2, brtarget:$offset), !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr, @@ -2416,16 +4001,18 @@ let isBranch = 1, isTerminator=1, neverHasSideEffects = 1, Defs = [PC] in { def ADDr_ADDri_V4 : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, s6Imm:$src3), "$dst = add($src1, add($src2, #$src3))", - [(set IntRegs:$dst, - (add IntRegs:$src1, (add IntRegs:$src2, s6ImmPred:$src3)))]>, + [(set (i32 IntRegs:$dst), + (add (i32 IntRegs:$src1), (add (i32 IntRegs:$src2), + s6ImmPred:$src3)))]>, Requires<[HasV4T]>; // Rd=add(Rs,sub(#s6,Ru)) def ADDr_SUBri_V4 : MInst<(outs 
IntRegs:$dst), (ins IntRegs:$src1, s6Imm:$src2, IntRegs:$src3), "$dst = add($src1, sub(#$src2, $src3))", - [(set IntRegs:$dst, - (add IntRegs:$src1, (sub s6ImmPred:$src2, IntRegs:$src3)))]>, + [(set (i32 IntRegs:$dst), + (add (i32 IntRegs:$src1), (sub s6ImmPred:$src2, + (i32 IntRegs:$src3))))]>, Requires<[HasV4T]>; // Generates the same instruction as ADDr_SUBri_V4 but matches different @@ -2434,8 +4021,9 @@ def ADDr_SUBri_V4 : MInst<(outs IntRegs:$dst), def ADDri_SUBr_V4 : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s6Imm:$src2, IntRegs:$src3), "$dst = add($src1, sub(#$src2, $src3))", - [(set IntRegs:$dst, - (sub (add IntRegs:$src1, s6ImmPred:$src2), IntRegs:$src3))]>, + [(set (i32 IntRegs:$dst), + (sub (add (i32 IntRegs:$src1), s6ImmPred:$src2), + (i32 IntRegs:$src3)))]>, Requires<[HasV4T]>; @@ -2451,16 +4039,16 @@ def ADDri_SUBr_V4 : MInst<(outs IntRegs:$dst), def ANDd_NOTd_V4 : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), "$dst = and($src1, ~$src2)", - [(set DoubleRegs:$dst, (and DoubleRegs:$src1, - (not DoubleRegs:$src2)))]>, + [(set (i64 DoubleRegs:$dst), (and (i64 DoubleRegs:$src1), + (not (i64 DoubleRegs:$src2))))]>, Requires<[HasV4T]>; // Rdd=or(Rtt,~Rss) def ORd_NOTd_V4 : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), "$dst = or($src1, ~$src2)", - [(set DoubleRegs:$dst, - (or DoubleRegs:$src1, (not DoubleRegs:$src2)))]>, + [(set (i64 DoubleRegs:$dst), + (or (i64 DoubleRegs:$src1), (not (i64 DoubleRegs:$src2))))]>, Requires<[HasV4T]>; @@ -2469,8 +4057,9 @@ def ORd_NOTd_V4 : MInst<(outs DoubleRegs:$dst), def XORd_XORdd: MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), "$dst ^= xor($src2, $src3)", - [(set DoubleRegs:$dst, - (xor DoubleRegs:$src1, (xor DoubleRegs:$src2, DoubleRegs:$src3)))], + [(set (i64 DoubleRegs:$dst), + (xor (i64 DoubleRegs:$src1), (xor (i64 DoubleRegs:$src2), + (i64 DoubleRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2480,8 +4069,9 @@ def XORd_XORdd: MInst_acc<(outs DoubleRegs:$dst), def ORr_ANDri_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, s10Imm:$src3), "$dst = or($src1, and($src2, #$src3))", - [(set IntRegs:$dst, - (or IntRegs:$src1, (and IntRegs:$src2, s10ImmPred:$src3)))], + [(set (i32 IntRegs:$dst), + (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2), + s10ImmPred:$src3)))], "$src2 = $dst">, Requires<[HasV4T]>; @@ -2490,8 +4080,9 @@ def ORr_ANDri_V4 : MInst_acc<(outs IntRegs:$dst), def ANDr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst &= and($src2, $src3)", - [(set IntRegs:$dst, - (and IntRegs:$src1, (and IntRegs:$src2, IntRegs:$src3)))], + [(set (i32 IntRegs:$dst), + (and (i32 IntRegs:$src1), (and (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2499,8 +4090,9 @@ def ANDr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst), def ORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst |= and($src2, $src3)", - [(set IntRegs:$dst, - (or IntRegs:$src1, (and IntRegs:$src2, IntRegs:$src3)))], + [(set (i32 IntRegs:$dst), + (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2508,8 +4100,9 @@ def ORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst), def XORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst ^= and($src2, $src3)", - [(set IntRegs:$dst, - (xor IntRegs:$src1, (and 
IntRegs:$src2, IntRegs:$src3)))], + [(set (i32 IntRegs:$dst), + (xor (i32 IntRegs:$src1), (and (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2518,8 +4111,9 @@ def XORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst), def ANDr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst &= and($src2, ~$src3)", - [(set IntRegs:$dst, - (and IntRegs:$src1, (and IntRegs:$src2, (not IntRegs:$src3))))], + [(set (i32 IntRegs:$dst), + (and (i32 IntRegs:$src1), (and (i32 IntRegs:$src2), + (not (i32 IntRegs:$src3)))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2527,8 +4121,9 @@ def ANDr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst), def ORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst |= and($src2, ~$src3)", - [(set IntRegs:$dst, - (or IntRegs:$src1, (and IntRegs:$src2, (not IntRegs:$src3))))], + [(set (i32 IntRegs:$dst), + (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2), + (not (i32 IntRegs:$src3)))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2536,8 +4131,9 @@ def ORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst), def XORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst ^= and($src2, ~$src3)", - [(set IntRegs:$dst, - (xor IntRegs:$src1, (and IntRegs:$src2, (not IntRegs:$src3))))], + [(set (i32 IntRegs:$dst), + (xor (i32 IntRegs:$src1), (and (i32 IntRegs:$src2), + (not (i32 IntRegs:$src3)))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2546,8 +4142,9 @@ def XORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst), def ANDr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst &= or($src2, $src3)", - [(set IntRegs:$dst, - (and IntRegs:$src1, (or IntRegs:$src2, IntRegs:$src3)))], + [(set (i32 IntRegs:$dst), + (and (i32 IntRegs:$src1), (or (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2555,8 +4152,9 @@ def ANDr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst), def ORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst |= or($src2, $src3)", - [(set IntRegs:$dst, - (or IntRegs:$src1, (or IntRegs:$src2, IntRegs:$src3)))], + [(set (i32 IntRegs:$dst), + (or (i32 IntRegs:$src1), (or (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2564,8 +4162,9 @@ def ORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst), def XORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst ^= or($src2, $src3)", - [(set IntRegs:$dst, - (xor IntRegs:$src1, (or IntRegs:$src2, IntRegs:$src3)))], + [(set (i32 IntRegs:$dst), + (xor (i32 IntRegs:$src1), (or (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2574,8 +4173,9 @@ def XORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst), def ANDr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst &= xor($src2, $src3)", - [(set IntRegs:$dst, - (and IntRegs:$src1, (xor IntRegs:$src2, IntRegs:$src3)))], + [(set (i32 IntRegs:$dst), + (and (i32 IntRegs:$src1), (xor (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2583,8 +4183,9 @@ def ANDr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst), def ORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst |= xor($src2, $src3)", - [(set IntRegs:$dst, - (and IntRegs:$src1, (xor IntRegs:$src2, IntRegs:$src3)))], + [(set 
(i32 IntRegs:$dst), + (and (i32 IntRegs:$src1), (xor (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2592,8 +4193,9 @@ def ORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst), def XORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst ^= xor($src2, $src3)", - [(set IntRegs:$dst, - (and IntRegs:$src1, (xor IntRegs:$src2, IntRegs:$src3)))], + [(set (i32 IntRegs:$dst), + (and (i32 IntRegs:$src1), (xor (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2601,8 +4203,9 @@ def XORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst), def ORr_ANDri2_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, s10Imm:$src3), "$dst |= and($src2, #$src3)", - [(set IntRegs:$dst, - (or IntRegs:$src1, (and IntRegs:$src2, s10ImmPred:$src3)))], + [(set (i32 IntRegs:$dst), + (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2), + s10ImmPred:$src3)))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2610,8 +4213,9 @@ def ORr_ANDri2_V4 : MInst_acc<(outs IntRegs:$dst), def ORr_ORri_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, s10Imm:$src3), "$dst |= or($src2, #$src3)", - [(set IntRegs:$dst, - (or IntRegs:$src1, (and IntRegs:$src2, s10ImmPred:$src3)))], + [(set (i32 IntRegs:$dst), + (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2), + s10ImmPred:$src3)))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2663,8 +4267,9 @@ def ORr_ORri_V4 : MInst_acc<(outs IntRegs:$dst), def ADDi_MPYri_V4 : MInst<(outs IntRegs:$dst), (ins u6Imm:$src1, IntRegs:$src2, u6Imm:$src3), "$dst = add(#$src1, mpyi($src2, #$src3))", - [(set IntRegs:$dst, - (add (mul IntRegs:$src2, u6ImmPred:$src3), u6ImmPred:$src1))]>, + [(set (i32 IntRegs:$dst), + (add (mul (i32 IntRegs:$src2), u6ImmPred:$src3), + u6ImmPred:$src1))]>, Requires<[HasV4T]>; // Rd=add(#u6,mpyi(Rs,Rt)) @@ -2672,32 +4277,36 @@ def ADDi_MPYri_V4 : MInst<(outs IntRegs:$dst), def ADDi_MPYrr_V4 : MInst<(outs IntRegs:$dst), (ins u6Imm:$src1, IntRegs:$src2, IntRegs:$src3), "$dst = add(#$src1, mpyi($src2, $src3))", - [(set IntRegs:$dst, - (add (mul IntRegs:$src2, IntRegs:$src3), u6ImmPred:$src1))]>, + [(set (i32 IntRegs:$dst), + (add (mul (i32 IntRegs:$src2), (i32 IntRegs:$src3)), + u6ImmPred:$src1))]>, Requires<[HasV4T]>; // Rd=add(Ru,mpyi(#u6:2,Rs)) def ADDr_MPYir_V4 : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u6Imm:$src2, IntRegs:$src3), "$dst = add($src1, mpyi(#$src2, $src3))", - [(set IntRegs:$dst, - (add IntRegs:$src1, (mul IntRegs:$src3, u6_2ImmPred:$src2)))]>, + [(set (i32 IntRegs:$dst), + (add (i32 IntRegs:$src1), (mul (i32 IntRegs:$src3), + u6_2ImmPred:$src2)))]>, Requires<[HasV4T]>; // Rd=add(Ru,mpyi(Rs,#u6)) def ADDr_MPYri_V4 : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u6Imm:$src3), "$dst = add($src1, mpyi($src2, #$src3))", - [(set IntRegs:$dst, - (add IntRegs:$src1, (mul IntRegs:$src2, u6ImmPred:$src3)))]>, + [(set (i32 IntRegs:$dst), + (add (i32 IntRegs:$src1), (mul (i32 IntRegs:$src2), + u6ImmPred:$src3)))]>, Requires<[HasV4T]>; // Rx=add(Ru,mpyi(Rx,Rs)) def ADDr_MPYrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), "$dst = add($src1, mpyi($src2, $src3))", - [(set IntRegs:$dst, - (add IntRegs:$src1, (mul IntRegs:$src2, IntRegs:$src3)))], + [(set (i32 IntRegs:$dst), + (add (i32 IntRegs:$src1), (mul (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], "$src2 = $dst">, Requires<[HasV4T]>; @@ -2745,8 +4354,9 @@ def ADDr_MPYrr_V4 : MInst_acc<(outs IntRegs:$dst), def ADDi_ASLri_V4 : 
MInst_acc<(outs IntRegs:$dst), (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = add(#$src1, asl($src2, #$src3))", - [(set IntRegs:$dst, - (add (shl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + [(set (i32 IntRegs:$dst), + (add (shl (i32 IntRegs:$src2), u5ImmPred:$src3), + u8ImmPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; @@ -2754,8 +4364,9 @@ def ADDi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst), def ADDi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst), (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = add(#$src1, lsr($src2, #$src3))", - [(set IntRegs:$dst, - (add (srl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + [(set (i32 IntRegs:$dst), + (add (srl (i32 IntRegs:$src2), u5ImmPred:$src3), + u8ImmPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; @@ -2763,8 +4374,9 @@ def ADDi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst), def SUBi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst), (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = sub(#$src1, asl($src2, #$src3))", - [(set IntRegs:$dst, - (sub (shl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + [(set (i32 IntRegs:$dst), + (sub (shl (i32 IntRegs:$src2), u5ImmPred:$src3), + u8ImmPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; @@ -2772,8 +4384,9 @@ def SUBi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst), def SUBi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst), (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = sub(#$src1, lsr($src2, #$src3))", - [(set IntRegs:$dst, - (sub (srl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + [(set (i32 IntRegs:$dst), + (sub (srl (i32 IntRegs:$src2), u5ImmPred:$src3), + u8ImmPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; @@ -2783,8 +4396,9 @@ def SUBi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst), def ANDi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst), (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = and(#$src1, asl($src2, #$src3))", - [(set IntRegs:$dst, - (and (shl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + [(set (i32 IntRegs:$dst), + (and (shl (i32 IntRegs:$src2), u5ImmPred:$src3), + u8ImmPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; @@ -2792,26 +4406,31 @@ def ANDi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst), def ANDi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst), (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = and(#$src1, lsr($src2, #$src3))", - [(set IntRegs:$dst, - (and (srl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + [(set (i32 IntRegs:$dst), + (and (srl (i32 IntRegs:$src2), u5ImmPred:$src3), + u8ImmPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; //Rx=or(#u8,asl(Rx,#U5)) +let AddedComplexity = 30 in def ORi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst), (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = or(#$src1, asl($src2, #$src3))", - [(set IntRegs:$dst, - (or (shl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + [(set (i32 IntRegs:$dst), + (or (shl (i32 IntRegs:$src2), u5ImmPred:$src3), + u8ImmPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; //Rx=or(#u8,lsr(Rx,#U5)) +let AddedComplexity = 30 in def ORi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst), (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = or(#$src1, lsr($src2, #$src3))", - [(set IntRegs:$dst, - (or (srl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + [(set (i32 IntRegs:$dst), + (or (srl (i32 IntRegs:$src2), u5ImmPred:$src3), + u8ImmPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; @@ -2820,7 +4439,8 @@ def ORi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst), //Rd=lsl(#s6,Rt) def LSLi_V4 : MInst<(outs IntRegs:$dst), (ins s6Imm:$src1, IntRegs:$src2), "$dst = lsl(#$src1, 
$src2)", - [(set IntRegs:$dst, (shl s6ImmPred:$src1, IntRegs:$src2))]>, + [(set (i32 IntRegs:$dst), (shl s6ImmPred:$src1, + (i32 IntRegs:$src2)))]>, Requires<[HasV4T]>; @@ -2829,8 +4449,9 @@ def LSLi_V4 : MInst<(outs IntRegs:$dst), (ins s6Imm:$src1, IntRegs:$src2), def ASLd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), "$dst ^= asl($src2, $src3)", - [(set DoubleRegs:$dst, - (xor DoubleRegs:$src1, (shl DoubleRegs:$src2, IntRegs:$src3)))], + [(set (i64 DoubleRegs:$dst), + (xor (i64 DoubleRegs:$src1), (shl (i64 DoubleRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2838,8 +4459,9 @@ def ASLd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst), def ASRd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), "$dst ^= asr($src2, $src3)", - [(set DoubleRegs:$dst, - (xor DoubleRegs:$src1, (sra DoubleRegs:$src2, IntRegs:$src3)))], + [(set (i64 DoubleRegs:$dst), + (xor (i64 DoubleRegs:$src1), (sra (i64 DoubleRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2847,8 +4469,9 @@ def ASRd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst), def LSLd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), "$dst ^= lsl($src2, $src3)", - [(set DoubleRegs:$dst, - (xor DoubleRegs:$src1, (shl DoubleRegs:$src2, IntRegs:$src3)))], + [(set (i64 DoubleRegs:$dst), (xor (i64 DoubleRegs:$src1), + (shl (i64 DoubleRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2856,8 +4479,9 @@ def LSLd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst), def LSRd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), "$dst ^= lsr($src2, $src3)", - [(set DoubleRegs:$dst, - (xor DoubleRegs:$src1, (srl DoubleRegs:$src2, IntRegs:$src3)))], + [(set (i64 DoubleRegs:$dst), + (xor (i64 DoubleRegs:$src1), (srl (i64 DoubleRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2903,16 +4527,16 @@ let AddedComplexity = 30 in def MEMw_ADDSUBi_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_2Imm:$offset, m6Imm:$addend), "Error; should not emit", - [(store (add (load (add IntRegs:$base, u6_2ImmPred:$offset)), -m6ImmPred:$addend), - (add IntRegs:$base, u6_2ImmPred:$offset))]>, + [(store (add (load (add (i32 IntRegs:$base), u6_2ImmPred:$offset)), + m6ImmPred:$addend), + (add (i32 IntRegs:$base), u6_2ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // memw(Rs+#u6:2) += #U5 let AddedComplexity = 30 in def MEMw_ADDi_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_2Imm:$offset, u5Imm:$addend), - "memw($base+#$offset) += $addend", + "memw($base+#$offset) += #$addend", []>, Requires<[HasV4T, UseMEMOP]>; @@ -2920,7 +4544,7 @@ def MEMw_ADDi_indexed_MEM_V4 : MEMInst_V4<(outs), let AddedComplexity = 30 in def MEMw_SUBi_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_2Imm:$offset, u5Imm:$subend), - "memw($base+#$offset) -= $subend", + "memw($base+#$offset) -= #$subend", []>, Requires<[HasV4T, UseMEMOP]>; @@ -2929,9 +4553,9 @@ let AddedComplexity = 30 in def MEMw_ADDr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_2Imm:$offset, IntRegs:$addend), "memw($base+#$offset) += $addend", - [(store (add (load (add IntRegs:$base, u6_2ImmPred:$offset)), -IntRegs:$addend), - (add IntRegs:$base, u6_2ImmPred:$offset))]>, + [(store (add (load (add (i32 IntRegs:$base), u6_2ImmPred:$offset)), + (i32 IntRegs:$addend)), + (add (i32 
IntRegs:$base), u6_2ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // memw(Rs+#u6:2) -= Rt @@ -2939,19 +4563,19 @@ let AddedComplexity = 30 in def MEMw_SUBr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_2Imm:$offset, IntRegs:$subend), "memw($base+#$offset) -= $subend", - [(store (sub (load (add IntRegs:$base, u6_2ImmPred:$offset)), -IntRegs:$subend), - (add IntRegs:$base, u6_2ImmPred:$offset))]>, + [(store (sub (load (add (i32 IntRegs:$base), u6_2ImmPred:$offset)), + (i32 IntRegs:$subend)), + (add (i32 IntRegs:$base), u6_2ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // memw(Rs+#u6:2) &= Rt let AddedComplexity = 30 in def MEMw_ANDr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_2Imm:$offset, IntRegs:$andend), - "memw($base+#$offset) += $andend", - [(store (and (load (add IntRegs:$base, u6_2ImmPred:$offset)), -IntRegs:$andend), - (add IntRegs:$base, u6_2ImmPred:$offset))]>, + "memw($base+#$offset) &= $andend", + [(store (and (load (add (i32 IntRegs:$base), u6_2ImmPred:$offset)), + (i32 IntRegs:$andend)), + (add (i32 IntRegs:$base), u6_2ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // memw(Rs+#u6:2) |= Rt @@ -2959,9 +4583,9 @@ let AddedComplexity = 30 in def MEMw_ORr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_2Imm:$offset, IntRegs:$orend), "memw($base+#$offset) |= $orend", - [(store (or (load (add IntRegs:$base, u6_2ImmPred:$offset)), - IntRegs:$orend), - (add IntRegs:$base, u6_2ImmPred:$offset))]>, + [(store (or (load (add (i32 IntRegs:$base), u6_2ImmPred:$offset)), + (i32 IntRegs:$orend)), + (add (i32 IntRegs:$base), u6_2ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // MEMw_ADDSUBi_V4: @@ -2996,7 +4620,7 @@ let AddedComplexity = 30 in def MEMw_ADDr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$addend), "memw($addr) += $addend", - [(store (add (load ADDRriU6_2:$addr), IntRegs:$addend), + [(store (add (load ADDRriU6_2:$addr), (i32 IntRegs:$addend)), ADDRriU6_2:$addr)]>, Requires<[HasV4T, UseMEMOP]>; @@ -3005,7 +4629,7 @@ let AddedComplexity = 30 in def MEMw_SUBr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$subend), "memw($addr) -= $subend", - [(store (sub (load ADDRriU6_2:$addr), IntRegs:$subend), + [(store (sub (load ADDRriU6_2:$addr), (i32 IntRegs:$subend)), ADDRriU6_2:$addr)]>, Requires<[HasV4T, UseMEMOP]>; @@ -3014,7 +4638,7 @@ let AddedComplexity = 30 in def MEMw_ANDr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$andend), "memw($addr) &= $andend", - [(store (and (load ADDRriU6_2:$addr), IntRegs:$andend), + [(store (and (load ADDRriU6_2:$addr), (i32 IntRegs:$andend)), ADDRriU6_2:$addr)]>, Requires<[HasV4T, UseMEMOP]>; @@ -3023,8 +4647,8 @@ let AddedComplexity = 30 in def MEMw_ORr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$orend), "memw($addr) |= $orend", - [(store (or (load ADDRriU6_2:$addr), IntRegs:$orend), -ADDRriU6_2:$addr)]>, + [(store (or (load ADDRriU6_2:$addr), (i32 IntRegs:$orend)), + ADDRriU6_2:$addr)]>, Requires<[HasV4T, UseMEMOP]>; //===----------------------------------------------------------------------===// @@ -3060,10 +4684,10 @@ let AddedComplexity = 30 in def MEMh_ADDSUBi_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_1Imm:$offset, m6Imm:$addend), "Error; should not emit", - [(truncstorei16 (add (sextloadi16 (add IntRegs:$base, + [(truncstorei16 (add (sextloadi16 (add (i32 IntRegs:$base), u6_1ImmPred:$offset)), m6ImmPred:$addend), - (add IntRegs:$base, u6_1ImmPred:$offset))]>, + (add (i32 IntRegs:$base), u6_1ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; 
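// Note on the memop patterns above: each one folds a load, an ALU op, and a
// store of the same address into a single memop instruction. A minimal C
// sketch of what, e.g., MEMw_ADDr_indexed_MEM_V4 selects for (hypothetical
// function, assuming a word-aligned, in-range offset):
//   void f(int *base, int addend) {
//     base[2] += addend;   // becomes: memw(r0+#8) += r1
//   }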
// memh(Rs+#u6:1) += #U5 @@ -3087,10 +4711,10 @@ let AddedComplexity = 30 in def MEMh_ADDr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_1Imm:$offset, IntRegs:$addend), "memh($base+#$offset) += $addend", - [(truncstorei16 (add (sextloadi16 (add IntRegs:$base, + [(truncstorei16 (add (sextloadi16 (add (i32 IntRegs:$base), u6_1ImmPred:$offset)), - IntRegs:$addend), - (add IntRegs:$base, u6_1ImmPred:$offset))]>, + (i32 IntRegs:$addend)), + (add (i32 IntRegs:$base), u6_1ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // memh(Rs+#u6:1) -= Rt @@ -3098,10 +4722,10 @@ let AddedComplexity = 30 in def MEMh_SUBr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_1Imm:$offset, IntRegs:$subend), "memh($base+#$offset) -= $subend", - [(truncstorei16 (sub (sextloadi16 (add IntRegs:$base, + [(truncstorei16 (sub (sextloadi16 (add (i32 IntRegs:$base), u6_1ImmPred:$offset)), - IntRegs:$subend), - (add IntRegs:$base, u6_1ImmPred:$offset))]>, + (i32 IntRegs:$subend)), + (add (i32 IntRegs:$base), u6_1ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // memh(Rs+#u6:1) &= Rt @@ -3109,10 +4733,10 @@ let AddedComplexity = 30 in def MEMh_ANDr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_1Imm:$offset, IntRegs:$andend), "memh($base+#$offset) += $andend", - [(truncstorei16 (and (sextloadi16 (add IntRegs:$base, + [(truncstorei16 (and (sextloadi16 (add (i32 IntRegs:$base), u6_1ImmPred:$offset)), - IntRegs:$andend), - (add IntRegs:$base, u6_1ImmPred:$offset))]>, + (i32 IntRegs:$andend)), + (add (i32 IntRegs:$base), u6_1ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // memh(Rs+#u6:1) |= Rt @@ -3120,10 +4744,10 @@ let AddedComplexity = 30 in def MEMh_ORr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_1Imm:$offset, IntRegs:$orend), "memh($base+#$offset) |= $orend", - [(truncstorei16 (or (sextloadi16 (add IntRegs:$base, + [(truncstorei16 (or (sextloadi16 (add (i32 IntRegs:$base), u6_1ImmPred:$offset)), - IntRegs:$orend), - (add IntRegs:$base, u6_1ImmPred:$offset))]>, + (i32 IntRegs:$orend)), + (add (i32 IntRegs:$base), u6_1ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // MEMh_ADDSUBi_V4: @@ -3159,7 +4783,7 @@ def MEMh_ADDr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$addend), "memh($addr) += $addend", [(truncstorei16 (add (sextloadi16 ADDRriU6_1:$addr), - IntRegs:$addend), ADDRriU6_1:$addr)]>, + (i32 IntRegs:$addend)), ADDRriU6_1:$addr)]>, Requires<[HasV4T, UseMEMOP]>; // memh(Rs+#u6:1) -= Rt @@ -3168,7 +4792,7 @@ def MEMh_SUBr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$subend), "memh($addr) -= $subend", [(truncstorei16 (sub (sextloadi16 ADDRriU6_1:$addr), - IntRegs:$subend), ADDRriU6_1:$addr)]>, + (i32 IntRegs:$subend)), ADDRriU6_1:$addr)]>, Requires<[HasV4T, UseMEMOP]>; // memh(Rs+#u6:1) &= Rt @@ -3177,7 +4801,7 @@ def MEMh_ANDr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$andend), "memh($addr) &= $andend", [(truncstorei16 (and (sextloadi16 ADDRriU6_1:$addr), - IntRegs:$andend), ADDRriU6_1:$addr)]>, + (i32 IntRegs:$andend)), ADDRriU6_1:$addr)]>, Requires<[HasV4T, UseMEMOP]>; // memh(Rs+#u6:1) |= Rt @@ -3186,7 +4810,7 @@ def MEMh_ORr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$orend), "memh($addr) |= $orend", [(truncstorei16 (or (sextloadi16 ADDRriU6_1:$addr), - IntRegs:$orend), ADDRriU6_1:$addr)]>, + (i32 IntRegs:$orend)), ADDRriU6_1:$addr)]>, Requires<[HasV4T, UseMEMOP]>; @@ -3223,10 +4847,10 @@ let AddedComplexity = 30 in def MEMb_ADDSUBi_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_0Imm:$offset, 
m6Imm:$addend), "Error; should not emit", - [(truncstorei8 (add (sextloadi8 (add IntRegs:$base, + [(truncstorei8 (add (sextloadi8 (add (i32 IntRegs:$base), u6_0ImmPred:$offset)), m6ImmPred:$addend), - (add IntRegs:$base, u6_0ImmPred:$offset))]>, + (add (i32 IntRegs:$base), u6_0ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // memb(Rs+#u6:0) += #U5 @@ -3250,10 +4874,10 @@ let AddedComplexity = 30 in def MEMb_ADDr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_0Imm:$offset, IntRegs:$addend), "memb($base+#$offset) += $addend", - [(truncstorei8 (add (sextloadi8 (add IntRegs:$base, + [(truncstorei8 (add (sextloadi8 (add (i32 IntRegs:$base), u6_0ImmPred:$offset)), - IntRegs:$addend), - (add IntRegs:$base, u6_0ImmPred:$offset))]>, + (i32 IntRegs:$addend)), + (add (i32 IntRegs:$base), u6_0ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // memb(Rs+#u6:0) -= Rt @@ -3261,10 +4885,10 @@ let AddedComplexity = 30 in def MEMb_SUBr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_0Imm:$offset, IntRegs:$subend), "memb($base+#$offset) -= $subend", - [(truncstorei8 (sub (sextloadi8 (add IntRegs:$base, + [(truncstorei8 (sub (sextloadi8 (add (i32 IntRegs:$base), u6_0ImmPred:$offset)), - IntRegs:$subend), - (add IntRegs:$base, u6_0ImmPred:$offset))]>, + (i32 IntRegs:$subend)), + (add (i32 IntRegs:$base), u6_0ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // memb(Rs+#u6:0) &= Rt @@ -3272,10 +4896,10 @@ let AddedComplexity = 30 in def MEMb_ANDr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_0Imm:$offset, IntRegs:$andend), "memb($base+#$offset) += $andend", - [(truncstorei8 (and (sextloadi8 (add IntRegs:$base, + [(truncstorei8 (and (sextloadi8 (add (i32 IntRegs:$base), u6_0ImmPred:$offset)), - IntRegs:$andend), - (add IntRegs:$base, u6_0ImmPred:$offset))]>, + (i32 IntRegs:$andend)), + (add (i32 IntRegs:$base), u6_0ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // memb(Rs+#u6:0) |= Rt @@ -3283,10 +4907,10 @@ let AddedComplexity = 30 in def MEMb_ORr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_0Imm:$offset, IntRegs:$orend), "memb($base+#$offset) |= $orend", - [(truncstorei8 (or (sextloadi8 (add IntRegs:$base, + [(truncstorei8 (or (sextloadi8 (add (i32 IntRegs:$base), u6_0ImmPred:$offset)), - IntRegs:$orend), - (add IntRegs:$base, u6_0ImmPred:$offset))]>, + (i32 IntRegs:$orend)), + (add (i32 IntRegs:$base), u6_0ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // MEMb_ADDSUBi_V4: @@ -3322,7 +4946,7 @@ def MEMb_ADDr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$addend), "memb($addr) += $addend", [(truncstorei8 (add (sextloadi8 ADDRriU6_0:$addr), - IntRegs:$addend), ADDRriU6_0:$addr)]>, + (i32 IntRegs:$addend)), ADDRriU6_0:$addr)]>, Requires<[HasV4T, UseMEMOP]>; // memb(Rs+#u6:0) -= Rt @@ -3331,7 +4955,7 @@ def MEMb_SUBr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$subend), "memb($addr) -= $subend", [(truncstorei8 (sub (sextloadi8 ADDRriU6_0:$addr), - IntRegs:$subend), ADDRriU6_0:$addr)]>, + (i32 IntRegs:$subend)), ADDRriU6_0:$addr)]>, Requires<[HasV4T, UseMEMOP]>; // memb(Rs+#u6:0) &= Rt @@ -3340,7 +4964,7 @@ def MEMb_ANDr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$andend), "memb($addr) &= $andend", [(truncstorei8 (and (sextloadi8 ADDRriU6_0:$addr), - IntRegs:$andend), ADDRriU6_0:$addr)]>, + (i32 IntRegs:$andend)), ADDRriU6_0:$addr)]>, Requires<[HasV4T, UseMEMOP]>; // memb(Rs+#u6:0) |= Rt @@ -3349,7 +4973,7 @@ def MEMb_ORr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$orend), "memb($addr) |= $orend", [(truncstorei8 
(or (sextloadi8 ADDRriU6_0:$addr), - IntRegs:$orend), ADDRriU6_0:$addr)]>, + (i32 IntRegs:$orend)), ADDRriU6_0:$addr)]>, Requires<[HasV4T, UseMEMOP]>; @@ -3364,13 +4988,16 @@ def MEMb_ORr_MEM_V4 : MEMInst_V4<(outs), // The implemented patterns are: EQ/GT/GTU. // Missing patterns are: GE/GEU/LT/LTU/LE/LEU. +// The following instruction is not extended, as extending it results in +// incorrect code for negative numbers. // Pd=cmpb.eq(Rs,#u8) + let isCompare = 1 in def CMPbEQri_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u8Imm:$src2), "$dst = cmpb.eq($src1, #$src2)", - [(set PredRegs:$dst, (seteq (and IntRegs:$src1, 255), - u8ImmPred:$src2))]>, + [(set (i1 PredRegs:$dst), + (seteq (and (i32 IntRegs:$src1), 255), u8ImmPred:$src2))]>, Requires<[HasV4T]>; // Pd=cmpb.eq(Rs,Rt) @@ -3378,10 +5005,9 @@ let isCompare = 1 in def CMPbEQrr_ubub_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmpb.eq($src1, $src2)", - [(set PredRegs:$dst, (seteq (and (xor IntRegs:$src1, - IntRegs:$src2), - 255), - 0))]>, + [(set (i1 PredRegs:$dst), + (seteq (and (xor (i32 IntRegs:$src1), + (i32 IntRegs:$src2)), 255), 0))]>, Requires<[HasV4T]>; // Pd=cmpb.eq(Rs,Rt) @@ -3389,17 +5015,9 @@ let isCompare = 1 in def CMPbEQrr_sbsb_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmpb.eq($src1, $src2)", - [(set PredRegs:$dst, (seteq (shl IntRegs:$src1, (i32 24)), - (shl IntRegs:$src2, (i32 24))))]>, - Requires<[HasV4T]>; - -// Pd=cmpb.gt(Rs,#s8) -let isCompare = 1 in -def CMPbGTri_V4 : MInst<(outs PredRegs:$dst), - (ins IntRegs:$src1, s32Imm:$src2), - "$dst = cmpb.gt($src1, #$src2)", - [(set PredRegs:$dst, (setgt (shl IntRegs:$src1, (i32 24)), - s32_24ImmPred:$src2))]>, + [(set (i1 PredRegs:$dst), + (seteq (shl (i32 IntRegs:$src1), (i32 24)), + (shl (i32 IntRegs:$src2), (i32 24))))]>, Requires<[HasV4T]>; // Pd=cmpb.gt(Rs,Rt) @@ -3407,8 +5025,9 @@ let isCompare = 1 in def CMPbGTrr_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmpb.gt($src1, $src2)", - [(set PredRegs:$dst, (setgt (shl IntRegs:$src1, (i32 24)), - (shl IntRegs:$src2, (i32 24))))]>, + [(set (i1 PredRegs:$dst), + (setgt (shl (i32 IntRegs:$src1), (i32 24)), + (shl (i32 IntRegs:$src2), (i32 24))))]>, Requires<[HasV4T]>; // Pd=cmpb.gtu(Rs,#u7) @@ -3416,8 +5035,8 @@ let isCompare = 1 in def CMPbGTUri_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u7Imm:$src2), "$dst = cmpb.gtu($src1, #$src2)", - [(set PredRegs:$dst, (setugt (and IntRegs:$src1, 255), - u7ImmPred:$src2))]>, + [(set (i1 PredRegs:$dst), (setugt (and (i32 IntRegs:$src1), 255), + u7ImmPred:$src2))]>, Requires<[HasV4T]>; // Pd=cmpb.gtu(Rs,Rt) @@ -3425,18 +5044,21 @@ let isCompare = 1 in def CMPbGTUrr_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmpb.gtu($src1, $src2)", - [(set PredRegs:$dst, (setugt (and IntRegs:$src1, 255), - (and IntRegs:$src2, 255)))]>, + [(set (i1 PredRegs:$dst), (setugt (and (i32 IntRegs:$src1), 255), + (and (i32 IntRegs:$src2), 255)))]>, Requires<[HasV4T]>; +// The following instruction is not extended, as extending it results in +// incorrect code for negative numbers. + // Signed half compare(.eq) ri. 
// Pd=cmph.eq(Rs,#s8) let isCompare = 1 in def CMPhEQri_V4 : MInst<(outs PredRegs:$dst), - (ins IntRegs:$src1, u16Imm:$src2), + (ins IntRegs:$src1, s8Imm:$src2), "$dst = cmph.eq($src1, #$src2)", - [(set PredRegs:$dst, (seteq (and IntRegs:$src1, 65535), - u16_s8ImmPred:$src2))]>, + [(set (i1 PredRegs:$dst), (seteq (and (i32 IntRegs:$src1), 65535), + s8ImmPred:$src2))]>, Requires<[HasV4T]>; // Signed half compare(.eq) rr. @@ -3449,10 +5071,9 @@ let isCompare = 1 in def CMPhEQrr_xor_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmph.eq($src1, $src2)", - [(set PredRegs:$dst, (seteq (and (xor IntRegs:$src1, - IntRegs:$src2), - 65535), - 0))]>, + [(set (i1 PredRegs:$dst), (seteq (and (xor (i32 IntRegs:$src1), + (i32 IntRegs:$src2)), + 65535), 0))]>, Requires<[HasV4T]>; // Signed half compare(.eq) rr. @@ -3465,19 +5086,25 @@ let isCompare = 1 in def CMPhEQrr_shl_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmph.eq($src1, $src2)", - [(set PredRegs:$dst, (seteq (shl IntRegs:$src1, (i32 16)), - (shl IntRegs:$src2, (i32 16))))]>, + [(set (i1 PredRegs:$dst), + (seteq (shl (i32 IntRegs:$src1), (i32 16)), + (shl (i32 IntRegs:$src2), (i32 16))))]>, Requires<[HasV4T]>; +/* Incorrect Pattern -- immediate should be right shifted before being +used in the cmph.gt instruction. // Signed half compare(.gt) ri. // Pd=cmph.gt(Rs,#s8) + let isCompare = 1 in def CMPhGTri_V4 : MInst<(outs PredRegs:$dst), - (ins IntRegs:$src1, s32Imm:$src2), + (ins IntRegs:$src1, s8Imm:$src2), "$dst = cmph.gt($src1, #$src2)", - [(set PredRegs:$dst, (setgt (shl IntRegs:$src1, (i32 16)), - s32_16s8ImmPred:$src2))]>, + [(set (i1 PredRegs:$dst), + (setgt (shl (i32 IntRegs:$src1), (i32 16)), + s8ImmPred:$src2))]>, Requires<[HasV4T]>; +*/ // Signed half compare(.gt) rr. // Pd=cmph.gt(Rs,Rt) @@ -3485,8 +5112,9 @@ let isCompare = 1 in def CMPhGTrr_shl_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmph.gt($src1, $src2)", - [(set PredRegs:$dst, (setgt (shl IntRegs:$src1, (i32 16)), - (shl IntRegs:$src2, (i32 16))))]>, + [(set (i1 PredRegs:$dst), + (setgt (shl (i32 IntRegs:$src1), (i32 16)), + (shl (i32 IntRegs:$src2), (i32 16))))]>, Requires<[HasV4T]>; // Unsigned half compare rr (.gtu). @@ -3495,8 +5123,9 @@ let isCompare = 1 in def CMPhGTUrr_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmph.gtu($src1, $src2)", - [(set PredRegs:$dst, (setugt (and IntRegs:$src1, 65535), - (and IntRegs:$src2, 65535)))]>, + [(set (i1 PredRegs:$dst), + (setugt (and (i32 IntRegs:$src1), 65535), + (and (i32 IntRegs:$src2), 65535)))]>, Requires<[HasV4T]>; // Unsigned half compare ri (.gtu). @@ -3505,8 +5134,8 @@ let isCompare = 1 in def CMPhGTUri_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u7Imm:$src2), "$dst = cmph.gtu($src1, #$src2)", - [(set PredRegs:$dst, (setugt (and IntRegs:$src1, 65535), - u7ImmPred:$src2))]>, + [(set (i1 PredRegs:$dst), (setugt (and (i32 IntRegs:$src1), 65535), + u7ImmPred:$src2))]>, Requires<[HasV4T]>; //===----------------------------------------------------------------------===// @@ -3523,10 +5152,42 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, isPredicable = 1, Requires<[HasV4T]>; } +// Restore registers and dealloc return function call. 
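+// (These pseudos, and SAVE_REGISTERS_CALL_V4 below, model calls into common
+// register save/restore runtime routines that frame lowering can emit, hence
+// the unusual mix of isCall/isReturn/isBarrier flags; a description of the
+// intent, not of any particular encoding.)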
+let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1, + Defs = [R29, R30, R31, PC] in { + def RESTORE_DEALLOC_RET_JMP_V4 : JInst<(outs), + (ins calltarget:$dst), + "jump $dst // Restore_and_dealloc_return", + []>, + Requires<[HasV4T]>; +} + +// Restore registers and dealloc frame before a tail call. +let isCall = 1, isBarrier = 1, + Defs = [R29, R30, R31, PC] in { + def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : JInst<(outs), + (ins calltarget:$dst), + "call $dst // Restore_and_dealloc_before_tailcall", + []>, + Requires<[HasV4T]>; +} + +// Save registers function call. +let isCall = 1, isBarrier = 1, + Uses = [R29, R31] in { + def SAVE_REGISTERS_CALL_V4 : JInst<(outs), + (ins calltarget:$dst), + "call $dst // Save_callee_saved_registers", + []>, + Requires<[HasV4T]>; +} + // if (Ps) dealloc_return let isReturn = 1, isTerminator = 1, - Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in { - def DEALLOC_RET_cPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, i32imm:$amt1), + Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1, + isPredicated = 1 in { + def DEALLOC_RET_cPt_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, i32imm:$amt1), "if ($src1) dealloc_return", []>, Requires<[HasV4T]>; @@ -3534,7 +5195,8 @@ let isReturn = 1, isTerminator = 1, // if (!Ps) dealloc_return let isReturn = 1, isTerminator = 1, - Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in { + Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1, + isPredicated = 1 in { def DEALLOC_RET_cNotPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, i32imm:$amt1), "if (!$src1) dealloc_return", @@ -3544,7 +5206,8 @@ let isReturn = 1, isTerminator = 1, // if (Ps.new) dealloc_return:nt let isReturn = 1, isTerminator = 1, - Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in { + Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1, + isPredicated = 1 in { def DEALLOC_RET_cdnPnt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, i32imm:$amt1), "if ($src1.new) dealloc_return:nt", @@ -3554,7 +5217,8 @@ let isReturn = 1, isTerminator = 1, // if (!Ps.new) dealloc_return:nt let isReturn = 1, isTerminator = 1, - Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in { + Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1, + isPredicated = 1 in { def DEALLOC_RET_cNotdnPnt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, i32imm:$amt1), "if (!$src1.new) dealloc_return:nt", @@ -3564,7 +5228,8 @@ let isReturn = 1, isTerminator = 1, // if (Ps.new) dealloc_return:t let isReturn = 1, isTerminator = 1, - Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in { + Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1, + isPredicated = 1 in { def DEALLOC_RET_cdnPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, i32imm:$amt1), "if ($src1.new) dealloc_return:t", @@ -3574,10 +5239,539 @@ let isReturn = 1, isTerminator = 1, // if (!Ps.new) dealloc_return:nt let isReturn = 1, isTerminator = 1, - Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in { + Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1, + isPredicated = 1 in { def DEALLOC_RET_cNotdnPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, i32imm:$amt1), "if (!$src1.new) dealloc_return:t", []>, Requires<[HasV4T]>; } + + +// Load/Store with absolute addressing mode +// memw(#u6)=Rt + +multiclass ST_abs<string OpcStr> { + let isPredicable = 1 in + def _abs_V4 : STInst2<(outs), + (ins 
globaladdress:$absaddr, IntRegs:$src), + !strconcat(OpcStr, "(##$absaddr) = $src"), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), + !strconcat("if ($src1)", + !strconcat(OpcStr, "(##$absaddr) = $src2")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), + !strconcat("if (!$src1)", + !strconcat(OpcStr, "(##$absaddr) = $src2")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), + !strconcat("if ($src1.new)", + !strconcat(OpcStr, "(##$absaddr) = $src2")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), + !strconcat("if (!$src1.new)", + !strconcat(OpcStr, "(##$absaddr) = $src2")), + []>, + Requires<[HasV4T]>; + + def _abs_nv_V4 : STInst2<(outs), + (ins globaladdress:$absaddr, IntRegs:$src), + !strconcat(OpcStr, "(##$absaddr) = $src.new"), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cPt_nv_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), + !strconcat("if ($src1)", + !strconcat(OpcStr, "(##$absaddr) = $src2.new")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cNotPt_nv_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), + !strconcat("if (!$src1)", + !strconcat(OpcStr, "(##$absaddr) = $src2.new")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnPt_nv_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), + !strconcat("if ($src1.new)", + !strconcat(OpcStr, "(##$absaddr) = $src2.new")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnNotPt_nv_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), + !strconcat("if (!$src1.new)", + !strconcat(OpcStr, "(##$absaddr) = $src2.new")), + []>, + Requires<[HasV4T]>; +} + +let AddedComplexity = 30, isPredicable = 1 in +def STrid_abs_V4 : STInst<(outs), + (ins globaladdress:$absaddr, DoubleRegs:$src), + "memd(##$absaddr) = $src", + [(store (i64 DoubleRegs:$src), + (HexagonCONST32 tglobaladdr:$absaddr))]>, + Requires<[HasV4T]>; + +let AddedComplexity = 30, isPredicated = 1 in +def STrid_abs_cPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, DoubleRegs:$src2), + "if ($src1) memd(##$absaddr) = $src2", + []>, + Requires<[HasV4T]>; + +let AddedComplexity = 30, isPredicated = 1 in +def STrid_abs_cNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, DoubleRegs:$src2), + "if (!$src1) memd(##$absaddr) = $src2", + []>, + Requires<[HasV4T]>; + +let AddedComplexity = 30, isPredicated = 1 in +def STrid_abs_cdnPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, DoubleRegs:$src2), + "if ($src1.new) memd(##$absaddr) = $src2", + []>, + Requires<[HasV4T]>; + +let AddedComplexity = 30, isPredicated = 1 in +def STrid_abs_cdnNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, DoubleRegs:$src2), + "if (!$src1.new) memd(##$absaddr) = $src2", + []>, + Requires<[HasV4T]>; + +defm STrib : ST_abs<"memb">; +defm STrih : ST_abs<"memh">; +defm STriw : ST_abs<"memw">; + +let Predicates = [HasV4T], AddedComplexity = 30 in +def : Pat<(truncstorei8 (i32 
IntRegs:$src1), + (HexagonCONST32 tglobaladdr:$absaddr)), + (STrib_abs_V4 tglobaladdr: $absaddr, IntRegs: $src1)>; + +let Predicates = [HasV4T], AddedComplexity = 30 in +def : Pat<(truncstorei16 (i32 IntRegs:$src1), + (HexagonCONST32 tglobaladdr:$absaddr)), + (STrih_abs_V4 tglobaladdr: $absaddr, IntRegs: $src1)>; + +let Predicates = [HasV4T], AddedComplexity = 30 in +def : Pat<(store (i32 IntRegs:$src1), (HexagonCONST32 tglobaladdr:$absaddr)), + (STriw_abs_V4 tglobaladdr: $absaddr, IntRegs: $src1)>; + + +multiclass LD_abs<string OpcStr> { + let isPredicable = 1 in + def _abs_V4 : LDInst2<(outs IntRegs:$dst), + (ins globaladdress:$absaddr), + !strconcat("$dst = ", !strconcat(OpcStr, "(##$absaddr)")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$absaddr), + !strconcat("if ($src1) $dst = ", + !strconcat(OpcStr, "(##$absaddr)")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$absaddr), + !strconcat("if (!$src1) $dst = ", + !strconcat(OpcStr, "(##$absaddr)")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$absaddr), + !strconcat("if ($src1.new) $dst = ", + !strconcat(OpcStr, "(##$absaddr)")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$absaddr), + !strconcat("if (!$src1.new) $dst = ", + !strconcat(OpcStr, "(##$absaddr)")), + []>, + Requires<[HasV4T]>; +} + +let AddedComplexity = 30 in +def LDrid_abs_V4 : LDInst<(outs DoubleRegs:$dst), + (ins globaladdress:$absaddr), + "$dst = memd(##$absaddr)", + [(set (i64 DoubleRegs:$dst), + (load (HexagonCONST32 tglobaladdr:$absaddr)))]>, + Requires<[HasV4T]>; + +let AddedComplexity = 30, isPredicated = 1 in +def LDrid_abs_cPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$absaddr), + "if ($src1) $dst = memd(##$absaddr)", + []>, + Requires<[HasV4T]>; + +let AddedComplexity = 30, isPredicated = 1 in +def LDrid_abs_cNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$absaddr), + "if (!$src1) $dst = memd(##$absaddr)", + []>, + Requires<[HasV4T]>; + +let AddedComplexity = 30, isPredicated = 1 in +def LDrid_abs_cdnPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$absaddr), + "if ($src1.new) $dst = memd(##$absaddr)", + []>, + Requires<[HasV4T]>; + +let AddedComplexity = 30, isPredicated = 1 in +def LDrid_abs_cdnNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$absaddr), + "if (!$src1.new) $dst = memd(##$absaddr)", + []>, + Requires<[HasV4T]>; + +defm LDrib : LD_abs<"memb">; +defm LDriub : LD_abs<"memub">; +defm LDrih : LD_abs<"memh">; +defm LDriuh : LD_abs<"memuh">; +defm LDriw : LD_abs<"memw">; + + +let Predicates = [HasV4T], AddedComplexity = 30 in +def : Pat<(i32 (load (HexagonCONST32 tglobaladdr:$absaddr))), + (LDriw_abs_V4 tglobaladdr: $absaddr)>; + +let Predicates = [HasV4T], AddedComplexity=30 in +def : Pat<(i32 (sextloadi8 (HexagonCONST32 tglobaladdr:$absaddr))), + (LDrib_abs_V4 tglobaladdr:$absaddr)>; + +let Predicates = [HasV4T], AddedComplexity=30 in +def : Pat<(i32 (zextloadi8 (HexagonCONST32 tglobaladdr:$absaddr))), + (LDriub_abs_V4 tglobaladdr:$absaddr)>; + +let Predicates = [HasV4T], AddedComplexity=30 in +def : Pat<(i32 (sextloadi16 
(HexagonCONST32 tglobaladdr:$absaddr))), + (LDrih_abs_V4 tglobaladdr:$absaddr)>; + +let Predicates = [HasV4T], AddedComplexity=30 in +def : Pat<(i32 (zextloadi16 (HexagonCONST32 tglobaladdr:$absaddr))), + (LDriuh_abs_V4 tglobaladdr:$absaddr)>; + +// Transfer global address into a register +let AddedComplexity=50, isMoveImm = 1, isReMaterializable = 1 in +def TFRI_V4 : ALU32_ri<(outs IntRegs:$dst), (ins globaladdress:$src1), + "$dst = ##$src1", + [(set IntRegs:$dst, (HexagonCONST32 tglobaladdr:$src1))]>, + Requires<[HasV4T]>; + +let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in +def TFRI_cPt_V4 : ALU32_ri<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$src2), + "if($src1) $dst = ##$src2", + []>, + Requires<[HasV4T]>; + +let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in +def TFRI_cNotPt_V4 : ALU32_ri<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$src2), + "if(!$src1) $dst = ##$src2", + []>, + Requires<[HasV4T]>; + +let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in +def TFRI_cdnPt_V4 : ALU32_ri<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$src2), + "if($src1.new) $dst = ##$src2", + []>, + Requires<[HasV4T]>; + +let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in +def TFRI_cdnNotPt_V4 : ALU32_ri<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$src2), + "if(!$src1.new) $dst = ##$src2", + []>, + Requires<[HasV4T]>; + +let AddedComplexity = 50, Predicates = [HasV4T] in +def : Pat<(HexagonCONST32_GP tglobaladdr:$src1), + (TFRI_V4 tglobaladdr:$src1)>; + + +// Load - Indirect with long offset: These instructions take global address +// as an operand +let AddedComplexity = 10 in +def LDrid_ind_lo_V4 : LDInst<(outs DoubleRegs:$dst), + (ins IntRegs:$src1, u2Imm:$src2, globaladdress:$offset), + "$dst=memd($src1<<#$src2+##$offset)", + [(set (i64 DoubleRegs:$dst), + (load (add (shl IntRegs:$src1, u2ImmPred:$src2), + (HexagonCONST32 tglobaladdr:$offset))))]>, + Requires<[HasV4T]>; + +let AddedComplexity = 10 in +multiclass LD_indirect_lo<string OpcStr, PatFrag OpNode> { + def _lo_V4 : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, u2Imm:$src2, globaladdress:$offset), + !strconcat("$dst = ", + !strconcat(OpcStr, "($src1<<#$src2+##$offset)")), + [(set IntRegs:$dst, + (i32 (OpNode (add (shl IntRegs:$src1, u2ImmPred:$src2), + (HexagonCONST32 tglobaladdr:$offset)))))]>, + Requires<[HasV4T]>; +} + +defm LDrib_ind : LD_indirect_lo<"memb", sextloadi8>; +defm LDriub_ind : LD_indirect_lo<"memub", zextloadi8>; +defm LDrih_ind : LD_indirect_lo<"memh", sextloadi16>; +defm LDriuh_ind : LD_indirect_lo<"memuh", zextloadi16>; +defm LDriw_ind : LD_indirect_lo<"memw", load>; + +// Store - Indirect with long offset: These instructions take global address +// as an operand +let AddedComplexity = 10 in +def STrid_ind_lo_V4 : STInst<(outs), + (ins IntRegs:$src1, u2Imm:$src2, globaladdress:$src3, + DoubleRegs:$src4), + "memd($src1<<#$src2+#$src3) = $src4", + [(store (i64 DoubleRegs:$src4), + (add (shl IntRegs:$src1, u2ImmPred:$src2), + (HexagonCONST32 tglobaladdr:$src3)))]>, + Requires<[HasV4T]>; + +let AddedComplexity = 10 in +multiclass ST_indirect_lo<string OpcStr, PatFrag OpNode> { + def _lo_V4 : STInst<(outs), + (ins IntRegs:$src1, u2Imm:$src2, globaladdress:$src3, + IntRegs:$src4), + !strconcat(OpcStr, "($src1<<#$src2+##$src3) = $src4"), + [(OpNode (i32 IntRegs:$src4), + (add (shl IntRegs:$src1, u2ImmPred:$src2), + (HexagonCONST32 tglobaladdr:$src3)))]>, + Requires<[HasV4T]>; +} + +defm STrib_ind : 
ST_indirect_lo<"memb", truncstorei8>; +defm STrih_ind : ST_indirect_lo<"memh", truncstorei16>; +defm STriw_ind : ST_indirect_lo<"memw", store>; + +// Store - absolute addressing mode: These instruction take constant +// value as the extended operand +multiclass ST_absimm<string OpcStr> { + let isPredicable = 1 in + def _abs_V4 : STInst2<(outs), + (ins u6Imm:$src1, IntRegs:$src2), + !strconcat(OpcStr, "(#$src1) = $src2"), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), + !strconcat("if ($src1)", !strconcat(OpcStr, "(#$src2) = $src3")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), + !strconcat("if (!$src1)", !strconcat(OpcStr, "(#$src2) = $src3")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), + !strconcat("if ($src1.new)", + !strconcat(OpcStr, "(#$src2) = $src3")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), + !strconcat("if (!$src1.new)", + !strconcat(OpcStr, "(#$src2) = $src3")), + []>, + Requires<[HasV4T]>; + + def _abs_nv_V4 : STInst2<(outs), + (ins u6Imm:$src1, IntRegs:$src2), + !strconcat(OpcStr, "(#$src1) = $src2.new"), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cPt_nv_V4 : STInst2<(outs), + (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), + !strconcat("if ($src1)", + !strconcat(OpcStr, "(#$src2) = $src3.new")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cNotPt_nv_V4 : STInst2<(outs), + (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), + !strconcat("if (!$src1)", + !strconcat(OpcStr, "(#$src2) = $src3.new")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnPt_nv_V4 : STInst2<(outs), + (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), + !strconcat("if ($src1.new)", + !strconcat(OpcStr, "(#$src2) = $src3.new")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnNotPt_nv_V4 : STInst2<(outs), + (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), + !strconcat("if (!$src1.new)", + !strconcat(OpcStr, "(#$src2) = $src3.new")), + []>, + Requires<[HasV4T]>; +} + +defm STrib_imm : ST_absimm<"memb">; +defm STrih_imm : ST_absimm<"memh">; +defm STriw_imm : ST_absimm<"memw">; + +let Predicates = [HasV4T], AddedComplexity = 30 in +def : Pat<(truncstorei8 (i32 IntRegs:$src1), u6ImmPred:$src2), + (STrib_imm_abs_V4 u6ImmPred:$src2, IntRegs: $src1)>; + +let Predicates = [HasV4T], AddedComplexity = 30 in +def : Pat<(truncstorei16 (i32 IntRegs:$src1), u6ImmPred:$src2), + (STrih_imm_abs_V4 u6ImmPred:$src2, IntRegs: $src1)>; + +let Predicates = [HasV4T], AddedComplexity = 30 in +def : Pat<(store (i32 IntRegs:$src1), u6ImmPred:$src2), + (STriw_imm_abs_V4 u6ImmPred:$src2, IntRegs: $src1)>; + + +// Load - absolute addressing mode: These instruction take constant +// value as the extended operand + +multiclass LD_absimm<string OpcStr> { + let isPredicable = 1 in + def _abs_V4 : LDInst2<(outs IntRegs:$dst), + (ins u6Imm:$src), + !strconcat("$dst = ", + !strconcat(OpcStr, "(#$src)")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, u6Imm:$src2), + !strconcat("if ($src1) $dst = ", + !strconcat(OpcStr, "(#$src2)")), + []>, + Requires<[HasV4T]>; 
+ + let isPredicated = 1 in + def _abs_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, u6Imm:$src2), + !strconcat("if (!$src1) $dst = ", + !strconcat(OpcStr, "(#$src2)")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, u6Imm:$src2), + !strconcat("if ($src1.new) $dst = ", + !strconcat(OpcStr, "(#$src2)")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, u6Imm:$src2), + !strconcat("if (!$src1.new) $dst = ", + !strconcat(OpcStr, "(#$src2)")), + []>, + Requires<[HasV4T]>; +} + +defm LDrib_imm : LD_absimm<"memb">; +defm LDriub_imm : LD_absimm<"memub">; +defm LDrih_imm : LD_absimm<"memh">; +defm LDriuh_imm : LD_absimm<"memuh">; +defm LDriw_imm : LD_absimm<"memw">; + +let Predicates = [HasV4T], AddedComplexity = 30 in +def : Pat<(i32 (load u6ImmPred:$src)), + (LDriw_imm_abs_V4 u6ImmPred:$src)>; + +let Predicates = [HasV4T], AddedComplexity=30 in +def : Pat<(i32 (sextloadi8 u6ImmPred:$src)), + (LDrib_imm_abs_V4 u6ImmPred:$src)>; + +let Predicates = [HasV4T], AddedComplexity=30 in +def : Pat<(i32 (zextloadi8 u6ImmPred:$src)), + (LDriub_imm_abs_V4 u6ImmPred:$src)>; + +let Predicates = [HasV4T], AddedComplexity=30 in +def : Pat<(i32 (sextloadi16 u6ImmPred:$src)), + (LDrih_imm_abs_V4 u6ImmPred:$src)>; + +let Predicates = [HasV4T], AddedComplexity=30 in +def : Pat<(i32 (zextloadi16 u6ImmPred:$src)), + (LDriuh_imm_abs_V4 u6ImmPred:$src)>; + + +// Indexed store word - global address. +// memw(Rs+#u6:2)=##global +let AddedComplexity = 10 in +def STriw_offset_ext_V4 : STInst<(outs), + (ins IntRegs:$src1, u6_2Imm:$src2, globaladdress:$src3), + "memw($src1+#$src2) = ##$src3", + [(store (HexagonCONST32 tglobaladdr:$src3), + (add IntRegs:$src1, u6_2ImmPred:$src2))]>, + Requires<[HasV4T]>; + + +// Indexed store half word - global address. +// memh(Rs+#u6:1)=##global +let AddedComplexity = 10 in +def STrih_offset_ext_V4 : STInst<(outs), + (ins IntRegs:$src1, u6_1Imm:$src2, globaladdress:$src3), + "memh($src1+#$src2) = ##$src3", + [(truncstorei16 (HexagonCONST32 tglobaladdr:$src3), + (add IntRegs:$src1, u6_1ImmPred:$src2))]>, + Requires<[HasV4T]>; diff --git a/lib/Target/Hexagon/HexagonInstrInfoV5.td b/lib/Target/Hexagon/HexagonInstrInfoV5.td new file mode 100644 index 0000000..92d098c --- /dev/null +++ b/lib/Target/Hexagon/HexagonInstrInfoV5.td @@ -0,0 +1,626 @@ +def SDTHexagonFCONST32 : SDTypeProfile<1, 1, [ + SDTCisVT<0, f32>, + SDTCisPtrTy<1>]>; +def HexagonFCONST32 : SDNode<"HexagonISD::FCONST32", SDTHexagonFCONST32>; + +let isReMaterializable = 1, isMoveImm = 1 in +def FCONST32_nsdata : LDInst<(outs IntRegs:$dst), (ins globaladdress:$global), + "$dst = CONST32(#$global)", + [(set (f32 IntRegs:$dst), + (HexagonFCONST32 tglobaladdr:$global))]>, + Requires<[HasV5T]>; + +let isReMaterializable = 1, isMoveImm = 1 in +def CONST64_Float_Real : LDInst<(outs DoubleRegs:$dst), (ins f64imm:$src1), + "$dst = CONST64(#$src1)", + [(set DoubleRegs:$dst, fpimm:$src1)]>, + Requires<[HasV5T]>; + +let isReMaterializable = 1, isMoveImm = 1 in +def CONST32_Float_Real : LDInst<(outs IntRegs:$dst), (ins f32imm:$src1), + "$dst = CONST32(#$src1)", + [(set IntRegs:$dst, fpimm:$src1)]>, + Requires<[HasV5T]>; + +// Transfer immediate float. +// Only works with single-precision fp values. +// For double precision, use CONST64_Float_Real, as a 64-bit transfer +// can only hold 40-bit values: 32 from the constant extender plus an +// 8-bit immediate. 
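+// (Worked out: 32 bits of constant-extender payload plus the 8-bit
+// immediate field gives the 40 bits mentioned above, so any f32 bit
+// pattern fits, while an arbitrary f64 needs all 64 bits and has to go
+// through CONST64_Float_Real instead.)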
+let isMoveImm = 1, isReMaterializable = 1, isPredicable = 1 in +def TFRI_f : ALU32_ri<(outs IntRegs:$dst), (ins f32imm:$src1), + "$dst = ##$src1", + [(set IntRegs:$dst, fpimm:$src1)]>, + Requires<[HasV5T]>; + +def TFRI_cPt_f : ALU32_ri<(outs IntRegs:$dst), + (ins PredRegs:$src1, f32imm:$src2), + "if ($src1) $dst = ##$src2", + []>, + Requires<[HasV5T]>; + +let isPredicated = 1 in +def TFRI_cNotPt_f : ALU32_ri<(outs IntRegs:$dst), + (ins PredRegs:$src1, f32imm:$src2), + "if (!$src1) $dst = ##$src2", + []>, + Requires<[HasV5T]>; + +// Convert single precision to double precision and vice-versa. +def CONVERT_sf2df : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src), + "$dst = convert_sf2df($src)", + [(set DoubleRegs:$dst, (fextend IntRegs:$src))]>, + Requires<[HasV5T]>; + +def CONVERT_df2sf : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_df2sf($src)", + [(set IntRegs:$dst, (fround DoubleRegs:$src))]>, + Requires<[HasV5T]>; + + +// Load. +def LDrid_f : LDInst<(outs DoubleRegs:$dst), + (ins MEMri:$addr), + "$dst = memd($addr)", + [(set DoubleRegs:$dst, (f64 (load ADDRriS11_3:$addr)))]>, + Requires<[HasV5T]>; + + +let AddedComplexity = 20 in +def LDrid_indexed_f : LDInst<(outs DoubleRegs:$dst), + (ins IntRegs:$src1, s11_3Imm:$offset), + "$dst = memd($src1+#$offset)", + [(set DoubleRegs:$dst, (f64 (load (add IntRegs:$src1, + s11_3ImmPred:$offset))))]>, + Requires<[HasV5T]>; + +def LDriw_f : LDInst<(outs IntRegs:$dst), + (ins MEMri:$addr), "$dst = memw($addr)", + [(set IntRegs:$dst, (f32 (load ADDRriS11_2:$addr)))]>, + Requires<[HasV5T]>; + + +let AddedComplexity = 20 in +def LDriw_indexed_f : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, s11_2Imm:$offset), + "$dst = memw($src1+#$offset)", + [(set IntRegs:$dst, (f32 (load (add IntRegs:$src1, + s11_2ImmPred:$offset))))]>, + Requires<[HasV5T]>; + +// Store. +def STriw_f : STInst<(outs), + (ins MEMri:$addr, IntRegs:$src1), + "memw($addr) = $src1", + [(store (f32 IntRegs:$src1), ADDRriS11_2:$addr)]>, + Requires<[HasV5T]>; + +let AddedComplexity = 10 in +def STriw_indexed_f : STInst<(outs), + (ins IntRegs:$src1, s11_2Imm:$src2, IntRegs:$src3), + "memw($src1+#$src2) = $src3", + [(store (f32 IntRegs:$src3), + (add IntRegs:$src1, s11_2ImmPred:$src2))]>, + Requires<[HasV5T]>; + +def STrid_f : STInst<(outs), + (ins MEMri:$addr, DoubleRegs:$src1), + "memd($addr) = $src1", + [(store (f64 DoubleRegs:$src1), ADDRriS11_2:$addr)]>, + Requires<[HasV5T]>; + +// Indexed store double word. 
+let AddedComplexity = 10 in +def STrid_indexed_f : STInst<(outs), + (ins IntRegs:$src1, s11_3Imm:$src2, DoubleRegs:$src3), + "memd($src1+#$src2) = $src3", + [(store (f64 DoubleRegs:$src3), + (add IntRegs:$src1, s11_3ImmPred:$src2))]>, + Requires<[HasV5T]>; + + +// Add +let isCommutable = 1 in +def fADD_rr : ALU64_rr<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = sfadd($src1, $src2)", + [(set IntRegs:$dst, (fadd IntRegs:$src1, IntRegs:$src2))]>, + Requires<[HasV5T]>; + +let isCommutable = 1 in +def fADD64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2), + "$dst = dfadd($src1, $src2)", + [(set DoubleRegs:$dst, (fadd DoubleRegs:$src1, + DoubleRegs:$src2))]>, + Requires<[HasV5T]>; + +def fSUB_rr : ALU64_rr<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = sfsub($src1, $src2)", + [(set IntRegs:$dst, (fsub IntRegs:$src1, IntRegs:$src2))]>, + Requires<[HasV5T]>; + +def fSUB64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2), + "$dst = dfsub($src1, $src2)", + [(set DoubleRegs:$dst, (fsub DoubleRegs:$src1, + DoubleRegs:$src2))]>, + Requires<[HasV5T]>; + +let isCommutable = 1 in +def fMUL_rr : ALU64_rr<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = sfmpy($src1, $src2)", + [(set IntRegs:$dst, (fmul IntRegs:$src1, IntRegs:$src2))]>, + Requires<[HasV5T]>; + +let isCommutable = 1 in +def fMUL64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2), + "$dst = dfmpy($src1, $src2)", + [(set DoubleRegs:$dst, (fmul DoubleRegs:$src1, + DoubleRegs:$src2))]>, + Requires<[HasV5T]>; + +// Compare. +let isCompare = 1 in { +multiclass FCMP64_rr<string OpcStr, PatFrag OpNode> { + def _rr : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$b, DoubleRegs:$c), + !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")), + [(set PredRegs:$dst, + (OpNode (f64 DoubleRegs:$b), (f64 DoubleRegs:$c)))]>, + Requires<[HasV5T]>; +} + +multiclass FCMP32_rr<string OpcStr, PatFrag OpNode> { + def _rr : ALU64_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c), + !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")), + [(set PredRegs:$dst, + (OpNode (f32 IntRegs:$b), (f32 IntRegs:$c)))]>, + Requires<[HasV5T]>; +} +} + +defm FCMPOEQ64 : FCMP64_rr<"dfcmp.eq", setoeq>; +defm FCMPUEQ64 : FCMP64_rr<"dfcmp.eq", setueq>; +defm FCMPOGT64 : FCMP64_rr<"dfcmp.gt", setogt>; +defm FCMPUGT64 : FCMP64_rr<"dfcmp.gt", setugt>; +defm FCMPOGE64 : FCMP64_rr<"dfcmp.ge", setoge>; +defm FCMPUGE64 : FCMP64_rr<"dfcmp.ge", setuge>; + +defm FCMPOEQ32 : FCMP32_rr<"sfcmp.eq", setoeq>; +defm FCMPUEQ32 : FCMP32_rr<"sfcmp.eq", setueq>; +defm FCMPOGT32 : FCMP32_rr<"sfcmp.gt", setogt>; +defm FCMPUGT32 : FCMP32_rr<"sfcmp.gt", setugt>; +defm FCMPOGE32 : FCMP32_rr<"sfcmp.ge", setoge>; +defm FCMPUGE32 : FCMP32_rr<"sfcmp.ge", setuge>; + +// olt. +def : Pat <(i1 (setolt (f32 IntRegs:$src1), (f32 IntRegs:$src2))), + (i1 (FCMPOGT32_rr IntRegs:$src2, IntRegs:$src1))>, + Requires<[HasV5T]>; + +def : Pat <(i1 (setolt (f32 IntRegs:$src1), (fpimm:$src2))), + (i1 (FCMPOGT32_rr (f32 (TFRI_f fpimm:$src2)), (f32 IntRegs:$src1)))>, + Requires<[HasV5T]>; + +def : Pat <(i1 (setolt (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))), + (i1 (FCMPOGT64_rr DoubleRegs:$src2, DoubleRegs:$src1))>, + Requires<[HasV5T]>; + +def : Pat <(i1 (setolt (f64 DoubleRegs:$src1), (fpimm:$src2))), + (i1 (FCMPOGT64_rr (f64 (CONST64_Float_Real fpimm:$src2)), + (f64 DoubleRegs:$src1)))>, + Requires<[HasV5T]>; + +// gt. 
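+// (Only the immediate forms need explicit patterns in this section; the
+// register-register forms are matched by the FCMP*_rr definitions above.
+// Note also the trick used throughout: the ISA has no lt/le compares, so
+// the lt/le patterns swap their operands into the gt/ge forms, e.g.
+// setolt(a, b) is selected as dfcmp.gt(b, a).)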
+def : Pat <(i1 (setugt (f64 DoubleRegs:$src1), (fpimm:$src2))), + (i1 (FCMPUGT64_rr (f64 DoubleRegs:$src1), + (f64 (CONST64_Float_Real fpimm:$src2))))>, + Requires<[HasV5T]>; + +def : Pat <(i1 (setugt (f32 IntRegs:$src1), (fpimm:$src2))), + (i1 (FCMPUGT32_rr (f32 IntRegs:$src1), (f32 (TFRI_f fpimm:$src2))))>, + Requires<[HasV5T]>; + +// ult. +def : Pat <(i1 (setult (f32 IntRegs:$src1), (f32 IntRegs:$src2))), + (i1 (FCMPUGT32_rr IntRegs:$src2, IntRegs:$src1))>, + Requires<[HasV5T]>; + +def : Pat <(i1 (setult (f32 IntRegs:$src1), (fpimm:$src2))), + (i1 (FCMPUGT32_rr (f32 (TFRI_f fpimm:$src2)), (f32 IntRegs:$src1)))>, + Requires<[HasV5T]>; + +def : Pat <(i1 (setult (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))), + (i1 (FCMPUGT64_rr DoubleRegs:$src2, DoubleRegs:$src1))>, + Requires<[HasV5T]>; + +def : Pat <(i1 (setult (f64 DoubleRegs:$src1), (fpimm:$src2))), + (i1 (FCMPUGT64_rr (f64 (CONST64_Float_Real fpimm:$src2)), + (f64 DoubleRegs:$src1)))>, + Requires<[HasV5T]>; + +// le. +// rs <= rt -> rt >= rs. +def : Pat<(i1 (setole (f32 IntRegs:$src1), (f32 IntRegs:$src2))), + (i1 (FCMPOGE32_rr IntRegs:$src2, IntRegs:$src1))>, + Requires<[HasV5T]>; + +def : Pat<(i1 (setole (f32 IntRegs:$src1), (fpimm:$src2))), + (i1 (FCMPOGE32_rr (f32 (TFRI_f fpimm:$src2)), IntRegs:$src1))>, + Requires<[HasV5T]>; + + +// Rss <= Rtt -> Rtt >= Rss. +def : Pat<(i1 (setole (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))), + (i1 (FCMPOGE64_rr DoubleRegs:$src2, DoubleRegs:$src1))>, + Requires<[HasV5T]>; + +def : Pat<(i1 (setole (f64 DoubleRegs:$src1), (fpimm:$src2))), + (i1 (FCMPOGE64_rr (f64 (CONST64_Float_Real fpimm:$src2)), + DoubleRegs:$src1))>, + Requires<[HasV5T]>; + +// rs <= rt -> rt >= rs. +def : Pat<(i1 (setule (f32 IntRegs:$src1), (f32 IntRegs:$src2))), + (i1 (FCMPUGE32_rr IntRegs:$src2, IntRegs:$src1))>, + Requires<[HasV5T]>; + +def : Pat<(i1 (setule (f32 IntRegs:$src1), (fpimm:$src2))), + (i1 (FCMPUGE32_rr (f32 (TFRI_f fpimm:$src2)), IntRegs:$src1))>, + Requires<[HasV5T]>; + +// Rss <= Rtt -> Rtt >= Rss. +def : Pat<(i1 (setule (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))), + (i1 (FCMPUGE64_rr DoubleRegs:$src2, DoubleRegs:$src1))>, + Requires<[HasV5T]>; + +def : Pat<(i1 (setule (f64 DoubleRegs:$src1), (fpimm:$src2))), + (i1 (FCMPUGE64_rr (f64 (CONST64_Float_Real fpimm:$src2)), + DoubleRegs:$src1))>, + Requires<[HasV5T]>; + +// ne. 
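+// (There is likewise no fcmp.ne instruction: setone/setune are synthesized
+// below by negating the corresponding eq compare with NOT_p.)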
+def : Pat<(i1 (setone (f32 IntRegs:$src1), (f32 IntRegs:$src2))), + (i1 (NOT_p (FCMPOEQ32_rr IntRegs:$src1, IntRegs:$src2)))>, + Requires<[HasV5T]>; + +def : Pat<(i1 (setone (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))), + (i1 (NOT_p (FCMPOEQ64_rr DoubleRegs:$src1, DoubleRegs:$src2)))>, + Requires<[HasV5T]>; + +def : Pat<(i1 (setune (f32 IntRegs:$src1), (f32 IntRegs:$src2))), + (i1 (NOT_p (FCMPUEQ32_rr IntRegs:$src1, IntRegs:$src2)))>, + Requires<[HasV5T]>; + +def : Pat<(i1 (setune (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))), + (i1 (NOT_p (FCMPUEQ64_rr DoubleRegs:$src1, DoubleRegs:$src2)))>, + Requires<[HasV5T]>; + +def : Pat<(i1 (setone (f32 IntRegs:$src1), (fpimm:$src2))), + (i1 (NOT_p (FCMPOEQ32_rr IntRegs:$src1, (f32 (TFRI_f fpimm:$src2)))))>, + Requires<[HasV5T]>; + +def : Pat<(i1 (setone (f64 DoubleRegs:$src1), (fpimm:$src2))), + (i1 (NOT_p (FCMPOEQ64_rr DoubleRegs:$src1, + (f64 (CONST64_Float_Real fpimm:$src2)))))>, + Requires<[HasV5T]>; + +def : Pat<(i1 (setune (f32 IntRegs:$src1), (fpimm:$src2))), + (i1 (NOT_p (FCMPUEQ32_rr IntRegs:$src1, (f32 (TFRI_f fpimm:$src2)))))>, + Requires<[HasV5T]>; + +def : Pat<(i1 (setune (f64 DoubleRegs:$src1), (fpimm:$src2))), + (i1 (NOT_p (FCMPUEQ64_rr DoubleRegs:$src1, + (f64 (CONST64_Float_Real fpimm:$src2)))))>, + Requires<[HasV5T]>; + +// Convert Integer to Floating Point. +def CONVERT_d2sf : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_d2sf($src)", + [(set (f32 IntRegs:$dst), (sint_to_fp (i64 DoubleRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_ud2sf : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_ud2sf($src)", + [(set (f32 IntRegs:$dst), (uint_to_fp (i64 DoubleRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_uw2sf : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src), + "$dst = convert_uw2sf($src)", + [(set (f32 IntRegs:$dst), (uint_to_fp (i32 IntRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_w2sf : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src), + "$dst = convert_w2sf($src)", + [(set (f32 IntRegs:$dst), (sint_to_fp (i32 IntRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_d2df : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_d2df($src)", + [(set (f64 DoubleRegs:$dst), (sint_to_fp (i64 DoubleRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_ud2df : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_ud2df($src)", + [(set (f64 DoubleRegs:$dst), (uint_to_fp (i64 DoubleRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_uw2df : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src), + "$dst = convert_uw2df($src)", + [(set (f64 DoubleRegs:$dst), (uint_to_fp (i32 IntRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_w2df : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src), + "$dst = convert_w2df($src)", + [(set (f64 DoubleRegs:$dst), (sint_to_fp (i32 IntRegs:$src)))]>, + Requires<[HasV5T]>; + +// Convert Floating Point to Integer - default. 
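+// (The :chop variants truncate toward zero, which matches the semantics of
+// fp_to_sint/fp_to_uint -- in C terms, (int)-3.7 == -3. The suffix-less
+// variants further down round in the current IEEE rounding mode and are
+// therefore only selected under the IEEERndNearV5T predicate.)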
+def CONVERT_df2uw : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_df2uw($src):chop", + [(set (i32 IntRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_df2w : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_df2w($src):chop", + [(set (i32 IntRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_sf2uw : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src), + "$dst = convert_sf2uw($src):chop", + [(set (i32 IntRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_sf2w : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src), + "$dst = convert_sf2w($src):chop", + [(set (i32 IntRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_df2d : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_df2d($src):chop", + [(set (i64 DoubleRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_df2ud : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_df2ud($src):chop", + [(set (i64 DoubleRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_sf2d : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src), + "$dst = convert_sf2d($src):chop", + [(set (i64 DoubleRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_sf2ud : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src), + "$dst = convert_sf2ud($src):chop", + [(set (i64 DoubleRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>, + Requires<[HasV5T]>; + +// Convert Floating Point to Integer: non-chopped. +let AddedComplexity = 20 in +def CONVERT_df2uw_nchop : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_df2uw($src)", + [(set (i32 IntRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>, + Requires<[HasV5T, IEEERndNearV5T]>; + +let AddedComplexity = 20 in +def CONVERT_df2w_nchop : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_df2w($src)", + [(set (i32 IntRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>, + Requires<[HasV5T, IEEERndNearV5T]>; + +let AddedComplexity = 20 in +def CONVERT_sf2uw_nchop : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src), + "$dst = convert_sf2uw($src)", + [(set (i32 IntRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>, + Requires<[HasV5T, IEEERndNearV5T]>; + +let AddedComplexity = 20 in +def CONVERT_sf2w_nchop : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src), + "$dst = convert_sf2w($src)", + [(set (i32 IntRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>, + Requires<[HasV5T, IEEERndNearV5T]>; + +let AddedComplexity = 20 in +def CONVERT_df2d_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_df2d($src)", + [(set (i64 DoubleRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>, + Requires<[HasV5T, IEEERndNearV5T]>; + +let AddedComplexity = 20 in +def CONVERT_df2ud_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_df2ud($src)", + [(set (i64 DoubleRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>, + Requires<[HasV5T, IEEERndNearV5T]>; + +let AddedComplexity = 20 in +def CONVERT_sf2d_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src), + "$dst = convert_sf2d($src)", + [(set (i64 DoubleRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>, + Requires<[HasV5T, IEEERndNearV5T]>; + +let AddedComplexity = 20 in +def CONVERT_sf2ud_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src), + "$dst = convert_sf2ud($src)", + [(set (i64 
DoubleRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>, + Requires<[HasV5T, IEEERndNearV5T]>; + + + +// Bitcast is different from [fp|sint|uint]_to_[sint|uint|fp]. +def : Pat <(i32 (bitconvert (f32 IntRegs:$src))), + (i32 (TFR IntRegs:$src))>, + Requires<[HasV5T]>; + +def : Pat <(f32 (bitconvert (i32 IntRegs:$src))), + (f32 (TFR IntRegs:$src))>, + Requires<[HasV5T]>; + +def : Pat <(i64 (bitconvert (f64 DoubleRegs:$src))), + (i64 (TFR64 DoubleRegs:$src))>, + Requires<[HasV5T]>; + +def : Pat <(f64 (bitconvert (i64 DoubleRegs:$src))), + (f64 (TFR64 DoubleRegs:$src))>, + Requires<[HasV5T]>; + +// Floating point fused multiply-add. +def FMADD_dp : ALU64_acc<(outs DoubleRegs:$dst), + (ins DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + "$dst += dfmpy($src2, $src3)", + [(set (f64 DoubleRegs:$dst), + (fma DoubleRegs:$src2, DoubleRegs:$src3, DoubleRegs:$src1))], + "$src1 = $dst">, + Requires<[HasV5T]>; + +def FMADD_sp : ALU64_acc<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "$dst += sfmpy($src2, $src3)", + [(set (f32 IntRegs:$dst), + (fma IntRegs:$src2, IntRegs:$src3, IntRegs:$src1))], + "$src1 = $dst">, + Requires<[HasV5T]>; + + +// Floating point max/min. +let AddedComplexity = 100 in +def FMAX_dp : ALU64_rr<(outs DoubleRegs:$dst), + (ins DoubleRegs:$src1, DoubleRegs:$src2), + "$dst = dfmax($src1, $src2)", + [(set DoubleRegs:$dst, (f64 (select (i1 (setolt DoubleRegs:$src2, + DoubleRegs:$src1)), + DoubleRegs:$src1, + DoubleRegs:$src2)))]>, + Requires<[HasV5T]>; + +let AddedComplexity = 100 in +def FMAX_sp : ALU64_rr<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = sfmax($src1, $src2)", + [(set IntRegs:$dst, (f32 (select (i1 (setolt IntRegs:$src2, + IntRegs:$src1)), + IntRegs:$src1, + IntRegs:$src2)))]>, + Requires<[HasV5T]>; + +let AddedComplexity = 100 in +def FMIN_dp : ALU64_rr<(outs DoubleRegs:$dst), + (ins DoubleRegs:$src1, DoubleRegs:$src2), + "$dst = dfmin($src1, $src2)", + [(set DoubleRegs:$dst, (f64 (select (i1 (setogt DoubleRegs:$src2, + DoubleRegs:$src1)), + DoubleRegs:$src1, + DoubleRegs:$src2)))]>, + Requires<[HasV5T]>; + +let AddedComplexity = 100 in +def FMIN_sp : ALU64_rr<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = sfmin($src1, $src2)", + [(set IntRegs:$dst, (f32 (select (i1 (setogt IntRegs:$src2, + IntRegs:$src1)), + IntRegs:$src1, + IntRegs:$src2)))]>, + Requires<[HasV5T]>; + +// Pseudo instruction to encode a set of conditional transfers. +// This instruction is used instead of a mux and trades off code size +// for performance. We conduct this transformation optimistically in +// the hope that these instructions get promoted to dot-new transfers. 
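+// (For example, r0 = mux(p0, r1, r2) can instead be emitted as the pair
+// "if (p0) r0 = r1" / "if (!p0) r0 = r2", which can packetize with the
+// compare that defines p0 once promoted to dot-new form; a sketch of the
+// intent, not of the exact expansion.)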
+let AddedComplexity = 100, isPredicated = 1 in +def TFR_condset_rr_f : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, + IntRegs:$src2, + IntRegs:$src3), + "Error; should not emit", + [(set IntRegs:$dst, (f32 (select PredRegs:$src1, + IntRegs:$src2, + IntRegs:$src3)))]>, + Requires<[HasV5T]>; + +let AddedComplexity = 100, isPredicated = 1 in +def TFR_condset_rr64_f : ALU32_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1, + DoubleRegs:$src2, + DoubleRegs:$src3), + "Error; should not emit", + [(set DoubleRegs:$dst, (f64 (select PredRegs:$src1, + DoubleRegs:$src2, + DoubleRegs:$src3)))]>, + Requires<[HasV5T]>; + + + +let AddedComplexity = 100, isPredicated = 1 in +def TFR_condset_ri_f : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, f32imm:$src3), + "Error; should not emit", + [(set IntRegs:$dst, + (f32 (select PredRegs:$src1, IntRegs:$src2, fpimm:$src3)))]>, + Requires<[HasV5T]>; + +let AddedComplexity = 100, isPredicated = 1 in +def TFR_condset_ir_f : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, f32imm:$src2, IntRegs:$src3), + "Error; should not emit", + [(set IntRegs:$dst, + (f32 (select PredRegs:$src1, fpimm:$src2, IntRegs:$src3)))]>, + Requires<[HasV5T]>; + +let AddedComplexity = 100, isPredicated = 1 in +def TFR_condset_ii_f : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, f32imm:$src2, f32imm:$src3), + "Error; should not emit", + [(set IntRegs:$dst, (f32 (select PredRegs:$src1, + fpimm:$src2, + fpimm:$src3)))]>, + Requires<[HasV5T]>; + + +def : Pat <(select (i1 (setult (f32 IntRegs:$src1), (f32 IntRegs:$src2))), + (f32 IntRegs:$src3), + (f32 IntRegs:$src4)), + (TFR_condset_rr_f (FCMPUGT32_rr IntRegs:$src2, IntRegs:$src1), IntRegs:$src4, + IntRegs:$src3)>, Requires<[HasV5T]>; + +def : Pat <(select (i1 (setult (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))), + (f64 DoubleRegs:$src3), + (f64 DoubleRegs:$src4)), + (TFR_condset_rr64_f (FCMPUGT64_rr DoubleRegs:$src2, DoubleRegs:$src1), + DoubleRegs:$src4, DoubleRegs:$src3)>, Requires<[HasV5T]>; + +// Map from p0 = pnot(p0); r0 = mux(p0, #i, #j) => r0 = mux(p0, #j, #i). 
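+// (Swapping the two mux inputs absorbs the predicate negation, so the
+// pnot never needs to be materialized.)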
+def : Pat <(select (not PredRegs:$src1), fpimm:$src2, fpimm:$src3), + (TFR_condset_ii_f PredRegs:$src1, fpimm:$src3, fpimm:$src2)>; + +// Map from p0 = pnot(p0); r0 = select(p0, #i, r1) +// => r0 = TFR_condset_ri(p0, r1, #i) +def : Pat <(select (not PredRegs:$src1), fpimm:$src2, IntRegs:$src3), + (TFR_condset_ri_f PredRegs:$src1, IntRegs:$src3, fpimm:$src2)>; + +// Map from p0 = pnot(p0); r0 = mux(p0, r1, #i) +// => r0 = TFR_condset_ir(p0, #i, r1) +def : Pat <(select (not PredRegs:$src1), IntRegs:$src2, fpimm:$src3), + (TFR_condset_ir_f PredRegs:$src1, fpimm:$src3, IntRegs:$src2)>; + +def : Pat <(i32 (fp_to_sint (f64 DoubleRegs:$src1))), + (i32 (EXTRACT_SUBREG (i64 (CONVERT_df2d (f64 DoubleRegs:$src1))), subreg_loreg))>, + Requires<[HasV5T]>; + +def : Pat <(fabs (f32 IntRegs:$src1)), + (CLRBIT_31 (f32 IntRegs:$src1), 31)>, + Requires<[HasV5T]>; + +def : Pat <(fneg (f32 IntRegs:$src1)), + (TOGBIT_31 (f32 IntRegs:$src1), 31)>, + Requires<[HasV5T]>; + +/* +def : Pat <(fabs (f64 DoubleRegs:$src1)), + (CLRBIT_31 (f32 (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg)), 31)>, + Requires<[HasV5T]>; + +def : Pat <(fabs (f64 DoubleRegs:$src1)), + (CLRBIT_31 (f32 (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg)), 31)>, + Requires<[HasV5T]>; + */ diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td index b15e293..99f59d5 100644 --- a/lib/Target/Hexagon/HexagonIntrinsics.td +++ b/lib/Target/Hexagon/HexagonIntrinsics.td @@ -551,13 +551,6 @@ class di_SInst_diu6u6<string opc, Intrinsic IntID> [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2, imm:$src3))]>; -class di_SInst_didisi<string opc, Intrinsic IntID> - : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, - IntRegs:$src3), - !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, $src3)")), - [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2, - IntRegs:$src3))]>; - class di_SInst_didiqi<string opc, Intrinsic IntID> : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), @@ -818,6 +811,11 @@ class di_MInst_s8s8<string opc, Intrinsic IntID> !strconcat("$dst = ", !strconcat(opc , "(#$src1, #$src2)")), [(set DoubleRegs:$dst, (IntID imm:$src1, imm:$src2))]>; +class si_MInst_sis9<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s9Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>; + class si_MInst_sisi<string opc, Intrinsic IntID> : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), @@ -952,6 +950,17 @@ class si_SInst_sisi_sat<string opc, Intrinsic IntID> !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")), [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; +class si_SInst_didi_sat<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>; + +class si_SInst_disi_s1_rnd_sat<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1, $src2):<<1:rnd:sat")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2))]>; + class si_MInst_sisi_s1_rnd_sat<string opc, Intrinsic IntID> : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), !strconcat("$dst = ", !strconcat(opc 
, @@ -1612,6 +1621,18 @@ class di_MInst_dididi_acc_rnd_sat<string opc, Intrinsic IntID> DoubleRegs:$src2))], "$dst2 = $dst">; +class di_MInst_dididi_acc_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, + DoubleRegs:$src1, + DoubleRegs:$src2), + !strconcat("$dst += ", + !strconcat(opc , "($src1, $src2):<<1")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, + DoubleRegs:$src1, + DoubleRegs:$src2))], + "$dst2 = $dst">; + + class di_MInst_dididi_acc_s1_sat<string opc, Intrinsic IntID> : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1, @@ -1822,53 +1843,63 @@ class si_MInst_didi<string opc, Intrinsic IntID> !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>; +// +// LDInst classes. +// +let mayLoad = 1, neverHasSideEffects = 1 in +class di_LDInstPI_diu4<string opc, Intrinsic IntID> + : LDInstPI<(outs IntRegs:$dst, DoubleRegs:$dst2), + (ins IntRegs:$src1, IntRegs:$src2, CRRegs:$src3, s4Imm:$offset), + "$dst2 = memd($src1++#$offset:circ($src3))", + [], + "$src1 = $dst">; /******************************************************************** * ALU32/ALU * *********************************************************************/ // ALU32 / ALU / Add. -def Hexagon_A2_add: +def HEXAGON_A2_add: si_ALU32_sisi <"add", int_hexagon_A2_add>; -def Hexagon_A2_addi: +def HEXAGON_A2_addi: si_ALU32_sis16 <"add", int_hexagon_A2_addi>; // ALU32 / ALU / Logical operations. -def Hexagon_A2_and: +def HEXAGON_A2_and: si_ALU32_sisi <"and", int_hexagon_A2_and>; -def Hexagon_A2_andir: +def HEXAGON_A2_andir: si_ALU32_sis10 <"and", int_hexagon_A2_andir>; -def Hexagon_A2_not: +def HEXAGON_A2_not: si_ALU32_si <"not", int_hexagon_A2_not>; -def Hexagon_A2_or: +def HEXAGON_A2_or: si_ALU32_sisi <"or", int_hexagon_A2_or>; -def Hexagon_A2_orir: +def HEXAGON_A2_orir: si_ALU32_sis10 <"or", int_hexagon_A2_orir>; -def Hexagon_A2_xor: +def HEXAGON_A2_xor: si_ALU32_sisi <"xor", int_hexagon_A2_xor>; // ALU32 / ALU / Negate. -def Hexagon_A2_neg: +def HEXAGON_A2_neg: si_ALU32_si <"neg", int_hexagon_A2_neg>; // ALU32 / ALU / Subtract. -def Hexagon_A2_sub: +def HEXAGON_A2_sub: si_ALU32_sisi <"sub", int_hexagon_A2_sub>; -def Hexagon_A2_subri: +def HEXAGON_A2_subri: si_ALU32_s10si <"sub", int_hexagon_A2_subri>; // ALU32 / ALU / Transfer Immediate. -def Hexagon_A2_tfril: +def HEXAGON_A2_tfril: si_lo_ALU32_siu16 <"", int_hexagon_A2_tfril>; -def Hexagon_A2_tfrih: +def HEXAGON_A2_tfrih: si_hi_ALU32_siu16 <"", int_hexagon_A2_tfrih>; -def Hexagon_A2_tfrsi: +def HEXAGON_A2_tfrsi: si_ALU32_s16 <"", int_hexagon_A2_tfrsi>; -def Hexagon_A2_tfrpi: +def HEXAGON_A2_tfrpi: di_ALU32_s8 <"", int_hexagon_A2_tfrpi>; // ALU32 / ALU / Transfer Register. -def Hexagon_A2_tfr: +def HEXAGON_A2_tfr: si_ALU32_si_tfr <"", int_hexagon_A2_tfr>; /******************************************************************** @@ -1876,45 +1907,45 @@ def Hexagon_A2_tfr: *********************************************************************/ // ALU32 / PERM / Combine. 
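// Sketch of the combine semantics assumed by the defs that follow: the
// first source supplies the high 32 bits of the 64-bit register pair and
// the second the low 32 bits (hedged reading of "Rdd = combine(Rs, Rt)";
// illustrative only).
#include <cstdint>
static inline uint64_t combinew(uint32_t rs, uint32_t rt) {
  return (uint64_t(rs) << 32) | rt;   // Rs -> high word, Rt -> low word
}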
-def Hexagon_A2_combinew: +def HEXAGON_A2_combinew: di_ALU32_sisi <"combine", int_hexagon_A2_combinew>; -def Hexagon_A2_combine_hh: +def HEXAGON_A2_combine_hh: si_MInst_sisi_hh <"combine", int_hexagon_A2_combine_hh>; -def Hexagon_A2_combine_lh: +def HEXAGON_A2_combine_lh: si_MInst_sisi_lh <"combine", int_hexagon_A2_combine_lh>; -def Hexagon_A2_combine_hl: +def HEXAGON_A2_combine_hl: si_MInst_sisi_hl <"combine", int_hexagon_A2_combine_hl>; -def Hexagon_A2_combine_ll: +def HEXAGON_A2_combine_ll: si_MInst_sisi_ll <"combine", int_hexagon_A2_combine_ll>; -def Hexagon_A2_combineii: +def HEXAGON_A2_combineii: di_MInst_s8s8 <"combine", int_hexagon_A2_combineii>; // ALU32 / PERM / Mux. -def Hexagon_C2_mux: +def HEXAGON_C2_mux: si_ALU32_qisisi <"mux", int_hexagon_C2_mux>; -def Hexagon_C2_muxri: +def HEXAGON_C2_muxri: si_ALU32_qis8si <"mux", int_hexagon_C2_muxri>; -def Hexagon_C2_muxir: +def HEXAGON_C2_muxir: si_ALU32_qisis8 <"mux", int_hexagon_C2_muxir>; -def Hexagon_C2_muxii: +def HEXAGON_C2_muxii: si_ALU32_qis8s8 <"mux", int_hexagon_C2_muxii>; // ALU32 / PERM / Shift halfword. -def Hexagon_A2_aslh: +def HEXAGON_A2_aslh: si_ALU32_si <"aslh", int_hexagon_A2_aslh>; -def Hexagon_A2_asrh: +def HEXAGON_A2_asrh: si_ALU32_si <"asrh", int_hexagon_A2_asrh>; def SI_to_SXTHI_asrh: si_ALU32_si <"asrh", int_hexagon_SI_to_SXTHI_asrh>; // ALU32 / PERM / Sign/zero extend. -def Hexagon_A2_sxth: +def HEXAGON_A2_sxth: si_ALU32_si <"sxth", int_hexagon_A2_sxth>; -def Hexagon_A2_sxtb: +def HEXAGON_A2_sxtb: si_ALU32_si <"sxtb", int_hexagon_A2_sxtb>; -def Hexagon_A2_zxth: +def HEXAGON_A2_zxth: si_ALU32_si <"zxth", int_hexagon_A2_zxth>; -def Hexagon_A2_zxtb: +def HEXAGON_A2_zxtb: si_ALU32_si <"zxtb", int_hexagon_A2_zxtb>; /******************************************************************** @@ -1922,25 +1953,25 @@ def Hexagon_A2_zxtb: *********************************************************************/ // ALU32 / PRED / Compare. -def Hexagon_C2_cmpeq: +def HEXAGON_C2_cmpeq: qi_ALU32_sisi <"cmp.eq", int_hexagon_C2_cmpeq>; -def Hexagon_C2_cmpeqi: +def HEXAGON_C2_cmpeqi: qi_ALU32_sis10 <"cmp.eq", int_hexagon_C2_cmpeqi>; -def Hexagon_C2_cmpgei: +def HEXAGON_C2_cmpgei: qi_ALU32_sis8 <"cmp.ge", int_hexagon_C2_cmpgei>; -def Hexagon_C2_cmpgeui: +def HEXAGON_C2_cmpgeui: qi_ALU32_siu8 <"cmp.geu", int_hexagon_C2_cmpgeui>; -def Hexagon_C2_cmpgt: +def HEXAGON_C2_cmpgt: qi_ALU32_sisi <"cmp.gt", int_hexagon_C2_cmpgt>; -def Hexagon_C2_cmpgti: +def HEXAGON_C2_cmpgti: qi_ALU32_sis10 <"cmp.gt", int_hexagon_C2_cmpgti>; -def Hexagon_C2_cmpgtu: +def HEXAGON_C2_cmpgtu: qi_ALU32_sisi <"cmp.gtu", int_hexagon_C2_cmpgtu>; -def Hexagon_C2_cmpgtui: +def HEXAGON_C2_cmpgtui: qi_ALU32_siu9 <"cmp.gtu", int_hexagon_C2_cmpgtui>; -def Hexagon_C2_cmplt: +def HEXAGON_C2_cmplt: qi_ALU32_sisi <"cmp.lt", int_hexagon_C2_cmplt>; -def Hexagon_C2_cmpltu: +def HEXAGON_C2_cmpltu: qi_ALU32_sisi <"cmp.ltu", int_hexagon_C2_cmpltu>; /******************************************************************** @@ -1949,27 +1980,27 @@ def Hexagon_C2_cmpltu: // ALU32 / VH / Vector add halfwords. // Rd32=vadd[u]h(Rs32,Rt32:sat] -def Hexagon_A2_svaddh: +def HEXAGON_A2_svaddh: si_ALU32_sisi <"vaddh", int_hexagon_A2_svaddh>; -def Hexagon_A2_svaddhs: +def HEXAGON_A2_svaddhs: si_ALU32_sisi_sat <"vaddh", int_hexagon_A2_svaddhs>; -def Hexagon_A2_svadduhs: +def HEXAGON_A2_svadduhs: si_ALU32_sisi_sat <"vadduh", int_hexagon_A2_svadduhs>; // ALU32 / VH / Vector average halfwords. 
-def Hexagon_A2_svavgh: +def HEXAGON_A2_svavgh: si_ALU32_sisi <"vavgh", int_hexagon_A2_svavgh>; -def Hexagon_A2_svavghs: +def HEXAGON_A2_svavghs: si_ALU32_sisi_rnd <"vavgh", int_hexagon_A2_svavghs>; -def Hexagon_A2_svnavgh: +def HEXAGON_A2_svnavgh: si_ALU32_sisi <"vnavgh", int_hexagon_A2_svnavgh>; // ALU32 / VH / Vector subtract halfwords. -def Hexagon_A2_svsubh: +def HEXAGON_A2_svsubh: si_ALU32_sisi <"vsubh", int_hexagon_A2_svsubh>; -def Hexagon_A2_svsubhs: +def HEXAGON_A2_svsubhs: si_ALU32_sisi_sat <"vsubh", int_hexagon_A2_svsubhs>; -def Hexagon_A2_svsubuhs: +def HEXAGON_A2_svsubuhs: si_ALU32_sisi_sat <"vsubuh", int_hexagon_A2_svsubuhs>; /******************************************************************** @@ -1977,109 +2008,109 @@ def Hexagon_A2_svsubuhs: *********************************************************************/ // ALU64 / ALU / Add. -def Hexagon_A2_addp: +def HEXAGON_A2_addp: di_ALU64_didi <"add", int_hexagon_A2_addp>; -def Hexagon_A2_addsat: +def HEXAGON_A2_addsat: si_ALU64_sisi_sat <"add", int_hexagon_A2_addsat>; // ALU64 / ALU / Add halfword. // Even though the definition says hl, it should be lh - //so DON'T change the class " si_ALU64_sisi_l16_lh " it inherits. -def Hexagon_A2_addh_l16_hl: +def HEXAGON_A2_addh_l16_hl: si_ALU64_sisi_l16_lh <"add", int_hexagon_A2_addh_l16_hl>; -def Hexagon_A2_addh_l16_ll: +def HEXAGON_A2_addh_l16_ll: si_ALU64_sisi_l16_ll <"add", int_hexagon_A2_addh_l16_ll>; -def Hexagon_A2_addh_l16_sat_hl: +def HEXAGON_A2_addh_l16_sat_hl: si_ALU64_sisi_l16_sat_lh <"add", int_hexagon_A2_addh_l16_sat_hl>; -def Hexagon_A2_addh_l16_sat_ll: +def HEXAGON_A2_addh_l16_sat_ll: si_ALU64_sisi_l16_sat_ll <"add", int_hexagon_A2_addh_l16_sat_ll>; -def Hexagon_A2_addh_h16_hh: +def HEXAGON_A2_addh_h16_hh: si_ALU64_sisi_h16_hh <"add", int_hexagon_A2_addh_h16_hh>; -def Hexagon_A2_addh_h16_hl: +def HEXAGON_A2_addh_h16_hl: si_ALU64_sisi_h16_hl <"add", int_hexagon_A2_addh_h16_hl>; -def Hexagon_A2_addh_h16_lh: +def HEXAGON_A2_addh_h16_lh: si_ALU64_sisi_h16_lh <"add", int_hexagon_A2_addh_h16_lh>; -def Hexagon_A2_addh_h16_ll: +def HEXAGON_A2_addh_h16_ll: si_ALU64_sisi_h16_ll <"add", int_hexagon_A2_addh_h16_ll>; -def Hexagon_A2_addh_h16_sat_hh: +def HEXAGON_A2_addh_h16_sat_hh: si_ALU64_sisi_h16_sat_hh <"add", int_hexagon_A2_addh_h16_sat_hh>; -def Hexagon_A2_addh_h16_sat_hl: +def HEXAGON_A2_addh_h16_sat_hl: si_ALU64_sisi_h16_sat_hl <"add", int_hexagon_A2_addh_h16_sat_hl>; -def Hexagon_A2_addh_h16_sat_lh: +def HEXAGON_A2_addh_h16_sat_lh: si_ALU64_sisi_h16_sat_lh <"add", int_hexagon_A2_addh_h16_sat_lh>; -def Hexagon_A2_addh_h16_sat_ll: +def HEXAGON_A2_addh_h16_sat_ll: si_ALU64_sisi_h16_sat_ll <"add", int_hexagon_A2_addh_h16_sat_ll>; // ALU64 / ALU / Compare. -def Hexagon_C2_cmpeqp: +def HEXAGON_C2_cmpeqp: qi_ALU64_didi <"cmp.eq", int_hexagon_C2_cmpeqp>; -def Hexagon_C2_cmpgtp: +def HEXAGON_C2_cmpgtp: qi_ALU64_didi <"cmp.gt", int_hexagon_C2_cmpgtp>; -def Hexagon_C2_cmpgtup: +def HEXAGON_C2_cmpgtup: qi_ALU64_didi <"cmp.gtu", int_hexagon_C2_cmpgtup>; // ALU64 / ALU / Logical operations. -def Hexagon_A2_andp: +def HEXAGON_A2_andp: di_ALU64_didi <"and", int_hexagon_A2_andp>; -def Hexagon_A2_orp: +def HEXAGON_A2_orp: di_ALU64_didi <"or", int_hexagon_A2_orp>; -def Hexagon_A2_xorp: +def HEXAGON_A2_xorp: di_ALU64_didi <"xor", int_hexagon_A2_xorp>; // ALU64 / ALU / Maximum. -def Hexagon_A2_max: +def HEXAGON_A2_max: si_ALU64_sisi <"max", int_hexagon_A2_max>; -def Hexagon_A2_maxu: +def HEXAGON_A2_maxu: si_ALU64_sisi <"maxu", int_hexagon_A2_maxu>; // ALU64 / ALU / Minimum. 
-def Hexagon_A2_min: +def HEXAGON_A2_min: si_ALU64_sisi <"min", int_hexagon_A2_min>; -def Hexagon_A2_minu: +def HEXAGON_A2_minu: si_ALU64_sisi <"minu", int_hexagon_A2_minu>; // ALU64 / ALU / Subtract. -def Hexagon_A2_subp: +def HEXAGON_A2_subp: di_ALU64_didi <"sub", int_hexagon_A2_subp>; -def Hexagon_A2_subsat: +def HEXAGON_A2_subsat: si_ALU64_sisi_sat <"sub", int_hexagon_A2_subsat>; // ALU64 / ALU / Subtract halfword. // Even though the definition says hl, it should be lh - //so DON'T change the class " si_ALU64_sisi_l16_lh " it inherits. -def Hexagon_A2_subh_l16_hl: +def HEXAGON_A2_subh_l16_hl: si_ALU64_sisi_l16_lh <"sub", int_hexagon_A2_subh_l16_hl>; -def Hexagon_A2_subh_l16_ll: +def HEXAGON_A2_subh_l16_ll: si_ALU64_sisi_l16_ll <"sub", int_hexagon_A2_subh_l16_ll>; -def Hexagon_A2_subh_l16_sat_hl: +def HEXAGON_A2_subh_l16_sat_hl: si_ALU64_sisi_l16_sat_lh <"sub", int_hexagon_A2_subh_l16_sat_hl>; -def Hexagon_A2_subh_l16_sat_ll: +def HEXAGON_A2_subh_l16_sat_ll: si_ALU64_sisi_l16_sat_ll <"sub", int_hexagon_A2_subh_l16_sat_ll>; -def Hexagon_A2_subh_h16_hh: +def HEXAGON_A2_subh_h16_hh: si_ALU64_sisi_h16_hh <"sub", int_hexagon_A2_subh_h16_hh>; -def Hexagon_A2_subh_h16_hl: +def HEXAGON_A2_subh_h16_hl: si_ALU64_sisi_h16_hl <"sub", int_hexagon_A2_subh_h16_hl>; -def Hexagon_A2_subh_h16_lh: +def HEXAGON_A2_subh_h16_lh: si_ALU64_sisi_h16_lh <"sub", int_hexagon_A2_subh_h16_lh>; -def Hexagon_A2_subh_h16_ll: +def HEXAGON_A2_subh_h16_ll: si_ALU64_sisi_h16_ll <"sub", int_hexagon_A2_subh_h16_ll>; -def Hexagon_A2_subh_h16_sat_hh: +def HEXAGON_A2_subh_h16_sat_hh: si_ALU64_sisi_h16_sat_hh <"sub", int_hexagon_A2_subh_h16_sat_hh>; -def Hexagon_A2_subh_h16_sat_hl: +def HEXAGON_A2_subh_h16_sat_hl: si_ALU64_sisi_h16_sat_hl <"sub", int_hexagon_A2_subh_h16_sat_hl>; -def Hexagon_A2_subh_h16_sat_lh: +def HEXAGON_A2_subh_h16_sat_lh: si_ALU64_sisi_h16_sat_lh <"sub", int_hexagon_A2_subh_h16_sat_lh>; -def Hexagon_A2_subh_h16_sat_ll: +def HEXAGON_A2_subh_h16_sat_ll: si_ALU64_sisi_h16_sat_ll <"sub", int_hexagon_A2_subh_h16_sat_ll>; // ALU64 / ALU / Transfer register. -def Hexagon_A2_tfrp: +def HEXAGON_A2_tfrp: di_ALU64_di <"", int_hexagon_A2_tfrp>; /******************************************************************** @@ -2087,7 +2118,7 @@ def Hexagon_A2_tfrp: *********************************************************************/ // ALU64 / BIT / Masked parity. -def Hexagon_S2_parityp: +def HEXAGON_S2_parityp: si_ALU64_didi <"parity", int_hexagon_S2_parityp>; /******************************************************************** @@ -2095,7 +2126,7 @@ def Hexagon_S2_parityp: *********************************************************************/ // ALU64 / PERM / Vector pack high and low halfwords. -def Hexagon_S2_packhl: +def HEXAGON_S2_packhl: di_ALU64_sisi <"packhl", int_hexagon_S2_packhl>; /******************************************************************** @@ -2103,37 +2134,37 @@ def Hexagon_S2_packhl: *********************************************************************/ // ALU64 / VB / Vector add unsigned bytes. -def Hexagon_A2_vaddub: +def HEXAGON_A2_vaddub: di_ALU64_didi <"vaddub", int_hexagon_A2_vaddub>; -def Hexagon_A2_vaddubs: +def HEXAGON_A2_vaddubs: di_ALU64_didi_sat <"vaddub", int_hexagon_A2_vaddubs>; // ALU64 / VB / Vector average unsigned bytes. -def Hexagon_A2_vavgub: +def HEXAGON_A2_vavgub: di_ALU64_didi <"vavgub", int_hexagon_A2_vavgub>; -def Hexagon_A2_vavgubr: +def HEXAGON_A2_vavgubr: di_ALU64_didi_rnd <"vavgub", int_hexagon_A2_vavgubr>; // ALU64 / VB / Vector compare unsigned bytes. 
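// A hedged C++ model of the byte-wise compares defined next: each of the
// eight byte lanes of the two 64-bit pairs yields one predicate bit
// (lane ordering assumed; illustrative only, not from this patch).
#include <cstdint>
static inline uint8_t vcmpb_eq(uint64_t rss, uint64_t rtt) {
  uint8_t pred = 0;
  for (int i = 0; i < 8; ++i) {
    uint8_t a = uint8_t(rss >> (8 * i));
    uint8_t b = uint8_t(rtt >> (8 * i));
    if (a == b)
      pred |= uint8_t(1u << i);       // one predicate bit per byte lane
  }
  return pred;
}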
-def Hexagon_A2_vcmpbeq: +def HEXAGON_A2_vcmpbeq: qi_ALU64_didi <"vcmpb.eq", int_hexagon_A2_vcmpbeq>; -def Hexagon_A2_vcmpbgtu: +def HEXAGON_A2_vcmpbgtu: qi_ALU64_didi <"vcmpb.gtu",int_hexagon_A2_vcmpbgtu>; // ALU64 / VB / Vector maximum/minimum unsigned bytes. -def Hexagon_A2_vmaxub: +def HEXAGON_A2_vmaxub: di_ALU64_didi <"vmaxub", int_hexagon_A2_vmaxub>; -def Hexagon_A2_vminub: +def HEXAGON_A2_vminub: di_ALU64_didi <"vminub", int_hexagon_A2_vminub>; // ALU64 / VB / Vector subtract unsigned bytes. -def Hexagon_A2_vsubub: +def HEXAGON_A2_vsubub: di_ALU64_didi <"vsubub", int_hexagon_A2_vsubub>; -def Hexagon_A2_vsububs: +def HEXAGON_A2_vsububs: di_ALU64_didi_sat <"vsubub", int_hexagon_A2_vsububs>; // ALU64 / VB / Vector mux. -def Hexagon_C2_vmux: +def HEXAGON_C2_vmux: di_ALU64_qididi <"vmux", int_hexagon_C2_vmux>; @@ -2143,58 +2174,58 @@ def Hexagon_C2_vmux: // ALU64 / VH / Vector add halfwords. // Rdd64=vadd[u]h(Rss64,Rtt64:sat] -def Hexagon_A2_vaddh: +def HEXAGON_A2_vaddh: di_ALU64_didi <"vaddh", int_hexagon_A2_vaddh>; -def Hexagon_A2_vaddhs: +def HEXAGON_A2_vaddhs: di_ALU64_didi_sat <"vaddh", int_hexagon_A2_vaddhs>; -def Hexagon_A2_vadduhs: +def HEXAGON_A2_vadduhs: di_ALU64_didi_sat <"vadduh", int_hexagon_A2_vadduhs>; // ALU64 / VH / Vector average halfwords. // Rdd64=v[n]avg[u]h(Rss64,Rtt64:rnd/:crnd][:sat] -def Hexagon_A2_vavgh: +def HEXAGON_A2_vavgh: di_ALU64_didi <"vavgh", int_hexagon_A2_vavgh>; -def Hexagon_A2_vavghcr: +def HEXAGON_A2_vavghcr: di_ALU64_didi_crnd <"vavgh", int_hexagon_A2_vavghcr>; -def Hexagon_A2_vavghr: +def HEXAGON_A2_vavghr: di_ALU64_didi_rnd <"vavgh", int_hexagon_A2_vavghr>; -def Hexagon_A2_vavguh: +def HEXAGON_A2_vavguh: di_ALU64_didi <"vavguh", int_hexagon_A2_vavguh>; -def Hexagon_A2_vavguhr: +def HEXAGON_A2_vavguhr: di_ALU64_didi_rnd <"vavguh", int_hexagon_A2_vavguhr>; -def Hexagon_A2_vnavgh: +def HEXAGON_A2_vnavgh: di_ALU64_didi <"vnavgh", int_hexagon_A2_vnavgh>; -def Hexagon_A2_vnavghcr: +def HEXAGON_A2_vnavghcr: di_ALU64_didi_crnd_sat <"vnavgh", int_hexagon_A2_vnavghcr>; -def Hexagon_A2_vnavghr: +def HEXAGON_A2_vnavghr: di_ALU64_didi_rnd_sat <"vnavgh", int_hexagon_A2_vnavghr>; // ALU64 / VH / Vector compare halfwords. -def Hexagon_A2_vcmpheq: +def HEXAGON_A2_vcmpheq: qi_ALU64_didi <"vcmph.eq", int_hexagon_A2_vcmpheq>; -def Hexagon_A2_vcmphgt: +def HEXAGON_A2_vcmphgt: qi_ALU64_didi <"vcmph.gt", int_hexagon_A2_vcmphgt>; -def Hexagon_A2_vcmphgtu: +def HEXAGON_A2_vcmphgtu: qi_ALU64_didi <"vcmph.gtu",int_hexagon_A2_vcmphgtu>; // ALU64 / VH / Vector maximum halfwords. -def Hexagon_A2_vmaxh: +def HEXAGON_A2_vmaxh: di_ALU64_didi <"vmaxh", int_hexagon_A2_vmaxh>; -def Hexagon_A2_vmaxuh: +def HEXAGON_A2_vmaxuh: di_ALU64_didi <"vmaxuh", int_hexagon_A2_vmaxuh>; // ALU64 / VH / Vector minimum halfwords. -def Hexagon_A2_vminh: +def HEXAGON_A2_vminh: di_ALU64_didi <"vminh", int_hexagon_A2_vminh>; -def Hexagon_A2_vminuh: +def HEXAGON_A2_vminuh: di_ALU64_didi <"vminuh", int_hexagon_A2_vminuh>; // ALU64 / VH / Vector subtract halfwords. -def Hexagon_A2_vsubh: +def HEXAGON_A2_vsubh: di_ALU64_didi <"vsubh", int_hexagon_A2_vsubh>; -def Hexagon_A2_vsubhs: +def HEXAGON_A2_vsubhs: di_ALU64_didi_sat <"vsubh", int_hexagon_A2_vsubhs>; -def Hexagon_A2_vsubuhs: +def HEXAGON_A2_vsubuhs: di_ALU64_didi_sat <"vsubuh", int_hexagon_A2_vsubuhs>; @@ -2204,53 +2235,53 @@ def Hexagon_A2_vsubuhs: // ALU64 / VW / Vector add words. 
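// The ":sat" variants below clamp each lane instead of wrapping. A minimal
// sketch of one saturating 32-bit lane add, assuming the usual signed
// saturation to [INT32_MIN, INT32_MAX] (illustrative, not from this patch):
#include <cstdint>
static inline int32_t sat_add32(int32_t a, int32_t b) {
  int64_t s = int64_t(a) + int64_t(b);   // widen, then clamp
  if (s > INT32_MAX) s = INT32_MAX;
  if (s < INT32_MIN) s = INT32_MIN;
  return int32_t(s);
}
// vaddw applies the plain add, and vaddw:sat this clamped add, to each of
// the two 32-bit lanes of the 64-bit register pair.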
// Rdd32=vaddw(Rss32,Rtt32)[:sat] -def Hexagon_A2_vaddw: +def HEXAGON_A2_vaddw: di_ALU64_didi <"vaddw", int_hexagon_A2_vaddw>; -def Hexagon_A2_vaddws: +def HEXAGON_A2_vaddws: di_ALU64_didi_sat <"vaddw", int_hexagon_A2_vaddws>; // ALU64 / VW / Vector average words. -def Hexagon_A2_vavguw: +def HEXAGON_A2_vavguw: di_ALU64_didi <"vavguw", int_hexagon_A2_vavguw>; -def Hexagon_A2_vavguwr: +def HEXAGON_A2_vavguwr: di_ALU64_didi_rnd <"vavguw", int_hexagon_A2_vavguwr>; -def Hexagon_A2_vavgw: +def HEXAGON_A2_vavgw: di_ALU64_didi <"vavgw", int_hexagon_A2_vavgw>; -def Hexagon_A2_vavgwcr: +def HEXAGON_A2_vavgwcr: di_ALU64_didi_crnd <"vavgw", int_hexagon_A2_vavgwcr>; -def Hexagon_A2_vavgwr: +def HEXAGON_A2_vavgwr: di_ALU64_didi_rnd <"vavgw", int_hexagon_A2_vavgwr>; -def Hexagon_A2_vnavgw: +def HEXAGON_A2_vnavgw: di_ALU64_didi <"vnavgw", int_hexagon_A2_vnavgw>; -def Hexagon_A2_vnavgwcr: +def HEXAGON_A2_vnavgwcr: di_ALU64_didi_crnd_sat <"vnavgw", int_hexagon_A2_vnavgwcr>; -def Hexagon_A2_vnavgwr: +def HEXAGON_A2_vnavgwr: di_ALU64_didi_rnd_sat <"vnavgw", int_hexagon_A2_vnavgwr>; // ALU64 / VW / Vector compare words. -def Hexagon_A2_vcmpweq: +def HEXAGON_A2_vcmpweq: qi_ALU64_didi <"vcmpw.eq", int_hexagon_A2_vcmpweq>; -def Hexagon_A2_vcmpwgt: +def HEXAGON_A2_vcmpwgt: qi_ALU64_didi <"vcmpw.gt", int_hexagon_A2_vcmpwgt>; -def Hexagon_A2_vcmpwgtu: +def HEXAGON_A2_vcmpwgtu: qi_ALU64_didi <"vcmpw.gtu",int_hexagon_A2_vcmpwgtu>; // ALU64 / VW / Vector maximum words. -def Hexagon_A2_vmaxw: +def HEXAGON_A2_vmaxw: di_ALU64_didi <"vmaxw", int_hexagon_A2_vmaxw>; -def Hexagon_A2_vmaxuw: +def HEXAGON_A2_vmaxuw: di_ALU64_didi <"vmaxuw", int_hexagon_A2_vmaxuw>; // ALU64 / VW / Vector minimum words. -def Hexagon_A2_vminw: +def HEXAGON_A2_vminw: di_ALU64_didi <"vminw", int_hexagon_A2_vminw>; -def Hexagon_A2_vminuw: +def HEXAGON_A2_vminuw: di_ALU64_didi <"vminuw", int_hexagon_A2_vminuw>; // ALU64 / VW / Vector subtract words. -def Hexagon_A2_vsubw: +def HEXAGON_A2_vsubw: di_ALU64_didi <"vsubw", int_hexagon_A2_vsubw>; -def Hexagon_A2_vsubws: +def HEXAGON_A2_vsubws: di_ALU64_didi_sat <"vsubw", int_hexagon_A2_vsubws>; @@ -2259,25 +2290,25 @@ def Hexagon_A2_vsubws: *********************************************************************/ // CR / Logical reductions on predicates. -def Hexagon_C2_all8: +def HEXAGON_C2_all8: qi_SInst_qi <"all8", int_hexagon_C2_all8>; -def Hexagon_C2_any8: +def HEXAGON_C2_any8: qi_SInst_qi <"any8", int_hexagon_C2_any8>; // CR / Logical operations on predicates. -def Hexagon_C2_pxfer_map: +def HEXAGON_C2_pxfer_map: qi_SInst_qi_pxfer <"", int_hexagon_C2_pxfer_map>; -def Hexagon_C2_and: +def HEXAGON_C2_and: qi_SInst_qiqi <"and", int_hexagon_C2_and>; -def Hexagon_C2_andn: +def HEXAGON_C2_andn: qi_SInst_qiqi_neg <"and", int_hexagon_C2_andn>; -def Hexagon_C2_not: +def HEXAGON_C2_not: qi_SInst_qi <"not", int_hexagon_C2_not>; -def Hexagon_C2_or: +def HEXAGON_C2_or: qi_SInst_qiqi <"or", int_hexagon_C2_or>; -def Hexagon_C2_orn: +def HEXAGON_C2_orn: qi_SInst_qiqi_neg <"or", int_hexagon_C2_orn>; -def Hexagon_C2_xor: +def HEXAGON_C2_xor: qi_SInst_qiqi <"xor", int_hexagon_C2_xor>; @@ -2286,27 +2317,27 @@ def Hexagon_C2_xor: *********************************************************************/ // MTYPE / ALU / Add and accumulate. 
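// In the class names that follow, _acc and _nac denote accumulate and
// negative-accumulate: Rx += op(...) and Rx -= op(...) respectively. A
// two-line C++ sketch of the convention (assumed; illustrative only):
static inline int acc(int rx, int rs, int rt) { return rx + (rs + rt); }
static inline int nac(int rx, int rs, int rt) { return rx - (rs + rt); }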
-def Hexagon_M2_acci: +def HEXAGON_M2_acci: si_MInst_sisisi_acc <"add", int_hexagon_M2_acci>; -def Hexagon_M2_accii: +def HEXAGON_M2_accii: si_MInst_sisis8_acc <"add", int_hexagon_M2_accii>; -def Hexagon_M2_nacci: +def HEXAGON_M2_nacci: si_MInst_sisisi_nac <"add", int_hexagon_M2_nacci>; -def Hexagon_M2_naccii: +def HEXAGON_M2_naccii: si_MInst_sisis8_nac <"add", int_hexagon_M2_naccii>; // MTYPE / ALU / Subtract and accumulate. -def Hexagon_M2_subacc: +def HEXAGON_M2_subacc: si_MInst_sisisi_acc <"sub", int_hexagon_M2_subacc>; // MTYPE / ALU / Vector absolute difference. -def Hexagon_M2_vabsdiffh: +def HEXAGON_M2_vabsdiffh: di_MInst_didi <"vabsdiffh",int_hexagon_M2_vabsdiffh>; -def Hexagon_M2_vabsdiffw: +def HEXAGON_M2_vabsdiffw: di_MInst_didi <"vabsdiffw",int_hexagon_M2_vabsdiffw>; // MTYPE / ALU / XOR and xor with destination. -def Hexagon_M2_xor_xacc: +def HEXAGON_M2_xor_xacc: si_MInst_sisisi_xacc <"xor", int_hexagon_M2_xor_xacc>; @@ -2316,91 +2347,91 @@ def Hexagon_M2_xor_xacc: // MTYPE / COMPLEX / Complex multiply. // Rdd[-+]=cmpy(Rs, Rt:<<1]:sat -def Hexagon_M2_cmpys_s1: +def HEXAGON_M2_cmpys_s1: di_MInst_sisi_s1_sat <"cmpy", int_hexagon_M2_cmpys_s1>; -def Hexagon_M2_cmpys_s0: +def HEXAGON_M2_cmpys_s0: di_MInst_sisi_sat <"cmpy", int_hexagon_M2_cmpys_s0>; -def Hexagon_M2_cmpysc_s1: +def HEXAGON_M2_cmpysc_s1: di_MInst_sisi_s1_sat_conj <"cmpy", int_hexagon_M2_cmpysc_s1>; -def Hexagon_M2_cmpysc_s0: +def HEXAGON_M2_cmpysc_s0: di_MInst_sisi_sat_conj <"cmpy", int_hexagon_M2_cmpysc_s0>; -def Hexagon_M2_cmacs_s1: +def HEXAGON_M2_cmacs_s1: di_MInst_disisi_acc_s1_sat <"cmpy", int_hexagon_M2_cmacs_s1>; -def Hexagon_M2_cmacs_s0: +def HEXAGON_M2_cmacs_s0: di_MInst_disisi_acc_sat <"cmpy", int_hexagon_M2_cmacs_s0>; -def Hexagon_M2_cmacsc_s1: +def HEXAGON_M2_cmacsc_s1: di_MInst_disisi_acc_s1_sat_conj <"cmpy", int_hexagon_M2_cmacsc_s1>; -def Hexagon_M2_cmacsc_s0: +def HEXAGON_M2_cmacsc_s0: di_MInst_disisi_acc_sat_conj <"cmpy", int_hexagon_M2_cmacsc_s0>; -def Hexagon_M2_cnacs_s1: +def HEXAGON_M2_cnacs_s1: di_MInst_disisi_nac_s1_sat <"cmpy", int_hexagon_M2_cnacs_s1>; -def Hexagon_M2_cnacs_s0: +def HEXAGON_M2_cnacs_s0: di_MInst_disisi_nac_sat <"cmpy", int_hexagon_M2_cnacs_s0>; -def Hexagon_M2_cnacsc_s1: +def HEXAGON_M2_cnacsc_s1: di_MInst_disisi_nac_s1_sat_conj <"cmpy", int_hexagon_M2_cnacsc_s1>; -def Hexagon_M2_cnacsc_s0: +def HEXAGON_M2_cnacsc_s0: di_MInst_disisi_nac_sat_conj <"cmpy", int_hexagon_M2_cnacsc_s0>; // MTYPE / COMPLEX / Complex multiply real or imaginary. -def Hexagon_M2_cmpyr_s0: +def HEXAGON_M2_cmpyr_s0: di_MInst_sisi <"cmpyr", int_hexagon_M2_cmpyr_s0>; -def Hexagon_M2_cmacr_s0: +def HEXAGON_M2_cmacr_s0: di_MInst_disisi_acc <"cmpyr", int_hexagon_M2_cmacr_s0>; -def Hexagon_M2_cmpyi_s0: +def HEXAGON_M2_cmpyi_s0: di_MInst_sisi <"cmpyi", int_hexagon_M2_cmpyi_s0>; -def Hexagon_M2_cmaci_s0: +def HEXAGON_M2_cmaci_s0: di_MInst_disisi_acc <"cmpyi", int_hexagon_M2_cmaci_s0>; // MTYPE / COMPLEX / Complex multiply with round and pack. // Rxx32+=cmpy(Rs32,[*]Rt32:<<1]:rnd:sat -def Hexagon_M2_cmpyrs_s0: +def HEXAGON_M2_cmpyrs_s0: si_MInst_sisi_rnd_sat <"cmpy", int_hexagon_M2_cmpyrs_s0>; -def Hexagon_M2_cmpyrs_s1: +def HEXAGON_M2_cmpyrs_s1: si_MInst_sisi_s1_rnd_sat <"cmpy", int_hexagon_M2_cmpyrs_s1>; -def Hexagon_M2_cmpyrsc_s0: +def HEXAGON_M2_cmpyrsc_s0: si_MInst_sisi_rnd_sat_conj <"cmpy", int_hexagon_M2_cmpyrsc_s0>; -def Hexagon_M2_cmpyrsc_s1: +def HEXAGON_M2_cmpyrsc_s1: si_MInst_sisi_s1_rnd_sat_conj <"cmpy", int_hexagon_M2_cmpyrsc_s1>; //MTYPE / COMPLEX / Vector complex multiply real or imaginary. 
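// A hedged sketch of the per-lane complex product behind the vcmpyr/vcmpyi
// defs below: for (a + bi)(c + di), the real part is a*c - b*d and the
// imaginary part a*d + b*c. Layout assumption (illustrative, not from this
// patch): real in bits 15:0, imaginary in bits 31:16 of each 32-bit lane.
#include <cstdint>
static inline int64_t cmpy_real(int32_t rs, int32_t rt) {
  int16_t a = int16_t(rs), b = int16_t(rs >> 16);
  int16_t c = int16_t(rt), d = int16_t(rt >> 16);
  return int64_t(a) * c - int64_t(b) * d;   // real part
}
static inline int64_t cmpy_imag(int32_t rs, int32_t rt) {
  int16_t a = int16_t(rs), b = int16_t(rs >> 16);
  int16_t c = int16_t(rt), d = int16_t(rt >> 16);
  return int64_t(a) * d + int64_t(b) * c;   // imaginary part
}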
-def Hexagon_M2_vcmpy_s0_sat_i: +def HEXAGON_M2_vcmpy_s0_sat_i: di_MInst_didi_sat <"vcmpyi", int_hexagon_M2_vcmpy_s0_sat_i>; -def Hexagon_M2_vcmpy_s1_sat_i: +def HEXAGON_M2_vcmpy_s1_sat_i: di_MInst_didi_s1_sat <"vcmpyi", int_hexagon_M2_vcmpy_s1_sat_i>; -def Hexagon_M2_vcmpy_s0_sat_r: +def HEXAGON_M2_vcmpy_s0_sat_r: di_MInst_didi_sat <"vcmpyr", int_hexagon_M2_vcmpy_s0_sat_r>; -def Hexagon_M2_vcmpy_s1_sat_r: +def HEXAGON_M2_vcmpy_s1_sat_r: di_MInst_didi_s1_sat <"vcmpyr", int_hexagon_M2_vcmpy_s1_sat_r>; -def Hexagon_M2_vcmac_s0_sat_i: +def HEXAGON_M2_vcmac_s0_sat_i: di_MInst_dididi_acc_sat <"vcmpyi", int_hexagon_M2_vcmac_s0_sat_i>; -def Hexagon_M2_vcmac_s0_sat_r: +def HEXAGON_M2_vcmac_s0_sat_r: di_MInst_dididi_acc_sat <"vcmpyr", int_hexagon_M2_vcmac_s0_sat_r>; //MTYPE / COMPLEX / Vector reduce complex multiply real or imaginary. -def Hexagon_M2_vrcmpyi_s0: +def HEXAGON_M2_vrcmpyi_s0: di_MInst_didi <"vrcmpyi", int_hexagon_M2_vrcmpyi_s0>; -def Hexagon_M2_vrcmpyr_s0: +def HEXAGON_M2_vrcmpyr_s0: di_MInst_didi <"vrcmpyr", int_hexagon_M2_vrcmpyr_s0>; -def Hexagon_M2_vrcmpyi_s0c: +def HEXAGON_M2_vrcmpyi_s0c: di_MInst_didi_conj <"vrcmpyi", int_hexagon_M2_vrcmpyi_s0c>; -def Hexagon_M2_vrcmpyr_s0c: +def HEXAGON_M2_vrcmpyr_s0c: di_MInst_didi_conj <"vrcmpyr", int_hexagon_M2_vrcmpyr_s0c>; -def Hexagon_M2_vrcmaci_s0: +def HEXAGON_M2_vrcmaci_s0: di_MInst_dididi_acc <"vrcmpyi", int_hexagon_M2_vrcmaci_s0>; -def Hexagon_M2_vrcmacr_s0: +def HEXAGON_M2_vrcmacr_s0: di_MInst_dididi_acc <"vrcmpyr", int_hexagon_M2_vrcmacr_s0>; -def Hexagon_M2_vrcmaci_s0c: +def HEXAGON_M2_vrcmaci_s0c: di_MInst_dididi_acc_conj <"vrcmpyi", int_hexagon_M2_vrcmaci_s0c>; -def Hexagon_M2_vrcmacr_s0c: +def HEXAGON_M2_vrcmacr_s0c: di_MInst_dididi_acc_conj <"vrcmpyr", int_hexagon_M2_vrcmacr_s0c>; @@ -2409,115 +2440,120 @@ def Hexagon_M2_vrcmacr_s0c: *********************************************************************/ // MTYPE / MPYH / Multiply and use lower result. -//def Hexagon_M2_mpysmi: +//FIXME: Hexagon_M2_mpysmi should really be of the type si_MInst_sim9, +// not si_MInst_sis9 - but for now, we will use s9. +// def Hexagon_M2_mpysmi: // si_MInst_sim9 <"mpyi", int_hexagon_M2_mpysmi>; -def Hexagon_M2_mpyi: +def Hexagon_M2_mpysmi: + si_MInst_sis9 <"mpyi", int_hexagon_M2_mpysmi>; +def HEXAGON_M2_mpyi: si_MInst_sisi <"mpyi", int_hexagon_M2_mpyi>; -def Hexagon_M2_mpyui: +def HEXAGON_M2_mpyui: si_MInst_sisi <"mpyui", int_hexagon_M2_mpyui>; -def Hexagon_M2_macsip: +def HEXAGON_M2_macsip: si_MInst_sisiu8_acc <"mpyi", int_hexagon_M2_macsip>; -def Hexagon_M2_maci: +def HEXAGON_M2_maci: si_MInst_sisisi_acc <"mpyi", int_hexagon_M2_maci>; -def Hexagon_M2_macsin: +def HEXAGON_M2_macsin: si_MInst_sisiu8_nac <"mpyi", int_hexagon_M2_macsin>; // MTYPE / MPYH / Multiply word by half (32x16).
//Rdd[+]=vmpywoh(Rss,Rtt)[:<<1][:rnd][:sat] //Rdd[+]=vmpyweh(Rss,Rtt)[:<<1][:rnd][:sat] -def Hexagon_M2_mmpyl_rs1: +def HEXAGON_M2_mmpyl_rs1: di_MInst_didi_s1_rnd_sat <"vmpyweh", int_hexagon_M2_mmpyl_rs1>; -def Hexagon_M2_mmpyl_s1: +def HEXAGON_M2_mmpyl_s1: di_MInst_didi_s1_sat <"vmpyweh", int_hexagon_M2_mmpyl_s1>; -def Hexagon_M2_mmpyl_rs0: +def HEXAGON_M2_mmpyl_rs0: di_MInst_didi_rnd_sat <"vmpyweh", int_hexagon_M2_mmpyl_rs0>; -def Hexagon_M2_mmpyl_s0: +def HEXAGON_M2_mmpyl_s0: di_MInst_didi_sat <"vmpyweh", int_hexagon_M2_mmpyl_s0>; -def Hexagon_M2_mmpyh_rs1: +def HEXAGON_M2_mmpyh_rs1: di_MInst_didi_s1_rnd_sat <"vmpywoh", int_hexagon_M2_mmpyh_rs1>; -def Hexagon_M2_mmpyh_s1: +def HEXAGON_M2_mmpyh_s1: di_MInst_didi_s1_sat <"vmpywoh", int_hexagon_M2_mmpyh_s1>; -def Hexagon_M2_mmpyh_rs0: +def HEXAGON_M2_mmpyh_rs0: di_MInst_didi_rnd_sat <"vmpywoh", int_hexagon_M2_mmpyh_rs0>; -def Hexagon_M2_mmpyh_s0: +def HEXAGON_M2_mmpyh_s0: di_MInst_didi_sat <"vmpywoh", int_hexagon_M2_mmpyh_s0>; -def Hexagon_M2_mmacls_rs1: +def HEXAGON_M2_mmacls_rs1: di_MInst_dididi_acc_s1_rnd_sat <"vmpyweh", int_hexagon_M2_mmacls_rs1>; -def Hexagon_M2_mmacls_s1: +def HEXAGON_M2_mmacls_s1: di_MInst_dididi_acc_s1_sat <"vmpyweh", int_hexagon_M2_mmacls_s1>; -def Hexagon_M2_mmacls_rs0: +def HEXAGON_M2_mmacls_rs0: di_MInst_dididi_acc_rnd_sat <"vmpyweh", int_hexagon_M2_mmacls_rs0>; -def Hexagon_M2_mmacls_s0: +def HEXAGON_M2_mmacls_s0: di_MInst_dididi_acc_sat <"vmpyweh", int_hexagon_M2_mmacls_s0>; -def Hexagon_M2_mmachs_rs1: +def HEXAGON_M2_mmachs_rs1: di_MInst_dididi_acc_s1_rnd_sat <"vmpywoh", int_hexagon_M2_mmachs_rs1>; -def Hexagon_M2_mmachs_s1: +def HEXAGON_M2_mmachs_s1: di_MInst_dididi_acc_s1_sat <"vmpywoh", int_hexagon_M2_mmachs_s1>; -def Hexagon_M2_mmachs_rs0: +def HEXAGON_M2_mmachs_rs0: di_MInst_dididi_acc_rnd_sat <"vmpywoh", int_hexagon_M2_mmachs_rs0>; -def Hexagon_M2_mmachs_s0: +def HEXAGON_M2_mmachs_s0: di_MInst_dididi_acc_sat <"vmpywoh", int_hexagon_M2_mmachs_s0>; // MTYPE / MPYH / Multiply word by unsigned half (32x16). 
//Rdd[+]=vmpywouh(Rss,Rtt)[:<<1][:rnd][:sat] //Rdd[+]=vmpyweuh(Rss,Rtt)[:<<1][:rnd][:sat] -def Hexagon_M2_mmpyul_rs1: +def HEXAGON_M2_mmpyul_rs1: di_MInst_didi_s1_rnd_sat <"vmpyweuh", int_hexagon_M2_mmpyul_rs1>; -def Hexagon_M2_mmpyul_s1: +def HEXAGON_M2_mmpyul_s1: di_MInst_didi_s1_sat <"vmpyweuh", int_hexagon_M2_mmpyul_s1>; -def Hexagon_M2_mmpyul_rs0: +def HEXAGON_M2_mmpyul_rs0: di_MInst_didi_rnd_sat <"vmpyweuh", int_hexagon_M2_mmpyul_rs0>; -def Hexagon_M2_mmpyul_s0: +def HEXAGON_M2_mmpyul_s0: di_MInst_didi_sat <"vmpyweuh", int_hexagon_M2_mmpyul_s0>; -def Hexagon_M2_mmpyuh_rs1: +def HEXAGON_M2_mmpyuh_rs1: di_MInst_didi_s1_rnd_sat <"vmpywouh", int_hexagon_M2_mmpyuh_rs1>; -def Hexagon_M2_mmpyuh_s1: +def HEXAGON_M2_mmpyuh_s1: di_MInst_didi_s1_sat <"vmpywouh", int_hexagon_M2_mmpyuh_s1>; -def Hexagon_M2_mmpyuh_rs0: +def HEXAGON_M2_mmpyuh_rs0: di_MInst_didi_rnd_sat <"vmpywouh", int_hexagon_M2_mmpyuh_rs0>; -def Hexagon_M2_mmpyuh_s0: +def HEXAGON_M2_mmpyuh_s0: di_MInst_didi_sat <"vmpywouh", int_hexagon_M2_mmpyuh_s0>; -def Hexagon_M2_mmaculs_rs1: +def HEXAGON_M2_mmaculs_rs1: di_MInst_dididi_acc_s1_rnd_sat <"vmpyweuh", int_hexagon_M2_mmaculs_rs1>; -def Hexagon_M2_mmaculs_s1: +def HEXAGON_M2_mmaculs_s1: di_MInst_dididi_acc_s1_sat <"vmpyweuh", int_hexagon_M2_mmaculs_s1>; -def Hexagon_M2_mmaculs_rs0: +def HEXAGON_M2_mmaculs_rs0: di_MInst_dididi_acc_rnd_sat <"vmpyweuh", int_hexagon_M2_mmaculs_rs0>; -def Hexagon_M2_mmaculs_s0: +def HEXAGON_M2_mmaculs_s0: di_MInst_dididi_acc_sat <"vmpyweuh", int_hexagon_M2_mmaculs_s0>; -def Hexagon_M2_mmacuhs_rs1: +def HEXAGON_M2_mmacuhs_rs1: di_MInst_dididi_acc_s1_rnd_sat <"vmpywouh", int_hexagon_M2_mmacuhs_rs1>; -def Hexagon_M2_mmacuhs_s1: +def HEXAGON_M2_mmacuhs_s1: di_MInst_dididi_acc_s1_sat <"vmpywouh", int_hexagon_M2_mmacuhs_s1>; -def Hexagon_M2_mmacuhs_rs0: +def HEXAGON_M2_mmacuhs_rs0: di_MInst_dididi_acc_rnd_sat <"vmpywouh", int_hexagon_M2_mmacuhs_rs0>; -def Hexagon_M2_mmacuhs_s0: +def HEXAGON_M2_mmacuhs_s0: di_MInst_dididi_acc_sat <"vmpywouh", int_hexagon_M2_mmacuhs_s0>; // MTYPE / MPYH / Multiply and use upper result. -def Hexagon_M2_hmmpyh_rs1: +def HEXAGON_M2_hmmpyh_rs1: si_MInst_sisi_h_s1_rnd_sat <"mpy", int_hexagon_M2_hmmpyh_rs1>; -def Hexagon_M2_hmmpyl_rs1: +def HEXAGON_M2_hmmpyl_rs1: si_MInst_sisi_l_s1_rnd_sat <"mpy", int_hexagon_M2_hmmpyl_rs1>; -def Hexagon_M2_mpy_up: +def HEXAGON_M2_mpy_up: si_MInst_sisi <"mpy", int_hexagon_M2_mpy_up>; -def Hexagon_M2_dpmpyss_rnd_s0: +def HEXAGON_M2_dpmpyss_rnd_s0: si_MInst_sisi_rnd <"mpy", int_hexagon_M2_dpmpyss_rnd_s0>; -def Hexagon_M2_mpyu_up: +def HEXAGON_M2_mpyu_up: si_MInst_sisi <"mpyu", int_hexagon_M2_mpyu_up>; // MTYPE / MPYH / Multiply and use full result. -def Hexagon_M2_dpmpyuu_s0: +def HEXAGON_M2_dpmpyuu_s0: di_MInst_sisi <"mpyu", int_hexagon_M2_dpmpyuu_s0>; -def Hexagon_M2_dpmpyuu_acc_s0: +def HEXAGON_M2_dpmpyuu_acc_s0: di_MInst_disisi_acc <"mpyu", int_hexagon_M2_dpmpyuu_acc_s0>; -def Hexagon_M2_dpmpyuu_nac_s0: +def HEXAGON_M2_dpmpyuu_nac_s0: di_MInst_disisi_nac <"mpyu", int_hexagon_M2_dpmpyuu_nac_s0>; -def Hexagon_M2_dpmpyss_s0: +def HEXAGON_M2_dpmpyss_s0: di_MInst_sisi <"mpy", int_hexagon_M2_dpmpyss_s0>; -def Hexagon_M2_dpmpyss_acc_s0: +def HEXAGON_M2_dpmpyss_acc_s0: di_MInst_disisi_acc <"mpy", int_hexagon_M2_dpmpyss_acc_s0>; -def Hexagon_M2_dpmpyss_nac_s0: +def HEXAGON_M2_dpmpyss_nac_s0: di_MInst_disisi_nac <"mpy", int_hexagon_M2_dpmpyss_nac_s0>; @@ -2528,334 +2564,334 @@ def Hexagon_M2_dpmpyss_nac_s0: // MTYPE / MPYS / Scalar 16x16 multiply signed. 
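// The hh/hl/lh/ll suffixes in the defs below pick a signed 16-bit half of
// each source ("h" = bits 31:16, "l" = bits 15:0); :<<1 shifts the product
// left by one, :rnd rounds it, and :sat clamps to the signed 32-bit range.
// A sketch of the base operation (assumed mapping; illustrative, not from
// this patch):
#include <cstdint>
static inline int16_t half(int32_t r, bool hi) {
  return int16_t(hi ? (r >> 16) : r);
}
static inline int32_t mpy(int32_t rs, bool rs_hi, int32_t rt, bool rt_hi) {
  return int32_t(half(rs, rs_hi)) * int32_t(half(rt, rt_hi)); // 16x16 -> 32
}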
//Rd=mpy(Rs.[H|L],Rt.[H|L:<<0|:<<1]| // [:<<0[:rnd|:sat|:rnd:sat]|:<<1[:rnd|:sat|:rnd:sat]]] -def Hexagon_M2_mpy_hh_s0: +def HEXAGON_M2_mpy_hh_s0: si_MInst_sisi_hh <"mpy", int_hexagon_M2_mpy_hh_s0>; -def Hexagon_M2_mpy_hh_s1: +def HEXAGON_M2_mpy_hh_s1: si_MInst_sisi_hh_s1 <"mpy", int_hexagon_M2_mpy_hh_s1>; -def Hexagon_M2_mpy_rnd_hh_s1: +def HEXAGON_M2_mpy_rnd_hh_s1: si_MInst_sisi_rnd_hh_s1 <"mpy", int_hexagon_M2_mpy_rnd_hh_s1>; -def Hexagon_M2_mpy_sat_rnd_hh_s1: +def HEXAGON_M2_mpy_sat_rnd_hh_s1: si_MInst_sisi_sat_rnd_hh_s1 <"mpy", int_hexagon_M2_mpy_sat_rnd_hh_s1>; -def Hexagon_M2_mpy_sat_hh_s1: +def HEXAGON_M2_mpy_sat_hh_s1: si_MInst_sisi_sat_hh_s1 <"mpy", int_hexagon_M2_mpy_sat_hh_s1>; -def Hexagon_M2_mpy_rnd_hh_s0: +def HEXAGON_M2_mpy_rnd_hh_s0: si_MInst_sisi_rnd_hh <"mpy", int_hexagon_M2_mpy_rnd_hh_s0>; -def Hexagon_M2_mpy_sat_rnd_hh_s0: +def HEXAGON_M2_mpy_sat_rnd_hh_s0: si_MInst_sisi_sat_rnd_hh <"mpy", int_hexagon_M2_mpy_sat_rnd_hh_s0>; -def Hexagon_M2_mpy_sat_hh_s0: +def HEXAGON_M2_mpy_sat_hh_s0: si_MInst_sisi_sat_hh <"mpy", int_hexagon_M2_mpy_sat_hh_s0>; -def Hexagon_M2_mpy_hl_s0: +def HEXAGON_M2_mpy_hl_s0: si_MInst_sisi_hl <"mpy", int_hexagon_M2_mpy_hl_s0>; -def Hexagon_M2_mpy_hl_s1: +def HEXAGON_M2_mpy_hl_s1: si_MInst_sisi_hl_s1 <"mpy", int_hexagon_M2_mpy_hl_s1>; -def Hexagon_M2_mpy_rnd_hl_s1: +def HEXAGON_M2_mpy_rnd_hl_s1: si_MInst_sisi_rnd_hl_s1 <"mpy", int_hexagon_M2_mpy_rnd_hl_s1>; -def Hexagon_M2_mpy_sat_rnd_hl_s1: +def HEXAGON_M2_mpy_sat_rnd_hl_s1: si_MInst_sisi_sat_rnd_hl_s1 <"mpy", int_hexagon_M2_mpy_sat_rnd_hl_s1>; -def Hexagon_M2_mpy_sat_hl_s1: +def HEXAGON_M2_mpy_sat_hl_s1: si_MInst_sisi_sat_hl_s1 <"mpy", int_hexagon_M2_mpy_sat_hl_s1>; -def Hexagon_M2_mpy_rnd_hl_s0: +def HEXAGON_M2_mpy_rnd_hl_s0: si_MInst_sisi_rnd_hl <"mpy", int_hexagon_M2_mpy_rnd_hl_s0>; -def Hexagon_M2_mpy_sat_rnd_hl_s0: +def HEXAGON_M2_mpy_sat_rnd_hl_s0: si_MInst_sisi_sat_rnd_hl <"mpy", int_hexagon_M2_mpy_sat_rnd_hl_s0>; -def Hexagon_M2_mpy_sat_hl_s0: +def HEXAGON_M2_mpy_sat_hl_s0: si_MInst_sisi_sat_hl <"mpy", int_hexagon_M2_mpy_sat_hl_s0>; -def Hexagon_M2_mpy_lh_s0: +def HEXAGON_M2_mpy_lh_s0: si_MInst_sisi_lh <"mpy", int_hexagon_M2_mpy_lh_s0>; -def Hexagon_M2_mpy_lh_s1: +def HEXAGON_M2_mpy_lh_s1: si_MInst_sisi_lh_s1 <"mpy", int_hexagon_M2_mpy_lh_s1>; -def Hexagon_M2_mpy_rnd_lh_s1: +def HEXAGON_M2_mpy_rnd_lh_s1: si_MInst_sisi_rnd_lh_s1 <"mpy", int_hexagon_M2_mpy_rnd_lh_s1>; -def Hexagon_M2_mpy_sat_rnd_lh_s1: +def HEXAGON_M2_mpy_sat_rnd_lh_s1: si_MInst_sisi_sat_rnd_lh_s1 <"mpy", int_hexagon_M2_mpy_sat_rnd_lh_s1>; -def Hexagon_M2_mpy_sat_lh_s1: +def HEXAGON_M2_mpy_sat_lh_s1: si_MInst_sisi_sat_lh_s1 <"mpy", int_hexagon_M2_mpy_sat_lh_s1>; -def Hexagon_M2_mpy_rnd_lh_s0: +def HEXAGON_M2_mpy_rnd_lh_s0: si_MInst_sisi_rnd_lh <"mpy", int_hexagon_M2_mpy_rnd_lh_s0>; -def Hexagon_M2_mpy_sat_rnd_lh_s0: +def HEXAGON_M2_mpy_sat_rnd_lh_s0: si_MInst_sisi_sat_rnd_lh <"mpy", int_hexagon_M2_mpy_sat_rnd_lh_s0>; -def Hexagon_M2_mpy_sat_lh_s0: +def HEXAGON_M2_mpy_sat_lh_s0: si_MInst_sisi_sat_lh <"mpy", int_hexagon_M2_mpy_sat_lh_s0>; -def Hexagon_M2_mpy_ll_s0: +def HEXAGON_M2_mpy_ll_s0: si_MInst_sisi_ll <"mpy", int_hexagon_M2_mpy_ll_s0>; -def Hexagon_M2_mpy_ll_s1: +def HEXAGON_M2_mpy_ll_s1: si_MInst_sisi_ll_s1 <"mpy", int_hexagon_M2_mpy_ll_s1>; -def Hexagon_M2_mpy_rnd_ll_s1: +def HEXAGON_M2_mpy_rnd_ll_s1: si_MInst_sisi_rnd_ll_s1 <"mpy", int_hexagon_M2_mpy_rnd_ll_s1>; -def Hexagon_M2_mpy_sat_rnd_ll_s1: +def HEXAGON_M2_mpy_sat_rnd_ll_s1: si_MInst_sisi_sat_rnd_ll_s1 <"mpy", int_hexagon_M2_mpy_sat_rnd_ll_s1>; -def 
Hexagon_M2_mpy_sat_ll_s1: +def HEXAGON_M2_mpy_sat_ll_s1: si_MInst_sisi_sat_ll_s1 <"mpy", int_hexagon_M2_mpy_sat_ll_s1>; -def Hexagon_M2_mpy_rnd_ll_s0: +def HEXAGON_M2_mpy_rnd_ll_s0: si_MInst_sisi_rnd_ll <"mpy", int_hexagon_M2_mpy_rnd_ll_s0>; -def Hexagon_M2_mpy_sat_rnd_ll_s0: +def HEXAGON_M2_mpy_sat_rnd_ll_s0: si_MInst_sisi_sat_rnd_ll <"mpy", int_hexagon_M2_mpy_sat_rnd_ll_s0>; -def Hexagon_M2_mpy_sat_ll_s0: +def HEXAGON_M2_mpy_sat_ll_s0: si_MInst_sisi_sat_ll <"mpy", int_hexagon_M2_mpy_sat_ll_s0>; //Rdd=mpy(Rs.[H|L],Rt.[H|L])[[:<<0|:<<1]|[:<<0:rnd|:<<1:rnd]] -def Hexagon_M2_mpyd_hh_s0: +def HEXAGON_M2_mpyd_hh_s0: di_MInst_sisi_hh <"mpy", int_hexagon_M2_mpyd_hh_s0>; -def Hexagon_M2_mpyd_hh_s1: +def HEXAGON_M2_mpyd_hh_s1: di_MInst_sisi_hh_s1 <"mpy", int_hexagon_M2_mpyd_hh_s1>; -def Hexagon_M2_mpyd_rnd_hh_s1: +def HEXAGON_M2_mpyd_rnd_hh_s1: di_MInst_sisi_rnd_hh_s1 <"mpy", int_hexagon_M2_mpyd_rnd_hh_s1>; -def Hexagon_M2_mpyd_rnd_hh_s0: +def HEXAGON_M2_mpyd_rnd_hh_s0: di_MInst_sisi_rnd_hh <"mpy", int_hexagon_M2_mpyd_rnd_hh_s0>; -def Hexagon_M2_mpyd_hl_s0: +def HEXAGON_M2_mpyd_hl_s0: di_MInst_sisi_hl <"mpy", int_hexagon_M2_mpyd_hl_s0>; -def Hexagon_M2_mpyd_hl_s1: +def HEXAGON_M2_mpyd_hl_s1: di_MInst_sisi_hl_s1 <"mpy", int_hexagon_M2_mpyd_hl_s1>; -def Hexagon_M2_mpyd_rnd_hl_s1: +def HEXAGON_M2_mpyd_rnd_hl_s1: di_MInst_sisi_rnd_hl_s1 <"mpy", int_hexagon_M2_mpyd_rnd_hl_s1>; -def Hexagon_M2_mpyd_rnd_hl_s0: +def HEXAGON_M2_mpyd_rnd_hl_s0: di_MInst_sisi_rnd_hl <"mpy", int_hexagon_M2_mpyd_rnd_hl_s0>; -def Hexagon_M2_mpyd_lh_s0: +def HEXAGON_M2_mpyd_lh_s0: di_MInst_sisi_lh <"mpy", int_hexagon_M2_mpyd_lh_s0>; -def Hexagon_M2_mpyd_lh_s1: +def HEXAGON_M2_mpyd_lh_s1: di_MInst_sisi_lh_s1 <"mpy", int_hexagon_M2_mpyd_lh_s1>; -def Hexagon_M2_mpyd_rnd_lh_s1: +def HEXAGON_M2_mpyd_rnd_lh_s1: di_MInst_sisi_rnd_lh_s1 <"mpy", int_hexagon_M2_mpyd_rnd_lh_s1>; -def Hexagon_M2_mpyd_rnd_lh_s0: +def HEXAGON_M2_mpyd_rnd_lh_s0: di_MInst_sisi_rnd_lh <"mpy", int_hexagon_M2_mpyd_rnd_lh_s0>; -def Hexagon_M2_mpyd_ll_s0: +def HEXAGON_M2_mpyd_ll_s0: di_MInst_sisi_ll <"mpy", int_hexagon_M2_mpyd_ll_s0>; -def Hexagon_M2_mpyd_ll_s1: +def HEXAGON_M2_mpyd_ll_s1: di_MInst_sisi_ll_s1 <"mpy", int_hexagon_M2_mpyd_ll_s1>; -def Hexagon_M2_mpyd_rnd_ll_s1: +def HEXAGON_M2_mpyd_rnd_ll_s1: di_MInst_sisi_rnd_ll_s1 <"mpy", int_hexagon_M2_mpyd_rnd_ll_s1>; -def Hexagon_M2_mpyd_rnd_ll_s0: +def HEXAGON_M2_mpyd_rnd_ll_s0: di_MInst_sisi_rnd_ll <"mpy", int_hexagon_M2_mpyd_rnd_ll_s0>; //Rx+=mpy(Rs.[H|L],Rt.[H|L])[[[:<<0|:<<1]|[:<<0:sat|:<<1:sat]] -def Hexagon_M2_mpy_acc_hh_s0: +def HEXAGON_M2_mpy_acc_hh_s0: si_MInst_sisisi_acc_hh <"mpy", int_hexagon_M2_mpy_acc_hh_s0>; -def Hexagon_M2_mpy_acc_hh_s1: +def HEXAGON_M2_mpy_acc_hh_s1: si_MInst_sisisi_acc_hh_s1 <"mpy", int_hexagon_M2_mpy_acc_hh_s1>; -def Hexagon_M2_mpy_acc_sat_hh_s1: +def HEXAGON_M2_mpy_acc_sat_hh_s1: si_MInst_sisisi_acc_sat_hh_s1 <"mpy", int_hexagon_M2_mpy_acc_sat_hh_s1>; -def Hexagon_M2_mpy_acc_sat_hh_s0: +def HEXAGON_M2_mpy_acc_sat_hh_s0: si_MInst_sisisi_acc_sat_hh <"mpy", int_hexagon_M2_mpy_acc_sat_hh_s0>; -def Hexagon_M2_mpy_acc_hl_s0: +def HEXAGON_M2_mpy_acc_hl_s0: si_MInst_sisisi_acc_hl <"mpy", int_hexagon_M2_mpy_acc_hl_s0>; -def Hexagon_M2_mpy_acc_hl_s1: +def HEXAGON_M2_mpy_acc_hl_s1: si_MInst_sisisi_acc_hl_s1 <"mpy", int_hexagon_M2_mpy_acc_hl_s1>; -def Hexagon_M2_mpy_acc_sat_hl_s1: +def HEXAGON_M2_mpy_acc_sat_hl_s1: si_MInst_sisisi_acc_sat_hl_s1 <"mpy", int_hexagon_M2_mpy_acc_sat_hl_s1>; -def Hexagon_M2_mpy_acc_sat_hl_s0: +def HEXAGON_M2_mpy_acc_sat_hl_s0: si_MInst_sisisi_acc_sat_hl 
<"mpy", int_hexagon_M2_mpy_acc_sat_hl_s0>; -def Hexagon_M2_mpy_acc_lh_s0: +def HEXAGON_M2_mpy_acc_lh_s0: si_MInst_sisisi_acc_lh <"mpy", int_hexagon_M2_mpy_acc_lh_s0>; -def Hexagon_M2_mpy_acc_lh_s1: +def HEXAGON_M2_mpy_acc_lh_s1: si_MInst_sisisi_acc_lh_s1 <"mpy", int_hexagon_M2_mpy_acc_lh_s1>; -def Hexagon_M2_mpy_acc_sat_lh_s1: +def HEXAGON_M2_mpy_acc_sat_lh_s1: si_MInst_sisisi_acc_sat_lh_s1 <"mpy", int_hexagon_M2_mpy_acc_sat_lh_s1>; -def Hexagon_M2_mpy_acc_sat_lh_s0: +def HEXAGON_M2_mpy_acc_sat_lh_s0: si_MInst_sisisi_acc_sat_lh <"mpy", int_hexagon_M2_mpy_acc_sat_lh_s0>; -def Hexagon_M2_mpy_acc_ll_s0: +def HEXAGON_M2_mpy_acc_ll_s0: si_MInst_sisisi_acc_ll <"mpy", int_hexagon_M2_mpy_acc_ll_s0>; -def Hexagon_M2_mpy_acc_ll_s1: +def HEXAGON_M2_mpy_acc_ll_s1: si_MInst_sisisi_acc_ll_s1 <"mpy", int_hexagon_M2_mpy_acc_ll_s1>; -def Hexagon_M2_mpy_acc_sat_ll_s1: +def HEXAGON_M2_mpy_acc_sat_ll_s1: si_MInst_sisisi_acc_sat_ll_s1 <"mpy", int_hexagon_M2_mpy_acc_sat_ll_s1>; -def Hexagon_M2_mpy_acc_sat_ll_s0: +def HEXAGON_M2_mpy_acc_sat_ll_s0: si_MInst_sisisi_acc_sat_ll <"mpy", int_hexagon_M2_mpy_acc_sat_ll_s0>; //Rx-=mpy(Rs.[H|L],Rt.[H|L])[[[:<<0|:<<1]|[:<<0:sat|:<<1:sat]] -def Hexagon_M2_mpy_nac_hh_s0: +def HEXAGON_M2_mpy_nac_hh_s0: si_MInst_sisisi_nac_hh <"mpy", int_hexagon_M2_mpy_nac_hh_s0>; -def Hexagon_M2_mpy_nac_hh_s1: +def HEXAGON_M2_mpy_nac_hh_s1: si_MInst_sisisi_nac_hh_s1 <"mpy", int_hexagon_M2_mpy_nac_hh_s1>; -def Hexagon_M2_mpy_nac_sat_hh_s1: +def HEXAGON_M2_mpy_nac_sat_hh_s1: si_MInst_sisisi_nac_sat_hh_s1 <"mpy", int_hexagon_M2_mpy_nac_sat_hh_s1>; -def Hexagon_M2_mpy_nac_sat_hh_s0: +def HEXAGON_M2_mpy_nac_sat_hh_s0: si_MInst_sisisi_nac_sat_hh <"mpy", int_hexagon_M2_mpy_nac_sat_hh_s0>; -def Hexagon_M2_mpy_nac_hl_s0: +def HEXAGON_M2_mpy_nac_hl_s0: si_MInst_sisisi_nac_hl <"mpy", int_hexagon_M2_mpy_nac_hl_s0>; -def Hexagon_M2_mpy_nac_hl_s1: +def HEXAGON_M2_mpy_nac_hl_s1: si_MInst_sisisi_nac_hl_s1 <"mpy", int_hexagon_M2_mpy_nac_hl_s1>; -def Hexagon_M2_mpy_nac_sat_hl_s1: +def HEXAGON_M2_mpy_nac_sat_hl_s1: si_MInst_sisisi_nac_sat_hl_s1 <"mpy", int_hexagon_M2_mpy_nac_sat_hl_s1>; -def Hexagon_M2_mpy_nac_sat_hl_s0: +def HEXAGON_M2_mpy_nac_sat_hl_s0: si_MInst_sisisi_nac_sat_hl <"mpy", int_hexagon_M2_mpy_nac_sat_hl_s0>; -def Hexagon_M2_mpy_nac_lh_s0: +def HEXAGON_M2_mpy_nac_lh_s0: si_MInst_sisisi_nac_lh <"mpy", int_hexagon_M2_mpy_nac_lh_s0>; -def Hexagon_M2_mpy_nac_lh_s1: +def HEXAGON_M2_mpy_nac_lh_s1: si_MInst_sisisi_nac_lh_s1 <"mpy", int_hexagon_M2_mpy_nac_lh_s1>; -def Hexagon_M2_mpy_nac_sat_lh_s1: +def HEXAGON_M2_mpy_nac_sat_lh_s1: si_MInst_sisisi_nac_sat_lh_s1 <"mpy", int_hexagon_M2_mpy_nac_sat_lh_s1>; -def Hexagon_M2_mpy_nac_sat_lh_s0: +def HEXAGON_M2_mpy_nac_sat_lh_s0: si_MInst_sisisi_nac_sat_lh <"mpy", int_hexagon_M2_mpy_nac_sat_lh_s0>; -def Hexagon_M2_mpy_nac_ll_s0: +def HEXAGON_M2_mpy_nac_ll_s0: si_MInst_sisisi_nac_ll <"mpy", int_hexagon_M2_mpy_nac_ll_s0>; -def Hexagon_M2_mpy_nac_ll_s1: +def HEXAGON_M2_mpy_nac_ll_s1: si_MInst_sisisi_nac_ll_s1 <"mpy", int_hexagon_M2_mpy_nac_ll_s1>; -def Hexagon_M2_mpy_nac_sat_ll_s1: +def HEXAGON_M2_mpy_nac_sat_ll_s1: si_MInst_sisisi_nac_sat_ll_s1 <"mpy", int_hexagon_M2_mpy_nac_sat_ll_s1>; -def Hexagon_M2_mpy_nac_sat_ll_s0: +def HEXAGON_M2_mpy_nac_sat_ll_s0: si_MInst_sisisi_nac_sat_ll <"mpy", int_hexagon_M2_mpy_nac_sat_ll_s0>; //Rx+=mpy(Rs.[H|L],Rt.[H|L:<<0|:<<1] -def Hexagon_M2_mpyd_acc_hh_s0: +def HEXAGON_M2_mpyd_acc_hh_s0: di_MInst_disisi_acc_hh <"mpy", int_hexagon_M2_mpyd_acc_hh_s0>; -def Hexagon_M2_mpyd_acc_hh_s1: +def HEXAGON_M2_mpyd_acc_hh_s1: 
di_MInst_disisi_acc_hh_s1 <"mpy", int_hexagon_M2_mpyd_acc_hh_s1>; -def Hexagon_M2_mpyd_acc_hl_s0: +def HEXAGON_M2_mpyd_acc_hl_s0: di_MInst_disisi_acc_hl <"mpy", int_hexagon_M2_mpyd_acc_hl_s0>; -def Hexagon_M2_mpyd_acc_hl_s1: +def HEXAGON_M2_mpyd_acc_hl_s1: di_MInst_disisi_acc_hl_s1 <"mpy", int_hexagon_M2_mpyd_acc_hl_s1>; -def Hexagon_M2_mpyd_acc_lh_s0: +def HEXAGON_M2_mpyd_acc_lh_s0: di_MInst_disisi_acc_lh <"mpy", int_hexagon_M2_mpyd_acc_lh_s0>; -def Hexagon_M2_mpyd_acc_lh_s1: +def HEXAGON_M2_mpyd_acc_lh_s1: di_MInst_disisi_acc_lh_s1 <"mpy", int_hexagon_M2_mpyd_acc_lh_s1>; -def Hexagon_M2_mpyd_acc_ll_s0: +def HEXAGON_M2_mpyd_acc_ll_s0: di_MInst_disisi_acc_ll <"mpy", int_hexagon_M2_mpyd_acc_ll_s0>; -def Hexagon_M2_mpyd_acc_ll_s1: +def HEXAGON_M2_mpyd_acc_ll_s1: di_MInst_disisi_acc_ll_s1 <"mpy", int_hexagon_M2_mpyd_acc_ll_s1>; //Rx-=mpy(Rs.[H|L],Rt.[H|L:<<0|:<<1] -def Hexagon_M2_mpyd_nac_hh_s0: +def HEXAGON_M2_mpyd_nac_hh_s0: di_MInst_disisi_nac_hh <"mpy", int_hexagon_M2_mpyd_nac_hh_s0>; -def Hexagon_M2_mpyd_nac_hh_s1: +def HEXAGON_M2_mpyd_nac_hh_s1: di_MInst_disisi_nac_hh_s1 <"mpy", int_hexagon_M2_mpyd_nac_hh_s1>; -def Hexagon_M2_mpyd_nac_hl_s0: +def HEXAGON_M2_mpyd_nac_hl_s0: di_MInst_disisi_nac_hl <"mpy", int_hexagon_M2_mpyd_nac_hl_s0>; -def Hexagon_M2_mpyd_nac_hl_s1: +def HEXAGON_M2_mpyd_nac_hl_s1: di_MInst_disisi_nac_hl_s1 <"mpy", int_hexagon_M2_mpyd_nac_hl_s1>; -def Hexagon_M2_mpyd_nac_lh_s0: +def HEXAGON_M2_mpyd_nac_lh_s0: di_MInst_disisi_nac_lh <"mpy", int_hexagon_M2_mpyd_nac_lh_s0>; -def Hexagon_M2_mpyd_nac_lh_s1: +def HEXAGON_M2_mpyd_nac_lh_s1: di_MInst_disisi_nac_lh_s1 <"mpy", int_hexagon_M2_mpyd_nac_lh_s1>; -def Hexagon_M2_mpyd_nac_ll_s0: +def HEXAGON_M2_mpyd_nac_ll_s0: di_MInst_disisi_nac_ll <"mpy", int_hexagon_M2_mpyd_nac_ll_s0>; -def Hexagon_M2_mpyd_nac_ll_s1: +def HEXAGON_M2_mpyd_nac_ll_s1: di_MInst_disisi_nac_ll_s1 <"mpy", int_hexagon_M2_mpyd_nac_ll_s1>; // MTYPE / MPYS / Scalar 16x16 multiply unsigned. 
//Rd=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1] -def Hexagon_M2_mpyu_hh_s0: +def HEXAGON_M2_mpyu_hh_s0: si_MInst_sisi_hh <"mpyu", int_hexagon_M2_mpyu_hh_s0>; -def Hexagon_M2_mpyu_hh_s1: +def HEXAGON_M2_mpyu_hh_s1: si_MInst_sisi_hh_s1 <"mpyu", int_hexagon_M2_mpyu_hh_s1>; -def Hexagon_M2_mpyu_hl_s0: +def HEXAGON_M2_mpyu_hl_s0: si_MInst_sisi_hl <"mpyu", int_hexagon_M2_mpyu_hl_s0>; -def Hexagon_M2_mpyu_hl_s1: +def HEXAGON_M2_mpyu_hl_s1: si_MInst_sisi_hl_s1 <"mpyu", int_hexagon_M2_mpyu_hl_s1>; -def Hexagon_M2_mpyu_lh_s0: +def HEXAGON_M2_mpyu_lh_s0: si_MInst_sisi_lh <"mpyu", int_hexagon_M2_mpyu_lh_s0>; -def Hexagon_M2_mpyu_lh_s1: +def HEXAGON_M2_mpyu_lh_s1: si_MInst_sisi_lh_s1 <"mpyu", int_hexagon_M2_mpyu_lh_s1>; -def Hexagon_M2_mpyu_ll_s0: +def HEXAGON_M2_mpyu_ll_s0: si_MInst_sisi_ll <"mpyu", int_hexagon_M2_mpyu_ll_s0>; -def Hexagon_M2_mpyu_ll_s1: +def HEXAGON_M2_mpyu_ll_s1: si_MInst_sisi_ll_s1 <"mpyu", int_hexagon_M2_mpyu_ll_s1>; //Rdd=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1] -def Hexagon_M2_mpyud_hh_s0: +def HEXAGON_M2_mpyud_hh_s0: di_MInst_sisi_hh <"mpyu", int_hexagon_M2_mpyud_hh_s0>; -def Hexagon_M2_mpyud_hh_s1: +def HEXAGON_M2_mpyud_hh_s1: di_MInst_sisi_hh_s1 <"mpyu", int_hexagon_M2_mpyud_hh_s1>; -def Hexagon_M2_mpyud_hl_s0: +def HEXAGON_M2_mpyud_hl_s0: di_MInst_sisi_hl <"mpyu", int_hexagon_M2_mpyud_hl_s0>; -def Hexagon_M2_mpyud_hl_s1: +def HEXAGON_M2_mpyud_hl_s1: di_MInst_sisi_hl_s1 <"mpyu", int_hexagon_M2_mpyud_hl_s1>; -def Hexagon_M2_mpyud_lh_s0: +def HEXAGON_M2_mpyud_lh_s0: di_MInst_sisi_lh <"mpyu", int_hexagon_M2_mpyud_lh_s0>; -def Hexagon_M2_mpyud_lh_s1: +def HEXAGON_M2_mpyud_lh_s1: di_MInst_sisi_lh_s1 <"mpyu", int_hexagon_M2_mpyud_lh_s1>; -def Hexagon_M2_mpyud_ll_s0: +def HEXAGON_M2_mpyud_ll_s0: di_MInst_sisi_ll <"mpyu", int_hexagon_M2_mpyud_ll_s0>; -def Hexagon_M2_mpyud_ll_s1: +def HEXAGON_M2_mpyud_ll_s1: di_MInst_sisi_ll_s1 <"mpyu", int_hexagon_M2_mpyud_ll_s1>; //Rd+=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1] -def Hexagon_M2_mpyu_acc_hh_s0: +def HEXAGON_M2_mpyu_acc_hh_s0: si_MInst_sisisi_acc_hh <"mpyu", int_hexagon_M2_mpyu_acc_hh_s0>; -def Hexagon_M2_mpyu_acc_hh_s1: +def HEXAGON_M2_mpyu_acc_hh_s1: si_MInst_sisisi_acc_hh_s1 <"mpyu", int_hexagon_M2_mpyu_acc_hh_s1>; -def Hexagon_M2_mpyu_acc_hl_s0: +def HEXAGON_M2_mpyu_acc_hl_s0: si_MInst_sisisi_acc_hl <"mpyu", int_hexagon_M2_mpyu_acc_hl_s0>; -def Hexagon_M2_mpyu_acc_hl_s1: +def HEXAGON_M2_mpyu_acc_hl_s1: si_MInst_sisisi_acc_hl_s1 <"mpyu", int_hexagon_M2_mpyu_acc_hl_s1>; -def Hexagon_M2_mpyu_acc_lh_s0: +def HEXAGON_M2_mpyu_acc_lh_s0: si_MInst_sisisi_acc_lh <"mpyu", int_hexagon_M2_mpyu_acc_lh_s0>; -def Hexagon_M2_mpyu_acc_lh_s1: +def HEXAGON_M2_mpyu_acc_lh_s1: si_MInst_sisisi_acc_lh_s1 <"mpyu", int_hexagon_M2_mpyu_acc_lh_s1>; -def Hexagon_M2_mpyu_acc_ll_s0: +def HEXAGON_M2_mpyu_acc_ll_s0: si_MInst_sisisi_acc_ll <"mpyu", int_hexagon_M2_mpyu_acc_ll_s0>; -def Hexagon_M2_mpyu_acc_ll_s1: +def HEXAGON_M2_mpyu_acc_ll_s1: si_MInst_sisisi_acc_ll_s1 <"mpyu", int_hexagon_M2_mpyu_acc_ll_s1>; //Rd-=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1] -def Hexagon_M2_mpyu_nac_hh_s0: +def HEXAGON_M2_mpyu_nac_hh_s0: si_MInst_sisisi_nac_hh <"mpyu", int_hexagon_M2_mpyu_nac_hh_s0>; -def Hexagon_M2_mpyu_nac_hh_s1: +def HEXAGON_M2_mpyu_nac_hh_s1: si_MInst_sisisi_nac_hh_s1 <"mpyu", int_hexagon_M2_mpyu_nac_hh_s1>; -def Hexagon_M2_mpyu_nac_hl_s0: +def HEXAGON_M2_mpyu_nac_hl_s0: si_MInst_sisisi_nac_hl <"mpyu", int_hexagon_M2_mpyu_nac_hl_s0>; -def Hexagon_M2_mpyu_nac_hl_s1: +def HEXAGON_M2_mpyu_nac_hl_s1: si_MInst_sisisi_nac_hl_s1 <"mpyu", int_hexagon_M2_mpyu_nac_hl_s1>; -def 
Hexagon_M2_mpyu_nac_lh_s0: +def HEXAGON_M2_mpyu_nac_lh_s0: si_MInst_sisisi_nac_lh <"mpyu", int_hexagon_M2_mpyu_nac_lh_s0>; -def Hexagon_M2_mpyu_nac_lh_s1: +def HEXAGON_M2_mpyu_nac_lh_s1: si_MInst_sisisi_nac_lh_s1 <"mpyu", int_hexagon_M2_mpyu_nac_lh_s1>; -def Hexagon_M2_mpyu_nac_ll_s0: +def HEXAGON_M2_mpyu_nac_ll_s0: si_MInst_sisisi_nac_ll <"mpyu", int_hexagon_M2_mpyu_nac_ll_s0>; -def Hexagon_M2_mpyu_nac_ll_s1: +def HEXAGON_M2_mpyu_nac_ll_s1: si_MInst_sisisi_nac_ll_s1 <"mpyu", int_hexagon_M2_mpyu_nac_ll_s1>; //Rdd+=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1] -def Hexagon_M2_mpyud_acc_hh_s0: +def HEXAGON_M2_mpyud_acc_hh_s0: di_MInst_disisi_acc_hh <"mpyu", int_hexagon_M2_mpyud_acc_hh_s0>; -def Hexagon_M2_mpyud_acc_hh_s1: +def HEXAGON_M2_mpyud_acc_hh_s1: di_MInst_disisi_acc_hh_s1 <"mpyu", int_hexagon_M2_mpyud_acc_hh_s1>; -def Hexagon_M2_mpyud_acc_hl_s0: +def HEXAGON_M2_mpyud_acc_hl_s0: di_MInst_disisi_acc_hl <"mpyu", int_hexagon_M2_mpyud_acc_hl_s0>; -def Hexagon_M2_mpyud_acc_hl_s1: +def HEXAGON_M2_mpyud_acc_hl_s1: di_MInst_disisi_acc_hl_s1 <"mpyu", int_hexagon_M2_mpyud_acc_hl_s1>; -def Hexagon_M2_mpyud_acc_lh_s0: +def HEXAGON_M2_mpyud_acc_lh_s0: di_MInst_disisi_acc_lh <"mpyu", int_hexagon_M2_mpyud_acc_lh_s0>; -def Hexagon_M2_mpyud_acc_lh_s1: +def HEXAGON_M2_mpyud_acc_lh_s1: di_MInst_disisi_acc_lh_s1 <"mpyu", int_hexagon_M2_mpyud_acc_lh_s1>; -def Hexagon_M2_mpyud_acc_ll_s0: +def HEXAGON_M2_mpyud_acc_ll_s0: di_MInst_disisi_acc_ll <"mpyu", int_hexagon_M2_mpyud_acc_ll_s0>; -def Hexagon_M2_mpyud_acc_ll_s1: +def HEXAGON_M2_mpyud_acc_ll_s1: di_MInst_disisi_acc_ll_s1 <"mpyu", int_hexagon_M2_mpyud_acc_ll_s1>; //Rdd-=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1] -def Hexagon_M2_mpyud_nac_hh_s0: +def HEXAGON_M2_mpyud_nac_hh_s0: di_MInst_disisi_nac_hh <"mpyu", int_hexagon_M2_mpyud_nac_hh_s0>; -def Hexagon_M2_mpyud_nac_hh_s1: +def HEXAGON_M2_mpyud_nac_hh_s1: di_MInst_disisi_nac_hh_s1 <"mpyu", int_hexagon_M2_mpyud_nac_hh_s1>; -def Hexagon_M2_mpyud_nac_hl_s0: +def HEXAGON_M2_mpyud_nac_hl_s0: di_MInst_disisi_nac_hl <"mpyu", int_hexagon_M2_mpyud_nac_hl_s0>; -def Hexagon_M2_mpyud_nac_hl_s1: +def HEXAGON_M2_mpyud_nac_hl_s1: di_MInst_disisi_nac_hl_s1 <"mpyu", int_hexagon_M2_mpyud_nac_hl_s1>; -def Hexagon_M2_mpyud_nac_lh_s0: +def HEXAGON_M2_mpyud_nac_lh_s0: di_MInst_disisi_nac_lh <"mpyu", int_hexagon_M2_mpyud_nac_lh_s0>; -def Hexagon_M2_mpyud_nac_lh_s1: +def HEXAGON_M2_mpyud_nac_lh_s1: di_MInst_disisi_nac_lh_s1 <"mpyu", int_hexagon_M2_mpyud_nac_lh_s1>; -def Hexagon_M2_mpyud_nac_ll_s0: +def HEXAGON_M2_mpyud_nac_ll_s0: di_MInst_disisi_nac_ll <"mpyu", int_hexagon_M2_mpyud_nac_ll_s0>; -def Hexagon_M2_mpyud_nac_ll_s1: +def HEXAGON_M2_mpyud_nac_ll_s1: di_MInst_disisi_nac_ll_s1 <"mpyu", int_hexagon_M2_mpyud_nac_ll_s1>; @@ -2864,15 +2900,15 @@ def Hexagon_M2_mpyud_nac_ll_s1: *********************************************************************/ // MTYPE / VB / Vector reduce add unsigned bytes. -def Hexagon_A2_vraddub: +def HEXAGON_A2_vraddub: di_MInst_didi <"vraddub", int_hexagon_A2_vraddub>; -def Hexagon_A2_vraddub_acc: +def HEXAGON_A2_vraddub_acc: di_MInst_dididi_acc <"vraddub", int_hexagon_A2_vraddub_acc>; // MTYPE / VB / Vector sum of absolute differences unsigned bytes. 
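// vrsadub below is the classic sum-of-absolute-differences kernel over the
// eight unsigned byte lanes; a simplified scalar sketch (the real
// instruction splits the reduction across the two words of the destination
// pair -- an assumption here; illustrative only):
#include <cstdint>
#include <cstdlib>
static inline uint32_t sad8(uint64_t rss, uint64_t rtt) {
  uint32_t sum = 0;
  for (int i = 0; i < 8; ++i) {
    int a = uint8_t(rss >> (8 * i));
    int b = uint8_t(rtt >> (8 * i));
    sum += uint32_t(std::abs(a - b));   // |byte difference|, accumulated
  }
  return sum;   // the _acc form adds this into the destination
}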
-def Hexagon_A2_vrsadub: +def HEXAGON_A2_vrsadub: di_MInst_didi <"vrsadub", int_hexagon_A2_vrsadub>; -def Hexagon_A2_vrsadub_acc: +def HEXAGON_A2_vrsadub_acc: di_MInst_dididi_acc <"vrsadub", int_hexagon_A2_vrsadub_acc>; /******************************************************************** @@ -2880,56 +2916,56 @@ def Hexagon_A2_vrsadub_acc: *********************************************************************/ // MTYPE / VH / Vector dual multiply. -def Hexagon_M2_vdmpys_s1: +def HEXAGON_M2_vdmpys_s1: di_MInst_didi_s1_sat <"vdmpy", int_hexagon_M2_vdmpys_s1>; -def Hexagon_M2_vdmpys_s0: +def HEXAGON_M2_vdmpys_s0: di_MInst_didi_sat <"vdmpy", int_hexagon_M2_vdmpys_s0>; -def Hexagon_M2_vdmacs_s1: +def HEXAGON_M2_vdmacs_s1: di_MInst_dididi_acc_s1_sat <"vdmpy", int_hexagon_M2_vdmacs_s1>; -def Hexagon_M2_vdmacs_s0: +def HEXAGON_M2_vdmacs_s0: di_MInst_dididi_acc_sat <"vdmpy", int_hexagon_M2_vdmacs_s0>; // MTYPE / VH / Vector dual multiply with round and pack. -def Hexagon_M2_vdmpyrs_s0: +def HEXAGON_M2_vdmpyrs_s0: si_MInst_didi_rnd_sat <"vdmpy", int_hexagon_M2_vdmpyrs_s0>; -def Hexagon_M2_vdmpyrs_s1: +def HEXAGON_M2_vdmpyrs_s1: si_MInst_didi_s1_rnd_sat <"vdmpy", int_hexagon_M2_vdmpyrs_s1>; // MTYPE / VH / Vector multiply even halfwords. -def Hexagon_M2_vmpy2es_s1: +def HEXAGON_M2_vmpy2es_s1: di_MInst_didi_s1_sat <"vmpyeh", int_hexagon_M2_vmpy2es_s1>; -def Hexagon_M2_vmpy2es_s0: +def HEXAGON_M2_vmpy2es_s0: di_MInst_didi_sat <"vmpyeh", int_hexagon_M2_vmpy2es_s0>; -def Hexagon_M2_vmac2es: +def HEXAGON_M2_vmac2es: di_MInst_dididi_acc <"vmpyeh", int_hexagon_M2_vmac2es>; -def Hexagon_M2_vmac2es_s1: +def HEXAGON_M2_vmac2es_s1: di_MInst_dididi_acc_s1_sat <"vmpyeh", int_hexagon_M2_vmac2es_s1>; -def Hexagon_M2_vmac2es_s0: +def HEXAGON_M2_vmac2es_s0: di_MInst_dididi_acc_sat <"vmpyeh", int_hexagon_M2_vmac2es_s0>; // MTYPE / VH / Vector multiply halfwords. -def Hexagon_M2_vmpy2s_s0: +def HEXAGON_M2_vmpy2s_s0: di_MInst_sisi_sat <"vmpyh", int_hexagon_M2_vmpy2s_s0>; -def Hexagon_M2_vmpy2s_s1: +def HEXAGON_M2_vmpy2s_s1: di_MInst_sisi_s1_sat <"vmpyh", int_hexagon_M2_vmpy2s_s1>; -def Hexagon_M2_vmac2: +def HEXAGON_M2_vmac2: di_MInst_disisi_acc <"vmpyh", int_hexagon_M2_vmac2>; -def Hexagon_M2_vmac2s_s0: +def HEXAGON_M2_vmac2s_s0: di_MInst_disisi_acc_sat <"vmpyh", int_hexagon_M2_vmac2s_s0>; -def Hexagon_M2_vmac2s_s1: +def HEXAGON_M2_vmac2s_s1: di_MInst_disisi_acc_s1_sat <"vmpyh", int_hexagon_M2_vmac2s_s1>; // MTYPE / VH / Vector multiply halfwords with round and pack. -def Hexagon_M2_vmpy2s_s0pack: +def HEXAGON_M2_vmpy2s_s0pack: si_MInst_sisi_rnd_sat <"vmpyh", int_hexagon_M2_vmpy2s_s0pack>; -def Hexagon_M2_vmpy2s_s1pack: +def HEXAGON_M2_vmpy2s_s1pack: si_MInst_sisi_s1_rnd_sat <"vmpyh", int_hexagon_M2_vmpy2s_s1pack>; // MTYPE / VH / Vector reduce multiply halfwords. // Rxx32+=vrmpyh(Rss32,Rtt32) -def Hexagon_M2_vrmpy_s0: +def HEXAGON_M2_vrmpy_s0: di_MInst_didi <"vrmpyh", int_hexagon_M2_vrmpy_s0>; -def Hexagon_M2_vrmac_s0: +def HEXAGON_M2_vrmac_s0: di_MInst_dididi_acc <"vrmpyh", int_hexagon_M2_vrmac_s0>; @@ -2938,25 +2974,25 @@ def Hexagon_M2_vrmac_s0: *********************************************************************/ // STYPE / ALU / Absolute value. -def Hexagon_A2_abs: +def HEXAGON_A2_abs: si_SInst_si <"abs", int_hexagon_A2_abs>; -def Hexagon_A2_absp: +def HEXAGON_A2_absp: di_SInst_di <"abs", int_hexagon_A2_absp>; -def Hexagon_A2_abssat: +def HEXAGON_A2_abssat: si_SInst_si_sat <"abs", int_hexagon_A2_abssat>; // STYPE / ALU / Negate. 
-def Hexagon_A2_negp: +def HEXAGON_A2_negp: di_SInst_di <"neg", int_hexagon_A2_negp>; -def Hexagon_A2_negsat: +def HEXAGON_A2_negsat: si_SInst_si_sat <"neg", int_hexagon_A2_negsat>; // STYPE / ALU / Logical Not. -def Hexagon_A2_notp: +def HEXAGON_A2_notp: di_SInst_di <"not", int_hexagon_A2_notp>; // STYPE / ALU / Sign extend word to doubleword. -def Hexagon_A2_sxtw: +def HEXAGON_A2_sxtw: di_SInst_si <"sxtw", int_hexagon_A2_sxtw>; @@ -2965,88 +3001,88 @@ def Hexagon_A2_sxtw: *********************************************************************/ // STYPE / BIT / Count leading. -def Hexagon_S2_cl0: +def HEXAGON_S2_cl0: si_SInst_si <"cl0", int_hexagon_S2_cl0>; -def Hexagon_S2_cl0p: +def HEXAGON_S2_cl0p: si_SInst_di <"cl0", int_hexagon_S2_cl0p>; -def Hexagon_S2_cl1: +def HEXAGON_S2_cl1: si_SInst_si <"cl1", int_hexagon_S2_cl1>; -def Hexagon_S2_cl1p: +def HEXAGON_S2_cl1p: si_SInst_di <"cl1", int_hexagon_S2_cl1p>; -def Hexagon_S2_clb: +def HEXAGON_S2_clb: si_SInst_si <"clb", int_hexagon_S2_clb>; -def Hexagon_S2_clbp: +def HEXAGON_S2_clbp: si_SInst_di <"clb", int_hexagon_S2_clbp>; -def Hexagon_S2_clbnorm: +def HEXAGON_S2_clbnorm: si_SInst_si <"normamt", int_hexagon_S2_clbnorm>; // STYPE / BIT / Count trailing. -def Hexagon_S2_ct0: +def HEXAGON_S2_ct0: si_SInst_si <"ct0", int_hexagon_S2_ct0>; -def Hexagon_S2_ct1: +def HEXAGON_S2_ct1: si_SInst_si <"ct1", int_hexagon_S2_ct1>; // STYPE / BIT / Compare bit mask. -def HEXAGON_C2_bitsclr: +def Hexagon_C2_bitsclr: qi_SInst_sisi <"bitsclr", int_hexagon_C2_bitsclr>; -def HEXAGON_C2_bitsclri: +def Hexagon_C2_bitsclri: qi_SInst_siu6 <"bitsclr", int_hexagon_C2_bitsclri>; -def HEXAGON_C2_bitsset: +def Hexagon_C2_bitsset: qi_SInst_sisi <"bitsset", int_hexagon_C2_bitsset>; // STYPE / BIT / Extract unsigned. // Rd[d][32/64]=extractu(Rs[s],Rt[t],[imm]) -def Hexagon_S2_extractu: +def HEXAGON_S2_extractu: si_SInst_siu5u5 <"extractu",int_hexagon_S2_extractu>; -def Hexagon_S2_extractu_rp: +def HEXAGON_S2_extractu_rp: si_SInst_sidi <"extractu",int_hexagon_S2_extractu_rp>; -def Hexagon_S2_extractup: +def HEXAGON_S2_extractup: di_SInst_diu6u6 <"extractu",int_hexagon_S2_extractup>; -def Hexagon_S2_extractup_rp: +def HEXAGON_S2_extractup_rp: di_SInst_didi <"extractu",int_hexagon_S2_extractup_rp>; // STYPE / BIT / Insert bitfield. -def HEXAGON_S2_insert: +def Hexagon_S2_insert: si_SInst_sisiu5u5 <"insert", int_hexagon_S2_insert>; -def HEXAGON_S2_insert_rp: +def Hexagon_S2_insert_rp: si_SInst_sisidi <"insert", int_hexagon_S2_insert_rp>; -def HEXAGON_S2_insertp: +def Hexagon_S2_insertp: di_SInst_didiu6u6 <"insert", int_hexagon_S2_insertp>; -def HEXAGON_S2_insertp_rp: +def Hexagon_S2_insertp_rp: di_SInst_dididi <"insert", int_hexagon_S2_insertp_rp>; // STYPE / BIT / Innterleave/deinterleave. -def HEXAGON_S2_interleave: +def Hexagon_S2_interleave: di_SInst_di <"interleave", int_hexagon_S2_interleave>; -def HEXAGON_S2_deinterleave: +def Hexagon_S2_deinterleave: di_SInst_di <"deinterleave", int_hexagon_S2_deinterleave>; // STYPE / BIT / Linear feedback-shift Iteration. -def HEXAGON_S2_lfsp: +def Hexagon_S2_lfsp: di_SInst_didi <"lfs", int_hexagon_S2_lfsp>; // STYPE / BIT / Bit reverse. -def HEXAGON_S2_brev: +def Hexagon_S2_brev: si_SInst_si <"brev", int_hexagon_S2_brev>; // STYPE / BIT / Set/Clear/Toggle Bit. 
-def Hexagon_S2_setbit_i: +def HEXAGON_S2_setbit_i: si_SInst_siu5 <"setbit", int_hexagon_S2_setbit_i>; -def Hexagon_S2_togglebit_i: +def HEXAGON_S2_togglebit_i: si_SInst_siu5 <"togglebit", int_hexagon_S2_togglebit_i>; -def Hexagon_S2_clrbit_i: +def HEXAGON_S2_clrbit_i: si_SInst_siu5 <"clrbit", int_hexagon_S2_clrbit_i>; -def Hexagon_S2_setbit_r: +def HEXAGON_S2_setbit_r: si_SInst_sisi <"setbit", int_hexagon_S2_setbit_r>; -def Hexagon_S2_togglebit_r: +def HEXAGON_S2_togglebit_r: si_SInst_sisi <"togglebit", int_hexagon_S2_togglebit_r>; -def Hexagon_S2_clrbit_r: +def HEXAGON_S2_clrbit_r: si_SInst_sisi <"clrbit", int_hexagon_S2_clrbit_r>; // STYPE / BIT / Test Bit. -def Hexagon_S2_tstbit_i: +def HEXAGON_S2_tstbit_i: qi_SInst_siu5 <"tstbit", int_hexagon_S2_tstbit_i>; -def Hexagon_S2_tstbit_r: +def HEXAGON_S2_tstbit_r: qi_SInst_sisi <"tstbit", int_hexagon_S2_tstbit_r>; @@ -3055,11 +3091,11 @@ def Hexagon_S2_tstbit_r: *********************************************************************/ // STYPE / COMPLEX / Vector Complex conjugate. -def Hexagon_A2_vconj: +def HEXAGON_A2_vconj: di_SInst_di_sat <"vconj", int_hexagon_A2_vconj>; // STYPE / COMPLEX / Vector Complex rotate. -def Hexagon_S2_vcrotate: +def HEXAGON_S2_vcrotate: di_SInst_disi <"vcrotate",int_hexagon_S2_vcrotate>; @@ -3068,102 +3104,102 @@ def Hexagon_S2_vcrotate: *********************************************************************/ // STYPE / PERM / Saturate. -def Hexagon_A2_sat: +def HEXAGON_A2_sat: si_SInst_di <"sat", int_hexagon_A2_sat>; -def Hexagon_A2_satb: +def HEXAGON_A2_satb: si_SInst_si <"satb", int_hexagon_A2_satb>; -def Hexagon_A2_sath: +def HEXAGON_A2_sath: si_SInst_si <"sath", int_hexagon_A2_sath>; -def Hexagon_A2_satub: +def HEXAGON_A2_satub: si_SInst_si <"satub", int_hexagon_A2_satub>; -def Hexagon_A2_satuh: +def HEXAGON_A2_satuh: si_SInst_si <"satuh", int_hexagon_A2_satuh>; // STYPE / PERM / Swizzle bytes. -def Hexagon_A2_swiz: +def HEXAGON_A2_swiz: si_SInst_si <"swiz", int_hexagon_A2_swiz>; // STYPE / PERM / Vector align. // Need custom lowering -def Hexagon_S2_valignib: +def HEXAGON_S2_valignib: di_SInst_didiu3 <"valignb", int_hexagon_S2_valignib>; -def Hexagon_S2_valignrb: +def HEXAGON_S2_valignrb: di_SInst_didiqi <"valignb", int_hexagon_S2_valignrb>; // STYPE / PERM / Vector round and pack. -def Hexagon_S2_vrndpackwh: +def HEXAGON_S2_vrndpackwh: si_SInst_di <"vrndwh", int_hexagon_S2_vrndpackwh>; -def Hexagon_S2_vrndpackwhs: +def HEXAGON_S2_vrndpackwhs: si_SInst_di_sat <"vrndwh", int_hexagon_S2_vrndpackwhs>; // STYPE / PERM / Vector saturate and pack. -def Hexagon_S2_svsathb: +def HEXAGON_S2_svsathb: si_SInst_si <"vsathb", int_hexagon_S2_svsathb>; -def Hexagon_S2_vsathb: +def HEXAGON_S2_vsathb: si_SInst_di <"vsathb", int_hexagon_S2_vsathb>; -def Hexagon_S2_svsathub: +def HEXAGON_S2_svsathub: si_SInst_si <"vsathub", int_hexagon_S2_svsathub>; -def Hexagon_S2_vsathub: +def HEXAGON_S2_vsathub: si_SInst_di <"vsathub", int_hexagon_S2_vsathub>; -def Hexagon_S2_vsatwh: +def HEXAGON_S2_vsatwh: si_SInst_di <"vsatwh", int_hexagon_S2_vsatwh>; -def Hexagon_S2_vsatwuh: +def HEXAGON_S2_vsatwuh: si_SInst_di <"vsatwuh", int_hexagon_S2_vsatwuh>; // STYPE / PERM / Vector saturate without pack. 
-def Hexagon_S2_vsathb_nopack: +def HEXAGON_S2_vsathb_nopack: di_SInst_di <"vsathb", int_hexagon_S2_vsathb_nopack>; -def Hexagon_S2_vsathub_nopack: +def HEXAGON_S2_vsathub_nopack: di_SInst_di <"vsathub", int_hexagon_S2_vsathub_nopack>; -def Hexagon_S2_vsatwh_nopack: +def HEXAGON_S2_vsatwh_nopack: di_SInst_di <"vsatwh", int_hexagon_S2_vsatwh_nopack>; -def Hexagon_S2_vsatwuh_nopack: +def HEXAGON_S2_vsatwuh_nopack: di_SInst_di <"vsatwuh", int_hexagon_S2_vsatwuh_nopack>; // STYPE / PERM / Vector shuffle. -def Hexagon_S2_shuffeb: +def HEXAGON_S2_shuffeb: di_SInst_didi <"shuffeb", int_hexagon_S2_shuffeb>; -def Hexagon_S2_shuffeh: +def HEXAGON_S2_shuffeh: di_SInst_didi <"shuffeh", int_hexagon_S2_shuffeh>; -def Hexagon_S2_shuffob: +def HEXAGON_S2_shuffob: di_SInst_didi <"shuffob", int_hexagon_S2_shuffob>; -def Hexagon_S2_shuffoh: +def HEXAGON_S2_shuffoh: di_SInst_didi <"shuffoh", int_hexagon_S2_shuffoh>; // STYPE / PERM / Vector splat bytes. -def Hexagon_S2_vsplatrb: +def HEXAGON_S2_vsplatrb: si_SInst_si <"vsplatb", int_hexagon_S2_vsplatrb>; // STYPE / PERM / Vector splat halfwords. -def Hexagon_S2_vsplatrh: +def HEXAGON_S2_vsplatrh: di_SInst_si <"vsplath", int_hexagon_S2_vsplatrh>; // STYPE / PERM / Vector splice. -def HEXAGON_S2_vsplicerb: +def Hexagon_S2_vsplicerb: di_SInst_didiqi <"vspliceb",int_hexagon_S2_vsplicerb>; -def HEXAGON_S2_vspliceib: +def Hexagon_S2_vspliceib: di_SInst_didiu3 <"vspliceb",int_hexagon_S2_vspliceib>; // STYPE / PERM / Sign extend. -def Hexagon_S2_vsxtbh: +def HEXAGON_S2_vsxtbh: di_SInst_si <"vsxtbh", int_hexagon_S2_vsxtbh>; -def Hexagon_S2_vsxthw: +def HEXAGON_S2_vsxthw: di_SInst_si <"vsxthw", int_hexagon_S2_vsxthw>; // STYPE / PERM / Truncate. -def Hexagon_S2_vtrunehb: +def HEXAGON_S2_vtrunehb: si_SInst_di <"vtrunehb",int_hexagon_S2_vtrunehb>; -def Hexagon_S2_vtrunohb: +def HEXAGON_S2_vtrunohb: si_SInst_di <"vtrunohb",int_hexagon_S2_vtrunohb>; -def Hexagon_S2_vtrunewh: +def HEXAGON_S2_vtrunewh: di_SInst_didi <"vtrunewh",int_hexagon_S2_vtrunewh>; -def Hexagon_S2_vtrunowh: +def HEXAGON_S2_vtrunowh: di_SInst_didi <"vtrunowh",int_hexagon_S2_vtrunowh>; // STYPE / PERM / Zero extend. -def Hexagon_S2_vzxtbh: +def HEXAGON_S2_vzxtbh: di_SInst_si <"vzxtbh", int_hexagon_S2_vzxtbh>; -def Hexagon_S2_vzxthw: +def HEXAGON_S2_vzxthw: di_SInst_si <"vzxthw", int_hexagon_S2_vzxthw>; @@ -3172,17 +3208,17 @@ def Hexagon_S2_vzxthw: *********************************************************************/ // STYPE / PRED / Mask generate from predicate. -def Hexagon_C2_mask: +def HEXAGON_C2_mask: di_SInst_qi <"mask", int_hexagon_C2_mask>; // STYPE / PRED / Predicate transfer. -def Hexagon_C2_tfrpr: +def HEXAGON_C2_tfrpr: si_SInst_qi <"", int_hexagon_C2_tfrpr>; -def Hexagon_C2_tfrrp: +def HEXAGON_C2_tfrrp: qi_SInst_si <"", int_hexagon_C2_tfrrp>; // STYPE / PRED / Viterbi pack even and odd predicate bits. -def Hexagon_C2_vitpack: +def HEXAGON_C2_vitpack: si_SInst_qiqi <"vitpack",int_hexagon_C2_vitpack>; @@ -3191,202 +3227,202 @@ def Hexagon_C2_vitpack: *********************************************************************/ // STYPE / SHIFT / Shift by immediate. 
-def Hexagon_S2_asl_i_r: +def HEXAGON_S2_asl_i_r: si_SInst_siu5 <"asl", int_hexagon_S2_asl_i_r>; -def Hexagon_S2_asr_i_r: +def HEXAGON_S2_asr_i_r: si_SInst_siu5 <"asr", int_hexagon_S2_asr_i_r>; -def Hexagon_S2_lsr_i_r: +def HEXAGON_S2_lsr_i_r: si_SInst_siu5 <"lsr", int_hexagon_S2_lsr_i_r>; -def Hexagon_S2_asl_i_p: +def HEXAGON_S2_asl_i_p: di_SInst_diu6 <"asl", int_hexagon_S2_asl_i_p>; -def Hexagon_S2_asr_i_p: +def HEXAGON_S2_asr_i_p: di_SInst_diu6 <"asr", int_hexagon_S2_asr_i_p>; -def Hexagon_S2_lsr_i_p: +def HEXAGON_S2_lsr_i_p: di_SInst_diu6 <"lsr", int_hexagon_S2_lsr_i_p>; // STYPE / SHIFT / Shift by immediate and accumulate. -def Hexagon_S2_asl_i_r_acc: +def HEXAGON_S2_asl_i_r_acc: si_SInst_sisiu5_acc <"asl", int_hexagon_S2_asl_i_r_acc>; -def Hexagon_S2_asr_i_r_acc: +def HEXAGON_S2_asr_i_r_acc: si_SInst_sisiu5_acc <"asr", int_hexagon_S2_asr_i_r_acc>; -def Hexagon_S2_lsr_i_r_acc: +def HEXAGON_S2_lsr_i_r_acc: si_SInst_sisiu5_acc <"lsr", int_hexagon_S2_lsr_i_r_acc>; -def Hexagon_S2_asl_i_r_nac: +def HEXAGON_S2_asl_i_r_nac: si_SInst_sisiu5_nac <"asl", int_hexagon_S2_asl_i_r_nac>; -def Hexagon_S2_asr_i_r_nac: +def HEXAGON_S2_asr_i_r_nac: si_SInst_sisiu5_nac <"asr", int_hexagon_S2_asr_i_r_nac>; -def Hexagon_S2_lsr_i_r_nac: +def HEXAGON_S2_lsr_i_r_nac: si_SInst_sisiu5_nac <"lsr", int_hexagon_S2_lsr_i_r_nac>; -def Hexagon_S2_asl_i_p_acc: +def HEXAGON_S2_asl_i_p_acc: di_SInst_didiu6_acc <"asl", int_hexagon_S2_asl_i_p_acc>; -def Hexagon_S2_asr_i_p_acc: +def HEXAGON_S2_asr_i_p_acc: di_SInst_didiu6_acc <"asr", int_hexagon_S2_asr_i_p_acc>; -def Hexagon_S2_lsr_i_p_acc: +def HEXAGON_S2_lsr_i_p_acc: di_SInst_didiu6_acc <"lsr", int_hexagon_S2_lsr_i_p_acc>; -def Hexagon_S2_asl_i_p_nac: +def HEXAGON_S2_asl_i_p_nac: di_SInst_didiu6_nac <"asl", int_hexagon_S2_asl_i_p_nac>; -def Hexagon_S2_asr_i_p_nac: +def HEXAGON_S2_asr_i_p_nac: di_SInst_didiu6_nac <"asr", int_hexagon_S2_asr_i_p_nac>; -def Hexagon_S2_lsr_i_p_nac: +def HEXAGON_S2_lsr_i_p_nac: di_SInst_didiu6_nac <"lsr", int_hexagon_S2_lsr_i_p_nac>; // STYPE / SHIFT / Shift by immediate and add. -def Hexagon_S2_addasl_rrri: +def HEXAGON_S2_addasl_rrri: si_SInst_sisiu3 <"addasl", int_hexagon_S2_addasl_rrri>; // STYPE / SHIFT / Shift by immediate and logical. 
-def Hexagon_S2_asl_i_r_and: +def HEXAGON_S2_asl_i_r_and: si_SInst_sisiu5_and <"asl", int_hexagon_S2_asl_i_r_and>; -def Hexagon_S2_asr_i_r_and: +def HEXAGON_S2_asr_i_r_and: si_SInst_sisiu5_and <"asr", int_hexagon_S2_asr_i_r_and>; -def Hexagon_S2_lsr_i_r_and: +def HEXAGON_S2_lsr_i_r_and: si_SInst_sisiu5_and <"lsr", int_hexagon_S2_lsr_i_r_and>; -def Hexagon_S2_asl_i_r_xacc: +def HEXAGON_S2_asl_i_r_xacc: si_SInst_sisiu5_xor <"asl", int_hexagon_S2_asl_i_r_xacc>; -def Hexagon_S2_lsr_i_r_xacc: +def HEXAGON_S2_lsr_i_r_xacc: si_SInst_sisiu5_xor <"lsr", int_hexagon_S2_lsr_i_r_xacc>; -def Hexagon_S2_asl_i_r_or: +def HEXAGON_S2_asl_i_r_or: si_SInst_sisiu5_or <"asl", int_hexagon_S2_asl_i_r_or>; -def Hexagon_S2_asr_i_r_or: +def HEXAGON_S2_asr_i_r_or: si_SInst_sisiu5_or <"asr", int_hexagon_S2_asr_i_r_or>; -def Hexagon_S2_lsr_i_r_or: +def HEXAGON_S2_lsr_i_r_or: si_SInst_sisiu5_or <"lsr", int_hexagon_S2_lsr_i_r_or>; -def Hexagon_S2_asl_i_p_and: +def HEXAGON_S2_asl_i_p_and: di_SInst_didiu6_and <"asl", int_hexagon_S2_asl_i_p_and>; -def Hexagon_S2_asr_i_p_and: +def HEXAGON_S2_asr_i_p_and: di_SInst_didiu6_and <"asr", int_hexagon_S2_asr_i_p_and>; -def Hexagon_S2_lsr_i_p_and: +def HEXAGON_S2_lsr_i_p_and: di_SInst_didiu6_and <"lsr", int_hexagon_S2_lsr_i_p_and>; -def Hexagon_S2_asl_i_p_xacc: +def HEXAGON_S2_asl_i_p_xacc: di_SInst_didiu6_xor <"asl", int_hexagon_S2_asl_i_p_xacc>; -def Hexagon_S2_lsr_i_p_xacc: +def HEXAGON_S2_lsr_i_p_xacc: di_SInst_didiu6_xor <"lsr", int_hexagon_S2_lsr_i_p_xacc>; -def Hexagon_S2_asl_i_p_or: +def HEXAGON_S2_asl_i_p_or: di_SInst_didiu6_or <"asl", int_hexagon_S2_asl_i_p_or>; -def Hexagon_S2_asr_i_p_or: +def HEXAGON_S2_asr_i_p_or: di_SInst_didiu6_or <"asr", int_hexagon_S2_asr_i_p_or>; -def Hexagon_S2_lsr_i_p_or: +def HEXAGON_S2_lsr_i_p_or: di_SInst_didiu6_or <"lsr", int_hexagon_S2_lsr_i_p_or>; // STYPE / SHIFT / Shift right by immediate with rounding. -def Hexagon_S2_asr_i_r_rnd: +def HEXAGON_S2_asr_i_r_rnd: si_SInst_siu5_rnd <"asr", int_hexagon_S2_asr_i_r_rnd>; -def Hexagon_S2_asr_i_r_rnd_goodsyntax: +def HEXAGON_S2_asr_i_r_rnd_goodsyntax: si_SInst_siu5 <"asrrnd", int_hexagon_S2_asr_i_r_rnd_goodsyntax>; // STYPE / SHIFT / Shift left by immediate with saturation. -def Hexagon_S2_asl_i_r_sat: +def HEXAGON_S2_asl_i_r_sat: si_SInst_sisi_sat <"asl", int_hexagon_S2_asl_i_r_sat>; // STYPE / SHIFT / Shift by register. -def Hexagon_S2_asl_r_r: +def HEXAGON_S2_asl_r_r: si_SInst_sisi <"asl", int_hexagon_S2_asl_r_r>; -def Hexagon_S2_asr_r_r: +def HEXAGON_S2_asr_r_r: si_SInst_sisi <"asr", int_hexagon_S2_asr_r_r>; -def Hexagon_S2_lsl_r_r: +def HEXAGON_S2_lsl_r_r: si_SInst_sisi <"lsl", int_hexagon_S2_lsl_r_r>; -def Hexagon_S2_lsr_r_r: +def HEXAGON_S2_lsr_r_r: si_SInst_sisi <"lsr", int_hexagon_S2_lsr_r_r>; -def Hexagon_S2_asl_r_p: +def HEXAGON_S2_asl_r_p: di_SInst_disi <"asl", int_hexagon_S2_asl_r_p>; -def Hexagon_S2_asr_r_p: +def HEXAGON_S2_asr_r_p: di_SInst_disi <"asr", int_hexagon_S2_asr_r_p>; -def Hexagon_S2_lsl_r_p: +def HEXAGON_S2_lsl_r_p: di_SInst_disi <"lsl", int_hexagon_S2_lsl_r_p>; -def Hexagon_S2_lsr_r_p: +def HEXAGON_S2_lsr_r_p: di_SInst_disi <"lsr", int_hexagon_S2_lsr_r_p>; // STYPE / SHIFT / Shift by register and accumulate. 
-def Hexagon_S2_asl_r_r_acc: +def HEXAGON_S2_asl_r_r_acc: si_SInst_sisisi_acc <"asl", int_hexagon_S2_asl_r_r_acc>; -def Hexagon_S2_asr_r_r_acc: +def HEXAGON_S2_asr_r_r_acc: si_SInst_sisisi_acc <"asr", int_hexagon_S2_asr_r_r_acc>; -def Hexagon_S2_lsl_r_r_acc: +def HEXAGON_S2_lsl_r_r_acc: si_SInst_sisisi_acc <"lsl", int_hexagon_S2_lsl_r_r_acc>; -def Hexagon_S2_lsr_r_r_acc: +def HEXAGON_S2_lsr_r_r_acc: si_SInst_sisisi_acc <"lsr", int_hexagon_S2_lsr_r_r_acc>; -def Hexagon_S2_asl_r_p_acc: +def HEXAGON_S2_asl_r_p_acc: di_SInst_didisi_acc <"asl", int_hexagon_S2_asl_r_p_acc>; -def Hexagon_S2_asr_r_p_acc: +def HEXAGON_S2_asr_r_p_acc: di_SInst_didisi_acc <"asr", int_hexagon_S2_asr_r_p_acc>; -def Hexagon_S2_lsl_r_p_acc: +def HEXAGON_S2_lsl_r_p_acc: di_SInst_didisi_acc <"lsl", int_hexagon_S2_lsl_r_p_acc>; -def Hexagon_S2_lsr_r_p_acc: +def HEXAGON_S2_lsr_r_p_acc: di_SInst_didisi_acc <"lsr", int_hexagon_S2_lsr_r_p_acc>; -def Hexagon_S2_asl_r_r_nac: +def HEXAGON_S2_asl_r_r_nac: si_SInst_sisisi_nac <"asl", int_hexagon_S2_asl_r_r_nac>; -def Hexagon_S2_asr_r_r_nac: +def HEXAGON_S2_asr_r_r_nac: si_SInst_sisisi_nac <"asr", int_hexagon_S2_asr_r_r_nac>; -def Hexagon_S2_lsl_r_r_nac: +def HEXAGON_S2_lsl_r_r_nac: si_SInst_sisisi_nac <"lsl", int_hexagon_S2_lsl_r_r_nac>; -def Hexagon_S2_lsr_r_r_nac: +def HEXAGON_S2_lsr_r_r_nac: si_SInst_sisisi_nac <"lsr", int_hexagon_S2_lsr_r_r_nac>; -def Hexagon_S2_asl_r_p_nac: +def HEXAGON_S2_asl_r_p_nac: di_SInst_didisi_nac <"asl", int_hexagon_S2_asl_r_p_nac>; -def Hexagon_S2_asr_r_p_nac: +def HEXAGON_S2_asr_r_p_nac: di_SInst_didisi_nac <"asr", int_hexagon_S2_asr_r_p_nac>; -def Hexagon_S2_lsl_r_p_nac: +def HEXAGON_S2_lsl_r_p_nac: di_SInst_didisi_nac <"lsl", int_hexagon_S2_lsl_r_p_nac>; -def Hexagon_S2_lsr_r_p_nac: +def HEXAGON_S2_lsr_r_p_nac: di_SInst_didisi_nac <"lsr", int_hexagon_S2_lsr_r_p_nac>; // STYPE / SHIFT / Shift by register and logical. 
-def Hexagon_S2_asl_r_r_and: +def HEXAGON_S2_asl_r_r_and: si_SInst_sisisi_and <"asl", int_hexagon_S2_asl_r_r_and>; -def Hexagon_S2_asr_r_r_and: +def HEXAGON_S2_asr_r_r_and: si_SInst_sisisi_and <"asr", int_hexagon_S2_asr_r_r_and>; -def Hexagon_S2_lsl_r_r_and: +def HEXAGON_S2_lsl_r_r_and: si_SInst_sisisi_and <"lsl", int_hexagon_S2_lsl_r_r_and>; -def Hexagon_S2_lsr_r_r_and: +def HEXAGON_S2_lsr_r_r_and: si_SInst_sisisi_and <"lsr", int_hexagon_S2_lsr_r_r_and>; -def Hexagon_S2_asl_r_r_or: +def HEXAGON_S2_asl_r_r_or: si_SInst_sisisi_or <"asl", int_hexagon_S2_asl_r_r_or>; -def Hexagon_S2_asr_r_r_or: +def HEXAGON_S2_asr_r_r_or: si_SInst_sisisi_or <"asr", int_hexagon_S2_asr_r_r_or>; -def Hexagon_S2_lsl_r_r_or: +def HEXAGON_S2_lsl_r_r_or: si_SInst_sisisi_or <"lsl", int_hexagon_S2_lsl_r_r_or>; -def Hexagon_S2_lsr_r_r_or: +def HEXAGON_S2_lsr_r_r_or: si_SInst_sisisi_or <"lsr", int_hexagon_S2_lsr_r_r_or>; -def Hexagon_S2_asl_r_p_and: +def HEXAGON_S2_asl_r_p_and: di_SInst_didisi_and <"asl", int_hexagon_S2_asl_r_p_and>; -def Hexagon_S2_asr_r_p_and: +def HEXAGON_S2_asr_r_p_and: di_SInst_didisi_and <"asr", int_hexagon_S2_asr_r_p_and>; -def Hexagon_S2_lsl_r_p_and: +def HEXAGON_S2_lsl_r_p_and: di_SInst_didisi_and <"lsl", int_hexagon_S2_lsl_r_p_and>; -def Hexagon_S2_lsr_r_p_and: +def HEXAGON_S2_lsr_r_p_and: di_SInst_didisi_and <"lsr", int_hexagon_S2_lsr_r_p_and>; -def Hexagon_S2_asl_r_p_or: +def HEXAGON_S2_asl_r_p_or: di_SInst_didisi_or <"asl", int_hexagon_S2_asl_r_p_or>; -def Hexagon_S2_asr_r_p_or: +def HEXAGON_S2_asr_r_p_or: di_SInst_didisi_or <"asr", int_hexagon_S2_asr_r_p_or>; -def Hexagon_S2_lsl_r_p_or: +def HEXAGON_S2_lsl_r_p_or: di_SInst_didisi_or <"lsl", int_hexagon_S2_lsl_r_p_or>; -def Hexagon_S2_lsr_r_p_or: +def HEXAGON_S2_lsr_r_p_or: di_SInst_didisi_or <"lsr", int_hexagon_S2_lsr_r_p_or>; // STYPE / SHIFT / Shift by register with saturation. -def Hexagon_S2_asl_r_r_sat: +def HEXAGON_S2_asl_r_r_sat: si_SInst_sisi_sat <"asl", int_hexagon_S2_asl_r_r_sat>; -def Hexagon_S2_asr_r_r_sat: +def HEXAGON_S2_asr_r_r_sat: si_SInst_sisi_sat <"asr", int_hexagon_S2_asr_r_r_sat>; // STYPE / SHIFT / Table Index. -def HEXAGON_S2_tableidxb_goodsyntax: +def Hexagon_S2_tableidxb_goodsyntax: si_MInst_sisiu4u5 <"tableidxb",int_hexagon_S2_tableidxb_goodsyntax>; -def HEXAGON_S2_tableidxd_goodsyntax: +def Hexagon_S2_tableidxd_goodsyntax: si_MInst_sisiu4u5 <"tableidxd",int_hexagon_S2_tableidxd_goodsyntax>; -def HEXAGON_S2_tableidxh_goodsyntax: +def Hexagon_S2_tableidxh_goodsyntax: si_MInst_sisiu4u5 <"tableidxh",int_hexagon_S2_tableidxh_goodsyntax>; -def HEXAGON_S2_tableidxw_goodsyntax: +def Hexagon_S2_tableidxw_goodsyntax: si_MInst_sisiu4u5 <"tableidxw",int_hexagon_S2_tableidxw_goodsyntax>; @@ -3396,29 +3432,29 @@ def HEXAGON_S2_tableidxw_goodsyntax: // STYPE / VH / Vector absolute value halfwords. // Rdd64=vabsh(Rss64) -def Hexagon_A2_vabsh: +def HEXAGON_A2_vabsh: di_SInst_di <"vabsh", int_hexagon_A2_vabsh>; -def Hexagon_A2_vabshsat: +def HEXAGON_A2_vabshsat: di_SInst_di_sat <"vabsh", int_hexagon_A2_vabshsat>; // STYPE / VH / Vector shift halfwords by immediate. // Rdd64=v[asl/asr/lsr]h(Rss64,Rt32) -def Hexagon_S2_asl_i_vh: +def HEXAGON_S2_asl_i_vh: di_SInst_disi <"vaslh", int_hexagon_S2_asl_i_vh>; -def Hexagon_S2_asr_i_vh: +def HEXAGON_S2_asr_i_vh: di_SInst_disi <"vasrh", int_hexagon_S2_asr_i_vh>; -def Hexagon_S2_lsr_i_vh: +def HEXAGON_S2_lsr_i_vh: di_SInst_disi <"vlsrh", int_hexagon_S2_lsr_i_vh>; // STYPE / VH / Vector shift halfwords by register. 
// Rdd64=v[asl/asr/lsl/lsr]w(Rss64,Rt32) -def Hexagon_S2_asl_r_vh: +def HEXAGON_S2_asl_r_vh: di_SInst_disi <"vaslh", int_hexagon_S2_asl_r_vh>; -def Hexagon_S2_asr_r_vh: +def HEXAGON_S2_asr_r_vh: di_SInst_disi <"vasrh", int_hexagon_S2_asr_r_vh>; -def Hexagon_S2_lsl_r_vh: +def HEXAGON_S2_lsl_r_vh: di_SInst_disi <"vlslh", int_hexagon_S2_lsl_r_vh>; -def Hexagon_S2_lsr_r_vh: +def HEXAGON_S2_lsr_r_vh: di_SInst_disi <"vlsrh", int_hexagon_S2_lsr_r_vh>; @@ -3427,36 +3463,41 @@ def Hexagon_S2_lsr_r_vh: *********************************************************************/ // STYPE / VW / Vector absolute value words. -def Hexagon_A2_vabsw: +def HEXAGON_A2_vabsw: di_SInst_di <"vabsw", int_hexagon_A2_vabsw>; -def Hexagon_A2_vabswsat: +def HEXAGON_A2_vabswsat: di_SInst_di_sat <"vabsw", int_hexagon_A2_vabswsat>; // STYPE / VW / Vector shift words by immediate. // Rdd64=v[asl/vsl]w(Rss64,Rt32) -def Hexagon_S2_asl_i_vw: +def HEXAGON_S2_asl_i_vw: di_SInst_disi <"vaslw", int_hexagon_S2_asl_i_vw>; -def Hexagon_S2_asr_i_vw: +def HEXAGON_S2_asr_i_vw: di_SInst_disi <"vasrw", int_hexagon_S2_asr_i_vw>; -def Hexagon_S2_lsr_i_vw: +def HEXAGON_S2_lsr_i_vw: di_SInst_disi <"vlsrw", int_hexagon_S2_lsr_i_vw>; // STYPE / VW / Vector shift words by register. // Rdd64=v[asl/vsl]w(Rss64,Rt32) -def Hexagon_S2_asl_r_vw: +def HEXAGON_S2_asl_r_vw: di_SInst_disi <"vaslw", int_hexagon_S2_asl_r_vw>; -def Hexagon_S2_asr_r_vw: +def HEXAGON_S2_asr_r_vw: di_SInst_disi <"vasrw", int_hexagon_S2_asr_r_vw>; -def Hexagon_S2_lsl_r_vw: +def HEXAGON_S2_lsl_r_vw: di_SInst_disi <"vlslw", int_hexagon_S2_lsl_r_vw>; -def Hexagon_S2_lsr_r_vw: +def HEXAGON_S2_lsr_r_vw: di_SInst_disi <"vlsrw", int_hexagon_S2_lsr_r_vw>; // STYPE / VW / Vector shift words with truncate and pack. -def Hexagon_S2_asr_r_svw_trun: +def HEXAGON_S2_asr_r_svw_trun: si_SInst_disi <"vasrw", int_hexagon_S2_asr_r_svw_trun>; -def Hexagon_S2_asr_i_svw_trun: +def HEXAGON_S2_asr_i_svw_trun: si_SInst_diu5 <"vasrw", int_hexagon_S2_asr_i_svw_trun>; +// LD / Circular loads. 
+def HEXAGON_circ_ldd: + di_LDInstPI_diu4 <"circ_ldd", int_hexagon_circ_ldd>; + include "HexagonIntrinsicsV3.td" include "HexagonIntrinsicsV4.td" +include "HexagonIntrinsicsV5.td" diff --git a/lib/Target/Hexagon/HexagonIntrinsicsDerived.td b/lib/Target/Hexagon/HexagonIntrinsicsDerived.td index 68eaf68..2788101 100644 --- a/lib/Target/Hexagon/HexagonIntrinsicsDerived.td +++ b/lib/Target/Hexagon/HexagonIntrinsicsDerived.td @@ -12,18 +12,28 @@ // Optimized with intrinisics accumulates // def : Pat <(mul DoubleRegs:$src1, DoubleRegs:$src2), - (COMBINE_rr - (Hexagon_M2_maci - (Hexagon_M2_maci (EXTRACT_SUBREG (MPYU64 (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg), - (EXTRACT_SUBREG DoubleRegs:$src2, subreg_loreg)), - subreg_hireg), - (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg), - (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg)), - (EXTRACT_SUBREG DoubleRegs:$src2, subreg_loreg), - (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg)), - (EXTRACT_SUBREG (MPYU64 (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg), - (EXTRACT_SUBREG DoubleRegs:$src2, subreg_loreg)), - subreg_loreg))>; + (i64 + (COMBINE_rr + (HEXAGON_M2_maci + (HEXAGON_M2_maci + (i32 + (EXTRACT_SUBREG + (i64 + (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), + subreg_loreg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), + subreg_loreg)))), + subreg_hireg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg))), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_loreg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg))), + (i32 + (EXTRACT_SUBREG + (i64 + (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), + subreg_loreg)))), subreg_loreg))))>; diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV5.td b/lib/Target/Hexagon/HexagonIntrinsicsV5.td new file mode 100644 index 0000000..1d44b52 --- /dev/null +++ b/lib/Target/Hexagon/HexagonIntrinsicsV5.td @@ -0,0 +1,395 @@ +class sf_SInst_sf<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1))]>; + +class si_SInst_sf<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1))]>; + +class sf_SInst_si<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1))]>; + +class sf_SInst_di<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src1))]>; + +class sf_SInst_df<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src1))]>; + +class si_SInst_df<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src1))]>; + +class df_SInst_sf<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1))]>; + +class di_SInst_sf<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins 
IntRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1))]>; + +class df_SInst_si<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1))]>; + +class df_SInst_df<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1))]>; + +class di_SInst_df<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1))]>; + + +class df_SInst_di<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1))]>; + +class sf_MInst_sfsf<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class df_MInst_dfdf<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>; + +class qi_ALU64_dfdf<string opc, Intrinsic IntID> + : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set PredRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>; + +class qi_ALU64_dfu5<string opc, Intrinsic IntID> + : ALU64_ri<(outs PredRegs:$dst), (ins DoubleRegs:$src1, u5Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")), + [(set PredRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>; + + +class sf_MInst_sfsfsf_acc<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, + IntRegs:$dst2), + !strconcat("$dst += ", !strconcat(opc , + "($src1, $src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, + IntRegs:$src2, IntRegs:$dst2))], + "$dst2 = $dst">; + +class sf_MInst_sfsfsf_nac<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, + IntRegs:$dst2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1, $src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, + IntRegs:$src2, IntRegs:$dst2))], + "$dst2 = $dst">; + + +class sf_MInst_sfsfsfsi_sc<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2, IntRegs:$src3), + !strconcat("$dst += ", !strconcat(opc , + "($src1, $src2, $src3):scale")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2, IntRegs:$src3))], + "$dst2 = $dst">; + +class sf_MInst_sfsfsf_acc_lib<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, + IntRegs:$dst2), + !strconcat("$dst += ", !strconcat(opc , + "($src1, $src2):lib")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, + IntRegs:$src2, IntRegs:$dst2))], + "$dst2 = $dst">; + +class sf_MInst_sfsfsf_nac_lib<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, + IntRegs:$dst2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1, $src2):lib")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, + 
IntRegs:$src2, IntRegs:$dst2))], + "$dst2 = $dst">; + +class df_MInst_dfdfdf_acc<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, + DoubleRegs:$dst2), + !strconcat("$dst += ", !strconcat(opc , + "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, + DoubleRegs:$src2, DoubleRegs:$dst2))], + "$dst2 = $dst">; + +class df_MInst_dfdfdf_nac<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, + DoubleRegs:$dst2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, + DoubleRegs:$src2, DoubleRegs:$dst2))], + "$dst2 = $dst">; + + +class df_MInst_dfdfdfsi_sc<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1, + DoubleRegs:$src2, IntRegs:$src3), + !strconcat("$dst += ", !strconcat(opc , + "($src1, $src2, $src3):scale")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1, + DoubleRegs:$src2, IntRegs:$src3))], + "$dst2 = $dst">; + +class df_MInst_dfdfdf_acc_lib<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, + DoubleRegs:$dst2), + !strconcat("$dst += ", !strconcat(opc , + "($src1, $src2):lib")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, + DoubleRegs:$src2, DoubleRegs:$dst2))], + "$dst2 = $dst">; + +class df_MInst_dfdfdf_nac_lib<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, + DoubleRegs:$dst2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1, $src2):lib")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, + DoubleRegs:$src2, DoubleRegs:$dst2))], + "$dst2 = $dst">; + +class qi_SInst_sfsf<string opc, Intrinsic IntID> + : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class qi_SInst_sfu5<string opc, Intrinsic IntID> + : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")), + [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>; + +class sf_ALU64_u10_pos<string opc, Intrinsic IntID> + : ALU64_ri<(outs IntRegs:$dst), (ins u10Imm:$src1), + !strconcat("$dst = ", !strconcat(opc , "#$src1):pos")), + [(set IntRegs:$dst, (IntID imm:$src1))]>; + +class sf_ALU64_u10_neg<string opc, Intrinsic IntID> + : ALU64_ri<(outs IntRegs:$dst), (ins u10Imm:$src1), + !strconcat("$dst = ", !strconcat(opc , "#$src1):neg")), + [(set IntRegs:$dst, (IntID imm:$src1))]>; + +class df_ALU64_u10_pos<string opc, Intrinsic IntID> + : ALU64_ri<(outs DoubleRegs:$dst), (ins u10Imm:$src1), + !strconcat("$dst = ", !strconcat(opc , "#$src1):pos")), + [(set DoubleRegs:$dst, (IntID imm:$src1))]>; + +class df_ALU64_u10_neg<string opc, Intrinsic IntID> + : ALU64_ri<(outs DoubleRegs:$dst), (ins u10Imm:$src1), + !strconcat("$dst = ", !strconcat(opc , "#$src1):neg")), + [(set DoubleRegs:$dst, (IntID imm:$src1))]>; + +class di_MInst_diu6<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>; + +class di_MInst_diu4_rnd<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u4Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, 
#$src2):rnd")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>; + +class si_MInst_diu4_rnd_sat<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, u4Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):rnd:sat")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>; + +class si_SInst_diu4_sat<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, u4Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):sat")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>; + + +def HEXAGON_C4_fastcorner9: + qi_SInst_qiqi <"fastcorner9", int_hexagon_C4_fastcorner9>; +def HEXAGON_C4_fastcorner9_not: + qi_SInst_qiqi <"!fastcorner9", int_hexagon_C4_fastcorner9_not>; +def HEXAGON_M5_vrmpybuu: + di_MInst_didi <"vrmpybu", int_hexagon_M5_vrmpybuu>; +def HEXAGON_M5_vrmacbuu: + di_MInst_dididi_acc <"vrmpybu", int_hexagon_M5_vrmacbuu>; +def HEXAGON_M5_vrmpybsu: + di_MInst_didi <"vrmpybsu", int_hexagon_M5_vrmpybsu>; +def HEXAGON_M5_vrmacbsu: + di_MInst_dididi_acc <"vrmpybsu", int_hexagon_M5_vrmacbsu>; +def HEXAGON_M5_vmpybuu: + di_MInst_sisi <"vmpybu", int_hexagon_M5_vmpybuu>; +def HEXAGON_M5_vmpybsu: + di_MInst_sisi <"vmpybsu", int_hexagon_M5_vmpybsu>; +def HEXAGON_M5_vmacbuu: + di_MInst_disisi_acc <"vmpybu", int_hexagon_M5_vmacbuu>; +def HEXAGON_M5_vmacbsu: + di_MInst_disisi_acc <"vmpybsu", int_hexagon_M5_vmacbsu>; +def HEXAGON_M5_vdmpybsu: + di_MInst_didi_sat <"vdmpybsu", int_hexagon_M5_vdmpybsu>; +def HEXAGON_M5_vdmacbsu: + di_MInst_dididi_acc_sat <"vdmpybsu", int_hexagon_M5_vdmacbsu>; +def HEXAGON_A5_vaddhubs: + si_SInst_didi_sat <"vaddhub", int_hexagon_A5_vaddhubs>; +def HEXAGON_S5_popcountp: + si_SInst_di <"popcount", int_hexagon_S5_popcountp>; +def HEXAGON_S5_asrhub_rnd_sat_goodsyntax: + si_MInst_diu4_rnd_sat <"vasrhub", int_hexagon_S5_asrhub_rnd_sat_goodsyntax>; +def HEXAGON_S5_asrhub_sat: + si_SInst_diu4_sat <"vasrhub", int_hexagon_S5_asrhub_sat>; +def HEXAGON_S5_vasrhrnd_goodsyntax: + di_MInst_diu4_rnd <"vasrh", int_hexagon_S5_vasrhrnd_goodsyntax>; +def HEXAGON_S2_asr_i_p_rnd: + di_SInst_diu6 <"asr", int_hexagon_S2_asr_i_p_rnd>; +def HEXAGON_S2_asr_i_p_rnd_goodsyntax: + di_MInst_diu6 <"asrrnd", int_hexagon_S2_asr_i_p_rnd_goodsyntax>; +def HEXAGON_F2_sfadd: + sf_MInst_sfsf <"sfadd", int_hexagon_F2_sfadd>; +def HEXAGON_F2_sfsub: + sf_MInst_sfsf <"sfsub", int_hexagon_F2_sfsub>; +def HEXAGON_F2_sfmpy: + sf_MInst_sfsf <"sfmpy", int_hexagon_F2_sfmpy>; +def HEXAGON_F2_sffma: + sf_MInst_sfsfsf_acc <"sfmpy", int_hexagon_F2_sffma>; +def HEXAGON_F2_sffma_sc: + sf_MInst_sfsfsfsi_sc <"sfmpy", int_hexagon_F2_sffma_sc>; +def HEXAGON_F2_sffms: + sf_MInst_sfsfsf_nac <"sfmpy", int_hexagon_F2_sffms>; +def HEXAGON_F2_sffma_lib: + sf_MInst_sfsfsf_acc_lib <"sfmpy", int_hexagon_F2_sffma_lib>; +def HEXAGON_F2_sffms_lib: + sf_MInst_sfsfsf_nac_lib <"sfmpy", int_hexagon_F2_sffms_lib>; +def HEXAGON_F2_sfcmpeq: + qi_SInst_sfsf <"sfcmp.eq", int_hexagon_F2_sfcmpeq>; +def HEXAGON_F2_sfcmpgt: + qi_SInst_sfsf <"sfcmp.gt", int_hexagon_F2_sfcmpgt>; +def HEXAGON_F2_sfcmpge: + qi_SInst_sfsf <"sfcmp.ge", int_hexagon_F2_sfcmpge>; +def HEXAGON_F2_sfcmpuo: + qi_SInst_sfsf <"sfcmp.uo", int_hexagon_F2_sfcmpuo>; +def HEXAGON_F2_sfmax: + sf_MInst_sfsf <"sfmax", int_hexagon_F2_sfmax>; +def HEXAGON_F2_sfmin: + sf_MInst_sfsf <"sfmin", int_hexagon_F2_sfmin>; +def HEXAGON_F2_sfclass: + qi_SInst_sfu5 <"sfclass", int_hexagon_F2_sfclass>; +def HEXAGON_F2_sfimm_p: + sf_ALU64_u10_pos <"sfmake", int_hexagon_F2_sfimm_p>; +def 
HEXAGON_F2_sfimm_n: + sf_ALU64_u10_neg <"sfmake", int_hexagon_F2_sfimm_n>; +def HEXAGON_F2_sffixupn: + sf_MInst_sfsf <"sffixupn", int_hexagon_F2_sffixupn>; +def HEXAGON_F2_sffixupd: + sf_MInst_sfsf <"sffixupd", int_hexagon_F2_sffixupd>; +def HEXAGON_F2_sffixupr: + sf_SInst_sf <"sffixupr", int_hexagon_F2_sffixupr>; +def HEXAGON_F2_dfadd: + df_MInst_dfdf <"dfadd", int_hexagon_F2_dfadd>; +def HEXAGON_F2_dfsub: + df_MInst_dfdf <"dfsub", int_hexagon_F2_dfsub>; +def HEXAGON_F2_dfmpy: + df_MInst_dfdf <"dfmpy", int_hexagon_F2_dfmpy>; +def HEXAGON_F2_dffma: + df_MInst_dfdfdf_acc <"dfmpy", int_hexagon_F2_dffma>; +def HEXAGON_F2_dffms: + df_MInst_dfdfdf_nac <"dfmpy", int_hexagon_F2_dffms>; +def HEXAGON_F2_dffma_lib: + df_MInst_dfdfdf_acc_lib <"dfmpy", int_hexagon_F2_dffma_lib>; +def HEXAGON_F2_dffms_lib: + df_MInst_dfdfdf_nac_lib <"dfmpy", int_hexagon_F2_dffms_lib>; +def HEXAGON_F2_dffma_sc: + df_MInst_dfdfdfsi_sc <"dfmpy", int_hexagon_F2_dffma_sc>; +def HEXAGON_F2_dfmax: + df_MInst_dfdf <"dfmax", int_hexagon_F2_dfmax>; +def HEXAGON_F2_dfmin: + df_MInst_dfdf <"dfmin", int_hexagon_F2_dfmin>; +def HEXAGON_F2_dfcmpeq: + qi_ALU64_dfdf <"dfcmp.eq", int_hexagon_F2_dfcmpeq>; +def HEXAGON_F2_dfcmpgt: + qi_ALU64_dfdf <"dfcmp.gt", int_hexagon_F2_dfcmpgt>; +def HEXAGON_F2_dfcmpge: + qi_ALU64_dfdf <"dfcmp.ge", int_hexagon_F2_dfcmpge>; +def HEXAGON_F2_dfcmpuo: + qi_ALU64_dfdf <"dfcmp.uo", int_hexagon_F2_dfcmpuo>; +def HEXAGON_F2_dfclass: + qi_ALU64_dfu5 <"dfclass", int_hexagon_F2_dfclass>; +def HEXAGON_F2_dfimm_p: + df_ALU64_u10_pos <"dfmake", int_hexagon_F2_dfimm_p>; +def HEXAGON_F2_dfimm_n: + df_ALU64_u10_neg <"dfmake", int_hexagon_F2_dfimm_n>; +def HEXAGON_F2_dffixupn: + df_MInst_dfdf <"dffixupn", int_hexagon_F2_dffixupn>; +def HEXAGON_F2_dffixupd: + df_MInst_dfdf <"dffixupd", int_hexagon_F2_dffixupd>; +def HEXAGON_F2_dffixupr: + df_SInst_df <"dffixupr", int_hexagon_F2_dffixupr>; +def HEXAGON_F2_conv_sf2df: + df_SInst_sf <"convert_sf2df", int_hexagon_F2_conv_sf2df>; +def HEXAGON_F2_conv_df2sf: + sf_SInst_df <"convert_df2sf", int_hexagon_F2_conv_df2sf>; +def HEXAGON_F2_conv_uw2sf: + sf_SInst_si <"convert_uw2sf", int_hexagon_F2_conv_uw2sf>; +def HEXAGON_F2_conv_uw2df: + df_SInst_si <"convert_uw2df", int_hexagon_F2_conv_uw2df>; +def HEXAGON_F2_conv_w2sf: + sf_SInst_si <"convert_w2sf", int_hexagon_F2_conv_w2sf>; +def HEXAGON_F2_conv_w2df: + df_SInst_si <"convert_w2df", int_hexagon_F2_conv_w2df>; +def HEXAGON_F2_conv_ud2sf: + sf_SInst_di <"convert_ud2sf", int_hexagon_F2_conv_ud2sf>; +def HEXAGON_F2_conv_ud2df: + df_SInst_di <"convert_ud2df", int_hexagon_F2_conv_ud2df>; +def HEXAGON_F2_conv_d2sf: + sf_SInst_di <"convert_d2sf", int_hexagon_F2_conv_d2sf>; +def HEXAGON_F2_conv_d2df: + df_SInst_di <"convert_d2df", int_hexagon_F2_conv_d2df>; +def HEXAGON_F2_conv_sf2uw: + si_SInst_sf <"convert_sf2uw", int_hexagon_F2_conv_sf2uw>; +def HEXAGON_F2_conv_sf2w: + si_SInst_sf <"convert_sf2w", int_hexagon_F2_conv_sf2w>; +def HEXAGON_F2_conv_sf2ud: + di_SInst_sf <"convert_sf2ud", int_hexagon_F2_conv_sf2ud>; +def HEXAGON_F2_conv_sf2d: + di_SInst_sf <"convert_sf2d", int_hexagon_F2_conv_sf2d>; +def HEXAGON_F2_conv_df2uw: + si_SInst_df <"convert_df2uw", int_hexagon_F2_conv_df2uw>; +def HEXAGON_F2_conv_df2w: + si_SInst_df <"convert_df2w", int_hexagon_F2_conv_df2w>; +def HEXAGON_F2_conv_df2ud: + di_SInst_df <"convert_df2ud", int_hexagon_F2_conv_df2ud>; +def HEXAGON_F2_conv_df2d: + di_SInst_df <"convert_df2d", int_hexagon_F2_conv_df2d>; +def HEXAGON_F2_conv_sf2uw_chop: + si_SInst_sf <"convert_sf2uw", int_hexagon_F2_conv_sf2uw_chop>; +def 
HEXAGON_F2_conv_sf2w_chop: + si_SInst_sf <"convert_sf2w", int_hexagon_F2_conv_sf2w_chop>; +def HEXAGON_F2_conv_sf2ud_chop: + di_SInst_sf <"convert_sf2ud", int_hexagon_F2_conv_sf2ud_chop>; +def HEXAGON_F2_conv_sf2d_chop: + di_SInst_sf <"convert_sf2d", int_hexagon_F2_conv_sf2d_chop>; +def HEXAGON_F2_conv_df2uw_chop: + si_SInst_df <"convert_df2uw", int_hexagon_F2_conv_df2uw_chop>; +def HEXAGON_F2_conv_df2w_chop: + si_SInst_df <"convert_df2w", int_hexagon_F2_conv_df2w_chop>; +def HEXAGON_F2_conv_df2ud_chop: + di_SInst_df <"convert_df2ud", int_hexagon_F2_conv_df2ud_chop>; +def HEXAGON_F2_conv_df2d_chop: + di_SInst_df <"convert_df2d", int_hexagon_F2_conv_df2d_chop>; diff --git a/lib/Target/Hexagon/HexagonMCInst.h b/lib/Target/Hexagon/HexagonMCInst.h new file mode 100644 index 0000000..7a16c24 --- /dev/null +++ b/lib/Target/Hexagon/HexagonMCInst.h @@ -0,0 +1,41 @@ +//===- HexagonMCInst.h - Hexagon sub-class of MCInst ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class extends MCInst to allow some VLIW annotation. +// +//===----------------------------------------------------------------------===// + +#ifndef HEXAGONMCINST_H +#define HEXAGONMCINST_H + +#include "llvm/MC/MCInst.h" +#include "llvm/CodeGen/MachineInstr.h" + +namespace llvm { + class HexagonMCInst: public MCInst { + // Packet start and end markers + unsigned startPacket: 1, endPacket: 1; + const MachineInstr *MachineI; + public: + explicit HexagonMCInst(): MCInst(), + startPacket(0), endPacket(0) {} + + const MachineInstr* getMI() const { return MachineI; } + + void setMI(const MachineInstr *MI) { MachineI = MI; } + + bool isStartPacket() const { return (startPacket); } + bool isEndPacket() const { return (endPacket); } + + void setStartPacket(bool yes) { startPacket = yes; } + void setEndPacket(bool yes) { endPacket = yes; } + }; +} + +#endif diff --git a/lib/Target/Hexagon/HexagonMCInstLower.cpp b/lib/Target/Hexagon/HexagonMCInstLower.cpp index fbb331b..70bddcc 100644 --- a/lib/Target/Hexagon/HexagonMCInstLower.cpp +++ b/lib/Target/Hexagon/HexagonMCInstLower.cpp @@ -49,7 +49,7 @@ void llvm::HexagonLowerToMC(const MachineInstr* MI, MCInst& MCI, switch (MO.getType()) { default: MI->dump(); - assert(0 && "unknown operand type"); + llvm_unreachable("unknown operand type"); case MachineOperand::MO_Register: // Ignore all implicit register operands. if (MO.isImplicit()) continue; diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp new file mode 100644 index 0000000..7ece408 --- /dev/null +++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp @@ -0,0 +1,647 @@ +//===----- HexagonNewValueJump.cpp - Hexagon Backend New Value Jump -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This implements NewValueJump pass in Hexagon. +// Ideally, we should merge this as a Peephole pass prior to register +// allocation, but because we have a spill in between the feeder and new value +// jump instructions, we are forced to write after register allocation. +// Having said that, we should re-attempt to pull this earlier at some point +// in future. 
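+//
+// A minimal sketch of how this pass is expected to be scheduled (the exact
+// pass-manager hook is an assumption; createHexagonNewValueJump() is the
+// factory defined at the bottom of this file):
+//
+//   PM.add(createHexagonNewValueJump());  // after register allocation
+//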
+
+// The basic approach looks for a sequence of a predicated jump, the compare
+// instruction that generates the predicate, and the feeder to the compare.
+// Once it finds all of them, it collapses the compare and jump instructions
+// into a new-value jump instruction.
+//
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "hexagon-nvj"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonMachineFunctionInfo.h"
+
+#include <map>
+
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+STATISTIC(NumNVJGenerated, "Number of New Value Jump Instructions created");
+
+static cl::opt<int>
+DbgNVJCount("nvj-count", cl::init(-1), cl::Hidden, cl::desc(
+  "Maximum number of predicated jumps to be converted to New Value Jump"));
+
+static cl::opt<bool> DisableNewValueJumps("disable-nvjump", cl::Hidden,
+                                          cl::ZeroOrMore, cl::init(false),
+                                          cl::desc("Disable New Value Jumps"));
+
+namespace {
+  struct HexagonNewValueJump : public MachineFunctionPass {
+    const HexagonInstrInfo *QII;
+    const HexagonRegisterInfo *QRI;
+
+  public:
+    static char ID;
+
+    HexagonNewValueJump() : MachineFunctionPass(ID) { }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    const char *getPassName() const {
+      return "Hexagon NewValueJump";
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+  private:
+
+  };
+
+} // end of anonymous namespace
+
+char HexagonNewValueJump::ID = 0;
+
+// We have identified that this II could be a feeder to NVJ;
+// verify that it can be.
+static bool canBeFeederToNewValueJump(const HexagonInstrInfo *QII,
+                                      const TargetRegisterInfo *TRI,
+                                      MachineBasicBlock::iterator II,
+                                      MachineBasicBlock::iterator end,
+                                      MachineBasicBlock::iterator skip,
+                                      MachineFunction &MF) {
+
+  // A predicated instruction cannot be a feeder to NVJ.
+  if (QII->isPredicated(II))
+    return false;
+
+  // Bail out if feederReg is a paired register (double regs in
+  // our case). One would think that we can check to see if a given
+  // register cmpReg1 or cmpReg2 is a sub register of feederReg
+  // using -- if (QRI->isSubRegister(feederReg, cmpReg1)) -- logic
+  // before the call site of this function, but we cannot, as the code
+  // comes in the following fashion:
+  // %D0<def> = Hexagon_S2_lsr_r_p %D0<kill>, %R2<kill>
+  // %R0<def> = KILL %R0, %D0<imp-use,kill>
+  // %P0<def> = CMPEQri %R0<kill>, 0
+  // Hence, we need to check if it's a KILL instruction.
+  if (II->getOpcode() == TargetOpcode::KILL)
+    return false;
+
+
+  // Make sure there is no 'def' or 'use' of any of the uses of the
+  // feeder insn between its definition, this MI, and the jump, jmpInst,
+  // skipping the compare, cmpInst.
+  // Here's the example.
+  // r21=memub(r22+r24<<#0)
+  // p0 = cmp.eq(r21, #0)
+  // r4=memub(r3+r21<<#0)
+  // if (p0.new) jump:t .LBB29_45
+  // Without this check, it will be converted into
+  // r4=memub(r3+r21<<#0)
+  // r21=memub(r22+r24<<#0)
+  // p0 = cmp.eq(r21, #0)
+  // if (p0.new) jump:t .LBB29_45
+  // and would result in WAR hazards if converted to a New Value Jump.
+
+  for (unsigned i = 0; i < II->getNumOperands(); ++i) {
+    if (II->getOperand(i).isReg() &&
+        (II->getOperand(i).isUse() || II->getOperand(i).isDef())) {
+      MachineBasicBlock::iterator localII = II;
+      ++localII;
+      unsigned Reg = II->getOperand(i).getReg();
+      for (MachineBasicBlock::iterator localBegin = localII;
+           localBegin != end; ++localBegin) {
+        if (localBegin == skip) continue;
+        // Check for subregisters too.
+        if (localBegin->modifiesRegister(Reg, TRI) ||
+            localBegin->readsRegister(Reg, TRI))
+          return false;
+      }
+    }
+  }
+  return true;
+}
+
+// These are the common checks that need to be performed
+// to determine if
+// 1. the compare instruction can be moved before the jump.
+// 2. the feeder to the compare instruction can be moved before the jump.
+static bool commonChecksToProhibitNewValueJump(bool afterRA,
+                                       MachineBasicBlock::iterator MII) {
+
+  // If a store is in the path, bail out.
+  if (MII->getDesc().mayStore())
+    return false;
+
+  // If a call is in the path, bail out.
+  if (MII->getOpcode() == Hexagon::CALLv3)
+    return false;
+
+  // If NVJ is running prior to RA, do the following checks.
+  if (!afterRA) {
+    // The following Target Opcode instructions are spurious
+    // to new value jump. If they are in the path, bail out.
+    // KILL sets the kill flag on the opcode. It also sets up a
+    // single register, out of a pair.
+    // %D0<def> = Hexagon_S2_lsr_r_p %D0<kill>, %R2<kill>
+    // %R0<def> = KILL %R0, %D0<imp-use,kill>
+    // %P0<def> = CMPEQri %R0<kill>, 0
+    // PHI can be anything after RA.
+    // COPY can rematerialize things in between the feeder, compare and nvj.
+    if (MII->getOpcode() == TargetOpcode::KILL ||
+        MII->getOpcode() == TargetOpcode::PHI ||
+        MII->getOpcode() == TargetOpcode::COPY)
+      return false;
+
+    // The following pseudo Hexagon instructions set the "use" and "def"
+    // of registers by individual passes in the backend. At this time,
+    // we don't know the scope of usage and definitions of these
+    // instructions.
+    if (MII->getOpcode() == Hexagon::TFR_condset_rr ||
+        MII->getOpcode() == Hexagon::TFR_condset_ii ||
+        MII->getOpcode() == Hexagon::TFR_condset_ri ||
+        MII->getOpcode() == Hexagon::TFR_condset_ir ||
+        MII->getOpcode() == Hexagon::LDriw_pred ||
+        MII->getOpcode() == Hexagon::STriw_pred)
+      return false;
+  }
+
+  return true;
+}
+
+static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII,
+                                     const TargetRegisterInfo *TRI,
+                                     MachineBasicBlock::iterator II,
+                                     unsigned pReg,
+                                     bool secondReg,
+                                     bool optLocation,
+                                     MachineBasicBlock::iterator end,
+                                     MachineFunction &MF) {
+
+  MachineInstr *MI = II;
+
+  // If the second operand of the compare is an imm, make sure it's in the
+  // range specified by the arch: an unsigned 5-bit value (0..31), with -1
+  // also accepted for the EQ/GT/GE register-immediate forms. CMPGEri and
+  // positive CMPGEUri are decremented first so they can reuse the GT
+  // encodings.
+  if (!secondReg) {
+    int64_t v = MI->getOperand(2).getImm();
+    if (MI->getOpcode() == Hexagon::CMPGEri ||
+        (MI->getOpcode() == Hexagon::CMPGEUri && v > 0))
+      --v;
+
+    if (!(isUInt<5>(v) ||
+          ((MI->getOpcode() == Hexagon::CMPEQri ||
+            MI->getOpcode() == Hexagon::CMPGTri ||
+            MI->getOpcode() == Hexagon::CMPGEri) &&
+           (v == -1))))
+      return false;
+  }
+
+  unsigned cmpReg1, cmpOp2 = 0; // cmpOp2 assignment silences a compiler warning.
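+
+  // What follows records the compare operands (rejecting a second operand
+  // that is just a COPY when running before RA) and then scans the window
+  // between the compare and the jump for anything that would make the
+  // transformation unsafe.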
+  cmpReg1 = MI->getOperand(1).getReg();
+
+  if (secondReg) {
+    cmpOp2 = MI->getOperand(2).getReg();
+
+    // Make sure that the second register is not from a COPY.
+    // At the machine code level, we don't need this, but if we decide
+    // to move new value jump prior to RA, we would need this.
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    if (secondReg && !TargetRegisterInfo::isPhysicalRegister(cmpOp2)) {
+      MachineInstr *def = MRI.getVRegDef(cmpOp2);
+      if (def->getOpcode() == TargetOpcode::COPY)
+        return false;
+    }
+  }
+
+  // Walk the instructions after the compare (predicate def) to the jump,
+  // and satisfy the following conditions.
+  ++II;
+  for (MachineBasicBlock::iterator localII = II; localII != end;
+       ++localII) {
+
+    // Check 1.
+    // If the "common" checks fail, bail out.
+    if (!commonChecksToProhibitNewValueJump(optLocation, localII))
+      return false;
+
+    // Check 2.
+    // If there is a def or use of the predicate (the result of the compare),
+    // bail out.
+    if (localII->modifiesRegister(pReg, TRI) ||
+        localII->readsRegister(pReg, TRI))
+      return false;
+
+    // Check 3.
+    // If there is a def of any of the uses of the compare (the operands of
+    // the compare), bail out.
+    // E.g.
+    // p0 = cmp.eq(r2, r0)
+    // r2 = r4
+    // if (p0.new) jump:t .LBB28_3
+    if (localII->modifiesRegister(cmpReg1, TRI) ||
+        (secondReg && localII->modifiesRegister(cmpOp2, TRI)))
+      return false;
+  }
+  return true;
+}
+
+// Given a compare operator, return a matching New Value Jump
+// compare operator. Make sure that MI here is included in
+// HexagonInstrInfo.cpp::isNewValueJumpCandidate.
+static unsigned getNewValueJumpOpcode(const MachineInstr *MI, int reg,
+                                      bool secondRegNewified) {
+  switch (MI->getOpcode()) {
+    case Hexagon::CMPEQrr:
+      return Hexagon::JMP_EQrrPt_nv_V4;
+
+    case Hexagon::CMPEQri: {
+      if (reg >= 0)
+        return Hexagon::JMP_EQriPt_nv_V4;
+      else
+        return Hexagon::JMP_EQriPtneg_nv_V4;
+    }
+
+    case Hexagon::CMPLTrr:
+    case Hexagon::CMPGTrr: {
+      if (secondRegNewified)
+        return Hexagon::JMP_GTrrdnPt_nv_V4;
+      else
+        return Hexagon::JMP_GTrrPt_nv_V4;
+    }
+
+    case Hexagon::CMPGEri: {
+      if (reg >= 1)
+        return Hexagon::JMP_GTriPt_nv_V4;
+      else
+        return Hexagon::JMP_GTriPtneg_nv_V4;
+    }
+
+    case Hexagon::CMPGTri: {
+      if (reg >= 0)
+        return Hexagon::JMP_GTriPt_nv_V4;
+      else
+        return Hexagon::JMP_GTriPtneg_nv_V4;
+    }
+
+    case Hexagon::CMPLTUrr:
+    case Hexagon::CMPGTUrr: {
+      if (secondRegNewified)
+        return Hexagon::JMP_GTUrrdnPt_nv_V4;
+      else
+        return Hexagon::JMP_GTUrrPt_nv_V4;
+    }
+
+    case Hexagon::CMPGTUri:
+      return Hexagon::JMP_GTUriPt_nv_V4;
+
+    case Hexagon::CMPGEUri: {
+      if (reg == 0)
+        return Hexagon::JMP_EQrrPt_nv_V4;
+      else
+        return Hexagon::JMP_GTUriPt_nv_V4;
+    }
+
+    default:
+      llvm_unreachable("Could not find matching New Value Jump instruction.");
+  }
+  // return *some value* to avoid compiler warning
+  return 0;
+}
+
+bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
+
+  DEBUG(dbgs() << "********** Hexagon New Value Jump **********\n"
+               << "********** Function: "
+               << MF.getFunction()->getName() << "\n");
+
+#if 0
+  // For now disable this; if we move NewValueJump before register
+  // allocation we will need this information.
+  LiveVariables &LVs = getAnalysis<LiveVariables>();
+#endif
+
+  QII = static_cast<const HexagonInstrInfo *>(MF.getTarget().getInstrInfo());
+  QRI =
+    static_cast<const HexagonRegisterInfo *>(MF.getTarget().getRegisterInfo());
+
+  if (!QRI->Subtarget.hasV4TOps() ||
+      DisableNewValueJumps) {
+    return false;
+  }
+
+  int nvjCount = DbgNVJCount;
+  int nvjGenerated = 0;
+
+  // Loop through all the bb's of the function.
+  for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end();
+       MBBb != MBBe; ++MBBb) {
+    MachineBasicBlock* MBB = MBBb;
+
+    DEBUG(dbgs() << "** dumping bb ** "
+                 << MBB->getNumber() << "\n");
+    DEBUG(MBB->dump());
+    DEBUG(dbgs() << "\n" << "********** dumping instr bottom up **********\n");
+    bool foundJump = false;
+    bool foundCompare = false;
+    bool invertPredicate = false;
+    unsigned predReg = 0; // predicate reg of the jump.
+    unsigned cmpReg1 = 0;
+    int cmpOp2 = 0;
+    bool MO1IsKill = false;
+    bool MO2IsKill = false;
+    MachineBasicBlock::iterator jmpPos;
+    MachineBasicBlock::iterator cmpPos;
+    MachineInstr *cmpInstr = NULL, *jmpInstr = NULL;
+    MachineBasicBlock *jmpTarget = NULL;
+    bool afterRA = false;
+    bool isSecondOpReg = false;
+    bool isSecondOpNewified = false;
+    // Traverse the basic block - bottom up.
+    for (MachineBasicBlock::iterator MII = MBB->end(), E = MBB->begin();
+         MII != E;) {
+      MachineInstr *MI = --MII;
+      if (MI->isDebugValue()) {
+        continue;
+      }
+
+      if ((nvjCount == 0) || (nvjCount > -1 && nvjCount <= nvjGenerated))
+        break;
+
+      DEBUG(dbgs() << "Instr: "; MI->dump(); dbgs() << "\n");
+
+      if (!foundJump &&
+         (MI->getOpcode() == Hexagon::JMP_c ||
+          MI->getOpcode() == Hexagon::JMP_cNot ||
+          MI->getOpcode() == Hexagon::JMP_cdnPt ||
+          MI->getOpcode() == Hexagon::JMP_cdnPnt ||
+          MI->getOpcode() == Hexagon::JMP_cdnNotPt ||
+          MI->getOpcode() == Hexagon::JMP_cdnNotPnt)) {
+        // This is where you would insert your compare and
+        // the instr that feeds the compare.
+        jmpPos = MII;
+        jmpInstr = MI;
+        predReg = MI->getOperand(0).getReg();
+        afterRA = TargetRegisterInfo::isPhysicalRegister(predReg);
+
+        // If the ifconverter had not messed up the kill flags of the
+        // operands, the following check on the kill flag would suffice.
+        // if(!jmpInstr->getOperand(0).isKill()) break;
+
+        // Checking whether this predicate register is live out of the BB
+        // would only work if we could actually use Live
+        // variable analysis on phys regs - but LLVM does not
+        // provide LV analysis on phys regs.
+        //if(LVs.isLiveOut(predReg, *MBB)) break;
+
+        // Get all the successors of this block - which will always
+        // be 2. Check if the predicate register is live in to those
+        // successors. If yes, we cannot delete the predicate -
+        // I am doing this only because LLVM does not provide LiveOut
+        // at the BB level.
+        bool predLive = false;
+        for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(),
+             SIE = MBB->succ_end(); SI != SIE; ++SI) {
+          MachineBasicBlock* succMBB = *SI;
+          if (succMBB->isLiveIn(predReg)) {
+            predLive = true;
+          }
+        }
+        if (predLive)
+          break;
+
+        jmpTarget = MI->getOperand(1).getMBB();
+        foundJump = true;
+        if (MI->getOpcode() == Hexagon::JMP_cNot ||
+            MI->getOpcode() == Hexagon::JMP_cdnNotPt ||
+            MI->getOpcode() == Hexagon::JMP_cdnNotPnt) {
+          invertPredicate = true;
+        }
+        continue;
+      }
+
+      // No new value jump if there is a barrier. A barrier has to be in its
+      // own packet. A barrier has zero operands. We conservatively bail out
+      // here if we see any instruction with zero operands.
+      if (foundJump && MI->getNumOperands() == 0)
+        break;
+
+      if (foundJump &&
+          !foundCompare &&
+          MI->getOperand(0).isReg() &&
+          MI->getOperand(0).getReg() == predReg) {
+
+        // Not all compares can become new value compares. Arch Spec: 7.6.1.1
+        if (QII->isNewValueJumpCandidate(MI)) {
+
+          assert((MI->getDesc().isCompare()) &&
+                 "Only compare instruction can be collapsed into New Value Jump");
+          isSecondOpReg = MI->getOperand(2).isReg();
+
+          if (!canCompareBeNewValueJump(QII, QRI, MII, predReg, isSecondOpReg,
+                                        afterRA, jmpPos, MF))
+            break;
+
+          cmpInstr = MI;
+          cmpPos = MII;
+          foundCompare = true;
+
+          // We need cmpReg1 and cmpOp2 (imm or reg) while building the
+          // new value jump instruction.
+          cmpReg1 = MI->getOperand(1).getReg();
+          if (MI->getOperand(1).isKill())
+            MO1IsKill = true;
+
+          if (isSecondOpReg) {
+            cmpOp2 = MI->getOperand(2).getReg();
+            if (MI->getOperand(2).isKill())
+              MO2IsKill = true;
+          } else
+            cmpOp2 = MI->getOperand(2).getImm();
+          continue;
+        }
+      }
+
+      if (foundCompare && foundJump) {
+
+        // If the "common" checks fail, bail out on this BB.
+        if (!commonChecksToProhibitNewValueJump(afterRA, MII))
+          break;
+
+        bool foundFeeder = false;
+        MachineBasicBlock::iterator feederPos = MII;
+        if (MI->getOperand(0).isReg() &&
+            MI->getOperand(0).isDef() &&
+            (MI->getOperand(0).getReg() == cmpReg1 ||
+             (isSecondOpReg &&
+              MI->getOperand(0).getReg() == (unsigned) cmpOp2))) {
+
+          unsigned feederReg = MI->getOperand(0).getReg();
+
+          // First try to see if we can get the feeder from the first
+          // operand of the compare. If we cannot, and if secondOpReg is
+          // true (the second operand of the compare is also a register),
+          // try that one.
+          // TODO: Try to come up with a heuristic to figure out which
+          // feeder would be more profitable.
+
+          if (feederReg == cmpReg1) {
+            if (!canBeFeederToNewValueJump(QII, QRI, MII, jmpPos, cmpPos, MF)) {
+              if (!isSecondOpReg)
+                break;
+              else
+                continue;
+            } else
+              foundFeeder = true;
+          }
+
+          if (!foundFeeder &&
+              isSecondOpReg &&
+              feederReg == (unsigned) cmpOp2)
+            if (!canBeFeederToNewValueJump(QII, QRI, MII, jmpPos, cmpPos, MF))
+              break;
+
+          if (isSecondOpReg) {
+            // In the case of CMPLT, CMPLTU, or EQ with the second register
+            // to newify, swap the operands.
+            if (cmpInstr->getOpcode() == Hexagon::CMPLTrr ||
+                cmpInstr->getOpcode() == Hexagon::CMPLTUrr ||
+                (cmpInstr->getOpcode() == Hexagon::CMPEQrr &&
+                 feederReg == (unsigned) cmpOp2)) {
+              unsigned tmp = cmpReg1;
+              bool tmpIsKill = MO1IsKill;
+              cmpReg1 = cmpOp2;
+              MO1IsKill = MO2IsKill;
+              cmpOp2 = tmp;
+              MO2IsKill = tmpIsKill;
+            }
+
+            // Now that we have swapped the operands, all we need to check
+            // is whether the second operand (after the swap) is the
+            // feeder; if it is, make a note of it.
+            if (feederReg == (unsigned)cmpOp2)
+              isSecondOpNewified = true;
+          }
+
+          // Now that we are moving the feeder close to the jump, make sure
+          // we respect the kill values of the feeder's operands.
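+          // Illustrative example (hypothetical registers, not from the
+          // original patch):
+          //     r1 = memw(r0)        // feeder, uses r0
+          //     r2 = add(r0, #1)     // currently carries the kill of r0
+          //     p0 = cmp.eq(r1, #0)
+          //     if (p0.new) jump:t ...
+          // Once the feeder is spliced down next to the jump, the add comes
+          // first, so the kill of r0 must migrate from the add to the
+          // feeder's own use of r0. The loop below performs that transfer.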
+
+          bool updatedIsKill = false;
+          for (unsigned i = 0; i < MI->getNumOperands(); i++) {
+            MachineOperand &MO = MI->getOperand(i);
+            if (MO.isReg() && MO.isUse()) {
+              unsigned feederReg = MO.getReg();
+              for (MachineBasicBlock::iterator localII = feederPos,
+                   end = jmpPos; localII != end; localII++) {
+                MachineInstr *localMI = localII;
+                for (unsigned j = 0; j < localMI->getNumOperands(); j++) {
+                  MachineOperand &localMO = localMI->getOperand(j);
+                  if (localMO.isReg() && localMO.isUse() &&
+                      localMO.isKill() && feederReg == localMO.getReg()) {
+                    // A later use of one of the feeder's source registers
+                    // carries the kill. Clear that kill flag and set it on
+                    // the feeder's operand instead.
+                    localMO.setIsKill(false);
+                    MO.setIsKill();
+                    updatedIsKill = true;
+                    break;
+                  }
+                }
+                if (updatedIsKill) break;
+              }
+            }
+            if (updatedIsKill) break;
+          }
+
+          MBB->splice(jmpPos, MI->getParent(), MI);
+          MBB->splice(jmpPos, MI->getParent(), cmpInstr);
+          DebugLoc dl = MI->getDebugLoc();
+          MachineInstr *NewMI;
+
+          assert((QII->isNewValueJumpCandidate(cmpInstr)) &&
+                 "This compare is not a New Value Jump candidate.");
+          unsigned opc = getNewValueJumpOpcode(cmpInstr, cmpOp2,
+                                               isSecondOpNewified);
+          if (invertPredicate)
+            opc = QII->getInvertedPredicatedOpcode(opc);
+
+          // Manage the conversions from CMPGEUri to either CMPEQrr or
+          // CMPGTUri properly. See the Arch spec for CMPGEUri instructions.
+          // This has to come after the getNewValueJumpOpcode call, as the
+          // second operand of the compare could be modified by this logic.
+          if (cmpInstr->getOpcode() == Hexagon::CMPGEUri) {
+            if (cmpOp2 == 0) {
+              cmpOp2 = cmpReg1;
+              MO2IsKill = MO1IsKill;
+              isSecondOpReg = true;
+            } else
+              --cmpOp2;
+          }
+
+          // Manage the conversion from CMPGEri to CMPGTri properly.
+          // See the Arch spec for CMPGEri instructions.
+          if (cmpInstr->getOpcode() == Hexagon::CMPGEri)
+            --cmpOp2;
+
+          if (isSecondOpReg) {
+            NewMI = BuildMI(*MBB, jmpPos, dl,
+                            QII->get(opc))
+                      .addReg(cmpReg1, getKillRegState(MO1IsKill))
+                      .addReg(cmpOp2, getKillRegState(MO2IsKill))
+                      .addMBB(jmpTarget);
+          }
+          else {
+            NewMI = BuildMI(*MBB, jmpPos, dl,
+                            QII->get(opc))
+                      .addReg(cmpReg1, getKillRegState(MO1IsKill))
+                      .addImm(cmpOp2)
+                      .addMBB(jmpTarget);
+          }
+
+          assert(NewMI && "New Value Jump Instruction Not created!");
+          if (cmpInstr->getOperand(0).isReg() &&
+              cmpInstr->getOperand(0).isKill())
+            cmpInstr->getOperand(0).setIsKill(false);
+          if (cmpInstr->getOperand(1).isReg() &&
+              cmpInstr->getOperand(1).isKill())
+            cmpInstr->getOperand(1).setIsKill(false);
+          cmpInstr->eraseFromParent();
+          jmpInstr->eraseFromParent();
+          ++nvjGenerated;
+          ++NumNVJGenerated;
+          break;
+        }
+      }
+    }
+  }
+
+  return true;
+
+}
+
+FunctionPass *llvm::createHexagonNewValueJump() {
+  return new HexagonNewValueJump();
+}
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 2a9de92..2c23674 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -63,6 +63,7 @@ const uint16_t* HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction
     return CalleeSavedRegsV2;
   case HexagonSubtarget::V3:
   case HexagonSubtarget::V4:
+  case HexagonSubtarget::V5:
     return CalleeSavedRegsV3;
   }
   llvm_unreachable("Callee saved registers requested for unknown architecture "
@@ -109,6 +110,7 @@ HexagonRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
     return CalleeSavedRegClassesV2;
   case HexagonSubtarget::V3:
   case HexagonSubtarget::V4:
+  case HexagonSubtarget::V5:
     return CalleeSavedRegClassesV3;
   }
   llvm_unreachable("Callee saved register classes
requested for unknown " @@ -179,13 +181,15 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // r0 = add(r30, #10000) // r0 = memw(r0) if ( (MI.getOpcode() == Hexagon::LDriw) || - (MI.getOpcode() == Hexagon::LDrid) || - (MI.getOpcode() == Hexagon::LDrih) || - (MI.getOpcode() == Hexagon::LDriuh) || - (MI.getOpcode() == Hexagon::LDrib) || - (MI.getOpcode() == Hexagon::LDriub) ) { + (MI.getOpcode() == Hexagon::LDrid) || + (MI.getOpcode() == Hexagon::LDrih) || + (MI.getOpcode() == Hexagon::LDriuh) || + (MI.getOpcode() == Hexagon::LDrib) || + (MI.getOpcode() == Hexagon::LDriub) || + (MI.getOpcode() == Hexagon::LDriw_f) || + (MI.getOpcode() == Hexagon::LDrid_f)) { unsigned dstReg = (MI.getOpcode() == Hexagon::LDrid) ? - *getSubRegisters(MI.getOperand(0).getReg()) : + getSubReg(MI.getOperand(0).getReg(), Hexagon::subreg_loreg) : MI.getOperand(0).getReg(); // Check if offset can fit in addi. @@ -203,10 +207,13 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(i).ChangeToRegister(dstReg, false, false, true); MI.getOperand(i+1).ChangeToImmediate(0); - } else if ((MI.getOpcode() == Hexagon::STriw) || + } else if ((MI.getOpcode() == Hexagon::STriw_indexed) || + (MI.getOpcode() == Hexagon::STriw) || (MI.getOpcode() == Hexagon::STrid) || (MI.getOpcode() == Hexagon::STrih) || - (MI.getOpcode() == Hexagon::STrib)) { + (MI.getOpcode() == Hexagon::STrib) || + (MI.getOpcode() == Hexagon::STrid_f) || + (MI.getOpcode() == Hexagon::STriw_f)) { // For stores, we need a reserved register. Change // memw(r30 + #10000) = r0 to: // diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h index 6cf727b..85355ae 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.h +++ b/lib/Target/Hexagon/HexagonRegisterInfo.h @@ -73,6 +73,10 @@ struct HexagonRegisterInfo : public HexagonGenRegisterInfo { return true; } + bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const { + return true; + } + // Debug information queries. unsigned getRARegister() const; unsigned getFrameRegister(const MachineFunction &MF) const; diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.td b/lib/Target/Hexagon/HexagonRegisterInfo.td index d44eae3..fe41fc3 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.td +++ b/lib/Target/Hexagon/HexagonRegisterInfo.td @@ -131,6 +131,9 @@ let Namespace = "Hexagon" in { def SA1 : Rc<2, "sa1">, DwarfRegNum<[69]>; def LC1 : Rc<3, "lc1">, DwarfRegNum<[70]>; + def M0 : Rc<6, "m0">, DwarfRegNum<[71]>; + def M1 : Rc<7, "m1">, DwarfRegNum<[72]>; + def PC : Rc<9, "pc">, DwarfRegNum<[32]>; // is the Dwarf number correct? def GP : Rc<11, "gp">, DwarfRegNum<[33]>; // is the Dwarf number correct? } @@ -140,19 +143,15 @@ let Namespace = "Hexagon" in { // FIXME: the register order should be defined in terms of the preferred // allocation order... 
// -def IntRegs : RegisterClass<"Hexagon", [i32], 32, +def IntRegs : RegisterClass<"Hexagon", [i32,f32], 32, (add (sequence "R%u", 0, 9), (sequence "R%u", 12, 28), R10, R11, R29, R30, R31)> { } - - -def DoubleRegs : RegisterClass<"Hexagon", [i64], 64, +def DoubleRegs : RegisterClass<"Hexagon", [i64,f64], 64, (add (sequence "D%u", 0, 4), - (sequence "D%u", 6, 13), D5, D14, D15)> { - let SubRegClasses = [(IntRegs subreg_loreg, subreg_hireg)]; -} + (sequence "D%u", 6, 13), D5, D14, D15)>; def PredRegs : RegisterClass<"Hexagon", [i1], 32, (add (sequence "P%u", 0, 3))> @@ -162,6 +161,7 @@ def PredRegs : RegisterClass<"Hexagon", [i1], 32, (add (sequence "P%u", 0, 3))> def CRRegs : RegisterClass<"Hexagon", [i32], 32, (add (sequence "LC%u", 0, 1), - (sequence "SA%u", 0, 1), PC, GP)> { + (sequence "SA%u", 0, 1), + (sequence "M%u", 0, 1), PC, GP)> { let Size = 32; } diff --git a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp index 66a00e1..2468f0b 100644 --- a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp +++ b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp @@ -1,4 +1,4 @@ -//===- HexagonRemoveExtendArgs.cpp - Remove unecessary argument sign extends =// +//===- HexagonRemoveExtendArgs.cpp - Remove unnecessary argument sign extends // // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/Hexagon/HexagonSchedule.td b/lib/Target/Hexagon/HexagonSchedule.td index fbea445..d1076b8 100644 --- a/lib/Target/Hexagon/HexagonSchedule.td +++ b/lib/Target/Hexagon/HexagonSchedule.td @@ -13,7 +13,6 @@ def LSUNIT : FuncUnit; def MUNIT : FuncUnit; def SUNIT : FuncUnit; - // Itinerary classes def ALU32 : InstrItinClass; def ALU64 : InstrItinClass; @@ -24,23 +23,31 @@ def LD : InstrItinClass; def M : InstrItinClass; def ST : InstrItinClass; def S : InstrItinClass; +def SYS : InstrItinClass; +def MARKER : InstrItinClass; def PSEUDO : InstrItinClass; - def HexagonItineraries : - ProcessorItineraries<[LUNIT, LSUNIT, MUNIT, SUNIT], [], [ - InstrItinData<ALU32 , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]>, - InstrItinData<ALU64 , [InstrStage<1, [MUNIT, SUNIT]>]>, - InstrItinData<CR , [InstrStage<1, [SUNIT]>]>, - InstrItinData<J , [InstrStage<1, [SUNIT, MUNIT]>]>, - InstrItinData<JR , [InstrStage<1, [MUNIT]>]>, - InstrItinData<LD , [InstrStage<1, [LUNIT, LSUNIT]>]>, - InstrItinData<M , [InstrStage<1, [MUNIT, SUNIT]>]>, - InstrItinData<ST , [InstrStage<1, [LSUNIT]>]>, - InstrItinData<S , [InstrStage<1, [SUNIT, MUNIT]>]>, - InstrItinData<PSEUDO , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]> -]>; - + ProcessorItineraries<[LUNIT, LSUNIT, MUNIT, SUNIT], [], [ + InstrItinData<ALU32 , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]>, + InstrItinData<ALU64 , [InstrStage<1, [MUNIT, SUNIT]>]>, + InstrItinData<CR , [InstrStage<1, [SUNIT]>]>, + InstrItinData<J , [InstrStage<1, [SUNIT, MUNIT]>]>, + InstrItinData<JR , [InstrStage<1, [MUNIT]>]>, + InstrItinData<LD , [InstrStage<1, [LUNIT, LSUNIT]>]>, + InstrItinData<M , [InstrStage<1, [MUNIT, SUNIT]>]>, + InstrItinData<ST , [InstrStage<1, [LSUNIT]>]>, + InstrItinData<S , [InstrStage<1, [SUNIT, MUNIT]>]>, + InstrItinData<SYS , [InstrStage<1, [LSUNIT]>]>, + InstrItinData<MARKER , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]>, + InstrItinData<PSEUDO , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]> + ]>; + +def HexagonModel : SchedMachineModel { + // Max issue per cycle == bundle width. 
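+  // (Four matches the four functional units modeled above: a Hexagon
+  // packet issues at most one instruction per slot.)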
+ let IssueWidth = 4; + let Itineraries = HexagonItineraries; +} //===----------------------------------------------------------------------===// // V4 Machine Info + diff --git a/lib/Target/Hexagon/HexagonScheduleV4.td b/lib/Target/Hexagon/HexagonScheduleV4.td index 4cf66fe..9b41126 100644 --- a/lib/Target/Hexagon/HexagonScheduleV4.td +++ b/lib/Target/Hexagon/HexagonScheduleV4.td @@ -23,7 +23,6 @@ // | SLOT3 | XTYPE ALU32 J CR | // |===========|==================================================| - // Functional Units. def SLOT0 : FuncUnit; def SLOT1 : FuncUnit; @@ -34,22 +33,32 @@ def SLOT3 : FuncUnit; def NV_V4 : InstrItinClass; def MEM_V4 : InstrItinClass; // ALU64/M/S Instruction classes of V2 are collectively knownn as XTYPE in V4. +def PREFIX : InstrItinClass; + +def HexagonItinerariesV4 : + ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3], [], [ + InstrItinData<ALU32 , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU64 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<CR , [InstrStage<1, [SLOT3]>]>, + InstrItinData<J , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<JR , [InstrStage<1, [SLOT2]>]>, + InstrItinData<LD , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<M , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<ST , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<S , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<SYS , [InstrStage<1, [SLOT0]>]>, + InstrItinData<NV_V4 , [InstrStage<1, [SLOT0]>]>, + InstrItinData<MEM_V4 , [InstrStage<1, [SLOT0]>]>, + InstrItinData<MARKER , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]> + ]>; -def HexagonItinerariesV4 : ProcessorItineraries< - [SLOT0, SLOT1, SLOT2, SLOT3], [], [ - InstrItinData<LD , [InstrStage<1, [SLOT0, SLOT1]>]>, - InstrItinData<ST , [InstrStage<1, [SLOT0, SLOT1]>]>, - InstrItinData<ALU32 , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, - InstrItinData<NV_V4 , [InstrStage<1, [SLOT0]>]>, - InstrItinData<MEM_V4 , [InstrStage<1, [SLOT0]>]>, - InstrItinData<J , [InstrStage<1, [SLOT2, SLOT3]>]>, - InstrItinData<JR , [InstrStage<1, [SLOT2]>]>, - InstrItinData<CR , [InstrStage<1, [SLOT3]>]>, - InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, - InstrItinData<ALU64 , [InstrStage<1, [SLOT2, SLOT3]>]>, - InstrItinData<M , [InstrStage<1, [SLOT2, SLOT3]>]>, - InstrItinData<S , [InstrStage<1, [SLOT2, SLOT3]>]> -]>; +def HexagonModelV4 : SchedMachineModel { + // Max issue per cycle == bundle width. + let IssueWidth = 4; + let Itineraries = HexagonItinerariesV4; +} //===----------------------------------------------------------------------===// // Hexagon V4 Resource Definitions - diff --git a/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp b/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp index d10c9f2..a81cd91 100644 --- a/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp +++ b/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp @@ -14,7 +14,7 @@ // {p0 = cmp.eq(r0,r1)} // {r3 = mux(p0,#1,#3)} // -// This requires two packets. If we use .new predicated immediate transfers, +// This requires two packets. 
If we use .new predicated immediate transfers,
 // then we can do this in a single packet, e.g.:
 //
 // {p0 = cmp.eq(r0,r1)
@@ -81,40 +81,126 @@ bool HexagonSplitTFRCondSets::runOnMachineFunction(MachineFunction &Fn) {
     for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
          ++MII) {
       MachineInstr *MI = MII;
-      int Opc = MI->getOpcode();
-      if (Opc == Hexagon::TFR_condset_rr) {
-
-        int DestReg = MI->getOperand(0).getReg();
-        int SrcReg1 = MI->getOperand(2).getReg();
-        int SrcReg2 = MI->getOperand(3).getReg();
-
-        // Minor optimization: do not emit the predicated copy if the source and
-        // the destination is the same register
-        if (DestReg != SrcReg1) {
-          BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_cPt),
-                  DestReg).addReg(MI->getOperand(1).getReg()).addReg(SrcReg1);
+      int Opc1, Opc2;
+      switch (MI->getOpcode()) {
+        case Hexagon::TFR_condset_rr:
+        case Hexagon::TFR_condset_rr_f:
+        case Hexagon::TFR_condset_rr64_f: {
+          int DestReg = MI->getOperand(0).getReg();
+          int SrcReg1 = MI->getOperand(2).getReg();
+          int SrcReg2 = MI->getOperand(3).getReg();
+
+          if (MI->getOpcode() == Hexagon::TFR_condset_rr ||
+              MI->getOpcode() == Hexagon::TFR_condset_rr_f) {
+            Opc1 = Hexagon::TFR_cPt;
+            Opc2 = Hexagon::TFR_cNotPt;
+          }
+          else if (MI->getOpcode() == Hexagon::TFR_condset_rr64_f) {
+            Opc1 = Hexagon::TFR64_cPt;
+            Opc2 = Hexagon::TFR64_cNotPt;
+          }
+
+          // Minor optimization: do not emit the predicated copy if the
+          // source and the destination are the same register.
+          if (DestReg != SrcReg1) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Opc1),
+                    DestReg).addReg(MI->getOperand(1).getReg()).addReg(SrcReg1);
+          }
+          if (DestReg != SrcReg2) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Opc2),
+                    DestReg).addReg(MI->getOperand(1).getReg()).addReg(SrcReg2);
+          }
+          MII = MBB->erase(MI);
+          --MII;
+          break;
         }
-        if (DestReg != SrcReg2) {
-          BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_cNotPt),
-                  DestReg).addReg(MI->getOperand(1).getReg()).addReg(SrcReg2);
+        case Hexagon::TFR_condset_ri:
+        case Hexagon::TFR_condset_ri_f: {
+          int DestReg = MI->getOperand(0).getReg();
+          int SrcReg1 = MI->getOperand(2).getReg();
+
+          // Do not emit the predicated copy if the source and the
+          // destination are the same register.
+          if (DestReg != SrcReg1) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFR_cPt), DestReg).
+              addReg(MI->getOperand(1).getReg()).addReg(SrcReg1);
+          }
+          if (MI->getOpcode() == Hexagon::TFR_condset_ri) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cNotPt), DestReg).
+              addReg(MI->getOperand(1).getReg()).
+              addImm(MI->getOperand(3).getImm());
+          } else if (MI->getOpcode() == Hexagon::TFR_condset_ri_f) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cNotPt_f), DestReg).
+              addReg(MI->getOperand(1).getReg()).
+              addFPImm(MI->getOperand(3).getFPImm());
+          }
+
+          MII = MBB->erase(MI);
+          --MII;
+          break;
+        }
+        case Hexagon::TFR_condset_ir:
+        case Hexagon::TFR_condset_ir_f: {
+          int DestReg = MI->getOperand(0).getReg();
+          int SrcReg2 = MI->getOperand(3).getReg();
+
+          if (MI->getOpcode() == Hexagon::TFR_condset_ir) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cPt), DestReg).
+              addReg(MI->getOperand(1).getReg()).
+              addImm(MI->getOperand(2).getImm());
+          } else if (MI->getOpcode() == Hexagon::TFR_condset_ir_f) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cPt_f), DestReg).
+              addReg(MI->getOperand(1).getReg()).
+              addFPImm(MI->getOperand(2).getFPImm());
+          }
+
+          // Do not emit the predicated copy if the source and the
+          // destination are the same register.
+          if (DestReg != SrcReg2) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFR_cNotPt), DestReg).
+              addReg(MI->getOperand(1).getReg()).addReg(SrcReg2);
+          }
+          MII = MBB->erase(MI);
+          --MII;
+          break;
+        }
+        case Hexagon::TFR_condset_ii:
+        case Hexagon::TFR_condset_ii_f: {
+          int DestReg = MI->getOperand(0).getReg();
+          int SrcReg1 = MI->getOperand(1).getReg();
+
+          if (MI->getOpcode() == Hexagon::TFR_condset_ii) {
+            int Immed1 = MI->getOperand(2).getImm();
+            int Immed2 = MI->getOperand(3).getImm();
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cPt),
+                    DestReg).addReg(SrcReg1).addImm(Immed1);
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cNotPt),
+                    DestReg).addReg(SrcReg1).addImm(Immed2);
+          } else if (MI->getOpcode() == Hexagon::TFR_condset_ii_f) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cPt_f), DestReg).
+              addReg(SrcReg1).
+              addFPImm(MI->getOperand(2).getFPImm());
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cNotPt_f), DestReg).
+              addReg(SrcReg1).
+              addFPImm(MI->getOperand(3).getFPImm());
+          }
+          MII = MBB->erase(MI);
+          --MII;
+          break;
        }
-        MII = MBB->erase(MI);
-        --MII;
-      } else if (Opc == Hexagon::TFR_condset_ii) {
-        int DestReg = MI->getOperand(0).getReg();
-        int SrcReg1 = MI->getOperand(1).getReg();
-        int Immed1 = MI->getOperand(2).getImm();
-        int Immed2 = MI->getOperand(3).getImm();
-        BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFRI_cPt),
-                DestReg).addReg(SrcReg1).addImm(Immed1);
-        BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFRI_cNotPt),
-                DestReg).addReg(SrcReg1).addImm(Immed2);
-        MII = MBB->erase(MI);
-        --MII;
-      }
     }
   }
-
   return true;
 }
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index 654d336..5d087db 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -13,6 +13,7 @@
 
 #include "HexagonSubtarget.h"
 #include "Hexagon.h"
+#include "HexagonRegisterInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 using namespace llvm;
@@ -29,11 +30,17 @@ static cl::opt<bool> EnableMemOps(
   "enable-hexagon-memops",
   cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed,
-  cl::desc("Generate V4 MEMOP in code generation for Hexagon target"));
+  cl::desc("Generate V4 memop instructions."));
+
+static cl::opt<bool>
+EnableIEEERndNear(
+  "enable-hexagon-ieee-rnd-near",
+  cl::Hidden, cl::ZeroOrMore, cl::init(false),
+  cl::desc("Generate non-chopped conversion from fp to int."));
 
 HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS):
   HexagonGenSubtargetInfo(TT, CPU, FS),
-  HexagonArchVersion(V1),
+  HexagonArchVersion(V2),
   CPUString(CPU.str()) {
   ParseSubtargetFeatures(CPU, FS);
 
@@ -45,18 +52,27 @@ HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS):
     break;
   case HexagonSubtarget::V4:
     break;
+  case HexagonSubtarget::V5:
+    break;
   default:
-    llvm_unreachable("Unknown Architecture Version.");
+    // If the programmer has not specified a Hexagon version, default
+    // to -mv4.
+    CPUString = "hexagonv4";
+    HexagonArchVersion = HexagonSubtarget::V4;
+    break;
   }
 
   // Initialize scheduling itinerary for the specified CPU.
   InstrItins = getInstrItineraryForCPU(CPUString);
 
-  // Max issue per cycle == bundle width.
- InstrItins.IssueWidth = 4; - if (EnableMemOps) UseMemOps = true; else UseMemOps = false; + + if (EnableIEEERndNear) + ModeIEEERndNear = true; + else + ModeIEEERndNear = false; } + diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h index 3079086..5d9d6d8 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.h +++ b/lib/Target/Hexagon/HexagonSubtarget.h @@ -22,16 +22,18 @@ #include "HexagonGenSubtargetInfo.inc" #define Hexagon_SMALL_DATA_THRESHOLD 8 +#define Hexagon_SLOTS 4 namespace llvm { class HexagonSubtarget : public HexagonGenSubtargetInfo { bool UseMemOps; + bool ModeIEEERndNear; public: enum HexagonArchEnum { - V1, V2, V3, V4 + V1, V2, V3, V4, V5 }; HexagonArchEnum HexagonArchVersion; @@ -55,7 +57,11 @@ public: bool hasV3TOps () const { return HexagonArchVersion >= V3; } bool hasV3TOpsOnly () const { return HexagonArchVersion == V3; } bool hasV4TOps () const { return HexagonArchVersion >= V4; } + bool hasV4TOpsOnly () const { return HexagonArchVersion == V4; } bool useMemOps () const { return HexagonArchVersion >= V4 && UseMemOps; } + bool hasV5TOps () const { return HexagonArchVersion >= V5; } + bool hasV5TOpsOnly () const { return HexagonArchVersion == V5; } + bool modeIEEERndNear () const { return ModeIEEERndNear; } bool isSubtargetV2() const { return HexagonArchVersion == V2;} const std::string &getCPUString () const { return CPUString; } diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index b9e6894..a7b291f 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -55,7 +55,9 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, StringRef TT, CodeModel::Model CM, CodeGenOpt::Level OL) : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - DataLayout("e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-a0:0") , + DataLayout("e-p:32:32:32-" + "i64:64:64-i32:32:32-i16:16:16-i1:32:32-" + "f64:64:64-f32:32:32-a0:0-n32") , Subtarget(TT, CPU, FS), InstrInfo(Subtarget), TLInfo(*this), TSInfo(*this), FrameLowering(Subtarget), @@ -100,43 +102,47 @@ TargetPassConfig *HexagonTargetMachine::createPassConfig(PassManagerBase &PM) { } bool HexagonPassConfig::addInstSelector() { - PM.add(createHexagonRemoveExtendOps(getHexagonTargetMachine())); - PM.add(createHexagonISelDag(getHexagonTargetMachine())); - PM.add(createHexagonPeephole()); + addPass(createHexagonRemoveExtendOps(getHexagonTargetMachine())); + addPass(createHexagonISelDag(getHexagonTargetMachine())); + addPass(createHexagonPeephole()); return false; } bool HexagonPassConfig::addPreRegAlloc() { if (!DisableHardwareLoops) { - PM.add(createHexagonHardwareLoops()); + addPass(createHexagonHardwareLoops()); } - return false; } bool HexagonPassConfig::addPostRegAlloc() { - PM.add(createHexagonCFGOptimizer(getHexagonTargetMachine())); + addPass(createHexagonCFGOptimizer(getHexagonTargetMachine())); return true; } bool HexagonPassConfig::addPreSched2() { - addPass(IfConverterID); + addPass(&IfConverterID); return true; } bool HexagonPassConfig::addPreEmitPass() { if (!DisableHardwareLoops) { - PM.add(createHexagonFixupHwLoops()); + addPass(createHexagonFixupHwLoops()); } + addPass(createHexagonNewValueJump()); + // Expand Spill code for predicate registers. - PM.add(createHexagonExpandPredSpillCode(getHexagonTargetMachine())); + addPass(createHexagonExpandPredSpillCode(getHexagonTargetMachine())); // Split up TFRcondsets into conditional transfers. 
-  PM.add(createHexagonSplitTFRCondSets(getHexagonTargetMachine()));
+  addPass(createHexagonSplitTFRCondSets(getHexagonTargetMachine()));
+
+  // Create Packets.
+  addPass(createHexagonPacketizer());
 
   return false;
 }
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
new file mode 100644
index 0000000..a03ed03
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -0,0 +1,3646 @@
+//===----- HexagonPacketizer.cpp - VLIW packetizer ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a simple VLIW packetizer using a DFA. The packetizer works
+// on machine basic blocks. For each instruction I in a BB, the packetizer
+// consults the DFA to see if machine resources are available to execute I.
+// If so, the packetizer checks whether I depends on any instruction J in the
+// current packet. If no dependency is found, I is added to the current
+// packet and the machine resources are marked as taken. If any dependency is
+// found, a target API call is made to prune the dependence.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "packets"
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonMachineFunctionInfo.h"
+
+#include <map>
+
+using namespace llvm;
+
+namespace {
+  class HexagonPacketizer : public MachineFunctionPass {
+
+  public:
+    static char ID;
+    HexagonPacketizer() : MachineFunctionPass(ID) {}
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<MachineDominatorTree>();
+      AU.addPreserved<MachineDominatorTree>();
+      AU.addRequired<MachineLoopInfo>();
+      AU.addPreserved<MachineLoopInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    const char *getPassName() const {
+      return "Hexagon Packetizer";
+    }
+
+    bool runOnMachineFunction(MachineFunction &Fn);
+  };
+  char HexagonPacketizer::ID = 0;
+
+  class HexagonPacketizerList : public VLIWPacketizerList {
+
+  private:
+
+    // Has the instruction been promoted to a dot-new instruction?
+    bool PromotedToDotNew;
+
+    // Has the instruction been glued to an allocframe?
+    bool GlueAllocframeStore;
+
+    // Has the feeder instruction been glued to a new value jump?
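+    // (The "feeder" is the instruction that defines the register a new
+    // value jump consumes; see HexagonNewValueJump.cpp earlier in this
+    // patch.)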
+    bool GlueToNewValueJump;
+
+    // Check if there is a dependence between some instruction already in
+    // this packet and this instruction.
+    bool Dependence;
+
+    // Only check for dependence if there are resources available to
+    // schedule this instruction.
+    bool FoundSequentialDependence;
+
+  public:
+    // Ctor.
+    HexagonPacketizerList(MachineFunction &MF, MachineLoopInfo &MLI,
+                          MachineDominatorTree &MDT);
+
+    // initPacketizerState - initialize some internal flags.
+    void initPacketizerState();
+
+    // ignorePseudoInstruction - Ignore bundling of pseudo instructions.
+    bool ignorePseudoInstruction(MachineInstr *MI, MachineBasicBlock *MBB);
+
+    // isSoloInstruction - return true if instruction MI cannot be packetized
+    // with any other instruction, which means that MI itself is a packet.
+    bool isSoloInstruction(MachineInstr *MI);
+
+    // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
+    // together?
+    bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ);
+
+    // isLegalToPruneDependencies - Is it legal to prune the dependence
+    // between SUI and SUJ?
+    bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ);
+
+    MachineBasicBlock::iterator addToPacket(MachineInstr *MI);
+  private:
+    bool IsCallDependent(MachineInstr* MI, SDep::Kind DepType, unsigned DepReg);
+    bool PromoteToDotNew(MachineInstr* MI, SDep::Kind DepType,
+                         MachineBasicBlock::iterator &MII,
+                         const TargetRegisterClass* RC);
+    bool CanPromoteToDotNew(MachineInstr* MI, SUnit* PacketSU,
+                            unsigned DepReg,
+                            std::map <MachineInstr*, SUnit*> MIToSUnit,
+                            MachineBasicBlock::iterator &MII,
+                            const TargetRegisterClass* RC);
+    bool CanPromoteToNewValue(MachineInstr* MI, SUnit* PacketSU,
+                              unsigned DepReg,
+                              std::map <MachineInstr*, SUnit*> MIToSUnit,
+                              MachineBasicBlock::iterator &MII);
+    bool CanPromoteToNewValueStore(MachineInstr* MI, MachineInstr* PacketMI,
+                                   unsigned DepReg,
+                                   std::map <MachineInstr*, SUnit*> MIToSUnit);
+    bool DemoteToDotOld(MachineInstr* MI);
+    bool ArePredicatesComplements(MachineInstr* MI1, MachineInstr* MI2,
+                                  std::map <MachineInstr*, SUnit*> MIToSUnit);
+    bool RestrictingDepExistInPacket(MachineInstr*,
+                                     unsigned, std::map <MachineInstr*, SUnit*>);
+    bool isNewifiable(MachineInstr* MI);
+    bool isCondInst(MachineInstr* MI);
+    bool IsNewifyStore(MachineInstr* MI);
+    bool tryAllocateResourcesForConstExt(MachineInstr* MI);
+    bool canReserveResourcesForConstExt(MachineInstr *MI);
+    void reserveResourcesForConstExt(MachineInstr* MI);
+    bool isNewValueInst(MachineInstr* MI);
+    bool isDotNewInst(MachineInstr* MI);
+  };
+}
+
+// HexagonPacketizerList Ctor.
+HexagonPacketizerList::HexagonPacketizerList(
+    MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT)
+  : VLIWPacketizerList(MF, MLI, MDT, true) {
+}
+
+bool HexagonPacketizer::runOnMachineFunction(MachineFunction &Fn) {
+  const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo();
+  MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+  MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+
+  // Instantiate the packetizer.
+  HexagonPacketizerList Packetizer(Fn, MLI, MDT);
+
+  // The DFA state table should not be empty.
+  assert(Packetizer.getResourceTracker() && "Empty DFA table!");
+
+  //
+  // Loop over all basic blocks and remove KILL pseudo-instructions.
+  // These instructions confuse the dependence analysis. Consider:
+  //   D0 = ...   (Insn 0)
+  //   R0 = KILL R0, D0 (Insn 1)
+  //   R0 = ... (Insn 2)
+  // Here, Insn 1 will result in the dependence graph not emitting an output
+  // dependence between Insn 0 and Insn 2. This can lead to incorrect
+  // packetization.
+  //
+  for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+       MBB != MBBe; ++MBB) {
+    MachineBasicBlock::iterator End = MBB->end();
+    MachineBasicBlock::iterator MI = MBB->begin();
+    while (MI != End) {
+      if (MI->isKill()) {
+        MachineBasicBlock::iterator DeleteMI = MI;
+        ++MI;
+        MBB->erase(DeleteMI);
+        End = MBB->end();
+        continue;
+      }
+      ++MI;
+    }
+  }
+
+  // Loop over all of the basic blocks.
+  for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+       MBB != MBBe; ++MBB) {
+    // Find scheduling regions and schedule / packetize each region.
+    unsigned RemainingCount = MBB->size();
+    for (MachineBasicBlock::iterator RegionEnd = MBB->end();
+         RegionEnd != MBB->begin();) {
+      // The next region starts above the previous region. Look backward in
+      // the instruction stream until we find the nearest boundary.
+      MachineBasicBlock::iterator I = RegionEnd;
+      for (; I != MBB->begin(); --I, --RemainingCount) {
+        if (TII->isSchedulingBoundary(llvm::prior(I), MBB, Fn))
+          break;
+      }
+      I = MBB->begin();
+
+      // Skip empty scheduling regions.
+      if (I == RegionEnd) {
+        RegionEnd = llvm::prior(RegionEnd);
+        --RemainingCount;
+        continue;
+      }
+      // Skip regions with one instruction.
+      if (I == llvm::prior(RegionEnd)) {
+        RegionEnd = llvm::prior(RegionEnd);
+        continue;
+      }
+
+      Packetizer.PacketizeMIs(MBB, I, RegionEnd);
+      RegionEnd = I;
+    }
+  }
+
+  return true;
+}
+
+
+static bool IsIndirectCall(MachineInstr* MI) {
+  return ((MI->getOpcode() == Hexagon::CALLR) ||
+          (MI->getOpcode() == Hexagon::CALLRv3));
+}
+
+// Reserve resources for a constant extender. Trigger an assertion if the
+// reservation fails.
+void HexagonPacketizerList::reserveResourcesForConstExt(MachineInstr* MI) {
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  MachineInstr *PseudoMI = MI->getParent()->getParent()->CreateMachineInstr(
+                                 QII->get(Hexagon::IMMEXT), MI->getDebugLoc());
+
+  if (ResourceTracker->canReserveResources(PseudoMI)) {
+    ResourceTracker->reserveResources(PseudoMI);
+    MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI);
+  } else {
+    MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI);
+    llvm_unreachable("can not reserve resources for constant extender.");
+  }
+  return;
+}
+
+bool HexagonPacketizerList::canReserveResourcesForConstExt(MachineInstr *MI) {
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  assert(QII->isExtended(MI) &&
+         "Should only be called for constant extended instructions");
+  MachineFunction *MF = MI->getParent()->getParent();
+  MachineInstr *PseudoMI = MF->CreateMachineInstr(QII->get(Hexagon::IMMEXT),
+                                                  MI->getDebugLoc());
+  bool CanReserve = ResourceTracker->canReserveResources(PseudoMI);
+  MF->DeleteMachineInstr(PseudoMI);
+  return CanReserve;
+}
+
+// Allocate resources (i.e. 4 bytes) for a constant extender. Return true if
+// the allocation succeeds; otherwise return false.
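+// (Unlike reserveResourcesForConstExt above, a failure here is reported to
+// the caller rather than aborting compilation.)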
+bool HexagonPacketizerList::tryAllocateResourcesForConstExt(MachineInstr* MI) { + const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; + MachineInstr *PseudoMI = MI->getParent()->getParent()->CreateMachineInstr( + QII->get(Hexagon::IMMEXT), MI->getDebugLoc()); + + if (ResourceTracker->canReserveResources(PseudoMI)) { + ResourceTracker->reserveResources(PseudoMI); + MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI); + return true; + } else { + MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI); + return false; + } +} + + +bool HexagonPacketizerList::IsCallDependent(MachineInstr* MI, + SDep::Kind DepType, + unsigned DepReg) { + + const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; + const HexagonRegisterInfo* QRI = + (const HexagonRegisterInfo *) TM.getRegisterInfo(); + + // Check for lr dependence + if (DepReg == QRI->getRARegister()) { + return true; + } + + if (QII->isDeallocRet(MI)) { + if (DepReg == QRI->getFrameRegister() || + DepReg == QRI->getStackRegister()) + return true; + } + + // Check if this is a predicate dependence + const TargetRegisterClass* RC = QRI->getMinimalPhysRegClass(DepReg); + if (RC == &Hexagon::PredRegsRegClass) { + return true; + } + + // + // Lastly check for an operand used in an indirect call + // If we had an attribute for checking if an instruction is an indirect call, + // then we could have avoided this relatively brittle implementation of + // IsIndirectCall() + // + // Assumes that the first operand of the CALLr is the function address + // + if (IsIndirectCall(MI) && (DepType == SDep::Data)) { + MachineOperand MO = MI->getOperand(0); + if (MO.isReg() && MO.isUse() && (MO.getReg() == DepReg)) { + return true; + } + } + + return false; +} + +static bool IsRegDependence(const SDep::Kind DepType) { + return (DepType == SDep::Data || DepType == SDep::Anti || + DepType == SDep::Output); +} + +static bool IsDirectJump(MachineInstr* MI) { + return (MI->getOpcode() == Hexagon::JMP); +} + +static bool IsSchedBarrier(MachineInstr* MI) { + switch (MI->getOpcode()) { + case Hexagon::BARRIER: + return true; + } + return false; +} + +static bool IsControlFlow(MachineInstr* MI) { + return (MI->getDesc().isTerminator() || MI->getDesc().isCall()); +} + +bool HexagonPacketizerList::isNewValueInst(MachineInstr* MI) { + const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; + if (QII->isNewValueJump(MI)) + return true; + + if (QII->isNewValueStore(MI)) + return true; + + return false; +} + +// Function returns true if an instruction can be promoted to the new-value +// store. It will always return false for v2 and v3. +// It lists all the conditional and unconditional stores that can be promoted +// to the new-value stores. 
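+// (Note that every opcode listed below ultimately returns
+// Subtarget.hasV4TOps(), which is what makes this always false for V2/V3.)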
+ +bool HexagonPacketizerList::IsNewifyStore (MachineInstr* MI) { + const HexagonRegisterInfo* QRI = + (const HexagonRegisterInfo *) TM.getRegisterInfo(); + switch (MI->getOpcode()) + { + // store byte + case Hexagon::STrib: + case Hexagon::STrib_indexed: + case Hexagon::STrib_indexed_shl_V4: + case Hexagon::STrib_shl_V4: + case Hexagon::STrib_GP_V4: + case Hexagon::STb_GP_V4: + case Hexagon::POST_STbri: + case Hexagon::STrib_cPt: + case Hexagon::STrib_cdnPt_V4: + case Hexagon::STrib_cNotPt: + case Hexagon::STrib_cdnNotPt_V4: + case Hexagon::STrib_indexed_cPt: + case Hexagon::STrib_indexed_cdnPt_V4: + case Hexagon::STrib_indexed_cNotPt: + case Hexagon::STrib_indexed_cdnNotPt_V4: + case Hexagon::STrib_indexed_shl_cPt_V4: + case Hexagon::STrib_indexed_shl_cdnPt_V4: + case Hexagon::STrib_indexed_shl_cNotPt_V4: + case Hexagon::STrib_indexed_shl_cdnNotPt_V4: + case Hexagon::POST_STbri_cPt: + case Hexagon::POST_STbri_cdnPt_V4: + case Hexagon::POST_STbri_cNotPt: + case Hexagon::POST_STbri_cdnNotPt_V4: + case Hexagon::STb_GP_cPt_V4: + case Hexagon::STb_GP_cNotPt_V4: + case Hexagon::STb_GP_cdnPt_V4: + case Hexagon::STb_GP_cdnNotPt_V4: + case Hexagon::STrib_GP_cPt_V4: + case Hexagon::STrib_GP_cNotPt_V4: + case Hexagon::STrib_GP_cdnPt_V4: + case Hexagon::STrib_GP_cdnNotPt_V4: + + // store halfword + case Hexagon::STrih: + case Hexagon::STrih_indexed: + case Hexagon::STrih_indexed_shl_V4: + case Hexagon::STrih_shl_V4: + case Hexagon::STrih_GP_V4: + case Hexagon::STh_GP_V4: + case Hexagon::POST_SThri: + case Hexagon::STrih_cPt: + case Hexagon::STrih_cdnPt_V4: + case Hexagon::STrih_cNotPt: + case Hexagon::STrih_cdnNotPt_V4: + case Hexagon::STrih_indexed_cPt: + case Hexagon::STrih_indexed_cdnPt_V4: + case Hexagon::STrih_indexed_cNotPt: + case Hexagon::STrih_indexed_cdnNotPt_V4: + case Hexagon::STrih_indexed_shl_cPt_V4: + case Hexagon::STrih_indexed_shl_cdnPt_V4: + case Hexagon::STrih_indexed_shl_cNotPt_V4: + case Hexagon::STrih_indexed_shl_cdnNotPt_V4: + case Hexagon::POST_SThri_cPt: + case Hexagon::POST_SThri_cdnPt_V4: + case Hexagon::POST_SThri_cNotPt: + case Hexagon::POST_SThri_cdnNotPt_V4: + case Hexagon::STh_GP_cPt_V4: + case Hexagon::STh_GP_cNotPt_V4: + case Hexagon::STh_GP_cdnPt_V4: + case Hexagon::STh_GP_cdnNotPt_V4: + case Hexagon::STrih_GP_cPt_V4: + case Hexagon::STrih_GP_cNotPt_V4: + case Hexagon::STrih_GP_cdnPt_V4: + case Hexagon::STrih_GP_cdnNotPt_V4: + + // store word + case Hexagon::STriw: + case Hexagon::STriw_indexed: + case Hexagon::STriw_indexed_shl_V4: + case Hexagon::STriw_shl_V4: + case Hexagon::STriw_GP_V4: + case Hexagon::STw_GP_V4: + case Hexagon::POST_STwri: + case Hexagon::STriw_cPt: + case Hexagon::STriw_cdnPt_V4: + case Hexagon::STriw_cNotPt: + case Hexagon::STriw_cdnNotPt_V4: + case Hexagon::STriw_indexed_cPt: + case Hexagon::STriw_indexed_cdnPt_V4: + case Hexagon::STriw_indexed_cNotPt: + case Hexagon::STriw_indexed_cdnNotPt_V4: + case Hexagon::STriw_indexed_shl_cPt_V4: + case Hexagon::STriw_indexed_shl_cdnPt_V4: + case Hexagon::STriw_indexed_shl_cNotPt_V4: + case Hexagon::STriw_indexed_shl_cdnNotPt_V4: + case Hexagon::POST_STwri_cPt: + case Hexagon::POST_STwri_cdnPt_V4: + case Hexagon::POST_STwri_cNotPt: + case Hexagon::POST_STwri_cdnNotPt_V4: + case Hexagon::STw_GP_cPt_V4: + case Hexagon::STw_GP_cNotPt_V4: + case Hexagon::STw_GP_cdnPt_V4: + case Hexagon::STw_GP_cdnNotPt_V4: + case Hexagon::STriw_GP_cPt_V4: + case Hexagon::STriw_GP_cNotPt_V4: + case Hexagon::STriw_GP_cdnPt_V4: + case Hexagon::STriw_GP_cdnNotPt_V4: + return QRI->Subtarget.hasV4TOps(); + } + return false; +} 
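+// Illustrative sketch of how the pieces above are meant to combine (an
+// editor's assumption, not code from this patch): once IsNewifyStore accepts
+// a store and the feeder is known to land in the same packet, the promotion
+// can be as simple as swapping in the .new opcode:
+//
+//   if (IsNewifyStore(MI)) {
+//     int NVOpcode = GetDotNewOp(MI->getOpcode()); // e.g. STriw -> STriw_nv_V4
+//     MI->setDesc(QII->get(NVOpcode));             // operands stay in place
+//   }
+//
+// See PromoteToDotNew and CanPromoteToNewValueStore for the checks that make
+// such a rewrite safe.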
+ +static bool IsLoopN(MachineInstr *MI) { + return (MI->getOpcode() == Hexagon::LOOP0_i || + MI->getOpcode() == Hexagon::LOOP0_r); +} + +/// DoesModifyCalleeSavedReg - Returns true if the instruction modifies a +/// callee-saved register. +static bool DoesModifyCalleeSavedReg(MachineInstr *MI, + const TargetRegisterInfo *TRI) { + for (const uint16_t *CSR = TRI->getCalleeSavedRegs(); *CSR; ++CSR) { + unsigned CalleeSavedReg = *CSR; + if (MI->modifiesRegister(CalleeSavedReg, TRI)) + return true; + } + return false; +} + +// Return the new value instruction for a given store. +static int GetDotNewOp(const int opc) { + switch (opc) { + default: llvm_unreachable("Unknown .new type"); + // store new value byte + case Hexagon::STrib: + return Hexagon::STrib_nv_V4; + + case Hexagon::STrib_indexed: + return Hexagon::STrib_indexed_nv_V4; + + case Hexagon::STrib_indexed_shl_V4: + return Hexagon::STrib_indexed_shl_nv_V4; + + case Hexagon::STrib_shl_V4: + return Hexagon::STrib_shl_nv_V4; + + case Hexagon::STrib_GP_V4: + return Hexagon::STrib_GP_nv_V4; + + case Hexagon::STb_GP_V4: + return Hexagon::STb_GP_nv_V4; + + case Hexagon::POST_STbri: + return Hexagon::POST_STbri_nv_V4; + + case Hexagon::STrib_cPt: + return Hexagon::STrib_cPt_nv_V4; + + case Hexagon::STrib_cdnPt_V4: + return Hexagon::STrib_cdnPt_nv_V4; + + case Hexagon::STrib_cNotPt: + return Hexagon::STrib_cNotPt_nv_V4; + + case Hexagon::STrib_cdnNotPt_V4: + return Hexagon::STrib_cdnNotPt_nv_V4; + + case Hexagon::STrib_indexed_cPt: + return Hexagon::STrib_indexed_cPt_nv_V4; + + case Hexagon::STrib_indexed_cdnPt_V4: + return Hexagon::STrib_indexed_cdnPt_nv_V4; + + case Hexagon::STrib_indexed_cNotPt: + return Hexagon::STrib_indexed_cNotPt_nv_V4; + + case Hexagon::STrib_indexed_cdnNotPt_V4: + return Hexagon::STrib_indexed_cdnNotPt_nv_V4; + + case Hexagon::STrib_indexed_shl_cPt_V4: + return Hexagon::STrib_indexed_shl_cPt_nv_V4; + + case Hexagon::STrib_indexed_shl_cdnPt_V4: + return Hexagon::STrib_indexed_shl_cdnPt_nv_V4; + + case Hexagon::STrib_indexed_shl_cNotPt_V4: + return Hexagon::STrib_indexed_shl_cNotPt_nv_V4; + + case Hexagon::STrib_indexed_shl_cdnNotPt_V4: + return Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4; + + case Hexagon::POST_STbri_cPt: + return Hexagon::POST_STbri_cPt_nv_V4; + + case Hexagon::POST_STbri_cdnPt_V4: + return Hexagon::POST_STbri_cdnPt_nv_V4; + + case Hexagon::POST_STbri_cNotPt: + return Hexagon::POST_STbri_cNotPt_nv_V4; + + case Hexagon::POST_STbri_cdnNotPt_V4: + return Hexagon::POST_STbri_cdnNotPt_nv_V4; + + case Hexagon::STb_GP_cPt_V4: + return Hexagon::STb_GP_cPt_nv_V4; + + case Hexagon::STb_GP_cNotPt_V4: + return Hexagon::STb_GP_cNotPt_nv_V4; + + case Hexagon::STb_GP_cdnPt_V4: + return Hexagon::STb_GP_cdnPt_nv_V4; + + case Hexagon::STb_GP_cdnNotPt_V4: + return Hexagon::STb_GP_cdnNotPt_nv_V4; + + case Hexagon::STrib_GP_cPt_V4: + return Hexagon::STrib_GP_cPt_nv_V4; + + case Hexagon::STrib_GP_cNotPt_V4: + return Hexagon::STrib_GP_cNotPt_nv_V4; + + case Hexagon::STrib_GP_cdnPt_V4: + return Hexagon::STrib_GP_cdnPt_nv_V4; + + case Hexagon::STrib_GP_cdnNotPt_V4: + return Hexagon::STrib_GP_cdnNotPt_nv_V4; + + // store new value halfword + case Hexagon::STrih: + return Hexagon::STrih_nv_V4; + + case Hexagon::STrih_indexed: + return Hexagon::STrih_indexed_nv_V4; + + case Hexagon::STrih_indexed_shl_V4: + return Hexagon::STrih_indexed_shl_nv_V4; + + case Hexagon::STrih_shl_V4: + return Hexagon::STrih_shl_nv_V4; + + case Hexagon::STrih_GP_V4: + return Hexagon::STrih_GP_nv_V4; + + case Hexagon::STh_GP_V4: + return 
Hexagon::STh_GP_nv_V4; + + case Hexagon::POST_SThri: + return Hexagon::POST_SThri_nv_V4; + + case Hexagon::STrih_cPt: + return Hexagon::STrih_cPt_nv_V4; + + case Hexagon::STrih_cdnPt_V4: + return Hexagon::STrih_cdnPt_nv_V4; + + case Hexagon::STrih_cNotPt: + return Hexagon::STrih_cNotPt_nv_V4; + + case Hexagon::STrih_cdnNotPt_V4: + return Hexagon::STrih_cdnNotPt_nv_V4; + + case Hexagon::STrih_indexed_cPt: + return Hexagon::STrih_indexed_cPt_nv_V4; + + case Hexagon::STrih_indexed_cdnPt_V4: + return Hexagon::STrih_indexed_cdnPt_nv_V4; + + case Hexagon::STrih_indexed_cNotPt: + return Hexagon::STrih_indexed_cNotPt_nv_V4; + + case Hexagon::STrih_indexed_cdnNotPt_V4: + return Hexagon::STrih_indexed_cdnNotPt_nv_V4; + + case Hexagon::STrih_indexed_shl_cPt_V4: + return Hexagon::STrih_indexed_shl_cPt_nv_V4; + + case Hexagon::STrih_indexed_shl_cdnPt_V4: + return Hexagon::STrih_indexed_shl_cdnPt_nv_V4; + + case Hexagon::STrih_indexed_shl_cNotPt_V4: + return Hexagon::STrih_indexed_shl_cNotPt_nv_V4; + + case Hexagon::STrih_indexed_shl_cdnNotPt_V4: + return Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4; + + case Hexagon::POST_SThri_cPt: + return Hexagon::POST_SThri_cPt_nv_V4; + + case Hexagon::POST_SThri_cdnPt_V4: + return Hexagon::POST_SThri_cdnPt_nv_V4; + + case Hexagon::POST_SThri_cNotPt: + return Hexagon::POST_SThri_cNotPt_nv_V4; + + case Hexagon::POST_SThri_cdnNotPt_V4: + return Hexagon::POST_SThri_cdnNotPt_nv_V4; + + case Hexagon::STh_GP_cPt_V4: + return Hexagon::STh_GP_cPt_nv_V4; + + case Hexagon::STh_GP_cNotPt_V4: + return Hexagon::STh_GP_cNotPt_nv_V4; + + case Hexagon::STh_GP_cdnPt_V4: + return Hexagon::STh_GP_cdnPt_nv_V4; + + case Hexagon::STh_GP_cdnNotPt_V4: + return Hexagon::STh_GP_cdnNotPt_nv_V4; + + case Hexagon::STrih_GP_cPt_V4: + return Hexagon::STrih_GP_cPt_nv_V4; + + case Hexagon::STrih_GP_cNotPt_V4: + return Hexagon::STrih_GP_cNotPt_nv_V4; + + case Hexagon::STrih_GP_cdnPt_V4: + return Hexagon::STrih_GP_cdnPt_nv_V4; + + case Hexagon::STrih_GP_cdnNotPt_V4: + return Hexagon::STrih_GP_cdnNotPt_nv_V4; + + // store new value word + case Hexagon::STriw: + return Hexagon::STriw_nv_V4; + + case Hexagon::STriw_indexed: + return Hexagon::STriw_indexed_nv_V4; + + case Hexagon::STriw_indexed_shl_V4: + return Hexagon::STriw_indexed_shl_nv_V4; + + case Hexagon::STriw_shl_V4: + return Hexagon::STriw_shl_nv_V4; + + case Hexagon::STriw_GP_V4: + return Hexagon::STriw_GP_nv_V4; + + case Hexagon::STw_GP_V4: + return Hexagon::STw_GP_nv_V4; + + case Hexagon::POST_STwri: + return Hexagon::POST_STwri_nv_V4; + + case Hexagon::STriw_cPt: + return Hexagon::STriw_cPt_nv_V4; + + case Hexagon::STriw_cdnPt_V4: + return Hexagon::STriw_cdnPt_nv_V4; + + case Hexagon::STriw_cNotPt: + return Hexagon::STriw_cNotPt_nv_V4; + + case Hexagon::STriw_cdnNotPt_V4: + return Hexagon::STriw_cdnNotPt_nv_V4; + + case Hexagon::STriw_indexed_cPt: + return Hexagon::STriw_indexed_cPt_nv_V4; + + case Hexagon::STriw_indexed_cdnPt_V4: + return Hexagon::STriw_indexed_cdnPt_nv_V4; + + case Hexagon::STriw_indexed_cNotPt: + return Hexagon::STriw_indexed_cNotPt_nv_V4; + + case Hexagon::STriw_indexed_cdnNotPt_V4: + return Hexagon::STriw_indexed_cdnNotPt_nv_V4; + + case Hexagon::STriw_indexed_shl_cPt_V4: + return Hexagon::STriw_indexed_shl_cPt_nv_V4; + + case Hexagon::STriw_indexed_shl_cdnPt_V4: + return Hexagon::STriw_indexed_shl_cdnPt_nv_V4; + + case Hexagon::STriw_indexed_shl_cNotPt_V4: + return Hexagon::STriw_indexed_shl_cNotPt_nv_V4; + + case Hexagon::STriw_indexed_shl_cdnNotPt_V4: + return Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4; + + case 
Hexagon::POST_STwri_cPt: + return Hexagon::POST_STwri_cPt_nv_V4; + + case Hexagon::POST_STwri_cdnPt_V4: + return Hexagon::POST_STwri_cdnPt_nv_V4; + + case Hexagon::POST_STwri_cNotPt: + return Hexagon::POST_STwri_cNotPt_nv_V4; + + case Hexagon::POST_STwri_cdnNotPt_V4: + return Hexagon::POST_STwri_cdnNotPt_nv_V4; + + case Hexagon::STw_GP_cPt_V4: + return Hexagon::STw_GP_cPt_nv_V4; + + case Hexagon::STw_GP_cNotPt_V4: + return Hexagon::STw_GP_cNotPt_nv_V4; + + case Hexagon::STw_GP_cdnPt_V4: + return Hexagon::STw_GP_cdnPt_nv_V4; + + case Hexagon::STw_GP_cdnNotPt_V4: + return Hexagon::STw_GP_cdnNotPt_nv_V4; + + case Hexagon::STriw_GP_cPt_V4: + return Hexagon::STriw_GP_cPt_nv_V4; + + case Hexagon::STriw_GP_cNotPt_V4: + return Hexagon::STriw_GP_cNotPt_nv_V4; + + case Hexagon::STriw_GP_cdnPt_V4: + return Hexagon::STriw_GP_cdnPt_nv_V4; + + case Hexagon::STriw_GP_cdnNotPt_V4: + return Hexagon::STriw_GP_cdnNotPt_nv_V4; + } +} + +// Return .new predicate version for an instruction +static int GetDotNewPredOp(const int opc) { + switch (opc) { + default: llvm_unreachable("Unknown .new type"); + // Conditional stores + // Store byte conditionally + case Hexagon::STrib_cPt : + return Hexagon::STrib_cdnPt_V4; + + case Hexagon::STrib_cNotPt : + return Hexagon::STrib_cdnNotPt_V4; + + case Hexagon::STrib_indexed_cPt : + return Hexagon::STrib_indexed_cdnPt_V4; + + case Hexagon::STrib_indexed_cNotPt : + return Hexagon::STrib_indexed_cdnNotPt_V4; + + case Hexagon::STrib_imm_cPt_V4 : + return Hexagon::STrib_imm_cdnPt_V4; + + case Hexagon::STrib_imm_cNotPt_V4 : + return Hexagon::STrib_imm_cdnNotPt_V4; + + case Hexagon::POST_STbri_cPt : + return Hexagon::POST_STbri_cdnPt_V4; + + case Hexagon::POST_STbri_cNotPt : + return Hexagon::POST_STbri_cdnNotPt_V4; + + case Hexagon::STrib_indexed_shl_cPt_V4 : + return Hexagon::STrib_indexed_shl_cdnPt_V4; + + case Hexagon::STrib_indexed_shl_cNotPt_V4 : + return Hexagon::STrib_indexed_shl_cdnNotPt_V4; + + case Hexagon::STb_GP_cPt_V4 : + return Hexagon::STb_GP_cdnPt_V4; + + case Hexagon::STb_GP_cNotPt_V4 : + return Hexagon::STb_GP_cdnNotPt_V4; + + case Hexagon::STrib_GP_cPt_V4 : + return Hexagon::STrib_GP_cdnPt_V4; + + case Hexagon::STrib_GP_cNotPt_V4 : + return Hexagon::STrib_GP_cdnNotPt_V4; + + // Store doubleword conditionally + case Hexagon::STrid_cPt : + return Hexagon::STrid_cdnPt_V4; + + case Hexagon::STrid_cNotPt : + return Hexagon::STrid_cdnNotPt_V4; + + case Hexagon::STrid_indexed_cPt : + return Hexagon::STrid_indexed_cdnPt_V4; + + case Hexagon::STrid_indexed_cNotPt : + return Hexagon::STrid_indexed_cdnNotPt_V4; + + case Hexagon::STrid_indexed_shl_cPt_V4 : + return Hexagon::STrid_indexed_shl_cdnPt_V4; + + case Hexagon::STrid_indexed_shl_cNotPt_V4 : + return Hexagon::STrid_indexed_shl_cdnNotPt_V4; + + case Hexagon::POST_STdri_cPt : + return Hexagon::POST_STdri_cdnPt_V4; + + case Hexagon::POST_STdri_cNotPt : + return Hexagon::POST_STdri_cdnNotPt_V4; + + case Hexagon::STd_GP_cPt_V4 : + return Hexagon::STd_GP_cdnPt_V4; + + case Hexagon::STd_GP_cNotPt_V4 : + return Hexagon::STd_GP_cdnNotPt_V4; + + case Hexagon::STrid_GP_cPt_V4 : + return Hexagon::STrid_GP_cdnPt_V4; + + case Hexagon::STrid_GP_cNotPt_V4 : + return Hexagon::STrid_GP_cdnNotPt_V4; + + // Store halfword conditionally + case Hexagon::STrih_cPt : + return Hexagon::STrih_cdnPt_V4; + + case Hexagon::STrih_cNotPt : + return Hexagon::STrih_cdnNotPt_V4; + + case Hexagon::STrih_indexed_cPt : + return Hexagon::STrih_indexed_cdnPt_V4; + + case Hexagon::STrih_indexed_cNotPt : + return Hexagon::STrih_indexed_cdnNotPt_V4; + + 
case Hexagon::STrih_imm_cPt_V4 :
+    return Hexagon::STrih_imm_cdnPt_V4;
+
+  case Hexagon::STrih_imm_cNotPt_V4 :
+    return Hexagon::STrih_imm_cdnNotPt_V4;
+
+  case Hexagon::STrih_indexed_shl_cPt_V4 :
+    return Hexagon::STrih_indexed_shl_cdnPt_V4;
+
+  case Hexagon::STrih_indexed_shl_cNotPt_V4 :
+    return Hexagon::STrih_indexed_shl_cdnNotPt_V4;
+
+  case Hexagon::POST_SThri_cPt :
+    return Hexagon::POST_SThri_cdnPt_V4;
+
+  case Hexagon::POST_SThri_cNotPt :
+    return Hexagon::POST_SThri_cdnNotPt_V4;
+
+  case Hexagon::STh_GP_cPt_V4 :
+    return Hexagon::STh_GP_cdnPt_V4;
+
+  case Hexagon::STh_GP_cNotPt_V4 :
+    return Hexagon::STh_GP_cdnNotPt_V4;
+
+  case Hexagon::STrih_GP_cPt_V4 :
+    return Hexagon::STrih_GP_cdnPt_V4;
+
+  case Hexagon::STrih_GP_cNotPt_V4 :
+    return Hexagon::STrih_GP_cdnNotPt_V4;
+
+  // Store word conditionally
+  case Hexagon::STriw_cPt :
+    return Hexagon::STriw_cdnPt_V4;
+
+  case Hexagon::STriw_cNotPt :
+    return Hexagon::STriw_cdnNotPt_V4;
+
+  case Hexagon::STriw_indexed_cPt :
+    return Hexagon::STriw_indexed_cdnPt_V4;
+
+  case Hexagon::STriw_indexed_cNotPt :
+    return Hexagon::STriw_indexed_cdnNotPt_V4;
+
+  case Hexagon::STriw_imm_cPt_V4 :
+    return Hexagon::STriw_imm_cdnPt_V4;
+
+  case Hexagon::STriw_imm_cNotPt_V4 :
+    return Hexagon::STriw_imm_cdnNotPt_V4;
+
+  case Hexagon::STriw_indexed_shl_cPt_V4 :
+    return Hexagon::STriw_indexed_shl_cdnPt_V4;
+
+  case Hexagon::STriw_indexed_shl_cNotPt_V4 :
+    return Hexagon::STriw_indexed_shl_cdnNotPt_V4;
+
+  case Hexagon::POST_STwri_cPt :
+    return Hexagon::POST_STwri_cdnPt_V4;
+
+  case Hexagon::POST_STwri_cNotPt :
+    return Hexagon::POST_STwri_cdnNotPt_V4;
+
+  case Hexagon::STw_GP_cPt_V4 :
+    return Hexagon::STw_GP_cdnPt_V4;
+
+  case Hexagon::STw_GP_cNotPt_V4 :
+    return Hexagon::STw_GP_cdnNotPt_V4;
+
+  case Hexagon::STriw_GP_cPt_V4 :
+    return Hexagon::STriw_GP_cdnPt_V4;
+
+  case Hexagon::STriw_GP_cNotPt_V4 :
+    return Hexagon::STriw_GP_cdnNotPt_V4;
+
+  // Conditional Jumps
+  case Hexagon::JMP_c:
+    return Hexagon::JMP_cdnPt;
+
+  case Hexagon::JMP_cNot:
+    return Hexagon::JMP_cdnNotPt;
+
+  case Hexagon::JMPR_cPt:
+    return Hexagon::JMPR_cdnPt_V3;
+
+  case Hexagon::JMPR_cNotPt:
+    return Hexagon::JMPR_cdnNotPt_V3;
+
+  // Conditional Transfers
+  case Hexagon::TFR_cPt:
+    return Hexagon::TFR_cdnPt;
+
+  case Hexagon::TFR_cNotPt:
+    return Hexagon::TFR_cdnNotPt;
+
+  case Hexagon::TFRI_cPt:
+    return Hexagon::TFRI_cdnPt;
+
+  case Hexagon::TFRI_cNotPt:
+    return Hexagon::TFRI_cdnNotPt;
+
+  // Load double word
+  case Hexagon::LDrid_cPt :
+    return Hexagon::LDrid_cdnPt;
+
+  case Hexagon::LDrid_cNotPt :
+    return Hexagon::LDrid_cdnNotPt;
+
+  case Hexagon::LDrid_indexed_cPt :
+    return Hexagon::LDrid_indexed_cdnPt;
+
+  case Hexagon::LDrid_indexed_cNotPt :
+    return Hexagon::LDrid_indexed_cdnNotPt;
+
+  case Hexagon::POST_LDrid_cPt :
+    return Hexagon::POST_LDrid_cdnPt_V4;
+
+  case Hexagon::POST_LDrid_cNotPt :
+    return Hexagon::POST_LDrid_cdnNotPt_V4;
+
+  // Load word
+  case Hexagon::LDriw_cPt :
+    return Hexagon::LDriw_cdnPt;
+
+  case Hexagon::LDriw_cNotPt :
+    return Hexagon::LDriw_cdnNotPt;
+
+  case Hexagon::LDriw_indexed_cPt :
+    return Hexagon::LDriw_indexed_cdnPt;
+
+  case Hexagon::LDriw_indexed_cNotPt :
+    return Hexagon::LDriw_indexed_cdnNotPt;
+
+  case Hexagon::POST_LDriw_cPt :
+    return Hexagon::POST_LDriw_cdnPt_V4;
+
+  case Hexagon::POST_LDriw_cNotPt :
+    return Hexagon::POST_LDriw_cdnNotPt_V4;
+
+  // Load halfword
+  case Hexagon::LDrih_cPt :
+    return Hexagon::LDrih_cdnPt;
+
+  case Hexagon::LDrih_cNotPt :
+    return Hexagon::LDrih_cdnNotPt;
+
+  case Hexagon::LDrih_indexed_cPt :
+
return Hexagon::LDrih_indexed_cdnPt; + + case Hexagon::LDrih_indexed_cNotPt : + return Hexagon::LDrih_indexed_cdnNotPt; + + case Hexagon::POST_LDrih_cPt : + return Hexagon::POST_LDrih_cdnPt_V4; + + case Hexagon::POST_LDrih_cNotPt : + return Hexagon::POST_LDrih_cdnNotPt_V4; + + // Load byte + case Hexagon::LDrib_cPt : + return Hexagon::LDrib_cdnPt; + + case Hexagon::LDrib_cNotPt : + return Hexagon::LDrib_cdnNotPt; + + case Hexagon::LDrib_indexed_cPt : + return Hexagon::LDrib_indexed_cdnPt; + + case Hexagon::LDrib_indexed_cNotPt : + return Hexagon::LDrib_indexed_cdnNotPt; + + case Hexagon::POST_LDrib_cPt : + return Hexagon::POST_LDrib_cdnPt_V4; + + case Hexagon::POST_LDrib_cNotPt : + return Hexagon::POST_LDrib_cdnNotPt_V4; + + // Load unsigned halfword + case Hexagon::LDriuh_cPt : + return Hexagon::LDriuh_cdnPt; + + case Hexagon::LDriuh_cNotPt : + return Hexagon::LDriuh_cdnNotPt; + + case Hexagon::LDriuh_indexed_cPt : + return Hexagon::LDriuh_indexed_cdnPt; + + case Hexagon::LDriuh_indexed_cNotPt : + return Hexagon::LDriuh_indexed_cdnNotPt; + + case Hexagon::POST_LDriuh_cPt : + return Hexagon::POST_LDriuh_cdnPt_V4; + + case Hexagon::POST_LDriuh_cNotPt : + return Hexagon::POST_LDriuh_cdnNotPt_V4; + + // Load unsigned byte + case Hexagon::LDriub_cPt : + return Hexagon::LDriub_cdnPt; + + case Hexagon::LDriub_cNotPt : + return Hexagon::LDriub_cdnNotPt; + + case Hexagon::LDriub_indexed_cPt : + return Hexagon::LDriub_indexed_cdnPt; + + case Hexagon::LDriub_indexed_cNotPt : + return Hexagon::LDriub_indexed_cdnNotPt; + + case Hexagon::POST_LDriub_cPt : + return Hexagon::POST_LDriub_cdnPt_V4; + + case Hexagon::POST_LDriub_cNotPt : + return Hexagon::POST_LDriub_cdnNotPt_V4; + + // V4 indexed+scaled load + + case Hexagon::LDrid_indexed_cPt_V4 : + return Hexagon::LDrid_indexed_cdnPt_V4; + + case Hexagon::LDrid_indexed_cNotPt_V4 : + return Hexagon::LDrid_indexed_cdnNotPt_V4; + + case Hexagon::LDrid_indexed_shl_cPt_V4 : + return Hexagon::LDrid_indexed_shl_cdnPt_V4; + + case Hexagon::LDrid_indexed_shl_cNotPt_V4 : + return Hexagon::LDrid_indexed_shl_cdnNotPt_V4; + + case Hexagon::LDrib_indexed_cPt_V4 : + return Hexagon::LDrib_indexed_cdnPt_V4; + + case Hexagon::LDrib_indexed_cNotPt_V4 : + return Hexagon::LDrib_indexed_cdnNotPt_V4; + + case Hexagon::LDrib_indexed_shl_cPt_V4 : + return Hexagon::LDrib_indexed_shl_cdnPt_V4; + + case Hexagon::LDrib_indexed_shl_cNotPt_V4 : + return Hexagon::LDrib_indexed_shl_cdnNotPt_V4; + + case Hexagon::LDriub_indexed_cPt_V4 : + return Hexagon::LDriub_indexed_cdnPt_V4; + + case Hexagon::LDriub_indexed_cNotPt_V4 : + return Hexagon::LDriub_indexed_cdnNotPt_V4; + + case Hexagon::LDriub_indexed_shl_cPt_V4 : + return Hexagon::LDriub_indexed_shl_cdnPt_V4; + + case Hexagon::LDriub_indexed_shl_cNotPt_V4 : + return Hexagon::LDriub_indexed_shl_cdnNotPt_V4; + + case Hexagon::LDrih_indexed_cPt_V4 : + return Hexagon::LDrih_indexed_cdnPt_V4; + + case Hexagon::LDrih_indexed_cNotPt_V4 : + return Hexagon::LDrih_indexed_cdnNotPt_V4; + + case Hexagon::LDrih_indexed_shl_cPt_V4 : + return Hexagon::LDrih_indexed_shl_cdnPt_V4; + + case Hexagon::LDrih_indexed_shl_cNotPt_V4 : + return Hexagon::LDrih_indexed_shl_cdnNotPt_V4; + + case Hexagon::LDriuh_indexed_cPt_V4 : + return Hexagon::LDriuh_indexed_cdnPt_V4; + + case Hexagon::LDriuh_indexed_cNotPt_V4 : + return Hexagon::LDriuh_indexed_cdnNotPt_V4; + + case Hexagon::LDriuh_indexed_shl_cPt_V4 : + return Hexagon::LDriuh_indexed_shl_cdnPt_V4; + + case Hexagon::LDriuh_indexed_shl_cNotPt_V4 : + return Hexagon::LDriuh_indexed_shl_cdnNotPt_V4; + + case 
Hexagon::LDriw_indexed_cPt_V4 : + return Hexagon::LDriw_indexed_cdnPt_V4; + + case Hexagon::LDriw_indexed_cNotPt_V4 : + return Hexagon::LDriw_indexed_cdnNotPt_V4; + + case Hexagon::LDriw_indexed_shl_cPt_V4 : + return Hexagon::LDriw_indexed_shl_cdnPt_V4; + + case Hexagon::LDriw_indexed_shl_cNotPt_V4 : + return Hexagon::LDriw_indexed_shl_cdnNotPt_V4; + + // V4 global address load + + case Hexagon::LDd_GP_cPt_V4: + return Hexagon::LDd_GP_cdnPt_V4; + + case Hexagon::LDd_GP_cNotPt_V4: + return Hexagon::LDd_GP_cdnNotPt_V4; + + case Hexagon::LDb_GP_cPt_V4: + return Hexagon::LDb_GP_cdnPt_V4; + + case Hexagon::LDb_GP_cNotPt_V4: + return Hexagon::LDb_GP_cdnNotPt_V4; + + case Hexagon::LDub_GP_cPt_V4: + return Hexagon::LDub_GP_cdnPt_V4; + + case Hexagon::LDub_GP_cNotPt_V4: + return Hexagon::LDub_GP_cdnNotPt_V4; + + case Hexagon::LDh_GP_cPt_V4: + return Hexagon::LDh_GP_cdnPt_V4; + + case Hexagon::LDh_GP_cNotPt_V4: + return Hexagon::LDh_GP_cdnNotPt_V4; + + case Hexagon::LDuh_GP_cPt_V4: + return Hexagon::LDuh_GP_cdnPt_V4; + + case Hexagon::LDuh_GP_cNotPt_V4: + return Hexagon::LDuh_GP_cdnNotPt_V4; + + case Hexagon::LDw_GP_cPt_V4: + return Hexagon::LDw_GP_cdnPt_V4; + + case Hexagon::LDw_GP_cNotPt_V4: + return Hexagon::LDw_GP_cdnNotPt_V4; + + case Hexagon::LDrid_GP_cPt_V4: + return Hexagon::LDrid_GP_cdnPt_V4; + + case Hexagon::LDrid_GP_cNotPt_V4: + return Hexagon::LDrid_GP_cdnNotPt_V4; + + case Hexagon::LDrib_GP_cPt_V4: + return Hexagon::LDrib_GP_cdnPt_V4; + + case Hexagon::LDrib_GP_cNotPt_V4: + return Hexagon::LDrib_GP_cdnNotPt_V4; + + case Hexagon::LDriub_GP_cPt_V4: + return Hexagon::LDriub_GP_cdnPt_V4; + + case Hexagon::LDriub_GP_cNotPt_V4: + return Hexagon::LDriub_GP_cdnNotPt_V4; + + case Hexagon::LDrih_GP_cPt_V4: + return Hexagon::LDrih_GP_cdnPt_V4; + + case Hexagon::LDrih_GP_cNotPt_V4: + return Hexagon::LDrih_GP_cdnNotPt_V4; + + case Hexagon::LDriuh_GP_cPt_V4: + return Hexagon::LDriuh_GP_cdnPt_V4; + + case Hexagon::LDriuh_GP_cNotPt_V4: + return Hexagon::LDriuh_GP_cdnNotPt_V4; + + case Hexagon::LDriw_GP_cPt_V4: + return Hexagon::LDriw_GP_cdnPt_V4; + + case Hexagon::LDriw_GP_cNotPt_V4: + return Hexagon::LDriw_GP_cdnNotPt_V4; + + // Conditional store new-value byte + case Hexagon::STrib_cPt_nv_V4 : + return Hexagon::STrib_cdnPt_nv_V4; + case Hexagon::STrib_cNotPt_nv_V4 : + return Hexagon::STrib_cdnNotPt_nv_V4; + + case Hexagon::STrib_indexed_cPt_nv_V4 : + return Hexagon::STrib_indexed_cdnPt_nv_V4; + case Hexagon::STrib_indexed_cNotPt_nv_V4 : + return Hexagon::STrib_indexed_cdnNotPt_nv_V4; + + case Hexagon::STrib_indexed_shl_cPt_nv_V4 : + return Hexagon::STrib_indexed_shl_cdnPt_nv_V4; + case Hexagon::STrib_indexed_shl_cNotPt_nv_V4 : + return Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4; + + case Hexagon::POST_STbri_cPt_nv_V4 : + return Hexagon::POST_STbri_cdnPt_nv_V4; + case Hexagon::POST_STbri_cNotPt_nv_V4 : + return Hexagon::POST_STbri_cdnNotPt_nv_V4; + + case Hexagon::STb_GP_cPt_nv_V4 : + return Hexagon::STb_GP_cdnPt_nv_V4; + + case Hexagon::STb_GP_cNotPt_nv_V4 : + return Hexagon::STb_GP_cdnNotPt_nv_V4; + + case Hexagon::STrib_GP_cPt_nv_V4 : + return Hexagon::STrib_GP_cdnPt_nv_V4; + + case Hexagon::STrib_GP_cNotPt_nv_V4 : + return Hexagon::STrib_GP_cdnNotPt_nv_V4; + + // Conditional store new-value halfword + case Hexagon::STrih_cPt_nv_V4 : + return Hexagon::STrih_cdnPt_nv_V4; + case Hexagon::STrih_cNotPt_nv_V4 : + return Hexagon::STrih_cdnNotPt_nv_V4; + + case Hexagon::STrih_indexed_cPt_nv_V4 : + return Hexagon::STrih_indexed_cdnPt_nv_V4; + case Hexagon::STrih_indexed_cNotPt_nv_V4 : + return 
Hexagon::STrih_indexed_cdnNotPt_nv_V4; + + case Hexagon::STrih_indexed_shl_cPt_nv_V4 : + return Hexagon::STrih_indexed_shl_cdnPt_nv_V4; + case Hexagon::STrih_indexed_shl_cNotPt_nv_V4 : + return Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4; + + case Hexagon::POST_SThri_cPt_nv_V4 : + return Hexagon::POST_SThri_cdnPt_nv_V4; + case Hexagon::POST_SThri_cNotPt_nv_V4 : + return Hexagon::POST_SThri_cdnNotPt_nv_V4; + + case Hexagon::STh_GP_cPt_nv_V4 : + return Hexagon::STh_GP_cdnPt_nv_V4; + + case Hexagon::STh_GP_cNotPt_nv_V4 : + return Hexagon::STh_GP_cdnNotPt_nv_V4; + + case Hexagon::STrih_GP_cPt_nv_V4 : + return Hexagon::STrih_GP_cdnPt_nv_V4; + + case Hexagon::STrih_GP_cNotPt_nv_V4 : + return Hexagon::STrih_GP_cdnNotPt_nv_V4; + + // Conditional store new-value word + case Hexagon::STriw_cPt_nv_V4 : + return Hexagon::STriw_cdnPt_nv_V4; + case Hexagon::STriw_cNotPt_nv_V4 : + return Hexagon::STriw_cdnNotPt_nv_V4; + + case Hexagon::STriw_indexed_cPt_nv_V4 : + return Hexagon::STriw_indexed_cdnPt_nv_V4; + case Hexagon::STriw_indexed_cNotPt_nv_V4 : + return Hexagon::STriw_indexed_cdnNotPt_nv_V4; + + case Hexagon::STriw_indexed_shl_cPt_nv_V4 : + return Hexagon::STriw_indexed_shl_cdnPt_nv_V4; + case Hexagon::STriw_indexed_shl_cNotPt_nv_V4 : + return Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4; + + case Hexagon::POST_STwri_cPt_nv_V4 : + return Hexagon::POST_STwri_cdnPt_nv_V4; + case Hexagon::POST_STwri_cNotPt_nv_V4: + return Hexagon::POST_STwri_cdnNotPt_nv_V4; + + case Hexagon::STw_GP_cPt_nv_V4 : + return Hexagon::STw_GP_cdnPt_nv_V4; + + case Hexagon::STw_GP_cNotPt_nv_V4 : + return Hexagon::STw_GP_cdnNotPt_nv_V4; + + case Hexagon::STriw_GP_cPt_nv_V4 : + return Hexagon::STriw_GP_cdnPt_nv_V4; + + case Hexagon::STriw_GP_cNotPt_nv_V4 : + return Hexagon::STriw_GP_cdnNotPt_nv_V4; + + // Conditional add + case Hexagon::ADD_ri_cPt : + return Hexagon::ADD_ri_cdnPt; + case Hexagon::ADD_ri_cNotPt : + return Hexagon::ADD_ri_cdnNotPt; + + case Hexagon::ADD_rr_cPt : + return Hexagon::ADD_rr_cdnPt; + case Hexagon::ADD_rr_cNotPt : + return Hexagon::ADD_rr_cdnNotPt; + + // Conditional logical Operations + case Hexagon::XOR_rr_cPt : + return Hexagon::XOR_rr_cdnPt; + case Hexagon::XOR_rr_cNotPt : + return Hexagon::XOR_rr_cdnNotPt; + + case Hexagon::AND_rr_cPt : + return Hexagon::AND_rr_cdnPt; + case Hexagon::AND_rr_cNotPt : + return Hexagon::AND_rr_cdnNotPt; + + case Hexagon::OR_rr_cPt : + return Hexagon::OR_rr_cdnPt; + case Hexagon::OR_rr_cNotPt : + return Hexagon::OR_rr_cdnNotPt; + + // Conditional Subtract + case Hexagon::SUB_rr_cPt : + return Hexagon::SUB_rr_cdnPt; + case Hexagon::SUB_rr_cNotPt : + return Hexagon::SUB_rr_cdnNotPt; + + // Conditional combine + case Hexagon::COMBINE_rr_cPt : + return Hexagon::COMBINE_rr_cdnPt; + case Hexagon::COMBINE_rr_cNotPt : + return Hexagon::COMBINE_rr_cdnNotPt; + + case Hexagon::ASLH_cPt_V4 : + return Hexagon::ASLH_cdnPt_V4; + case Hexagon::ASLH_cNotPt_V4 : + return Hexagon::ASLH_cdnNotPt_V4; + + case Hexagon::ASRH_cPt_V4 : + return Hexagon::ASRH_cdnPt_V4; + case Hexagon::ASRH_cNotPt_V4 : + return Hexagon::ASRH_cdnNotPt_V4; + + case Hexagon::SXTB_cPt_V4 : + return Hexagon::SXTB_cdnPt_V4; + case Hexagon::SXTB_cNotPt_V4 : + return Hexagon::SXTB_cdnNotPt_V4; + + case Hexagon::SXTH_cPt_V4 : + return Hexagon::SXTH_cdnPt_V4; + case Hexagon::SXTH_cNotPt_V4 : + return Hexagon::SXTH_cdnNotPt_V4; + + case Hexagon::ZXTB_cPt_V4 : + return Hexagon::ZXTB_cdnPt_V4; + case Hexagon::ZXTB_cNotPt_V4 : + return Hexagon::ZXTB_cdnNotPt_V4; + + case Hexagon::ZXTH_cPt_V4 : + return Hexagon::ZXTH_cdnPt_V4; + 
case Hexagon::ZXTH_cNotPt_V4 : + return Hexagon::ZXTH_cdnNotPt_V4; + } +} + +// Returns true if an instruction can be promoted to .new predicate +// or new-value store. +bool HexagonPacketizerList::isNewifiable(MachineInstr* MI) { + return isCondInst(MI) || IsNewifyStore(MI); +} + +bool HexagonPacketizerList::isCondInst (MachineInstr* MI) { + const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; + const MCInstrDesc& TID = MI->getDesc(); + // Bug 5670: until that is fixed, + // this portion is disabled. + if ( TID.isConditionalBranch() // && !IsRegisterJump(MI)) || + || QII->isConditionalTransfer(MI) + || QII->isConditionalALU32(MI) + || QII->isConditionalLoad(MI) + || QII->isConditionalStore(MI)) { + return true; + } + return false; +} + + +// Promote an instruction to its .new form. +// At this time, we have already made a call to CanPromoteToDotNew +// and made sure that it can *indeed* be promoted. +bool HexagonPacketizerList::PromoteToDotNew(MachineInstr* MI, + SDep::Kind DepType, MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC) { + + assert (DepType == SDep::Data); + const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; + + int NewOpcode; + if (RC == &Hexagon::PredRegsRegClass) + NewOpcode = GetDotNewPredOp(MI->getOpcode()); + else + NewOpcode = GetDotNewOp(MI->getOpcode()); + MI->setDesc(QII->get(NewOpcode)); + + return true; +} + +// Returns the most basic instruction for the .new predicated instructions and +// new-value stores. +// For example, all of the following instructions will be converted back to the +// same instruction: +// 1) if (p0.new) memw(R0+#0) = R1.new ---> +// 2) if (p0) memw(R0+#0) = R1.new -------> if (p0) memw(R0+#0) = R1 +// 3) if (p0.new) memw(R0+#0) = R1 ---> +// +// To understand the translation of instruction 1 to its original form, consider +// a packet with 3 instructions. +// { p0 = cmp.eq(R0,R1) +// if (p0.new) R2 = add(R3, R4) +// R5 = add (R3, R1) +// } +// if (p0) memw(R5+#0) = R2 <--- trying to include it in the previous packet +// +// This instruction can be part of the previous packet only if both p0 and R2 +// are promoted to .new values. This promotion happens in steps: first the +// predicate register is promoted to .new, and in the next iteration R2 is +// promoted. Therefore, in case of a dependence check failure (due to R5) during +// the next iteration, it should be converted back to its most basic form. 
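// Editorial sketch (not part of the patch): GetDotOldOp below is intended to
// invert the .new promotion map above (presumably GetDotNewPredOp, used by
// PromoteToDotNew), so a promote/demote round trip should return the original
// opcode for any predicated instruction. A debug-only check along these lines
// (names taken from this file) would make that invariant explicit:
//
//   static void CheckDotNewRoundTrip(int Opc) {
//     // e.g. TFR_cPt -> TFR_cdnPt -> TFR_cPt
//     assert(GetDotOldOp(GetDotNewPredOp(Opc)) == Opc &&
//            "promote/demote opcode maps are out of sync");
//   }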
+ +static int GetDotOldOp(const int opc) { + switch (opc) { + default: llvm_unreachable("Unknown .old type"); + case Hexagon::TFR_cdnPt: + return Hexagon::TFR_cPt; + + case Hexagon::TFR_cdnNotPt: + return Hexagon::TFR_cNotPt; + + case Hexagon::TFRI_cdnPt: + return Hexagon::TFRI_cPt; + + case Hexagon::TFRI_cdnNotPt: + return Hexagon::TFRI_cNotPt; + + case Hexagon::JMP_cdnPt: + return Hexagon::JMP_c; + + case Hexagon::JMP_cdnNotPt: + return Hexagon::JMP_cNot; + + case Hexagon::JMPR_cdnPt_V3: + return Hexagon::JMPR_cPt; + + case Hexagon::JMPR_cdnNotPt_V3: + return Hexagon::JMPR_cNotPt; + + // Load double word + + case Hexagon::LDrid_cdnPt : + return Hexagon::LDrid_cPt; + + case Hexagon::LDrid_cdnNotPt : + return Hexagon::LDrid_cNotPt; + + case Hexagon::LDrid_indexed_cdnPt : + return Hexagon::LDrid_indexed_cPt; + + case Hexagon::LDrid_indexed_cdnNotPt : + return Hexagon::LDrid_indexed_cNotPt; + + case Hexagon::POST_LDrid_cdnPt_V4 : + return Hexagon::POST_LDrid_cPt; + + case Hexagon::POST_LDrid_cdnNotPt_V4 : + return Hexagon::POST_LDrid_cNotPt; + + // Load word + + case Hexagon::LDriw_cdnPt : + return Hexagon::LDriw_cPt; + + case Hexagon::LDriw_cdnNotPt : + return Hexagon::LDriw_cNotPt; + + case Hexagon::LDriw_indexed_cdnPt : + return Hexagon::LDriw_indexed_cPt; + + case Hexagon::LDriw_indexed_cdnNotPt : + return Hexagon::LDriw_indexed_cNotPt; + + case Hexagon::POST_LDriw_cdnPt_V4 : + return Hexagon::POST_LDriw_cPt; + + case Hexagon::POST_LDriw_cdnNotPt_V4 : + return Hexagon::POST_LDriw_cNotPt; + + // Load half + + case Hexagon::LDrih_cdnPt : + return Hexagon::LDrih_cPt; + + case Hexagon::LDrih_cdnNotPt : + return Hexagon::LDrih_cNotPt; + + case Hexagon::LDrih_indexed_cdnPt : + return Hexagon::LDrih_indexed_cPt; + + case Hexagon::LDrih_indexed_cdnNotPt : + return Hexagon::LDrih_indexed_cNotPt; + + case Hexagon::POST_LDrih_cdnPt_V4 : + return Hexagon::POST_LDrih_cPt; + + case Hexagon::POST_LDrih_cdnNotPt_V4 : + return Hexagon::POST_LDrih_cNotPt; + + // Load byte + + case Hexagon::LDrib_cdnPt : + return Hexagon::LDrib_cPt; + + case Hexagon::LDrib_cdnNotPt : + return Hexagon::LDrib_cNotPt; + + case Hexagon::LDrib_indexed_cdnPt : + return Hexagon::LDrib_indexed_cPt; + + case Hexagon::LDrib_indexed_cdnNotPt : + return Hexagon::LDrib_indexed_cNotPt; + + case Hexagon::POST_LDrib_cdnPt_V4 : + return Hexagon::POST_LDrib_cPt; + + case Hexagon::POST_LDrib_cdnNotPt_V4 : + return Hexagon::POST_LDrib_cNotPt; + + // Load unsigned half + + case Hexagon::LDriuh_cdnPt : + return Hexagon::LDriuh_cPt; + + case Hexagon::LDriuh_cdnNotPt : + return Hexagon::LDriuh_cNotPt; + + case Hexagon::LDriuh_indexed_cdnPt : + return Hexagon::LDriuh_indexed_cPt; + + case Hexagon::LDriuh_indexed_cdnNotPt : + return Hexagon::LDriuh_indexed_cNotPt; + + case Hexagon::POST_LDriuh_cdnPt_V4 : + return Hexagon::POST_LDriuh_cPt; + + case Hexagon::POST_LDriuh_cdnNotPt_V4 : + return Hexagon::POST_LDriuh_cNotPt; + + // Load unsigned byte + case Hexagon::LDriub_cdnPt : + return Hexagon::LDriub_cPt; + + case Hexagon::LDriub_cdnNotPt : + return Hexagon::LDriub_cNotPt; + + case Hexagon::LDriub_indexed_cdnPt : + return Hexagon::LDriub_indexed_cPt; + + case Hexagon::LDriub_indexed_cdnNotPt : + return Hexagon::LDriub_indexed_cNotPt; + + case Hexagon::POST_LDriub_cdnPt_V4 : + return Hexagon::POST_LDriub_cPt; + + case Hexagon::POST_LDriub_cdnNotPt_V4 : + return Hexagon::POST_LDriub_cNotPt; + + // V4 indexed+scaled Load + + case Hexagon::LDrid_indexed_cdnPt_V4 : + return Hexagon::LDrid_indexed_cPt_V4; + + case Hexagon::LDrid_indexed_cdnNotPt_V4 : + 
return Hexagon::LDrid_indexed_cNotPt_V4; + + case Hexagon::LDrid_indexed_shl_cdnPt_V4 : + return Hexagon::LDrid_indexed_shl_cPt_V4; + + case Hexagon::LDrid_indexed_shl_cdnNotPt_V4 : + return Hexagon::LDrid_indexed_shl_cNotPt_V4; + + case Hexagon::LDrib_indexed_cdnPt_V4 : + return Hexagon::LDrib_indexed_cPt_V4; + + case Hexagon::LDrib_indexed_cdnNotPt_V4 : + return Hexagon::LDrib_indexed_cNotPt_V4; + + case Hexagon::LDrib_indexed_shl_cdnPt_V4 : + return Hexagon::LDrib_indexed_shl_cPt_V4; + + case Hexagon::LDrib_indexed_shl_cdnNotPt_V4 : + return Hexagon::LDrib_indexed_shl_cNotPt_V4; + + case Hexagon::LDriub_indexed_cdnPt_V4 : + return Hexagon::LDriub_indexed_cPt_V4; + + case Hexagon::LDriub_indexed_cdnNotPt_V4 : + return Hexagon::LDriub_indexed_cNotPt_V4; + + case Hexagon::LDriub_indexed_shl_cdnPt_V4 : + return Hexagon::LDriub_indexed_shl_cPt_V4; + + case Hexagon::LDriub_indexed_shl_cdnNotPt_V4 : + return Hexagon::LDriub_indexed_shl_cNotPt_V4; + + case Hexagon::LDrih_indexed_cdnPt_V4 : + return Hexagon::LDrih_indexed_cPt_V4; + + case Hexagon::LDrih_indexed_cdnNotPt_V4 : + return Hexagon::LDrih_indexed_cNotPt_V4; + + case Hexagon::LDrih_indexed_shl_cdnPt_V4 : + return Hexagon::LDrih_indexed_shl_cPt_V4; + + case Hexagon::LDrih_indexed_shl_cdnNotPt_V4 : + return Hexagon::LDrih_indexed_shl_cNotPt_V4; + + case Hexagon::LDriuh_indexed_cdnPt_V4 : + return Hexagon::LDriuh_indexed_cPt_V4; + + case Hexagon::LDriuh_indexed_cdnNotPt_V4 : + return Hexagon::LDriuh_indexed_cNotPt_V4; + + case Hexagon::LDriuh_indexed_shl_cdnPt_V4 : + return Hexagon::LDriuh_indexed_shl_cPt_V4; + + case Hexagon::LDriuh_indexed_shl_cdnNotPt_V4 : + return Hexagon::LDriuh_indexed_shl_cNotPt_V4; + + case Hexagon::LDriw_indexed_cdnPt_V4 : + return Hexagon::LDriw_indexed_cPt_V4; + + case Hexagon::LDriw_indexed_cdnNotPt_V4 : + return Hexagon::LDriw_indexed_cNotPt_V4; + + case Hexagon::LDriw_indexed_shl_cdnPt_V4 : + return Hexagon::LDriw_indexed_shl_cPt_V4; + + case Hexagon::LDriw_indexed_shl_cdnNotPt_V4 : + return Hexagon::LDriw_indexed_shl_cNotPt_V4; + + // V4 global address load + + case Hexagon::LDd_GP_cdnPt_V4: + return Hexagon::LDd_GP_cPt_V4; + + case Hexagon::LDd_GP_cdnNotPt_V4: + return Hexagon::LDd_GP_cNotPt_V4; + + case Hexagon::LDb_GP_cdnPt_V4: + return Hexagon::LDb_GP_cPt_V4; + + case Hexagon::LDb_GP_cdnNotPt_V4: + return Hexagon::LDb_GP_cNotPt_V4; + + case Hexagon::LDub_GP_cdnPt_V4: + return Hexagon::LDub_GP_cPt_V4; + + case Hexagon::LDub_GP_cdnNotPt_V4: + return Hexagon::LDub_GP_cNotPt_V4; + + case Hexagon::LDh_GP_cdnPt_V4: + return Hexagon::LDh_GP_cPt_V4; + + case Hexagon::LDh_GP_cdnNotPt_V4: + return Hexagon::LDh_GP_cNotPt_V4; + + case Hexagon::LDuh_GP_cdnPt_V4: + return Hexagon::LDuh_GP_cPt_V4; + + case Hexagon::LDuh_GP_cdnNotPt_V4: + return Hexagon::LDuh_GP_cNotPt_V4; + + case Hexagon::LDw_GP_cdnPt_V4: + return Hexagon::LDw_GP_cPt_V4; + + case Hexagon::LDw_GP_cdnNotPt_V4: + return Hexagon::LDw_GP_cNotPt_V4; + + case Hexagon::LDrid_GP_cdnPt_V4: + return Hexagon::LDrid_GP_cPt_V4; + + case Hexagon::LDrid_GP_cdnNotPt_V4: + return Hexagon::LDrid_GP_cNotPt_V4; + + case Hexagon::LDrib_GP_cdnPt_V4: + return Hexagon::LDrib_GP_cPt_V4; + + case Hexagon::LDrib_GP_cdnNotPt_V4: + return Hexagon::LDrib_GP_cNotPt_V4; + + case Hexagon::LDriub_GP_cdnPt_V4: + return Hexagon::LDriub_GP_cPt_V4; + + case Hexagon::LDriub_GP_cdnNotPt_V4: + return Hexagon::LDriub_GP_cNotPt_V4; + + case Hexagon::LDrih_GP_cdnPt_V4: + return Hexagon::LDrih_GP_cPt_V4; + + case Hexagon::LDrih_GP_cdnNotPt_V4: + return Hexagon::LDrih_GP_cNotPt_V4; + + case 
Hexagon::LDriuh_GP_cdnPt_V4: + return Hexagon::LDriuh_GP_cPt_V4; + + case Hexagon::LDriuh_GP_cdnNotPt_V4: + return Hexagon::LDriuh_GP_cNotPt_V4; + + case Hexagon::LDriw_GP_cdnPt_V4: + return Hexagon::LDriw_GP_cPt_V4; + + case Hexagon::LDriw_GP_cdnNotPt_V4: + return Hexagon::LDriw_GP_cNotPt_V4; + + // Conditional add + + case Hexagon::ADD_ri_cdnPt : + return Hexagon::ADD_ri_cPt; + case Hexagon::ADD_ri_cdnNotPt : + return Hexagon::ADD_ri_cNotPt; + + case Hexagon::ADD_rr_cdnPt : + return Hexagon::ADD_rr_cPt; + case Hexagon::ADD_rr_cdnNotPt: + return Hexagon::ADD_rr_cNotPt; + + // Conditional logical Operations + + case Hexagon::XOR_rr_cdnPt : + return Hexagon::XOR_rr_cPt; + case Hexagon::XOR_rr_cdnNotPt : + return Hexagon::XOR_rr_cNotPt; + + case Hexagon::AND_rr_cdnPt : + return Hexagon::AND_rr_cPt; + case Hexagon::AND_rr_cdnNotPt : + return Hexagon::AND_rr_cNotPt; + + case Hexagon::OR_rr_cdnPt : + return Hexagon::OR_rr_cPt; + case Hexagon::OR_rr_cdnNotPt : + return Hexagon::OR_rr_cNotPt; + + // Conditional Subtract + + case Hexagon::SUB_rr_cdnPt : + return Hexagon::SUB_rr_cPt; + case Hexagon::SUB_rr_cdnNotPt : + return Hexagon::SUB_rr_cNotPt; + + // Conditional combine + + case Hexagon::COMBINE_rr_cdnPt : + return Hexagon::COMBINE_rr_cPt; + case Hexagon::COMBINE_rr_cdnNotPt : + return Hexagon::COMBINE_rr_cNotPt; + +// Conditional shift operations + + case Hexagon::ASLH_cdnPt_V4 : + return Hexagon::ASLH_cPt_V4; + case Hexagon::ASLH_cdnNotPt_V4 : + return Hexagon::ASLH_cNotPt_V4; + + case Hexagon::ASRH_cdnPt_V4 : + return Hexagon::ASRH_cPt_V4; + case Hexagon::ASRH_cdnNotPt_V4 : + return Hexagon::ASRH_cNotPt_V4; + + case Hexagon::SXTB_cdnPt_V4 : + return Hexagon::SXTB_cPt_V4; + case Hexagon::SXTB_cdnNotPt_V4 : + return Hexagon::SXTB_cNotPt_V4; + + case Hexagon::SXTH_cdnPt_V4 : + return Hexagon::SXTH_cPt_V4; + case Hexagon::SXTH_cdnNotPt_V4 : + return Hexagon::SXTH_cNotPt_V4; + + case Hexagon::ZXTB_cdnPt_V4 : + return Hexagon::ZXTB_cPt_V4; + case Hexagon::ZXTB_cdnNotPt_V4 : + return Hexagon::ZXTB_cNotPt_V4; + + case Hexagon::ZXTH_cdnPt_V4 : + return Hexagon::ZXTH_cPt_V4; + case Hexagon::ZXTH_cdnNotPt_V4 : + return Hexagon::ZXTH_cNotPt_V4; + + // Store byte + + case Hexagon::STrib_imm_cdnPt_V4 : + return Hexagon::STrib_imm_cPt_V4; + + case Hexagon::STrib_imm_cdnNotPt_V4 : + return Hexagon::STrib_imm_cNotPt_V4; + + case Hexagon::STrib_cdnPt_nv_V4 : + case Hexagon::STrib_cPt_nv_V4 : + case Hexagon::STrib_cdnPt_V4 : + return Hexagon::STrib_cPt; + + case Hexagon::STrib_cdnNotPt_nv_V4 : + case Hexagon::STrib_cNotPt_nv_V4 : + case Hexagon::STrib_cdnNotPt_V4 : + return Hexagon::STrib_cNotPt; + + case Hexagon::STrib_indexed_cdnPt_V4 : + case Hexagon::STrib_indexed_cPt_nv_V4 : + case Hexagon::STrib_indexed_cdnPt_nv_V4 : + return Hexagon::STrib_indexed_cPt; + + case Hexagon::STrib_indexed_cdnNotPt_V4 : + case Hexagon::STrib_indexed_cNotPt_nv_V4 : + case Hexagon::STrib_indexed_cdnNotPt_nv_V4 : + return Hexagon::STrib_indexed_cNotPt; + + case Hexagon::STrib_indexed_shl_cdnPt_nv_V4: + case Hexagon::STrib_indexed_shl_cPt_nv_V4 : + case Hexagon::STrib_indexed_shl_cdnPt_V4 : + return Hexagon::STrib_indexed_shl_cPt_V4; + + case Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4: + case Hexagon::STrib_indexed_shl_cNotPt_nv_V4 : + case Hexagon::STrib_indexed_shl_cdnNotPt_V4 : + return Hexagon::STrib_indexed_shl_cNotPt_V4; + + case Hexagon::POST_STbri_cdnPt_nv_V4 : + case Hexagon::POST_STbri_cPt_nv_V4 : + case Hexagon::POST_STbri_cdnPt_V4 : + return Hexagon::POST_STbri_cPt; + + case Hexagon::POST_STbri_cdnNotPt_nv_V4 : + 
case Hexagon::POST_STbri_cNotPt_nv_V4: + case Hexagon::POST_STbri_cdnNotPt_V4 : + return Hexagon::POST_STbri_cNotPt; + + case Hexagon::STb_GP_cdnPt_nv_V4: + case Hexagon::STb_GP_cdnPt_V4: + case Hexagon::STb_GP_cPt_nv_V4: + return Hexagon::STb_GP_cPt_V4; + + case Hexagon::STb_GP_cdnNotPt_nv_V4: + case Hexagon::STb_GP_cdnNotPt_V4: + case Hexagon::STb_GP_cNotPt_nv_V4: + return Hexagon::STb_GP_cNotPt_V4; + + case Hexagon::STrib_GP_cdnPt_nv_V4: + case Hexagon::STrib_GP_cdnPt_V4: + case Hexagon::STrib_GP_cPt_nv_V4: + return Hexagon::STrib_GP_cPt_V4; + + case Hexagon::STrib_GP_cdnNotPt_nv_V4: + case Hexagon::STrib_GP_cdnNotPt_V4: + case Hexagon::STrib_GP_cNotPt_nv_V4: + return Hexagon::STrib_GP_cNotPt_V4; + + // Store new-value byte - unconditional + case Hexagon::STrib_nv_V4: + return Hexagon::STrib; + + case Hexagon::STrib_indexed_nv_V4: + return Hexagon::STrib_indexed; + + case Hexagon::STrib_indexed_shl_nv_V4: + return Hexagon::STrib_indexed_shl_V4; + + case Hexagon::STrib_shl_nv_V4: + return Hexagon::STrib_shl_V4; + + case Hexagon::STrib_GP_nv_V4: + return Hexagon::STrib_GP_V4; + + case Hexagon::STb_GP_nv_V4: + return Hexagon::STb_GP_V4; + + case Hexagon::POST_STbri_nv_V4: + return Hexagon::POST_STbri; + + // Store halfword + case Hexagon::STrih_imm_cdnPt_V4 : + return Hexagon::STrih_imm_cPt_V4; + + case Hexagon::STrih_imm_cdnNotPt_V4 : + return Hexagon::STrih_imm_cNotPt_V4; + + case Hexagon::STrih_cdnPt_nv_V4 : + case Hexagon::STrih_cPt_nv_V4 : + case Hexagon::STrih_cdnPt_V4 : + return Hexagon::STrih_cPt; + + case Hexagon::STrih_cdnNotPt_nv_V4 : + case Hexagon::STrih_cNotPt_nv_V4 : + case Hexagon::STrih_cdnNotPt_V4 : + return Hexagon::STrih_cNotPt; + + case Hexagon::STrih_indexed_cdnPt_nv_V4: + case Hexagon::STrih_indexed_cPt_nv_V4 : + case Hexagon::STrih_indexed_cdnPt_V4 : + return Hexagon::STrih_indexed_cPt; + + case Hexagon::STrih_indexed_cdnNotPt_nv_V4: + case Hexagon::STrih_indexed_cNotPt_nv_V4 : + case Hexagon::STrih_indexed_cdnNotPt_V4 : + return Hexagon::STrih_indexed_cNotPt; + + case Hexagon::STrih_indexed_shl_cdnPt_nv_V4 : + case Hexagon::STrih_indexed_shl_cPt_nv_V4 : + case Hexagon::STrih_indexed_shl_cdnPt_V4 : + return Hexagon::STrih_indexed_shl_cPt_V4; + + case Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4 : + case Hexagon::STrih_indexed_shl_cNotPt_nv_V4 : + case Hexagon::STrih_indexed_shl_cdnNotPt_V4 : + return Hexagon::STrih_indexed_shl_cNotPt_V4; + + case Hexagon::POST_SThri_cdnPt_nv_V4 : + case Hexagon::POST_SThri_cPt_nv_V4 : + case Hexagon::POST_SThri_cdnPt_V4 : + return Hexagon::POST_SThri_cPt; + + case Hexagon::POST_SThri_cdnNotPt_nv_V4 : + case Hexagon::POST_SThri_cNotPt_nv_V4 : + case Hexagon::POST_SThri_cdnNotPt_V4 : + return Hexagon::POST_SThri_cNotPt; + + case Hexagon::STh_GP_cdnPt_nv_V4: + case Hexagon::STh_GP_cdnPt_V4: + case Hexagon::STh_GP_cPt_nv_V4: + return Hexagon::STh_GP_cPt_V4; + + case Hexagon::STh_GP_cdnNotPt_nv_V4: + case Hexagon::STh_GP_cdnNotPt_V4: + case Hexagon::STh_GP_cNotPt_nv_V4: + return Hexagon::STh_GP_cNotPt_V4; + + case Hexagon::STrih_GP_cdnPt_nv_V4: + case Hexagon::STrih_GP_cdnPt_V4: + case Hexagon::STrih_GP_cPt_nv_V4: + return Hexagon::STrih_GP_cPt_V4; + + case Hexagon::STrih_GP_cdnNotPt_nv_V4: + case Hexagon::STrih_GP_cdnNotPt_V4: + case Hexagon::STrih_GP_cNotPt_nv_V4: + return Hexagon::STrih_GP_cNotPt_V4; + + // Store new-value halfword - unconditional + + case Hexagon::STrih_nv_V4: + return Hexagon::STrih; + + case Hexagon::STrih_indexed_nv_V4: + return Hexagon::STrih_indexed; + + case Hexagon::STrih_indexed_shl_nv_V4: + return 
Hexagon::STrih_indexed_shl_V4; + + case Hexagon::STrih_shl_nv_V4: + return Hexagon::STrih_shl_V4; + + case Hexagon::STrih_GP_nv_V4: + return Hexagon::STrih_GP_V4; + + case Hexagon::STh_GP_nv_V4: + return Hexagon::STh_GP_V4; + + case Hexagon::POST_SThri_nv_V4: + return Hexagon::POST_SThri; + + // Store word + + case Hexagon::STriw_imm_cdnPt_V4 : + return Hexagon::STriw_imm_cPt_V4; + + case Hexagon::STriw_imm_cdnNotPt_V4 : + return Hexagon::STriw_imm_cNotPt_V4; + + case Hexagon::STriw_cdnPt_nv_V4 : + case Hexagon::STriw_cPt_nv_V4 : + case Hexagon::STriw_cdnPt_V4 : + return Hexagon::STriw_cPt; + + case Hexagon::STriw_cdnNotPt_nv_V4 : + case Hexagon::STriw_cNotPt_nv_V4 : + case Hexagon::STriw_cdnNotPt_V4 : + return Hexagon::STriw_cNotPt; + + case Hexagon::STriw_indexed_cdnPt_nv_V4 : + case Hexagon::STriw_indexed_cPt_nv_V4 : + case Hexagon::STriw_indexed_cdnPt_V4 : + return Hexagon::STriw_indexed_cPt; + + case Hexagon::STriw_indexed_cdnNotPt_nv_V4 : + case Hexagon::STriw_indexed_cNotPt_nv_V4 : + case Hexagon::STriw_indexed_cdnNotPt_V4 : + return Hexagon::STriw_indexed_cNotPt; + + case Hexagon::STriw_indexed_shl_cdnPt_nv_V4 : + case Hexagon::STriw_indexed_shl_cPt_nv_V4 : + case Hexagon::STriw_indexed_shl_cdnPt_V4 : + return Hexagon::STriw_indexed_shl_cPt_V4; + + case Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4 : + case Hexagon::STriw_indexed_shl_cNotPt_nv_V4 : + case Hexagon::STriw_indexed_shl_cdnNotPt_V4 : + return Hexagon::STriw_indexed_shl_cNotPt_V4; + + case Hexagon::POST_STwri_cdnPt_nv_V4 : + case Hexagon::POST_STwri_cPt_nv_V4 : + case Hexagon::POST_STwri_cdnPt_V4 : + return Hexagon::POST_STwri_cPt; + + case Hexagon::POST_STwri_cdnNotPt_nv_V4 : + case Hexagon::POST_STwri_cNotPt_nv_V4 : + case Hexagon::POST_STwri_cdnNotPt_V4 : + return Hexagon::POST_STwri_cNotPt; + + case Hexagon::STw_GP_cdnPt_nv_V4: + case Hexagon::STw_GP_cdnPt_V4: + case Hexagon::STw_GP_cPt_nv_V4: + return Hexagon::STw_GP_cPt_V4; + + case Hexagon::STw_GP_cdnNotPt_nv_V4: + case Hexagon::STw_GP_cdnNotPt_V4: + case Hexagon::STw_GP_cNotPt_nv_V4: + return Hexagon::STw_GP_cNotPt_V4; + + case Hexagon::STriw_GP_cdnPt_nv_V4: + case Hexagon::STriw_GP_cdnPt_V4: + case Hexagon::STriw_GP_cPt_nv_V4: + return Hexagon::STriw_GP_cPt_V4; + + case Hexagon::STriw_GP_cdnNotPt_nv_V4: + case Hexagon::STriw_GP_cdnNotPt_V4: + case Hexagon::STriw_GP_cNotPt_nv_V4: + return Hexagon::STriw_GP_cNotPt_V4; + + // Store new-value word - unconditional + + case Hexagon::STriw_nv_V4: + return Hexagon::STriw; + + case Hexagon::STriw_indexed_nv_V4: + return Hexagon::STriw_indexed; + + case Hexagon::STriw_indexed_shl_nv_V4: + return Hexagon::STriw_indexed_shl_V4; + + case Hexagon::STriw_shl_nv_V4: + return Hexagon::STriw_shl_V4; + + case Hexagon::STriw_GP_nv_V4: + return Hexagon::STriw_GP_V4; + + case Hexagon::STw_GP_nv_V4: + return Hexagon::STw_GP_V4; + + case Hexagon::POST_STwri_nv_V4: + return Hexagon::POST_STwri; + + // Store doubleword + + case Hexagon::STrid_cdnPt_V4 : + return Hexagon::STrid_cPt; + + case Hexagon::STrid_cdnNotPt_V4 : + return Hexagon::STrid_cNotPt; + + case Hexagon::STrid_indexed_cdnPt_V4 : + return Hexagon::STrid_indexed_cPt; + + case Hexagon::STrid_indexed_cdnNotPt_V4 : + return Hexagon::STrid_indexed_cNotPt; + + case Hexagon::STrid_indexed_shl_cdnPt_V4 : + return Hexagon::STrid_indexed_shl_cPt_V4; + + case Hexagon::STrid_indexed_shl_cdnNotPt_V4 : + return Hexagon::STrid_indexed_shl_cNotPt_V4; + + case Hexagon::POST_STdri_cdnPt_V4 : + return Hexagon::POST_STdri_cPt; + + case Hexagon::POST_STdri_cdnNotPt_V4 : + return 
Hexagon::POST_STdri_cNotPt; + + case Hexagon::STd_GP_cdnPt_V4 : + return Hexagon::STd_GP_cPt_V4; + + case Hexagon::STd_GP_cdnNotPt_V4 : + return Hexagon::STd_GP_cNotPt_V4; + + case Hexagon::STrid_GP_cdnPt_V4 : + return Hexagon::STrid_GP_cPt_V4; + + case Hexagon::STrid_GP_cdnNotPt_V4 : + return Hexagon::STrid_GP_cNotPt_V4; + } +} + +bool HexagonPacketizerList::DemoteToDotOld(MachineInstr* MI) { + const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; + int NewOpcode = GetDotOldOp(MI->getOpcode()); + MI->setDesc(QII->get(NewOpcode)); + return true; +} + +// Returns true if an instruction is predicated on p0 and false if it's +// predicated on !p0. + +static bool GetPredicateSense(MachineInstr* MI, + const HexagonInstrInfo *QII) { + + switch (MI->getOpcode()) { + default: llvm_unreachable("Unknown predicate sense of the instruction"); + case Hexagon::TFR_cPt: + case Hexagon::TFR_cdnPt: + case Hexagon::TFRI_cPt: + case Hexagon::TFRI_cdnPt: + case Hexagon::STrib_cPt : + case Hexagon::STrib_cdnPt_V4 : + case Hexagon::STrib_indexed_cPt : + case Hexagon::STrib_indexed_cdnPt_V4 : + case Hexagon::STrib_indexed_shl_cPt_V4 : + case Hexagon::STrib_indexed_shl_cdnPt_V4 : + case Hexagon::POST_STbri_cPt : + case Hexagon::POST_STbri_cdnPt_V4 : + case Hexagon::STrih_cPt : + case Hexagon::STrih_cdnPt_V4 : + case Hexagon::STrih_indexed_cPt : + case Hexagon::STrih_indexed_cdnPt_V4 : + case Hexagon::STrih_indexed_shl_cPt_V4 : + case Hexagon::STrih_indexed_shl_cdnPt_V4 : + case Hexagon::POST_SThri_cPt : + case Hexagon::POST_SThri_cdnPt_V4 : + case Hexagon::STriw_cPt : + case Hexagon::STriw_cdnPt_V4 : + case Hexagon::STriw_indexed_cPt : + case Hexagon::STriw_indexed_cdnPt_V4 : + case Hexagon::STriw_indexed_shl_cPt_V4 : + case Hexagon::STriw_indexed_shl_cdnPt_V4 : + case Hexagon::POST_STwri_cPt : + case Hexagon::POST_STwri_cdnPt_V4 : + case Hexagon::STrib_imm_cPt_V4 : + case Hexagon::STrib_imm_cdnPt_V4 : + case Hexagon::STrid_cPt : + case Hexagon::STrid_cdnPt_V4 : + case Hexagon::STrid_indexed_cPt : + case Hexagon::STrid_indexed_cdnPt_V4 : + case Hexagon::STrid_indexed_shl_cPt_V4 : + case Hexagon::STrid_indexed_shl_cdnPt_V4 : + case Hexagon::POST_STdri_cPt : + case Hexagon::POST_STdri_cdnPt_V4 : + case Hexagon::STrih_imm_cPt_V4 : + case Hexagon::STrih_imm_cdnPt_V4 : + case Hexagon::STriw_imm_cPt_V4 : + case Hexagon::STriw_imm_cdnPt_V4 : + case Hexagon::JMP_cdnPt : + case Hexagon::LDrid_cPt : + case Hexagon::LDrid_cdnPt : + case Hexagon::LDrid_indexed_cPt : + case Hexagon::LDrid_indexed_cdnPt : + case Hexagon::POST_LDrid_cPt : + case Hexagon::POST_LDrid_cdnPt_V4 : + case Hexagon::LDriw_cPt : + case Hexagon::LDriw_cdnPt : + case Hexagon::LDriw_indexed_cPt : + case Hexagon::LDriw_indexed_cdnPt : + case Hexagon::POST_LDriw_cPt : + case Hexagon::POST_LDriw_cdnPt_V4 : + case Hexagon::LDrih_cPt : + case Hexagon::LDrih_cdnPt : + case Hexagon::LDrih_indexed_cPt : + case Hexagon::LDrih_indexed_cdnPt : + case Hexagon::POST_LDrih_cPt : + case Hexagon::POST_LDrih_cdnPt_V4 : + case Hexagon::LDrib_cPt : + case Hexagon::LDrib_cdnPt : + case Hexagon::LDrib_indexed_cPt : + case Hexagon::LDrib_indexed_cdnPt : + case Hexagon::POST_LDrib_cPt : + case Hexagon::POST_LDrib_cdnPt_V4 : + case Hexagon::LDriuh_cPt : + case Hexagon::LDriuh_cdnPt : + case Hexagon::LDriuh_indexed_cPt : + case Hexagon::LDriuh_indexed_cdnPt : + case Hexagon::POST_LDriuh_cPt : + case Hexagon::POST_LDriuh_cdnPt_V4 : + case Hexagon::LDriub_cPt : + case Hexagon::LDriub_cdnPt : + case Hexagon::LDriub_indexed_cPt : + case Hexagon::LDriub_indexed_cdnPt : + case 
Hexagon::POST_LDriub_cPt : + case Hexagon::POST_LDriub_cdnPt_V4 : + case Hexagon::LDrid_indexed_cPt_V4 : + case Hexagon::LDrid_indexed_cdnPt_V4 : + case Hexagon::LDrid_indexed_shl_cPt_V4 : + case Hexagon::LDrid_indexed_shl_cdnPt_V4 : + case Hexagon::LDrib_indexed_cPt_V4 : + case Hexagon::LDrib_indexed_cdnPt_V4 : + case Hexagon::LDrib_indexed_shl_cPt_V4 : + case Hexagon::LDrib_indexed_shl_cdnPt_V4 : + case Hexagon::LDriub_indexed_cPt_V4 : + case Hexagon::LDriub_indexed_cdnPt_V4 : + case Hexagon::LDriub_indexed_shl_cPt_V4 : + case Hexagon::LDriub_indexed_shl_cdnPt_V4 : + case Hexagon::LDrih_indexed_cPt_V4 : + case Hexagon::LDrih_indexed_cdnPt_V4 : + case Hexagon::LDrih_indexed_shl_cPt_V4 : + case Hexagon::LDrih_indexed_shl_cdnPt_V4 : + case Hexagon::LDriuh_indexed_cPt_V4 : + case Hexagon::LDriuh_indexed_cdnPt_V4 : + case Hexagon::LDriuh_indexed_shl_cPt_V4 : + case Hexagon::LDriuh_indexed_shl_cdnPt_V4 : + case Hexagon::LDriw_indexed_cPt_V4 : + case Hexagon::LDriw_indexed_cdnPt_V4 : + case Hexagon::LDriw_indexed_shl_cPt_V4 : + case Hexagon::LDriw_indexed_shl_cdnPt_V4 : + case Hexagon::ADD_ri_cPt : + case Hexagon::ADD_ri_cdnPt : + case Hexagon::ADD_rr_cPt : + case Hexagon::ADD_rr_cdnPt : + case Hexagon::XOR_rr_cPt : + case Hexagon::XOR_rr_cdnPt : + case Hexagon::AND_rr_cPt : + case Hexagon::AND_rr_cdnPt : + case Hexagon::OR_rr_cPt : + case Hexagon::OR_rr_cdnPt : + case Hexagon::SUB_rr_cPt : + case Hexagon::SUB_rr_cdnPt : + case Hexagon::COMBINE_rr_cPt : + case Hexagon::COMBINE_rr_cdnPt : + case Hexagon::ASLH_cPt_V4 : + case Hexagon::ASLH_cdnPt_V4 : + case Hexagon::ASRH_cPt_V4 : + case Hexagon::ASRH_cdnPt_V4 : + case Hexagon::SXTB_cPt_V4 : + case Hexagon::SXTB_cdnPt_V4 : + case Hexagon::SXTH_cPt_V4 : + case Hexagon::SXTH_cdnPt_V4 : + case Hexagon::ZXTB_cPt_V4 : + case Hexagon::ZXTB_cdnPt_V4 : + case Hexagon::ZXTH_cPt_V4 : + case Hexagon::ZXTH_cdnPt_V4 : + case Hexagon::LDrid_GP_cPt_V4 : + case Hexagon::LDrib_GP_cPt_V4 : + case Hexagon::LDriub_GP_cPt_V4 : + case Hexagon::LDrih_GP_cPt_V4 : + case Hexagon::LDriuh_GP_cPt_V4 : + case Hexagon::LDriw_GP_cPt_V4 : + case Hexagon::LDd_GP_cPt_V4 : + case Hexagon::LDb_GP_cPt_V4 : + case Hexagon::LDub_GP_cPt_V4 : + case Hexagon::LDh_GP_cPt_V4 : + case Hexagon::LDuh_GP_cPt_V4 : + case Hexagon::LDw_GP_cPt_V4 : + case Hexagon::STrid_GP_cPt_V4 : + case Hexagon::STrib_GP_cPt_V4 : + case Hexagon::STrih_GP_cPt_V4 : + case Hexagon::STriw_GP_cPt_V4 : + case Hexagon::STd_GP_cPt_V4 : + case Hexagon::STb_GP_cPt_V4 : + case Hexagon::STh_GP_cPt_V4 : + case Hexagon::STw_GP_cPt_V4 : + case Hexagon::LDrid_GP_cdnPt_V4 : + case Hexagon::LDrib_GP_cdnPt_V4 : + case Hexagon::LDriub_GP_cdnPt_V4 : + case Hexagon::LDrih_GP_cdnPt_V4 : + case Hexagon::LDriuh_GP_cdnPt_V4 : + case Hexagon::LDriw_GP_cdnPt_V4 : + case Hexagon::LDd_GP_cdnPt_V4 : + case Hexagon::LDb_GP_cdnPt_V4 : + case Hexagon::LDub_GP_cdnPt_V4 : + case Hexagon::LDh_GP_cdnPt_V4 : + case Hexagon::LDuh_GP_cdnPt_V4 : + case Hexagon::LDw_GP_cdnPt_V4 : + case Hexagon::STrid_GP_cdnPt_V4 : + case Hexagon::STrib_GP_cdnPt_V4 : + case Hexagon::STrih_GP_cdnPt_V4 : + case Hexagon::STriw_GP_cdnPt_V4 : + case Hexagon::STd_GP_cdnPt_V4 : + case Hexagon::STb_GP_cdnPt_V4 : + case Hexagon::STh_GP_cdnPt_V4 : + case Hexagon::STw_GP_cdnPt_V4 : + return true; + + case Hexagon::TFR_cNotPt: + case Hexagon::TFR_cdnNotPt: + case Hexagon::TFRI_cNotPt: + case Hexagon::TFRI_cdnNotPt: + case Hexagon::STrib_cNotPt : + case Hexagon::STrib_cdnNotPt_V4 : + case Hexagon::STrib_indexed_cNotPt : + case Hexagon::STrib_indexed_cdnNotPt_V4 : + case 
Hexagon::STrib_indexed_shl_cNotPt_V4 : + case Hexagon::STrib_indexed_shl_cdnNotPt_V4 : + case Hexagon::POST_STbri_cNotPt : + case Hexagon::POST_STbri_cdnNotPt_V4 : + case Hexagon::STrih_cNotPt : + case Hexagon::STrih_cdnNotPt_V4 : + case Hexagon::STrih_indexed_cNotPt : + case Hexagon::STrih_indexed_cdnNotPt_V4 : + case Hexagon::STrih_indexed_shl_cNotPt_V4 : + case Hexagon::STrih_indexed_shl_cdnNotPt_V4 : + case Hexagon::POST_SThri_cNotPt : + case Hexagon::POST_SThri_cdnNotPt_V4 : + case Hexagon::STriw_cNotPt : + case Hexagon::STriw_cdnNotPt_V4 : + case Hexagon::STriw_indexed_cNotPt : + case Hexagon::STriw_indexed_cdnNotPt_V4 : + case Hexagon::STriw_indexed_shl_cNotPt_V4 : + case Hexagon::STriw_indexed_shl_cdnNotPt_V4 : + case Hexagon::POST_STwri_cNotPt : + case Hexagon::POST_STwri_cdnNotPt_V4 : + case Hexagon::STrib_imm_cNotPt_V4 : + case Hexagon::STrib_imm_cdnNotPt_V4 : + case Hexagon::STrid_cNotPt : + case Hexagon::STrid_cdnNotPt_V4 : + case Hexagon::STrid_indexed_cdnNotPt_V4 : + case Hexagon::STrid_indexed_cNotPt : + case Hexagon::STrid_indexed_shl_cNotPt_V4 : + case Hexagon::STrid_indexed_shl_cdnNotPt_V4 : + case Hexagon::POST_STdri_cNotPt : + case Hexagon::POST_STdri_cdnNotPt_V4 : + case Hexagon::STrih_imm_cNotPt_V4 : + case Hexagon::STrih_imm_cdnNotPt_V4 : + case Hexagon::STriw_imm_cNotPt_V4 : + case Hexagon::STriw_imm_cdnNotPt_V4 : + case Hexagon::JMP_cdnNotPt : + case Hexagon::LDrid_cNotPt : + case Hexagon::LDrid_cdnNotPt : + case Hexagon::LDrid_indexed_cNotPt : + case Hexagon::LDrid_indexed_cdnNotPt : + case Hexagon::POST_LDrid_cNotPt : + case Hexagon::POST_LDrid_cdnNotPt_V4 : + case Hexagon::LDriw_cNotPt : + case Hexagon::LDriw_cdnNotPt : + case Hexagon::LDriw_indexed_cNotPt : + case Hexagon::LDriw_indexed_cdnNotPt : + case Hexagon::POST_LDriw_cNotPt : + case Hexagon::POST_LDriw_cdnNotPt_V4 : + case Hexagon::LDrih_cNotPt : + case Hexagon::LDrih_cdnNotPt : + case Hexagon::LDrih_indexed_cNotPt : + case Hexagon::LDrih_indexed_cdnNotPt : + case Hexagon::POST_LDrih_cNotPt : + case Hexagon::POST_LDrih_cdnNotPt_V4 : + case Hexagon::LDrib_cNotPt : + case Hexagon::LDrib_cdnNotPt : + case Hexagon::LDrib_indexed_cNotPt : + case Hexagon::LDrib_indexed_cdnNotPt : + case Hexagon::POST_LDrib_cNotPt : + case Hexagon::POST_LDrib_cdnNotPt_V4 : + case Hexagon::LDriuh_cNotPt : + case Hexagon::LDriuh_cdnNotPt : + case Hexagon::LDriuh_indexed_cNotPt : + case Hexagon::LDriuh_indexed_cdnNotPt : + case Hexagon::POST_LDriuh_cNotPt : + case Hexagon::POST_LDriuh_cdnNotPt_V4 : + case Hexagon::LDriub_cNotPt : + case Hexagon::LDriub_cdnNotPt : + case Hexagon::LDriub_indexed_cNotPt : + case Hexagon::LDriub_indexed_cdnNotPt : + case Hexagon::POST_LDriub_cNotPt : + case Hexagon::POST_LDriub_cdnNotPt_V4 : + case Hexagon::LDrid_indexed_cNotPt_V4 : + case Hexagon::LDrid_indexed_cdnNotPt_V4 : + case Hexagon::LDrid_indexed_shl_cNotPt_V4 : + case Hexagon::LDrid_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDrib_indexed_cNotPt_V4 : + case Hexagon::LDrib_indexed_cdnNotPt_V4 : + case Hexagon::LDrib_indexed_shl_cNotPt_V4 : + case Hexagon::LDrib_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDriub_indexed_cNotPt_V4 : + case Hexagon::LDriub_indexed_cdnNotPt_V4 : + case Hexagon::LDriub_indexed_shl_cNotPt_V4 : + case Hexagon::LDriub_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDrih_indexed_cNotPt_V4 : + case Hexagon::LDrih_indexed_cdnNotPt_V4 : + case Hexagon::LDrih_indexed_shl_cNotPt_V4 : + case Hexagon::LDrih_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDriuh_indexed_cNotPt_V4 : + case Hexagon::LDriuh_indexed_cdnNotPt_V4 : + case 
Hexagon::LDriuh_indexed_shl_cNotPt_V4 : + case Hexagon::LDriuh_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDriw_indexed_cNotPt_V4 : + case Hexagon::LDriw_indexed_cdnNotPt_V4 : + case Hexagon::LDriw_indexed_shl_cNotPt_V4 : + case Hexagon::LDriw_indexed_shl_cdnNotPt_V4 : + case Hexagon::ADD_ri_cNotPt : + case Hexagon::ADD_ri_cdnNotPt : + case Hexagon::ADD_rr_cNotPt : + case Hexagon::ADD_rr_cdnNotPt : + case Hexagon::XOR_rr_cNotPt : + case Hexagon::XOR_rr_cdnNotPt : + case Hexagon::AND_rr_cNotPt : + case Hexagon::AND_rr_cdnNotPt : + case Hexagon::OR_rr_cNotPt : + case Hexagon::OR_rr_cdnNotPt : + case Hexagon::SUB_rr_cNotPt : + case Hexagon::SUB_rr_cdnNotPt : + case Hexagon::COMBINE_rr_cNotPt : + case Hexagon::COMBINE_rr_cdnNotPt : + case Hexagon::ASLH_cNotPt_V4 : + case Hexagon::ASLH_cdnNotPt_V4 : + case Hexagon::ASRH_cNotPt_V4 : + case Hexagon::ASRH_cdnNotPt_V4 : + case Hexagon::SXTB_cNotPt_V4 : + case Hexagon::SXTB_cdnNotPt_V4 : + case Hexagon::SXTH_cNotPt_V4 : + case Hexagon::SXTH_cdnNotPt_V4 : + case Hexagon::ZXTB_cNotPt_V4 : + case Hexagon::ZXTB_cdnNotPt_V4 : + case Hexagon::ZXTH_cNotPt_V4 : + case Hexagon::ZXTH_cdnNotPt_V4 : + + case Hexagon::LDrid_GP_cNotPt_V4 : + case Hexagon::LDrib_GP_cNotPt_V4 : + case Hexagon::LDriub_GP_cNotPt_V4 : + case Hexagon::LDrih_GP_cNotPt_V4 : + case Hexagon::LDriuh_GP_cNotPt_V4 : + case Hexagon::LDriw_GP_cNotPt_V4 : + case Hexagon::LDd_GP_cNotPt_V4 : + case Hexagon::LDb_GP_cNotPt_V4 : + case Hexagon::LDub_GP_cNotPt_V4 : + case Hexagon::LDh_GP_cNotPt_V4 : + case Hexagon::LDuh_GP_cNotPt_V4 : + case Hexagon::LDw_GP_cNotPt_V4 : + case Hexagon::STrid_GP_cNotPt_V4 : + case Hexagon::STrib_GP_cNotPt_V4 : + case Hexagon::STrih_GP_cNotPt_V4 : + case Hexagon::STriw_GP_cNotPt_V4 : + case Hexagon::STd_GP_cNotPt_V4 : + case Hexagon::STb_GP_cNotPt_V4 : + case Hexagon::STh_GP_cNotPt_V4 : + case Hexagon::STw_GP_cNotPt_V4 : + case Hexagon::LDrid_GP_cdnNotPt_V4 : + case Hexagon::LDrib_GP_cdnNotPt_V4 : + case Hexagon::LDriub_GP_cdnNotPt_V4 : + case Hexagon::LDrih_GP_cdnNotPt_V4 : + case Hexagon::LDriuh_GP_cdnNotPt_V4 : + case Hexagon::LDriw_GP_cdnNotPt_V4 : + case Hexagon::LDd_GP_cdnNotPt_V4 : + case Hexagon::LDb_GP_cdnNotPt_V4 : + case Hexagon::LDub_GP_cdnNotPt_V4 : + case Hexagon::LDh_GP_cdnNotPt_V4 : + case Hexagon::LDuh_GP_cdnNotPt_V4 : + case Hexagon::LDw_GP_cdnNotPt_V4 : + case Hexagon::STrid_GP_cdnNotPt_V4 : + case Hexagon::STrib_GP_cdnNotPt_V4 : + case Hexagon::STrih_GP_cdnNotPt_V4 : + case Hexagon::STriw_GP_cdnNotPt_V4 : + case Hexagon::STd_GP_cdnNotPt_V4 : + case Hexagon::STb_GP_cdnNotPt_V4 : + case Hexagon::STh_GP_cdnNotPt_V4 : + case Hexagon::STw_GP_cdnNotPt_V4 : + return false; + } + // return *some value* to avoid compiler warning + return false; +} + +bool HexagonPacketizerList::isDotNewInst(MachineInstr* MI) { + if (isNewValueInst(MI)) + return true; + + switch (MI->getOpcode()) { + case Hexagon::TFR_cdnNotPt: + case Hexagon::TFR_cdnPt: + case Hexagon::TFRI_cdnNotPt: + case Hexagon::TFRI_cdnPt: + case Hexagon::LDrid_cdnPt : + case Hexagon::LDrid_cdnNotPt : + case Hexagon::LDrid_indexed_cdnPt : + case Hexagon::LDrid_indexed_cdnNotPt : + case Hexagon::POST_LDrid_cdnPt_V4 : + case Hexagon::POST_LDrid_cdnNotPt_V4 : + case Hexagon::LDriw_cdnPt : + case Hexagon::LDriw_cdnNotPt : + case Hexagon::LDriw_indexed_cdnPt : + case Hexagon::LDriw_indexed_cdnNotPt : + case Hexagon::POST_LDriw_cdnPt_V4 : + case Hexagon::POST_LDriw_cdnNotPt_V4 : + case Hexagon::LDrih_cdnPt : + case Hexagon::LDrih_cdnNotPt : + case Hexagon::LDrih_indexed_cdnPt : + case 
Hexagon::LDrih_indexed_cdnNotPt : + case Hexagon::POST_LDrih_cdnPt_V4 : + case Hexagon::POST_LDrih_cdnNotPt_V4 : + case Hexagon::LDrib_cdnPt : + case Hexagon::LDrib_cdnNotPt : + case Hexagon::LDrib_indexed_cdnPt : + case Hexagon::LDrib_indexed_cdnNotPt : + case Hexagon::POST_LDrib_cdnPt_V4 : + case Hexagon::POST_LDrib_cdnNotPt_V4 : + case Hexagon::LDriuh_cdnPt : + case Hexagon::LDriuh_cdnNotPt : + case Hexagon::LDriuh_indexed_cdnPt : + case Hexagon::LDriuh_indexed_cdnNotPt : + case Hexagon::POST_LDriuh_cdnPt_V4 : + case Hexagon::POST_LDriuh_cdnNotPt_V4 : + case Hexagon::LDriub_cdnPt : + case Hexagon::LDriub_cdnNotPt : + case Hexagon::LDriub_indexed_cdnPt : + case Hexagon::LDriub_indexed_cdnNotPt : + case Hexagon::POST_LDriub_cdnPt_V4 : + case Hexagon::POST_LDriub_cdnNotPt_V4 : + + case Hexagon::LDrid_indexed_cdnPt_V4 : + case Hexagon::LDrid_indexed_cdnNotPt_V4 : + case Hexagon::LDrid_indexed_shl_cdnPt_V4 : + case Hexagon::LDrid_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDrib_indexed_cdnPt_V4 : + case Hexagon::LDrib_indexed_cdnNotPt_V4 : + case Hexagon::LDrib_indexed_shl_cdnPt_V4 : + case Hexagon::LDrib_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDriub_indexed_cdnPt_V4 : + case Hexagon::LDriub_indexed_cdnNotPt_V4 : + case Hexagon::LDriub_indexed_shl_cdnPt_V4 : + case Hexagon::LDriub_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDrih_indexed_cdnPt_V4 : + case Hexagon::LDrih_indexed_cdnNotPt_V4 : + case Hexagon::LDrih_indexed_shl_cdnPt_V4 : + case Hexagon::LDrih_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDriuh_indexed_cdnPt_V4 : + case Hexagon::LDriuh_indexed_cdnNotPt_V4 : + case Hexagon::LDriuh_indexed_shl_cdnPt_V4 : + case Hexagon::LDriuh_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDriw_indexed_cdnPt_V4 : + case Hexagon::LDriw_indexed_cdnNotPt_V4 : + case Hexagon::LDriw_indexed_shl_cdnPt_V4 : + case Hexagon::LDriw_indexed_shl_cdnNotPt_V4 : + +// Conditional add + case Hexagon::ADD_ri_cdnPt: + case Hexagon::ADD_ri_cdnNotPt: + case Hexagon::ADD_rr_cdnPt: + case Hexagon::ADD_rr_cdnNotPt: + + // Conditional logical operations + case Hexagon::XOR_rr_cdnPt : + case Hexagon::XOR_rr_cdnNotPt : + case Hexagon::AND_rr_cdnPt : + case Hexagon::AND_rr_cdnNotPt : + case Hexagon::OR_rr_cdnPt : + case Hexagon::OR_rr_cdnNotPt : + + // Conditional subtract + case Hexagon::SUB_rr_cdnPt : + case Hexagon::SUB_rr_cdnNotPt : + + // Conditional combine + case Hexagon::COMBINE_rr_cdnPt : + case Hexagon::COMBINE_rr_cdnNotPt : + + // Conditional shift operations + case Hexagon::ASLH_cdnPt_V4: + case Hexagon::ASLH_cdnNotPt_V4: + case Hexagon::ASRH_cdnPt_V4: + case Hexagon::ASRH_cdnNotPt_V4: + case Hexagon::SXTB_cdnPt_V4: + case Hexagon::SXTB_cdnNotPt_V4: + case Hexagon::SXTH_cdnPt_V4: + case Hexagon::SXTH_cdnNotPt_V4: + case Hexagon::ZXTB_cdnPt_V4: + case Hexagon::ZXTB_cdnNotPt_V4: + case Hexagon::ZXTH_cdnPt_V4: + case Hexagon::ZXTH_cdnNotPt_V4: + + // Conditional stores + case Hexagon::STrib_imm_cdnPt_V4 : + case Hexagon::STrib_imm_cdnNotPt_V4 : + case Hexagon::STrib_cdnPt_V4 : + case Hexagon::STrib_cdnNotPt_V4 : + case Hexagon::STrib_indexed_cdnPt_V4 : + case Hexagon::STrib_indexed_cdnNotPt_V4 : + case Hexagon::POST_STbri_cdnPt_V4 : + case Hexagon::POST_STbri_cdnNotPt_V4 : + case Hexagon::STrib_indexed_shl_cdnPt_V4 : + case Hexagon::STrib_indexed_shl_cdnNotPt_V4 : + + // Store doubleword conditionally + case Hexagon::STrid_indexed_cdnPt_V4 : + case Hexagon::STrid_indexed_cdnNotPt_V4 : + case Hexagon::STrid_indexed_shl_cdnPt_V4 : + case Hexagon::STrid_indexed_shl_cdnNotPt_V4 : + case Hexagon::POST_STdri_cdnPt_V4 : + case 
Hexagon::POST_STdri_cdnNotPt_V4 : + + // Store halfword conditionally + case Hexagon::STrih_cdnPt_V4 : + case Hexagon::STrih_cdnNotPt_V4 : + case Hexagon::STrih_indexed_cdnPt_V4 : + case Hexagon::STrih_indexed_cdnNotPt_V4 : + case Hexagon::STrih_imm_cdnPt_V4 : + case Hexagon::STrih_imm_cdnNotPt_V4 : + case Hexagon::STrih_indexed_shl_cdnPt_V4 : + case Hexagon::STrih_indexed_shl_cdnNotPt_V4 : + case Hexagon::POST_SThri_cdnPt_V4 : + case Hexagon::POST_SThri_cdnNotPt_V4 : + + // Store word conditionally + case Hexagon::STriw_cdnPt_V4 : + case Hexagon::STriw_cdnNotPt_V4 : + case Hexagon::STriw_indexed_cdnPt_V4 : + case Hexagon::STriw_indexed_cdnNotPt_V4 : + case Hexagon::STriw_imm_cdnPt_V4 : + case Hexagon::STriw_imm_cdnNotPt_V4 : + case Hexagon::STriw_indexed_shl_cdnPt_V4 : + case Hexagon::STriw_indexed_shl_cdnNotPt_V4 : + case Hexagon::POST_STwri_cdnPt_V4 : + case Hexagon::POST_STwri_cdnNotPt_V4 : + + case Hexagon::LDd_GP_cdnPt_V4: + case Hexagon::LDd_GP_cdnNotPt_V4: + case Hexagon::LDb_GP_cdnPt_V4: + case Hexagon::LDb_GP_cdnNotPt_V4: + case Hexagon::LDub_GP_cdnPt_V4: + case Hexagon::LDub_GP_cdnNotPt_V4: + case Hexagon::LDh_GP_cdnPt_V4: + case Hexagon::LDh_GP_cdnNotPt_V4: + case Hexagon::LDuh_GP_cdnPt_V4: + case Hexagon::LDuh_GP_cdnNotPt_V4: + case Hexagon::LDw_GP_cdnPt_V4: + case Hexagon::LDw_GP_cdnNotPt_V4: + case Hexagon::LDrid_GP_cdnPt_V4: + case Hexagon::LDrid_GP_cdnNotPt_V4: + case Hexagon::LDrib_GP_cdnPt_V4: + case Hexagon::LDrib_GP_cdnNotPt_V4: + case Hexagon::LDriub_GP_cdnPt_V4: + case Hexagon::LDriub_GP_cdnNotPt_V4: + case Hexagon::LDrih_GP_cdnPt_V4: + case Hexagon::LDrih_GP_cdnNotPt_V4: + case Hexagon::LDriuh_GP_cdnPt_V4: + case Hexagon::LDriuh_GP_cdnNotPt_V4: + case Hexagon::LDriw_GP_cdnPt_V4: + case Hexagon::LDriw_GP_cdnNotPt_V4: + + case Hexagon::STrid_GP_cdnPt_V4: + case Hexagon::STrid_GP_cdnNotPt_V4: + case Hexagon::STrib_GP_cdnPt_V4: + case Hexagon::STrib_GP_cdnNotPt_V4: + case Hexagon::STrih_GP_cdnPt_V4: + case Hexagon::STrih_GP_cdnNotPt_V4: + case Hexagon::STriw_GP_cdnPt_V4: + case Hexagon::STriw_GP_cdnNotPt_V4: + case Hexagon::STd_GP_cdnPt_V4: + case Hexagon::STd_GP_cdnNotPt_V4: + case Hexagon::STb_GP_cdnPt_V4: + case Hexagon::STb_GP_cdnNotPt_V4: + case Hexagon::STh_GP_cdnPt_V4: + case Hexagon::STh_GP_cdnNotPt_V4: + case Hexagon::STw_GP_cdnPt_V4: + case Hexagon::STw_GP_cdnNotPt_V4: + return true; + } + return false; +} + +static MachineOperand& GetPostIncrementOperand(MachineInstr *MI, + const HexagonInstrInfo *QII) { + assert(QII->isPostIncrement(MI) && "Not a post increment operation."); +#ifndef NDEBUG + // Post-increment means duplicates. Use a DenseMap to find duplicates in the + // list. Caution: DenseMap initializes with a minimum of 64 buckets, + // whereas there are at most 5 operands in a post-increment instruction. + DenseMap<unsigned, unsigned> DefRegsSet; + for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) + if (MI->getOperand(opNum).isReg() && + MI->getOperand(opNum).isDef()) { + DefRegsSet[MI->getOperand(opNum).getReg()] = 1; + } + + for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) + if (MI->getOperand(opNum).isReg() && + MI->getOperand(opNum).isUse()) { + if (DefRegsSet[MI->getOperand(opNum).getReg()]) { + return MI->getOperand(opNum); + } + } +#else + if (MI->getDesc().mayLoad()) { + // The 2nd operand is always the post-increment operand in a load. 
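// Illustrative operand layouts (an editorial assumption, matching the asserts
// below):
//   post-increment load:  %r0, %r2 = POST_LDriw %r2, #4
//                         -> operand 1 (%r2) is the write-back register
//   post-increment store: %r2 = POST_STwri %r2, #4, %r1
//                         -> operand 0 (%r2) is the write-back register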
+ assert(MI->getOperand(1).isReg() && + "Post increment operand has to be a register."); + return (MI->getOperand(1)); + } + if (MI->getDesc().mayStore()) { + // The 1st operand is always the post-increment operand in a store. + assert(MI->getOperand(0).isReg() && + "Post increment operand has to be a register."); + return (MI->getOperand(0)); + } +#endif + // We should never get here. + llvm_unreachable("mayLoad or mayStore not set for Post Increment operation"); +} + +// Get the value being stored. +static MachineOperand& GetStoreValueOperand(MachineInstr *MI) { + // The value being stored is always the last operand. + return (MI->getOperand(MI->getNumOperands()-1)); +} + +// Can this be a new-value store? +// The following restrictions must be respected when converting a store into +// a new-value store. +// 1. If an instruction uses auto-increment, its address register cannot +// be a new-value register. Arch Spec 5.4.2.1 +// 2. If an instruction uses absolute-set addressing mode, +// its address register cannot be a new-value register. +// Arch Spec 5.4.2.1. TODO: This is not enforced yet, as +// absolute-set addressing mode patterns are not implemented. +// 3. If an instruction produces a 64-bit result, its registers cannot be used +// as new-value registers. Arch Spec 5.4.2.2. +// 4. If the instruction that sets a new-value register is conditional, then +// the instruction that uses the new-value register must also be conditional, +// and both must always have their predicates evaluate identically. +// Arch Spec 5.4.2.3. +// 5. There is an implied restriction that a packet cannot have another store +// if it contains a new-value store. Corollary: if there is +// already a store in a packet, there cannot be a new-value store. +// Arch Spec: 3.4.4.2 +bool HexagonPacketizerList::CanPromoteToNewValueStore( MachineInstr *MI, + MachineInstr *PacketMI, unsigned DepReg, + std::map <MachineInstr*, SUnit*> MIToSUnit) +{ + // Make sure we are looking at a store. + if (!IsNewifyStore(MI)) + return false; + + // Make sure there is a dependency and that it can be new-value'd. + if (GetStoreValueOperand(MI).isReg() && + GetStoreValueOperand(MI).getReg() != DepReg) + return false; + + const HexagonRegisterInfo* QRI = + (const HexagonRegisterInfo *) TM.getRegisterInfo(); + const MCInstrDesc& MCID = PacketMI->getDesc(); + // The first operand is always the result. + + const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; + const TargetRegisterClass* PacketRC = QII->getRegClass(MCID, 0, QRI, MF); + + // If there is already a store in the packet, a new-value store cannot be + // added. Arch Spec 3.4.4.2. + for (std::vector<MachineInstr*>::iterator VI = CurrentPacketMIs.begin(), + VE = CurrentPacketMIs.end(); + (VI != VE); ++VI) { + SUnit* PacketSU = MIToSUnit[*VI]; + if (PacketSU->getInstr()->getDesc().mayStore() || + // if we have mayStore = 1 set on ALLOCFRAME and DEALLOCFRAME, + // then we don't need this + PacketSU->getInstr()->getOpcode() == Hexagon::ALLOCFRAME || + PacketSU->getInstr()->getOpcode() == Hexagon::DEALLOCFRAME) + return false; + } + + if (PacketRC == &Hexagon::DoubleRegsRegClass) { + // New-value store constraint: double regs cannot feed into a new-value + // store. Arch Spec section 5.4.2.2. + return false; + } + + // Make sure it's NOT the post-increment register that we are going to + // new-value. 
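// For example (editorial illustration): if DepReg is r2, then
//   memw(r2++#4) = r2.new
// is illegal, because the auto-incremented address register cannot also be
// the new-value register (restriction 1 above, Arch Spec 5.4.2.1).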
+ if (QII->isPostIncrement(MI) && + MI->getDesc().mayStore() && + GetPostIncrementOperand(MI, QII).getReg() == DepReg) { + return false; + } + + if (QII->isPostIncrement(PacketMI) && + PacketMI->getDesc().mayLoad() && + GetPostIncrementOperand(PacketMI, QII).getReg() == DepReg) { + // If the source uses post-increment or absolute-set addressing, + // it cannot feed into a new-value store: + // r3 = memw(r2++#4) + // memw(r30 + #-1404) = r2.new -> cannot be a new-value store + // Arch Spec section 5.4.2.1 + return false; + } + + // If the source that feeds the store is predicated, the new-value store must + // also be predicated. + if (QII->isPredicated(PacketMI)) { + if (!QII->isPredicated(MI)) + return false; + + // Check to make sure that both will have their predicates + // evaluate identically. + unsigned predRegNumSrc = 0; + unsigned predRegNumDst = 0; + const TargetRegisterClass* predRegClass = NULL; + + // Get the predicate register used in the source instruction. + for(unsigned opNum = 0; opNum < PacketMI->getNumOperands(); opNum++) { + if (PacketMI->getOperand(opNum).isReg()) { + predRegNumSrc = PacketMI->getOperand(opNum).getReg(); + predRegClass = QRI->getMinimalPhysRegClass(predRegNumSrc); + if (predRegClass == &Hexagon::PredRegsRegClass) + break; + } + } + assert ((predRegClass == &Hexagon::PredRegsRegClass ) && + ("predicate register not found in a predicated PacketMI instruction")); + + // Get the predicate register used in the new-value store instruction. + for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) { + if (MI->getOperand(opNum).isReg()) { + predRegNumDst = MI->getOperand(opNum).getReg(); + predRegClass = QRI->getMinimalPhysRegClass(predRegNumDst); + if (predRegClass == &Hexagon::PredRegsRegClass) + break; + } + } + assert ((predRegClass == &Hexagon::PredRegsRegClass ) && + ("predicate register not found in a predicated MI instruction")); + + // The new-value register producer and user (store) need to satisfy these + // constraints: + // 1) Both instructions should be predicated on the same register. + // 2) If the producer of the new-value register is .new predicated, the store + // should also be .new predicated, and if the producer is not .new + // predicated, the store should not be .new predicated. + // 3) Both the new-value register producer and user should have the same + // predicate sense, i.e., either both should be negated or neither should be. + + if (( predRegNumDst != predRegNumSrc) || + isDotNewInst(PacketMI) != isDotNewInst(MI) || + GetPredicateSense(MI, QII) != GetPredicateSense(PacketMI, QII)) { + return false; + } + } + + // Make sure that, other than the new-value register, no other register of the + // store instruction has been modified in the same packet. Predicate registers + // can be modified, but they should not be modified between the producer and + // the store instruction, as that would make them conditional on different + // values. We already know this to be true for all the instructions before and + // including PacketMI. However, we need to perform the check for the + // remaining instructions in the packet. + + std::vector<MachineInstr*>::iterator VI; + std::vector<MachineInstr*>::iterator VE; + unsigned StartCheck = 0; + + for (VI=CurrentPacketMIs.begin(), VE = CurrentPacketMIs.end(); + (VI != VE); ++VI) { + SUnit* TempSU = MIToSUnit[*VI]; + MachineInstr* TempMI = TempSU->getInstr(); + + // The following condition is true for all the instructions until PacketMI + // is reached (StartCheck is set to 0 before the for loop). 
+ // The StartCheck flag is 1 for all the instructions after PacketMI. + if (TempMI != PacketMI && !StartCheck) // start processing only after + continue; // encountering PacketMI + + StartCheck = 1; + if (TempMI == PacketMI) // We don't want to check PacketMI for dependence + continue; + + for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) { + if (MI->getOperand(opNum).isReg() && + TempSU->getInstr()->modifiesRegister(MI->getOperand(opNum).getReg(), + QRI)) + return false; + } + } + + // Make sure that for non-post-increment stores: + // 1. The only use of the register is DepReg and no other registers. + // This handles V4 base+index registers. + // The following store cannot be dot-new, e.g.: + // r0 = add(r0, #3) + // memw(r1+r0<<#2) = r0 + if (!QII->isPostIncrement(MI) && + GetStoreValueOperand(MI).isReg() && + GetStoreValueOperand(MI).getReg() == DepReg) { + for(unsigned opNum = 0; opNum < MI->getNumOperands()-1; opNum++) { + if (MI->getOperand(opNum).isReg() && + MI->getOperand(opNum).getReg() == DepReg) { + return false; + } + } + // 2. If the data definition comes from an implicit definition of the + // register, do not newify the store. E.g.: + // %R9<def> = ZXTH %R12, %D6<imp-use>, %R12<imp-def> + // STrih_indexed %R8, 2, %R12<kill>; mem:ST2[%scevgep343] + for(unsigned opNum = 0; opNum < PacketMI->getNumOperands(); opNum++) { + if (PacketMI->getOperand(opNum).isReg() && + PacketMI->getOperand(opNum).getReg() == DepReg && + PacketMI->getOperand(opNum).isDef() && + PacketMI->getOperand(opNum).isImplicit()) { + return false; + } + } + } + + // Can be a dot-new store. + return true; +} + +// Can this MI be promoted to either a +// new-value store or a new-value jump? +bool HexagonPacketizerList::CanPromoteToNewValue( MachineInstr *MI, + SUnit *PacketSU, unsigned DepReg, + std::map <MachineInstr*, SUnit*> MIToSUnit, + MachineBasicBlock::iterator &MII) +{ + + const HexagonRegisterInfo* QRI = + (const HexagonRegisterInfo *) TM.getRegisterInfo(); + if (!QRI->Subtarget.hasV4TOps() || + !IsNewifyStore(MI)) + return false; + + MachineInstr *PacketMI = PacketSU->getInstr(); + + // Check to see if the store can be new-value'd. + if (CanPromoteToNewValueStore(MI, PacketMI, DepReg, MIToSUnit)) + return true; + + // Whether the compare/jump can be new-value'd is checked in a + // pass of its own; no need to check it here. + return false; +} + +// Check to see if an instruction can be dot-new. + // There are three kinds: +// 1. dot-new on predicate - V2/V3/V4 +// 2. dot-new on stores NV/ST - V4 +// 3. dot-new on jump NV/J - V4 -- this is generated in a pass. +bool HexagonPacketizerList::CanPromoteToDotNew( MachineInstr *MI, + SUnit *PacketSU, unsigned DepReg, + std::map <MachineInstr*, SUnit*> MIToSUnit, + MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC ) +{ + // Already a dot-new instruction. + if (isDotNewInst(MI) && !IsNewifyStore(MI)) + return false; + + if (!isNewifiable(MI)) + return false; + + // predicate .new + if (RC == &Hexagon::PredRegsRegClass && isCondInst(MI)) + return true; + else if (RC != &Hexagon::PredRegsRegClass && + !IsNewifyStore(MI)) // MI is not a new-value store + return false; + else { + // Create a dot-new machine instruction to see if resources can be + // allocated. If not, bail out now. 
+    const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+    int NewOpcode = GetDotNewOp(MI->getOpcode());
+    const MCInstrDesc &desc = QII->get(NewOpcode);
+    DebugLoc dl;
+    MachineInstr *NewMI =
+                    MI->getParent()->getParent()->CreateMachineInstr(desc, dl);
+    bool ResourcesAvailable = ResourceTracker->canReserveResources(NewMI);
+    MI->getParent()->getParent()->DeleteMachineInstr(NewMI);
+
+    if (!ResourcesAvailable)
+      return false;
+
+    // New-value stores only; new-value jumps are generated in a separate
+    // pass.
+    if (!CanPromoteToNewValue(MI, PacketSU, DepReg, MIToSUnit, MII)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Go through the packet instructions and search for an anti-dependency
+// between them and DepReg from MI.
+// Consider this case:
+// Trying to add
+// a) %R1<def> = TFRI_cdNotPt %P3, 2
+// to this packet:
+// {
+//   b) %P0<def> = OR_pp %P3<kill>, %P0<kill>
+//   c) %P3<def> = TFR_PdRs %R23
+//   d) %R1<def> = TFRI_cdnPt %P3, 4
+// }
+// The P3 from a) and d) will be complements after
+// a)'s P3 is converted to .new form.
+// The anti-dependency between c) and b) is irrelevant for this case.
+bool HexagonPacketizerList::RestrictingDepExistInPacket (MachineInstr* MI,
+      unsigned DepReg,
+      std::map <MachineInstr*, SUnit*> MIToSUnit) {
+
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  SUnit* PacketSUDep = MIToSUnit[MI];
+
+  for (std::vector<MachineInstr*>::iterator VIN = CurrentPacketMIs.begin(),
+       VEN = CurrentPacketMIs.end(); (VIN != VEN); ++VIN) {
+
+    // We only care about dependencies on predicated instructions.
+    if (!QII->isPredicated(*VIN)) continue;
+
+    // Scheduling unit for the current insn in the packet.
+    SUnit* PacketSU = MIToSUnit[*VIN];
+
+    // Look at dependencies between the current members of the packet and
+    // the predicate-defining instruction MI. Make sure that the dependency
+    // is on the exact register we care about.
+    if (PacketSU->isSucc(PacketSUDep)) {
+      for (unsigned i = 0; i < PacketSU->Succs.size(); ++i) {
+        if ((PacketSU->Succs[i].getSUnit() == PacketSUDep) &&
+            (PacketSU->Succs[i].getKind() == SDep::Anti) &&
+            (PacketSU->Succs[i].getReg() == DepReg)) {
+          return true;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+
+// Given two predicated instructions, this function detects whether
+// the predicates are complements.
+bool HexagonPacketizerList::ArePredicatesComplements (MachineInstr* MI1,
+     MachineInstr* MI2, std::map <MachineInstr*, SUnit*> MIToSUnit) {
+
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  // Currently we can only reason about conditional transfers.
+  if (!QII->isConditionalTransfer(MI1) || !QII->isConditionalTransfer(MI2)) {
+    return false;
+  }
+
+  // Scheduling unit for the candidate.
+  SUnit* SU = MIToSUnit[MI1];
+
+  // One corner case deals with the following scenario:
+  // Trying to add
+  // a) %R24<def> = TFR_cPt %P0, %R25
+  // to this packet:
+  //
+  // {
+  //   b) %R25<def> = TFR_cNotPt %P0, %R24
+  //   c) %P0<def> = CMPEQri %R26, 1
+  // }
+  //
+  // On a general check a) and b) are complements, but the presence of c)
+  // will convert a) to .new form, and then it is no longer a complement.
+  // We attempt to detect this by analyzing the existing dependencies in
+  // the packet.
+
+  // Analyze relationships between all existing members of the packet.
+  // Look for an anti-dependency on the same predicate register as the one
+  // used in the candidate.
+  for (std::vector<MachineInstr*>::iterator VIN = CurrentPacketMIs.begin(),
+       VEN = CurrentPacketMIs.end(); (VIN != VEN); ++VIN) {
+
+    // Scheduling unit for the current insn in the packet.
+    SUnit* PacketSU = MIToSUnit[*VIN];
+
+    // If this instruction in the packet is succeeded by the candidate...
+    if (PacketSU->isSucc(SU)) {
+      for (unsigned i = 0; i < PacketSU->Succs.size(); ++i) {
+        // The corner case exists when there is a true data dependency
+        // between the candidate and one of the current packet members,
+        // this dependency is on a predicate register, and there already
+        // exists an anti-dependency on the same predicate in the packet.
+        if (PacketSU->Succs[i].getSUnit() == SU &&
+            Hexagon::PredRegsRegClass.contains(
+              PacketSU->Succs[i].getReg()) &&
+            PacketSU->Succs[i].getKind() == SDep::Data &&
+            // At this point we know that *VIN is a predicate-setting
+            // instruction with a true data dependency to the candidate on
+            // the register we care about - c) in the above example.
+            // Now we need to see if there is an anti-dependency from c) to
+            // any other instruction in the same packet on the predicate
+            // register of interest.
+            RestrictingDepExistInPacket(*VIN, PacketSU->Succs[i].getReg(),
+                                        MIToSUnit)) {
+          return false;
+        }
+      }
+    }
+  }
+
+  // If the above case does not apply, check the regular complement
+  // condition: the predicate register must be the same and the predicate
+  // sense must be different.
+  // We also need to differentiate .old vs. .new:
+  // !p0 is not complementary to p0.new.
+  return ((MI1->getOperand(1).getReg() == MI2->getOperand(1).getReg()) &&
+          (GetPredicateSense(MI1, QII) != GetPredicateSense(MI2, QII)) &&
+          (isDotNewInst(MI1) == isDotNewInst(MI2)));
+}
+
+// initPacketizerState - Initialize packetizer flags.
+void HexagonPacketizerList::initPacketizerState() {
+
+  Dependence = false;
+  PromotedToDotNew = false;
+  GlueToNewValueJump = false;
+  GlueAllocframeStore = false;
+  FoundSequentialDependence = false;
+
+  return;
+}
+
+// ignorePseudoInstruction - Ignore bundling of pseudo instructions.
+bool HexagonPacketizerList::ignorePseudoInstruction(MachineInstr *MI,
+                                                    MachineBasicBlock *MBB) {
+  if (MI->isDebugValue())
+    return true;
+
+  // We must print out inline assembly.
+  if (MI->isInlineAsm())
+    return false;
+
+  // We check if MI has any functional units mapped to it.
+  // If it doesn't, we ignore the instruction.
+  const MCInstrDesc& TID = MI->getDesc();
+  unsigned SchedClass = TID.getSchedClass();
+  const InstrStage* IS =
+                    ResourceTracker->getInstrItins()->beginStage(SchedClass);
+  unsigned FuncUnits = IS->getUnits();
+  return !FuncUnits;
+}
+
+// isSoloInstruction - Returns true for instructions that must be
+// scheduled in their own packet.
+bool HexagonPacketizerList::isSoloInstruction(MachineInstr *MI) {
+
+  if (MI->isInlineAsm())
+    return true;
+
+  if (MI->isEHLabel())
+    return true;
+
+  // From the Hexagon V4 Programmer's Reference Manual, 3.4.4 Grouping
+  // constraints: trap, pause, barrier, icinva, isync, and syncht are solo
+  // instructions. They must not be grouped with other instructions in a
+  // packet.
+  if (IsSchedBarrier(MI))
+    return true;
+
+  return false;
+}
+
+// isLegalToPacketizeTogether:
+// SUI is the current instruction that is outside of the current packet.
+// SUJ is the current instruction inside the current packet against which
+// SUI will be packetized.
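+// For example (illustrative, not taken from the arch spec): with
+//   r2 = memw(r3+#0)
+// already inside the packet (SUJ) and the candidate (SUI) being
+//   memw(r4+#0) = r2
+// the true dependence on r2 only permits packetizing the pair if the store
+// can be promoted to the new-value form "memw(r4+#0) = r2.new".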
+bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
+  MachineInstr *I = SUI->getInstr();
+  MachineInstr *J = SUJ->getInstr();
+  assert(I && J && "Unable to packetize null instruction!");
+
+  const MCInstrDesc &MCIDI = I->getDesc();
+  const MCInstrDesc &MCIDJ = J->getDesc();
+
+  MachineBasicBlock::iterator II = I;
+
+  const unsigned FrameSize = MF.getFrameInfo()->getStackSize();
+  const HexagonRegisterInfo* QRI =
+                      (const HexagonRegisterInfo *) TM.getRegisterInfo();
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+
+  // Inline asm cannot go in the packet.
+  if (I->getOpcode() == Hexagon::INLINEASM)
+    llvm_unreachable("Should not meet inline asm here!");
+
+  if (isSoloInstruction(I))
+    llvm_unreachable("Should not meet solo instr here!");
+
+  // A call that saves the callee-saved registers can only be in a packet
+  // with instructions that don't write to the callee-saved registers.
+  if ((QII->isSaveCalleeSavedRegsCall(I) &&
+       DoesModifyCalleeSavedReg(J, QRI)) ||
+      (QII->isSaveCalleeSavedRegsCall(J) &&
+       DoesModifyCalleeSavedReg(I, QRI))) {
+    Dependence = true;
+    return false;
+  }
+
+  // Two control flow instructions cannot go in the same packet.
+  if (IsControlFlow(I) && IsControlFlow(J)) {
+    Dependence = true;
+    return false;
+  }
+
+  // A LoopN instruction cannot appear in the same packet as a jump or call.
+  if (IsLoopN(I) && (   IsDirectJump(J)
+                     || MCIDJ.isCall()
+                     || QII->isDeallocRet(J))) {
+    Dependence = true;
+    return false;
+  }
+  if (IsLoopN(J) && (   IsDirectJump(I)
+                     || MCIDI.isCall()
+                     || QII->isDeallocRet(I))) {
+    Dependence = true;
+    return false;
+  }
+
+  // dealloc_return cannot appear in the same packet as a conditional or
+  // unconditional jump.
+  if (QII->isDeallocRet(I) && (   MCIDJ.isBranch()
+                               || MCIDJ.isCall()
+                               || MCIDJ.isBarrier())) {
+    Dependence = true;
+    return false;
+  }
+
+
+  // V4 allows dual stores, but it does not allow a second store if the
+  // first store is not in SLOT0. New-value stores, new-value jumps,
+  // dealloc_return and memops always take SLOT0.
+  // Arch spec 3.4.4.2
+  if (QRI->Subtarget.hasV4TOps()) {
+
+    if (MCIDI.mayStore() && MCIDJ.mayStore() && isNewValueInst(J)) {
+      Dependence = true;
+      return false;
+    }
+
+    if (   (QII->isMemOp(J) && MCIDI.mayStore())
+        || (MCIDJ.mayStore() && QII->isMemOp(I))
+        || (QII->isMemOp(J) && QII->isMemOp(I))) {
+      Dependence = true;
+      return false;
+    }
+
+    // A dealloc_return cannot be packetized with a store.
+    if (MCIDJ.mayStore() && QII->isDeallocRet(I)) {
+      Dependence = true;
+      return false;
+    }
+
+    // If an instruction feeds a new-value jump, glue them together.
+    MachineBasicBlock::iterator NextMII = I;
+    ++NextMII;
+    MachineInstr *NextMI = NextMII;
+
+    if (QII->isNewValueJump(NextMI)) {
+
+      bool secondRegMatch = false;
+      bool maintainNewValueJump = false;
+
+      if (NextMI->getOperand(1).isReg() &&
+          I->getOperand(0).getReg() == NextMI->getOperand(1).getReg()) {
+        secondRegMatch = true;
+        maintainNewValueJump = true;
+      }
+
+      if (!secondRegMatch &&
+          I->getOperand(0).getReg() == NextMI->getOperand(0).getReg()) {
+        maintainNewValueJump = true;
+      }
+
+      for (std::vector<MachineInstr*>::iterator
+           VI = CurrentPacketMIs.begin(),
+           VE = CurrentPacketMIs.end();
+           (VI != VE && maintainNewValueJump); ++VI) {
+        SUnit* PacketSU = MIToSUnit[*VI];
+
+        // An NVJ cannot be part of a dual jump - Arch Spec: section 7.8.
+        if (PacketSU->getInstr()->getDesc().isCall()) {
+          Dependence = true;
+          break;
+        }
+        // Validate:
+        // 1. The packet does not have a store in it.
+        // 2. If the first operand of the nvj is newified, and the second
+        //    operand is also a reg, the second reg is not defined in the
+        //    same packet.
+        // 3. If the second operand of the nvj is newified (which means the
+        //    first operand is also a reg), the first reg is not defined in
+        //    the same packet.
+        if (PacketSU->getInstr()->getDesc().mayStore() ||
+            PacketSU->getInstr()->getOpcode() == Hexagon::ALLOCFRAME ||
+            // Check #2.
+            (!secondRegMatch && NextMI->getOperand(1).isReg() &&
+             PacketSU->getInstr()->modifiesRegister(
+                               NextMI->getOperand(1).getReg(), QRI)) ||
+            // Check #3.
+            (secondRegMatch &&
+             PacketSU->getInstr()->modifiesRegister(
+                               NextMI->getOperand(0).getReg(), QRI))) {
+          Dependence = true;
+          break;
+        }
+      }
+      if (!Dependence)
+        GlueToNewValueJump = true;
+      else
+        return false;
+    }
+  }
+
+  if (SUJ->isSucc(SUI)) {
+    for (unsigned i = 0;
+         (i < SUJ->Succs.size()) && !FoundSequentialDependence;
+         ++i) {
+
+      if (SUJ->Succs[i].getSUnit() != SUI) {
+        continue;
+      }
+
+      SDep::Kind DepType = SUJ->Succs[i].getKind();
+
+      // For direct calls:
+      // Ignore register dependences for call instructions for
+      // packetization purposes except for those due to r31 and
+      // predicate registers.
+      //
+      // For indirect calls:
+      // Same as direct calls + check for true dependences to the register
+      // used in the indirect call.
+      //
+      // We completely ignore Order dependences for call instructions.
+      //
+      // For returns:
+      // Ignore register dependences for return instructions like jumpr,
+      // dealloc return unless we have dependencies on the explicit uses
+      // of the registers used by jumpr (like r31) or dealloc return
+      // (like r29 or r30).
+      //
+      // TODO: Currently, jumpr only handles returns through r31, so the
+      // following logic (specifically IsCallDependent) works fine. Once
+      // jumpr is enabled for registers other than r31, the last part of
+      // that (IsCallDependent) function, where it handles indirect calls,
+      // needs to be reworked. Bug 6216 is opened for this.
+      //
+      unsigned DepReg = 0;
+      const TargetRegisterClass* RC = NULL;
+      if (DepType == SDep::Data) {
+        DepReg = SUJ->Succs[i].getReg();
+        RC = QRI->getMinimalPhysRegClass(DepReg);
+      }
+      if ((MCIDI.isCall() || MCIDI.isReturn()) &&
+          (!IsRegDependence(DepType) ||
+           !IsCallDependent(I, DepType, SUJ->Succs[i].getReg()))) {
+        /* do nothing */
+      }
+
+      // For instructions that can be promoted to dot-new, try to promote.
+      else if ((DepType == SDep::Data) &&
+               CanPromoteToDotNew(I, SUJ, DepReg, MIToSUnit, II, RC) &&
+               PromoteToDotNew(I, DepType, II, RC)) {
+        PromotedToDotNew = true;
+        /* do nothing */
+      }
+
+      else if ((DepType == SDep::Data) &&
+               (QII->isNewValueJump(I))) {
+        /* do nothing */
+      }
+
+      // For predicated instructions, if the predicates are complements,
+      // then there can be no dependence.
+      else if (QII->isPredicated(I) &&
+               QII->isPredicated(J) &&
+               ArePredicatesComplements(I, J, MIToSUnit)) {
+        /* do nothing */
+
+      }
+      else if (IsDirectJump(I) &&
+               !MCIDJ.isBranch() &&
+               !MCIDJ.isCall() &&
+               (DepType == SDep::Order)) {
+        // Ignore Order dependences between unconditional direct branches
+        // and non-control-flow instructions.
+        /* do nothing */
+      }
+      else if (MCIDI.isConditionalBranch() && (DepType != SDep::Data) &&
+               (DepType != SDep::Output)) {
+        // Ignore all dependences for jumps except for true and output
+        // dependences.
+        /* do nothing */
+      }
+
+      // Ignore output dependences due to superregs.
+      // We can write to two different subregisters of R1:0, for instance,
+      // in the same cycle.
+      //
+      // If neither I nor J defines DepReg, then this is a superfluous
+      // output dependence. The dependence must be of the form:
+      //   R0 = ...
+      //   R1 = ...
+      // and there is an output dependence between the two instructions
+      // with
+      //   DepReg = D0.
+      // We want to ignore these dependences.
+      // Ideally, the dependence constructor should annotate such
+      // dependences. We could then avoid this relatively expensive check.
+      //
+      else if (DepType == SDep::Output) {
+        // DepReg is the register that's responsible for the dependence.
+        unsigned DepReg = SUJ->Succs[i].getReg();
+
+        // Check if I and J really define DepReg.
+        if (I->definesRegister(DepReg) ||
+            J->definesRegister(DepReg)) {
+          FoundSequentialDependence = true;
+          break;
+        }
+      }
+
+      // We ignore Order dependences for
+      // 1. Two loads, unless they are volatile.
+      // 2. Two stores in V4, unless they are volatile.
+      else if ((DepType == SDep::Order) &&
+               !I->hasVolatileMemoryRef() &&
+               !J->hasVolatileMemoryRef()) {
+        if (QRI->Subtarget.hasV4TOps() &&
+            // hexagonv4 allows dual store.
+            MCIDI.mayStore() && MCIDJ.mayStore()) {
+          /* do nothing */
+        }
+        // store followed by store -- not OK on V2
+        // store followed by load  -- not OK on all (OK if addresses
+        // are not aliased)
+        // load followed by store  -- OK on all
+        // load followed by load   -- OK on all
+        else if ( !MCIDJ.mayStore()) {
+          /* do nothing */
+        }
+        else {
+          FoundSequentialDependence = true;
+          break;
+        }
+      }
+
+      // For V4, special-case ALLOCFRAME. Even though there is a dependency
+      // between ALLOCFRAME and the subsequent store, allow them to be
+      // packetized in the same packet. This implies that the store is using
+      // the caller's SP. Hence, the offset needs to be updated accordingly.
+      else if (DepType == SDep::Data
+               && QRI->Subtarget.hasV4TOps()
+               && J->getOpcode() == Hexagon::ALLOCFRAME
+               && (I->getOpcode() == Hexagon::STrid
+                   || I->getOpcode() == Hexagon::STriw
+                   || I->getOpcode() == Hexagon::STrib)
+               && I->getOperand(0).getReg() == QRI->getStackRegister()
+               && QII->isValidOffset(I->getOpcode(),
+                                     I->getOperand(1).getImm() -
+                                     (FrameSize + HEXAGON_LRFP_SIZE)))
+      {
+        GlueAllocframeStore = true;
+        // Since this store is to be glued with allocframe in the same
+        // packet, it will use the SP of the previous stack frame, i.e.,
+        // the caller's SP. Therefore, we need to recalculate the offset
+        // according to this change.
+        I->getOperand(1).setImm(I->getOperand(1).getImm() -
+                                (FrameSize + HEXAGON_LRFP_SIZE));
+      }
+
+      //
+      // Skip over anti-dependences. Two instructions that are
+      // anti-dependent can share a packet.
+      //
+      else if (DepType != SDep::Anti) {
+        FoundSequentialDependence = true;
+        break;
+      }
+    }
+
+    if (FoundSequentialDependence) {
+      Dependence = true;
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// isLegalToPruneDependencies
+bool HexagonPacketizerList::isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {
+  MachineInstr *I = SUI->getInstr();
+  assert(I && SUJ->getInstr() && "Unable to packetize null instruction!");
+
+  const unsigned FrameSize = MF.getFrameInfo()->getStackSize();
+
+  if (Dependence) {
+
+    // Check if the instruction was promoted to a dot-new. If so, demote it
+    // back into a dot-old.
+    if (PromotedToDotNew) {
+      DemoteToDotOld(I);
+    }
+
+    // Check if the instruction (it must be a store) was glued with an
+    // allocframe instruction. If so, restore its offset to its original
+    // value, i.e., use the current SP instead of the caller's SP.
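+    // (Illustrative arithmetic: if the store's original offset was #8 and
+    //  FrameSize + HEXAGON_LRFP_SIZE == 24, packetization rewrote the
+    //  offset to #-16; adding the 24 back below restores #8.)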
+    if (GlueAllocframeStore) {
+      I->getOperand(1).setImm(I->getOperand(1).getImm() +
+                              FrameSize + HEXAGON_LRFP_SIZE);
+    }
+
+    return false;
+  }
+  return true;
+}
+
+MachineBasicBlock::iterator
+HexagonPacketizerList::addToPacket(MachineInstr *MI) {
+
+  MachineBasicBlock::iterator MII = MI;
+  MachineBasicBlock *MBB = MI->getParent();
+
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+
+  if (GlueToNewValueJump) {
+
+    ++MII;
+    MachineInstr *nvjMI = MII;
+    assert(ResourceTracker->canReserveResources(MI));
+    ResourceTracker->reserveResources(MI);
+    if (QII->isExtended(MI) &&
+        !tryAllocateResourcesForConstExt(MI)) {
+      endPacket(MBB, MI);
+      ResourceTracker->reserveResources(MI);
+      assert(canReserveResourcesForConstExt(MI) &&
+             "Ensure that there is a slot");
+      reserveResourcesForConstExt(MI);
+      // Reserve resources for the new-value jump constant extender.
+      assert(canReserveResourcesForConstExt(MI) &&
+             "Ensure that there is a slot");
+      reserveResourcesForConstExt(nvjMI);
+      assert(ResourceTracker->canReserveResources(nvjMI) &&
+             "Ensure that there is a slot");
+
+    } else if (   // An extended instruction takes two slots in the packet.
+                  // Try to reserve and allocate 4 bytes in the current
+                  // packet first.
+               (QII->isExtended(nvjMI)
+                && (!tryAllocateResourcesForConstExt(nvjMI)
+                    || !ResourceTracker->canReserveResources(nvjMI)))
+               || // For a non-extended instruction, there is no need to
+                  // allocate an extra 4 bytes.
+               (!QII->isExtended(nvjMI) &&
+                !ResourceTracker->canReserveResources(nvjMI)))
+    {
+      endPacket(MBB, MI);
+      // A new and empty packet starts.
+      // We are sure that the resource requirements can be satisfied.
+      // Therefore, we do not need to call "canReserveResources" anymore.
+      ResourceTracker->reserveResources(MI);
+      if (QII->isExtended(nvjMI))
+        reserveResourcesForConstExt(nvjMI);
+    }
+    // Here, we are sure that "reserveResources" will succeed.
+    ResourceTracker->reserveResources(nvjMI);
+    CurrentPacketMIs.push_back(MI);
+    CurrentPacketMIs.push_back(nvjMI);
+  } else {
+    if (   QII->isExtended(MI)
+        && (   !tryAllocateResourcesForConstExt(MI)
+            || !ResourceTracker->canReserveResources(MI)))
+    {
+      endPacket(MBB, MI);
+      // Check if the instruction was promoted to a dot-new. If so, demote
+      // it back into a dot-old.
+      if (PromotedToDotNew) {
+        DemoteToDotOld(MI);
+      }
+      reserveResourcesForConstExt(MI);
+    }
+    // In case "MI" is not an extended instruction, the resource
+    // availability has already been checked.
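+    // (Illustrative, following the "extended instruction takes two slots"
+    //  note above: an insn such as "r0 = memw(r1+##0x12345678)" occupies
+    //  one slot itself plus one for its immediate extender, so only two of
+    //  a packet's four slots remain for other instructions.)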
+    ResourceTracker->reserveResources(MI);
+    CurrentPacketMIs.push_back(MI);
+  }
+  return MII;
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonPacketizer() {
+  return new HexagonPacketizer();
+}
+
diff --git a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
index ef36881..035afe8 100644
--- a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
+++ b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
@@ -15,6 +15,7 @@
 #include "Hexagon.h"
 #include "HexagonAsmPrinter.h"
 #include "HexagonInstPrinter.h"
+#include "HexagonMCInst.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
@@ -37,20 +38,50 @@ StringRef HexagonInstPrinter::getRegName(unsigned RegNo) const {
 void HexagonInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
                                    StringRef Annot) {
+  printInst((const HexagonMCInst*)(MI), O, Annot);
+}
+
+void HexagonInstPrinter::printInst(const HexagonMCInst *MI, raw_ostream &O,
+                                   StringRef Annot) {
   const char packetPadding[] = "      ";
   const char startPacket = '{', endPacket = '}';
   // TODO: add outer HW loop when it's supported too.
   if (MI->getOpcode() == Hexagon::ENDLOOP0) {
-    MCInst Nop;
+    // Ending a hardware loop is different from ending a regular packet.
+    assert(MI->isEndPacket() && "Loop end must also end the packet");
+
+    if (MI->isStartPacket()) {
+      // There must be a packet to end a loop.
+      // FIXME: when shuffling is always run, this shouldn't be needed.
+      HexagonMCInst Nop;
+      StringRef NoAnnot;
 
-    O << packetPadding << startPacket << '\n';
-    Nop.setOpcode(Hexagon::NOP);
-    printInstruction(&Nop, O);
-    O << packetPadding << endPacket;
+      Nop.setOpcode (Hexagon::NOP);
+      Nop.setStartPacket (MI->isStartPacket());
+      printInst (&Nop, O, NoAnnot);
+    }
+
+    // Close the packet.
+    if (MI->isEndPacket())
+      O << packetPadding << endPacket;
+
+    printInstruction(MI, O);
+  }
+  else {
+    // Prefix the insn opening the packet.
+    if (MI->isStartPacket())
+      O << packetPadding << startPacket << '\n';
+
+    printInstruction(MI, O);
+
+    // Suffix the insn closing the packet.
+    if (MI->isEndPacket())
+      // Always suffix the packet with the closing brace on a new line,
+      // since the GNU assembler has issues with a closing brace on the
+      // same line as CONST{32,64}.
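+      // (Illustrative rendering: the packet is printed as
+      //    {
+      //      r0 = CONST32(#value)
+      //    }
+      //  with the brace on its own line, never appended to the CONST32 or
+      //  CONST64 expansion itself.)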
+ O << '\n' << packetPadding << endPacket; } - printInstruction(MI, O); printAnnotation(O, Annot); } @@ -65,22 +96,22 @@ void HexagonInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, } else if(MO.isImm()) { printImmOperand(MI, OpNo, O); } else { - assert(false && "Unknown operand"); + llvm_unreachable("Unknown operand"); } } -void HexagonInstPrinter::printImmOperand - (const MCInst *MI, unsigned OpNo, raw_ostream &O) const { +void HexagonInstPrinter::printImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const { O << MI->getOperand(OpNo).getImm(); } void HexagonInstPrinter::printExtOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { + raw_ostream &O) const { O << MI->getOperand(OpNo).getImm(); } -void HexagonInstPrinter::printUnsignedImmOperand - (const MCInst *MI, unsigned OpNo, raw_ostream &O) const { +void HexagonInstPrinter::printUnsignedImmOperand(const MCInst *MI, + unsigned OpNo, raw_ostream &O) const { O << MI->getOperand(OpNo).getImm(); } @@ -89,13 +120,13 @@ void HexagonInstPrinter::printNegImmOperand(const MCInst *MI, unsigned OpNo, O << -MI->getOperand(OpNo).getImm(); } -void HexagonInstPrinter::printNOneImmOperand - (const MCInst *MI, unsigned OpNo, raw_ostream &O) const { +void HexagonInstPrinter::printNOneImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const { O << -1; } -void HexagonInstPrinter::printMEMriOperand - (const MCInst *MI, unsigned OpNo, raw_ostream &O) const { +void HexagonInstPrinter::printMEMriOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const { const MCOperand& MO0 = MI->getOperand(OpNo); const MCOperand& MO1 = MI->getOperand(OpNo + 1); @@ -103,8 +134,8 @@ void HexagonInstPrinter::printMEMriOperand O << " + #" << MO1.getImm(); } -void HexagonInstPrinter::printFrameIndexOperand - (const MCInst *MI, unsigned OpNo, raw_ostream &O) const { +void HexagonInstPrinter::printFrameIndexOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const { const MCOperand& MO0 = MI->getOperand(OpNo); const MCOperand& MO1 = MI->getOperand(OpNo + 1); @@ -113,24 +144,21 @@ void HexagonInstPrinter::printFrameIndexOperand void HexagonInstPrinter::printGlobalOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const { - const MCOperand& MO = MI->getOperand(OpNo); - assert(MO.isExpr() && "Expecting expression"); + assert(MI->getOperand(OpNo).isExpr() && "Expecting expression"); printOperand(MI, OpNo, O); } void HexagonInstPrinter::printJumpTable(const MCInst *MI, unsigned OpNo, raw_ostream &O) const { - const MCOperand& MO = MI->getOperand(OpNo); - assert(MO.isExpr() && "Expecting expression"); + assert(MI->getOperand(OpNo).isExpr() && "Expecting expression"); printOperand(MI, OpNo, O); } void HexagonInstPrinter::printConstantPool(const MCInst *MI, unsigned OpNo, raw_ostream &O) const { - const MCOperand& MO = MI->getOperand(OpNo); - assert(MO.isExpr() && "Expecting expression"); + assert(MI->getOperand(OpNo).isExpr() && "Expecting expression"); printOperand(MI, OpNo, O); } diff --git a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h index dad4334..902a323 100644 --- a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h +++ b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h @@ -14,6 +14,7 @@ #ifndef HEXAGONINSTPRINTER_H #define HEXAGONINSTPRINTER_H +#include "HexagonMCInst.h" #include "llvm/MC/MCInstPrinter.h" namespace llvm { @@ -25,6 +26,7 @@ namespace llvm { : MCInstPrinter(MAI, MII, MRI) {} virtual void printInst(const MCInst *MI, raw_ostream &O, 
StringRef Annot); + void printInst(const HexagonMCInst *MI, raw_ostream &O, StringRef Annot); virtual StringRef getOpcodeName(unsigned Opcode) const; void printInstruction(const MCInst *MI, raw_ostream &O); StringRef getRegName(unsigned RegNo) const; @@ -33,16 +35,16 @@ namespace llvm { void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; void printImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; void printExtOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; - void printUnsignedImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; + void printUnsignedImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const; void printNegImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; void printNOneImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; void printMEMriOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; - void printFrameIndexOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; + void printFrameIndexOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const; void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; void printCallOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) @@ -55,7 +57,8 @@ namespace llvm { const; void printJumpTable(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; - void printConstantPool(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; + void printConstantPool(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const; void printSymbolHi(const MCInst *MI, unsigned OpNo, raw_ostream &O) const { printSymbol(MI, OpNo, O, true); } diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h index ed55c3c..7221e90 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h @@ -23,14 +23,41 @@ namespace llvm { /// instruction info tracks. /// namespace HexagonII { - // *** The code below must match HexagonInstrFormat*.td *** // + // Insn types. + // *** Must match HexagonInstrFormat*.td *** + enum Type { + TypePSEUDO = 0, + TypeALU32 = 1, + TypeCR = 2, + TypeJR = 3, + TypeJ = 4, + TypeLD = 5, + TypeST = 6, + TypeSYSTEM = 7, + TypeXTYPE = 8, + TypeMEMOP = 9, + TypeNV = 10, + TypePREFIX = 30, // Such as extenders. + TypeMARKER = 31 // Such as end of a HW loop. + }; + + + // MCInstrDesc TSFlags + // *** Must match HexagonInstrFormat*.td *** enum { + // This 5-bit field describes the insn type. + TypePos = 0, + TypeMask = 0x1f, + + // Solo instructions. + SoloPos = 5, + SoloMask = 0x1, // Predicated instructions. 
- PredicatedPos = 1, + PredicatedPos = 6, PredicatedMask = 0x1 }; diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt index 8ec5673..8995080 100644 --- a/lib/Target/LLVMBuild.txt +++ b/lib/Target/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = ARM CellSPU CppBackend Hexagon MBlaze MSP430 Mips PTX PowerPC Sparc X86 XCore +subdirectories = ARM CellSPU CppBackend Hexagon MBlaze MSP430 NVPTX Mips PowerPC Sparc X86 XCore ; This is a special group whose required libraries are extended (by llvm-build) ; with the best execution engine (the native JIT, if available, or the diff --git a/lib/Target/MBlaze/CMakeLists.txt b/lib/Target/MBlaze/CMakeLists.txt index bf1deef..6c3e8b6 100644 --- a/lib/Target/MBlaze/CMakeLists.txt +++ b/lib/Target/MBlaze/CMakeLists.txt @@ -30,6 +30,8 @@ add_llvm_target(MBlazeCodeGen MBlazeELFWriterInfo.cpp ) +add_dependencies(LLVMMBlazeCodeGen intrinsics_gen) + add_subdirectory(AsmParser) add_subdirectory(Disassembler) add_subdirectory(InstPrinter) diff --git a/lib/Target/MBlaze/MBlaze.td b/lib/Target/MBlaze/MBlaze.td index b4edff0..c288855 100644 --- a/lib/Target/MBlaze/MBlaze.td +++ b/lib/Target/MBlaze/MBlaze.td @@ -50,7 +50,7 @@ def FeatureSqrt : SubtargetFeature<"sqrt", "HasSqrt", "true", // MBlaze processors supported. //===----------------------------------------------------------------------===// -def : Processor<"mblaze", MBlazeGenericItineraries, []>; +def : Processor<"mblaze", NoItineraries, []>; def : Processor<"mblaze3", MBlazePipe3Itineraries, []>; def : Processor<"mblaze5", MBlazePipe5Itineraries, []>; diff --git a/lib/Target/MBlaze/MBlazeAsmPrinter.cpp b/lib/Target/MBlaze/MBlazeAsmPrinter.cpp index 55fffe3..e9f340f 100644 --- a/lib/Target/MBlaze/MBlazeAsmPrinter.cpp +++ b/lib/Target/MBlaze/MBlazeAsmPrinter.cpp @@ -135,7 +135,7 @@ void MBlazeAsmPrinter::printSavedRegsBitmask() { for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); unsigned RegNum = getMBlazeRegisterNumbering(Reg); - if (MBlaze::GPRRegisterClass->contains(Reg)) + if (MBlaze::GPRRegClass.contains(Reg)) CPUBitmask |= (1 << RegNum); } @@ -187,7 +187,7 @@ void MBlazeAsmPrinter::EmitFunctionBodyEnd() { //===----------------------------------------------------------------------===// void MBlazeAsmPrinter::EmitInstruction(const MachineInstr *MI) { - MBlazeMCInstLower MCInstLowering(OutContext, *Mang, *this); + MBlazeMCInstLower MCInstLowering(OutContext, *this); MCInst TmpInst; MCInstLowering.Lower(MI, TmpInst); @@ -200,7 +200,13 @@ PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant,const char *ExtraCode, raw_ostream &O) { // Does this asm operand have a single letter operand modifier? if (ExtraCode && ExtraCode[0]) - return true; // Unknown modifier. + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); + } printOperand(MI, OpNo, O); return false; diff --git a/lib/Target/MBlaze/MBlazeISelLowering.cpp b/lib/Target/MBlaze/MBlazeISelLowering.cpp index edfc335..310c25e 100644 --- a/lib/Target/MBlaze/MBlazeISelLowering.cpp +++ b/lib/Target/MBlaze/MBlazeISelLowering.cpp @@ -62,9 +62,9 @@ MBlazeTargetLowering::MBlazeTargetLowering(MBlazeTargetMachine &TM) setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct? 
// Set up the register classes - addRegisterClass(MVT::i32, MBlaze::GPRRegisterClass); + addRegisterClass(MVT::i32, &MBlaze::GPRRegClass); if (Subtarget->hasFPU()) { - addRegisterClass(MVT::f32, MBlaze::GPRRegisterClass); + addRegisterClass(MVT::f32, &MBlaze::GPRRegClass); setOperationAction(ISD::ConstantFP, MVT::f32, Legal); } @@ -291,12 +291,12 @@ MBlazeTargetLowering::EmitCustomShift(MachineInstr *MI, loop->addSuccessor(finish); loop->addSuccessor(loop); - unsigned IAMT = R.createVirtualRegister(MBlaze::GPRRegisterClass); + unsigned IAMT = R.createVirtualRegister(&MBlaze::GPRRegClass); BuildMI(MBB, dl, TII->get(MBlaze::ANDI), IAMT) .addReg(MI->getOperand(2).getReg()) .addImm(31); - unsigned IVAL = R.createVirtualRegister(MBlaze::GPRRegisterClass); + unsigned IVAL = R.createVirtualRegister(&MBlaze::GPRRegClass); BuildMI(MBB, dl, TII->get(MBlaze::ADDIK), IVAL) .addReg(MI->getOperand(1).getReg()) .addImm(0); @@ -305,14 +305,14 @@ MBlazeTargetLowering::EmitCustomShift(MachineInstr *MI, .addReg(IAMT) .addMBB(finish); - unsigned DST = R.createVirtualRegister(MBlaze::GPRRegisterClass); - unsigned NDST = R.createVirtualRegister(MBlaze::GPRRegisterClass); + unsigned DST = R.createVirtualRegister(&MBlaze::GPRRegClass); + unsigned NDST = R.createVirtualRegister(&MBlaze::GPRRegClass); BuildMI(loop, dl, TII->get(MBlaze::PHI), DST) .addReg(IVAL).addMBB(MBB) .addReg(NDST).addMBB(loop); - unsigned SAMT = R.createVirtualRegister(MBlaze::GPRRegisterClass); - unsigned NAMT = R.createVirtualRegister(MBlaze::GPRRegisterClass); + unsigned SAMT = R.createVirtualRegister(&MBlaze::GPRRegClass); + unsigned NAMT = R.createVirtualRegister(&MBlaze::GPRRegClass); BuildMI(loop, dl, TII->get(MBlaze::PHI), SAMT) .addReg(IAMT).addMBB(MBB) .addReg(NAMT).addMBB(loop); @@ -500,7 +500,7 @@ MBlazeTargetLowering::EmitCustomAtomic(MachineInstr *MI, case MBlaze::LAN32: opcode = MBlaze::AND; break; } - finalReg = R.createVirtualRegister(MBlaze::GPRRegisterClass); + finalReg = R.createVirtualRegister(&MBlaze::GPRRegClass); start->addSuccessor(exit); start->addSuccessor(start); @@ -510,7 +510,7 @@ MBlazeTargetLowering::EmitCustomAtomic(MachineInstr *MI, if (MI->getOpcode() == MBlaze::LAN32) { unsigned tmp = finalReg; - finalReg = R.createVirtualRegister(MBlaze::GPRRegisterClass); + finalReg = R.createVirtualRegister(&MBlaze::GPRRegClass); BuildMI(start, dl, TII->get(MBlaze::XORI), finalReg) .addReg(tmp) .addImm(-1); @@ -528,7 +528,7 @@ MBlazeTargetLowering::EmitCustomAtomic(MachineInstr *MI, final->addSuccessor(exit); final->addSuccessor(start); - unsigned CMP = R.createVirtualRegister(MBlaze::GPRRegisterClass); + unsigned CMP = R.createVirtualRegister(&MBlaze::GPRRegClass); BuildMI(start, dl, TII->get(MBlaze::CMP), CMP) .addReg(MI->getOperand(0).getReg()) .addReg(MI->getOperand(2).getReg()); @@ -543,7 +543,7 @@ MBlazeTargetLowering::EmitCustomAtomic(MachineInstr *MI, } } - unsigned CHK = R.createVirtualRegister(MBlaze::GPRRegisterClass); + unsigned CHK = R.createVirtualRegister(&MBlaze::GPRRegClass); BuildMI(final, dl, TII->get(MBlaze::SWX)) .addReg(finalReg) .addReg(MI->getOperand(1).getReg()) @@ -681,13 +681,19 @@ static bool CC_MBlaze_AssignReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT, /// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted. /// TODO: isVarArg, isTailCall. 
SDValue MBlazeTargetLowering:: -LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, - bool isVarArg, bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, +LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool isVarArg = CLI.IsVarArg; + // MBlaze does not yet support tail call optimization isTailCall = false; @@ -702,7 +708,7 @@ LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_MBlaze); // Get a count of how many bytes are to be pushed on the stack. @@ -841,7 +847,7 @@ LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_MBlaze); @@ -884,7 +890,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, CC_MBlaze); SDValue StackPtr; @@ -899,9 +905,9 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const TargetRegisterClass *RC; if (RegVT == MVT::i32) - RC = MBlaze::GPRRegisterClass; + RC = &MBlaze::GPRRegClass; else if (RegVT == MVT::f32) - RC = MBlaze::GPRRegisterClass; + RC = &MBlaze::GPRRegClass; else llvm_unreachable("RegVT not supported by LowerFormalArguments"); @@ -964,7 +970,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, StackPtr = DAG.getRegister(StackReg, getPointerTy()); // The last register argument that must be saved is MBlaze::R10 - const TargetRegisterClass *RC = MBlaze::GPRRegisterClass; + const TargetRegisterClass *RC = &MBlaze::GPRRegClass; unsigned Begin = getMBlazeRegisterNumbering(MBlaze::R5); unsigned Start = getMBlazeRegisterNumbering(ArgRegEnd+1); @@ -1016,7 +1022,7 @@ LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, // CCState - Info about the registers and stack slot. CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); // Analize return values. 
CCInfo.AnalyzeReturn(Outs, RetCC_MBlaze); @@ -1124,14 +1130,14 @@ getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': - return std::make_pair(0U, MBlaze::GPRRegisterClass); + return std::make_pair(0U, &MBlaze::GPRRegClass); // TODO: These can't possibly be right, but match what was in // getRegClassForInlineAsmConstraint. case 'd': case 'y': case 'f': if (VT == MVT::f32) - return std::make_pair(0U, MBlaze::GPRRegisterClass); + return std::make_pair(0U, &MBlaze::GPRRegClass); } } return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); diff --git a/lib/Target/MBlaze/MBlazeISelLowering.h b/lib/Target/MBlaze/MBlazeISelLowering.h index 6a79fc1..a01fab5 100644 --- a/lib/Target/MBlaze/MBlazeISelLowering.h +++ b/lib/Target/MBlaze/MBlazeISelLowering.h @@ -132,13 +132,7 @@ namespace llvm { SmallVectorImpl<SDValue> &InVals) const; virtual SDValue - LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; virtual SDValue diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.cpp b/lib/Target/MBlaze/MBlazeInstrInfo.cpp index db71434..b5025fc 100644 --- a/lib/Target/MBlaze/MBlazeInstrInfo.cpp +++ b/lib/Target/MBlaze/MBlazeInstrInfo.cpp @@ -287,7 +287,7 @@ unsigned MBlazeInstrInfo::getGlobalBaseReg(MachineFunction *MF) const { MachineRegisterInfo &RegInfo = MF->getRegInfo(); const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); - GlobalBaseReg = RegInfo.createVirtualRegister(MBlaze::GPRRegisterClass); + GlobalBaseReg = RegInfo.createVirtualRegister(&MBlaze::GPRRegClass); BuildMI(FirstMBB, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), GlobalBaseReg).addReg(MBlaze::R20); RegInfo.addLiveIn(MBlaze::R20); diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.td b/lib/Target/MBlaze/MBlazeInstrInfo.td index 02a2157..139bf71 100644 --- a/lib/Target/MBlaze/MBlazeInstrInfo.td +++ b/lib/Target/MBlaze/MBlazeInstrInfo.td @@ -295,7 +295,7 @@ class BranchI<bits<6> op, bits<5> br, string instr_asm> : // Branch and Link Instructions //===----------------------------------------------------------------------===// class BranchL<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> : - TA<op, flags, (outs), (ins GPR:$link, GPR:$target, variable_ops), + TA<op, flags, (outs), (ins GPR:$link, GPR:$target), !strconcat(instr_asm, " $link, $target"), [], IIC_BRl> { let ra = br; @@ -303,7 +303,7 @@ class BranchL<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> : } class BranchLI<bits<6> op, bits<5> br, string instr_asm> : - TB<op, (outs), (ins GPR:$link, calltarget:$target, variable_ops), + TB<op, (outs), (ins GPR:$link, calltarget:$target), !strconcat(instr_asm, " $link, $target"), [], IIC_BRl> { let ra = br; diff --git a/lib/Target/MBlaze/MBlazeMCInstLower.h b/lib/Target/MBlaze/MBlazeMCInstLower.h index 7b97744..8ab2c9a 100644 --- a/lib/Target/MBlaze/MBlazeMCInstLower.h +++ b/lib/Target/MBlaze/MBlazeMCInstLower.h @@ -21,18 +21,16 @@ namespace llvm { class MachineInstr; class MachineModuleInfoMachO; class MachineOperand; - class Mangler; /// MBlazeMCInstLower - This class is used to lower an MachineInstr /// into an MCInst. 
class LLVM_LIBRARY_VISIBILITY MBlazeMCInstLower { MCContext &Ctx; - Mangler &Mang; AsmPrinter &Printer; public: - MBlazeMCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer) - : Ctx(ctx), Mang(mang), Printer(printer) {} + MBlazeMCInstLower(MCContext &ctx, AsmPrinter &printer) + : Ctx(ctx), Printer(printer) {} void Lower(const MachineInstr *MI, MCInst &OutMI) const; MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; diff --git a/lib/Target/MBlaze/MBlazeSchedule.td b/lib/Target/MBlaze/MBlazeSchedule.td index 4a3ae5f..cd5691c 100644 --- a/lib/Target/MBlaze/MBlazeSchedule.td +++ b/lib/Target/MBlaze/MBlazeSchedule.td @@ -40,11 +40,6 @@ def IIC_WDC : InstrItinClass; def IIC_Pseudo : InstrItinClass; //===----------------------------------------------------------------------===// -// MBlaze generic instruction itineraries. -//===----------------------------------------------------------------------===// -def MBlazeGenericItineraries : ProcessorItineraries<[], [], []>; - -//===----------------------------------------------------------------------===// // MBlaze instruction itineraries for three stage pipeline. //===----------------------------------------------------------------------===// include "MBlazeSchedule3.td" diff --git a/lib/Target/MBlaze/MBlazeSubtarget.cpp b/lib/Target/MBlaze/MBlazeSubtarget.cpp index d12d142..dc2ad29 100644 --- a/lib/Target/MBlaze/MBlazeSubtarget.cpp +++ b/lib/Target/MBlaze/MBlazeSubtarget.cpp @@ -43,13 +43,6 @@ MBlazeSubtarget::MBlazeSubtarget(const std::string &TT, // Initialize scheduling itinerary for the specified CPU. InstrItins = getInstrItineraryForCPU(CPUName); - - // Compute the issue width of the MBlaze itineraries - computeIssueWidth(); -} - -void MBlazeSubtarget::computeIssueWidth() { - InstrItins.IssueWidth = 1; } bool MBlazeSubtarget:: diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.cpp b/lib/Target/MBlaze/MBlazeTargetMachine.cpp index dd7de9b..5f82f14 100644 --- a/lib/Target/MBlaze/MBlazeTargetMachine.cpp +++ b/lib/Target/MBlaze/MBlazeTargetMachine.cpp @@ -68,7 +68,7 @@ TargetPassConfig *MBlazeTargetMachine::createPassConfig(PassManagerBase &PM) { // Install an instruction selector pass using // the ISelDag to gen MBlaze code. bool MBlazePassConfig::addInstSelector() { - PM.add(createMBlazeISelDag(getMBlazeTargetMachine())); + addPass(createMBlazeISelDag(getMBlazeTargetMachine())); return false; } @@ -76,6 +76,6 @@ bool MBlazePassConfig::addInstSelector() { // machine code is emitted. return true if -print-machineinstrs should // print out the code after the passes. 
bool MBlazePassConfig::addPreEmitPass() { - PM.add(createMBlazeDelaySlotFillerPass(getMBlazeTargetMachine())); + addPass(createMBlazeDelaySlotFillerPass(getMBlazeTargetMachine())); return true; } diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp index c9b1636..bfd11a0 100644 --- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp +++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp @@ -98,6 +98,7 @@ public: MCCodeEmitter *llvm::createMBlazeMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx) { return new MBlazeMCCodeEmitter(MCII, STI, Ctx); diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h index ae82c32..7cc96c6 100644 --- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h +++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h @@ -22,6 +22,7 @@ class MCContext; class MCCodeEmitter; class MCInstrInfo; class MCObjectWriter; +class MCRegisterInfo; class MCSubtargetInfo; class Target; class StringRef; @@ -30,6 +31,7 @@ class raw_ostream; extern Target TheMBlazeTarget; MCCodeEmitter *createMBlazeMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx); diff --git a/lib/Target/MSP430/CMakeLists.txt b/lib/Target/MSP430/CMakeLists.txt index a8f9b52..f9ecaed 100644 --- a/lib/Target/MSP430/CMakeLists.txt +++ b/lib/Target/MSP430/CMakeLists.txt @@ -23,6 +23,8 @@ add_llvm_target(MSP430CodeGen MSP430MCInstLower.cpp ) +add_dependencies(LLVMMSP430CodeGen intrinsics_gen) + add_subdirectory(InstPrinter) add_subdirectory(TargetInfo) add_subdirectory(MCTargetDesc) diff --git a/lib/Target/MSP430/MSP430AsmPrinter.cpp b/lib/Target/MSP430/MSP430AsmPrinter.cpp index 1d1094b..86bc183 100644 --- a/lib/Target/MSP430/MSP430AsmPrinter.cpp +++ b/lib/Target/MSP430/MSP430AsmPrinter.cpp @@ -154,7 +154,7 @@ bool MSP430AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, //===----------------------------------------------------------------------===// void MSP430AsmPrinter::EmitInstruction(const MachineInstr *MI) { - MSP430MCInstLower MCInstLowering(OutContext, *Mang, *this); + MSP430MCInstLower MCInstLowering(OutContext, *this); MCInst TmpInst; MCInstLowering.Lower(MI, TmpInst); diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index 071a2f7..f8b7e14 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -59,13 +59,13 @@ HWMultMode("msp430-hwmult-mode", MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) : TargetLowering(tm, new TargetLoweringObjectFileELF()), - Subtarget(*tm.getSubtargetImpl()), TM(tm) { + Subtarget(*tm.getSubtargetImpl()) { TD = getTargetData(); // Set up the register classes. 
- addRegisterClass(MVT::i8, MSP430::GR8RegisterClass); - addRegisterClass(MVT::i16, MSP430::GR16RegisterClass); + addRegisterClass(MVT::i8, &MSP430::GR8RegClass); + addRegisterClass(MVT::i16, &MSP430::GR16RegClass); // Compute derived properties from the register classes computeRegisterProperties(); @@ -226,9 +226,9 @@ getRegForInlineAsmConstraint(const std::string &Constraint, default: break; case 'r': // GENERAL_REGS if (VT == MVT::i8) - return std::make_pair(0U, MSP430::GR8RegisterClass); + return std::make_pair(0U, &MSP430::GR8RegClass); - return std::make_pair(0U, MSP430::GR16RegisterClass); + return std::make_pair(0U, &MSP430::GR16RegClass); } } @@ -266,14 +266,19 @@ MSP430TargetLowering::LowerFormalArguments(SDValue Chain, } SDValue -MSP430TargetLowering::LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, +MSP430TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool isVarArg = CLI.IsVarArg; + // MSP430 target does not yet support tail call optimization. isTailCall = false; @@ -310,7 +315,7 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain, // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, CC_MSP430); assert(!isVarArg && "Varargs not supported yet"); @@ -330,8 +335,7 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain, llvm_unreachable(0); } case MVT::i16: - unsigned VReg = - RegInfo.createVirtualRegister(MSP430::GR16RegisterClass); + unsigned VReg = RegInfo.createVirtualRegister(&MSP430::GR16RegClass); RegInfo.addLiveIn(VA.getLocReg(), VReg); SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, RegVT); @@ -391,7 +395,7 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, // CCState - Info about the registers and stack slot. CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); // Analize return values. CCInfo.AnalyzeReturn(Outs, RetCC_MSP430); @@ -445,7 +449,7 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_MSP430); @@ -568,7 +572,7 @@ MSP430TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, // Assign locations to each value returned by this call. 
SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_MSP430); @@ -1024,27 +1028,27 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr *MI, default: llvm_unreachable("Invalid shift opcode!"); case MSP430::Shl8: Opc = MSP430::SHL8r1; - RC = MSP430::GR8RegisterClass; + RC = &MSP430::GR8RegClass; break; case MSP430::Shl16: Opc = MSP430::SHL16r1; - RC = MSP430::GR16RegisterClass; + RC = &MSP430::GR16RegClass; break; case MSP430::Sra8: Opc = MSP430::SAR8r1; - RC = MSP430::GR8RegisterClass; + RC = &MSP430::GR8RegClass; break; case MSP430::Sra16: Opc = MSP430::SAR16r1; - RC = MSP430::GR16RegisterClass; + RC = &MSP430::GR16RegClass; break; case MSP430::Srl8: Opc = MSP430::SAR8r1c; - RC = MSP430::GR8RegisterClass; + RC = &MSP430::GR8RegClass; break; case MSP430::Srl16: Opc = MSP430::SAR16r1c; - RC = MSP430::GR16RegisterClass; + RC = &MSP430::GR16RegClass; break; } @@ -1072,8 +1076,8 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr *MI, LoopBB->addSuccessor(RemBB); LoopBB->addSuccessor(LoopBB); - unsigned ShiftAmtReg = RI.createVirtualRegister(MSP430::GR8RegisterClass); - unsigned ShiftAmtReg2 = RI.createVirtualRegister(MSP430::GR8RegisterClass); + unsigned ShiftAmtReg = RI.createVirtualRegister(&MSP430::GR8RegClass); + unsigned ShiftAmtReg2 = RI.createVirtualRegister(&MSP430::GR8RegClass); unsigned ShiftReg = RI.createVirtualRegister(RC); unsigned ShiftReg2 = RI.createVirtualRegister(RC); unsigned ShiftAmtSrcReg = MI->getOperand(2).getReg(); diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h index e372f00..d8ad02f 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.h +++ b/lib/Target/MSP430/MSP430ISelLowering.h @@ -152,12 +152,7 @@ namespace llvm { DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; virtual SDValue - LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, - bool isVarArg, bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; virtual SDValue @@ -174,7 +169,6 @@ namespace llvm { SelectionDAG &DAG) const; const MSP430Subtarget &Subtarget; - const MSP430TargetMachine &TM; const TargetData *TD; }; } // namespace llvm diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp index c03ba47..be332f0 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -29,7 +29,7 @@ using namespace llvm; MSP430InstrInfo::MSP430InstrInfo(MSP430TargetMachine &tm) : MSP430GenInstrInfo(MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP), - RI(tm, *this), TM(tm) {} + RI(tm, *this) {} void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h index 04f339b..d79f992 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.h +++ b/lib/Target/MSP430/MSP430InstrInfo.h @@ -42,7 +42,6 @@ namespace MSP430II { class MSP430InstrInfo : public MSP430GenInstrInfo { const MSP430RegisterInfo RI; - MSP430TargetMachine &TM; public: explicit MSP430InstrInfo(MSP430TargetMachine &TM); diff --git a/lib/Target/MSP430/MSP430InstrInfo.td 
b/lib/Target/MSP430/MSP430InstrInfo.td index 4348dd5..f003574 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.td +++ b/lib/Target/MSP430/MSP430InstrInfo.td @@ -210,13 +210,13 @@ let isCall = 1 in let Defs = [R12W, R13W, R14W, R15W, SRW], Uses = [SPW] in { def CALLi : II16i<0x0, - (outs), (ins i16imm:$dst, variable_ops), + (outs), (ins i16imm:$dst), "call\t$dst", [(MSP430call imm:$dst)]>; def CALLr : II16r<0x0, - (outs), (ins GR16:$dst, variable_ops), + (outs), (ins GR16:$dst), "call\t$dst", [(MSP430call GR16:$dst)]>; def CALLm : II16m<0x0, - (outs), (ins memsrc:$dst, variable_ops), + (outs), (ins memsrc:$dst), "call\t${dst:mem}", [(MSP430call (load addr:$dst))]>; } diff --git a/lib/Target/MSP430/MSP430MCInstLower.h b/lib/Target/MSP430/MSP430MCInstLower.h index 24151e2..794aa56 100644 --- a/lib/Target/MSP430/MSP430MCInstLower.h +++ b/lib/Target/MSP430/MSP430MCInstLower.h @@ -21,18 +21,16 @@ namespace llvm { class MachineInstr; class MachineModuleInfoMachO; class MachineOperand; - class Mangler; /// MSP430MCInstLower - This class is used to lower an MachineInstr /// into an MCInst. class LLVM_LIBRARY_VISIBILITY MSP430MCInstLower { MCContext &Ctx; - Mangler &Mang; AsmPrinter &Printer; public: - MSP430MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer) - : Ctx(ctx), Mang(mang), Printer(printer) {} + MSP430MCInstLower(MCContext &ctx, AsmPrinter &printer) + : Ctx(ctx), Printer(printer) {} void Lower(const MachineInstr *MI, MCInst &OutMI) const; MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp index 51ec71a..aed46a2 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -96,7 +96,8 @@ BitVector MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const { } const TargetRegisterClass * -MSP430RegisterInfo::getPointerRegClass(unsigned Kind) const { +MSP430RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) + const { return &MSP430::GR16RegClass; } diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h index 82ee499..9ee0a03 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.h +++ b/lib/Target/MSP430/MSP430RegisterInfo.h @@ -39,7 +39,8 @@ public: const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const; BitVector getReservedRegs(const MachineFunction &MF) const; - const TargetRegisterClass* getPointerRegClass(unsigned Kind = 0) const; + const TargetRegisterClass* + getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const; void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/lib/Target/MSP430/MSP430RegisterInfo.td b/lib/Target/MSP430/MSP430RegisterInfo.td index 3f2eb8c..07619d0 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.td +++ b/lib/Target/MSP430/MSP430RegisterInfo.td @@ -78,8 +78,4 @@ def GR16 : RegisterClass<"MSP430", [i16], 16, // Frame pointer, sometimes allocable FPW, // Volatile, but not allocable - PCW, SPW, SRW, CGW)> -{ - let SubRegClasses = [(GR8 subreg_8bit)]; -} - + PCW, SPW, SRW, CGW)>; diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp index 9f2eda1..817001d 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -60,12 +60,12 @@ TargetPassConfig *MSP430TargetMachine::createPassConfig(PassManagerBase &PM) { bool MSP430PassConfig::addInstSelector() { // Install an 
instruction selector. - PM.add(createMSP430ISelDag(getMSP430TargetMachine(), getOptLevel())); + addPass(createMSP430ISelDag(getMSP430TargetMachine(), getOptLevel())); return false; } bool MSP430PassConfig::addPreEmitPass() { // Must run branch selection immediately preceding the asm printer. - PM.add(createMSP430BranchSelectionPass()); + addPass(createMSP430BranchSelectionPass()); return false; } diff --git a/lib/Target/Mips/AsmParser/CMakeLists.txt b/lib/Target/Mips/AsmParser/CMakeLists.txt index ac21c25..6c7343b 100644 --- a/lib/Target/Mips/AsmParser/CMakeLists.txt +++ b/lib/Target/Mips/AsmParser/CMakeLists.txt @@ -1,6 +1,5 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - add_llvm_library(LLVMMipsAsmParser MipsAsmParser.cpp ) +add_dependencies(LLVMMipsAsmParser MipsCommonTableGen) diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt index 0500c5d..e9a228c 100644 --- a/lib/Target/Mips/CMakeLists.txt +++ b/lib/Target/Mips/CMakeLists.txt @@ -17,13 +17,12 @@ add_llvm_target(MipsCodeGen MipsAsmPrinter.cpp MipsCodeEmitter.cpp MipsDelaySlotFiller.cpp - MipsEmitGPRestore.cpp - MipsExpandPseudo.cpp MipsJITInfo.cpp MipsInstrInfo.cpp MipsISelDAGToDAG.cpp MipsISelLowering.cpp MipsFrameLowering.cpp + MipsLongBranch.cpp MipsMCInstLower.cpp MipsMachineFunction.cpp MipsRegisterInfo.cpp @@ -33,6 +32,8 @@ add_llvm_target(MipsCodeGen MipsSelectionDAGInfo.cpp ) +add_dependencies(LLVMMipsCodeGen intrinsics_gen) + add_subdirectory(InstPrinter) add_subdirectory(Disassembler) add_subdirectory(TargetInfo) diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp index 78dbc06..042b456 100644 --- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp +++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp @@ -13,136 +13,87 @@ #include "Mips.h" #include "MipsSubtarget.h" +#include "MipsRegisterInfo.h" #include "llvm/MC/EDInstInfo.h" #include "llvm/MC/MCDisassembler.h" #include "llvm/Support/MemoryObject.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCInst.h" -#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/MathExtras.h" - #include "MipsGenEDInfo.inc" using namespace llvm; typedef MCDisassembler::DecodeStatus DecodeStatus; -/// MipsDisassembler - a disasembler class for Mips32. -class MipsDisassembler : public MCDisassembler { +namespace { + +/// MipsDisassemblerBase - a disassembler class for Mips. +class MipsDisassemblerBase : public MCDisassembler { public: /// Constructor - Initializes the disassembler. /// - MipsDisassembler(const MCSubtargetInfo &STI, bool bigEndian) : - MCDisassembler(STI), isBigEndian(bigEndian) { - } - - ~MipsDisassembler() { - } + MipsDisassemblerBase(const MCSubtargetInfo &STI, const MCRegisterInfo *Info, + bool bigEndian) : + MCDisassembler(STI), RegInfo(Info), isBigEndian(bigEndian) {} - /// getInstruction - See MCDisassembler. - DecodeStatus getInstruction(MCInst &instr, - uint64_t &size, - const MemoryObject &region, - uint64_t address, - raw_ostream &vStream, - raw_ostream &cStream) const; + virtual ~MipsDisassemblerBase() {} /// getEDInfo - See MCDisassembler. const EDInstInfo *getEDInfo() const; + const MCRegisterInfo *getRegInfo() const { return RegInfo; } + private: + const MCRegisterInfo *RegInfo; +protected: bool isBigEndian; }; - -/// Mips64Disassembler - a disasembler class for Mips64. -class Mips64Disassembler : public MCDisassembler { +/// MipsDisassembler - a disassembler class for Mips32. 
+class MipsDisassembler : public MipsDisassemblerBase { public: /// Constructor - Initializes the disassembler. /// - Mips64Disassembler(const MCSubtargetInfo &STI, bool bigEndian) : - MCDisassembler(STI), isBigEndian(bigEndian) { - } - - ~Mips64Disassembler() { - } + MipsDisassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info, + bool bigEndian) : + MipsDisassemblerBase(STI, Info, bigEndian) {} /// getInstruction - See MCDisassembler. - DecodeStatus getInstruction(MCInst &instr, - uint64_t &size, - const MemoryObject &region, - uint64_t address, - raw_ostream &vStream, - raw_ostream &cStream) const; - - /// getEDInfo - See MCDisassembler. - const EDInstInfo *getEDInfo() const; - -private: - bool isBigEndian; + virtual DecodeStatus getInstruction(MCInst &instr, + uint64_t &size, + const MemoryObject &region, + uint64_t address, + raw_ostream &vStream, + raw_ostream &cStream) const; }; -const EDInstInfo *MipsDisassembler::getEDInfo() const { - return instInfoMips; -} - -const EDInstInfo *Mips64Disassembler::getEDInfo() const { - return instInfoMips; -} - -// Decoder tables for Mips register -static const unsigned CPURegsTable[] = { - Mips::ZERO, Mips::AT, Mips::V0, Mips::V1, - Mips::A0, Mips::A1, Mips::A2, Mips::A3, - Mips::T0, Mips::T1, Mips::T2, Mips::T3, - Mips::T4, Mips::T5, Mips::T6, Mips::T7, - Mips::S0, Mips::S1, Mips::S2, Mips::S3, - Mips::S4, Mips::S5, Mips::S6, Mips::S7, - Mips::T8, Mips::T9, Mips::K0, Mips::K1, - Mips::GP, Mips::SP, Mips::FP, Mips::RA -}; -static const unsigned FGR32RegsTable[] = { - Mips::F0, Mips::F1, Mips::F2, Mips::F3, - Mips::F4, Mips::F5, Mips::F6, Mips::F7, - Mips::F8, Mips::F9, Mips::F10, Mips::F11, - Mips::F12, Mips::F13, Mips::F14, Mips::F15, - Mips::F16, Mips::F17, Mips::F18, Mips::F18, - Mips::F20, Mips::F21, Mips::F22, Mips::F23, - Mips::F24, Mips::F25, Mips::F26, Mips::F27, - Mips::F28, Mips::F29, Mips::F30, Mips::F31 -}; +/// Mips64Disassembler - a disassembler class for Mips64. +class Mips64Disassembler : public MipsDisassemblerBase { +public: + /// Constructor - Initializes the disassembler. + /// + Mips64Disassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info, + bool bigEndian) : + MipsDisassemblerBase(STI, Info, bigEndian) {} -static const unsigned CPU64RegsTable[] = { - Mips::ZERO_64, Mips::AT_64, Mips::V0_64, Mips::V1_64, - Mips::A0_64, Mips::A1_64, Mips::A2_64, Mips::A3_64, - Mips::T0_64, Mips::T1_64, Mips::T2_64, Mips::T3_64, - Mips::T4_64, Mips::T5_64, Mips::T6_64, Mips::T7_64, - Mips::S0_64, Mips::S1_64, Mips::S2_64, Mips::S3_64, - Mips::S4_64, Mips::S5_64, Mips::S6_64, Mips::S7_64, - Mips::T8_64, Mips::T9_64, Mips::K0_64, Mips::K1_64, - Mips::GP_64, Mips::SP_64, Mips::FP_64, Mips::RA_64 + /// getInstruction - See MCDisassembler. 
+ virtual DecodeStatus getInstruction(MCInst &instr, + uint64_t &size, + const MemoryObject ®ion, + uint64_t address, + raw_ostream &vStream, + raw_ostream &cStream) const; }; -static const unsigned FGR64RegsTable[] = { - Mips::D0_64, Mips::D1_64, Mips::D2_64, Mips::D3_64, - Mips::D4_64, Mips::D5_64, Mips::D6_64, Mips::D7_64, - Mips::D8_64, Mips::D9_64, Mips::D10_64, Mips::D11_64, - Mips::D12_64, Mips::D13_64, Mips::D14_64, Mips::D15_64, - Mips::D16_64, Mips::D17_64, Mips::D18_64, Mips::D19_64, - Mips::D20_64, Mips::D21_64, Mips::D22_64, Mips::D23_64, - Mips::D24_64, Mips::D25_64, Mips::D26_64, Mips::D27_64, - Mips::D28_64, Mips::D29_64, Mips::D30_64, Mips::D31_64 -}; +} // end anonymous namespace -static const unsigned AFGR64RegsTable[] = { - Mips::D0, Mips::D1, Mips::D2, Mips::D3, - Mips::D4, Mips::D5, Mips::D6, Mips::D7, - Mips::D8, Mips::D9, Mips::D10, Mips::D11, - Mips::D12, Mips::D13, Mips::D14, Mips::D15 -}; +const EDInstInfo *MipsDisassemblerBase::getEDInfo() const { + return instInfoMips; +} // Forward declare these because the autogenerated code will reference them. // Definitions are further down. @@ -239,25 +190,25 @@ extern Target TheMipselTarget, TheMipsTarget, TheMips64Target, static MCDisassembler *createMipsDisassembler( const Target &T, const MCSubtargetInfo &STI) { - return new MipsDisassembler(STI,true); + return new MipsDisassembler(STI, T.createMCRegInfo(""), true); } static MCDisassembler *createMipselDisassembler( const Target &T, const MCSubtargetInfo &STI) { - return new MipsDisassembler(STI,false); + return new MipsDisassembler(STI, T.createMCRegInfo(""), false); } static MCDisassembler *createMips64Disassembler( const Target &T, const MCSubtargetInfo &STI) { - return new Mips64Disassembler(STI,true); + return new Mips64Disassembler(STI, T.createMCRegInfo(""), true); } static MCDisassembler *createMips64elDisassembler( const Target &T, const MCSubtargetInfo &STI) { - return new Mips64Disassembler(STI, false); + return new Mips64Disassembler(STI, T.createMCRegInfo(""), false); } extern "C" void LLVMInitializeMipsDisassembler() { @@ -362,6 +313,11 @@ Mips64Disassembler::getInstruction(MCInst &instr, return MCDisassembler::Fail; } +static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) { + const MipsDisassemblerBase *Dis = static_cast<const MipsDisassemblerBase*>(D); + return *(Dis->getRegInfo()->getRegClass(RC).begin() + RegNo); +} + static DecodeStatus DecodeCPU64RegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, @@ -370,7 +326,8 @@ static DecodeStatus DecodeCPU64RegsRegisterClass(MCInst &Inst, if (RegNo > 31) return MCDisassembler::Fail; - Inst.addOperand(MCOperand::CreateReg(CPU64RegsTable[RegNo])); + unsigned Reg = getReg(Decoder, Mips::CPU64RegsRegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Reg)); return MCDisassembler::Success; } @@ -380,8 +337,8 @@ static DecodeStatus DecodeCPURegsRegisterClass(MCInst &Inst, const void *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; - - Inst.addOperand(MCOperand::CreateReg(CPURegsTable[RegNo])); + unsigned Reg = getReg(Decoder, Mips::CPURegsRegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Reg)); return MCDisassembler::Success; } @@ -392,7 +349,8 @@ static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst, if (RegNo > 31) return MCDisassembler::Fail; - Inst.addOperand(MCOperand::CreateReg(FGR64RegsTable[RegNo])); + unsigned Reg = getReg(Decoder, Mips::FGR64RegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Reg)); return MCDisassembler::Success; } @@ -403,7 
+361,8 @@ static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst, if (RegNo > 31) return MCDisassembler::Fail; - Inst.addOperand(MCOperand::CreateReg(FGR32RegsTable[RegNo])); + unsigned Reg = getReg(Decoder, Mips::FGR32RegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Reg)); return MCDisassembler::Success; } @@ -420,15 +379,18 @@ static DecodeStatus DecodeMem(MCInst &Inst, uint64_t Address, const void *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); - int Reg = (int)fieldFromInstruction32(Insn, 16, 5); - int Base = (int)fieldFromInstruction32(Insn, 21, 5); + unsigned Reg = fieldFromInstruction32(Insn, 16, 5); + unsigned Base = fieldFromInstruction32(Insn, 21, 5); + + Reg = getReg(Decoder, Mips::CPURegsRegClassID, Reg); + Base = getReg(Decoder, Mips::CPURegsRegClassID, Base); if(Inst.getOpcode() == Mips::SC){ - Inst.addOperand(MCOperand::CreateReg(CPURegsTable[Reg])); + Inst.addOperand(MCOperand::CreateReg(Reg)); } - Inst.addOperand(MCOperand::CreateReg(CPURegsTable[Reg])); - Inst.addOperand(MCOperand::CreateReg(CPURegsTable[Base])); + Inst.addOperand(MCOperand::CreateReg(Reg)); + Inst.addOperand(MCOperand::CreateReg(Base)); Inst.addOperand(MCOperand::CreateImm(Offset)); return MCDisassembler::Success; @@ -439,11 +401,14 @@ static DecodeStatus DecodeFMem(MCInst &Inst, uint64_t Address, const void *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); - int Reg = (int)fieldFromInstruction32(Insn, 16, 5); - int Base = (int)fieldFromInstruction32(Insn, 21, 5); + unsigned Reg = fieldFromInstruction32(Insn, 16, 5); + unsigned Base = fieldFromInstruction32(Insn, 21, 5); - Inst.addOperand(MCOperand::CreateReg(FGR64RegsTable[Reg])); - Inst.addOperand(MCOperand::CreateReg(CPURegsTable[Base])); + Reg = getReg(Decoder, Mips::FGR64RegClassID, Reg); + Base = getReg(Decoder, Mips::CPURegsRegClassID, Base); + + Inst.addOperand(MCOperand::CreateReg(Reg)); + Inst.addOperand(MCOperand::CreateReg(Base)); Inst.addOperand(MCOperand::CreateImm(Offset)); return MCDisassembler::Success; @@ -474,10 +439,11 @@ static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { - if (RegNo > 31) + if (RegNo > 30 || RegNo % 2) return MCDisassembler::Fail; - Inst.addOperand(MCOperand::CreateReg(AFGR64RegsTable[RegNo])); + unsigned Reg = getReg(Decoder, Mips::AFGR64RegClassID, RegNo / 2); + Inst.addOperand(MCOperand::CreateReg(Reg)); return MCDisassembler::Success; } @@ -488,7 +455,7 @@ static DecodeStatus DecodeHWRegs64RegisterClass(MCInst &Inst, //Currently only hardware register 29 is supported if (RegNo != 29) return MCDisassembler::Fail; - Inst.addOperand(MCOperand::CreateReg(Mips::HWR29)); + Inst.addOperand(MCOperand::CreateReg(Mips::HWR29_64)); return MCDisassembler::Success; } diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp index 6886b17..b38463d 100644 --- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp +++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp @@ -13,6 +13,7 @@ #define DEBUG_TYPE "asm-printer" #include "MipsInstPrinter.h" +#include "MipsInstrInfo.h" #include "llvm/ADT/StringExtras.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" @@ -68,8 +69,25 @@ void MipsInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { void MipsInstPrinter::printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) { + switch (MI->getOpcode()) { + default: + break; + case Mips::RDHWR: + case Mips::RDHWR64: + O << "\t.set\tpush\n"; + O << "\t.set\tmips32r2\n"; + } 
+ printInstruction(MI, O); printAnnotation(O, Annot); + + switch (MI->getOpcode()) { + default: + break; + case Mips::RDHWR: + case Mips::RDHWR64: + O << "\n\t.set\tpop"; + } } static void printExpr(const MCExpr *Expr, raw_ostream &OS) { @@ -108,6 +126,8 @@ static void printExpr(const MCExpr *Expr, raw_ostream &OS) { case MCSymbolRefExpr::VK_Mips_GOT_DISP: OS << "%got_disp("; break; case MCSymbolRefExpr::VK_Mips_GOT_PAGE: OS << "%got_page("; break; case MCSymbolRefExpr::VK_Mips_GOT_OFST: OS << "%got_ofst("; break; + case MCSymbolRefExpr::VK_Mips_HIGHER: OS << "%higher("; break; + case MCSymbolRefExpr::VK_Mips_HIGHEST: OS << "%highest("; break; } OS << SRE->getSymbol(); diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h index 76b839b..3d8a6f9 100644 --- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h +++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h @@ -16,7 +16,7 @@ #include "llvm/MC/MCInstPrinter.h" namespace llvm { -// These enumeration declarations were orignally in MipsInstrInfo.h but +// These enumeration declarations were originally in MipsInstrInfo.h but // had to be moved here to avoid circular dependencies between // LLVMMipsCodeGen and LLVMMipsAsmPrinter. namespace Mips { diff --git a/lib/Target/Mips/MCTargetDesc/Makefile b/lib/Target/Mips/MCTargetDesc/Makefile index 7fe2086..22a2721 100644 --- a/lib/Target/Mips/MCTargetDesc/Makefile +++ b/lib/Target/Mips/MCTargetDesc/Makefile @@ -14,3 +14,4 @@ LIBRARYNAME = LLVMMipsDesc CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. include $(LEVEL)/Makefile.common + diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index 9b4caf6..6fe0c11 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -36,6 +36,11 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { case FK_GPRel_4: case FK_Data_4: case Mips::fixup_Mips_LO16: + case Mips::fixup_Mips_GPOFF_HI: + case Mips::fixup_Mips_GPOFF_LO: + case Mips::fixup_Mips_GOT_PAGE: + case Mips::fixup_Mips_GOT_OFST: + case Mips::fixup_Mips_GOT_DISP: break; case Mips::fixup_Mips_PC16: // So far we are only using this type for branches. @@ -74,7 +79,8 @@ public: :MCAsmBackend(), OSType(_OSType), IsLittle(_isLittle), Is64Bit(_is64Bit) {} MCObjectWriter *createObjectWriter(raw_ostream &OS) const { - return createMipsELFObjectWriter(OS, OSType, IsLittle, Is64Bit); + return createMipsELFObjectWriter(OS, + MCELFObjectTargetWriter::getOSABI(OSType), IsLittle, Is64Bit); } /// ApplyFixup - Apply the \arg Value for given \arg Fixup into the provided @@ -115,7 +121,8 @@ public: CurVal |= (uint64_t)((uint8_t)Data[Offset + Idx]) << (i*8); } - uint64_t Mask = ((uint64_t)(-1) >> (64 - getFixupKindInfo(Kind).TargetSize)); + uint64_t Mask = ((uint64_t)(-1) >> + (64 - getFixupKindInfo(Kind).TargetSize)); CurVal |= Value & Mask; // Write out the fixed up bytes back to the code/data bits. 
@@ -156,7 +163,12 @@ public: { "fixup_Mips_TLSLDM", 0, 16, 0 }, { "fixup_Mips_DTPREL_HI", 0, 16, 0 }, { "fixup_Mips_DTPREL_LO", 0, 16, 0 }, - { "fixup_Mips_Branch_PCRel", 0, 16, MCFixupKindInfo::FKF_IsPCRel } + { "fixup_Mips_Branch_PCRel", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_Mips_GPOFF_HI", 0, 16, 0 }, + { "fixup_Mips_GPOFF_LO", 0, 16, 0 }, + { "fixup_Mips_GOT_PAGE", 0, 16, 0 }, + { "fixup_Mips_GOT_OFST", 0, 16, 0 }, + { "fixup_Mips_GOT_DISP", 0, 16, 0 } }; if (Kind < FirstTargetFixupKind) @@ -206,6 +218,14 @@ public: /// /// \return - True on success. bool writeNopData(uint64_t Count, MCObjectWriter *OW) const { + // Reject a byte count that is not a multiple of the instruction size. + // FIXME: 16 bit instructions are not handled yet here. + // We shouldn't be using a hard coded number for instruction size. + if (Count % 4) return false; + + uint64_t NumNops = Count / 4; + for (uint64_t i = 0; i != NumNops; ++i) + OW->Write32(0); return true; } }; // class MipsAsmBackend diff --git a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h index fb1c5ce..234455e 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h +++ b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h @@ -79,7 +79,12 @@ namespace MipsII { MO_GPOFF_LO, MO_GOT_DISP, MO_GOT_PAGE, - MO_GOT_OFST + MO_GOT_OFST, + + /// MO_HIGHER/HIGHEST - Represents the highest or higher half word of a + /// 64-bit symbol address. + MO_HIGHER, + MO_HIGHEST }; enum { diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 2091bec..77c1524 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -34,7 +34,7 @@ namespace { class MipsELFObjectWriter : public MCELFObjectTargetWriter { public: - MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI); + MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI, bool _isN64); virtual ~MipsELFObjectWriter(); @@ -52,9 +52,11 @@ namespace { }; } -MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI) +MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI, + bool _isN64) : MCELFObjectTargetWriter(_is64Bit, OSABI, ELF::EM_MIPS, - /*HasRelocationAddend*/ false) {} + /*HasRelocationAddend*/ false, + /*IsN64*/ _isN64) {} MipsELFObjectWriter::~MipsELFObjectWriter() {} @@ -148,8 +150,26 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target, case Mips::fixup_Mips_PC16: Type = ELF::R_MIPS_PC16; break; + case Mips::fixup_Mips_GOT_PAGE: + Type = ELF::R_MIPS_GOT_PAGE; + break; + case Mips::fixup_Mips_GOT_OFST: + Type = ELF::R_MIPS_GOT_OFST; + break; + case Mips::fixup_Mips_GOT_DISP: + Type = ELF::R_MIPS_GOT_DISP; + break; + case Mips::fixup_Mips_GPOFF_HI: + Type = setRType((unsigned)ELF::R_MIPS_GPREL16, Type); + Type = setRType2((unsigned)ELF::R_MIPS_SUB, Type); + Type = setRType3((unsigned)ELF::R_MIPS_HI16, Type); + break; + case Mips::fixup_Mips_GPOFF_LO: + Type = setRType((unsigned)ELF::R_MIPS_GPREL16, Type); + Type = setRType2((unsigned)ELF::R_MIPS_SUB, Type); + Type = setRType3((unsigned)ELF::R_MIPS_LO16, Type); + break; } - return Type; } @@ -184,10 +204,10 @@ static int CompareOffset(const RelEntry &R0, const RelEntry &R1) { void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm, std::vector<ELFRelocationEntry> &Relocs) { - // Call the defualt function first. Relocations are sorted in descending + // Call the default function first. Relocations are sorted in descending // order of r_offset. 
MCELFObjectTargetWriter::sortRelocs(Asm, Relocs); - + RelLs RelocLs; std::vector<RelLsIter> Unmatched; @@ -244,6 +264,7 @@ MCObjectWriter *llvm::createMipsELFObjectWriter(raw_ostream &OS, uint8_t OSABI, bool IsLittleEndian, bool Is64Bit) { - MCELFObjectTargetWriter *MOTW = new MipsELFObjectWriter(Is64Bit, OSABI); + MCELFObjectTargetWriter *MOTW = new MipsELFObjectWriter(Is64Bit, OSABI, + Is64Bit); return createELFObjectWriter(MOTW, OS, IsLittleEndian); } diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h index 9b76eda..f5cbbd5 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h +++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h @@ -95,6 +95,21 @@ namespace Mips { // PC relative branch fixup resulting in - R_MIPS_PC16 fixup_Mips_Branch_PCRel, + // resulting in - R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 + fixup_Mips_GPOFF_HI, + + // resulting in - R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 + fixup_Mips_GPOFF_LO, + + // resulting in - R_MIPS_GOT_PAGE + fixup_Mips_GOT_PAGE, + + // resulting in - R_MIPS_GOT_OFST + fixup_Mips_GOT_OFST, + + // resulting in - R_MIPS_GOT_DISP + fixup_Mips_GOT_DISP, + // Marker LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index 27954b1..ff3b3a7 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -91,6 +91,7 @@ public: } // namespace MCCodeEmitter *llvm::createMipsMCCodeEmitterEB(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx) { @@ -98,6 +99,7 @@ MCCodeEmitter *llvm::createMipsMCCodeEmitterEB(const MCInstrInfo &MCII, } MCCodeEmitter *llvm::createMipsMCCodeEmitterEL(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx) { @@ -179,7 +181,7 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO, } else if (MO.isFPImm()) { return static_cast<unsigned>(APFloat(MO.getFPImm()) .bitcastToAPInt().getHiBits(32).getLimitedValue()); - } + } // MO must be an Expr. 
assert(MO.isExpr()); @@ -193,10 +195,27 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO, } assert (Kind == MCExpr::SymbolRef); - - Mips::Fixups FixupKind; + + Mips::Fixups FixupKind = Mips::Fixups(0); switch(cast<MCSymbolRefExpr>(Expr)->getKind()) { + default: llvm_unreachable("Unknown fixup kind!"); + break; + case MCSymbolRefExpr::VK_Mips_GPOFF_HI : + FixupKind = Mips::fixup_Mips_GPOFF_HI; + break; + case MCSymbolRefExpr::VK_Mips_GPOFF_LO : + FixupKind = Mips::fixup_Mips_GPOFF_LO; + break; + case MCSymbolRefExpr::VK_Mips_GOT_PAGE : + FixupKind = Mips::fixup_Mips_GOT_PAGE; + break; + case MCSymbolRefExpr::VK_Mips_GOT_OFST : + FixupKind = Mips::fixup_Mips_GOT_OFST; + break; + case MCSymbolRefExpr::VK_Mips_GOT_DISP : + FixupKind = Mips::fixup_Mips_GOT_DISP; + break; case MCSymbolRefExpr::VK_Mips_GPREL: FixupKind = Mips::fixup_Mips_GPREL16; break; @@ -236,8 +255,6 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO, case MCSymbolRefExpr::VK_Mips_TPREL_LO: FixupKind = Mips::fixup_Mips_TPREL_LO; break; - default: - break; } // switch Fixups.push_back(MCFixup::Create(0, MO.getExpr(), MCFixupKind(FixupKind))); diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h index 547ccdd..bfcc2a2 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h +++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h @@ -22,6 +22,7 @@ class MCCodeEmitter; class MCContext; class MCInstrInfo; class MCObjectWriter; +class MCRegisterInfo; class MCSubtargetInfo; class StringRef; class Target; @@ -33,9 +34,11 @@ extern Target TheMips64Target; extern Target TheMips64elTarget; MCCodeEmitter *createMipsMCCodeEmitterEB(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx); MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx); diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h index bafadc8..2963f7e 100644 --- a/lib/Target/Mips/Mips.h +++ b/lib/Target/Mips/Mips.h @@ -24,9 +24,7 @@ namespace llvm { FunctionPass *createMipsISelDag(MipsTargetMachine &TM); FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM); - FunctionPass *createMipsExpandPseudoPass(MipsTargetMachine &TM); - FunctionPass *createMipsEmitGPRestorePass(MipsTargetMachine &TM); - + FunctionPass *createMipsLongBranchPass(MipsTargetMachine &TM); FunctionPass *createMipsJITCodeEmitterPass(MipsTargetMachine &TM, JITCodeEmitter &JCE); diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td index cbebe84..8548ae0 100644 --- a/lib/Target/Mips/Mips.td +++ b/lib/Target/Mips/Mips.td @@ -72,6 +72,9 @@ def FeatureMips64r2 : SubtargetFeature<"mips64r2", "MipsArchVersion", "Mips64r2", "Mips64r2 ISA Support", [FeatureMips64, FeatureMips32r2]>; +def FeatureMips16 : SubtargetFeature<"mips16", "InMips16Mode", "true", + "Mips16 mode">; + //===----------------------------------------------------------------------===// // Mips processors supported. 
//===----------------------------------------------------------------------===// @@ -83,6 +86,7 @@ def : Proc<"mips32", [FeatureMips32]>; def : Proc<"mips32r2", [FeatureMips32r2]>; def : Proc<"mips64", [FeatureMips64]>; def : Proc<"mips64r2", [FeatureMips64r2]>; +def : Proc<"mips16", [FeatureMips16]>; def MipsAsmWriter : AsmWriter { string AsmWriterClassName = "InstPrinter"; diff --git a/lib/Target/Mips/Mips16InstrFormats.td b/lib/Target/Mips/Mips16InstrFormats.td new file mode 100644 index 0000000..61602b6 --- /dev/null +++ b/lib/Target/Mips/Mips16InstrFormats.td @@ -0,0 +1,663 @@ +//===- Mips16InstrFormats.td - Mips Instruction Formats ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Describe MIPS instructions format +// +// CPU INSTRUCTION FORMATS +// +// funct or f Function field +// +// immediate 4-,5-,8- or 11-bit immediate, branch displacement, or +// or imm address displacement +// +// op 5-bit major operation code +// +// rx 3-bit source or destination register +// +// ry 3-bit source or destination register +// +// rz 3-bit source or destination register +// +// sa 3- or 5-bit shift amount +// +//===----------------------------------------------------------------------===// + +// Format specifies the encoding used by the instruction. This is part of the +// ad-hoc solution used to emit machine instruction encodings by our machine +// code emitter. +// +class Format16<bits<5> val> { + bits<5> Value = val; +} + +def Pseudo16 : Format16<0>; +def FrmI16 : Format16<1>; +def FrmRI16 : Format16<2>; +def FrmRR16 : Format16<3>; +def FrmRRI16 : Format16<4>; +def FrmRRR16 : Format16<5>; +def FrmRRI_A16 : Format16<6>; +def FrmSHIFT16 : Format16<7>; +def FrmI8_TYPE16 : Format16<8>; +def FrmI8_MOVR3216 : Format16<9>; +def FrmI8_MOV32R16 : Format16<10>; +def FrmI8_SVRS16 : Format16<11>; +def FrmJAL16 : Format16<12>; +def FrmJALX16 : Format16<13>; +def FrmEXT_I16 : Format16<14>; +def FrmASMACRO16 : Format16<15>; +def FrmEXT_RI16 : Format16<16>; +def FrmEXT_RRI16 : Format16<17>; +def FrmEXT_RRI_A16 : Format16<18>; +def FrmEXT_SHIFT16 : Format16<19>; +def FrmEXT_I816 : Format16<20>; +def FrmEXT_I8_SVRS16 : Format16<21>; +def FrmOther16 : Format16<22>; // Instruction w/ a custom format + +// Base class for Mips 16 Format +// This class does not depend on the instruction size +// +class MipsInst16_Base<dag outs, dag ins, string asmstr, list<dag> pattern, + InstrItinClass itin, Format16 f>: Instruction +{ + Format16 Form = f; + + let Namespace = "Mips"; + + let OutOperandList = outs; + let InOperandList = ins; + + let AsmString = asmstr; + let Pattern = pattern; + let Itinerary = itin; + + // + // Attributes specific to Mips instructions... + // + bits<5> FormBits = Form.Value; + + // TSFlags layout should be kept in sync with MipsInstrInfo.h. + let TSFlags{4-0} = FormBits; + + let Predicates = [InMips16Mode]; +} + +// +// Generic Mips 16 Format +// +class MipsInst16<dag outs, dag ins, string asmstr, list<dag> pattern, + InstrItinClass itin, Format16 f>: + MipsInst16_Base<outs, ins, asmstr, pattern, itin, f> +{ + field bits<16> Inst; + bits<5> Opcode = 0; + + // Top 5 bits are the 'opcode' field + let Inst{15-11} = Opcode; +} + +// +// For 32 bit extended instruction forms. 
+// +class MipsInst16_32<dag outs, dag ins, string asmstr, list<dag> pattern, + InstrItinClass itin, Format16 f>: + MipsInst16_Base<outs, ins, asmstr, pattern, itin, f> +{ + field bits<32> Inst; + +} + +class MipsInst16_EXTEND<dag outs, dag ins, string asmstr, list<dag> pattern, + InstrItinClass itin, Format16 f>: + MipsInst16_32<outs, ins, asmstr, pattern, itin, f> +{ + let Inst{31-27} = 0b11110; +} + + + +// Mips Pseudo Instructions Format +class MipsPseudo16<dag outs, dag ins, string asmstr, list<dag> pattern>: + MipsInst16<outs, ins, asmstr, pattern, IIPseudo, Pseudo16> { + let isCodeGenOnly = 1; + let isPseudo = 1; +} + + +//===----------------------------------------------------------------------===// +// Format I instruction class in Mips : <|opcode|imm11|> +//===----------------------------------------------------------------------===// + +class FI16<bits<5> op, dag outs, dag ins, string asmstr, list<dag> pattern, + InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmI16> +{ + bits<11> imm11; + + let Opcode = op; + + let Inst{10-0} = imm11; +} + +//===----------------------------------------------------------------------===// +// Format RI instruction class in Mips : <|opcode|rx|imm8|> +//===----------------------------------------------------------------------===// + +class FRI16<bits<5> op, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmRI16> +{ + bits<3> rx; + bits<8> imm8; + + let Opcode = op; + + let Inst{10-8} = rx; + let Inst{7-0} = imm8; +} + +//===----------------------------------------------------------------------===// +// Format RR instruction class in Mips : <|opcode|rx|ry|funct|> +//===----------------------------------------------------------------------===// + +class FRR16<bits<5> _funct, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmRR16> +{ + bits<3> rx; + bits<3> ry; + bits<5> funct; + + let Opcode = 0b11101; + let funct = _funct; + + let Inst{10-8} = rx; + let Inst{7-5} = ry; + let Inst{4-0} = funct; +} + +// +// For conversion functions. +// +class FRR_SF16<bits<5> _funct, bits<3> _subfunct, dag outs, dag ins, + string asmstr, list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmRR16> +{ + bits<3> rx; + bits<3> subfunct; + bits<5> funct; + + let Opcode = 0b11101; // RR + let funct = _funct; + let subfunct = _subfunct; + + let Inst{10-8} = rx; + let Inst{7-5} = subfunct; + let Inst{4-0} = funct; +} + +// +// just used for breakpoint (hardware and software) instructions. 
+// +class FC16<bits<5> _funct, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmRR16> +{ + bits<6> _code; // code is a keyword in tablegen + bits<5> funct; + + let Opcode = 0b11101; // RR + let funct = _funct; + + let Inst{10-5} = _code; + let Inst{4-0} = funct; +} + +// +// J(AL)R(C) subformat +// +class FRR16_JALRC<bits<1> _nd, bits<1> _l, bits<1> r_a, + dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmRR16> +{ + bits<3> rx; + bits<1> nd; + bits<1> l; + bits<1> ra; + + let nd = _nd; + let l = _l; + let ra = r_a; + + let Opcode = 0b11101; + + let Inst{10-8} = rx; + let Inst{7} = nd; + let Inst{6} = l; + let Inst{5} = ra; + let Inst{4-0} = 0; +} + +//===----------------------------------------------------------------------===// +// Format RRI instruction class in Mips : <|opcode|rx|ry|imm5|> +//===----------------------------------------------------------------------===// + +class FRRI16<bits<5> op, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmRRI16> +{ + bits<3> rx; + bits<3> ry; + bits<5> imm5; + + let Opcode = op; + + + let Inst{10-8} = rx; + let Inst{7-5} = ry; + let Inst{4-0} = imm5; +} + +//===----------------------------------------------------------------------===// +// Format RRR instruction class in Mips : <|opcode|rx|ry|rz|f|> +//===----------------------------------------------------------------------===// + +class FRRR16<bits<2> _f, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmRRR16> +{ + bits<3> rx; + bits<3> ry; + bits<3> rz; + bits<2> f; + + let Opcode = 0b11100; + let f = _f; + + let Inst{10-8} = rx; + let Inst{7-5} = ry; + let Inst{4-2} = rz; + let Inst{1-0} = f; +} + +//===----------------------------------------------------------------------===// +// Format RRI-A instruction class in Mips : <|opcode|rx|ry|f|imm4|> +//===----------------------------------------------------------------------===// + +class FRRI_A16<bits<1> _f, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmRRI_A16> +{ + bits<3> rx; + bits<3> ry; + bits<1> f; + bits<4> imm4; + + let Opcode = 0b01000; + let f = _f; + + let Inst{10-8} = rx; + let Inst{7-5} = ry; + let Inst{4} = f; + let Inst{3-0} = imm4; +} + +//===----------------------------------------------------------------------===// +// Format Shift instruction class in Mips : <|opcode|rx|ry|sa|f|> +//===----------------------------------------------------------------------===// + +class FSHIFT16<bits<2> _f, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmSHIFT16> +{ + bits<3> rx; + bits<3> ry; + bits<3> sa; + bits<2> f; + + let Opcode = 0b00110; + let f = _f; + + let Inst{10-8} = rx; + let Inst{7-5} = ry; + let Inst{4-2} = sa; + let Inst{1-0} = f; +} + +//===----------------------------------------------------------------------===// +// Format i8 instruction class in Mips : <|opcode|funct|imm8> +//===----------------------------------------------------------------------===// + +class FI816<bits<3> _func, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmI8_TYPE16> +{ + bits<3> func; + bits<8> imm8; + + 
let Opcode = 0b01100; + let func = _func; + + let Inst{10-8} = func; + let Inst{7-0} = imm8; +} + +//===----------------------------------------------------------------------===// +// Format i8_MOVR32 instruction class in Mips : <|opcode|func|ry|r32> +//===----------------------------------------------------------------------===// + +class FI8_MOVR3216<dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmI8_MOVR3216> +{ + + bits<4> ry; + bits<4> r32; + + let Opcode = 0b01100; + + let Inst{10-8} = 0b111; + let Inst{7-4} = ry; + let Inst{3-0} = r32; + +} + + + +//===----------------------------------------------------------------------===// +// Format i8_MOV32R instruction class in Mips : <|opcode|func|r32|rz> +//===----------------------------------------------------------------------===// + +class FI8_MOV32R16<dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmI8_MOV32R16> +{ + + bits<3> func; + bits<5> r32; + bits<3> rz; + + + let Opcode = 0b01100; + + let Inst{10-8} = 0b101; + let Inst{7-5} = r32{2-0}; + let Inst{4-3} = r32{4-3}; + let Inst{2-0} = rz; + +} + +//===----------------------------------------------------------------------===// +// Format i8_SVRS instruction class in Mips : +// <|opcode|svrs|s|ra|s0|s1|framesize> +//===----------------------------------------------------------------------===// + +class FI8_SVRS16<bits<1> _s, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmI8_SVRS16> +{ + bits<1> s; + bits<1> ra = 0; + bits<1> s0 = 0; + bits<1> s1 = 0; + bits<4> framesize = 0; + + let s =_s; + let Opcode = 0b01100; + + let Inst{10-8} = 0b100; + let Inst{7} = s; + let Inst{6} = ra; + let Inst{5} = s0; + let Inst{4} = s1; + let Inst{3-0} = framesize; + +} + +//===----------------------------------------------------------------------===// +// Format JAL instruction class in Mips16 : +// <|opcode|X|imm20:16|imm25:21|imm15:0> +//===----------------------------------------------------------------------===// + +class FJAL16<bits<1> _X, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16_32<outs, ins, asmstr, pattern, itin, FrmJAL16> +{ + bits<1> X; + bits<26> imm26; + + + let X = _X; + + let Inst{31-27} = 0b00011; + let Inst{26} = X; + let Inst{25-21} = imm26{20-16}; + let Inst{20-16} = imm26{25-21}; + let Inst{15-0} = imm26{15-0}; + +} + +//===----------------------------------------------------------------------===// +// Format EXT-I instruction class in Mips16 : +// <|EXTEND|imm10:5|imm15:11|op|0|0|0|0|0|0|imm4:0> +//===----------------------------------------------------------------------===// + +class FEXT_I16<bits<5> _eop, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_I16> +{ + bits<16> imm16; + bits<5> eop; + + let eop = _eop; + + let Inst{26-21} = imm16{10-5}; + let Inst{20-16} = imm16{15-11}; + let Inst{15-11} = eop; + let Inst{10-5} = 0; + let Inst{4-0} = imm16{4-0}; + +} + +//===----------------------------------------------------------------------===// +// Format ASMACRO instruction class in Mips16 : +// <EXTEND|select|p4|p3|RRR|p2|p1|p0> +//===----------------------------------------------------------------------===// + +class FASMACRO16<dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + 
MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmASMACRO16> +{ + bits<3> select; + bits<3> p4; + bits<5> p3; + bits<5> RRR = 0b11100; + bits<3> p2; + bits<3> p1; + bits<5> p0; + + + let Inst{26-24} = select; + let Inst{23-21} = p4; + let Inst{20-16} = p3; + let Inst{15-11} = RRR; + let Inst{10-8} = p2; + let Inst{7-5} = p1; + let Inst{4-0} = p0; + +} + + +//===----------------------------------------------------------------------===// +// Format EXT-RI instruction class in Mips16 : +// <|EXTEND|imm10:5|imm15:11|op|rx|0|0|0|imm4:0> +//===----------------------------------------------------------------------===// + +class FEXT_RI16<bits<5> _op, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_RI16> +{ + bits<16> imm16; + bits<5> op; + bits<3> rx; + + let op = _op; + + let Inst{26-21} = imm16{10-5}; + let Inst{20-16} = imm16{15-11}; + let Inst{15-11} = op; + let Inst{10-8} = rx; + let Inst{7-5} = 0; + let Inst{4-0} = imm16{4-0}; + +} + +//===----------------------------------------------------------------------===// +// Format EXT-RRI instruction class in Mips16 : +// <|EXTEND|imm10:5|imm15:11|op|rx|ry|imm4:0> +//===----------------------------------------------------------------------===// + +class FEXT_RRI16<bits<5> _op, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_RRI16> +{ + bits<5> op; + bits<16> imm16; + bits<3> rx; + bits<3> ry; + + let op=_op; + + let Inst{26-21} = imm16{10-5}; + let Inst{20-16} = imm16{15-11}; + let Inst{15-11} = op; + let Inst{10-8} = rx; + let Inst{7-5} = ry; + let Inst{4-0} = imm16{4-0}; + +} + +//===----------------------------------------------------------------------===// +// Format EXT-RRI-A instruction class in Mips16 : +// <|EXTEND|imm10:4|imm14:11|RRI-A|rx|ry|f|imm3:0> +//===----------------------------------------------------------------------===// + +class FEXT_RRI_A16<bits<1> _f, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_RRI_A16> +{ + bits<15> imm15; + bits<3> rx; + bits<3> ry; + bits<1> f; + + let f = _f; + + let Inst{26-20} = imm15{10-4}; + let Inst{19-16} = imm15{14-11}; + let Inst{15-11} = 0b01000; + let Inst{10-8} = rx; + let Inst{7-5} = ry; + let Inst{4} = f; + let Inst{3-0} = imm15{3-0}; + +} + +//===----------------------------------------------------------------------===// +// Format EXT-SHIFT instruction class in Mips16 : +// <|EXTEND|sa 4:0|s5|0|SHIFT|rx|ry|0|f> +//===----------------------------------------------------------------------===// + +class FEXT_SHIFT16<bits<2> _f, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_SHIFT16> +{ + bits<6> sa6; + bits<3> rx; + bits<3> ry; + bits<2> f; + + let f = _f; + + let Inst{26-22} = sa6{4-0}; + let Inst{21} = sa6{5}; + let Inst{20-16} = 0; + let Inst{15-11} = 0b00110; + let Inst{10-8} = rx; + let Inst{7-5} = ry; + let Inst{4-2} = 0; + let Inst{1-0} = f; + +} + +//===----------------------------------------------------------------------===// +// Format EXT-I8 instruction class in Mips16 : +// <|EXTEND|imm10:5|imm15:11|I8|funct|0|imm4:0> +//===----------------------------------------------------------------------===// + +class FEXT_I816<bits<3> _funct, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + 
MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_I816> +{ + bits<16> imm16; + bits<5> I8; + bits<3> funct; + + let funct = _funct; + let I8 = 0b01100; + + let Inst{26-21} = imm16{10-5}; + let Inst{20-16} = imm16{15-11}; + let Inst{15-11} = I8; + let Inst{10-8} = funct; + let Inst{7-5} = 0; + let Inst{4-0} = imm16{4-0}; + +} + +//===----------------------------------------------------------------------===// +// Format EXT-I8_SVRS instruction class in Mips16 : +// <|EXTEND|xsregs|framesize7:4|aregs|I8|SVRS|s|ra|s0|s1|framesize3:0> +//===----------------------------------------------------------------------===// + +class FEXT_I8_SVRS16<bits<1> s_, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmI8_SVRS16> +{ + bits<3> xsregs =0; + bits<8> framesize =0; + bits<3> aregs =0; + bits<5> I8 = 0b01100; + bits<3> SVRS = 0b100; + bits<1> s; + bits<1> ra = 0; + bits<1> s0 = 0; + bits<1> s1 = 0; + + let s= s_; + + let Inst{26-24} = xsregs; + let Inst{23-20} = framesize{7-4}; + let Inst{19} = 0; + let Inst{18-16} = aregs; + let Inst{15-11} = I8; + let Inst{10-8} = SVRS; + let Inst{7} = s; + let Inst{6} = ra; + let Inst{5} = s0; + let Inst{4} = s1; + let Inst{3-0} = framesize{3-0}; + + +} + + + diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td new file mode 100644 index 0000000..c852042 --- /dev/null +++ b/lib/Target/Mips/Mips16InstrInfo.td @@ -0,0 +1,243 @@ +//===- Mips16InstrInfo.td - Target Description for Mips16 -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes Mips16 instructions. 
+// +//===----------------------------------------------------------------------===// + +def uimm5 : Operand<i8> { + let DecoderMethod= "DecodeSimm16"; +} + +// +// RRR-type instruction format +// + +class FRRR16_ins<bits<2> _f, string asmstr, InstrItinClass itin> : + FRRR16<_f, (outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry), + !strconcat(asmstr, "\t$rz, $rx, $ry"), [], itin>; + +// +// I8_MOV32R instruction format (used only by MOV32R instruction) +// +class FI8_MOV32R16_ins<string asmstr, InstrItinClass itin>: + FI8_MOV32R16<(outs CPURegs:$r32), (ins CPU16Regs:$rz), + !strconcat(asmstr, "\t$r32, $rz"), [], itin>; + +// +// EXT-RI instruction format +// + +class FEXT_RI16_ins_base<bits<5> _op, string asmstr, string asmstr2, + InstrItinClass itin>: + FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins simm16:$imm), + !strconcat(asmstr, asmstr2), [], itin>; + +class FEXT_RI16_ins<bits<5> _op, string asmstr, + InstrItinClass itin>: + FEXT_RI16_ins_base<_op, asmstr, "\t$rx, $imm", itin>; + +class FEXT_RI16_PC_ins<bits<5> _op, string asmstr, InstrItinClass itin>: + FEXT_RI16_ins_base<_op, asmstr, "\t$rx, $$pc, $imm", itin>; + +// +// RR-type instruction format +// +let rx=0 in +class FRR16_JALRC_RA_only_ins<bits<1> nd_, bits<1> l_, + string asmstr, InstrItinClass itin>: + FRR16_JALRC<nd_, l_, 1, (outs), (ins), !strconcat(asmstr, "\t $$ra"), + [], itin> ; + +// +// EXT-RRI instruction format +// + +class FEXT_RRI16_mem_ins<bits<5> op, string asmstr, Operand MemOpnd, + InstrItinClass itin>: + FEXT_RRI16<op, (outs CPU16Regs:$ry), (ins MemOpnd:$addr), + !strconcat(asmstr, "\t$ry, $addr"), [], itin>; + +// +// EXT-SHIFT instruction format +// +class FEXT_SHIFT16_ins<bits<2> _f, string asmstr, InstrItinClass itin>: + FEXT_SHIFT16<_f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry, uimm5:$sa), + !strconcat(asmstr, "\t$rx, $ry, $sa"), [], itin>; + +// +// Address operand +def mem16 : Operand<i32> { + let PrintMethod = "printMemOperand"; + let MIOperandInfo = (ops CPU16Regs, simm16); + let EncoderMethod = "getMemEncoding"; +} + +// +// Format: ADDIU rx, pc, immediate MIPS16e +// Purpose: Add Immediate Unsigned Word (3-Operand, PC-Relative, Extended) +// To add a constant to the program counter. +// +class AddiuRxPcImmX16_base : FEXT_RI16_PC_ins<0b00001, "addiu", IIAlu>; +def AddiuRxPcImmX16 : AddiuRxPcImmX16_base; +// +// Format: ADDU rz, rx, ry MIPS16e +// Purpose: Add Unsigned Word (3-Operand) +// To add 32-bit integers. +// + +class AdduRxRyRz16_base: FRRR16_ins<01, "addu", IIAlu>; +def AdduRxRyRz16: AdduRxRyRz16_base; + +// +// Format: JR ra MIPS16e +// Purpose: Jump Register Through Register ra +// To execute a branch to the instruction address in the return +// address register. +// + +def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIAlu>; + +// +// Format: LI rx, immediate MIPS16e +// Purpose: Load Immediate (Extended) +// To load a constant into a GPR. +// +def LiRxImmX16: FEXT_RI16_ins<0b01101, "li", IIAlu>; + +// +// Format: LW ry, offset(rx) MIPS16e +// Purpose: Load Word (Extended) +// To load a word from memory as a signed value. +// +class LwRxRyOffMemX16_base: FEXT_RRI16_mem_ins<0b10011, "lw", mem16, IIAlu>; +def LwRxRyOffMemX16: LwRxRyOffMemX16_base; + +// +// Format: MOVE r32, rz MIPS16e +// Purpose: Move +// To move the contents of a GPR to a GPR. 
+// +def Mov32R16: FI8_MOV32R16_ins<"move", IIAlu>; +// +// Format: RESTORE {ra,}{s0/s1/s0-1,}{framesize} +// (All args are optional) MIPS16e +// Purpose: Restore Registers and Deallocate Stack Frame +// To deallocate a stack frame before exit from a subroutine, +// restoring return address and static registers, and adjusting +// stack +// + +// fixed form for restoring RA and the frame +// for direct object emitter, encoding needs to be adjusted for the +// frame size +// +let ra=1, s=0,s0=0,s1=0 in +def RestoreRaF16: + FI8_SVRS16<0b1, (outs), (ins uimm16:$frame_size), + "restore \t$$ra, $frame_size", [], IILoad >; + +// +// Format: SAVE {ra,}{s0/s1/s0-1,}{framesize} (All arguments are optional) +// MIPS16e +// Purpose: Save Registers and Set Up Stack Frame +// To set up a stack frame on entry to a subroutine, +// saving return address and static registers, and adjusting stack +// +let ra=1, s=1,s0=0,s1=0 in +def SaveRaF16: + FI8_SVRS16<0b1, (outs), (ins uimm16:$frame_size), + "save \t$$ra, $frame_size", [], IILoad >; + +// +// Format: SLL rx, ry, sa MIPS16e +// Purpose: Shift Word Left Logical (Extended) +// To execute a left-shift of a word by a fixed number of bits—0 to 31 bits. +// +def SllX16: FEXT_SHIFT16_ins<0b00, "sll", IIAlu>; + +// +// Format: SW ry, offset(rx) MIPS16e +// Purpose: Store Word (Extended) +// To store a word to memory. +// +class SwRxRyOffMemX16_base: FEXT_RRI16_mem_ins<0b11011, "sw", mem16, IIAlu>; +def SwRxRyOffMemX16: SwRxRyOffMemX16_base; + +class Mips16Pat<dag pattern, dag result> : Pat<pattern, result> { + let Predicates = [InMips16Mode]; +} + +class ArithLogicR16Defs<SDNode OpNode, bit isComm = 0> { + dag OutOperandList = (outs CPU16Regs:$rz); + dag InOperandList = (ins CPU16Regs:$rx, CPU16Regs:$ry); + list<dag> Pattern = [(set CPU16Regs:$rz, + (OpNode CPU16Regs:$rx, CPU16Regs:$ry))]; +} + +multiclass ArithLogicR16_base { + def _add: AdduRxRyRz16_base, ArithLogicR16Defs<add, 1>; +} + +defm ArithLogicR16_patt : ArithLogicR16_base; + +class LoadM16Defs<PatFrag OpNode, Operand _MemOpnd, bit Pseudo=0> { + bit isPseudo = Pseudo; + Operand MemOpnd = _MemOpnd; + dag OutOperandList = (outs CPU16Regs:$ry); + dag InOperandList = (ins MemOpnd:$addr); + list<dag> Pattern = [(set CPU16Regs:$ry, (OpNode addr:$addr))]; +} + +multiclass LoadM16_base { + def _LwRxRyOffMemX16: LwRxRyOffMemX16_base, LoadM16Defs<load_a, mem16>; +} + +defm LoadM16: LoadM16_base; + +class StoreM16Defs<PatFrag OpNode, Operand _MemOpnd, bit Pseudo=0> { + bit isPseudo = Pseudo; + Operand MemOpnd = _MemOpnd; + dag OutOperandList = (outs ); + dag InOperandList = (ins CPU16Regs:$ry, MemOpnd:$addr); + list<dag> Pattern = [(OpNode CPU16Regs:$ry, addr:$addr)]; +} + +multiclass StoreM16_base { + def _SwRxRyOffMemX16: SwRxRyOffMemX16_base, StoreM16Defs<store_a, mem16>; +} + +defm StoreM16: StoreM16_base; + +// Jump and Link (Call) +let isCall=1, hasDelaySlot=1 in +def JumpLinkReg16: + FRR16_JALRC<0, 0, 0, (outs), (ins CPU16Regs:$rs), + "jalr \t$rs", [(MipsJmpLink CPU16Regs:$rs)], IIBranch>; + +// Mips16 pseudos +let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, hasCtrlDep=1, + hasExtraSrcRegAllocReq = 1 in +def RetRA16 : MipsPseudo16<(outs), (ins), "", [(MipsRet)]>; + +// As stack alignment is always done with addiu, we need a 16-bit immediate +// This is basically deprecated code but needs to be there for things +// to work. 
+let Defs = [SP], Uses = [SP] in { +def ADJCALLSTACKDOWN16 : MipsPseudo16<(outs), (ins uimm16:$amt), + ";", + [(callseq_start timm:$amt)]>; +def ADJCALLSTACKUP16 : MipsPseudo16<(outs), (ins uimm16:$amt1, uimm16:$amt2), + ";", + [(callseq_end timm:$amt1, timm:$amt2)]>; +} + +// Small immediates +def : Mips16Pat<(i32 immZExt16:$in), (LiRxImmX16 immZExt16:$in)>; +def : Mips16Pat<(MipsLo tglobaladdr:$in), (LiRxImmX16 tglobaladdr:$in)>; diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index 0382869..cceee24 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -49,21 +49,24 @@ class Div64<SDNode op, bits<6> func, string instr_asm, InstrItinClass itin>: Div<op, func, instr_asm, itin, CPU64Regs, [HI64, LO64]>; multiclass Atomic2Ops64<PatFrag Op, string Opstr> { - def #NAME# : Atomic2Ops<Op, Opstr, CPU64Regs, CPURegs>, Requires<[NotN64]>; - def _P8 : Atomic2Ops<Op, Opstr, CPU64Regs, CPU64Regs>, Requires<[IsN64]> { + def #NAME# : Atomic2Ops<Op, Opstr, CPU64Regs, CPURegs>, + Requires<[NotN64, HasStandardEncoding]>; + def _P8 : Atomic2Ops<Op, Opstr, CPU64Regs, CPU64Regs>, + Requires<[IsN64, HasStandardEncoding]> { let isCodeGenOnly = 1; } } multiclass AtomicCmpSwap64<PatFrag Op, string Width> { - def #NAME# : AtomicCmpSwap<Op, Width, CPU64Regs, CPURegs>, Requires<[NotN64]>; + def #NAME# : AtomicCmpSwap<Op, Width, CPU64Regs, CPURegs>, + Requires<[NotN64, HasStandardEncoding]>; def _P8 : AtomicCmpSwap<Op, Width, CPU64Regs, CPU64Regs>, - Requires<[IsN64]> { + Requires<[IsN64, HasStandardEncoding]> { let isCodeGenOnly = 1; } } } -let usesCustomInserter = 1, Predicates = [HasMips64], +let usesCustomInserter = 1, Predicates = [HasMips64, HasStandardEncoding], DecoderNamespace = "Mips64" in { defm ATOMIC_LOAD_ADD_I64 : Atomic2Ops64<atomic_load_add_64, "load_add_64">; defm ATOMIC_LOAD_SUB_I64 : Atomic2Ops64<atomic_load_sub_64, "load_sub_64">; @@ -106,9 +109,15 @@ def DSRA : shift_rotate_imm64<0x3b, 0x00, "dsra", sra>; def DSLLV : shift_rotate_reg<0x14, 0x00, "dsllv", shl, CPU64Regs>; def DSRLV : shift_rotate_reg<0x16, 0x00, "dsrlv", srl, CPU64Regs>; def DSRAV : shift_rotate_reg<0x17, 0x00, "dsrav", sra, CPU64Regs>; +let Pattern = []<dag> in { +def DSLL32 : shift_rotate_imm64<0x3c, 0x00, "dsll32", shl>; +def DSRL32 : shift_rotate_imm64<0x3e, 0x00, "dsrl32", srl>; +def DSRA32 : shift_rotate_imm64<0x3f, 0x00, "dsra32", sra>; +} } // Rotate Instructions -let Predicates = [HasMips64r2], DecoderNamespace = "Mips64" in { +let Predicates = [HasMips64r2, HasStandardEncoding], + DecoderNamespace = "Mips64" in { def DROTR : shift_rotate_imm64<0x3a, 0x01, "drotr", rotr>; def DROTRV : shift_rotate_reg<0x16, 0x01, "drotrv", rotr, CPU64Regs>; } @@ -137,18 +146,34 @@ defm USW64 : StoreM64<0x2b, "usw", truncstorei32_u, 1>; defm ULD : LoadM64<0x37, "uld", load_u, 1>; defm USD : StoreM64<0x3f, "usd", store_u, 1>; +/// load/store left/right +let isCodeGenOnly = 1 in { + defm LWL64 : LoadLeftRightM64<0x22, "lwl", MipsLWL>; + defm LWR64 : LoadLeftRightM64<0x26, "lwr", MipsLWR>; + defm SWL64 : StoreLeftRightM64<0x2a, "swl", MipsSWL>; + defm SWR64 : StoreLeftRightM64<0x2e, "swr", MipsSWR>; +} +defm LDL : LoadLeftRightM64<0x1a, "ldl", MipsLDL>; +defm LDR : LoadLeftRightM64<0x1b, "ldr", MipsLDR>; +defm SDL : StoreLeftRightM64<0x2c, "sdl", MipsSDL>; +defm SDR : StoreLeftRightM64<0x2d, "sdr", MipsSDR>; + /// Load-linked, Store-conditional -def LLD : LLBase<0x34, "lld", CPU64Regs, mem>, Requires<[NotN64]>; -def LLD_P8 : LLBase<0x34, "lld", CPU64Regs, mem64>, 
Requires<[IsN64]> { +def LLD : LLBase<0x34, "lld", CPU64Regs, mem>, + Requires<[NotN64, HasStandardEncoding]>; +def LLD_P8 : LLBase<0x34, "lld", CPU64Regs, mem64>, + Requires<[IsN64, HasStandardEncoding]> { let isCodeGenOnly = 1; } -def SCD : SCBase<0x3c, "scd", CPU64Regs, mem>, Requires<[NotN64]>; -def SCD_P8 : SCBase<0x3c, "scd", CPU64Regs, mem64>, Requires<[IsN64]> { +def SCD : SCBase<0x3c, "scd", CPU64Regs, mem>, + Requires<[NotN64, HasStandardEncoding]>; +def SCD_P8 : SCBase<0x3c, "scd", CPU64Regs, mem64>, + Requires<[IsN64, HasStandardEncoding]> { let isCodeGenOnly = 1; } /// Jump and Branch Instructions -def JR64 : JumpFR<0x00, 0x08, "jr", CPU64Regs>; +def JR64 : IndirectBranch<CPU64Regs>; def BEQ64 : CBranch<0x04, "beq", seteq, CPU64Regs>; def BNE64 : CBranch<0x05, "bne", setne, CPU64Regs>; def BGEZ64 : CBranchZero<0x01, 1, "bgez", setge, CPU64Regs>; @@ -187,7 +212,7 @@ def LEA_ADDiu64 : EffectiveAddress<"daddiu\t$rt, $addr", CPU64Regs, mem_ea_64>; } let Uses = [SP_64], DecoderNamespace = "Mips64" in def DynAlloc64 : EffectiveAddress<"daddiu\t$rt, $addr", CPU64Regs, mem_ea_64>, - Requires<[IsN64]> { + Requires<[IsN64, HasStandardEncoding]> { let isCodeGenOnly = 1; } let DecoderNamespace = "Mips64" in { @@ -209,48 +234,50 @@ def SLL64_64 : FR<0x0, 0x00, (outs CPU64Regs:$rd), (ins CPU64Regs:$rt), //===----------------------------------------------------------------------===// // extended loads -let Predicates = [NotN64] in { - def : Pat<(i64 (extloadi1 addr:$src)), (LB64 addr:$src)>; - def : Pat<(i64 (extloadi8 addr:$src)), (LB64 addr:$src)>; - def : Pat<(i64 (extloadi16_a addr:$src)), (LH64 addr:$src)>; - def : Pat<(i64 (extloadi16_u addr:$src)), (ULH64 addr:$src)>; - def : Pat<(i64 (extloadi32_a addr:$src)), (LW64 addr:$src)>; - def : Pat<(i64 (extloadi32_u addr:$src)), (ULW64 addr:$src)>; - def : Pat<(zextloadi32_u addr:$a), (DSRL (DSLL (ULW64 addr:$a), 32), 32)>; +let Predicates = [NotN64, HasStandardEncoding] in { + def : MipsPat<(i64 (extloadi1 addr:$src)), (LB64 addr:$src)>; + def : MipsPat<(i64 (extloadi8 addr:$src)), (LB64 addr:$src)>; + def : MipsPat<(i64 (extloadi16_a addr:$src)), (LH64 addr:$src)>; + def : MipsPat<(i64 (extloadi16_u addr:$src)), (ULH64 addr:$src)>; + def : MipsPat<(i64 (extloadi32_a addr:$src)), (LW64 addr:$src)>; + def : MipsPat<(i64 (extloadi32_u addr:$src)), (ULW64 addr:$src)>; + def : MipsPat<(zextloadi32_u addr:$a), (DSRL (DSLL (ULW64 addr:$a), 32), 32)>; } -let Predicates = [IsN64] in { - def : Pat<(i64 (extloadi1 addr:$src)), (LB64_P8 addr:$src)>; - def : Pat<(i64 (extloadi8 addr:$src)), (LB64_P8 addr:$src)>; - def : Pat<(i64 (extloadi16_a addr:$src)), (LH64_P8 addr:$src)>; - def : Pat<(i64 (extloadi16_u addr:$src)), (ULH64_P8 addr:$src)>; - def : Pat<(i64 (extloadi32_a addr:$src)), (LW64_P8 addr:$src)>; - def : Pat<(i64 (extloadi32_u addr:$src)), (ULW64_P8 addr:$src)>; - def : Pat<(zextloadi32_u addr:$a), (DSRL (DSLL (ULW64_P8 addr:$a), 32), 32)>; +let Predicates = [IsN64, HasStandardEncoding] in { + def : MipsPat<(i64 (extloadi1 addr:$src)), (LB64_P8 addr:$src)>; + def : MipsPat<(i64 (extloadi8 addr:$src)), (LB64_P8 addr:$src)>; + def : MipsPat<(i64 (extloadi16_a addr:$src)), (LH64_P8 addr:$src)>; + def : MipsPat<(i64 (extloadi16_u addr:$src)), (ULH64_P8 addr:$src)>; + def : MipsPat<(i64 (extloadi32_a addr:$src)), (LW64_P8 addr:$src)>; + def : MipsPat<(i64 (extloadi32_u addr:$src)), (ULW64_P8 addr:$src)>; + def : MipsPat<(zextloadi32_u addr:$a), + (DSRL (DSLL (ULW64_P8 addr:$a), 32), 32)>; } // hi/lo relocs -def : Pat<(MipsHi tglobaladdr:$in), 
(LUi64 tglobaladdr:$in)>; -def : Pat<(MipsHi tblockaddress:$in), (LUi64 tblockaddress:$in)>; -def : Pat<(MipsHi tjumptable:$in), (LUi64 tjumptable:$in)>; -def : Pat<(MipsHi tconstpool:$in), (LUi64 tconstpool:$in)>; -def : Pat<(MipsHi tglobaltlsaddr:$in), (LUi64 tglobaltlsaddr:$in)>; - -def : Pat<(MipsLo tglobaladdr:$in), (DADDiu ZERO_64, tglobaladdr:$in)>; -def : Pat<(MipsLo tblockaddress:$in), (DADDiu ZERO_64, tblockaddress:$in)>; -def : Pat<(MipsLo tjumptable:$in), (DADDiu ZERO_64, tjumptable:$in)>; -def : Pat<(MipsLo tconstpool:$in), (DADDiu ZERO_64, tconstpool:$in)>; -def : Pat<(MipsLo tglobaltlsaddr:$in), (DADDiu ZERO_64, tglobaltlsaddr:$in)>; - -def : Pat<(add CPU64Regs:$hi, (MipsLo tglobaladdr:$lo)), - (DADDiu CPU64Regs:$hi, tglobaladdr:$lo)>; -def : Pat<(add CPU64Regs:$hi, (MipsLo tblockaddress:$lo)), - (DADDiu CPU64Regs:$hi, tblockaddress:$lo)>; -def : Pat<(add CPU64Regs:$hi, (MipsLo tjumptable:$lo)), - (DADDiu CPU64Regs:$hi, tjumptable:$lo)>; -def : Pat<(add CPU64Regs:$hi, (MipsLo tconstpool:$lo)), - (DADDiu CPU64Regs:$hi, tconstpool:$lo)>; -def : Pat<(add CPU64Regs:$hi, (MipsLo tglobaltlsaddr:$lo)), - (DADDiu CPU64Regs:$hi, tglobaltlsaddr:$lo)>; +def : MipsPat<(MipsHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>; +def : MipsPat<(MipsHi tblockaddress:$in), (LUi64 tblockaddress:$in)>; +def : MipsPat<(MipsHi tjumptable:$in), (LUi64 tjumptable:$in)>; +def : MipsPat<(MipsHi tconstpool:$in), (LUi64 tconstpool:$in)>; +def : MipsPat<(MipsHi tglobaltlsaddr:$in), (LUi64 tglobaltlsaddr:$in)>; + +def : MipsPat<(MipsLo tglobaladdr:$in), (DADDiu ZERO_64, tglobaladdr:$in)>; +def : MipsPat<(MipsLo tblockaddress:$in), (DADDiu ZERO_64, tblockaddress:$in)>; +def : MipsPat<(MipsLo tjumptable:$in), (DADDiu ZERO_64, tjumptable:$in)>; +def : MipsPat<(MipsLo tconstpool:$in), (DADDiu ZERO_64, tconstpool:$in)>; +def : MipsPat<(MipsLo tglobaltlsaddr:$in), + (DADDiu ZERO_64, tglobaltlsaddr:$in)>; + +def : MipsPat<(add CPU64Regs:$hi, (MipsLo tglobaladdr:$lo)), + (DADDiu CPU64Regs:$hi, tglobaladdr:$lo)>; +def : MipsPat<(add CPU64Regs:$hi, (MipsLo tblockaddress:$lo)), + (DADDiu CPU64Regs:$hi, tblockaddress:$lo)>; +def : MipsPat<(add CPU64Regs:$hi, (MipsLo tjumptable:$lo)), + (DADDiu CPU64Regs:$hi, tjumptable:$lo)>; +def : MipsPat<(add CPU64Regs:$hi, (MipsLo tconstpool:$lo)), + (DADDiu CPU64Regs:$hi, tconstpool:$lo)>; +def : MipsPat<(add CPU64Regs:$hi, (MipsLo tglobaltlsaddr:$lo)), + (DADDiu CPU64Regs:$hi, tglobaltlsaddr:$lo)>; def : WrapperPat<tglobaladdr, DADDiu, CPU64Regs>; def : WrapperPat<tconstpool, DADDiu, CPU64Regs>; @@ -270,19 +297,22 @@ defm : SetgePats<CPU64Regs, SLT64, SLTu64>; defm : SetgeImmPats<CPU64Regs, SLTi64, SLTiu64>; // select MipsDynAlloc -def : Pat<(MipsDynAlloc addr:$f), (DynAlloc64 addr:$f)>, Requires<[IsN64]>; +def : MipsPat<(MipsDynAlloc addr:$f), (DynAlloc64 addr:$f)>, + Requires<[IsN64, HasStandardEncoding]>; // truncate -def : Pat<(i32 (trunc CPU64Regs:$src)), - (SLL (EXTRACT_SUBREG CPU64Regs:$src, sub_32), 0)>, Requires<[IsN64]>; +def : MipsPat<(i32 (trunc CPU64Regs:$src)), + (SLL (EXTRACT_SUBREG CPU64Regs:$src, sub_32), 0)>, + Requires<[IsN64, HasStandardEncoding]>; // 32-to-64-bit extension -def : Pat<(i64 (anyext CPURegs:$src)), (SLL64_32 CPURegs:$src)>; -def : Pat<(i64 (zext CPURegs:$src)), (DSRL (DSLL64_32 CPURegs:$src), 32)>; -def : Pat<(i64 (sext CPURegs:$src)), (SLL64_32 CPURegs:$src)>; +def : MipsPat<(i64 (anyext CPURegs:$src)), (SLL64_32 CPURegs:$src)>; +def : MipsPat<(i64 (zext CPURegs:$src)), (DSRL (DSLL64_32 CPURegs:$src), 32)>; +def : MipsPat<(i64 (sext 
CPURegs:$src)), (SLL64_32 CPURegs:$src)>; // Sign extend in register -def : Pat<(i64 (sext_inreg CPU64Regs:$src, i32)), (SLL64_64 CPU64Regs:$src)>; +def : MipsPat<(i64 (sext_inreg CPU64Regs:$src, i32)), + (SLL64_64 CPU64Regs:$src)>; -// bswap pattern -def : Pat<(bswap CPU64Regs:$rt), (DSHD (DSBH CPU64Regs:$rt))>; +// bswap pattern +def : MipsPat<(bswap CPU64Regs:$rt), (DSHD (DSBH CPU64Regs:$rt))>; diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index 8206cfc..00ff754 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -13,29 +13,29 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "mips-asm-printer" -#include "MipsAsmPrinter.h" #include "Mips.h" +#include "MipsAsmPrinter.h" #include "MipsInstrInfo.h" +#include "MipsMCInstLower.h" #include "InstPrinter/MipsInstPrinter.h" #include "MCTargetDesc/MipsBaseInfo.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/BasicBlock.h" -#include "llvm/Instructions.h" -#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/InlineAsm.h" #include "llvm/Instructions.h" -#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/TargetRegistry.h" #include "llvm/Target/Mangler.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -43,19 +43,6 @@ using namespace llvm; -void MipsAsmPrinter::EmitInstrWithMacroNoAT(const MachineInstr *MI) { - MCInst TmpInst; - - MCInstLowering.Lower(MI, TmpInst); - OutStreamer.EmitRawText(StringRef("\t.set\tmacro")); - if (MipsFI->getEmitNOAT()) - OutStreamer.EmitRawText(StringRef("\t.set\tat")); - OutStreamer.EmitInstruction(TmpInst); - if (MipsFI->getEmitNOAT()) - OutStreamer.EmitRawText(StringRef("\t.set\tnoat")); - OutStreamer.EmitRawText(StringRef("\t.set\tnomacro")); -} - bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) { MipsFI = MF.getInfo<MipsFunctionInfo>(); AsmPrinter::runOnMachineFunction(MF); @@ -71,84 +58,33 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } - unsigned Opc = MI->getOpcode(); - MCInst TmpInst0; - SmallVector<MCInst, 4> MCInsts; - - switch (Opc) { - case Mips::ULW: - case Mips::ULH: - case Mips::ULHu: - case Mips::USW: - case Mips::USH: - case Mips::ULW_P8: - case Mips::ULH_P8: - case Mips::ULHu_P8: - case Mips::USW_P8: - case Mips::USH_P8: - case Mips::ULD: - case Mips::ULW64: - case Mips::ULH64: - case Mips::ULHu64: - case Mips::USD: - case Mips::USW64: - case Mips::USH64: - case Mips::ULD_P8: - case Mips::ULW64_P8: - case Mips::ULH64_P8: - case Mips::ULHu64_P8: - case Mips::USD_P8: - case Mips::USW64_P8: - case Mips::USH64_P8: { - if (OutStreamer.hasRawTextSupport()) { - EmitInstrWithMacroNoAT(MI); - return; - } - - MCInstLowering.LowerUnalignedLoadStore(MI, MCInsts); - for (SmallVector<MCInst, 4>::iterator I = MCInsts.begin(); I - != MCInsts.end(); ++I) - OutStreamer.EmitInstruction(*I); - - return; - } - case Mips::CPRESTORE: { - const MachineOperand &MO = MI->getOperand(0);
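The hi/lo relocation patterns above split a symbol's address into %hi and %lo halves: LUi64 installs the high half, and DADDiu adds the low half. Because daddiu sign-extends its 16-bit immediate, %hi has to pre-compensate by adding 0x8000 before shifting. A minimal standalone sketch of that arithmetic (plain C++ with hypothetical hi/lo helpers, assumed semantics, not code from this patch):

    #include <cassert>
    #include <cstdint>

    // %lo is the raw low 16 bits; %hi rounds up whenever %lo is negative
    // as a signed 16-bit value, so that (hi << 16) plus sign-extended lo
    // reconstructs the original address exactly.
    uint16_t hi(uint32_t addr) { return (uint16_t)((addr + 0x8000u) >> 16); }
    uint16_t lo(uint32_t addr) { return (uint16_t)addr; }

    int main() {
      const uint32_t addrs[] = {0x0040ABCDu, 0x0040FFF0u};
      for (uint32_t addr : addrs) {
        // lui $2, %hi(addr); addiu $2, $2, %lo(addr)
        uint32_t reg = (uint32_t)hi(addr) << 16;
        reg += (uint32_t)(int32_t)(int16_t)lo(addr); // addiu sign-extends
        assert(reg == addr); // note %hi(0x0040FFF0) is 0x0041, not 0x0040
      }
      return 0;
    }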
- assert(MO.isImm() && "CPRESTORE's operand must be an immediate."); - int64_t Offset = MO.getImm(); - - if (OutStreamer.hasRawTextSupport()) { - if (!isInt<16>(Offset)) { - EmitInstrWithMacroNoAT(MI); + // Direct object specific instruction lowering + if (!OutStreamer.hasRawTextSupport()) + switch (MI->getOpcode()) { + case Mips::DSLL: + case Mips::DSRL: + case Mips::DSRA: + assert(MI->getNumOperands() == 3 && + "Invalid no. of machine operands for shift!"); + assert(MI->getOperand(2).isImm()); + int64_t Shift = MI->getOperand(2).getImm(); + if (Shift > 31) { + MCInst TmpInst0; + MCInstLowering.LowerLargeShift(MI, TmpInst0, Shift - 32); + OutStreamer.EmitInstruction(TmpInst0); return; } - } else { - MCInstLowering.LowerCPRESTORE(Offset, MCInsts); - - for (SmallVector<MCInst, 4>::iterator I = MCInsts.begin(); - I != MCInsts.end(); ++I) - OutStreamer.EmitInstruction(*I); - - return; + break; } - break; - } - case Mips::SETGP01: { - MCInstLowering.LowerSETGP01(MI, MCInsts); - - for (SmallVector<MCInst, 4>::iterator I = MCInsts.begin(); - I != MCInsts.end(); ++I) - OutStreamer.EmitInstruction(*I); - - return; - } - default: - break; - } + MachineBasicBlock::const_instr_iterator I = MI; + MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); - MCInstLowering.Lower(MI, TmpInst0); - OutStreamer.EmitInstruction(TmpInst0); + do { + MCInst TmpInst0; + MCInstLowering.Lower(I++, TmpInst0); + OutStreamer.EmitInstruction(TmpInst0); + } while ((I != E) && I->isInsideBundle()); } //===----------------------------------------------------------------------===// @@ -197,9 +133,9 @@ void MipsAsmPrinter::printSavedRegsBitmask(raw_ostream &O) { const MachineFrameInfo *MFI = MF->getFrameInfo(); const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); // size of stack area to which FP callee-saved regs are saved. - unsigned CPURegSize = Mips::CPURegsRegisterClass->getSize(); - unsigned FGR32RegSize = Mips::FGR32RegisterClass->getSize(); - unsigned AFGR64RegSize = Mips::AFGR64RegisterClass->getSize(); + unsigned CPURegSize = Mips::CPURegsRegClass.getSize(); + unsigned FGR32RegSize = Mips::FGR32RegClass.getSize(); + unsigned AFGR64RegSize = Mips::AFGR64RegClass.getSize(); bool HasAFGR64Reg = false; unsigned CSFPRegsSize = 0; unsigned i, e = CSI.size(); @@ -207,11 +143,11 @@ void MipsAsmPrinter::printSavedRegsBitmask(raw_ostream &O) { // Set FPU Bitmask. 
for (i = 0; i != e; ++i) { unsigned Reg = CSI[i].getReg(); - if (Mips::CPURegsRegisterClass->contains(Reg)) + if (Mips::CPURegsRegClass.contains(Reg)) break; unsigned RegNum = getMipsRegisterNumbering(Reg); - if (Mips::AFGR64RegisterClass->contains(Reg)) { + if (Mips::AFGR64RegClass.contains(Reg)) { FPUBitmask |= (3 << RegNum); CSFPRegsSize += AFGR64RegSize; HasAFGR64Reg = true; @@ -283,8 +219,15 @@ const char *MipsAsmPrinter::getCurrentABIString() const { } void MipsAsmPrinter::EmitFunctionEntryLabel() { - if (OutStreamer.hasRawTextSupport()) + if (OutStreamer.hasRawTextSupport()) { + if (Subtarget->inMips16Mode()) + OutStreamer.EmitRawText(StringRef("\t.set\tmips16")); + else + OutStreamer.EmitRawText(StringRef("\t.set\tnomips16")); + // leave out until FSF available gas has micromips changes + // OutStreamer.EmitRawText(StringRef("\t.set\tnomicromips")); OutStreamer.EmitRawText("\t.ent\t" + Twine(CurrentFnSym->getName())); + } OutStreamer.EmitLabel(CurrentFnSym); } @@ -295,10 +238,6 @@ void MipsAsmPrinter::EmitFunctionBodyStart() { emitFrameDirective(); - bool EmitCPLoad = (MF->getTarget().getRelocationModel() == Reloc::PIC_) && - Subtarget->isABI_O32() && MipsFI->globalBaseRegSet() && - MipsFI->globalBaseRegFixed(); - if (OutStreamer.hasRawTextSupport()) { SmallString<128> Str; raw_svector_ostream OS(Str); @@ -306,20 +245,9 @@ void MipsAsmPrinter::EmitFunctionBodyStart() { OutStreamer.EmitRawText(OS.str()); OutStreamer.EmitRawText(StringRef("\t.set\tnoreorder")); - - // Emit .cpload directive if needed. - if (EmitCPLoad) - OutStreamer.EmitRawText(StringRef("\t.cpload\t$25")); - OutStreamer.EmitRawText(StringRef("\t.set\tnomacro")); if (MipsFI->getEmitNOAT()) OutStreamer.EmitRawText(StringRef("\t.set\tnoat")); - } else if (EmitCPLoad) { - SmallVector<MCInst, 4> MCInsts; - MCInstLowering.LowerCPLOAD(MCInsts); - for (SmallVector<MCInst, 4>::iterator I = MCInsts.begin(); - I != MCInsts.end(); ++I) - OutStreamer.EmitInstruction(*I); } } @@ -382,14 +310,99 @@ bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock* } // Print out an operand for an inline asm expression. -bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, +bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant,const char *ExtraCode, raw_ostream &O) { // Does this asm operand have a single letter operand modifier? - if (ExtraCode && ExtraCode[0]) - return true; // Unknown modifier. + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. 
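In the printSavedRegsBitmask hunk above, each saved single-precision register contributes one bit to the FPU bitmask, while a saved AFGR64 double occupies an even/odd pair of single-precision registers, which is why it ORs in 3 << RegNum. A standalone model of that computation (an assumed simplification, not the real function):

    #include <cassert>
    #include <cstdint>

    // One bit per saved $f register; a double covers $f(2N) and $f(2N+1).
    uint32_t fpuBitmask(const unsigned *RegNums, const bool *IsDouble,
                        unsigned N) {
      uint32_t Mask = 0;
      for (unsigned I = 0; I < N; ++I)
        Mask |= (IsDouble[I] ? 3u : 1u) << RegNums[I];
      return Mask;
    }

    int main() {
      unsigned Regs[] = {20, 22};      // $f20, and the $f22/$f23 pair
      bool Dbl[] = {false, true};
      assert(fpuBitmask(Regs, Dbl, 2) == ((1u << 20) | (3u << 22)));
      return 0;
    }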
- printOperand(MI, OpNo, O); + const MachineOperand &MO = MI->getOperand(OpNum); + switch (ExtraCode[0]) { + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI,OpNum,AsmVariant,ExtraCode,O); + case 'X': // hex const int + if ((MO.getType()) != MachineOperand::MO_Immediate) + return true; + O << "0x" << StringRef(utohexstr(MO.getImm())).lower(); + return false; + case 'x': // hex const int (low 16 bits) + if ((MO.getType()) != MachineOperand::MO_Immediate) + return true; + O << "0x" << StringRef(utohexstr(MO.getImm() & 0xffff)).lower(); + return false; + case 'd': // decimal const int + if ((MO.getType()) != MachineOperand::MO_Immediate) + return true; + O << MO.getImm(); + return false; + case 'm': // decimal const int minus 1 + if ((MO.getType()) != MachineOperand::MO_Immediate) + return true; + O << MO.getImm() - 1; + return false; + case 'z': { + // $0 if zero, regular printing otherwise + if (MO.getType() != MachineOperand::MO_Immediate) + return true; + int64_t Val = MO.getImm(); + if (Val) + O << Val; + else + O << "$0"; + return false; + } + case 'D': // Second part of a double word register operand + case 'L': // Low order register of a double word register operand + case 'M': // High order register of a double word register operand + { + if (OpNum == 0) + return true; + const MachineOperand &FlagsOP = MI->getOperand(OpNum - 1); + if (!FlagsOP.isImm()) + return true; + unsigned Flags = FlagsOP.getImm(); + unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags); + // Number of registers represented by this operand. We are looking + // for 2 for 32 bit mode and 1 for 64 bit mode. + if (NumVals != 2) { + if (Subtarget->isGP64bit() && NumVals == 1 && MO.isReg()) { + unsigned Reg = MO.getReg(); + O << '$' << MipsInstPrinter::getRegisterName(Reg); + return false; + } + return true; + } + + unsigned RegOp = OpNum; + if (!Subtarget->isGP64bit()){ + // Endianness reverses which register holds the high or low value + // between M and L. + switch(ExtraCode[0]) { + case 'M': + RegOp = (Subtarget->isLittle()) ? OpNum + 1 : OpNum; + break; + case 'L': + RegOp = (Subtarget->isLittle()) ? OpNum : OpNum + 1; + break; + case 'D': // Always the second part + RegOp = OpNum + 1; + } + if (RegOp >= MI->getNumOperands()) + return true; + const MachineOperand &MO = MI->getOperand(RegOp); + if (!MO.isReg()) + return true; + unsigned Reg = MO.getReg(); + O << '$' << MipsInstPrinter::getRegisterName(Reg); + return false; + } + } + } + + printOperand(MI, OpNum, O); return false; } @@ -398,11 +411,12 @@ bool MipsAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, const char *ExtraCode, raw_ostream &O) { if (ExtraCode && ExtraCode[0]) - return true; // Unknown modifier. + return true; // Unknown modifier.
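The 'M', 'L' and 'D' modifiers handled above pick one register out of a 32-bit register pair, and on 32-bit targets endianness decides which half of the pair holds the high-order word. A standalone sketch of just that index selection (assumed model; pairIndex is a hypothetical helper, not part of the patch):

    #include <cassert>

    // Offset (0 or 1) of the register to print, relative to the first
    // register of the pair.
    unsigned pairIndex(char Modifier, bool IsLittleEndian) {
      switch (Modifier) {
      case 'M': return IsLittleEndian ? 1 : 0; // high-order word
      case 'L': return IsLittleEndian ? 0 : 1; // low-order word
      case 'D': return 1;                      // always the second register
      }
      return 0;
    }

    int main() {
      assert(pairIndex('M', true) == 1);  // little-endian: high word is 2nd
      assert(pairIndex('M', false) == 0); // big-endian: high word is 1st
      assert(pairIndex('D', true) == 1 && pairIndex('D', false) == 1);
      return 0;
    }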
const MachineOperand &MO = MI->getOperand(OpNum); assert(MO.isReg() && "unexpected inline asm memory operand"); O << "0($" << MipsInstPrinter::getRegisterName(MO.getReg()) << ")"; + return false; } @@ -450,7 +464,7 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum, break; case MachineOperand::MO_BlockAddress: { - MCSymbol* BA = GetBlockAddressSymbol(MO.getBlockAddress()); + MCSymbol *BA = GetBlockAddressSymbol(MO.getBlockAddress()); O << BA->getName(); break; } @@ -511,7 +525,7 @@ printMemOperandEA(const MachineInstr *MI, int opNum, raw_ostream &O) { void MipsAsmPrinter:: printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O, const char *Modifier) { - const MachineOperand& MO = MI->getOperand(opNum); + const MachineOperand &MO = MI->getOperand(opNum); O << Mips::MipsFCCToString((Mips::CondCode)MO.getImm()); } diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td index 4b7e1d3..8aadefd 100644 --- a/lib/Target/Mips/MipsCallingConv.td +++ b/lib/Target/Mips/MipsCallingConv.td @@ -145,6 +145,58 @@ def RetCC_MipsEABI : CallingConv<[ ]>; //===----------------------------------------------------------------------===// +// Mips FastCC Calling Convention +//===----------------------------------------------------------------------===// +def CC_MipsO32_FastCC : CallingConv<[ + // f64 arguments are passed in double-precision floating-point registers. + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7, D8, D9]>>, + + // Stack parameter slots for f64 are 64-bit doublewords and 8-byte aligned. + CCIfType<[f64], CCAssignToStack<8, 8>> +]>; + +def CC_MipsN_FastCC : CallingConv<[ + // Integer arguments are passed in integer registers. + CCIfType<[i64], CCAssignToReg<[A0_64, A1_64, A2_64, A3_64, T0_64, T1_64, + T2_64, T3_64, T4_64, T5_64, T6_64, T7_64, + T8_64, V1_64]>>, + + // f64 arguments are passed in double-precision floating-point registers. + CCIfType<[f64], CCAssignToReg<[D0_64, D1_64, D2_64, D3_64, D4_64, D5_64, + D6_64, D7_64, D8_64, D9_64, D10_64, D11_64, + D12_64, D13_64, D14_64, D15_64, D16_64, D17_64, + D18_64, D19_64]>>, + + // Stack parameter slots for i64 and f64 are 64-bit doublewords and + // 8-byte aligned. + CCIfType<[i64, f64], CCAssignToStack<8, 8>> +]>; + +def CC_Mips_FastCC : CallingConv<[ + // Handles byval parameters. + CCIfByVal<CCPassByVal<4, 4>>, + + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Integer arguments are passed in integer registers. All scratch registers, + // except for AT, V0 and T9, are available to be used as argument registers. + CCIfType<[i32], CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, T6, + T7, T8, V1]>>, + + // f32 arguments are passed in single-precision floating-point registers. + CCIfType<[f32], CCAssignToReg<[F0, F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, + F11, F12, F13, F14, F15, F16, F17, F18, F19]>>, + + // Stack parameter slots for i32 and f32 are 32-bit words and 4-byte aligned.
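CC_Mips_FastCC above hands i32 arguments out of a 14-entry scratch-register list and, once those run out, falls back to the 4-byte, 4-aligned stack slots assigned by the CCAssignToStack line that follows this aside. A rough standalone model of that register-then-stack order (illustrative only, not TableGen semantics):

    #include <cstdio>

    int main() {
      const char *Regs[] = {"a0", "a1", "a2", "a3", "t0", "t1", "t2",
                            "t3", "t4", "t5", "t6", "t7", "t8", "v1"};
      const unsigned NumRegs = sizeof(Regs) / sizeof(Regs[0]);
      unsigned NextReg = 0, NextOff = 0;
      for (unsigned Arg = 0; Arg < 16; ++Arg) { // 16 hypothetical i32 args
        if (NextReg < NumRegs)
          std::printf("arg%u -> $%s\n", Arg, Regs[NextReg++]);
        else {
          std::printf("arg%u -> %u($sp)\n", Arg, NextOff);
          NextOff += 4; // CCAssignToStack<4, 4>: word-sized, word-aligned
        }
      }
      return 0;
    }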
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + + CCIfSubtarget<"isABI_EABI()", CCDelegateTo<CC_MipsEABI>>, + CCIfSubtarget<"isABI_O32()", CCDelegateTo<CC_MipsO32_FastCC>>, + CCDelegateTo<CC_MipsN_FastCC> +]>; + +//===----------------------------------------------------------------------===// // Mips Calling Convention Dispatch //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp index 7d81902..cb7022b 100644 --- a/lib/Target/Mips/MipsCodeEmitter.cpp +++ b/lib/Target/Mips/MipsCodeEmitter.cpp @@ -145,8 +145,8 @@ bool MipsCodeEmitter::runOnMachineFunction(MachineFunction &MF) { for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); MBB != E; ++MBB){ MCE.StartMachineBasicBlock(MBB); - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); - I != E; ++I) + for (MachineBasicBlock::instr_iterator I = MBB->instr_begin(), + E = MBB->instr_end(); I != E; ++I) emitInstruction(*I); } } while (MCE.finishFunction(MF)); @@ -258,7 +258,7 @@ void MipsCodeEmitter::emitGlobalAddressUnaligned(const GlobalValue *GV, void MipsCodeEmitter:: emitExternalSymbolAddress(const char *ES, unsigned Reloc) const { MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(), - Reloc, ES, 0, 0, false)); + Reloc, ES, 0, 0)); } void MipsCodeEmitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc) const { diff --git a/lib/Target/Mips/MipsCondMov.td b/lib/Target/Mips/MipsCondMov.td index da33680..b12b1f2 100644 --- a/lib/Target/Mips/MipsCondMov.td +++ b/lib/Target/Mips/MipsCondMov.td @@ -61,41 +61,54 @@ multiclass MovzPats0<RegisterClass CRC, RegisterClass DRC, Instruction MOVZInst, Instruction SLTOp, Instruction SLTuOp, Instruction SLTiOp, Instruction SLTiuOp> { - def : Pat<(select (i32 (setge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), - (MOVZInst DRC:$T, (SLTOp CRC:$lhs, CRC:$rhs), DRC:$F)>; - def : Pat<(select (i32 (setuge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), - (MOVZInst DRC:$T, (SLTuOp CRC:$lhs, CRC:$rhs), DRC:$F)>; - def : Pat<(select (i32 (setge CRC:$lhs, immSExt16:$rhs)), DRC:$T, DRC:$F), - (MOVZInst DRC:$T, (SLTiOp CRC:$lhs, immSExt16:$rhs), DRC:$F)>; - def : Pat<(select (i32 (setuge CRC:$lh, immSExt16:$rh)), DRC:$T, DRC:$F), - (MOVZInst DRC:$T, (SLTiuOp CRC:$lh, immSExt16:$rh), DRC:$F)>; - def : Pat<(select (i32 (setle CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), - (MOVZInst DRC:$T, (SLTOp CRC:$rhs, CRC:$lhs), DRC:$F)>; - def : Pat<(select (i32 (setule CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), - (MOVZInst DRC:$T, (SLTuOp CRC:$rhs, CRC:$lhs), DRC:$F)>; + def : MipsPat<(select (i32 (setge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), + (MOVZInst DRC:$T, (SLTOp CRC:$lhs, CRC:$rhs), DRC:$F)>; + def : MipsPat< + (select (i32 (setuge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), + (MOVZInst DRC:$T, (SLTuOp CRC:$lhs, CRC:$rhs), DRC:$F)>; + def : MipsPat< + (select (i32 (setge CRC:$lhs, immSExt16:$rhs)), DRC:$T, DRC:$F), + (MOVZInst DRC:$T, (SLTiOp CRC:$lhs, immSExt16:$rhs), DRC:$F)>; + def : MipsPat< + (select (i32 (setuge CRC:$lh, immSExt16:$rh)), DRC:$T, DRC:$F), + (MOVZInst DRC:$T, (SLTiuOp CRC:$lh, immSExt16:$rh), DRC:$F)>; + def : MipsPat< + (select (i32 (setle CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), + (MOVZInst DRC:$T, (SLTOp CRC:$rhs, CRC:$lhs), DRC:$F)>; + def : MipsPat< + (select (i32 (setule CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), + (MOVZInst DRC:$T, (SLTuOp CRC:$rhs, CRC:$lhs), DRC:$F)>; } multiclass MovzPats1<RegisterClass CRC, RegisterClass DRC, Instruction MOVZInst, Instruction XOROp> { - 
def : Pat<(select (i32 (seteq CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), - (MOVZInst DRC:$T, (XOROp CRC:$lhs, CRC:$rhs), DRC:$F)>; - def : Pat<(select (i32 (seteq CRC:$lhs, 0)), DRC:$T, DRC:$F), - (MOVZInst DRC:$T, CRC:$lhs, DRC:$F)>; + def : MipsPat<(select (i32 (seteq CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), + (MOVZInst DRC:$T, (XOROp CRC:$lhs, CRC:$rhs), DRC:$F)>; + def : MipsPat<(select (i32 (seteq CRC:$lhs, 0)), DRC:$T, DRC:$F), + (MOVZInst DRC:$T, CRC:$lhs, DRC:$F)>; +} + +multiclass MovzPats2<RegisterClass CRC, RegisterClass DRC, + Instruction MOVZInst, Instruction XORiOp> { + def : MipsPat< + (select (i32 (seteq CRC:$lhs, immZExt16:$uimm16)), DRC:$T, DRC:$F), + (MOVZInst DRC:$T, (XORiOp CRC:$lhs, immZExt16:$uimm16), DRC:$F)>; } multiclass MovnPats<RegisterClass CRC, RegisterClass DRC, Instruction MOVNInst, Instruction XOROp> { - def : Pat<(select (i32 (setne CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), - (MOVNInst DRC:$T, (XOROp CRC:$lhs, CRC:$rhs), DRC:$F)>; - def : Pat<(select CRC:$cond, DRC:$T, DRC:$F), - (MOVNInst DRC:$T, CRC:$cond, DRC:$F)>; - def : Pat<(select (i32 (setne CRC:$lhs, 0)),DRC:$T, DRC:$F), - (MOVNInst DRC:$T, CRC:$lhs, DRC:$F)>; + def : MipsPat<(select (i32 (setne CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), + (MOVNInst DRC:$T, (XOROp CRC:$lhs, CRC:$rhs), DRC:$F)>; + def : MipsPat<(select CRC:$cond, DRC:$T, DRC:$F), + (MOVNInst DRC:$T, CRC:$cond, DRC:$F)>; + def : MipsPat<(select (i32 (setne CRC:$lhs, 0)),DRC:$T, DRC:$F), + (MOVNInst DRC:$T, CRC:$lhs, DRC:$F)>; } // Instantiation of instructions. def MOVZ_I_I : CondMovIntInt<CPURegs, CPURegs, 0x0a, "movz">; -let Predicates = [HasMips64],DecoderNamespace = "Mips64" in { +let Predicates = [HasMips64, HasStandardEncoding], + DecoderNamespace = "Mips64" in { def MOVZ_I_I64 : CondMovIntInt<CPURegs, CPU64Regs, 0x0a, "movz">; def MOVZ_I64_I : CondMovIntInt<CPU64Regs, CPURegs, 0x0a, "movz"> { let isCodeGenOnly = 1; @@ -106,7 +119,8 @@ let Predicates = [HasMips64],DecoderNamespace = "Mips64" in { } def MOVN_I_I : CondMovIntInt<CPURegs, CPURegs, 0x0b, "movn">; -let Predicates = [HasMips64],DecoderNamespace = "Mips64" in { +let Predicates = [HasMips64, HasStandardEncoding], + DecoderNamespace = "Mips64" in { def MOVN_I_I64 : CondMovIntInt<CPURegs, CPU64Regs, 0x0b, "movn">; def MOVN_I64_I : CondMovIntInt<CPU64Regs, CPURegs, 0x0b, "movn"> { let isCodeGenOnly = 1; @@ -118,21 +132,22 @@ let Predicates = [HasMips64],DecoderNamespace = "Mips64" in { def MOVZ_I_S : CondMovIntFP<CPURegs, FGR32, 16, 18, "movz.s">; def MOVZ_I64_S : CondMovIntFP<CPU64Regs, FGR32, 16, 18, "movz.s">, - Requires<[HasMips64]> { + Requires<[HasMips64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } def MOVN_I_S : CondMovIntFP<CPURegs, FGR32, 16, 19, "movn.s">; def MOVN_I64_S : CondMovIntFP<CPU64Regs, FGR32, 16, 19, "movn.s">, - Requires<[HasMips64]> { + Requires<[HasMips64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } -let Predicates = [NotFP64bit] in { +let Predicates = [NotFP64bit, HasStandardEncoding] in { def MOVZ_I_D32 : CondMovIntFP<CPURegs, AFGR64, 17, 18, "movz.d">; def MOVN_I_D32 : CondMovIntFP<CPURegs, AFGR64, 17, 19, "movn.d">; } -let Predicates = [IsFP64bit],DecoderNamespace = "Mips64" in { +let Predicates = [IsFP64bit, HasStandardEncoding], + DecoderNamespace = "Mips64" in { def MOVZ_I_D64 : CondMovIntFP<CPURegs, FGR64, 17, 18, "movz.d">; def MOVZ_I64_D64 : CondMovIntFP<CPU64Regs, FGR64, 17, 18, "movz.d"> { let isCodeGenOnly = 1; @@ -145,24 +160,25 @@ let Predicates = [IsFP64bit],DecoderNamespace = "Mips64" in { def MOVT_I : 
CondMovFPInt<CPURegs, MipsCMovFP_T, 1, "movt">; def MOVT_I64 : CondMovFPInt<CPU64Regs, MipsCMovFP_T, 1, "movt">, - Requires<[HasMips64]> { + Requires<[HasMips64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } def MOVF_I : CondMovFPInt<CPURegs, MipsCMovFP_F, 0, "movf">; def MOVF_I64 : CondMovFPInt<CPU64Regs, MipsCMovFP_F, 0, "movf">, - Requires<[HasMips64]> { + Requires<[HasMips64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } def MOVT_S : CondMovFPFP<FGR32, MipsCMovFP_T, 16, 1, "movt.s">; def MOVF_S : CondMovFPFP<FGR32, MipsCMovFP_F, 16, 0, "movf.s">; -let Predicates = [NotFP64bit] in { +let Predicates = [NotFP64bit, HasStandardEncoding] in { def MOVT_D32 : CondMovFPFP<AFGR64, MipsCMovFP_T, 17, 1, "movt.d">; def MOVF_D32 : CondMovFPFP<AFGR64, MipsCMovFP_F, 17, 0, "movf.d">; } -let Predicates = [IsFP64bit], DecoderNamespace = "Mips64" in { +let Predicates = [IsFP64bit, HasStandardEncoding], + DecoderNamespace = "Mips64" in { def MOVT_D64 : CondMovFPFP<FGR64, MipsCMovFP_T, 17, 1, "movt.d">; def MOVF_D64 : CondMovFPFP<FGR64, MipsCMovFP_F, 17, 0, "movf.d">; } @@ -170,7 +186,8 @@ let Predicates = [IsFP64bit], DecoderNamespace = "Mips64" in { // Instantiation of conditional move patterns. defm : MovzPats0<CPURegs, CPURegs, MOVZ_I_I, SLT, SLTu, SLTi, SLTiu>; defm : MovzPats1<CPURegs, CPURegs, MOVZ_I_I, XOR>; -let Predicates = [HasMips64] in { +defm : MovzPats2<CPURegs, CPURegs, MOVZ_I_I, XORi>; +let Predicates = [HasMips64, HasStandardEncoding] in { defm : MovzPats0<CPURegs, CPU64Regs, MOVZ_I_I64, SLT, SLTu, SLTi, SLTiu>; defm : MovzPats0<CPU64Regs, CPURegs, MOVZ_I_I, SLT64, SLTu64, SLTi64, SLTiu64>; @@ -179,10 +196,13 @@ let Predicates = [HasMips64] in { defm : MovzPats1<CPURegs, CPU64Regs, MOVZ_I_I64, XOR>; defm : MovzPats1<CPU64Regs, CPURegs, MOVZ_I64_I, XOR64>; defm : MovzPats1<CPU64Regs, CPU64Regs, MOVZ_I64_I64, XOR64>; + defm : MovzPats2<CPURegs, CPU64Regs, MOVZ_I_I64, XORi>; + defm : MovzPats2<CPU64Regs, CPURegs, MOVZ_I64_I, XORi64>; + defm : MovzPats2<CPU64Regs, CPU64Regs, MOVZ_I64_I64, XORi64>; } defm : MovnPats<CPURegs, CPURegs, MOVN_I_I, XOR>; -let Predicates = [HasMips64] in { +let Predicates = [HasMips64, HasStandardEncoding] in { defm : MovnPats<CPURegs, CPU64Regs, MOVN_I_I64, XOR>; defm : MovnPats<CPU64Regs, CPURegs, MOVN_I64_I, XOR64>; defm : MovnPats<CPU64Regs, CPU64Regs, MOVN_I64_I64, XOR64>; @@ -191,19 +211,19 @@ let Predicates = [HasMips64] in { defm : MovzPats0<CPURegs, FGR32, MOVZ_I_S, SLT, SLTu, SLTi, SLTiu>; defm : MovzPats1<CPURegs, FGR32, MOVZ_I_S, XOR>; defm : MovnPats<CPURegs, FGR32, MOVN_I_S, XOR>; -let Predicates = [HasMips64] in { +let Predicates = [HasMips64, HasStandardEncoding] in { defm : MovzPats0<CPU64Regs, FGR32, MOVZ_I_S, SLT64, SLTu64, SLTi64, SLTiu64>; defm : MovzPats1<CPU64Regs, FGR32, MOVZ_I64_S, XOR64>; defm : MovnPats<CPU64Regs, FGR32, MOVN_I64_S, XOR64>; } -let Predicates = [NotFP64bit] in { +let Predicates = [NotFP64bit, HasStandardEncoding] in { defm : MovzPats0<CPURegs, AFGR64, MOVZ_I_D32, SLT, SLTu, SLTi, SLTiu>; defm : MovzPats1<CPURegs, AFGR64, MOVZ_I_D32, XOR>; defm : MovnPats<CPURegs, AFGR64, MOVN_I_D32, XOR>; } -let Predicates = [IsFP64bit] in { +let Predicates = [IsFP64bit, HasStandardEncoding] in { defm : MovzPats0<CPURegs, FGR64, MOVZ_I_D64, SLT, SLTu, SLTi, SLTiu>; defm : MovzPats0<CPU64Regs, FGR64, MOVZ_I_D64, SLT64, SLTu64, SLTi64, SLTiu64>; diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp index debf2f1..2bba8a3 100644 --- a/lib/Target/Mips/MipsDelaySlotFiller.cpp 
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -36,12 +36,21 @@ static cl::opt<bool> EnableDelaySlotFiller( cl::desc("Fill the Mips delay slots with useful instructions."), cl::Hidden); +// This option can be used to silence complaints by machine verifier passes. +static cl::opt<bool> SkipDelaySlotFiller( + "skip-mips-delay-filler", + cl::init(false), + cl::desc("Skip MIPS' delay slot filling pass."), + cl::Hidden); + namespace { struct Filler : public MachineFunctionPass { + typedef MachineBasicBlock::instr_iterator InstrIter; + typedef MachineBasicBlock::reverse_instr_iterator ReverseInstrIter; TargetMachine &TM; const TargetInstrInfo *TII; - MachineBasicBlock::iterator LastFiller; + InstrIter LastFiller; static char ID; Filler(TargetMachine &tm) @@ -53,6 +62,9 @@ namespace { bool runOnMachineBasicBlock(MachineBasicBlock &MBB); bool runOnMachineFunction(MachineFunction &F) { + if (SkipDelaySlotFiller) + return false; + bool Changed = false; for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) @@ -61,27 +73,27 @@ } bool isDelayFiller(MachineBasicBlock &MBB, - MachineBasicBlock::iterator candidate); + InstrIter candidate); - void insertCallUses(MachineBasicBlock::iterator MI, - SmallSet<unsigned, 32>& RegDefs, - SmallSet<unsigned, 32>& RegUses); + void insertCallUses(InstrIter MI, + SmallSet<unsigned, 32> &RegDefs, + SmallSet<unsigned, 32> &RegUses); - void insertDefsUses(MachineBasicBlock::iterator MI, - SmallSet<unsigned, 32>& RegDefs, - SmallSet<unsigned, 32>& RegUses); + void insertDefsUses(InstrIter MI, + SmallSet<unsigned, 32> &RegDefs, + SmallSet<unsigned, 32> &RegUses); - bool IsRegInSet(SmallSet<unsigned, 32>& RegSet, + bool IsRegInSet(SmallSet<unsigned, 32> &RegSet, unsigned Reg); - bool delayHasHazard(MachineBasicBlock::iterator candidate, + bool delayHasHazard(InstrIter candidate, bool &sawLoad, bool &sawStore, SmallSet<unsigned, 32> &RegDefs, SmallSet<unsigned, 32> &RegUses); bool - findDelayInstr(MachineBasicBlock &MBB, MachineBasicBlock::iterator slot, - MachineBasicBlock::iterator &Filler); + findDelayInstr(MachineBasicBlock &MBB, InstrIter slot, + InstrIter &Filler); }; @@ -93,14 +105,14 @@ namespace { bool Filler:: runOnMachineBasicBlock(MachineBasicBlock &MBB) { bool Changed = false; - LastFiller = MBB.end(); + LastFiller = MBB.instr_end(); - for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) + for (InstrIter I = MBB.instr_begin(); I != MBB.instr_end(); ++I) if (I->hasDelaySlot()) { ++FilledSlots; Changed = true; - MachineBasicBlock::iterator D; + InstrIter D; if (EnableDelaySlotFiller && findDelayInstr(MBB, I, D)) { MBB.splice(llvm::next(I), &MBB, D); @@ -111,6 +123,10 @@ runOnMachineBasicBlock(MachineBasicBlock &MBB) { // Record the filler instruction that filled the delay slot. // The instruction after it will be visited in the next iteration. LastFiller = ++I; + + // Set InsideBundle bit so that the machine verifier doesn't expect this + // instruction to be a terminator.
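The InsideBundle marking described in the comment above (and applied on the next line) is what lets the branch and its filler travel as one unit; the bundle-aware do/while loop added to MipsAsmPrinter::EmitInstruction earlier in this patch then emits the whole bundle when it reaches its head. A toy model of that emission loop (assumed behavior, standalone C++, not LLVM code):

    #include <cstdio>
    #include <vector>

    struct MI { const char *Text; bool InsideBundle; };

    // Emit the instruction at I plus every following instruction that is
    // marked as part of the same bundle.
    void emitBundle(const std::vector<MI> &Block, size_t I) {
      do
        std::printf("\t%s\n", Block[I].Text);
      while (++I < Block.size() && Block[I].InsideBundle);
    }

    int main() {
      std::vector<MI> Block = {{"jr\t$ra", false},
                               {"addiu\t$sp, $sp, 32", true}};
      emitBundle(Block, 0); // prints the branch, then its bundled filler
      return 0;
    }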
+ LastFiller->setIsInsideBundle(); } return Changed; @@ -123,8 +139,8 @@ FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) { } bool Filler::findDelayInstr(MachineBasicBlock &MBB, - MachineBasicBlock::iterator slot, - MachineBasicBlock::iterator &Filler) { + InstrIter slot, + InstrIter &Filler) { SmallSet<unsigned, 32> RegDefs; SmallSet<unsigned, 32> RegUses; @@ -133,13 +149,13 @@ bool Filler::findDelayInstr(MachineBasicBlock &MBB, bool sawLoad = false; bool sawStore = false; - for (MachineBasicBlock::reverse_iterator I(slot); I != MBB.rend(); ++I) { + for (ReverseInstrIter I(slot); I != MBB.instr_rend(); ++I) { // skip debug value if (I->isDebugValue()) continue; // Convert to forward iterator. - MachineBasicBlock::iterator FI(llvm::next(I).base()); + InstrIter FI(llvm::next(I).base()); if (I->hasUnmodeledSideEffects() || I->isInlineAsm() @@ -165,7 +181,7 @@ bool Filler::findDelayInstr(MachineBasicBlock &MBB, return false; } -bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate, +bool Filler::delayHasHazard(InstrIter candidate, bool &sawLoad, bool &sawStore, SmallSet<unsigned, 32> &RegDefs, SmallSet<unsigned, 32> &RegUses) { @@ -213,9 +229,9 @@ bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate, } // Insert Defs and Uses of MI into the sets RegDefs and RegUses. -void Filler::insertDefsUses(MachineBasicBlock::iterator MI, - SmallSet<unsigned, 32>& RegDefs, - SmallSet<unsigned, 32>& RegUses) { +void Filler::insertDefsUses(InstrIter MI, + SmallSet<unsigned, 32> &RegDefs, + SmallSet<unsigned, 32> &RegUses) { // If MI is a call or return, just examine the explicit non-variadic operands. MCInstrDesc MCID = MI->getDesc(); unsigned e = MI->isCall() || MI->isReturn() ? MCID.getNumOperands() : @@ -240,14 +256,11 @@ void Filler::insertDefsUses(MachineBasicBlock::iterator MI, } //returns true if the Reg or its alias is in the RegSet. -bool Filler::IsRegInSet(SmallSet<unsigned, 32>& RegSet, unsigned Reg) { - if (RegSet.count(Reg)) - return true; - // check Aliased Registers - for (const uint16_t *Alias = TM.getRegisterInfo()->getAliasSet(Reg); - *Alias; ++Alias) - if (RegSet.count(*Alias)) +bool Filler::IsRegInSet(SmallSet<unsigned, 32> &RegSet, unsigned Reg) { + // Check Reg and all aliased Registers. + for (MCRegAliasIterator AI(Reg, TM.getRegisterInfo(), true); + AI.isValid(); ++AI) + if (RegSet.count(*AI)) return true; - return false; } diff --git a/lib/Target/Mips/MipsEmitGPRestore.cpp b/lib/Target/Mips/MipsEmitGPRestore.cpp deleted file mode 100644 index 119d1a8..0000000 --- a/lib/Target/Mips/MipsEmitGPRestore.cpp +++ /dev/null @@ -1,97 +0,0 @@ -//===-- MipsEmitGPRestore.cpp - Emit GP Restore Instruction ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass emits instructions that restore $gp right -// after jalr instructions. 
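findDelayInstr above walks backwards from the delay-slot branch, accumulating the registers defined and used so far and rejecting candidates that would introduce a hazard; the real pass also tracks loads, stores and other side effects. A simplified standalone model of that scan (assumed, with register names as strings):

    #include <cstdio>
    #include <set>
    #include <string>
    #include <vector>

    struct Instr {
      std::string Text;
      std::set<std::string> Defs, Uses;
      bool HasSideEffects;
    };

    // Index of a safe filler for the slot of Instrs[Slot], or -1 if none
    // is found (the pass would then leave a NOP in the delay slot).
    int findDelayInstr(const std::vector<Instr> &Instrs, int Slot) {
      std::set<std::string> RegDefs(Instrs[Slot].Defs);
      std::set<std::string> RegUses(Instrs[Slot].Uses);
      for (int I = Slot - 1; I >= 0; --I) {
        const Instr &C = Instrs[I];
        bool Hazard = C.HasSideEffects;
        for (const std::string &D : C.Defs)  // writes a reg that is read,
          Hazard |= RegUses.count(D) > 0 ||  // or rewritten, below the
                    RegDefs.count(D) > 0;    // candidate
        for (const std::string &U : C.Uses)  // reads a reg defined below
          Hazard |= RegDefs.count(U) > 0;    // the candidate
        if (!Hazard)
          return I;
        // Not movable: remember what it touches and keep scanning upward.
        RegDefs.insert(C.Defs.begin(), C.Defs.end());
        RegUses.insert(C.Uses.begin(), C.Uses.end());
      }
      return -1;
    }

    int main() {
      std::vector<Instr> B = {
        {"addu $t0, $a0, $a1", {"$t0"}, {"$a0", "$a1"}, false},
        {"lw $t1, 0($sp)", {"$t1"}, {"$sp"}, false},
        {"jr $ra", {}, {"$ra"}, false},
      };
      std::printf("filler: %d\n", findDelayInstr(B, 2)); // picks 1, the lw
      return 0;
    }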
-// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "emit-gp-restore" - -#include "Mips.h" -#include "MipsTargetMachine.h" -#include "MipsMachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/ADT/Statistic.h" - -using namespace llvm; - -namespace { - struct Inserter : public MachineFunctionPass { - - TargetMachine &TM; - const TargetInstrInfo *TII; - - static char ID; - Inserter(TargetMachine &tm) - : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { } - - virtual const char *getPassName() const { - return "Mips Emit GP Restore"; - } - - bool runOnMachineFunction(MachineFunction &F); - }; - char Inserter::ID = 0; -} // end of anonymous namespace - -bool Inserter::runOnMachineFunction(MachineFunction &F) { - MipsFunctionInfo *MipsFI = F.getInfo<MipsFunctionInfo>(); - - if ((TM.getRelocationModel() != Reloc::PIC_) || - (!MipsFI->globalBaseRegFixed())) - return false; - - bool Changed = false; - int FI = MipsFI->getGPFI(); - - for (MachineFunction::iterator MFI = F.begin(), MFE = F.end(); - MFI != MFE; ++MFI) { - MachineBasicBlock& MBB = *MFI; - MachineBasicBlock::iterator I = MFI->begin(); - - // If MBB is a landing pad, insert instruction that restores $gp after - // EH_LABEL. - if (MBB.isLandingPad()) { - // Find EH_LABEL first. - for (; I->getOpcode() != TargetOpcode::EH_LABEL; ++I) ; - - // Insert lw. - ++I; - DebugLoc dl = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); - BuildMI(MBB, I, dl, TII->get(Mips::LW), Mips::GP).addFrameIndex(FI) - .addImm(0); - Changed = true; - } - - while (I != MFI->end()) { - if (I->getOpcode() != Mips::JALR) { - ++I; - continue; - } - - DebugLoc dl = I->getDebugLoc(); - // emit lw $gp, ($gp save slot on stack) after jalr - BuildMI(MBB, ++I, dl, TII->get(Mips::LW), Mips::GP).addFrameIndex(FI) - .addImm(0); - Changed = true; - } - } - - return Changed; -} - -/// createMipsEmitGPRestorePass - Returns a pass that emits instructions that -/// restores $gp clobbered by jalr instructions. -FunctionPass *llvm::createMipsEmitGPRestorePass(MipsTargetMachine &tm) { - return new Inserter(tm); -} - diff --git a/lib/Target/Mips/MipsExpandPseudo.cpp b/lib/Target/Mips/MipsExpandPseudo.cpp deleted file mode 100644 index baeae97..0000000 --- a/lib/Target/Mips/MipsExpandPseudo.cpp +++ /dev/null @@ -1,123 +0,0 @@ -//===-- MipsExpandPseudo.cpp - Expand Pseudo Instructions ----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass expands pseudo instructions into target instructions after register -// allocation but before post-RA scheduling. 
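The MipsExpandPseudo pass deleted above expanded BuildPairF64 and ExtractElementF64 by moving the two 32-bit halves through the even/odd subregisters of a double-precision pair (mtc1/mfc1). A standalone model of the data movement it implemented (little-endian pairing assumed; the helper names are hypothetical):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // BuildPairF64: mtc1 Lo -> even subregister, mtc1 Hi -> odd one.
    double buildPairF64(uint32_t Lo, uint32_t Hi) {
      uint64_t Bits = ((uint64_t)Hi << 32) | Lo;
      double D;
      std::memcpy(&D, &Bits, sizeof D);
      return D;
    }

    // ExtractElementF64: mfc1 from subregister N (0 = low, 1 = high).
    uint32_t extractElementF64(double D, unsigned N) {
      uint64_t Bits;
      std::memcpy(&Bits, &D, sizeof Bits);
      return N == 0 ? (uint32_t)Bits : (uint32_t)(Bits >> 32);
    }

    int main() {
      double D = buildPairF64(0x00000000u, 0x3FF00000u); // bits of 1.0
      assert(D == 1.0);
      assert(extractElementF64(D, 1) == 0x3FF00000u);
      return 0;
    }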
-// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "mips-expand-pseudo" - -#include "Mips.h" -#include "MipsTargetMachine.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/ADT/Statistic.h" - -using namespace llvm; - -namespace { - struct MipsExpandPseudo : public MachineFunctionPass { - - TargetMachine &TM; - const TargetInstrInfo *TII; - - static char ID; - MipsExpandPseudo(TargetMachine &tm) - : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { } - - virtual const char *getPassName() const { - return "Mips PseudoInstrs Expansion"; - } - - bool runOnMachineFunction(MachineFunction &F); - bool runOnMachineBasicBlock(MachineBasicBlock &MBB); - - private: - void ExpandBuildPairF64(MachineBasicBlock&, MachineBasicBlock::iterator); - void ExpandExtractElementF64(MachineBasicBlock&, - MachineBasicBlock::iterator); - }; - char MipsExpandPseudo::ID = 0; -} // end of anonymous namespace - -bool MipsExpandPseudo::runOnMachineFunction(MachineFunction& F) { - bool Changed = false; - - for (MachineFunction::iterator I = F.begin(); I != F.end(); ++I) - Changed |= runOnMachineBasicBlock(*I); - - return Changed; -} - -bool MipsExpandPseudo::runOnMachineBasicBlock(MachineBasicBlock& MBB) { - - bool Changed = false; - for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end();) { - const MCInstrDesc& MCid = I->getDesc(); - - switch(MCid.getOpcode()) { - default: - ++I; - continue; - case Mips::SETGP2: - // Convert "setgp2 $globalreg, $t9" to "addu $globalreg, $v0, $t9" - BuildMI(MBB, I, I->getDebugLoc(), TII->get(Mips::ADDu), - I->getOperand(0).getReg()) - .addReg(Mips::V0).addReg(I->getOperand(1).getReg()); - break; - case Mips::BuildPairF64: - ExpandBuildPairF64(MBB, I); - break; - case Mips::ExtractElementF64: - ExpandExtractElementF64(MBB, I); - break; - } - - // delete original instr - MBB.erase(I++); - Changed = true; - } - - return Changed; -} - -void MipsExpandPseudo::ExpandBuildPairF64(MachineBasicBlock& MBB, - MachineBasicBlock::iterator I) { - unsigned DstReg = I->getOperand(0).getReg(); - unsigned LoReg = I->getOperand(1).getReg(), HiReg = I->getOperand(2).getReg(); - const MCInstrDesc& Mtc1Tdd = TII->get(Mips::MTC1); - DebugLoc dl = I->getDebugLoc(); - const uint16_t* SubReg = - TM.getRegisterInfo()->getSubRegisters(DstReg); - - // mtc1 Lo, $fp - // mtc1 Hi, $fp + 1 - BuildMI(MBB, I, dl, Mtc1Tdd, *SubReg).addReg(LoReg); - BuildMI(MBB, I, dl, Mtc1Tdd, *(SubReg + 1)).addReg(HiReg); -} - -void MipsExpandPseudo::ExpandExtractElementF64(MachineBasicBlock& MBB, - MachineBasicBlock::iterator I) { - unsigned DstReg = I->getOperand(0).getReg(); - unsigned SrcReg = I->getOperand(1).getReg(); - unsigned N = I->getOperand(2).getImm(); - const MCInstrDesc& Mfc1Tdd = TII->get(Mips::MFC1); - DebugLoc dl = I->getDebugLoc(); - const uint16_t* SubReg = TM.getRegisterInfo()->getSubRegisters(SrcReg); - - BuildMI(MBB, I, dl, Mfc1Tdd, DstReg).addReg(*(SubReg + N)); -} - -/// createMipsMipsExpandPseudoPass - Returns a pass that expands pseudo -/// instrs into real instrs -FunctionPass *llvm::createMipsExpandPseudoPass(MipsTargetMachine &tm) { - return new MipsExpandPseudo(tm); -} diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp index f8ea3d0..6338f3c 100644 --- a/lib/Target/Mips/MipsFrameLowering.cpp +++ b/lib/Target/Mips/MipsFrameLowering.cpp @@ -94,38 +94,6 @@ bool 
MipsFrameLowering::targetHandlesStackFrameRounding() const { return true; } -// Build an instruction sequence to load an immediate that is too large to fit -// in 16-bit and add the result to Reg. -static void expandLargeImm(unsigned Reg, int64_t Imm, bool IsN64, - const MipsInstrInfo &TII, MachineBasicBlock& MBB, - MachineBasicBlock::iterator II, DebugLoc DL) { - unsigned LUi = IsN64 ? Mips::LUi64 : Mips::LUi; - unsigned ADDu = IsN64 ? Mips::DADDu : Mips::ADDu; - unsigned ZEROReg = IsN64 ? Mips::ZERO_64 : Mips::ZERO; - unsigned ATReg = IsN64 ? Mips::AT_64 : Mips::AT; - MipsAnalyzeImmediate AnalyzeImm; - const MipsAnalyzeImmediate::InstSeq &Seq = - AnalyzeImm.Analyze(Imm, IsN64 ? 64 : 32, false /* LastInstrIsADDiu */); - MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin(); - - // The first instruction can be a LUi, which is different from other - // instructions (ADDiu, ORI and SLL) in that it does not have a register - // operand. - if (Inst->Opc == LUi) - BuildMI(MBB, II, DL, TII.get(LUi), ATReg) - .addImm(SignExtend64<16>(Inst->ImmOpnd)); - else - BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ZEROReg) - .addImm(SignExtend64<16>(Inst->ImmOpnd)); - - // Build the remaining instructions in Seq. - for (++Inst; Inst != Seq.end(); ++Inst) - BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ATReg) - .addImm(SignExtend64<16>(Inst->ImmOpnd)); - - BuildMI(MBB, II, DL, TII.get(ADDu), Reg).addReg(Reg).addReg(ATReg); -} - void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { MachineBasicBlock &MBB = MF.front(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -136,7 +104,6 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { *static_cast<const MipsInstrInfo*>(MF.getTarget().getInstrInfo()); MachineBasicBlock::iterator MBBI = MBB.begin(); DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); - bool isPIC = (MF.getTarget().getRelocationModel() == Reloc::PIC_); unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP; unsigned FP = STI.isABI_N64() ? Mips::FP_64 : Mips::FP; unsigned ZERO = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO; @@ -144,35 +111,17 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { unsigned ADDiu = STI.isABI_N64() ? Mips::DADDiu : Mips::ADDiu; // First, compute final stack size. - unsigned RegSize = STI.isGP32bit() ? 4 : 8; unsigned StackAlign = getStackAlignment(); - unsigned LocalVarAreaOffset = MipsFI->needGPSaveRestore() ? - (MFI->getObjectOffset(MipsFI->getGPFI()) + RegSize) : - MipsFI->getMaxCallFrameSize(); - uint64_t StackSize = RoundUpToAlignment(LocalVarAreaOffset, StackAlign) + - RoundUpToAlignment(MFI->getStackSize(), StackAlign); + uint64_t StackSize = RoundUpToAlignment(MFI->getStackSize(), StackAlign); + + if (MipsFI->globalBaseRegSet()) + StackSize += MFI->getObjectOffset(MipsFI->getGlobalRegFI()) + StackAlign; + else + StackSize += RoundUpToAlignment(MipsFI->getMaxCallFrameSize(), StackAlign); // Update stack size MFI->setStackSize(StackSize); - // Emit instructions that set the global base register if the target ABI is - // O32. - if (isPIC && MipsFI->globalBaseRegSet() && STI.isABI_O32() && - !MipsFI->globalBaseRegFixed()) { - // See MipsInstrInfo.td for explanation. - MachineBasicBlock *NewEntry = MF.CreateMachineBasicBlock(); - MF.insert(&MBB, NewEntry); - NewEntry->addSuccessor(&MBB); - - // Copy live in registers. 
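The emitPrologue hunk above folds the call-frame area into the final frame size with RoundUpToAlignment, which simply rounds a size up to the stack alignment. A one-function model of that helper (power-of-two alignment assumed):

    #include <cassert>
    #include <cstdint>

    // Round Value up to the next multiple of Align (Align a power of two).
    uint64_t roundUpToAlignment(uint64_t Value, uint64_t Align) {
      return (Value + Align - 1) & ~(Align - 1);
    }

    int main() {
      assert(roundUpToAlignment(40, 8) == 40);
      assert(roundUpToAlignment(41, 8) == 48);
      assert(roundUpToAlignment(0, 8) == 0);
      return 0;
    }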
- for (MachineBasicBlock::livein_iterator R = MBB.livein_begin(); - R != MBB.livein_end(); ++R) - NewEntry->addLiveIn(*R); - - BuildMI(*NewEntry, NewEntry->begin(), dl, TII.get(Mips::SETGP01), - Mips::V0); - } - // No need to allocate space on the stack. if (StackSize == 0 && !MFI->adjustsStack()) return; @@ -181,11 +130,20 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { MachineLocation DstML, SrcML; // Adjust stack. - if (isInt<16>(-StackSize)) // addi sp, sp, (-stacksize) - BuildMI(MBB, MBBI, dl, TII.get(ADDiu), SP).addReg(SP).addImm(-StackSize); + if (isInt<16>(-StackSize)) { // addiu sp, sp, (-stacksize) + if (STI.inMips16Mode()) + BuildMI(MBB, MBBI, dl, + TII.get(Mips::SaveRaF16)).addImm(StackSize); // cleanup + else + BuildMI(MBB, MBBI, dl, TII.get(ADDiu), SP).addReg(SP).addImm(-StackSize); + } + else { // Expand immediate that doesn't fit in 16-bit. - MipsFI->setEmitNOAT(); - expandLargeImm(SP, -StackSize, STI.isABI_N64(), TII, MBB, MBBI, dl); + unsigned ATReg = STI.isABI_N64() ? Mips::AT_64 : Mips::AT; + + MF.getInfo<MipsFunctionInfo>()->setEmitNOAT(); + Mips::loadImmediate(-StackSize, STI.isABI_N64(), TII, MBB, MBBI, dl, false, + 0); + BuildMI(MBB, MBBI, dl, TII.get(ADDu), SP).addReg(SP).addReg(ATReg); } // emit ".cfi_def_cfa_offset StackSize" @@ -217,20 +175,18 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { // If Reg is a double precision register, emit two cfa_offsets, // one for each of the paired single precision registers. - if (Mips::AFGR64RegisterClass->contains(Reg)) { - const uint16_t *SubRegs = RegInfo->getSubRegisters(Reg); + if (Mips::AFGR64RegClass.contains(Reg)) { MachineLocation DstML0(MachineLocation::VirtualFP, Offset); MachineLocation DstML1(MachineLocation::VirtualFP, Offset + 4); - MachineLocation SrcML0(*SubRegs); - MachineLocation SrcML1(*(SubRegs + 1)); + MachineLocation SrcML0(RegInfo->getSubReg(Reg, Mips::sub_fpeven)); + MachineLocation SrcML1(RegInfo->getSubReg(Reg, Mips::sub_fpodd)); if (!STI.isLittle()) std::swap(SrcML0, SrcML1); Moves.push_back(MachineMove(CSLabel, DstML0, SrcML0)); Moves.push_back(MachineMove(CSLabel, DstML1, SrcML1)); - } - else { + } else { // Reg is either in CPURegs or FGR32. DstML = MachineLocation(MachineLocation::VirtualFP, Offset); SrcML = MachineLocation(Reg); @@ -252,13 +208,6 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { SrcML = MachineLocation(MachineLocation::VirtualFP); Moves.push_back(MachineMove(SetFPLabel, DstML, SrcML)); } - - // Restore GP from the saved stack location - if (MipsFI->needGPSaveRestore()) { - unsigned Offset = MFI->getObjectOffset(MipsFI->getGPFI()); - BuildMI(MBB, MBBI, dl, TII.get(Mips::CPRESTORE)).addImm(Offset) - .addReg(Mips::GP); - } } void MipsFrameLowering::emitEpilogue(MachineFunction &MF, @@ -293,16 +242,28 @@ void MipsFrameLowering::emitEpilogue(MachineFunction &MF, return; // Adjust stack. - if (isInt<16>(StackSize)) // addi sp, sp, (-stacksize) - BuildMI(MBB, MBBI, dl, TII.get(ADDiu), SP).addReg(SP).addImm(StackSize); - else // Expand immediate that doesn't fit in 16-bit. - expandLargeImm(SP, StackSize, STI.isABI_N64(), TII, MBB, MBBI, dl); + if (isInt<16>(StackSize)) { // addiu sp, sp, stacksize + if (STI.inMips16Mode()) + // assumes stacksize multiple of 8 + BuildMI(MBB, MBBI, dl, + TII.get(Mips::RestoreRaF16)).addImm(StackSize); + else + BuildMI(MBB, MBBI, dl, TII.get(ADDiu), SP).addReg(SP).addImm(StackSize); + } + else { // Expand immediate that doesn't fit in 16-bit. + unsigned ATReg = STI.isABI_N64() ?
Mips::AT_64 : Mips::AT; + + MF.getInfo<MipsFunctionInfo>()->setEmitNOAT(); + Mips::loadImmediate(StackSize, STI.isABI_N64(), TII, MBB, MBBI, dl, false, + 0); + BuildMI(MBB, MBBI, dl, TII.get(ADDu), SP).addReg(SP).addReg(ATReg); + } } void MipsFrameLowering:: processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS) const { - MachineRegisterInfo& MRI = MF.getRegInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned FP = STI.isABI_N64() ? Mips::FP_64 : Mips::FP; // FIXME: remove this code if register allocator can correctly mark @@ -311,16 +272,35 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // Mark $fp and $ra as used or unused. if (hasFP(MF)) MRI.setPhysRegUsed(FP); +} - // The register allocator might determine $ra is used after seeing - // instruction "jr $ra", but we do not want PrologEpilogInserter to insert - // instructions to save/restore $ra unless there is a function call. - // To correct this, $ra is explicitly marked unused if there is no - // function call. - if (MF.getFrameInfo()->hasCalls()) - MRI.setPhysRegUsed(Mips::RA); - else { - MRI.setPhysRegUnused(Mips::RA); - MRI.setPhysRegUnused(Mips::RA_64); +bool MipsFrameLowering:: +spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + MachineFunction *MF = MBB.getParent(); + MachineBasicBlock *EntryBlock = MF->begin(); + const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo(); + + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + // Add the callee-saved register as live-in. Do not add if the register is + // RA and return address is taken, because it has already been added in + // method MipsTargetLowering::LowerRETURNADDR. + // It's killed at the spill, unless the register is RA and return address + // is taken. + unsigned Reg = CSI[i].getReg(); + bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA || Reg == Mips::RA_64) + && MF->getFrameInfo()->isReturnAddressTaken(); + if (!IsRAAndRetAddrIsTaken) + EntryBlock->addLiveIn(Reg); + + // Insert the spill to the stack frame. 
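When the stack adjustment above does not fit in addiu's signed 16-bit immediate, Mips::loadImmediate synthesizes the constant into $at and the code then adds it to $sp with addu. A standalone sketch of one such synthesis (an assumed lui/ori sequence; the real analysis, via MipsAnalyzeImmediate, may pick other instruction sequences):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    bool isInt16(int64_t V) { return V >= -32768 && V <= 32767; }

    // Models "lui $at, Hi; ori $at, $at, Lo". Since ori zero-extends its
    // immediate, no carry correction between the two halves is needed.
    int32_t synthesize(int32_t Imm, uint16_t &Hi, uint16_t &Lo) {
      Lo = (uint16_t)Imm;
      Hi = (uint16_t)((uint32_t)Imm >> 16);
      return (int32_t)(((uint32_t)Hi << 16) | Lo);
    }

    int main() {
      int32_t Adj = -0x12340; // a stack adjustment too big for addiu
      assert(!isInt16(Adj));
      uint16_t Hi, Lo;
      assert(synthesize(Adj, Hi, Lo) == Adj);
      std::printf("lui $at, 0x%04x; ori $at, $at, 0x%04x; "
                  "addu $sp, $sp, $at\n", Hi, Lo);
      return 0;
    }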
+ bool IsKill = !IsRAAndRetAddrIsTaken; + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.storeRegToStackSlot(*EntryBlock, MI, Reg, IsKill, + CSI[i].getFrameIdx(), RC, TRI); } + + return true; } diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h index bd1d89f..e364ded 100644 --- a/lib/Target/Mips/MipsFrameLowering.h +++ b/lib/Target/Mips/MipsFrameLowering.h @@ -38,6 +38,11 @@ public: void emitPrologue(MachineFunction &MF) const; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const; + bool hasFP(const MachineFunction &MF) const; void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp index f0651c6..ea33b74 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -117,28 +117,32 @@ private: void MipsDAGToDAGISel::InitGlobalBaseReg(MachineFunction &MF) { MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); - if (!MipsFI->globalBaseRegSet()) + if (((MF.getTarget().getRelocationModel() == Reloc::Static) || + Subtarget.inMips16Mode()) && !MipsFI->globalBaseRegSet()) return; MachineBasicBlock &MBB = MF.front(); MachineBasicBlock::iterator I = MBB.begin(); MachineRegisterInfo &RegInfo = MF.getRegInfo(); + const MipsRegisterInfo *TargetRegInfo = TM.getRegisterInfo(); + const MipsInstrInfo *MII = TM.getInstrInfo(); const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); - unsigned V0, V1, GlobalBaseReg = MipsFI->getGlobalBaseReg(); - bool FixGlobalBaseReg = MipsFI->globalBaseRegFixed(); - - if (Subtarget.isABI_O32() && FixGlobalBaseReg) - // $gp is the global base register. - V0 = V1 = GlobalBaseReg; - else { - const TargetRegisterClass *RC; - RC = Subtarget.isABI_N64() ? - Mips::CPU64RegsRegisterClass : Mips::CPURegsRegisterClass; - - V0 = RegInfo.createVirtualRegister(RC); - V1 = RegInfo.createVirtualRegister(RC); - } + unsigned V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg(); + int FI = 0; + + FI= MipsFI->initGlobalRegFI(); + + const TargetRegisterClass *RC = Subtarget.isABI_N64() ? 
+ (const TargetRegisterClass*)&Mips::CPU64RegsRegClass : + (const TargetRegisterClass*)&Mips::CPURegsRegClass; + + if (Subtarget.inMips16Mode()) + RC=(const TargetRegisterClass*)&Mips::CPU16RegsRegClass; + + V0 = RegInfo.createVirtualRegister(RC); + V1 = RegInfo.createVirtualRegister(RC); + V2 = RegInfo.createVirtualRegister(RC); if (Subtarget.isABI_N64()) { MF.getRegInfo().addLiveIn(Mips::T9_64); @@ -150,10 +154,31 @@ void MipsDAGToDAGISel::InitGlobalBaseReg(MachineFunction &MF) { const GlobalValue *FName = MF.getFunction(); BuildMI(MBB, I, DL, TII.get(Mips::LUi64), V0) .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI); - BuildMI(MBB, I, DL, TII.get(Mips::DADDu), V1).addReg(V0).addReg(Mips::T9_64); + BuildMI(MBB, I, DL, TII.get(Mips::DADDu), V1).addReg(V0) + .addReg(Mips::T9_64); BuildMI(MBB, I, DL, TII.get(Mips::DADDiu), GlobalBaseReg).addReg(V1) .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO); - } else if (MF.getTarget().getRelocationModel() == Reloc::Static) { + MII->storeRegToStackSlot(MBB, I, GlobalBaseReg, false, FI, RC, + TargetRegInfo); + return; + } + + if (Subtarget.inMips16Mode()) { + BuildMI(MBB, I, DL, TII.get(Mips::LiRxImmX16), V0) + .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI); + BuildMI(MBB, I, DL, TII.get(Mips::AddiuRxPcImmX16), + V1) + .addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO); + BuildMI(MBB, I, DL, TII.get(Mips::SllX16), + V2 ).addReg(V0).addImm(16); + BuildMI(MBB, I, DL, TII.get(Mips::AdduRxRyRz16), GlobalBaseReg) + .addReg(V1).addReg(V2); + + + return; + } + + if (MF.getTarget().getRelocationModel() == Reloc::Static) { // Set global register to __gnu_local_gp. // // lui $v0, %hi(__gnu_local_gp) @@ -162,27 +187,57 @@ void MipsDAGToDAGISel::InitGlobalBaseReg(MachineFunction &MF) { .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_HI); BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V0) .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_LO); - } else { - MF.getRegInfo().addLiveIn(Mips::T9); - MBB.addLiveIn(Mips::T9); - - if (Subtarget.isABI_N32()) { - // lui $v0, %hi(%neg(%gp_rel(fname))) - // addu $v1, $v0, $t9 - // addiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname))) - const GlobalValue *FName = MF.getFunction(); - BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0) - .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI); - BuildMI(MBB, I, DL, TII.get(Mips::ADDu), V1).addReg(V0).addReg(Mips::T9); - BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V1) - .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO); - } else if (!MipsFI->globalBaseRegFixed()) { - assert(Subtarget.isABI_O32()); - - BuildMI(MBB, I, DL, TII.get(Mips::SETGP2), GlobalBaseReg) - .addReg(Mips::T9); - } + return; + } + + MF.getRegInfo().addLiveIn(Mips::T9); + MBB.addLiveIn(Mips::T9); + + if (Subtarget.isABI_N32()) { + // lui $v0, %hi(%neg(%gp_rel(fname))) + // addu $v1, $v0, $t9 + // addiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname))) + const GlobalValue *FName = MF.getFunction(); + BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0) + .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI); + BuildMI(MBB, I, DL, TII.get(Mips::ADDu), V1).addReg(V0).addReg(Mips::T9); + BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V1) + .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO); + MII->storeRegToStackSlot(MBB, I, GlobalBaseReg, false, FI, RC, + TargetRegInfo); + return; } + + assert(Subtarget.isABI_O32()); + + + //if (Subtarget.inMips16Mode()) + // return; // no need to load GP. 
It can be calculated anywhere + + + + // For O32 ABI, the following instruction sequence is emitted to initialize + // the global base register: + // + // 0. lui $2, %hi(_gp_disp) + // 1. addiu $2, $2, %lo(_gp_disp) + // 2. addu $globalbasereg, $2, $t9 + // + // We emit only the last instruction here. + // + // GNU linker requires that the first two instructions appear at the beginning + // of a function and no instructions be inserted before or between them. + // The two instructions are emitted during lowering to MC layer in order to + // avoid any reordering. + // + // Register $2 (Mips::V0) is added to the list of live-in registers to ensure + // the value instruction 1 (addiu) defines is valid when instruction 2 (addu) + // reads it. + MF.getRegInfo().addLiveIn(Mips::V0); + MBB.addLiveIn(Mips::V0); + BuildMI(MBB, I, DL, TII.get(Mips::ADDu), GlobalBaseReg) + .addReg(Mips::V0).addReg(Mips::T9); + MII->storeRegToStackSlot(MBB, I, GlobalBaseReg, false, FI, RC, TargetRegInfo); } bool MipsDAGToDAGISel::ReplaceUsesWithZeroReg(MachineRegisterInfo *MRI, @@ -212,7 +267,8 @@ bool MipsDAGToDAGISel::ReplaceUsesWithZeroReg(MachineRegisterInfo *MRI, MachineInstr *MI = MO.getParent(); // Do not replace if it is a phi's operand or is tied to def operand. - if (MI->isPHI() || MI->isRegTiedToDefOperand(U.getOperandNo())) + if (MI->isPHI() || MI->isRegTiedToDefOperand(U.getOperandNo()) || + MI->isPseudo()) continue; MO.setReg(ZeroReg); @@ -255,7 +311,7 @@ SelectAddr(SDNode *Parent, SDValue Addr, SDValue &Base, SDValue &Offset) { // If Parent is an unaligned f32 load or store, select a (base + index) // floating point load/store instruction (luxc1 or suxc1). - const LSBaseSDNode* LS = 0; + const LSBaseSDNode *LS = 0; if (Parent && (LS = dyn_cast<LSBaseSDNode>(Parent))) { EVT VT = LS->getMemoryVT(); @@ -316,17 +372,18 @@ SelectAddr(SDNode *Parent, SDValue Addr, SDValue &Base, SDValue &Offset) { // lui $2, %hi($CPI1_0) // lwc1 $f0, %lo($CPI1_0)($2) if (Addr.getOperand(1).getOpcode() == MipsISD::Lo) { - SDValue LoVal = Addr.getOperand(1); - if (isa<ConstantPoolSDNode>(LoVal.getOperand(0)) || - isa<GlobalAddressSDNode>(LoVal.getOperand(0))) { + SDValue LoVal = Addr.getOperand(1), Opnd0 = LoVal.getOperand(0); + if (isa<ConstantPoolSDNode>(Opnd0) || isa<GlobalAddressSDNode>(Opnd0) || + isa<JumpTableSDNode>(Opnd0)) { Base = Addr.getOperand(0); - Offset = LoVal.getOperand(0); + Offset = Opnd0; return true; } } // If an indexed floating point load/store can be emitted, return false. 
- if (LS && (LS->getMemoryVT() == MVT::f32 || LS->getMemoryVT() == MVT::f64) && + if (LS && + (LS->getMemoryVT() == MVT::f32 || LS->getMemoryVT() == MVT::f64) && Subtarget.hasMips32r2Or64()) return false; } diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 6a23bc3..7741f9f 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -34,6 +34,8 @@ #include "llvm/CodeGen/ValueTypes.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" + using namespace llvm; // If I is a shifted mask, set the size (Size) and the first bit of the @@ -79,6 +81,14 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const { case MipsISD::Sync: return "MipsISD::Sync"; case MipsISD::Ext: return "MipsISD::Ext"; case MipsISD::Ins: return "MipsISD::Ins"; + case MipsISD::LWL: return "MipsISD::LWL"; + case MipsISD::LWR: return "MipsISD::LWR"; + case MipsISD::SWL: return "MipsISD::SWL"; + case MipsISD::SWR: return "MipsISD::SWR"; + case MipsISD::LDL: return "MipsISD::LDL"; + case MipsISD::LDR: return "MipsISD::LDR"; + case MipsISD::SDL: return "MipsISD::SDL"; + case MipsISD::SDR: return "MipsISD::SDR"; default: return NULL; } } @@ -96,20 +106,25 @@ MipsTargetLowering(MipsTargetMachine &TM) setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct? // Set up the register classes - addRegisterClass(MVT::i32, Mips::CPURegsRegisterClass); + addRegisterClass(MVT::i32, &Mips::CPURegsRegClass); if (HasMips64) - addRegisterClass(MVT::i64, Mips::CPU64RegsRegisterClass); + addRegisterClass(MVT::i64, &Mips::CPU64RegsRegClass); + + if (Subtarget->inMips16Mode()) { + addRegisterClass(MVT::i32, &Mips::CPU16RegsRegClass); + addRegisterClass(MVT::i32, &Mips::CPURARegRegClass); + } if (!TM.Options.UseSoftFloat) { - addRegisterClass(MVT::f32, Mips::FGR32RegisterClass); + addRegisterClass(MVT::f32, &Mips::FGR32RegClass); // When dealing with single precision only, use libcalls if (!Subtarget->isSingleFloat()) { if (HasMips64) - addRegisterClass(MVT::f64, Mips::FGR64RegisterClass); + addRegisterClass(MVT::f64, &Mips::FGR64RegClass); else - addRegisterClass(MVT::f64, Mips::AFGR64RegisterClass); + addRegisterClass(MVT::f64, &Mips::AFGR64RegClass); } } @@ -137,6 +152,8 @@ MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::SELECT, MVT::f64, Custom); setOperationAction(ISD::SELECT, MVT::i32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); setOperationAction(ISD::SETCC, MVT::f32, Custom); setOperationAction(ISD::SETCC, MVT::f64, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); @@ -146,6 +163,8 @@ MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom); setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::STORE, MVT::i32, Custom); if (!TM.Options.NoNaNsFPMath) { setOperationAction(ISD::FABS, MVT::f32, Custom); @@ -160,6 +179,14 @@ MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::ConstantPool, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::i64, Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); + setOperationAction(ISD::LOAD, MVT::i64, Custom); + setOperationAction(ISD::STORE, MVT::i64, Custom); + } + 
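+  // For reference, the lo/hi decomposition that the SHL_PARTS lowering below
+  // (see LowerShiftLeftParts later in this file) builds as DAG nodes, written
+  // out as plain C++. This is an illustrative sketch, not code from the
+  // patch; as on MIPS, shift amounts are taken mod 32.
+  //
+  //   #include <cstdint>
+  //   #include <utility>
+  //
+  //   // Shamt is assumed to be in [0, 63].
+  //   std::pair<uint32_t, uint32_t> ShlParts(uint32_t Lo, uint32_t Hi,
+  //                                          unsigned Shamt) {
+  //     unsigned S = Shamt & 31;                       // shamt[4:0]
+  //     uint32_t NewLo = Lo << S;
+  //     // (or (shl hi, shamt), (srl (srl lo, 1), ~shamt))
+  //     uint32_t NewHi = (Hi << S) | ((Lo >> 1) >> (31 - S));
+  //     if (Shamt & 0x20)                              // shamt >= 32
+  //       return std::make_pair(0u, NewLo);            // lo = 0, hi = lo << S
+  //     return std::make_pair(NewLo, NewHi);
+  //   }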
+ if (!HasMips64) { + setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); } setOperationAction(ISD::SDIV, MVT::i32, Expand); @@ -197,9 +224,6 @@ MipsTargetLowering(MipsTargetMachine &TM) if (!Subtarget->hasMips64r2()) setOperationAction(ISD::ROTR, MVT::i64, Expand); - setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); - setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); - setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); setOperationAction(ISD::FSIN, MVT::f32, Expand); setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FCOS, MVT::f32, Expand); @@ -241,9 +265,6 @@ MipsTargetLowering(MipsTargetMachine &TM) setInsertFencesForAtomic(true); - if (Subtarget->isSingleFloat()) - setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); - if (!Subtarget->hasSEInReg()) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); @@ -259,6 +280,13 @@ MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::BSWAP, MVT::i64, Expand); } + if (HasMips64) { + setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::i32, Custom); + setTruncStoreAction(MVT::i64, MVT::i32, Custom); + } + setTargetDAGCombine(ISD::ADDE); setTargetDAGCombine(ISD::SUBE); setTargetDAGCombine(ISD::SDIVREM); @@ -266,6 +294,7 @@ MipsTargetLowering(MipsTargetMachine &TM) setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); + setTargetDAGCombine(ISD::ADD); setMinFunctionAlignment(HasMips64 ? 3 : 2); @@ -274,6 +303,8 @@ MipsTargetLowering(MipsTargetMachine &TM) setExceptionPointerRegister(IsN64 ? Mips::A0_64 : Mips::A0); setExceptionSelectorRegister(IsN64 ? Mips::A1_64 : Mips::A1); + + maxStoresPerMemcpy = 16; } bool MipsTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const { @@ -282,7 +313,6 @@ bool MipsTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const { switch (SVT) { case MVT::i64: case MVT::i32: - case MVT::i16: return true; case MVT::f32: return Subtarget->hasMips32r2Or64(); @@ -303,17 +333,17 @@ EVT MipsTargetLowering::getSetCCResultType(EVT VT) const { // Lo0: initial value of Lo register // Hi0: initial value of Hi register // Return true if pattern matching was successful. -static bool SelectMadd(SDNode* ADDENode, SelectionDAG* CurDAG) { +static bool SelectMadd(SDNode *ADDENode, SelectionDAG *CurDAG) { // ADDENode's second operand must be a flag output of an ADDC node in order // for the matching to be successful. - SDNode* ADDCNode = ADDENode->getOperand(2).getNode(); + SDNode *ADDCNode = ADDENode->getOperand(2).getNode(); if (ADDCNode->getOpcode() != ISD::ADDC) return false; SDValue MultHi = ADDENode->getOperand(0); SDValue MultLo = ADDCNode->getOperand(0); - SDNode* MultNode = MultHi.getNode(); + SDNode *MultNode = MultHi.getNode(); unsigned MultOpc = MultHi.getOpcode(); // MultHi and MultLo must be generated by the same node, @@ -376,17 +406,17 @@ static bool SelectMadd(SDNode* ADDENode, SelectionDAG* CurDAG) { // Lo0: initial value of Lo register // Hi0: initial value of Hi register // Return true if pattern matching was successful. 
-static bool SelectMsub(SDNode* SUBENode, SelectionDAG* CurDAG) { +static bool SelectMsub(SDNode *SUBENode, SelectionDAG *CurDAG) { // SUBENode's second operand must be a flag output of an SUBC node in order // for the matching to be successful. - SDNode* SUBCNode = SUBENode->getOperand(2).getNode(); + SDNode *SUBCNode = SUBENode->getOperand(2).getNode(); if (SUBCNode->getOpcode() != ISD::SUBC) return false; SDValue MultHi = SUBENode->getOperand(1); SDValue MultLo = SUBCNode->getOperand(1); - SDNode* MultNode = MultHi.getNode(); + SDNode *MultNode = MultHi.getNode(); unsigned MultOpc = MultHi.getOpcode(); // MultHi and MultLo must be generated by the same node, @@ -441,9 +471,9 @@ static bool SelectMsub(SDNode* SUBENode, SelectionDAG* CurDAG) { return true; } -static SDValue PerformADDECombine(SDNode *N, SelectionDAG& DAG, +static SDValue PerformADDECombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget* Subtarget) { + const MipsSubtarget *Subtarget) { if (DCI.isBeforeLegalize()) return SDValue(); @@ -454,9 +484,9 @@ static SDValue PerformADDECombine(SDNode *N, SelectionDAG& DAG, return SDValue(); } -static SDValue PerformSUBECombine(SDNode *N, SelectionDAG& DAG, +static SDValue PerformSUBECombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget* Subtarget) { + const MipsSubtarget *Subtarget) { if (DCI.isBeforeLegalize()) return SDValue(); @@ -467,9 +497,9 @@ static SDValue PerformSUBECombine(SDNode *N, SelectionDAG& DAG, return SDValue(); } -static SDValue PerformDivRemCombine(SDNode *N, SelectionDAG& DAG, +static SDValue PerformDivRemCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget* Subtarget) { + const MipsSubtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -544,7 +574,7 @@ static bool InvertFPCondCode(Mips::CondCode CC) { // Creates and returns an FPCmp node from a setcc node. // Returns Op if setcc is not a floating point comparison. -static SDValue CreateFPCmp(SelectionDAG& DAG, const SDValue& Op) { +static SDValue CreateFPCmp(SelectionDAG &DAG, const SDValue &Op) { // must be a SETCC node if (Op.getOpcode() != ISD::SETCC) return Op; @@ -566,7 +596,7 @@ static SDValue CreateFPCmp(SelectionDAG& DAG, const SDValue& Op) { } // Creates and returns a CMovFPT/F node. 
-static SDValue CreateCMovFP(SelectionDAG& DAG, SDValue Cond, SDValue True, +static SDValue CreateCMovFP(SelectionDAG &DAG, SDValue Cond, SDValue True, SDValue False, DebugLoc DL) { bool invert = InvertFPCondCode((Mips::CondCode) cast<ConstantSDNode>(Cond.getOperand(2)) @@ -576,9 +606,9 @@ static SDValue CreateCMovFP(SelectionDAG& DAG, SDValue Cond, SDValue True, True.getValueType(), True, False, Cond); } -static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG& DAG, +static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget* Subtarget) { + const MipsSubtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -602,16 +632,16 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG& DAG, const DebugLoc DL = N->getDebugLoc(); ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get(); SDValue True = N->getOperand(1); - + SetCC = DAG.getSetCC(DL, SetCC.getValueType(), SetCC.getOperand(0), SetCC.getOperand(1), ISD::getSetCCInverse(CC, true)); - + return DAG.getNode(ISD::SELECT, DL, FalseTy, SetCC, False, True); } -static SDValue PerformANDCombine(SDNode *N, SelectionDAG& DAG, +static SDValue PerformANDCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget* Subtarget) { + const MipsSubtarget *Subtarget) { // Pattern match EXT. // $dst = and ((sra or srl) $src , pos), (2**size - 1) // => ext $dst, $src, size, pos @@ -649,9 +679,9 @@ static SDValue PerformANDCombine(SDNode *N, SelectionDAG& DAG, DAG.getConstant(SMSize, MVT::i32)); } -static SDValue PerformORCombine(SDNode *N, SelectionDAG& DAG, +static SDValue PerformORCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget* Subtarget) { + const MipsSubtarget *Subtarget) { // Pattern match INS. 
// $dst = or (and $src1 , mask0), (and (shl $src, pos), mask1), // where mask1 = (2**size - 1) << pos, mask0 = ~mask1 @@ -703,6 +733,33 @@ static SDValue PerformORCombine(SDNode *N, SelectionDAG& DAG, DAG.getConstant(SMSize0, MVT::i32), And0.getOperand(0)); } +static SDValue PerformADDCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const MipsSubtarget *Subtarget) { + // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt)) + + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue Add = N->getOperand(1); + + if (Add.getOpcode() != ISD::ADD) + return SDValue(); + + SDValue Lo = Add.getOperand(1); + + if ((Lo.getOpcode() != MipsISD::Lo) || + (Lo.getOperand(0).getOpcode() != ISD::TargetJumpTable)) + return SDValue(); + + EVT ValTy = N->getValueType(0); + DebugLoc DL = N->getDebugLoc(); + + SDValue Add1 = DAG.getNode(ISD::ADD, DL, ValTy, N->getOperand(0), + Add.getOperand(0)); + return DAG.getNode(ISD::ADD, DL, ValTy, Add1, Lo); +} + SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -718,11 +775,13 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) case ISD::UDIVREM: return PerformDivRemCombine(N, DAG, DCI, Subtarget); case ISD::SELECT: - return PerformSELECTCombine(N, DAG, DCI, Subtarget); + return PerformSELECTCombine(N, DAG, DCI, Subtarget); case ISD::AND: return PerformANDCombine(N, DAG, DCI, Subtarget); case ISD::OR: return PerformORCombine(N, DAG, DCI, Subtarget); + case ISD::ADD: + return PerformADDCombine(N, DAG, DCI, Subtarget); } return SDValue(); @@ -741,13 +800,20 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FABS: return LowerFABS(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); + case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, DAG); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG); + case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); + case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG, true); + case ISD::SRL_PARTS: return LowerShiftRightParts(Op, DAG, false); + case ISD::LOAD: return LowerLOAD(Op, DAG); + case ISD::STORE: return LowerSTORE(Op, DAG); } return SDValue(); } @@ -782,7 +848,7 @@ static Mips::FPBranchCode GetFPBranchCodeFromCond(Mips::CondCode CC) { /* static MachineBasicBlock* ExpandCondMov(MachineInstr *MI, MachineBasicBlock *BB, DebugLoc dl, - const MipsSubtarget* Subtarget, + const MipsSubtarget *Subtarget, const TargetInstrInfo *TII, bool isFPCmp, unsigned Opc) { // There is no need to expand CMov instructions if target has @@ -1510,6 +1576,19 @@ LowerSELECT(SDValue Op, SelectionDAG &DAG) const Op.getDebugLoc()); } +SDValue MipsTargetLowering:: +LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const +{ + DebugLoc DL = Op.getDebugLoc(); + EVT Ty = Op.getOperand(0).getValueType(); + SDValue Cond = DAG.getNode(ISD::SETCC, DL, getSetCCResultType(Ty), + Op.getOperand(0), Op.getOperand(1), + Op.getOperand(4)); + + return DAG.getNode(ISD::SELECT, DL, Op.getValueType(), Cond, Op.getOperand(2), + Op.getOperand(3)); +} + SDValue 
MipsTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { SDValue Cond = CreateFPCmp(DAG, Op); @@ -1612,10 +1691,13 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const const GlobalValue *GV = GA->getGlobal(); EVT PtrVT = getPointerTy(); - if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { - // General Dynamic TLS Model - bool LocalDynamic = GV->hasInternalLinkage(); - unsigned Flag = LocalDynamic ? MipsII::MO_TLSLDM :MipsII::MO_TLSGD; + TLSModel::Model model = getTargetMachine().getTLSModel(GV); + + if (model == TLSModel::GeneralDynamic || model == TLSModel::LocalDynamic) { + // General Dynamic and Local Dynamic TLS Model. + unsigned Flag = (model == TLSModel::LocalDynamic) ? MipsII::MO_TLSLDM + : MipsII::MO_TLSGD; + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, Flag); SDValue Argument = DAG.getNode(MipsISD::Wrapper, dl, PtrVT, GetGlobalReg(DAG, PtrVT), TGA); @@ -1630,16 +1712,16 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const Entry.Ty = PtrTy; Args.push_back(Entry); - std::pair<SDValue, SDValue> CallResult = - LowerCallTo(DAG.getEntryNode(), PtrTy, + TargetLowering::CallLoweringInfo CLI(DAG.getEntryNode(), PtrTy, false, false, false, false, 0, CallingConv::C, /*isTailCall=*/false, /*doesNotRet=*/false, /*isReturnValueUsed=*/true, TlsGetAddr, Args, DAG, dl); + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); SDValue Ret = CallResult.first; - if (!LocalDynamic) + if (model != TLSModel::LocalDynamic) return Ret; SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, @@ -1653,7 +1735,7 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const } SDValue Offset; - if (GV->isDeclaration()) { + if (model == TLSModel::InitialExec) { // Initial Exec TLS Model SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, MipsII::MO_GOTTPREL); @@ -1664,6 +1746,7 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const false, false, false, 0); } else { // Local Exec TLS Model + assert(model == TLSModel::LocalExec); SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, MipsII::MO_TPREL_HI); SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, @@ -1940,9 +2023,26 @@ LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { return FrameAddr; } +SDValue MipsTargetLowering::LowerRETURNADDR(SDValue Op, + SelectionDAG &DAG) const { + // check the depth + assert((cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0) && + "Return address can be determined only for current frame."); + + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + EVT VT = Op.getValueType(); + unsigned RA = IsN64 ? Mips::RA_64 : Mips::RA; + MFI->setReturnAddressIsTaken(true); + + // Return RA, which contains the return address. Mark it an implicit live-in. + unsigned Reg = MF.addLiveIn(RA, getRegClassFor(VT)); + return DAG.getCopyFromReg(DAG.getEntryNode(), Op.getDebugLoc(), Reg, VT); +} + // TODO: set SType according to the desired memory barrier behavior. 
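 // For example (an illustration, not part of the patch): an IR-level
 //   fence seq_cst
 // is custom-lowered by LowerATOMIC_FENCE below into a MipsISD::Sync node,
 // which is emitted as the MIPS "sync" instruction; the default SType of 0
 // is the full barrier.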
SDValue -MipsTargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG& DAG) const { +MipsTargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const { unsigned SType = 0; DebugLoc dl = Op.getDebugLoc(); return DAG.getNode(MipsISD::Sync, dl, MVT::Other, Op.getOperand(0), @@ -1950,7 +2050,7 @@ MipsTargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG& DAG) const { } SDValue MipsTargetLowering::LowerATOMIC_FENCE(SDValue Op, - SelectionDAG& DAG) const { + SelectionDAG &DAG) const { // FIXME: Need pseudo-fence for 'singlethread' fences // FIXME: Set SType for weaker fences where supported/appropriate. unsigned SType = 0; @@ -1959,6 +2059,210 @@ SDValue MipsTargetLowering::LowerATOMIC_FENCE(SDValue Op, DAG.getConstant(SType, MVT::i32)); } +SDValue MipsTargetLowering::LowerShiftLeftParts(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1); + SDValue Shamt = Op.getOperand(2); + + // if shamt < 32: + // lo = (shl lo, shamt) + // hi = (or (shl hi, shamt) (srl (srl lo, 1), ~shamt)) + // else: + // lo = 0 + // hi = (shl lo, shamt[4:0]) + SDValue Not = DAG.getNode(ISD::XOR, DL, MVT::i32, Shamt, + DAG.getConstant(-1, MVT::i32)); + SDValue ShiftRight1Lo = DAG.getNode(ISD::SRL, DL, MVT::i32, Lo, + DAG.getConstant(1, MVT::i32)); + SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, MVT::i32, ShiftRight1Lo, + Not); + SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi, Shamt); + SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, ShiftLeftHi, ShiftRightLo); + SDValue ShiftLeftLo = DAG.getNode(ISD::SHL, DL, MVT::i32, Lo, Shamt); + SDValue Cond = DAG.getNode(ISD::AND, DL, MVT::i32, Shamt, + DAG.getConstant(0x20, MVT::i32)); + Lo = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond, + DAG.getConstant(0, MVT::i32), ShiftLeftLo); + Hi = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond, ShiftLeftLo, Or); + + SDValue Ops[2] = {Lo, Hi}; + return DAG.getMergeValues(Ops, 2, DL); +} + +SDValue MipsTargetLowering::LowerShiftRightParts(SDValue Op, SelectionDAG &DAG, + bool IsSRA) const { + DebugLoc DL = Op.getDebugLoc(); + SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1); + SDValue Shamt = Op.getOperand(2); + + // if shamt < 32: + // lo = (or (shl (shl hi, 1), ~shamt) (srl lo, shamt)) + // if isSRA: + // hi = (sra hi, shamt) + // else: + // hi = (srl hi, shamt) + // else: + // if isSRA: + // lo = (sra hi, shamt[4:0]) + // hi = (sra hi, 31) + // else: + // lo = (srl hi, shamt[4:0]) + // hi = 0 + SDValue Not = DAG.getNode(ISD::XOR, DL, MVT::i32, Shamt, + DAG.getConstant(-1, MVT::i32)); + SDValue ShiftLeft1Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi, + DAG.getConstant(1, MVT::i32)); + SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, MVT::i32, ShiftLeft1Hi, Not); + SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, MVT::i32, Lo, Shamt); + SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, ShiftLeftHi, ShiftRightLo); + SDValue ShiftRightHi = DAG.getNode(IsSRA ? ISD::SRA : ISD::SRL, DL, MVT::i32, + Hi, Shamt); + SDValue Cond = DAG.getNode(ISD::AND, DL, MVT::i32, Shamt, + DAG.getConstant(0x20, MVT::i32)); + SDValue Shift31 = DAG.getNode(ISD::SRA, DL, MVT::i32, Hi, + DAG.getConstant(31, MVT::i32)); + Lo = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond, ShiftRightHi, Or); + Hi = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond, + IsSRA ? 
Shift31 : DAG.getConstant(0, MVT::i32), + ShiftRightHi); + + SDValue Ops[2] = {Lo, Hi}; + return DAG.getMergeValues(Ops, 2, DL); +} + +static SDValue CreateLoadLR(unsigned Opc, SelectionDAG &DAG, LoadSDNode *LD, + SDValue Chain, SDValue Src, unsigned Offset) { + SDValue Ptr = LD->getBasePtr(); + EVT VT = LD->getValueType(0), MemVT = LD->getMemoryVT(); + EVT BasePtrVT = Ptr.getValueType(); + DebugLoc DL = LD->getDebugLoc(); + SDVTList VTList = DAG.getVTList(VT, MVT::Other); + + if (Offset) + Ptr = DAG.getNode(ISD::ADD, DL, BasePtrVT, Ptr, + DAG.getConstant(Offset, BasePtrVT)); + + SDValue Ops[] = { Chain, Ptr, Src }; + return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, 3, MemVT, + LD->getMemOperand()); +} + +// Expand an unaligned 32 or 64-bit integer load node. +SDValue MipsTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + LoadSDNode *LD = cast<LoadSDNode>(Op); + EVT MemVT = LD->getMemoryVT(); + + // Return if load is aligned or if MemVT is neither i32 nor i64. + if ((LD->getAlignment() >= MemVT.getSizeInBits() / 8) || + ((MemVT != MVT::i32) && (MemVT != MVT::i64))) + return SDValue(); + + bool IsLittle = Subtarget->isLittle(); + EVT VT = Op.getValueType(); + ISD::LoadExtType ExtType = LD->getExtensionType(); + SDValue Chain = LD->getChain(), Undef = DAG.getUNDEF(VT); + + assert((VT == MVT::i32) || (VT == MVT::i64)); + + // Expand + // (set dst, (i64 (load baseptr))) + // to + // (set tmp, (ldl (add baseptr, 7), undef)) + // (set dst, (ldr baseptr, tmp)) + if ((VT == MVT::i64) && (ExtType == ISD::NON_EXTLOAD)) { + SDValue LDL = CreateLoadLR(MipsISD::LDL, DAG, LD, Chain, Undef, + IsLittle ? 7 : 0); + return CreateLoadLR(MipsISD::LDR, DAG, LD, LDL.getValue(1), LDL, + IsLittle ? 0 : 7); + } + + SDValue LWL = CreateLoadLR(MipsISD::LWL, DAG, LD, Chain, Undef, + IsLittle ? 3 : 0); + SDValue LWR = CreateLoadLR(MipsISD::LWR, DAG, LD, LWL.getValue(1), LWL, + IsLittle ? 0 : 3); + + // Expand + // (set dst, (i32 (load baseptr))) or + // (set dst, (i64 (sextload baseptr))) or + // (set dst, (i64 (extload baseptr))) + // to + // (set tmp, (lwl (add baseptr, 3), undef)) + // (set dst, (lwr baseptr, tmp)) + if ((VT == MVT::i32) || (ExtType == ISD::SEXTLOAD) || + (ExtType == ISD::EXTLOAD)) + return LWR; + + assert((VT == MVT::i64) && (ExtType == ISD::ZEXTLOAD)); + + // Expand + // (set dst, (i64 (zextload baseptr))) + // to + // (set tmp0, (lwl (add baseptr, 3), undef)) + // (set tmp1, (lwr baseptr, tmp0)) + // (set tmp2, (shl tmp1, 32)) + // (set dst, (srl tmp2, 32)) + DebugLoc DL = LD->getDebugLoc(); + SDValue Const32 = DAG.getConstant(32, MVT::i32); + SDValue SLL = DAG.getNode(ISD::SHL, DL, MVT::i64, LWR, Const32); + SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i64, SLL, Const32); + SDValue Ops[] = { SRL, LWR.getValue(1) }; + return DAG.getMergeValues(Ops, 2, DL); +} + +static SDValue CreateStoreLR(unsigned Opc, SelectionDAG &DAG, StoreSDNode *SD, + SDValue Chain, unsigned Offset) { + SDValue Ptr = SD->getBasePtr(), Value = SD->getValue(); + EVT MemVT = SD->getMemoryVT(), BasePtrVT = Ptr.getValueType(); + DebugLoc DL = SD->getDebugLoc(); + SDVTList VTList = DAG.getVTList(MVT::Other); + + if (Offset) + Ptr = DAG.getNode(ISD::ADD, DL, BasePtrVT, Ptr, + DAG.getConstant(Offset, BasePtrVT)); + + SDValue Ops[] = { Chain, Value, Ptr }; + return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, 3, MemVT, + SD->getMemOperand()); +} + +// Expand an unaligned 32 or 64-bit integer store node. 
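+// As a concrete illustration (ours, not from the patch): on a little-endian
+// target, an unaligned i32 store of $t0 to the address in $t1 becomes the
+// pair
+//   swl $t0, 3($t1)
+//   swr $t0, 0($t1)
+// mirroring the (swl (add baseptr, 3)) / (swr baseptr) expansion described
+// below; a big-endian target swaps the two offsets.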
+SDValue MipsTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + StoreSDNode *SD = cast<StoreSDNode>(Op); + EVT MemVT = SD->getMemoryVT(); + + // Return if store is aligned or if MemVT is neither i32 nor i64. + if ((SD->getAlignment() >= MemVT.getSizeInBits() / 8) || + ((MemVT != MVT::i32) && (MemVT != MVT::i64))) + return SDValue(); + + bool IsLittle = Subtarget->isLittle(); + SDValue Value = SD->getValue(), Chain = SD->getChain(); + EVT VT = Value.getValueType(); + + // Expand + // (store val, baseptr) or + // (truncstore val, baseptr) + // to + // (swl val, (add baseptr, 3)) + // (swr val, baseptr) + if ((VT == MVT::i32) || SD->isTruncatingStore()) { + SDValue SWL = CreateStoreLR(MipsISD::SWL, DAG, SD, Chain, + IsLittle ? 3 : 0); + return CreateStoreLR(MipsISD::SWR, DAG, SD, SWL, IsLittle ? 0 : 3); + } + + assert(VT == MVT::i64); + + // Expand + // (store val, baseptr) + // to + // (sdl val, (add baseptr, 7)) + // (sdr val, baseptr) + SDValue SDL = CreateStoreLR(MipsISD::SDL, DAG, SD, Chain, IsLittle ? 7 : 0); + return CreateStoreLR(MipsISD::SDR, DAG, SD, SDL, IsLittle ? 0 : 7); +} + //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// @@ -2152,10 +2456,10 @@ static unsigned getNextIntArgReg(unsigned Reg) { // Write ByVal Arg to arg registers and stack. static void WriteByValArg(SDValue& ByValChain, SDValue Chain, DebugLoc dl, - SmallVector<std::pair<unsigned, SDValue>, 16>& RegsToPass, - SmallVector<SDValue, 8>& MemOpChains, int& LastFI, + SmallVector<std::pair<unsigned, SDValue>, 16> &RegsToPass, + SmallVector<SDValue, 8> &MemOpChains, int &LastFI, MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg, - const CCValAssign &VA, const ISD::ArgFlagsTy& Flags, + const CCValAssign &VA, const ISD::ArgFlagsTy &Flags, MVT PtrType, bool isLittle) { unsigned LocMemOffset = VA.getLocMemOffset(); unsigned Offset = 0; @@ -2243,10 +2547,10 @@ WriteByValArg(SDValue& ByValChain, SDValue Chain, DebugLoc dl, // Copy Mips64 byVal arg to registers and stack. void static PassByValArg64(SDValue& ByValChain, SDValue Chain, DebugLoc dl, - SmallVector<std::pair<unsigned, SDValue>, 16>& RegsToPass, - SmallVector<SDValue, 8>& MemOpChains, int& LastFI, + SmallVector<std::pair<unsigned, SDValue>, 16> &RegsToPass, + SmallVector<SDValue, 8> &MemOpChains, int &LastFI, MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg, - const CCValAssign &VA, const ISD::ArgFlagsTy& Flags, + const CCValAssign &VA, const ISD::ArgFlagsTy &Flags, EVT PtrTy, bool isLittle) { unsigned ByValSize = Flags.getByValSize(); unsigned Alignment = std::min(Flags.getByValAlign(), (unsigned)8); @@ -2332,14 +2636,20 @@ PassByValArg64(SDValue& ByValChain, SDValue Chain, DebugLoc dl, /// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted. /// TODO: isTailCall. 
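 // As a concrete O32 sketch (ours, not from the patch): for
 //
 //   struct S { int w[6]; };
 //   void f(struct S s);   // s is passed byval
 //
 // the first four words of the byval copy travel in $a0..$a3 and the
 // remaining two words are stored to the caller's outgoing argument area,
 // which is the copying WriteByValArg above performs.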
 SDValue
-MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
-                              CallingConv::ID CallConv, bool isVarArg,
-                              bool doesNotRet, bool &isTailCall,
-                              const SmallVectorImpl<ISD::OutputArg> &Outs,
-                              const SmallVectorImpl<SDValue> &OutVals,
-                              const SmallVectorImpl<ISD::InputArg> &Ins,
-                              DebugLoc dl, SelectionDAG &DAG,
+MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                               SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG = CLI.DAG;
+  DebugLoc &dl = CLI.DL;
+  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+  SDValue InChain = CLI.Chain;
+  SDValue Callee = CLI.Callee;
+  SDValue CalleeSave = CLI.Callee;
+  bool &isTailCall = CLI.IsTailCall;
+  CallingConv::ID CallConv = CLI.CallConv;
+  bool isVarArg = CLI.IsVarArg;
+
   // The MIPS target does not yet support tail call optimization.
   isTailCall = false;
@@ -2354,7 +2664,9 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                  getTargetMachine(), ArgLocs, *DAG.getContext());

-  if (IsO32)
+  if (CallConv == CallingConv::Fast)
+    CCInfo.AnalyzeCallOperands(Outs, CC_Mips_FastCC);
+  else if (IsO32)
     CCInfo.AnalyzeCallOperands(Outs, CC_MipsO32);
   else if (HasMips64)
     AnalyzeMips64CallOperands(CCInfo, Outs);
@@ -2372,11 +2684,6 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
   Chain = CallSeqStart = DAG.getCALLSEQ_START(InChain, NextStackOffsetVal);
   ByValChain = InChain;

-  // If this is the first call, create a stack frame object that points to
-  // a location to which .cprestore saves $gp.
-  if (IsO32 && IsPIC && MipsFI->globalBaseRegFixed() && !MipsFI->getGPFI())
-    MipsFI->setGPFI(MFI->CreateFixedObject(4, 0, true));
-
   // Get the frame index of the stack frame object that points to the location
   // of dynamically allocated area on the stack.
   int DynAllocFI = MipsFI->getDynAllocFI();
@@ -2384,7 +2691,7 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
   // Update size of the maximum argument space.
   // For O32, a minimum of four words (16 bytes) of argument space is
   // allocated.
-  if (IsO32)
+  if (IsO32 && (CallConv != CallingConv::Fast))
     NextStackOffset = std::max(NextStackOffset, (unsigned)16);

   unsigned MaxCallFrameSize = MipsFI->getMaxCallFrameSize();
@@ -2399,9 +2706,6 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
     NextStackOffset = (NextStackOffset + StackAlignment - 1) /
                       StackAlignment * StackAlignment;

-    if (MipsFI->needGPSaveRestore())
-      MFI->setObjectOffset(MipsFI->getGPFI(), NextStackOffset);
-
     MFI->setObjectOffset(DynAllocFI, NextStackOffset);
   }
@@ -2573,6 +2877,14 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
     Callee = DAG.getRegister(T9Reg, getPointerTy());
   }

+  // Insert a "GP copy globalreg" node before the call to the function.
+  // Lazy-binding stubs require GP to point to the GOT.
+  if (IsPICCall) {
+    unsigned GPReg = IsN64 ? Mips::GP_64 : Mips::GP;
+    EVT Ty = IsN64 ? MVT::i64 : MVT::i32;
+    RegsToPass.push_back(std::make_pair(GPReg, GetGlobalReg(DAG, Ty)));
+  }
+
   // Build a sequence of copy-to-reg nodes chained together with token
   // chain and flag operands which copy the outgoing args into registers.
   // The InFlag is necessary since all emitted instructions must be
@@ -2590,7 +2902,7 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   SmallVector<SDValue, 8> Ops;
   Ops.push_back(Chain);
-  Ops.push_back(Callee);
+  Ops.push_back(Subtarget->inMips16Mode() ? CalleeSave : Callee);

   // Add argument registers to the end of the list so that they are
   // known live into the call.
@@ -2598,6 +2910,8 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                   RegsToPass[i].second.getValueType()));

+  if (Subtarget->inMips16Mode())
+    Ops.push_back(Callee);
+
   // Add a register mask operand representing the call-preserved registers.
   const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
@@ -2633,7 +2947,7 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-		 getTargetMachine(), RVLocs, *DAG.getContext());
+                 getTargetMachine(), RVLocs, *DAG.getContext());

   CCInfo.AnalyzeCallResult(Ins, RetCC_Mips);

@@ -2652,9 +2966,9 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 // Formal Arguments Calling Convention Implementation
 //===----------------------------------------------------------------------===//
 static void ReadByValArg(MachineFunction &MF, SDValue Chain, DebugLoc dl,
-                         std::vector<SDValue>& OutChains,
+                         std::vector<SDValue> &OutChains,
                          SelectionDAG &DAG, unsigned NumWords, SDValue FIN,
-                         const CCValAssign &VA, const ISD::ArgFlagsTy& Flags,
+                         const CCValAssign &VA, const ISD::ArgFlagsTy &Flags,
                          const Argument *FuncArg) {
   unsigned LocMem = VA.getLocMemOffset();
   unsigned FirstWord = LocMem / 4;
@@ -2666,7 +2980,7 @@ static void ReadByValArg(MachineFunction &MF, SDValue Chain, DebugLoc dl,
       break;

     unsigned SrcReg = O32IntRegs[CurWord];
-    unsigned Reg = AddLiveIn(MF, SrcReg, Mips::CPURegsRegisterClass);
+    unsigned Reg = AddLiveIn(MF, SrcReg, &Mips::CPURegsRegClass);
     SDValue StorePtr = DAG.getNode(ISD::ADD, dl, MVT::i32, FIN,
                                    DAG.getConstant(i * 4, MVT::i32));
     SDValue Store = DAG.getStore(Chain, dl, DAG.getRegister(Reg, MVT::i32),
@@ -2679,8 +2993,8 @@ static void ReadByValArg(MachineFunction &MF, SDValue Chain, DebugLoc dl,
 // Create frame object on stack and copy registers used for byval passing
 // to it.
 static unsigned
 CopyMips64ByValRegs(MachineFunction &MF, SDValue Chain, DebugLoc dl,
-                    std::vector<SDValue>& OutChains, SelectionDAG &DAG,
-                    const CCValAssign &VA, const ISD::ArgFlagsTy& Flags,
+                    std::vector<SDValue> &OutChains, SelectionDAG &DAG,
+                    const CCValAssign &VA, const ISD::ArgFlagsTy &Flags,
                     MachineFrameInfo *MFI, bool IsRegLoc,
                     SmallVectorImpl<SDValue> &InVals, MipsFunctionInfo *MipsFI,
                     EVT PtrTy, const Argument *FuncArg) {
@@ -2703,7 +3017,7 @@ CopyMips64ByValRegs(MachineFunction &MF, SDValue Chain, DebugLoc dl,
   // Copy arg registers.
   for (unsigned I = 0; (Reg != Mips64IntRegs + 8) && (I < NumRegs);
        ++Reg, ++I) {
-    unsigned VReg = AddLiveIn(MF, *Reg, Mips::CPU64RegsRegisterClass);
+    unsigned VReg = AddLiveIn(MF, *Reg, &Mips::CPU64RegsRegClass);
     SDValue StorePtr = DAG.getNode(ISD::ADD, dl, PtrTy, FIN,
                                    DAG.getConstant(I * 8, PtrTy));
     SDValue Store = DAG.getStore(Chain, dl, DAG.getRegister(VReg, MVT::i64),
@@ -2739,7 +3053,9 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                  getTargetMachine(), ArgLocs, *DAG.getContext());

-  if (IsO32)
+  if (CallConv == CallingConv::Fast)
+    CCInfo.AnalyzeFormalArguments(Ins, CC_Mips_FastCC);
+  else if (IsO32)
     CCInfo.AnalyzeFormalArguments(Ins, CC_MipsO32);
   else
     CCInfo.AnalyzeFormalArguments(Ins, CC_Mips);
@@ -2779,13 +3095,13 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
       const TargetRegisterClass *RC;

       if (RegVT == MVT::i32)
-        RC = Mips::CPURegsRegisterClass;
+        RC = &Mips::CPURegsRegClass;
       else if (RegVT == MVT::i64)
-        RC = Mips::CPU64RegsRegisterClass;
+        RC = &Mips::CPU64RegsRegClass;
       else if (RegVT == MVT::f32)
-        RC = Mips::FGR32RegisterClass;
+        RC = &Mips::FGR32RegClass;
       else if (RegVT == MVT::f64)
-        RC = HasMips64 ? Mips::FGR64RegisterClass : Mips::AFGR64RegisterClass;
+        RC = HasMips64 ? &Mips::FGR64RegClass : &Mips::AFGR64RegClass;
       else
         llvm_unreachable("RegVT not supported by FormalArguments Lowering");
@@ -2859,8 +3175,9 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
     const uint16_t *ArgRegs = IsO32 ? O32IntRegs : Mips64IntRegs;
     unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs, NumOfRegs);
     int FirstRegSlotOffset = IsO32 ? 0 : -64; // Offset of $a0's slot.
-    const TargetRegisterClass *RC
-      = IsO32 ? Mips::CPURegsRegisterClass : Mips::CPU64RegsRegisterClass;
+    const TargetRegisterClass *RC = IsO32 ?
+      (const TargetRegisterClass*)&Mips::CPURegsRegClass :
+      (const TargetRegisterClass*)&Mips::CPU64RegsRegClass;
     unsigned RegSize = RC->getSize();
     int RegSlotOffset = FirstRegSlotOffset + Idx * RegSize;

@@ -2924,7 +3241,7 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
   // CCState - Info about the registers and stack slot.
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-		 getTargetMachine(), RVLocs, *DAG.getContext());
+                 getTargetMachine(), RVLocs, *DAG.getContext());

   // Analyze return values.
   CCInfo.AnalyzeReturn(Outs, RetCC_Mips);

@@ -2970,11 +3287,10 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
   // Return on Mips is always a "jr $ra"
   if (Flag.getNode())
-    return DAG.getNode(MipsISD::Ret, dl, MVT::Other,
-                       Chain, DAG.getRegister(Mips::RA, MVT::i32), Flag);
-  else // Return Void
-    return DAG.getNode(MipsISD::Ret, dl, MVT::Other,
-                       Chain, DAG.getRegister(Mips::RA, MVT::i32));
+    return DAG.getNode(MipsISD::Ret, dl, MVT::Other, Chain, Flag);
+
+  // Return Void
+  return DAG.getNode(MipsISD::Ret, dl, MVT::Other, Chain);
 }

 //===----------------------------------------------------------------------===//
@@ -2993,13 +3309,19 @@ getConstraintType(const std::string &Constraint) const
   //       unless generating MIPS16 code.
   // 'y' : Equivalent to r; retained for
   //       backwards compatibility.
   // 'f' : Floating Point registers.
+  // 'c' : A register suitable for use in an indirect
+  //       jump. This will always be $25 for -mabicalls.
+  // 'l' : The lo register. 1 word storage.
+  // 'x' : The hilo register pair. Double word storage.
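+  // A usage sketch (ours, not from the patch; these letters match the GCC
+  // MIPS inline-asm constraints):
+  //
+  //   unsigned LoBits(unsigned A, unsigned B) {
+  //     unsigned Lo;
+  //     __asm__("multu %1, %2" : "=l"(Lo) : "r"(A), "r"(B));
+  //     return Lo; // low word of the 64-bit product, read from $lo
+  //   }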
   if (Constraint.size() == 1) {
     switch (Constraint[0]) {
       default: break;
       case 'd':
       case 'y':
       case 'f':
+      case 'c':
+      case 'l':
+      case 'x':
         return C_RegisterClass;
     }
   }
@@ -3033,6 +3355,22 @@ MipsTargetLowering::getSingleConstraintMatchWeight(
     if (type->isFloatTy())
       weight = CW_Register;
     break;
+  case 'c': // $25 for indirect jumps
+  case 'l': // lo register
+  case 'x': // hilo register pair
+    if (type->isIntegerTy())
+      weight = CW_SpecificReg;
+    break;
+  case 'I': // signed 16 bit immediate
+  case 'J': // integer zero
+  case 'K': // unsigned 16 bit immediate
+  case 'L': // signed 32 bit immediate where lower 16 bits are 0
+  case 'N': // immediate in the range of -65535 to -1 (inclusive)
+  case 'O': // signed 15 bit immediate (+- 16383)
+  case 'P': // immediate in the range of 1 to 65535 (inclusive)
+    if (isa<ConstantInt>(CallOperandVal))
+      weight = CW_Constant;
+    break;
   }
   return weight;
 }
@@ -3048,30 +3386,152 @@ getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const
     case 'd': // Address register. Same as 'r' unless generating MIPS16 code.
     case 'y': // Same as 'r'. Exists for compatibility.
     case 'r':
-      if (VT == MVT::i32)
-        return std::make_pair(0U, Mips::CPURegsRegisterClass);
-      assert(VT == MVT::i64 && "Unexpected type.");
-      return std::make_pair(0U, Mips::CPU64RegsRegisterClass);
+      if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8)
+        return std::make_pair(0U, &Mips::CPURegsRegClass);
+      if (VT == MVT::i64 && !HasMips64)
+        return std::make_pair(0U, &Mips::CPURegsRegClass);
+      if (VT == MVT::i64 && HasMips64)
+        return std::make_pair(0U, &Mips::CPU64RegsRegClass);
+      // This will generate an error message.
+      return std::make_pair(0u, static_cast<const TargetRegisterClass*>(0));
     case 'f':
       if (VT == MVT::f32)
-        return std::make_pair(0U, Mips::FGR32RegisterClass);
+        return std::make_pair(0U, &Mips::FGR32RegClass);
       if ((VT == MVT::f64) && (!Subtarget->isSingleFloat())) {
         if (Subtarget->isFP64bit())
-          return std::make_pair(0U, Mips::FGR64RegisterClass);
-        else
-          return std::make_pair(0U, Mips::AFGR64RegisterClass);
+          return std::make_pair(0U, &Mips::FGR64RegClass);
+        return std::make_pair(0U, &Mips::AFGR64RegClass);
       }
+      break;
+    case 'c': // register suitable for indirect jump
+      if (VT == MVT::i32)
+        return std::make_pair((unsigned)Mips::T9, &Mips::CPURegsRegClass);
+      assert(VT == MVT::i64 && "Unexpected type.");
+      return std::make_pair((unsigned)Mips::T9_64, &Mips::CPU64RegsRegClass);
+    case 'l': // the lo register
+      if (VT == MVT::i32)
+        return std::make_pair((unsigned)Mips::LO, &Mips::HILORegClass);
+      return std::make_pair((unsigned)Mips::LO64, &Mips::HILO64RegClass);
+    case 'x': // the hilo register pair
+      // FIXME: Not triggering the use of both hi and lo.
+      // This will generate an error message.
+      return std::make_pair(0u, static_cast<const TargetRegisterClass*>(0));
    }
  }
  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
}

+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector. If it is invalid, don't add anything to Ops.
+void MipsTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                      std::string &Constraint,
+                                                      std::vector<SDValue> &Ops,
+                                                      SelectionDAG &DAG) const {
+  SDValue Result(0, 0);
+
+  // Only support length 1 constraints for now.
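+  // For instance (illustrative only), an asm statement such as
+  //
+  //   __asm__("addiu %0, %1, %2" : "=r"(Dst) : "r"(Src), "I"(4));
+  //
+  // arrives here with Constraint == "I" and Op holding the constant 4, which
+  // the 'I' case below accepts as a signed 16-bit immediate.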
+ if (Constraint.length() > 1) return; + + char ConstraintLetter = Constraint[0]; + switch (ConstraintLetter) { + default: break; // This will fall through to the generic implementation + case 'I': // Signed 16 bit constant + // If this fails, the parent routine will give an error + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + EVT Type = Op.getValueType(); + int64_t Val = C->getSExtValue(); + if (isInt<16>(Val)) { + Result = DAG.getTargetConstant(Val, Type); + break; + } + } + return; + case 'J': // integer zero + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + EVT Type = Op.getValueType(); + int64_t Val = C->getZExtValue(); + if (Val == 0) { + Result = DAG.getTargetConstant(0, Type); + break; + } + } + return; + case 'K': // unsigned 16 bit immediate + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + EVT Type = Op.getValueType(); + uint64_t Val = (uint64_t)C->getZExtValue(); + if (isUInt<16>(Val)) { + Result = DAG.getTargetConstant(Val, Type); + break; + } + } + return; + case 'L': // signed 32 bit immediate where lower 16 bits are 0 + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + EVT Type = Op.getValueType(); + int64_t Val = C->getSExtValue(); + if ((isInt<32>(Val)) && ((Val & 0xffff) == 0)){ + Result = DAG.getTargetConstant(Val, Type); + break; + } + } + return; + case 'N': // immediate in the range of -65535 to -1 (inclusive) + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + EVT Type = Op.getValueType(); + int64_t Val = C->getSExtValue(); + if ((Val >= -65535) && (Val <= -1)) { + Result = DAG.getTargetConstant(Val, Type); + break; + } + } + return; + case 'O': // signed 15 bit immediate + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + EVT Type = Op.getValueType(); + int64_t Val = C->getSExtValue(); + if ((isInt<15>(Val))) { + Result = DAG.getTargetConstant(Val, Type); + break; + } + } + return; + case 'P': // immediate in the range of 1 to 65535 (inclusive) + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + EVT Type = Op.getValueType(); + int64_t Val = C->getSExtValue(); + if ((Val <= 65535) && (Val >= 1)) { + Result = DAG.getTargetConstant(Val, Type); + break; + } + } + return; + } + + if (Result.getNode()) { + Ops.push_back(Result); + return; + } + + TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); +} + bool MipsTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // The Mips target isn't yet aware of offsets. return false; } +EVT MipsTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsZeroVal, + bool MemcpyStrSrc, + MachineFunction &MF) const { + if (Subtarget->hasMips64()) + return MVT::i64; + + return MVT::i32; +} + bool MipsTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { if (VT != MVT::f32 && VT != MVT::f64) return false; diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index c36f40f..edab03c 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -79,7 +79,17 @@ namespace llvm { Sync, Ext, - Ins + Ins, + + // Load/Store Left/Right nodes. 
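+    // These model the MIPS unaligned-access instruction pairs. As an
+    // illustration (ours, little-endian), an unaligned word load of $t0
+    // from the address in $t1 is emitted as
+    //   lwl $t0, 3($t1)   # fills the high-order bytes
+    //   lwr $t0, 0($t1)   # merges in the low-order bytes
+    // matching the LowerLOAD expansion in MipsISelLowering.cpp.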
+ LWL = ISD::FIRST_TARGET_MEMORY_OPCODE, + LWR, + SWL, + SWR, + LDL, + LDR, + SDL, + SDR }; } @@ -128,13 +138,20 @@ namespace llvm { SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG& DAG) const; SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const; + SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG& DAG) const; + SDValue LowerShiftRightParts(SDValue Op, SelectionDAG& DAG, + bool IsSRA) const; + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; virtual SDValue LowerFormalArguments(SDValue Chain, @@ -144,13 +161,7 @@ namespace llvm { SmallVectorImpl<SDValue> &InVals) const; virtual SDValue - LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; virtual SDValue @@ -176,8 +187,22 @@ namespace llvm { getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const; + /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops + /// vector. If it is invalid, don't add anything to Ops. If hasMemory is + /// true it means one of the asm constraint of the inline asm instruction + /// being processed is 'm'. + virtual void LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const; + virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; + virtual EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsZeroVal, + bool MemcpyStrSrc, + MachineFunction &MF) const; + /// isFPImmLegal - Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index 14d8f1e..9654b86 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -54,10 +54,14 @@ let PrintMethod = "printFCCOperand", DecoderMethod = "DecodeCondCode" in // Feature predicates. 
//===----------------------------------------------------------------------===// -def IsFP64bit : Predicate<"Subtarget.isFP64bit()">, AssemblerPredicate<"FeatureFP64Bit">; -def NotFP64bit : Predicate<"!Subtarget.isFP64bit()">, AssemblerPredicate<"!FeatureFP64Bit">; -def IsSingleFloat : Predicate<"Subtarget.isSingleFloat()">, AssemblerPredicate<"FeatureSingleFloat">; -def IsNotSingleFloat : Predicate<"!Subtarget.isSingleFloat()">, AssemblerPredicate<"!FeatureSingleFloat">; +def IsFP64bit : Predicate<"Subtarget.isFP64bit()">, + AssemblerPredicate<"FeatureFP64Bit">; +def NotFP64bit : Predicate<"!Subtarget.isFP64bit()">, + AssemblerPredicate<"!FeatureFP64Bit">; +def IsSingleFloat : Predicate<"Subtarget.isSingleFloat()">, + AssemblerPredicate<"FeatureSingleFloat">; +def IsNotSingleFloat : Predicate<"!Subtarget.isSingleFloat()">, + AssemblerPredicate<"!FeatureSingleFloat">; // FP immediate patterns. def fpimm0 : PatLeaf<(fpimm), [{ @@ -117,15 +121,15 @@ class FPIdxStore<bits<6> funct, string opstr, RegisterClass DRC, multiclass FFR1_W_M<bits<6> funct, string opstr> { def _S : FFR1<funct, 16, opstr, "w.s", FGR32, FGR32>; def _D32 : FFR1<funct, 17, opstr, "w.d", FGR32, AFGR64>, - Requires<[NotFP64bit]>; + Requires<[NotFP64bit, HasStandardEncoding]>; def _D64 : FFR1<funct, 17, opstr, "w.d", FGR32, FGR64>, - Requires<[IsFP64bit]> { + Requires<[IsFP64bit, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } } // Instructions that convert an FP value to 64-bit fixed point. -let Predicates = [IsFP64bit], DecoderNamespace = "Mips64" in +let Predicates = [IsFP64bit, HasStandardEncoding], DecoderNamespace = "Mips64" in multiclass FFR1_L_M<bits<6> funct, string opstr> { def _S : FFR1<funct, 16, opstr, "l.s", FGR64, FGR32>; def _D64 : FFR1<funct, 17, opstr, "l.d", FGR64, FGR64>; @@ -135,9 +139,9 @@ multiclass FFR1_L_M<bits<6> funct, string opstr> { multiclass FFR1P_M<bits<6> funct, string opstr, SDNode OpNode> { def _S : FFR1P<funct, 16, opstr, "s", FGR32, FGR32, OpNode>; def _D32 : FFR1P<funct, 17, opstr, "d", AFGR64, AFGR64, OpNode>, - Requires<[NotFP64bit]>; + Requires<[NotFP64bit, HasStandardEncoding]>; def _D64 : FFR1P<funct, 17, opstr, "d", FGR64, FGR64, OpNode>, - Requires<[IsFP64bit]> { + Requires<[IsFP64bit, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } } @@ -146,9 +150,9 @@ multiclass FFR2P_M<bits<6> funct, string opstr, SDNode OpNode, bit isComm = 0> { let isCommutable = isComm in { def _S : FFR2P<funct, 16, opstr, "s", FGR32, OpNode>; def _D32 : FFR2P<funct, 17, opstr, "d", AFGR64, OpNode>, - Requires<[NotFP64bit]>; + Requires<[NotFP64bit, HasStandardEncoding]>; def _D64 : FFR2P<funct, 17, opstr, "d", FGR64, OpNode>, - Requires<[IsFP64bit]> { + Requires<[IsFP64bit, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } } @@ -185,13 +189,13 @@ def CVT_S_W : FFR1<0x20, 20, "cvt", "s.w", FGR32, FGR32>; def CVT_L_S : FFR1<0x25, 16, "cvt", "l.s", FGR64, FGR32>; def CVT_L_D64: FFR1<0x25, 17, "cvt", "l.d", FGR64, FGR64>; -let Predicates = [NotFP64bit] in { +let Predicates = [NotFP64bit, HasStandardEncoding] in { def CVT_S_D32 : FFR1<0x20, 17, "cvt", "s.d", FGR32, AFGR64>; def CVT_D32_W : FFR1<0x21, 20, "cvt", "d.w", AFGR64, FGR32>; def CVT_D32_S : FFR1<0x21, 16, "cvt", "d.s", AFGR64, FGR32>; } -let Predicates = [IsFP64bit], DecoderNamespace = "Mips64" in { +let Predicates = [IsFP64bit, HasStandardEncoding], DecoderNamespace = "Mips64" in { def CVT_S_D64 : FFR1<0x20, 17, "cvt", "s.d", FGR32, FGR64>; def CVT_S_L : FFR1<0x20, 21, "cvt", "s.l", FGR32, FGR64>; def CVT_D64_W : 
FFR1<0x21, 20, "cvt", "d.w", FGR64, FGR32>; @@ -199,7 +203,7 @@ let Predicates = [IsFP64bit], DecoderNamespace = "Mips64" in { def CVT_D64_L : FFR1<0x21, 21, "cvt", "d.l", FGR64, FGR64>; } -let Predicates = [NoNaNsFPMath] in { +let Predicates = [NoNaNsFPMath, HasStandardEncoding] in { defm FABS : FFR1P_M<0x5, "abs", fabs>; defm FNEG : FFR1P_M<0x7, "neg", fneg>; } @@ -242,14 +246,14 @@ def DMTC1 : FFRGPR<0x05, (outs FGR64:$fs), (ins CPU64Regs:$rt), def FMOV_S : FFR1<0x6, 16, "mov", "s", FGR32, FGR32>; def FMOV_D32 : FFR1<0x6, 17, "mov", "d", AFGR64, AFGR64>, - Requires<[NotFP64bit]>; + Requires<[NotFP64bit, HasStandardEncoding]>; def FMOV_D64 : FFR1<0x6, 17, "mov", "d", FGR64, FGR64>, - Requires<[IsFP64bit]> { + Requires<[IsFP64bit, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } /// Floating Point Memory Instructions -let Predicates = [IsN64], DecoderNamespace = "Mips64" in { +let Predicates = [IsN64, HasStandardEncoding], DecoderNamespace = "Mips64" in { def LWC1_P8 : FPLoad<0x31, "lwc1", FGR32, mem64>; def SWC1_P8 : FPStore<0x39, "swc1", FGR32, mem64>; def LDC164_P8 : FPLoad<0x35, "ldc1", FGR64, mem64> { @@ -260,41 +264,42 @@ let Predicates = [IsN64], DecoderNamespace = "Mips64" in { } } -let Predicates = [NotN64] in { +let Predicates = [NotN64, HasStandardEncoding] in { def LWC1 : FPLoad<0x31, "lwc1", FGR32, mem>; def SWC1 : FPStore<0x39, "swc1", FGR32, mem>; } -let Predicates = [NotN64, HasMips64], DecoderNamespace = "Mips64" in { +let Predicates = [NotN64, HasMips64, HasStandardEncoding], + DecoderNamespace = "Mips64" in { def LDC164 : FPLoad<0x35, "ldc1", FGR64, mem>; def SDC164 : FPStore<0x3d, "sdc1", FGR64, mem>; } -let Predicates = [NotN64, NotMips64] in { +let Predicates = [NotN64, NotMips64, HasStandardEncoding] in { def LDC1 : FPLoad<0x35, "ldc1", AFGR64, mem>; def SDC1 : FPStore<0x3d, "sdc1", AFGR64, mem>; } // Indexed loads and stores. 
-let Predicates = [HasMips32r2Or64] in { +let Predicates = [HasMips32r2Or64, HasStandardEncoding] in { def LWXC1 : FPIdxLoad<0x0, "lwxc1", FGR32, CPURegs, load_a>; def LUXC1 : FPIdxLoad<0x5, "luxc1", FGR32, CPURegs, load_u>; def SWXC1 : FPIdxStore<0x8, "swxc1", FGR32, CPURegs, store_a>; def SUXC1 : FPIdxStore<0xd, "suxc1", FGR32, CPURegs, store_u>; } -let Predicates = [HasMips32r2, NotMips64] in { +let Predicates = [HasMips32r2, NotMips64, HasStandardEncoding] in { def LDXC1 : FPIdxLoad<0x1, "ldxc1", AFGR64, CPURegs, load_a>; def SDXC1 : FPIdxStore<0x9, "sdxc1", AFGR64, CPURegs, store_a>; } -let Predicates = [HasMips64, NotN64], DecoderNamespace="Mips64" in { +let Predicates = [HasMips64, NotN64, HasStandardEncoding], DecoderNamespace="Mips64" in { def LDXC164 : FPIdxLoad<0x1, "ldxc1", FGR64, CPURegs, load_a>; def SDXC164 : FPIdxStore<0x9, "sdxc1", FGR64, CPURegs, store_a>; } // n64 -let Predicates = [IsN64], isCodeGenOnly=1 in { +let Predicates = [IsN64, HasStandardEncoding], isCodeGenOnly=1 in { def LWXC1_P8 : FPIdxLoad<0x0, "lwxc1", FGR32, CPU64Regs, load_a>; def LUXC1_P8 : FPIdxLoad<0x5, "luxc1", FGR32, CPU64Regs, load_u>; def LDXC164_P8 : FPIdxLoad<0x1, "ldxc1", FGR64, CPU64Regs, load_a>; @@ -309,32 +314,33 @@ defm FDIV : FFR2P_M<0x03, "div", fdiv>; defm FMUL : FFR2P_M<0x02, "mul", fmul, 1>; defm FSUB : FFR2P_M<0x01, "sub", fsub>; -let Predicates = [HasMips32r2] in { +let Predicates = [HasMips32r2, HasStandardEncoding] in { def MADD_S : FMADDSUB<0x4, 0, "madd", "s", fadd, FGR32>; def MSUB_S : FMADDSUB<0x5, 0, "msub", "s", fsub, FGR32>; } -let Predicates = [HasMips32r2, NoNaNsFPMath] in { +let Predicates = [HasMips32r2, NoNaNsFPMath, HasStandardEncoding] in { def NMADD_S : FNMADDSUB<0x6, 0, "nmadd", "s", fadd, FGR32>; def NMSUB_S : FNMADDSUB<0x7, 0, "nmsub", "s", fsub, FGR32>; } -let Predicates = [HasMips32r2, NotFP64bit] in { +let Predicates = [HasMips32r2, NotFP64bit, HasStandardEncoding] in { def MADD_D32 : FMADDSUB<0x4, 1, "madd", "d", fadd, AFGR64>; def MSUB_D32 : FMADDSUB<0x5, 1, "msub", "d", fsub, AFGR64>; } -let Predicates = [HasMips32r2, NotFP64bit, NoNaNsFPMath] in { +let Predicates = [HasMips32r2, NotFP64bit, NoNaNsFPMath, HasStandardEncoding] in { def NMADD_D32 : FNMADDSUB<0x6, 1, "nmadd", "d", fadd, AFGR64>; def NMSUB_D32 : FNMADDSUB<0x7, 1, "nmsub", "d", fsub, AFGR64>; } -let Predicates = [HasMips32r2, IsFP64bit], isCodeGenOnly=1 in { +let Predicates = [HasMips32r2, IsFP64bit, HasStandardEncoding], isCodeGenOnly=1 in { def MADD_D64 : FMADDSUB<0x4, 1, "madd", "d", fadd, FGR64>; def MSUB_D64 : FMADDSUB<0x5, 1, "msub", "d", fsub, FGR64>; } -let Predicates = [HasMips32r2, IsFP64bit, NoNaNsFPMath], isCodeGenOnly=1 in { +let Predicates = [HasMips32r2, IsFP64bit, NoNaNsFPMath, HasStandardEncoding], + isCodeGenOnly=1 in { def NMADD_D64 : FNMADDSUB<0x6, 1, "nmadd", "d", fadd, FGR64>; def NMSUB_D64 : FNMADDSUB<0x7, 1, "nmsub", "d", fsub, FGR64>; } @@ -391,8 +397,10 @@ class FCMP<bits<5> fmt, RegisterClass RC, string typestr> : /// Floating Point Compare let Defs=[FCR31] in { def FCMP_S32 : FCMP<0x10, FGR32, "s">; - def FCMP_D32 : FCMP<0x11, AFGR64, "d">, Requires<[NotFP64bit]>; - def FCMP_D64 : FCMP<0x11, FGR64, "d">, Requires<[IsFP64bit]> { + def FCMP_D32 : FCMP<0x11, AFGR64, "d">, + Requires<[NotFP64bit, HasStandardEncoding]>; + def FCMP_D64 : FCMP<0x11, FGR64, "d">, + Requires<[IsFP64bit, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } } @@ -423,46 +431,52 @@ def ExtractElementF64 : //===----------------------------------------------------------------------===// // 
Floating Point Patterns //===----------------------------------------------------------------------===// -def : Pat<(f32 fpimm0), (MTC1 ZERO)>; -def : Pat<(f32 fpimm0neg), (FNEG_S (MTC1 ZERO))>; +def : MipsPat<(f32 fpimm0), (MTC1 ZERO)>; +def : MipsPat<(f32 fpimm0neg), (FNEG_S (MTC1 ZERO))>; -def : Pat<(f32 (sint_to_fp CPURegs:$src)), (CVT_S_W (MTC1 CPURegs:$src))>; -def : Pat<(i32 (fp_to_sint FGR32:$src)), (MFC1 (TRUNC_W_S FGR32:$src))>; +def : MipsPat<(f32 (sint_to_fp CPURegs:$src)), (CVT_S_W (MTC1 CPURegs:$src))>; +def : MipsPat<(i32 (fp_to_sint FGR32:$src)), (MFC1 (TRUNC_W_S FGR32:$src))>; -let Predicates = [NotFP64bit] in { - def : Pat<(f64 (sint_to_fp CPURegs:$src)), (CVT_D32_W (MTC1 CPURegs:$src))>; - def : Pat<(i32 (fp_to_sint AFGR64:$src)), (MFC1 (TRUNC_W_D32 AFGR64:$src))>; - def : Pat<(f32 (fround AFGR64:$src)), (CVT_S_D32 AFGR64:$src)>; - def : Pat<(f64 (fextend FGR32:$src)), (CVT_D32_S FGR32:$src)>; +let Predicates = [NotFP64bit, HasStandardEncoding] in { + def : MipsPat<(f64 (sint_to_fp CPURegs:$src)), + (CVT_D32_W (MTC1 CPURegs:$src))>; + def : MipsPat<(i32 (fp_to_sint AFGR64:$src)), + (MFC1 (TRUNC_W_D32 AFGR64:$src))>; + def : MipsPat<(f32 (fround AFGR64:$src)), (CVT_S_D32 AFGR64:$src)>; + def : MipsPat<(f64 (fextend FGR32:$src)), (CVT_D32_S FGR32:$src)>; } -let Predicates = [IsFP64bit] in { - def : Pat<(f64 fpimm0), (DMTC1 ZERO_64)>; - def : Pat<(f64 fpimm0neg), (FNEG_D64 (DMTC1 ZERO_64))>; +let Predicates = [IsFP64bit, HasStandardEncoding] in { + def : MipsPat<(f64 fpimm0), (DMTC1 ZERO_64)>; + def : MipsPat<(f64 fpimm0neg), (FNEG_D64 (DMTC1 ZERO_64))>; - def : Pat<(f64 (sint_to_fp CPURegs:$src)), (CVT_D64_W (MTC1 CPURegs:$src))>; - def : Pat<(f32 (sint_to_fp CPU64Regs:$src)), - (CVT_S_L (DMTC1 CPU64Regs:$src))>; - def : Pat<(f64 (sint_to_fp CPU64Regs:$src)), - (CVT_D64_L (DMTC1 CPU64Regs:$src))>; + def : MipsPat<(f64 (sint_to_fp CPURegs:$src)), + (CVT_D64_W (MTC1 CPURegs:$src))>; + def : MipsPat<(f32 (sint_to_fp CPU64Regs:$src)), + (CVT_S_L (DMTC1 CPU64Regs:$src))>; + def : MipsPat<(f64 (sint_to_fp CPU64Regs:$src)), + (CVT_D64_L (DMTC1 CPU64Regs:$src))>; - def : Pat<(i32 (fp_to_sint FGR64:$src)), (MFC1 (TRUNC_W_D64 FGR64:$src))>; - def : Pat<(i64 (fp_to_sint FGR32:$src)), (DMFC1 (TRUNC_L_S FGR32:$src))>; - def : Pat<(i64 (fp_to_sint FGR64:$src)), (DMFC1 (TRUNC_L_D64 FGR64:$src))>; + def : MipsPat<(i32 (fp_to_sint FGR64:$src)), + (MFC1 (TRUNC_W_D64 FGR64:$src))>; + def : MipsPat<(i64 (fp_to_sint FGR32:$src)), (DMFC1 (TRUNC_L_S FGR32:$src))>; + def : MipsPat<(i64 (fp_to_sint FGR64:$src)), + (DMFC1 (TRUNC_L_D64 FGR64:$src))>; - def : Pat<(f32 (fround FGR64:$src)), (CVT_S_D64 FGR64:$src)>; - def : Pat<(f64 (fextend FGR32:$src)), (CVT_D64_S FGR32:$src)>; + def : MipsPat<(f32 (fround FGR64:$src)), (CVT_S_D64 FGR64:$src)>; + def : MipsPat<(f64 (fextend FGR32:$src)), (CVT_D64_S FGR32:$src)>; } // Patterns for unaligned floating point loads and stores. 
-let Predicates = [HasMips32r2Or64, NotN64] in { - def : Pat<(f32 (load_u CPURegs:$addr)), (LUXC1 CPURegs:$addr, ZERO)>; - def : Pat<(store_u FGR32:$src, CPURegs:$addr), - (SUXC1 FGR32:$src, CPURegs:$addr, ZERO)>; +let Predicates = [HasMips32r2Or64, NotN64, HasStandardEncoding] in { + def : MipsPat<(f32 (load_u CPURegs:$addr)), (LUXC1 CPURegs:$addr, ZERO)>; + def : MipsPat<(store_u FGR32:$src, CPURegs:$addr), + (SUXC1 FGR32:$src, CPURegs:$addr, ZERO)>; } -let Predicates = [IsN64] in { - def : Pat<(f32 (load_u CPU64Regs:$addr)), (LUXC1_P8 CPU64Regs:$addr, ZERO_64)>; - def : Pat<(store_u FGR32:$src, CPU64Regs:$addr), - (SUXC1_P8 FGR32:$src, CPU64Regs:$addr, ZERO_64)>; +let Predicates = [IsN64, HasStandardEncoding] in { + def : MipsPat<(f32 (load_u CPU64Regs:$addr)), + (LUXC1_P8 CPU64Regs:$addr, ZERO_64)>; + def : MipsPat<(store_u FGR32:$src, CPU64Regs:$addr), + (SUXC1_P8 FGR32:$src, CPU64Regs:$addr, ZERO_64)>; } diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td index 841eba0..15a77fb 100644 --- a/lib/Target/Mips/MipsInstrFormats.td +++ b/lib/Target/Mips/MipsInstrFormats.td @@ -70,6 +70,9 @@ class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern, let DecoderNamespace = "Mips"; field bits<32> SoftFail = 0; + + let Predicates = [HasStandardEncoding]; + } // Mips Pseudo Instructions Format diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp index a3a18bf..458e4f7 100644 --- a/lib/Target/Mips/MipsInstrInfo.cpp +++ b/lib/Target/Mips/MipsInstrInfo.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "MipsAnalyzeImmediate.h" #include "MipsInstrInfo.h" #include "MipsTargetMachine.h" #include "MipsMachineFunction.h" @@ -29,6 +30,7 @@ using namespace llvm; MipsInstrInfo::MipsInstrInfo(MipsTargetMachine &tm) : MipsGenInstrInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP), TM(tm), IsN64(TM.getSubtarget<MipsSubtarget>().isABI_N64()), + InMips16Mode(TM.getSubtarget<MipsSubtarget>().inMips16Mode()), RI(*TM.getSubtargetImpl(), *this), UncondBrOpc(TM.getRelocationModel() == Reloc::PIC_ ? Mips::B : Mips::J) {} @@ -106,8 +108,13 @@ copyPhysReg(MachineBasicBlock &MBB, unsigned Opc = 0, ZeroReg = 0; if (Mips::CPURegsRegClass.contains(DestReg)) { // Copy to CPU Reg. - if (Mips::CPURegsRegClass.contains(SrcReg)) - Opc = Mips::ADDu, ZeroReg = Mips::ZERO; + if (Mips::CPURegsRegClass.contains(SrcReg)) { + if (InMips16Mode) + Opc=Mips::Mov32R16; + else { + Opc = Mips::ADDu, ZeroReg = Mips::ZERO; + } + } else if (Mips::CCRRegClass.contains(SrcReg)) Opc = Mips::CFC1; else if (Mips::FGR32RegClass.contains(SrcReg)) @@ -189,15 +196,15 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned Opc = 0; - if (RC == Mips::CPURegsRegisterClass) + if (Mips::CPURegsRegClass.hasSubClassEq(RC)) Opc = IsN64 ? Mips::SW_P8 : Mips::SW; - else if (RC == Mips::CPU64RegsRegisterClass) + else if (Mips::CPU64RegsRegClass.hasSubClassEq(RC)) Opc = IsN64 ? Mips::SD_P8 : Mips::SD; - else if (RC == Mips::FGR32RegisterClass) + else if (Mips::FGR32RegClass.hasSubClassEq(RC)) Opc = IsN64 ? Mips::SWC1_P8 : Mips::SWC1; - else if (RC == Mips::AFGR64RegisterClass) + else if (Mips::AFGR64RegClass.hasSubClassEq(RC)) Opc = Mips::SDC1; - else if (RC == Mips::FGR64RegisterClass) + else if (Mips::FGR64RegClass.hasSubClassEq(RC)) Opc = IsN64 ? 
Mips::SDC164_P8 : Mips::SDC164; assert(Opc && "Register class not handled!"); @@ -216,15 +223,15 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad); unsigned Opc = 0; - if (RC == Mips::CPURegsRegisterClass) + if (Mips::CPURegsRegClass.hasSubClassEq(RC)) Opc = IsN64 ? Mips::LW_P8 : Mips::LW; - else if (RC == Mips::CPU64RegsRegisterClass) + else if (Mips::CPU64RegsRegClass.hasSubClassEq(RC)) Opc = IsN64 ? Mips::LD_P8 : Mips::LD; - else if (RC == Mips::FGR32RegisterClass) + else if (Mips::FGR32RegClass.hasSubClassEq(RC)) Opc = IsN64 ? Mips::LWC1_P8 : Mips::LWC1; - else if (RC == Mips::AFGR64RegisterClass) + else if (Mips::AFGR64RegClass.hasSubClassEq(RC)) Opc = Mips::LDC1; - else if (RC == Mips::FGR64RegisterClass) + else if (Mips::FGR64RegClass.hasSubClassEq(RC)) Opc = IsN64 ? Mips::LDC164_P8 : Mips::LDC164; assert(Opc && "Register class not handled!"); @@ -232,6 +239,76 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addMemOperand(MMO); } +void MipsInstrInfo::ExpandRetRA(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned Opc) const { + BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(Opc)) + .addReg(Mips::RA); +} + +void MipsInstrInfo::ExpandRetRA16(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned Opc) const { + BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(Opc)); +} + +void MipsInstrInfo::ExpandExtractElementF64(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + const TargetInstrInfo *TII = TM.getInstrInfo(); + unsigned DstReg = I->getOperand(0).getReg(); + unsigned SrcReg = I->getOperand(1).getReg(); + unsigned N = I->getOperand(2).getImm(); + const MCInstrDesc& Mfc1Tdd = TII->get(Mips::MFC1); + DebugLoc dl = I->getDebugLoc(); + + assert(N < 2 && "Invalid immediate"); + unsigned SubIdx = N ? 
Mips::sub_fpodd : Mips::sub_fpeven; + unsigned SubReg = TM.getRegisterInfo()->getSubReg(SrcReg, SubIdx); + + BuildMI(MBB, I, dl, Mfc1Tdd, DstReg).addReg(SubReg); +} + +void MipsInstrInfo::ExpandBuildPairF64(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + const TargetInstrInfo *TII = TM.getInstrInfo(); + unsigned DstReg = I->getOperand(0).getReg(); + unsigned LoReg = I->getOperand(1).getReg(), HiReg = I->getOperand(2).getReg(); + const MCInstrDesc& Mtc1Tdd = TII->get(Mips::MTC1); + DebugLoc dl = I->getDebugLoc(); + const TargetRegisterInfo *TRI = TM.getRegisterInfo(); + + // mtc1 Lo, $fp + // mtc1 Hi, $fp + 1 + BuildMI(MBB, I, dl, Mtc1Tdd, TRI->getSubReg(DstReg, Mips::sub_fpeven)) + .addReg(LoReg); + BuildMI(MBB, I, dl, Mtc1Tdd, TRI->getSubReg(DstReg, Mips::sub_fpodd)) + .addReg(HiReg); +} + +bool MipsInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + MachineBasicBlock &MBB = *MI->getParent(); + + switch(MI->getDesc().getOpcode()) { + default: + return false; + case Mips::RetRA: + ExpandRetRA(MBB, MI, Mips::RET); + break; + case Mips::RetRA16: + ExpandRetRA16(MBB, MI, Mips::JrRa16); + break; + case Mips::BuildPairF64: + ExpandBuildPairF64(MBB, MI); + break; + case Mips::ExtractElementF64: + ExpandExtractElementF64(MBB, MI); + break; + } + + MBB.erase(MI); + return true; +} + MachineInstr* MipsInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *MDPtr, @@ -278,9 +355,9 @@ unsigned Mips::GetOppositeBranchOpc(unsigned Opc) } } -static void AnalyzeCondBr(const MachineInstr* Inst, unsigned Opc, +static void AnalyzeCondBr(const MachineInstr *Inst, unsigned Opc, MachineBasicBlock *&BB, - SmallVectorImpl<MachineOperand>& Cond) { + SmallVectorImpl<MachineOperand> &Cond) { assert(GetAnalyzableBrOpc(Opc) && "Not an analyzable branch"); int NumOp = Inst->getNumExplicitOperands(); @@ -454,3 +531,58 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const return false; } +/// Return the number of bytes of code the specified instruction may be. +unsigned MipsInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + return MI->getDesc().getSize(); + case TargetOpcode::INLINEASM: { // Inline Asm: Variable size. + const MachineFunction *MF = MI->getParent()->getParent(); + const char *AsmStr = MI->getOperand(0).getSymbolName(); + return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo()); + } + } +} + +unsigned +llvm::Mips::loadImmediate(int64_t Imm, bool IsN64, const TargetInstrInfo &TII, + MachineBasicBlock& MBB, + MachineBasicBlock::iterator II, DebugLoc DL, + bool LastInstrIsADDiu, + MipsAnalyzeImmediate::Inst *LastInst) { + MipsAnalyzeImmediate AnalyzeImm; + unsigned Size = IsN64 ? 64 : 32; + unsigned LUi = IsN64 ? Mips::LUi64 : Mips::LUi; + unsigned ZEROReg = IsN64 ? Mips::ZERO_64 : Mips::ZERO; + unsigned ATReg = IsN64 ? Mips::AT_64 : Mips::AT; + + const MipsAnalyzeImmediate::InstSeq &Seq = + AnalyzeImm.Analyze(Imm, Size, LastInstrIsADDiu); + MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin(); + + if (LastInst && (Seq.size() == 1)) { + *LastInst = *Inst; + return 0; + } + + // The first instruction can be a LUi, which is different from other + // instructions (ADDiu, ORI and SLL) in that it does not have a register + // operand. 
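To make the asymmetry concrete, here is a minimal sketch (not part of the patch) of the sequence MipsAnalyzeImmediate produces for a simple 32-bit constant; the opcode spellings follow the names used elsewhere in this commit, and the emission code that follows is what consumes the sequence:

    // Sketch: decompose 0x12345678 for a 32-bit target.
    MipsAnalyzeImmediate AnalyzeImm;
    const MipsAnalyzeImmediate::InstSeq &Seq =
        AnalyzeImm.Analyze(0x12345678LL, /*Size=*/32, /*LastInstrIsADDiu=*/false);
    // Expected: Seq[0] == { Mips::LUi, 0x1234 }  -- no register source operand
    //           Seq[1] == { Mips::ORi, 0x5678 }  -- reads the register LUi wrote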
+ if (Inst->Opc == LUi) + BuildMI(MBB, II, DL, TII.get(LUi), ATReg) + .addImm(SignExtend64<16>(Inst->ImmOpnd)); + else + BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ZEROReg) + .addImm(SignExtend64<16>(Inst->ImmOpnd)); + + // Build the remaining instructions in Seq. Skip the last instruction if + // LastInst is not 0. + for (++Inst; Inst != Seq.end() - !!LastInst; ++Inst) + BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ATReg) + .addImm(SignExtend64<16>(Inst->ImmOpnd)); + + if (LastInst) + *LastInst = *Inst; + + return Seq.size() - !!LastInst; +} diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h index 4be727d..358f817 100644 --- a/lib/Target/Mips/MipsInstrInfo.h +++ b/lib/Target/Mips/MipsInstrInfo.h @@ -15,6 +15,7 @@ #define MIPSINSTRUCTIONINFO_H #include "Mips.h" +#include "MipsAnalyzeImmediate.h" #include "MipsRegisterInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetInstrInfo.h" @@ -24,15 +25,9 @@ namespace llvm { -namespace Mips { - /// GetOppositeBranchOpc - Return the inverse of the specified - /// opcode, e.g. turning BEQ to BNE. - unsigned GetOppositeBranchOpc(unsigned Opc); -} - class MipsInstrInfo : public MipsGenInstrInfo { MipsTargetMachine &TM; - bool IsN64; + bool IsN64; bool InMips16Mode; const MipsRegisterInfo RI; unsigned UncondBrOpc; public: @@ -68,8 +63,17 @@ public: virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; private: + void ExpandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned Opc) const; + void ExpandRetRA16(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned Opc) const; + void BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB, DebugLoc DL, const SmallVectorImpl<MachineOperand>& Cond) const; + void ExpandExtractElementF64(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + void ExpandBuildPairF64(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; public: virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, @@ -92,6 +96,8 @@ public: const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const; + virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; + virtual MachineInstr* emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *MDPtr, @@ -103,8 +109,27 @@ public: /// Insert nop instruction when hazard condition is found virtual void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const; + + /// Return the number of bytes of code the specified instruction may be. + unsigned GetInstSizeInBytes(const MachineInstr *MI) const; }; +namespace Mips { + /// GetOppositeBranchOpc - Return the inverse of the specified + /// opcode, e.g. turning BEQ to BNE. + unsigned GetOppositeBranchOpc(unsigned Opc); + + /// Emit a series of instructions to load an immediate. All instructions + /// except for the last one are emitted. The function returns the number of + /// MachineInstrs generated. The opcode-immediate pair of the last + /// instruction is returned in LastInst, if it is not 0. 
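As a usage illustration for the helper declared just below (a hypothetical caller, not from the patch; it assumes TII, MBB, II, DL and Offset are already in scope, as in a frame-lowering pass, and that MipsAnalyzeImmediate::Inst is constructible from an opcode/immediate pair):

    // Materialize a large offset into AT, holding back the final ADDiu so the
    // caller can fold it into its own instruction.
    MipsAnalyzeImmediate::Inst LastInst(0, 0);
    unsigned Num = Mips::loadImmediate(Offset, /*IsN64=*/false, TII, MBB, II, DL,
                                       /*LastInstrIsADDiu=*/true, &LastInst);
    // Num instructions were inserted before II; LastInst now holds the final
    // opcode-immediate pair for the caller to emit itself.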
+ unsigned + loadImmediate(int64_t Imm, bool IsN64, const TargetInstrInfo &TII, + MachineBasicBlock& MBB, MachineBasicBlock::iterator II, + DebugLoc DL, bool LastInstrIsADDiu, + MipsAnalyzeImmediate::Inst *LastInst); +} + } #endif diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index 873d2bd..f1aada4 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -11,17 +11,11 @@ // //===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// Instruction format superclass -//===----------------------------------------------------------------------===// - -include "MipsInstrFormats.td" //===----------------------------------------------------------------------===// // Mips profiles and nodes //===----------------------------------------------------------------------===// -def SDT_MipsRet : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_MipsJmpLink : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>; def SDT_MipsCMov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, @@ -49,6 +43,10 @@ def SDT_Ins : SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>, SDTCisSameAs<2, 3>, SDTCisSameAs<0, 4>]>; +def SDTMipsLoadLR : SDTypeProfile<1, 2, + [SDTCisInt<0>, SDTCisPtrTy<1>, + SDTCisSameAs<0, 2>]>; + // Call def MipsJmpLink : SDNode<"MipsISD::JmpLink",SDT_MipsJmpLink, [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, @@ -72,8 +70,7 @@ def MipsTprelLo : SDNode<"MipsISD::TprelLo", SDTIntUnaryOp>; def MipsThreadPointer: SDNode<"MipsISD::ThreadPointer", SDT_MipsThreadPointer>; // Return -def MipsRet : SDNode<"MipsISD::Ret", SDT_MipsRet, [SDNPHasChain, - SDNPOptInGlue]>; +def MipsRet : SDNode<"MipsISD::Ret", SDTNone, [SDNPHasChain, SDNPOptInGlue]>; // These are target-independent nodes, but have target-specific formats. def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MipsCallSeqStart, @@ -118,6 +115,23 @@ def MipsSync : SDNode<"MipsISD::Sync", SDT_Sync, [SDNPHasChain]>; def MipsExt : SDNode<"MipsISD::Ext", SDT_Ext>; def MipsIns : SDNode<"MipsISD::Ins", SDT_Ins>; +def MipsLWL : SDNode<"MipsISD::LWL", SDTMipsLoadLR, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def MipsLWR : SDNode<"MipsISD::LWR", SDTMipsLoadLR, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def MipsSWL : SDNode<"MipsISD::SWL", SDTStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def MipsSWR : SDNode<"MipsISD::SWR", SDTStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def MipsLDL : SDNode<"MipsISD::LDL", SDTMipsLoadLR, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def MipsLDR : SDNode<"MipsISD::LDR", SDTMipsLoadLR, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def MipsSDL : SDNode<"MipsISD::SDL", SDTStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def MipsSDR : SDNode<"MipsISD::SDR", SDTStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + //===----------------------------------------------------------------------===// // Mips Instruction Predicate Definitions. 
//===----------------------------------------------------------------------===// @@ -145,12 +159,26 @@ def IsN64 : Predicate<"Subtarget.isABI_N64()">, AssemblerPredicate<"FeatureN64">; def NotN64 : Predicate<"!Subtarget.isABI_N64()">, AssemblerPredicate<"!FeatureN64">; +def InMips16Mode : Predicate<"Subtarget.inMips16Mode()">, + AssemblerPredicate<"FeatureMips16">; def RelocStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">, AssemblerPredicate<"FeatureMips32">; def RelocPIC : Predicate<"TM.getRelocationModel() == Reloc::PIC_">, AssemblerPredicate<"FeatureMips32">; def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">, AssemblerPredicate<"FeatureMips32">; +def HasStandardEncoding : Predicate<"Subtarget.hasStandardEncoding()">, + AssemblerPredicate<"!FeatureMips16">; + +class MipsPat<dag pattern, dag result> : Pat<pattern, result> { + let Predicates = [HasStandardEncoding]; +} + +//===----------------------------------------------------------------------===// +// Instruction format superclass +//===----------------------------------------------------------------------===// + +include "MipsInstrFormats.td" //===----------------------------------------------------------------------===// // Mips Operand, Complex Patterns and Transformations Definitions. @@ -190,6 +218,7 @@ def mem : Operand<i32> { def mem64 : Operand<i64> { let PrintMethod = "printMemOperand"; let MIOperandInfo = (ops CPU64Regs, simm16_64); + let EncoderMethod = "getMemEncoding"; } def mem_ea : Operand<i32> { @@ -252,7 +281,8 @@ def immZExt5 : ImmLeaf<i32, [{return Imm == (Imm & 0x1f);}]>; // Mips Address Mode! SDNode frameindex could possibly be a match // since load and store instructions from stack used it. -def addr : ComplexPattern<iPTR, 2, "SelectAddr", [frameindex], [SDNPWantParent]>; +def addr : + ComplexPattern<iPTR, 2, "SelectAddr", [frameindex], [SDNPWantParent]>; //===----------------------------------------------------------------------===// // Pattern fragment for load/store @@ -418,21 +448,13 @@ class StoreM<bits<6> op, string instr_asm, PatFrag OpNode, RegisterClass RC, let isPseudo = Pseudo; } -// Unaligned Memory Load/Store -let canFoldAsLoad = 1 in -class LoadUnAlign<bits<6> op, RegisterClass RC, Operand MemOpnd>: - FMem<op, (outs RC:$rt), (ins MemOpnd:$addr), "", [], IILoad> {} - -class StoreUnAlign<bits<6> op, RegisterClass RC, Operand MemOpnd>: - FMem<op, (outs), (ins RC:$rt, MemOpnd:$addr), "", [], IIStore> {} - // 32-bit load. multiclass LoadM32<bits<6> op, string instr_asm, PatFrag OpNode, bit Pseudo = 0> { def #NAME# : LoadM<op, instr_asm, OpNode, CPURegs, mem, Pseudo>, - Requires<[NotN64]>; + Requires<[NotN64, HasStandardEncoding]>; def _P8 : LoadM<op, instr_asm, OpNode, CPURegs, mem64, Pseudo>, - Requires<[IsN64]> { + Requires<[IsN64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; let isCodeGenOnly = 1; } @@ -442,31 +464,21 @@ multiclass LoadM64<bits<6> op, string instr_asm, PatFrag OpNode, bit Pseudo = 0> { def #NAME# : LoadM<op, instr_asm, OpNode, CPU64Regs, mem, Pseudo>, - Requires<[NotN64]>; + Requires<[NotN64, HasStandardEncoding]>; def _P8 : LoadM<op, instr_asm, OpNode, CPU64Regs, mem64, Pseudo>, - Requires<[IsN64]> { + Requires<[IsN64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; let isCodeGenOnly = 1; } } -// 32-bit load.
-multiclass LoadUnAlign32<bits<6> op> { - def #NAME# : LoadUnAlign<op, CPURegs, mem>, - Requires<[NotN64]>; - def _P8 : LoadUnAlign<op, CPURegs, mem64>, - Requires<[IsN64]> { - let DecoderNamespace = "Mips64"; - let isCodeGenOnly = 1; - } -} // 32-bit store. multiclass StoreM32<bits<6> op, string instr_asm, PatFrag OpNode, bit Pseudo = 0> { def #NAME# : StoreM<op, instr_asm, OpNode, CPURegs, mem, Pseudo>, - Requires<[NotN64]>; + Requires<[NotN64, HasStandardEncoding]>; def _P8 : StoreM<op, instr_asm, OpNode, CPURegs, mem64, Pseudo>, - Requires<[IsN64]> { + Requires<[IsN64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; let isCodeGenOnly = 1; } @@ -476,20 +488,69 @@ multiclass StoreM32<bits<6> op, string instr_asm, PatFrag OpNode, multiclass StoreM64<bits<6> op, string instr_asm, PatFrag OpNode, bit Pseudo = 0> { def #NAME# : StoreM<op, instr_asm, OpNode, CPU64Regs, mem, Pseudo>, - Requires<[NotN64]>; + Requires<[NotN64, HasStandardEncoding]>; def _P8 : StoreM<op, instr_asm, OpNode, CPU64Regs, mem64, Pseudo>, - Requires<[IsN64]> { + Requires<[IsN64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; let isCodeGenOnly = 1; } } -// 32-bit store. -multiclass StoreUnAlign32<bits<6> op> { - def #NAME# : StoreUnAlign<op, CPURegs, mem>, - Requires<[NotN64]>; - def _P8 : StoreUnAlign<op, CPURegs, mem64>, - Requires<[IsN64]> { +// Load/Store Left/Right +let canFoldAsLoad = 1 in +class LoadLeftRight<bits<6> op, string instr_asm, SDNode OpNode, + RegisterClass RC, Operand MemOpnd> : + FMem<op, (outs RC:$rt), (ins MemOpnd:$addr, RC:$src), + !strconcat(instr_asm, "\t$rt, $addr"), + [(set RC:$rt, (OpNode addr:$addr, RC:$src))], IILoad> { + string Constraints = "$src = $rt"; +} + +class StoreLeftRight<bits<6> op, string instr_asm, SDNode OpNode, + RegisterClass RC, Operand MemOpnd>: + FMem<op, (outs), (ins RC:$rt, MemOpnd:$addr), + !strconcat(instr_asm, "\t$rt, $addr"), [(OpNode RC:$rt, addr:$addr)], + IIStore>; + +// 32-bit load left/right. +multiclass LoadLeftRightM32<bits<6> op, string instr_asm, SDNode OpNode> { + def #NAME# : LoadLeftRight<op, instr_asm, OpNode, CPURegs, mem>, + Requires<[NotN64, HasStandardEncoding]>; + def _P8 : LoadLeftRight<op, instr_asm, OpNode, CPURegs, mem64>, + Requires<[IsN64, HasStandardEncoding]> { + let DecoderNamespace = "Mips64"; + let isCodeGenOnly = 1; + } +} + +// 64-bit load left/right. +multiclass LoadLeftRightM64<bits<6> op, string instr_asm, SDNode OpNode> { + def #NAME# : LoadLeftRight<op, instr_asm, OpNode, CPU64Regs, mem>, + Requires<[NotN64, HasStandardEncoding]>; + def _P8 : LoadLeftRight<op, instr_asm, OpNode, CPU64Regs, mem64>, + Requires<[IsN64, HasStandardEncoding]> { + let DecoderNamespace = "Mips64"; + let isCodeGenOnly = 1; + } +} + +// 32-bit store left/right. +multiclass StoreLeftRightM32<bits<6> op, string instr_asm, SDNode OpNode> { + def #NAME# : StoreLeftRight<op, instr_asm, OpNode, CPURegs, mem>, + Requires<[NotN64, HasStandardEncoding]>; + def _P8 : StoreLeftRight<op, instr_asm, OpNode, CPURegs, mem64>, + Requires<[IsN64, HasStandardEncoding]> { + let DecoderNamespace = "Mips64"; + let isCodeGenOnly = 1; + } +} + +// 64-bit store left/right. 
+multiclass StoreLeftRightM64<bits<6> op, string instr_asm, SDNode OpNode> { + def #NAME# : StoreLeftRight<op, instr_asm, OpNode, CPU64Regs, mem>, + Requires<[NotN64, HasStandardEncoding]>; + def _P8 : StoreLeftRight<op, instr_asm, OpNode, CPU64Regs, mem64>, + Requires<[IsN64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; let isCodeGenOnly = 1; } @@ -503,6 +564,7 @@ class CBranch<bits<6> op, string instr_asm, PatFrag cond_op, RegisterClass RC>: let isBranch = 1; let isTerminator = 1; let hasDelaySlot = 1; + let Defs = [AT]; } class CBranchZero<bits<6> op, bits<5> _rt, string instr_asm, PatFrag cond_op, @@ -514,6 +576,7 @@ class CBranchZero<bits<6> op, bits<5> _rt, string instr_asm, PatFrag cond_op, let isBranch = 1; let isTerminator = 1; let hasDelaySlot = 1; + let Defs = [AT]; } // SetCC @@ -541,8 +604,9 @@ class JumpFJ<bits<6> op, string instr_asm>: let isTerminator=1; let isBarrier=1; let hasDelaySlot = 1; - let Predicates = [RelocStatic]; + let Predicates = [RelocStatic, HasStandardEncoding]; let DecoderMethod = "DecodeJumpTarget"; + let Defs = [AT]; } // Unconditional branch @@ -555,23 +619,37 @@ class UncondBranch<bits<6> op, string instr_asm>: let isTerminator = 1; let isBarrier = 1; let hasDelaySlot = 1; - let Predicates = [RelocPIC]; + let Predicates = [RelocPIC, HasStandardEncoding]; + let Defs = [AT]; } -let isBranch=1, isTerminator=1, isBarrier=1, rd=0, hasDelaySlot = 1, - isIndirectBranch = 1 in -class JumpFR<bits<6> op, bits<6> func, string instr_asm, RegisterClass RC>: - FR<op, func, (outs), (ins RC:$rs), - !strconcat(instr_asm, "\t$rs"), [(brind RC:$rs)], IIBranch> { +// Base class for indirect branch and return instruction classes. +let isTerminator=1, isBarrier=1, hasDelaySlot = 1 in +class JumpFR<RegisterClass RC, list<dag> pattern>: + FR<0, 0x8, (outs), (ins RC:$rs), "jr\t$rs", pattern, IIBranch> { let rt = 0; let rd = 0; let shamt = 0; } +// Indirect branch +class IndirectBranch<RegisterClass RC>: JumpFR<RC, [(brind RC:$rs)]> { + let isBranch = 1; + let isIndirectBranch = 1; +} + +// Return instruction +class RetBase<RegisterClass RC>: JumpFR<RC, []> { + let isReturn = 1; + let isCodeGenOnly = 1; + let hasCtrlDep = 1; + let hasExtraSrcRegAllocReq = 1; +} + // Jump and Link (Call) -let isCall=1, hasDelaySlot=1 in { +let isCall=1, hasDelaySlot=1, Defs = [RA] in { class JumpLink<bits<6> op, string instr_asm>: - FJ<op, (outs), (ins calltarget:$target, variable_ops), + FJ<op, (outs), (ins calltarget:$target), !strconcat(instr_asm, "\t$target"), [(MipsJmpLink imm:$target)], IIBranch> { let DecoderMethod = "DecodeJumpTarget"; @@ -579,7 +657,7 @@ let isCall=1, hasDelaySlot=1 in { class JumpLinkReg<bits<6> op, bits<6> func, string instr_asm, RegisterClass RC>: - FR<op, func, (outs), (ins RC:$rs, variable_ops), + FR<op, func, (outs), (ins RC:$rs), !strconcat(instr_asm, "\t$rs"), [(MipsJmpLink RC:$rs)], IIBranch> { let rt = 0; let rd = 31; @@ -587,7 +665,7 @@ let isCall=1, hasDelaySlot=1 in { } class BranchLink<string instr_asm, bits<5> _rt, RegisterClass RC>: - FI<0x1, (outs), (ins RC:$rs, brtarget:$imm16, variable_ops), + FI<0x1, (outs), (ins RC:$rs, brtarget:$imm16), !strconcat(instr_asm, "\t$rs, $imm16"), [], IIBranch> { let rt = _rt; } @@ -653,7 +731,7 @@ class CountLeading0<bits<6> func, string instr_asm, RegisterClass RC>: FR<0x1c, func, (outs RC:$rd), (ins RC:$rs), !strconcat(instr_asm, "\t$rd, $rs"), [(set RC:$rd, (ctlz RC:$rs))], IIAlu>, - Requires<[HasBitCount]> { + Requires<[HasBitCount, HasStandardEncoding]> { let shamt = 0; let rt = rd; } @@ -662,7 +740,7 
@@ class CountLeading1<bits<6> func, string instr_asm, RegisterClass RC>: FR<0x1c, func, (outs RC:$rd), (ins RC:$rs), !strconcat(instr_asm, "\t$rd, $rs"), [(set RC:$rd, (ctlz (not RC:$rs)))], IIAlu>, - Requires<[HasBitCount]> { + Requires<[HasBitCount, HasStandardEncoding]> { let shamt = 0; let rt = rd; } @@ -675,7 +753,7 @@ class SignExtInReg<bits<5> sa, string instr_asm, ValueType vt, [(set RC:$rd, (sext_inreg RC:$rt, vt))], NoItinerary> { let rs = 0; let shamt = sa; - let Predicates = [HasSEInReg]; + let Predicates = [HasSEInReg, HasStandardEncoding]; } // Subword Swap @@ -684,7 +762,7 @@ class SubwordSwap<bits<6> func, bits<5> sa, string instr_asm, RegisterClass RC>: !strconcat(instr_asm, "\t$rd, $rt"), [], NoItinerary> { let rs = 0; let shamt = sa; - let Predicates = [HasSwap]; + let Predicates = [HasSwap, HasStandardEncoding]; let neverHasSideEffects = 1; } @@ -705,7 +783,7 @@ class ExtBase<bits<6> _funct, string instr_asm, RegisterClass RC>: bits<5> sz; let rd = sz; let shamt = pos; - let Predicates = [HasMips32r2]; + let Predicates = [HasMips32r2, HasStandardEncoding]; } class InsBase<bits<6> _funct, string instr_asm, RegisterClass RC>: @@ -718,7 +796,7 @@ class InsBase<bits<6> _funct, string instr_asm, RegisterClass RC>: bits<5> sz; let rd = sz; let shamt = pos; - let Predicates = [HasMips32r2]; + let Predicates = [HasMips32r2, HasStandardEncoding]; let Constraints = "$src = $rt"; } @@ -730,8 +808,10 @@ class Atomic2Ops<PatFrag Op, string Opstr, RegisterClass DRC, [(set DRC:$dst, (Op PRC:$ptr, DRC:$incr))]>; multiclass Atomic2Ops32<PatFrag Op, string Opstr> { - def #NAME# : Atomic2Ops<Op, Opstr, CPURegs, CPURegs>, Requires<[NotN64]>; - def _P8 : Atomic2Ops<Op, Opstr, CPURegs, CPU64Regs>, Requires<[IsN64]> { + def #NAME# : Atomic2Ops<Op, Opstr, CPURegs, CPURegs>, + Requires<[NotN64, HasStandardEncoding]>; + def _P8 : Atomic2Ops<Op, Opstr, CPURegs, CPU64Regs>, + Requires<[IsN64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } } @@ -744,8 +824,10 @@ class AtomicCmpSwap<PatFrag Op, string Width, RegisterClass DRC, [(set DRC:$dst, (Op PRC:$ptr, DRC:$cmp, DRC:$swap))]>; multiclass AtomicCmpSwap32<PatFrag Op, string Width> { - def #NAME# : AtomicCmpSwap<Op, Width, CPURegs, CPURegs>, Requires<[NotN64]>; - def _P8 : AtomicCmpSwap<Op, Width, CPURegs, CPU64Regs>, Requires<[IsN64]> { + def #NAME# : AtomicCmpSwap<Op, Width, CPURegs, CPURegs>, + Requires<[NotN64, HasStandardEncoding]>; + def _P8 : AtomicCmpSwap<Op, Width, CPURegs, CPU64Regs>, + Requires<[IsN64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } } @@ -767,6 +849,10 @@ class SCBase<bits<6> Opc, string opstring, RegisterClass RC, Operand Mem> : // Pseudo instructions //===----------------------------------------------------------------------===// +// Return RA. +let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, hasCtrlDep=1 in +def RetRA : MipsPseudo<(outs), (ins), "", [(MipsRet)]>; + // As stack alignment is always done with addiu, we need a 16-bit immediate let Defs = [SP], Uses = [SP] in { def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins uimm16:$amt), @@ -785,29 +871,6 @@ let neverHasSideEffects = 1 in def CPRESTORE : MipsPseudo<(outs), (ins i32imm:$loc, CPURegs:$gp), ".cprestore\t$loc", []>; -// For O32 ABI & PIC & non-fixed global base register, the following instruction -// sequence is emitted to set the global base register: -// -// 0. lui $2, %hi(_gp_disp) -// 1. addiu $2, $2, %lo(_gp_disp) -// 2.
addu $globalbasereg, $2, $t9 -// -// SETGP01 is emitted during Prologue/Epilogue insertion and then converted to -// instructions 0 and 1 in the sequence above during MC lowering. -// SETGP2 is emitted just before register allocation and converted to -// instruction 2 just prior to post-RA scheduling. -// -// These pseudo instructions are needed to ensure no instructions are inserted -// before or between instructions 0 and 1, which is a limitation imposed by -// GNU linker. - -let isTerminator = 1, isBarrier = 1 in -def SETGP01 : MipsPseudo<(outs CPURegs:$dst), (ins), "", []>; - -let neverHasSideEffects = 1 in -def SETGP2 : MipsPseudo<(outs CPURegs:$globalreg), (ins CPURegs:$picreg), "", - []>; - let usesCustomInserter = 1 in { defm ATOMIC_LOAD_ADD_I8 : Atomic2Ops32<atomic_load_add_8, "load_add_8">; defm ATOMIC_LOAD_ADD_I16 : Atomic2Ops32<atomic_load_add_16, "load_add_16">; @@ -876,7 +939,7 @@ def SRLV : shift_rotate_reg<0x06, 0x00, "srlv", srl, CPURegs>; def SRAV : shift_rotate_reg<0x07, 0x00, "srav", sra, CPURegs>; // Rotate Instructions -let Predicates = [HasMips32r2] in { +let Predicates = [HasMips32r2, HasStandardEncoding] in { def ROTR : shift_rotate_imm32<0x02, 0x01, "rotr", rotr>; def ROTRV : shift_rotate_reg<0x06, 0x01, "rotrv", rotr, CPURegs>; } @@ -899,11 +962,11 @@ defm ULW : LoadM32<0x23, "ulw", load_u, 1>; defm USH : StoreM32<0x29, "ush", truncstorei16_u, 1>; defm USW : StoreM32<0x2b, "usw", store_u, 1>; -/// Primitives for unaligned -defm LWL : LoadUnAlign32<0x22>; -defm LWR : LoadUnAlign32<0x26>; -defm SWL : StoreUnAlign32<0x2A>; -defm SWR : StoreUnAlign32<0x2E>; +/// load/store left/right +defm LWL : LoadLeftRightM32<0x22, "lwl", MipsLWL>; +defm LWR : LoadLeftRightM32<0x26, "lwr", MipsLWR>; +defm SWL : StoreLeftRightM32<0x2a, "swl", MipsSWL>; +defm SWR : StoreLeftRightM32<0x2e, "swr", MipsSWR>; let hasSideEffects = 1 in def SYNC : MipsInst<(outs), (ins i32imm:$stype), "sync $stype", @@ -917,19 +980,23 @@ def SYNC : MipsInst<(outs), (ins i32imm:$stype), "sync $stype", } /// Load-linked, Store-conditional -def LL : LLBase<0x30, "ll", CPURegs, mem>, Requires<[NotN64]>; -def LL_P8 : LLBase<0x30, "ll", CPURegs, mem64>, Requires<[IsN64]> { +def LL : LLBase<0x30, "ll", CPURegs, mem>, + Requires<[NotN64, HasStandardEncoding]>; +def LL_P8 : LLBase<0x30, "ll", CPURegs, mem64>, + Requires<[IsN64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } -def SC : SCBase<0x38, "sc", CPURegs, mem>, Requires<[NotN64]>; -def SC_P8 : SCBase<0x38, "sc", CPURegs, mem64>, Requires<[IsN64]> { +def SC : SCBase<0x38, "sc", CPURegs, mem>, + Requires<[NotN64, HasStandardEncoding]>; +def SC_P8 : SCBase<0x38, "sc", CPURegs, mem64>, + Requires<[IsN64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } /// Jump and Branch Instructions def J : JumpFJ<0x02, "j">; -def JR : JumpFR<0x00, 0x08, "jr", CPURegs>; +def JR : IndirectBranch<CPURegs>; def B : UncondBranch<0x04, "b">; def BEQ : CBranch<0x04, "beq", seteq, CPURegs>; def BNE : CBranch<0x05, "bne", setne, CPURegs>; @@ -938,15 +1005,16 @@ def BGTZ : CBranchZero<0x07, 0, "bgtz", setgt, CPURegs>; def BLEZ : CBranchZero<0x06, 0, "blez", setle, CPURegs>; def BLTZ : CBranchZero<0x01, 0, "bltz", setlt, CPURegs>; +let rt = 0, rs = 0, isBranch = 1, isTerminator = 1, isBarrier = 1, + hasDelaySlot = 1, Defs = [RA] in +def BAL_BR: FI<0x1, (outs), (ins brtarget:$imm16), "bal\t$imm16", [], IIBranch>; + def JAL : JumpLink<0x03, "jal">; def JALR : JumpLinkReg<0x00, 0x09, "jalr", CPURegs>; def BGEZAL : BranchLink<"bgezal", 0x11, CPURegs>; def BLTZAL : 
BranchLink<"bltzal", 0x10, CPURegs>; -let isReturn=1, isTerminator=1, hasDelaySlot=1, isCodeGenOnly=1, - isBarrier=1, hasCtrlDep=1, rd=0, rt=0, shamt=0 in - def RET : FR <0x00, 0x08, (outs), (ins CPURegs:$target), - "jr\t$target", [(MipsRet CPURegs:$target)], IIBranch>; +def RET : RetBase<CPURegs>; /// Multiply and Divide Instructions. def MULT : Mult32<0x18, "mult", IIImul>; @@ -999,7 +1067,7 @@ def MSUBU : MArithR<5, "msubu", MipsMSubu>; // MUL is a assembly macro in the current used ISAs. In recent ISA's // it is a real instruction. def MUL : ArithLogicR<0x1c, 0x02, "mul", mul, IIImul, CPURegs, 1>, - Requires<[HasMips32]>; + Requires<[HasMips32, HasStandardEncoding]>; def RDHWR : ReadHardware<CPURegs, HWRegs>; @@ -1011,67 +1079,67 @@ def INS : InsBase<4, "ins", CPURegs>; //===----------------------------------------------------------------------===// // Small immediates -def : Pat<(i32 immSExt16:$in), - (ADDiu ZERO, imm:$in)>; -def : Pat<(i32 immZExt16:$in), - (ORi ZERO, imm:$in)>; -def : Pat<(i32 immLow16Zero:$in), - (LUi (HI16 imm:$in))>; +def : MipsPat<(i32 immSExt16:$in), + (ADDiu ZERO, imm:$in)>; +def : MipsPat<(i32 immZExt16:$in), + (ORi ZERO, imm:$in)>; +def : MipsPat<(i32 immLow16Zero:$in), + (LUi (HI16 imm:$in))>; // Arbitrary immediates -def : Pat<(i32 imm:$imm), +def : MipsPat<(i32 imm:$imm), (ORi (LUi (HI16 imm:$imm)), (LO16 imm:$imm))>; -// Carry patterns -def : Pat<(subc CPURegs:$lhs, CPURegs:$rhs), - (SUBu CPURegs:$lhs, CPURegs:$rhs)>; -def : Pat<(addc CPURegs:$lhs, CPURegs:$rhs), - (ADDu CPURegs:$lhs, CPURegs:$rhs)>; -def : Pat<(addc CPURegs:$src, immSExt16:$imm), - (ADDiu CPURegs:$src, imm:$imm)>; +// Carry MipsPatterns +def : MipsPat<(subc CPURegs:$lhs, CPURegs:$rhs), + (SUBu CPURegs:$lhs, CPURegs:$rhs)>; +def : MipsPat<(addc CPURegs:$lhs, CPURegs:$rhs), + (ADDu CPURegs:$lhs, CPURegs:$rhs)>; +def : MipsPat<(addc CPURegs:$src, immSExt16:$imm), + (ADDiu CPURegs:$src, imm:$imm)>; // Call -def : Pat<(MipsJmpLink (i32 tglobaladdr:$dst)), - (JAL tglobaladdr:$dst)>; -def : Pat<(MipsJmpLink (i32 texternalsym:$dst)), - (JAL texternalsym:$dst)>; -//def : Pat<(MipsJmpLink CPURegs:$dst), -// (JALR CPURegs:$dst)>; +def : MipsPat<(MipsJmpLink (i32 tglobaladdr:$dst)), + (JAL tglobaladdr:$dst)>; +def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)), + (JAL texternalsym:$dst)>; +//def : MipsPat<(MipsJmpLink CPURegs:$dst), +// (JALR CPURegs:$dst)>; // hi/lo relocs -def : Pat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>; -def : Pat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>; -def : Pat<(MipsHi tjumptable:$in), (LUi tjumptable:$in)>; -def : Pat<(MipsHi tconstpool:$in), (LUi tconstpool:$in)>; -def : Pat<(MipsHi tglobaltlsaddr:$in), (LUi tglobaltlsaddr:$in)>; - -def : Pat<(MipsLo tglobaladdr:$in), (ADDiu ZERO, tglobaladdr:$in)>; -def : Pat<(MipsLo tblockaddress:$in), (ADDiu ZERO, tblockaddress:$in)>; -def : Pat<(MipsLo tjumptable:$in), (ADDiu ZERO, tjumptable:$in)>; -def : Pat<(MipsLo tconstpool:$in), (ADDiu ZERO, tconstpool:$in)>; -def : Pat<(MipsLo tglobaltlsaddr:$in), (ADDiu ZERO, tglobaltlsaddr:$in)>; - -def : Pat<(add CPURegs:$hi, (MipsLo tglobaladdr:$lo)), - (ADDiu CPURegs:$hi, tglobaladdr:$lo)>; -def : Pat<(add CPURegs:$hi, (MipsLo tblockaddress:$lo)), - (ADDiu CPURegs:$hi, tblockaddress:$lo)>; -def : Pat<(add CPURegs:$hi, (MipsLo tjumptable:$lo)), - (ADDiu CPURegs:$hi, tjumptable:$lo)>; -def : Pat<(add CPURegs:$hi, (MipsLo tconstpool:$lo)), - (ADDiu CPURegs:$hi, tconstpool:$lo)>; -def : Pat<(add CPURegs:$hi, (MipsLo tglobaltlsaddr:$lo)), - (ADDiu CPURegs:$hi, 
tglobaltlsaddr:$lo)>; +def : MipsPat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>; +def : MipsPat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>; +def : MipsPat<(MipsHi tjumptable:$in), (LUi tjumptable:$in)>; +def : MipsPat<(MipsHi tconstpool:$in), (LUi tconstpool:$in)>; +def : MipsPat<(MipsHi tglobaltlsaddr:$in), (LUi tglobaltlsaddr:$in)>; + +def : MipsPat<(MipsLo tglobaladdr:$in), (ADDiu ZERO, tglobaladdr:$in)>; +def : MipsPat<(MipsLo tblockaddress:$in), (ADDiu ZERO, tblockaddress:$in)>; +def : MipsPat<(MipsLo tjumptable:$in), (ADDiu ZERO, tjumptable:$in)>; +def : MipsPat<(MipsLo tconstpool:$in), (ADDiu ZERO, tconstpool:$in)>; +def : MipsPat<(MipsLo tglobaltlsaddr:$in), (ADDiu ZERO, tglobaltlsaddr:$in)>; + +def : MipsPat<(add CPURegs:$hi, (MipsLo tglobaladdr:$lo)), + (ADDiu CPURegs:$hi, tglobaladdr:$lo)>; +def : MipsPat<(add CPURegs:$hi, (MipsLo tblockaddress:$lo)), + (ADDiu CPURegs:$hi, tblockaddress:$lo)>; +def : MipsPat<(add CPURegs:$hi, (MipsLo tjumptable:$lo)), + (ADDiu CPURegs:$hi, tjumptable:$lo)>; +def : MipsPat<(add CPURegs:$hi, (MipsLo tconstpool:$lo)), + (ADDiu CPURegs:$hi, tconstpool:$lo)>; +def : MipsPat<(add CPURegs:$hi, (MipsLo tglobaltlsaddr:$lo)), + (ADDiu CPURegs:$hi, tglobaltlsaddr:$lo)>; // gp_rel relocs -def : Pat<(add CPURegs:$gp, (MipsGPRel tglobaladdr:$in)), - (ADDiu CPURegs:$gp, tglobaladdr:$in)>; -def : Pat<(add CPURegs:$gp, (MipsGPRel tconstpool:$in)), - (ADDiu CPURegs:$gp, tconstpool:$in)>; +def : MipsPat<(add CPURegs:$gp, (MipsGPRel tglobaladdr:$in)), + (ADDiu CPURegs:$gp, tglobaladdr:$in)>; +def : MipsPat<(add CPURegs:$gp, (MipsGPRel tconstpool:$in)), + (ADDiu CPURegs:$gp, tconstpool:$in)>; // wrapper_pic class WrapperPat<SDNode node, Instruction ADDiuOp, RegisterClass RC>: - Pat<(MipsWrapper RC:$gp, node:$in), - (ADDiuOp RC:$gp, node:$in)>; + MipsPat<(MipsWrapper RC:$gp, node:$in), + (ADDiuOp RC:$gp, node:$in)>; def : WrapperPat<tglobaladdr, ADDiu, CPURegs>; def : WrapperPat<tconstpool, ADDiu, CPURegs>; @@ -1081,58 +1149,58 @@ def : WrapperPat<tjumptable, ADDiu, CPURegs>; def : WrapperPat<tglobaltlsaddr, ADDiu, CPURegs>; // Mips does not have "not", so we expand our way -def : Pat<(not CPURegs:$in), - (NOR CPURegs:$in, ZERO)>; +def : MipsPat<(not CPURegs:$in), + (NOR CPURegs:$in, ZERO)>; // extended loads -let Predicates = [NotN64] in { - def : Pat<(i32 (extloadi1 addr:$src)), (LBu addr:$src)>; - def : Pat<(i32 (extloadi8 addr:$src)), (LBu addr:$src)>; - def : Pat<(i32 (extloadi16_a addr:$src)), (LHu addr:$src)>; - def : Pat<(i32 (extloadi16_u addr:$src)), (ULHu addr:$src)>; +let Predicates = [NotN64, HasStandardEncoding] in { + def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu addr:$src)>; + def : MipsPat<(i32 (extloadi8 addr:$src)), (LBu addr:$src)>; + def : MipsPat<(i32 (extloadi16_a addr:$src)), (LHu addr:$src)>; + def : MipsPat<(i32 (extloadi16_u addr:$src)), (ULHu addr:$src)>; } -let Predicates = [IsN64] in { - def : Pat<(i32 (extloadi1 addr:$src)), (LBu_P8 addr:$src)>; - def : Pat<(i32 (extloadi8 addr:$src)), (LBu_P8 addr:$src)>; - def : Pat<(i32 (extloadi16_a addr:$src)), (LHu_P8 addr:$src)>; - def : Pat<(i32 (extloadi16_u addr:$src)), (ULHu_P8 addr:$src)>; +let Predicates = [IsN64, HasStandardEncoding] in { + def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu_P8 addr:$src)>; + def : MipsPat<(i32 (extloadi8 addr:$src)), (LBu_P8 addr:$src)>; + def : MipsPat<(i32 (extloadi16_a addr:$src)), (LHu_P8 addr:$src)>; + def : MipsPat<(i32 (extloadi16_u addr:$src)), (ULHu_P8 addr:$src)>; } // peepholes -let Predicates = [NotN64] in { - def : Pat<(store_a 
(i32 0), addr:$dst), (SW ZERO, addr:$dst)>; - def : Pat<(store_u (i32 0), addr:$dst), (USW ZERO, addr:$dst)>; +let Predicates = [NotN64, HasStandardEncoding] in { + def : MipsPat<(store_a (i32 0), addr:$dst), (SW ZERO, addr:$dst)>; + def : MipsPat<(store_u (i32 0), addr:$dst), (USW ZERO, addr:$dst)>; } -let Predicates = [IsN64] in { - def : Pat<(store_a (i32 0), addr:$dst), (SW_P8 ZERO, addr:$dst)>; - def : Pat<(store_u (i32 0), addr:$dst), (USW_P8 ZERO, addr:$dst)>; +let Predicates = [IsN64, HasStandardEncoding] in { + def : MipsPat<(store_a (i32 0), addr:$dst), (SW_P8 ZERO, addr:$dst)>; + def : MipsPat<(store_u (i32 0), addr:$dst), (USW_P8 ZERO, addr:$dst)>; } // brcond patterns multiclass BrcondPats<RegisterClass RC, Instruction BEQOp, Instruction BNEOp, Instruction SLTOp, Instruction SLTuOp, Instruction SLTiOp, Instruction SLTiuOp, Register ZEROReg> { -def : Pat<(brcond (i32 (setne RC:$lhs, 0)), bb:$dst), - (BNEOp RC:$lhs, ZEROReg, bb:$dst)>; -def : Pat<(brcond (i32 (seteq RC:$lhs, 0)), bb:$dst), - (BEQOp RC:$lhs, ZEROReg, bb:$dst)>; +def : MipsPat<(brcond (i32 (setne RC:$lhs, 0)), bb:$dst), + (BNEOp RC:$lhs, ZEROReg, bb:$dst)>; +def : MipsPat<(brcond (i32 (seteq RC:$lhs, 0)), bb:$dst), + (BEQOp RC:$lhs, ZEROReg, bb:$dst)>; -def : Pat<(brcond (i32 (setge RC:$lhs, RC:$rhs)), bb:$dst), - (BEQ (SLTOp RC:$lhs, RC:$rhs), ZERO, bb:$dst)>; -def : Pat<(brcond (i32 (setuge RC:$lhs, RC:$rhs)), bb:$dst), - (BEQ (SLTuOp RC:$lhs, RC:$rhs), ZERO, bb:$dst)>; -def : Pat<(brcond (i32 (setge RC:$lhs, immSExt16:$rhs)), bb:$dst), - (BEQ (SLTiOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>; -def : Pat<(brcond (i32 (setuge RC:$lhs, immSExt16:$rhs)), bb:$dst), - (BEQ (SLTiuOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>; +def : MipsPat<(brcond (i32 (setge RC:$lhs, RC:$rhs)), bb:$dst), + (BEQ (SLTOp RC:$lhs, RC:$rhs), ZERO, bb:$dst)>; +def : MipsPat<(brcond (i32 (setuge RC:$lhs, RC:$rhs)), bb:$dst), + (BEQ (SLTuOp RC:$lhs, RC:$rhs), ZERO, bb:$dst)>; +def : MipsPat<(brcond (i32 (setge RC:$lhs, immSExt16:$rhs)), bb:$dst), + (BEQ (SLTiOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>; +def : MipsPat<(brcond (i32 (setuge RC:$lhs, immSExt16:$rhs)), bb:$dst), + (BEQ (SLTiuOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>; -def : Pat<(brcond (i32 (setle RC:$lhs, RC:$rhs)), bb:$dst), - (BEQ (SLTOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>; -def : Pat<(brcond (i32 (setule RC:$lhs, RC:$rhs)), bb:$dst), - (BEQ (SLTuOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>; +def : MipsPat<(brcond (i32 (setle RC:$lhs, RC:$rhs)), bb:$dst), + (BEQ (SLTOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>; +def : MipsPat<(brcond (i32 (setule RC:$lhs, RC:$rhs)), bb:$dst), + (BEQ (SLTuOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>; -def : Pat<(brcond RC:$cond, bb:$dst), - (BNEOp RC:$cond, ZEROReg, bb:$dst)>; +def : MipsPat<(brcond RC:$cond, bb:$dst), + (BNEOp RC:$cond, ZEROReg, bb:$dst)>; } defm : BrcondPats<CPURegs, BEQ, BNE, SLT, SLTu, SLTi, SLTiu, ZERO>; @@ -1140,39 +1208,39 @@ defm : BrcondPats<CPURegs, BEQ, BNE, SLT, SLTu, SLTi, SLTiu, ZERO>; // setcc patterns multiclass SeteqPats<RegisterClass RC, Instruction SLTiuOp, Instruction XOROp, Instruction SLTuOp, Register ZEROReg> { - def : Pat<(seteq RC:$lhs, RC:$rhs), - (SLTiuOp (XOROp RC:$lhs, RC:$rhs), 1)>; - def : Pat<(setne RC:$lhs, RC:$rhs), - (SLTuOp ZEROReg, (XOROp RC:$lhs, RC:$rhs))>; + def : MipsPat<(seteq RC:$lhs, RC:$rhs), + (SLTiuOp (XOROp RC:$lhs, RC:$rhs), 1)>; + def : MipsPat<(setne RC:$lhs, RC:$rhs), + (SLTuOp ZEROReg, (XOROp RC:$lhs, RC:$rhs))>; } multiclass SetlePats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> { 
- def : Pat<(setle RC:$lhs, RC:$rhs), - (XORi (SLTOp RC:$rhs, RC:$lhs), 1)>; - def : Pat<(setule RC:$lhs, RC:$rhs), - (XORi (SLTuOp RC:$rhs, RC:$lhs), 1)>; + def : MipsPat<(setle RC:$lhs, RC:$rhs), + (XORi (SLTOp RC:$rhs, RC:$lhs), 1)>; + def : MipsPat<(setule RC:$lhs, RC:$rhs), + (XORi (SLTuOp RC:$rhs, RC:$lhs), 1)>; } multiclass SetgtPats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> { - def : Pat<(setgt RC:$lhs, RC:$rhs), - (SLTOp RC:$rhs, RC:$lhs)>; - def : Pat<(setugt RC:$lhs, RC:$rhs), - (SLTuOp RC:$rhs, RC:$lhs)>; + def : MipsPat<(setgt RC:$lhs, RC:$rhs), + (SLTOp RC:$rhs, RC:$lhs)>; + def : MipsPat<(setugt RC:$lhs, RC:$rhs), + (SLTuOp RC:$rhs, RC:$lhs)>; } multiclass SetgePats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> { - def : Pat<(setge RC:$lhs, RC:$rhs), - (XORi (SLTOp RC:$lhs, RC:$rhs), 1)>; - def : Pat<(setuge RC:$lhs, RC:$rhs), - (XORi (SLTuOp RC:$lhs, RC:$rhs), 1)>; + def : MipsPat<(setge RC:$lhs, RC:$rhs), + (XORi (SLTOp RC:$lhs, RC:$rhs), 1)>; + def : MipsPat<(setuge RC:$lhs, RC:$rhs), + (XORi (SLTuOp RC:$lhs, RC:$rhs), 1)>; } multiclass SetgeImmPats<RegisterClass RC, Instruction SLTiOp, Instruction SLTiuOp> { - def : Pat<(setge RC:$lhs, immSExt16:$rhs), - (XORi (SLTiOp RC:$lhs, immSExt16:$rhs), 1)>; - def : Pat<(setuge RC:$lhs, immSExt16:$rhs), - (XORi (SLTiuOp RC:$lhs, immSExt16:$rhs), 1)>; + def : MipsPat<(setge RC:$lhs, immSExt16:$rhs), + (XORi (SLTiOp RC:$lhs, immSExt16:$rhs), 1)>; + def : MipsPat<(setuge RC:$lhs, immSExt16:$rhs), + (XORi (SLTiuOp RC:$lhs, immSExt16:$rhs), 1)>; } defm : SeteqPats<CPURegs, SLTiu, XOR, SLTu, ZERO>; @@ -1182,10 +1250,10 @@ defm : SetgePats<CPURegs, SLT, SLTu>; defm : SetgeImmPats<CPURegs, SLTi, SLTiu>; // select MipsDynAlloc -def : Pat<(MipsDynAlloc addr:$f), (DynAlloc addr:$f)>; +def : MipsPat<(MipsDynAlloc addr:$f), (DynAlloc addr:$f)>; // bswap pattern -def : Pat<(bswap CPURegs:$rt), (ROTR (WSBH CPURegs:$rt), 16)>; +def : MipsPat<(bswap CPURegs:$rt), (ROTR (WSBH CPURegs:$rt), 16)>; //===----------------------------------------------------------------------===// // Floating Point Support @@ -1195,3 +1263,8 @@ include "MipsInstrFPU.td" include "Mips64InstrInfo.td" include "MipsCondMov.td" +// +// Mips16 + +include "Mips16InstrFormats.td" +include "Mips16InstrInfo.td" diff --git a/lib/Target/Mips/MipsJITInfo.cpp b/lib/Target/Mips/MipsJITInfo.cpp index 76ca3e1..150bdbb 100644 --- a/lib/Target/Mips/MipsJITInfo.cpp +++ b/lib/Target/Mips/MipsJITInfo.cpp @@ -154,8 +154,8 @@ TargetJITInfo::StubLayout MipsJITInfo::getStubLayout() { return Result; } -void *MipsJITInfo::emitFunctionStub(const Function* F, void *Fn, - JITCodeEmitter &JCE) { +void *MipsJITInfo::emitFunctionStub(const Function *F, void *Fn, + JITCodeEmitter &JCE) { JCE.emitAlignment(4); void *Addr = (void*) (JCE.getCurrentPCValue()); if (!sys::Memory::setRangeWritable(Addr, 16)) @@ -193,7 +193,7 @@ void *MipsJITInfo::emitFunctionStub(const Function* F, void *Fn, /// it must rewrite the code to contain the actual addresses of any /// referenced global symbols. 
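Before the definition that follows, a hedged sketch (not from the patch) of the %hi/%lo fix-up a MIPS relocation typically applies once the target address is resolved; the rounding constant matches the math in the LowerCPRESTORE code this commit removes further down:

    // Patch a lui/addiu pair against a resolved address. The +0x8000 makes
    // %hi absorb the carry from the sign-extended 16-bit %lo.
    intptr_t ResultPtr = /* resolved symbol address */ 0;
    unsigned Hi = ((ResultPtr + 0x8000) >> 16) & 0xffff; // immediate for lui
    unsigned Lo = ResultPtr & 0xffff;                    // immediate for addiu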
void MipsJITInfo::relocate(void *Function, MachineRelocation *MR, - unsigned NumRelocs, unsigned char* GOTBase) { + unsigned NumRelocs, unsigned char *GOTBase) { for (unsigned i = 0; i != NumRelocs; ++i, ++MR) { void *RelocPos = (char*) Function + MR->getMachineCodeOffset(); diff --git a/lib/Target/Mips/MipsJITInfo.h b/lib/Target/Mips/MipsJITInfo.h index f4c4ae8..637a318 100644 --- a/lib/Target/Mips/MipsJITInfo.h +++ b/lib/Target/Mips/MipsJITInfo.h @@ -45,8 +45,8 @@ class MipsJITInfo : public TargetJITInfo { /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a /// small native function that simply calls the function at the specified /// address. - virtual void *emitFunctionStub(const Function* F, void *Fn, - JITCodeEmitter &JCE); + virtual void *emitFunctionStub(const Function *F, void *Fn, + JITCodeEmitter &JCE); /// getLazyResolverFunction - Expose the lazy resolver to the JIT. virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn); @@ -55,7 +55,7 @@ class MipsJITInfo : public TargetJITInfo { /// it must rewrite the code to contain the actual addresses of any /// referenced global symbols. virtual void relocate(void *Function, MachineRelocation *MR, - unsigned NumRelocs, unsigned char* GOTBase); + unsigned NumRelocs, unsigned char *GOTBase); /// Initialize - Initialize internal stage for the function being JITted. void Initialize(const MachineFunction &MF, bool isPIC) { diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp new file mode 100644 index 0000000..70ecbc1 --- /dev/null +++ b/lib/Target/Mips/MipsLongBranch.cpp @@ -0,0 +1,419 @@ +//===-- MipsLongBranch.cpp - Emit long branches ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass expands a branch or jump instruction into a long branch if its +// offset is too large to fit into its immediate field. 
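In concrete terms (a sketch, not from the patch): MIPS branches encode a signed 16-bit offset counted in 4-byte instruction slots, so the reachable window is roughly +/-128 KiB, and the test below is the one runOnMachineFunction applies near the end of this file:

    // The range test that decides whether a branch must be expanded.
    int64_t Offset = /* byte distance to the target, see computeOffset */ 0;
    bool Fits = isInt<16>(Offset / 4);
    // When !Fits (or -force-mips-long-branch is given), the long form is used.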
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mips-long-branch" + +#include "Mips.h" +#include "MipsTargetMachine.h" +#include "MCTargetDesc/MipsBaseInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Function.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" + +using namespace llvm; + +STATISTIC(LongBranches, "Number of long branches."); + +static cl::opt<bool> SkipLongBranch( + "skip-mips-long-branch", + cl::init(false), + cl::desc("MIPS: Skip long branch pass."), + cl::Hidden); + +static cl::opt<bool> ForceLongBranch( + "force-mips-long-branch", + cl::init(false), + cl::desc("MIPS: Expand all branches to long format."), + cl::Hidden); + +namespace { + typedef MachineBasicBlock::iterator Iter; + typedef MachineBasicBlock::reverse_iterator ReverseIter; + + struct MBBInfo { + uint64_t Size; + bool HasLongBranch; + MachineInstr *Br; + + MBBInfo() : Size(0), HasLongBranch(false), Br(0) {} + }; + + class MipsLongBranch : public MachineFunctionPass { + + public: + static char ID; + MipsLongBranch(TargetMachine &tm) + : MachineFunctionPass(ID), TM(tm), + TII(static_cast<const MipsInstrInfo*>(tm.getInstrInfo())) {} + + virtual const char *getPassName() const { + return "Mips Long Branch"; + } + + bool runOnMachineFunction(MachineFunction &F); + + private: + void splitMBB(MachineBasicBlock *MBB); + void initMBBInfo(); + int64_t computeOffset(const MachineInstr *Br); + void replaceBranch(MachineBasicBlock &MBB, Iter Br, DebugLoc DL, + MachineBasicBlock *MBBOpnd); + void expandToLongBranch(MBBInfo &Info); + + const TargetMachine &TM; + const MipsInstrInfo *TII; + MachineFunction *MF; + SmallVector<MBBInfo, 16> MBBInfos; + }; + + char MipsLongBranch::ID = 0; +} // end of anonymous namespace + +/// createMipsLongBranchPass - Returns a pass that converts branches to long +/// branches. +FunctionPass *llvm::createMipsLongBranchPass(MipsTargetMachine &tm) { + return new MipsLongBranch(tm); +} + +/// Iterate over list of Br's operands and search for a MachineBasicBlock +/// operand. +static MachineBasicBlock *getTargetMBB(const MachineInstr &Br) { + for (unsigned I = 0, E = Br.getDesc().getNumOperands(); I < E; ++I) { + const MachineOperand &MO = Br.getOperand(I); + + if (MO.isMBB()) + return MO.getMBB(); + } + + assert(false && "This instruction does not have an MBB operand."); + return 0; +} + +// Traverse the list of instructions backwards until a non-debug instruction is +// found or it reaches E. +static ReverseIter getNonDebugInstr(ReverseIter B, ReverseIter E) { + for (; B != E; ++B) + if (!B->isDebugValue()) + return B; + + return E; +} + +// Split MBB if it has two direct jumps/branches. +void MipsLongBranch::splitMBB(MachineBasicBlock *MBB) { + ReverseIter End = MBB->rend(); + ReverseIter LastBr = getNonDebugInstr(MBB->rbegin(), End); + + // Return if MBB has no branch instructions. + if ((LastBr == End) || + (!LastBr->isConditionalBranch() && !LastBr->isUnconditionalBranch())) + return; + + ReverseIter FirstBr = getNonDebugInstr(llvm::next(LastBr), End); + + // MBB has only one branch instruction if FirstBr is not a branch + // instruction. 
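The "is this a branch we handle" test recurs inline in splitMBB below and again in initMBBInfo; factored out purely for illustration (the patch itself writes it out each time):

    // Illustration only: the candidate test used by splitMBB and initMBBInfo.
    static bool isDirectBranch(const MachineInstr &MI) {
      return MI.isConditionalBranch() || MI.isUnconditionalBranch();
    }
    // Once splitMBB has run, each block contains at most one such branch, so
    // a single MBBInfo::Br entry per block is sufficient.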
+ if ((FirstBr == End) || + (!FirstBr->isConditionalBranch() && !FirstBr->isUnconditionalBranch())) + return; + + assert(!FirstBr->isIndirectBranch() && "Unexpected indirect branch found."); + + // Create a new MBB. Move instructions in MBB to the newly created MBB. + MachineBasicBlock *NewMBB = + MF->CreateMachineBasicBlock(MBB->getBasicBlock()); + + // Insert NewMBB and fix control flow. + MachineBasicBlock *Tgt = getTargetMBB(*FirstBr); + NewMBB->transferSuccessors(MBB); + NewMBB->removeSuccessor(Tgt); + MBB->addSuccessor(NewMBB); + MBB->addSuccessor(Tgt); + MF->insert(llvm::next(MachineFunction::iterator(MBB)), NewMBB); + + NewMBB->splice(NewMBB->end(), MBB, (++LastBr).base(), MBB->end()); +} + +// Fill MBBInfos. +void MipsLongBranch::initMBBInfo() { + // Split the MBBs if they have two branches. Each basic block should have at + // most one branch after this loop is executed. + for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E;) + splitMBB(I++); + + MF->RenumberBlocks(); + MBBInfos.clear(); + MBBInfos.resize(MF->size()); + + for (unsigned I = 0, E = MBBInfos.size(); I < E; ++I) { + MachineBasicBlock *MBB = MF->getBlockNumbered(I); + + // Compute size of MBB. + for (MachineBasicBlock::instr_iterator MI = MBB->instr_begin(); + MI != MBB->instr_end(); ++MI) + MBBInfos[I].Size += TII->GetInstSizeInBytes(&*MI); + + // Search for MBB's branch instruction. + ReverseIter End = MBB->rend(); + ReverseIter Br = getNonDebugInstr(MBB->rbegin(), End); + + if ((Br != End) && !Br->isIndirectBranch() && + (Br->isConditionalBranch() || + (Br->isUnconditionalBranch() && + TM.getRelocationModel() == Reloc::PIC_))) + MBBInfos[I].Br = (++Br).base(); + } +} + +// Compute offset of branch in number of bytes. +int64_t MipsLongBranch::computeOffset(const MachineInstr *Br) { + int64_t Offset = 0; + int ThisMBB = Br->getParent()->getNumber(); + int TargetMBB = getTargetMBB(*Br)->getNumber(); + + // Compute offset of a forward branch. + if (ThisMBB < TargetMBB) { + for (int N = ThisMBB + 1; N < TargetMBB; ++N) + Offset += MBBInfos[N].Size; + + return Offset + 4; + } + + // Compute offset of a backward branch. + for (int N = ThisMBB; N >= TargetMBB; --N) + Offset += MBBInfos[N].Size; + + return -Offset + 4; +} + +// Replace Br with a branch which has the opposite condition code and a +// MachineBasicBlock operand MBBOpnd. +void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br, + DebugLoc DL, MachineBasicBlock *MBBOpnd) { + unsigned NewOpc = Mips::GetOppositeBranchOpc(Br->getOpcode()); + const MCInstrDesc &NewDesc = TII->get(NewOpc); + + MachineInstrBuilder MIB = BuildMI(MBB, Br, DL, NewDesc); + + for (unsigned I = 0, E = Br->getDesc().getNumOperands(); I < E; ++I) { + MachineOperand &MO = Br->getOperand(I); + + if (!MO.isReg()) { + assert(MO.isMBB() && "MBB operand expected."); + break; + } + + MIB.addReg(MO.getReg()); + } + + MIB.addMBB(MBBOpnd); + + Br->eraseFromParent(); +} + +// Expand branch instructions to long branches. 
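A worked example of computeOffset above, with made-up block sizes (the trailing +4 accounts for the instruction in the branch's delay slot):

    // Hypothetical layout: branch at the end of B1 targeting B4, with
    // MBBInfos[2].Size == 8 and MBBInfos[3].Size == 12 bytes.
    int64_t Fwd = 8 + 12 + 4;    // computeOffset returns 24
    assert(isInt<16>(Fwd / 4));  // 24/4 == 6 easily fits the 16-bit field
    // A backward branch instead sums the sizes from its own block down to the
    // target and returns 4 minus that sum, i.e. a negative offset.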
+void MipsLongBranch::expandToLongBranch(MBBInfo &I) { + I.HasLongBranch = true; + + bool IsPIC = TM.getRelocationModel() == Reloc::PIC_; + unsigned ABI = TM.getSubtarget<MipsSubtarget>().getTargetABI(); + bool N64 = ABI == MipsSubtarget::N64; + + MachineBasicBlock::iterator Pos; + MachineBasicBlock *MBB = I.Br->getParent(), *TgtMBB = getTargetMBB(*I.Br); + DebugLoc DL = I.Br->getDebugLoc(); + const BasicBlock *BB = MBB->getBasicBlock(); + MachineFunction::iterator FallThroughMBB = ++MachineFunction::iterator(MBB); + MachineBasicBlock *LongBrMBB = MF->CreateMachineBasicBlock(BB); + + MF->insert(FallThroughMBB, LongBrMBB); + MBB->removeSuccessor(TgtMBB); + MBB->addSuccessor(LongBrMBB); + + if (IsPIC) { + // $longbr: + // addiu $sp, $sp, -regsize * 2 + // sw $ra, 0($sp) + // bal $baltgt + // sw $a3, regsize($sp) + // $baltgt: + // lui $a3, %hi($baltgt) + // lui $at, %hi($tgt) + // addiu $a3, $a3, %lo($baltgt) + // addiu $at, $at, %lo($tgt) + // subu $at, $at, $a3 + // addu $at, $ra, $at + // + // if n64: + // lui $a3, %highest($baltgt) + // lui $ra, %highest($tgt) + // addiu $a3, $a3, %higher($baltgt) + // addiu $ra, $ra, %higher($tgt) + // dsll $a3, $a3, 32 + // dsll $ra, $ra, 32 + // subu $at, $at, $a3 + // addu $at, $at, $ra + // + // lw $ra, 0($sp) + // lw $a3, regsize($sp) + // jr $at + // addiu $sp, $sp, regsize * 2 + // $fallthrough: + // + MF->getInfo<MipsFunctionInfo>()->setEmitNOAT(); + MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(FallThroughMBB, BalTgtMBB); + LongBrMBB->addSuccessor(BalTgtMBB); + BalTgtMBB->addSuccessor(TgtMBB); + + int RegSize = N64 ? 8 : 4; + unsigned AT = N64 ? Mips::AT_64 : Mips::AT; + unsigned A3 = N64 ? Mips::A3_64 : Mips::A3; + unsigned SP = N64 ? Mips::SP_64 : Mips::SP; + unsigned RA = N64 ? Mips::RA_64 : Mips::RA; + unsigned Load = N64 ? Mips::LD_P8 : Mips::LW; + unsigned Store = N64 ? Mips::SD_P8 : Mips::SW; + unsigned LUi = N64 ? Mips::LUi64 : Mips::LUi; + unsigned ADDiu = N64 ? Mips::DADDiu : Mips::ADDiu; + unsigned ADDu = N64 ? Mips::DADDu : Mips::ADDu; + unsigned SUBu = N64 ? Mips::SUBu : Mips::SUBu; + unsigned JR = N64 ? 
Mips::JR64 : Mips::JR; + + Pos = LongBrMBB->begin(); + + BuildMI(*LongBrMBB, Pos, DL, TII->get(ADDiu), SP).addReg(SP) + .addImm(-RegSize * 2); + BuildMI(*LongBrMBB, Pos, DL, TII->get(Store)).addReg(RA).addReg(SP) + .addImm(0); + BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB); + BuildMI(*LongBrMBB, Pos, DL, TII->get(Store)).addReg(A3).addReg(SP) + .addImm(RegSize)->setIsInsideBundle(); + + Pos = BalTgtMBB->begin(); + + BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), A3) + .addMBB(BalTgtMBB, MipsII::MO_ABS_HI); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), AT) + .addMBB(TgtMBB, MipsII::MO_ABS_HI); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), A3).addReg(A3) + .addMBB(BalTgtMBB, MipsII::MO_ABS_LO); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), AT).addReg(AT) + .addMBB(TgtMBB, MipsII::MO_ABS_LO); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(SUBu), AT).addReg(AT).addReg(A3); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDu), AT).addReg(RA).addReg(AT); + + if (N64) { + BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), A3) + .addMBB(BalTgtMBB, MipsII::MO_HIGHEST); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), RA) + .addMBB(TgtMBB, MipsII::MO_HIGHEST); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), A3).addReg(A3) + .addMBB(BalTgtMBB, MipsII::MO_HIGHER); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), RA).addReg(RA) + .addMBB(TgtMBB, MipsII::MO_HIGHER); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DSLL), A3).addReg(A3) + .addImm(32); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DSLL), RA).addReg(RA) + .addImm(32); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(SUBu), AT).addReg(AT).addReg(A3); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDu), AT).addReg(AT).addReg(RA); + I.Size += 4 * 8; + } + + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Load), RA).addReg(SP).addImm(0); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Load), A3).addReg(SP).addImm(RegSize); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(JR)).addReg(AT); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), SP).addReg(SP) + .addImm(RegSize * 2)->setIsInsideBundle(); + I.Size += 4 * 14; + } else { + // $longbr: + // j $tgt + // nop + // $fallthrough: + // + Pos = LongBrMBB->begin(); + LongBrMBB->addSuccessor(TgtMBB); + BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::J)).addMBB(TgtMBB); + BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::NOP))->setIsInsideBundle(); + I.Size += 4 * 2; + } + + if (I.Br->isUnconditionalBranch()) { + // Change branch destination. + assert(I.Br->getDesc().getNumOperands() == 1); + I.Br->RemoveOperand(0); + I.Br->addOperand(MachineOperand::CreateMBB(LongBrMBB)); + } else + // Change branch destination and reverse condition. 
+ replaceBranch(*MBB, I.Br, DL, FallThroughMBB); +} + +static void emitGPDisp(MachineFunction &F, const MipsInstrInfo *TII) { + MachineBasicBlock &MBB = F.front(); + MachineBasicBlock::iterator I = MBB.begin(); + DebugLoc DL = MBB.findDebugLoc(MBB.begin()); + BuildMI(MBB, I, DL, TII->get(Mips::LUi), Mips::V0) + .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI); + BuildMI(MBB, I, DL, TII->get(Mips::ADDiu), Mips::V0) + .addReg(Mips::V0).addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO); + MBB.removeLiveIn(Mips::V0); +} + +bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) { + if ((TM.getRelocationModel() == Reloc::PIC_) && + TM.getSubtarget<MipsSubtarget>().isABI_O32() && + F.getInfo<MipsFunctionInfo>()->globalBaseRegSet()) + emitGPDisp(F, TII); + + if (SkipLongBranch) + return true; + + MF = &F; + initMBBInfo(); + + SmallVector<MBBInfo, 16>::iterator I, E = MBBInfos.end(); + bool EverMadeChange = false, MadeChange = true; + + while (MadeChange) { + MadeChange = false; + + for (I = MBBInfos.begin(); I != E; ++I) { + // Skip if this MBB doesn't have a branch or the branch has already been + // converted to a long branch. + if (!I->Br || I->HasLongBranch) + continue; + + if (!ForceLongBranch) + // Check if offset fits into 16-bit immediate field of branches. + if (isInt<16>(computeOffset(I->Br) / 4)) + continue; + + expandToLongBranch(*I); + ++LongBranches; + EverMadeChange = MadeChange = true; + } + } + + if (EverMadeChange) + MF->RenumberBlocks(); + + return true; +} diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp index 1597b93..d4c5e6d 100644 --- a/lib/Target/Mips/MipsMCInstLower.cpp +++ b/lib/Target/Mips/MipsMCInstLower.cpp @@ -29,7 +29,7 @@ using namespace llvm; MipsMCInstLower::MipsMCInstLower(MipsAsmPrinter &asmprinter) : AsmPrinter(asmprinter) {} -void MipsMCInstLower::Initialize(Mangler *M, MCContext* C) { +void MipsMCInstLower::Initialize(Mangler *M, MCContext *C) { Mang = M; Ctx = C; } @@ -61,6 +61,8 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO, case MipsII::MO_GOT_DISP: Kind = MCSymbolRefExpr::VK_Mips_GOT_DISP; break; case MipsII::MO_GOT_PAGE: Kind = MCSymbolRefExpr::VK_Mips_GOT_PAGE; break; case MipsII::MO_GOT_OFST: Kind = MCSymbolRefExpr::VK_Mips_GOT_OFST; break; + case MipsII::MO_HIGHER: Kind = MCSymbolRefExpr::VK_Mips_HIGHER; break; + case MipsII::MO_HIGHEST: Kind = MCSymbolRefExpr::VK_Mips_HIGHEST; break; } switch (MOTy) { @@ -70,14 +72,17 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO, case MachineOperand::MO_GlobalAddress: Symbol = Mang->getSymbol(MO.getGlobal()); + Offset += MO.getOffset(); break; case MachineOperand::MO_BlockAddress: Symbol = AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress()); + Offset += MO.getOffset(); break; case MachineOperand::MO_ExternalSymbol: Symbol = AsmPrinter.GetExternalSymbolSymbol(MO.getSymbolName()); + Offset += MO.getOffset(); break; case MachineOperand::MO_JumpTableIndex: @@ -86,8 +91,7 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO, case MachineOperand::MO_ConstantPoolIndex: Symbol = AsmPrinter.GetCPISymbol(MO.getIndex()); - if (MO.getOffset()) - Offset += MO.getOffset(); + Offset += MO.getOffset(); break; default: @@ -103,71 +107,23 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO, assert(Offset > 0); const MCConstantExpr *OffsetExpr = MCConstantExpr::Create(Offset, *Ctx); - const MCBinaryExpr *AddExpr = MCBinaryExpr::CreateAdd(MCSym, OffsetExpr, *Ctx); - return 
MCOperand::CreateExpr(AddExpr); + const MCBinaryExpr *Add = MCBinaryExpr::CreateAdd(MCSym, OffsetExpr, *Ctx); + return MCOperand::CreateExpr(Add); } -static void CreateMCInst(MCInst& Inst, unsigned Opc, const MCOperand& Opnd0, - const MCOperand& Opnd1, - const MCOperand& Opnd2 = MCOperand()) { +/* +static void CreateMCInst(MCInst& Inst, unsigned Opc, const MCOperand &Opnd0, + const MCOperand &Opnd1, + const MCOperand &Opnd2 = MCOperand()) { Inst.setOpcode(Opc); Inst.addOperand(Opnd0); Inst.addOperand(Opnd1); if (Opnd2.isValid()) Inst.addOperand(Opnd2); } +*/ -// Lower ".cpload $reg" to -// "lui $gp, %hi(_gp_disp)" -// "addiu $gp, $gp, %lo(_gp_disp)" -// "addu $gp, $gp, $t9" -void MipsMCInstLower::LowerCPLOAD(SmallVector<MCInst, 4>& MCInsts) { - MCOperand GPReg = MCOperand::CreateReg(Mips::GP); - MCOperand T9Reg = MCOperand::CreateReg(Mips::T9); - StringRef SymName("_gp_disp"); - const MCSymbol *Sym = Ctx->GetOrCreateSymbol(SymName); - const MCSymbolRefExpr *MCSym; - - MCSym = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_Mips_ABS_HI, *Ctx); - MCOperand SymHi = MCOperand::CreateExpr(MCSym); - MCSym = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_Mips_ABS_LO, *Ctx); - MCOperand SymLo = MCOperand::CreateExpr(MCSym); - - MCInsts.resize(3); - - CreateMCInst(MCInsts[0], Mips::LUi, GPReg, SymHi); - CreateMCInst(MCInsts[1], Mips::ADDiu, GPReg, GPReg, SymLo); - CreateMCInst(MCInsts[2], Mips::ADDu, GPReg, GPReg, T9Reg); -} - -// Lower ".cprestore offset" to "sw $gp, offset($sp)". -void MipsMCInstLower::LowerCPRESTORE(int64_t Offset, - SmallVector<MCInst, 4>& MCInsts) { - assert(isInt<32>(Offset) && (Offset >= 0) && - "Imm operand of .cprestore must be a non-negative 32-bit value."); - - MCOperand SPReg = MCOperand::CreateReg(Mips::SP), BaseReg = SPReg; - MCOperand GPReg = MCOperand::CreateReg(Mips::GP); - - if (!isInt<16>(Offset)) { - unsigned Hi = ((Offset + 0x8000) >> 16) & 0xffff; - Offset &= 0xffff; - MCOperand ATReg = MCOperand::CreateReg(Mips::AT); - BaseReg = ATReg; - - // lui at,hi - // addu at,at,sp - MCInsts.resize(2); - CreateMCInst(MCInsts[0], Mips::LUi, ATReg, MCOperand::CreateImm(Hi)); - CreateMCInst(MCInsts[1], Mips::ADDu, ATReg, ATReg, SPReg); - } - - MCInst Sw; - CreateMCInst(Sw, Mips::SW, GPReg, BaseReg, MCOperand::CreateImm(Offset)); - MCInsts.push_back(Sw); -} - -MCOperand MipsMCInstLower::LowerOperand(const MachineOperand& MO, +MCOperand MipsMCInstLower::LowerOperand(const MachineOperand &MO, unsigned offset) const { MachineOperandType MOTy = MO.getType(); @@ -205,139 +161,31 @@ void MipsMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { } } -void MipsMCInstLower::LowerUnalignedLoadStore(const MachineInstr *MI, - SmallVector<MCInst, - 4>& MCInsts) { - unsigned Opc = MI->getOpcode(); - MCInst Instr1, Instr2, Instr3, Move; - - bool TwoInstructions = false; - - assert(MI->getNumOperands() == 3); - assert(MI->getOperand(0).isReg()); - assert(MI->getOperand(1).isReg()); - - MCOperand Target = LowerOperand(MI->getOperand(0)); - MCOperand Base = LowerOperand(MI->getOperand(1)); - MCOperand ATReg = MCOperand::CreateReg(Mips::AT); - MCOperand ZeroReg = MCOperand::CreateReg(Mips::ZERO); - - MachineOperand UnLoweredName = MI->getOperand(2); - MCOperand Name = LowerOperand(UnLoweredName); - - Move.setOpcode(Mips::ADDu); - Move.addOperand(Target); - Move.addOperand(ATReg); - Move.addOperand(ZeroReg); - - switch (Opc) { - case Mips::ULW: { - // FIXME: only works for little endian right now - MCOperand AdjName = LowerOperand(UnLoweredName, 3); - if (Base.getReg() == 
(Target.getReg())) { - Instr1.setOpcode(Mips::LWL); - Instr1.addOperand(ATReg); - Instr1.addOperand(Base); - Instr1.addOperand(AdjName); - Instr2.setOpcode(Mips::LWR); - Instr2.addOperand(ATReg); - Instr2.addOperand(Base); - Instr2.addOperand(Name); - Instr3 = Move; - } else { - TwoInstructions = true; - Instr1.setOpcode(Mips::LWL); - Instr1.addOperand(Target); - Instr1.addOperand(Base); - Instr1.addOperand(AdjName); - Instr2.setOpcode(Mips::LWR); - Instr2.addOperand(Target); - Instr2.addOperand(Base); - Instr2.addOperand(Name); - } +// If the D<shift> instruction has a shift amount that is greater +// than 31 (checked in calling routine), lower it to a D<shift>32 instruction +void MipsMCInstLower::LowerLargeShift(const MachineInstr *MI, + MCInst& Inst, + int64_t Shift) { + // rt + Inst.addOperand(LowerOperand(MI->getOperand(0))); + // rd + Inst.addOperand(LowerOperand(MI->getOperand(1))); + // saminus32 + Inst.addOperand(MCOperand::CreateImm(Shift)); + + switch (MI->getOpcode()) { + default: + // Calling function is not synchronized + llvm_unreachable("Unexpected shift instruction"); break; - } - case Mips::ULHu: { - // FIXME: only works for little endian right now - MCOperand AdjName = LowerOperand(UnLoweredName, 1); - Instr1.setOpcode(Mips::LBu); - Instr1.addOperand(ATReg); - Instr1.addOperand(Base); - Instr1.addOperand(AdjName); - Instr2.setOpcode(Mips::LBu); - Instr2.addOperand(Target); - Instr2.addOperand(Base); - Instr2.addOperand(Name); - Instr3.setOpcode(Mips::INS); - Instr3.addOperand(Target); - Instr3.addOperand(ATReg); - Instr3.addOperand(MCOperand::CreateImm(0x8)); - Instr3.addOperand(MCOperand::CreateImm(0x18)); + case Mips::DSLL: + Inst.setOpcode(Mips::DSLL32); break; - } - - case Mips::USW: { - // FIXME: only works for little endian right now - assert (Base.getReg() != Target.getReg()); - TwoInstructions = true; - MCOperand AdjName = LowerOperand(UnLoweredName, 3); - Instr1.setOpcode(Mips::SWL); - Instr1.addOperand(Target); - Instr1.addOperand(Base); - Instr1.addOperand(AdjName); - Instr2.setOpcode(Mips::SWR); - Instr2.addOperand(Target); - Instr2.addOperand(Base); - Instr2.addOperand(Name); + case Mips::DSRL: + Inst.setOpcode(Mips::DSRL32); break; - } - case Mips::USH: { - MCOperand AdjName = LowerOperand(UnLoweredName, 1); - Instr1.setOpcode(Mips::SB); - Instr1.addOperand(Target); - Instr1.addOperand(Base); - Instr1.addOperand(Name); - Instr2.setOpcode(Mips::SRL); - Instr2.addOperand(ATReg); - Instr2.addOperand(Target); - Instr2.addOperand(MCOperand::CreateImm(8)); - Instr3.setOpcode(Mips::SB); - Instr3.addOperand(ATReg); - Instr3.addOperand(Base); - Instr3.addOperand(AdjName); + case Mips::DSRA: + Inst.setOpcode(Mips::DSRA32); break; } - default: - // FIXME: need to add others - llvm_unreachable("unaligned instruction not processed"); - } - - MCInsts.push_back(Instr1); - MCInsts.push_back(Instr2); - if (!TwoInstructions) MCInsts.push_back(Instr3); -} - -// Convert -// "setgp01 $reg" -// to -// "lui $reg, %hi(_gp_disp)" -// "addiu $reg, $reg, %lo(_gp_disp)" -void MipsMCInstLower::LowerSETGP01(const MachineInstr *MI, - SmallVector<MCInst, 4>& MCInsts) { - const MachineOperand &MO = MI->getOperand(0); - assert(MO.isReg()); - MCOperand RegOpnd = MCOperand::CreateReg(MO.getReg()); - StringRef SymName("_gp_disp"); - const MCSymbol *Sym = Ctx->GetOrCreateSymbol(SymName); - const MCSymbolRefExpr *MCSym; - - MCSym = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_Mips_ABS_HI, *Ctx); - MCOperand SymHi = MCOperand::CreateExpr(MCSym); - MCSym = MCSymbolRefExpr::Create(Sym, 
MCSymbolRefExpr::VK_Mips_ABS_LO, *Ctx);
-  MCOperand SymLo = MCOperand::CreateExpr(MCSym);
-
-  MCInsts.resize(2);
-
-  CreateMCInst(MCInsts[0], Mips::LUi, RegOpnd, SymHi);
-  CreateMCInst(MCInsts[1], Mips::ADDiu, RegOpnd, RegOpnd, SymLo);
}
diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h
index c1d007d..0abb996 100644
--- a/lib/Target/Mips/MipsMCInstLower.h
+++ b/lib/Target/Mips/MipsMCInstLower.h
@@ -31,13 +31,10 @@ class LLVM_LIBRARY_VISIBILITY MipsMCInstLower {
  MipsAsmPrinter &AsmPrinter;
public:
  MipsMCInstLower(MipsAsmPrinter &asmprinter);
-  void Initialize(Mangler *mang, MCContext* C);
+  void Initialize(Mangler *mang, MCContext *C);
  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
-  void LowerCPLOAD(SmallVector<MCInst, 4>& MCInsts);
-  void LowerCPRESTORE(int64_t Offset, SmallVector<MCInst, 4>& MCInsts);
-  void LowerUnalignedLoadStore(const MachineInstr *MI,
-                               SmallVector<MCInst, 4>& MCInsts);
-  void LowerSETGP01(const MachineInstr *MI, SmallVector<MCInst, 4>& MCInsts);
+  void LowerLargeShift(const MachineInstr *MI, MCInst &Inst, int64_t Shift);
+
private:
  MCOperand LowerSymbolOperand(const MachineOperand &MO,
                               MachineOperandType MOTy, unsigned Offset) const;
diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp
index b00c62b..362173e 100644
--- a/lib/Target/Mips/MipsMachineFunction.cpp
+++ b/lib/Target/Mips/MipsMachineFunction.cpp
@@ -22,10 +22,6 @@ static cl::opt<bool>
FixGlobalBaseReg("mips-fix-global-base-reg", cl::Hidden, cl::init(true),
                 cl::desc("Always use $gp as the global base register."));

-bool MipsFunctionInfo::globalBaseRegFixed() const {
-  return FixGlobalBaseReg;
-}
-
bool MipsFunctionInfo::globalBaseRegSet() const {
  return GlobalBaseReg;
}
@@ -37,13 +33,13 @@ unsigned MipsFunctionInfo::getGlobalBaseReg() {

  const MipsSubtarget &ST = MF.getTarget().getSubtarget<MipsSubtarget>();

-  if (FixGlobalBaseReg) // $gp is the global base register.
-    return GlobalBaseReg = ST.isABI_N64() ? Mips::GP_64 : Mips::GP;
-
  const TargetRegisterClass *RC;
-  RC = ST.isABI_N64() ?
-    Mips::CPU64RegsRegisterClass : Mips::CPURegsRegisterClass;
-
+  if (ST.inMips16Mode())
+    RC = (const TargetRegisterClass*)&Mips::CPU16RegsRegClass;
+  else
+    RC = ST.isABI_N64() ?
+         (const TargetRegisterClass*)&Mips::CPU64RegsRegClass :
+         (const TargetRegisterClass*)&Mips::CPURegsRegClass;
  return GlobalBaseReg = MF.getRegInfo().createVirtualRegister(RC);
}
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
index 0fde55c..b2232c6 100644
--- a/lib/Target/Mips/MipsMachineFunction.h
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -14,8 +14,11 @@
#ifndef MIPS_MACHINE_FUNCTION_INFO_H
#define MIPS_MACHINE_FUNCTION_INFO_H

+#include "MipsSubtarget.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
#include <utility>

namespace llvm {
@@ -45,7 +48,7 @@ class MipsFunctionInfo : public MachineFunctionInfo {
  // OutArgFIRange: Range of indices of all frame objects created during call to
  // LowerCall except for the frame object for restoring $gp.
  std::pair<int, int> InArgFIRange, OutArgFIRange;
-  int GPFI; // Index of the frame object for restoring $gp
+  int GlobalRegFI; // Frame index for saving/restoring the global base register.
  mutable int DynAllocFI; // Frame index of dynamically allocated stack area.
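  // Maximum size of the outgoing argument area among the call sites in the
  // function; initGlobalRegFI below places the saved global register just
  // past this area, rounded up to the stack alignment.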
unsigned MaxCallFrameSize; @@ -55,7 +58,7 @@ public: MipsFunctionInfo(MachineFunction& MF) : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), VarArgsFrameIndex(0), InArgFIRange(std::make_pair(-1, 0)), - OutArgFIRange(std::make_pair(-1, 0)), GPFI(0), DynAllocFI(0), + OutArgFIRange(std::make_pair(-1, 0)), GlobalRegFI(0), DynAllocFI(0), MaxCallFrameSize(0), EmitNOAT(false) {} @@ -74,10 +77,23 @@ public: OutArgFIRange.second = LastFI; } - int getGPFI() const { return GPFI; } - void setGPFI(int FI) { GPFI = FI; } - bool needGPSaveRestore() const { return getGPFI(); } - bool isGPFI(int FI) const { return GPFI && GPFI == FI; } + bool isGlobalRegFI(int FI) const { + return GlobalRegFI && (FI == GlobalRegFI); + } + + int getGlobalRegFI() const { + return GlobalRegFI; + } + + int initGlobalRegFI() { + const TargetMachine &TM = MF.getTarget(); + unsigned RegSize = TM.getSubtarget<MipsSubtarget>().isABI_N64() ? 8 : 4; + int64_t StackAlignment = TM.getFrameLowering()->getStackAlignment(); + uint64_t Offset = RoundUpToAlignment(MaxCallFrameSize, StackAlignment); + + GlobalRegFI = MF.getFrameInfo()->CreateFixedObject(RegSize, Offset, true); + return GlobalRegFI; + } // The first call to this function creates a frame object for dynamically // allocated stack area. @@ -92,7 +108,6 @@ public: unsigned getSRetReturnReg() const { return SRetReturnReg; } void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; } - bool globalBaseRegFixed() const; bool globalBaseRegSet() const; unsigned getGlobalBaseReg(); diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp index f30de44..a3ce236 100644 --- a/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/lib/Target/Mips/MipsRegisterInfo.cpp @@ -16,9 +16,11 @@ #include "MipsRegisterInfo.h" #include "Mips.h" #include "MipsAnalyzeImmediate.h" +#include "MipsInstrInfo.h" #include "MipsSubtarget.h" #include "MipsMachineFunction.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/Type.h" #include "llvm/Function.h" #include "llvm/CodeGen/ValueTypes.h" @@ -35,7 +37,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/Analysis/DebugInfo.h" #define GET_REGINFO_TARGET_DESC #include "MipsGenRegisterInfo.inc" @@ -54,8 +55,7 @@ unsigned MipsRegisterInfo::getPICCallReg() { return Mips::T9; } /// Mips Callee Saved Registers const uint16_t* MipsRegisterInfo:: -getCalleeSavedRegs(const MachineFunction *MF) const -{ +getCalleeSavedRegs(const MachineFunction *MF) const { if (Subtarget.isSingleFloat()) return CSR_SingleFloatOnly_SaveList; else if (!Subtarget.hasMips64()) @@ -64,12 +64,11 @@ getCalleeSavedRegs(const MachineFunction *MF) const return CSR_N32_SaveList; assert(Subtarget.isABI_N64()); - return CSR_N64_SaveList; + return CSR_N64_SaveList; } const uint32_t* -MipsRegisterInfo::getCallPreservedMask(CallingConv::ID) const -{ +MipsRegisterInfo::getCallPreservedMask(CallingConv::ID) const { if (Subtarget.isSingleFloat()) return CSR_SingleFloatOnly_RegMask; else if (!Subtarget.hasMips64()) @@ -78,23 +77,21 @@ MipsRegisterInfo::getCallPreservedMask(CallingConv::ID) const return CSR_N32_RegMask; assert(Subtarget.isABI_N64()); - return CSR_N64_RegMask; + return CSR_N64_RegMask; } BitVector MipsRegisterInfo:: getReservedRegs(const MachineFunction &MF) const { static const uint16_t ReservedCPURegs[] = { - Mips::ZERO, Mips::AT, Mips::K0, Mips::K1, - Mips::SP, Mips::FP, Mips::RA + Mips::ZERO, Mips::AT, Mips::K0, Mips::K1, Mips::SP }; static const uint16_t ReservedCPU64Regs[] = { 
-    Mips::ZERO_64, Mips::AT_64, Mips::K0_64, Mips::K1_64,
-    Mips::SP_64, Mips::FP_64, Mips::RA_64
+    Mips::ZERO_64, Mips::AT_64, Mips::K0_64, Mips::K1_64, Mips::SP_64
  };

  BitVector Reserved(getNumRegs());
-  typedef TargetRegisterClass::iterator RegIter;
+  typedef TargetRegisterClass::const_iterator RegIter;

  for (unsigned I = 0; I < array_lengthof(ReservedCPURegs); ++I)
    Reserved.set(ReservedCPURegs[I]);
@@ -104,31 +101,36 @@ getReservedRegs(const MachineFunction &MF) const {
      Reserved.set(ReservedCPU64Regs[I]);

    // Reserve all registers in AFGR64.
-    for (RegIter Reg = Mips::AFGR64RegisterClass->begin();
-         Reg != Mips::AFGR64RegisterClass->end(); ++Reg)
+    for (RegIter Reg = Mips::AFGR64RegClass.begin(),
+         EReg = Mips::AFGR64RegClass.end(); Reg != EReg; ++Reg)
      Reserved.set(*Reg);
-  }
-  else {
+  } else {
    // Reserve all registers in CPU64Regs & FGR64.
-    for (RegIter Reg = Mips::CPU64RegsRegisterClass->begin();
-         Reg != Mips::CPU64RegsRegisterClass->end(); ++Reg)
+    for (RegIter Reg = Mips::CPU64RegsRegClass.begin(),
+         EReg = Mips::CPU64RegsRegClass.end(); Reg != EReg; ++Reg)
      Reserved.set(*Reg);
-    for (RegIter Reg = Mips::FGR64RegisterClass->begin();
-         Reg != Mips::FGR64RegisterClass->end(); ++Reg)
+    for (RegIter Reg = Mips::FGR64RegClass.begin(),
+         EReg = Mips::FGR64RegClass.end(); Reg != EReg; ++Reg)
      Reserved.set(*Reg);
  }

-  // If GP is dedicated as a global base register, reserve it.
-  if (MF.getInfo<MipsFunctionInfo>()->globalBaseRegFixed()) {
-    Reserved.set(Mips::GP);
-    Reserved.set(Mips::GP_64);
+  // Reserve FP if this function should have a dedicated frame pointer register.
+  if (MF.getTarget().getFrameLowering()->hasFP(MF)) {
+    Reserved.set(Mips::FP);
+    Reserved.set(Mips::FP_64);
  }

  // Reserve hardware registers.
  Reserved.set(Mips::HWR29);
  Reserved.set(Mips::HWR29_64);

+  // Reserve RA if in mips16 mode.
+  if (Subtarget.inMips16Mode()) {
+    Reserved.set(Mips::RA);
+    Reserved.set(Mips::RA_64);
+  }
+
  return Reserved;
}

@@ -137,6 +139,11 @@ MipsRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
  return true;
}

+bool
+MipsRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+  return true;
+}
+
// This function eliminates ADJCALLSTACKDOWN and
// ADJCALLSTACKUP pseudo instructions.
void MipsRegisterInfo::
@@ -207,8 +214,8 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
  // incoming argument, callee-saved register location or local variable.
  int64_t Offset;

-  if (MipsFI->isOutArgFI(FrameIndex) || MipsFI->isGPFI(FrameIndex) ||
-      MipsFI->isDynAllocFI(FrameIndex))
+  if (MipsFI->isOutArgFI(FrameIndex) || MipsFI->isDynAllocFI(FrameIndex) ||
+      MipsFI->isGlobalRegFI(FrameIndex))
    Offset = spOffset;
  else
    Offset = spOffset + (int64_t)stackSize;
@@ -222,37 +229,17 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
  if (!MI.isDebugValue() && !isInt<16>(Offset)) {
    MachineBasicBlock &MBB = *MI.getParent();
    DebugLoc DL = II->getDebugLoc();
-    MipsAnalyzeImmediate AnalyzeImm;
-    unsigned Size = Subtarget.isABI_N64() ? 64 : 32;
-    unsigned LUi = Subtarget.isABI_N64() ? Mips::LUi64 : Mips::LUi;
    unsigned ADDu = Subtarget.isABI_N64() ? Mips::DADDu : Mips::ADDu;
-    unsigned ZEROReg = Subtarget.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO;
    unsigned ATReg = Subtarget.isABI_N64() ?
Mips::AT_64 : Mips::AT; - const MipsAnalyzeImmediate::InstSeq &Seq = - AnalyzeImm.Analyze(Offset, Size, true /* LastInstrIsADDiu */); - MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin(); + MipsAnalyzeImmediate::Inst LastInst(0, 0); MipsFI->setEmitNOAT(); - - // The first instruction can be a LUi, which is different from other - // instructions (ADDiu, ORI and SLL) in that it does not have a register - // operand. - if (Inst->Opc == LUi) - BuildMI(MBB, II, DL, TII.get(LUi), ATReg) - .addImm(SignExtend64<16>(Inst->ImmOpnd)); - else - BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ZEROReg) - .addImm(SignExtend64<16>(Inst->ImmOpnd)); - - // Build the remaining instructions in Seq except for the last one. - for (++Inst; Inst != Seq.end() - 1; ++Inst) - BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ATReg) - .addImm(SignExtend64<16>(Inst->ImmOpnd)); - + Mips::loadImmediate(Offset, Subtarget.isABI_N64(), TII, MBB, II, DL, true, + &LastInst); BuildMI(MBB, II, DL, TII.get(ADDu), ATReg).addReg(FrameReg).addReg(ATReg); FrameReg = ATReg; - Offset = SignExtend64<16>(Inst->ImmOpnd); + Offset = SignExtend64<16>(LastInst.ImmOpnd); } MI.getOperand(i).ChangeToRegister(FrameReg, false); diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h index 0716d29..f320bae 100644 --- a/lib/Target/Mips/MipsRegisterInfo.h +++ b/lib/Target/Mips/MipsRegisterInfo.h @@ -42,13 +42,15 @@ struct MipsRegisterInfo : public MipsGenRegisterInfo { void adjustMipsStackFrame(MachineFunction &MF) const; /// Code Generation virtual methods... - const uint16_t *getCalleeSavedRegs(const MachineFunction* MF = 0) const; + const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const; const uint32_t *getCallPreservedMask(CallingConv::ID) const; BitVector getReservedRegs(const MachineFunction &MF) const; virtual bool requiresRegisterScavenging(const MachineFunction &MF) const; + virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const; + void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td index ce399a0..b255e42 100644 --- a/lib/Target/Mips/MipsRegisterInfo.td +++ b/lib/Target/Mips/MipsRegisterInfo.td @@ -70,8 +70,8 @@ class HWR<bits<5> num, string n> : MipsReg<n> { let Namespace = "Mips" in { // General Purpose Registers - def ZERO : MipsGPRReg< 0, "ZERO">, DwarfRegNum<[0]>; - def AT : MipsGPRReg< 1, "AT">, DwarfRegNum<[1]>; + def ZERO : MipsGPRReg< 0, "zero">, DwarfRegNum<[0]>; + def AT : MipsGPRReg< 1, "at">, DwarfRegNum<[1]>; def V0 : MipsGPRReg< 2, "2">, DwarfRegNum<[2]>; def V1 : MipsGPRReg< 3, "3">, DwarfRegNum<[3]>; def A0 : MipsGPRReg< 4, "4">, DwarfRegNum<[4]>; @@ -98,14 +98,14 @@ let Namespace = "Mips" in { def T9 : MipsGPRReg< 25, "25">, DwarfRegNum<[25]>; def K0 : MipsGPRReg< 26, "26">, DwarfRegNum<[26]>; def K1 : MipsGPRReg< 27, "27">, DwarfRegNum<[27]>; - def GP : MipsGPRReg< 28, "GP">, DwarfRegNum<[28]>; - def SP : MipsGPRReg< 29, "SP">, DwarfRegNum<[29]>; - def FP : MipsGPRReg< 30, "FP">, DwarfRegNum<[30]>; - def RA : MipsGPRReg< 31, "RA">, DwarfRegNum<[31]>; + def GP : MipsGPRReg< 28, "gp">, DwarfRegNum<[28]>; + def SP : MipsGPRReg< 29, "sp">, DwarfRegNum<[29]>; + def FP : MipsGPRReg< 30, "fp">, DwarfRegNum<[30]>; + def RA : MipsGPRReg< 31, "ra">, DwarfRegNum<[31]>; // General Purpose 64-bit Registers - def ZERO_64 : Mips64GPRReg< 0, "ZERO", [ZERO]>, DwarfRegNum<[0]>; - def AT_64 : 
Mips64GPRReg< 1, "AT", [AT]>, DwarfRegNum<[1]>;
+  def ZERO_64 : Mips64GPRReg< 0, "zero", [ZERO]>, DwarfRegNum<[0]>;
+  def AT_64 : Mips64GPRReg< 1, "at", [AT]>, DwarfRegNum<[1]>;
  def V0_64 : Mips64GPRReg< 2, "2", [V0]>, DwarfRegNum<[2]>;
  def V1_64 : Mips64GPRReg< 3, "3", [V1]>, DwarfRegNum<[3]>;
  def A0_64 : Mips64GPRReg< 4, "4", [A0]>, DwarfRegNum<[4]>;
@@ -132,97 +132,97 @@ let Namespace = "Mips" in {
  def T9_64 : Mips64GPRReg< 25, "25", [T9]>, DwarfRegNum<[25]>;
  def K0_64 : Mips64GPRReg< 26, "26", [K0]>, DwarfRegNum<[26]>;
  def K1_64 : Mips64GPRReg< 27, "27", [K1]>, DwarfRegNum<[27]>;
-  def GP_64 : Mips64GPRReg< 28, "GP", [GP]>, DwarfRegNum<[28]>;
-  def SP_64 : Mips64GPRReg< 29, "SP", [SP]>, DwarfRegNum<[29]>;
-  def FP_64 : Mips64GPRReg< 30, "FP", [FP]>, DwarfRegNum<[30]>;
-  def RA_64 : Mips64GPRReg< 31, "RA", [RA]>, DwarfRegNum<[31]>;
+  def GP_64 : Mips64GPRReg< 28, "gp", [GP]>, DwarfRegNum<[28]>;
+  def SP_64 : Mips64GPRReg< 29, "sp", [SP]>, DwarfRegNum<[29]>;
+  def FP_64 : Mips64GPRReg< 30, "fp", [FP]>, DwarfRegNum<[30]>;
+  def RA_64 : Mips64GPRReg< 31, "ra", [RA]>, DwarfRegNum<[31]>;

  /// Mips single precision FPU registers
-  def F0 : FPR< 0, "F0">, DwarfRegNum<[32]>;
-  def F1 : FPR< 1, "F1">, DwarfRegNum<[33]>;
-  def F2 : FPR< 2, "F2">, DwarfRegNum<[34]>;
-  def F3 : FPR< 3, "F3">, DwarfRegNum<[35]>;
-  def F4 : FPR< 4, "F4">, DwarfRegNum<[36]>;
-  def F5 : FPR< 5, "F5">, DwarfRegNum<[37]>;
-  def F6 : FPR< 6, "F6">, DwarfRegNum<[38]>;
-  def F7 : FPR< 7, "F7">, DwarfRegNum<[39]>;
-  def F8 : FPR< 8, "F8">, DwarfRegNum<[40]>;
-  def F9 : FPR< 9, "F9">, DwarfRegNum<[41]>;
-  def F10 : FPR<10, "F10">, DwarfRegNum<[42]>;
-  def F11 : FPR<11, "F11">, DwarfRegNum<[43]>;
-  def F12 : FPR<12, "F12">, DwarfRegNum<[44]>;
-  def F13 : FPR<13, "F13">, DwarfRegNum<[45]>;
-  def F14 : FPR<14, "F14">, DwarfRegNum<[46]>;
-  def F15 : FPR<15, "F15">, DwarfRegNum<[47]>;
-  def F16 : FPR<16, "F16">, DwarfRegNum<[48]>;
-  def F17 : FPR<17, "F17">, DwarfRegNum<[49]>;
-  def F18 : FPR<18, "F18">, DwarfRegNum<[50]>;
-  def F19 : FPR<19, "F19">, DwarfRegNum<[51]>;
-  def F20 : FPR<20, "F20">, DwarfRegNum<[52]>;
-  def F21 : FPR<21, "F21">, DwarfRegNum<[53]>;
-  def F22 : FPR<22, "F22">, DwarfRegNum<[54]>;
-  def F23 : FPR<23, "F23">, DwarfRegNum<[55]>;
-  def F24 : FPR<24, "F24">, DwarfRegNum<[56]>;
-  def F25 : FPR<25, "F25">, DwarfRegNum<[57]>;
-  def F26 : FPR<26, "F26">, DwarfRegNum<[58]>;
-  def F27 : FPR<27, "F27">, DwarfRegNum<[59]>;
-  def F28 : FPR<28, "F28">, DwarfRegNum<[60]>;
-  def F29 : FPR<29, "F29">, DwarfRegNum<[61]>;
-  def F30 : FPR<30, "F30">, DwarfRegNum<[62]>;
-  def F31 : FPR<31, "F31">, DwarfRegNum<[63]>;
+  def F0 : FPR< 0, "f0">, DwarfRegNum<[32]>;
+  def F1 : FPR< 1, "f1">, DwarfRegNum<[33]>;
+  def F2 : FPR< 2, "f2">, DwarfRegNum<[34]>;
+  def F3 : FPR< 3, "f3">, DwarfRegNum<[35]>;
+  def F4 : FPR< 4, "f4">, DwarfRegNum<[36]>;
+  def F5 : FPR< 5, "f5">, DwarfRegNum<[37]>;
+  def F6 : FPR< 6, "f6">, DwarfRegNum<[38]>;
+  def F7 : FPR< 7, "f7">, DwarfRegNum<[39]>;
+  def F8 : FPR< 8, "f8">, DwarfRegNum<[40]>;
+  def F9 : FPR< 9, "f9">, DwarfRegNum<[41]>;
+  def F10 : FPR<10, "f10">, DwarfRegNum<[42]>;
+  def F11 : FPR<11, "f11">, DwarfRegNum<[43]>;
+  def F12 : FPR<12, "f12">, DwarfRegNum<[44]>;
+  def F13 : FPR<13, "f13">, DwarfRegNum<[45]>;
+  def F14 : FPR<14, "f14">, DwarfRegNum<[46]>;
+  def F15 : FPR<15, "f15">, DwarfRegNum<[47]>;
+  def F16 : FPR<16, "f16">, DwarfRegNum<[48]>;
+  def F17 : FPR<17, "f17">, DwarfRegNum<[49]>;
+  def F18 : FPR<18, "f18">, DwarfRegNum<[50]>;
+  def F19 : FPR<19, "f19">, DwarfRegNum<[51]>;
+  def F20 : FPR<20, "f20">, DwarfRegNum<[52]>;
+  def F21 : FPR<21, "f21">, DwarfRegNum<[53]>;
+  def F22 : FPR<22, "f22">, DwarfRegNum<[54]>;
+  def F23 : FPR<23, "f23">, DwarfRegNum<[55]>;
+  def F24 : FPR<24, "f24">, DwarfRegNum<[56]>;
+  def F25 : FPR<25, "f25">, DwarfRegNum<[57]>;
+  def F26 : FPR<26, "f26">, DwarfRegNum<[58]>;
+  def F27 : FPR<27, "f27">, DwarfRegNum<[59]>;
+  def F28 : FPR<28, "f28">, DwarfRegNum<[60]>;
+  def F29 : FPR<29, "f29">, DwarfRegNum<[61]>;
+  def F30 : FPR<30, "f30">, DwarfRegNum<[62]>;
+  def F31 : FPR<31, "f31">, DwarfRegNum<[63]>;

  /// Mips double precision FPU registers (aliased
  /// with the single precision registers to hold 64-bit values)
-  def D0 : AFPR< 0, "F0", [F0, F1]>;
-  def D1 : AFPR< 2, "F2", [F2, F3]>;
-  def D2 : AFPR< 4, "F4", [F4, F5]>;
-  def D3 : AFPR< 6, "F6", [F6, F7]>;
-  def D4 : AFPR< 8, "F8", [F8, F9]>;
-  def D5 : AFPR<10, "F10", [F10, F11]>;
-  def D6 : AFPR<12, "F12", [F12, F13]>;
-  def D7 : AFPR<14, "F14", [F14, F15]>;
-  def D8 : AFPR<16, "F16", [F16, F17]>;
-  def D9 : AFPR<18, "F18", [F18, F19]>;
-  def D10 : AFPR<20, "F20", [F20, F21]>;
-  def D11 : AFPR<22, "F22", [F22, F23]>;
-  def D12 : AFPR<24, "F24", [F24, F25]>;
-  def D13 : AFPR<26, "F26", [F26, F27]>;
-  def D14 : AFPR<28, "F28", [F28, F29]>;
-  def D15 : AFPR<30, "F30", [F30, F31]>;
+  def D0 : AFPR< 0, "f0", [F0, F1]>;
+  def D1 : AFPR< 2, "f2", [F2, F3]>;
+  def D2 : AFPR< 4, "f4", [F4, F5]>;
+  def D3 : AFPR< 6, "f6", [F6, F7]>;
+  def D4 : AFPR< 8, "f8", [F8, F9]>;
+  def D5 : AFPR<10, "f10", [F10, F11]>;
+  def D6 : AFPR<12, "f12", [F12, F13]>;
+  def D7 : AFPR<14, "f14", [F14, F15]>;
+  def D8 : AFPR<16, "f16", [F16, F17]>;
+  def D9 : AFPR<18, "f18", [F18, F19]>;
+  def D10 : AFPR<20, "f20", [F20, F21]>;
+  def D11 : AFPR<22, "f22", [F22, F23]>;
+  def D12 : AFPR<24, "f24", [F24, F25]>;
+  def D13 : AFPR<26, "f26", [F26, F27]>;
+  def D14 : AFPR<28, "f28", [F28, F29]>;
+  def D15 : AFPR<30, "f30", [F30, F31]>;

  /// Mips double precision FPU registers in MFP64 mode.
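  /// (In MFP64 mode there are 32 such registers, each overlaying a single
  /// 64-bit FPU register, rather than the 16 even/odd pairs listed above.)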
- def D0_64 : AFPR64<0, "F0", [F0]>, DwarfRegNum<[32]>; - def D1_64 : AFPR64<1, "F1", [F1]>, DwarfRegNum<[33]>; - def D2_64 : AFPR64<2, "F2", [F2]>, DwarfRegNum<[34]>; - def D3_64 : AFPR64<3, "F3", [F3]>, DwarfRegNum<[35]>; - def D4_64 : AFPR64<4, "F4", [F4]>, DwarfRegNum<[36]>; - def D5_64 : AFPR64<5, "F5", [F5]>, DwarfRegNum<[37]>; - def D6_64 : AFPR64<6, "F6", [F6]>, DwarfRegNum<[38]>; - def D7_64 : AFPR64<7, "F7", [F7]>, DwarfRegNum<[39]>; - def D8_64 : AFPR64<8, "F8", [F8]>, DwarfRegNum<[40]>; - def D9_64 : AFPR64<9, "F9", [F9]>, DwarfRegNum<[41]>; - def D10_64 : AFPR64<10, "F10", [F10]>, DwarfRegNum<[42]>; - def D11_64 : AFPR64<11, "F11", [F11]>, DwarfRegNum<[43]>; - def D12_64 : AFPR64<12, "F12", [F12]>, DwarfRegNum<[44]>; - def D13_64 : AFPR64<13, "F13", [F13]>, DwarfRegNum<[45]>; - def D14_64 : AFPR64<14, "F14", [F14]>, DwarfRegNum<[46]>; - def D15_64 : AFPR64<15, "F15", [F15]>, DwarfRegNum<[47]>; - def D16_64 : AFPR64<16, "F16", [F16]>, DwarfRegNum<[48]>; - def D17_64 : AFPR64<17, "F17", [F17]>, DwarfRegNum<[49]>; - def D18_64 : AFPR64<18, "F18", [F18]>, DwarfRegNum<[50]>; - def D19_64 : AFPR64<19, "F19", [F19]>, DwarfRegNum<[51]>; - def D20_64 : AFPR64<20, "F20", [F20]>, DwarfRegNum<[52]>; - def D21_64 : AFPR64<21, "F21", [F21]>, DwarfRegNum<[53]>; - def D22_64 : AFPR64<22, "F22", [F22]>, DwarfRegNum<[54]>; - def D23_64 : AFPR64<23, "F23", [F23]>, DwarfRegNum<[55]>; - def D24_64 : AFPR64<24, "F24", [F24]>, DwarfRegNum<[56]>; - def D25_64 : AFPR64<25, "F25", [F25]>, DwarfRegNum<[57]>; - def D26_64 : AFPR64<26, "F26", [F26]>, DwarfRegNum<[58]>; - def D27_64 : AFPR64<27, "F27", [F27]>, DwarfRegNum<[59]>; - def D28_64 : AFPR64<28, "F28", [F28]>, DwarfRegNum<[60]>; - def D29_64 : AFPR64<29, "F29", [F29]>, DwarfRegNum<[61]>; - def D30_64 : AFPR64<30, "F30", [F30]>, DwarfRegNum<[62]>; - def D31_64 : AFPR64<31, "F31", [F31]>, DwarfRegNum<[63]>; + def D0_64 : AFPR64<0, "f0", [F0]>, DwarfRegNum<[32]>; + def D1_64 : AFPR64<1, "f1", [F1]>, DwarfRegNum<[33]>; + def D2_64 : AFPR64<2, "f2", [F2]>, DwarfRegNum<[34]>; + def D3_64 : AFPR64<3, "f3", [F3]>, DwarfRegNum<[35]>; + def D4_64 : AFPR64<4, "f4", [F4]>, DwarfRegNum<[36]>; + def D5_64 : AFPR64<5, "f5", [F5]>, DwarfRegNum<[37]>; + def D6_64 : AFPR64<6, "f6", [F6]>, DwarfRegNum<[38]>; + def D7_64 : AFPR64<7, "f7", [F7]>, DwarfRegNum<[39]>; + def D8_64 : AFPR64<8, "f8", [F8]>, DwarfRegNum<[40]>; + def D9_64 : AFPR64<9, "f9", [F9]>, DwarfRegNum<[41]>; + def D10_64 : AFPR64<10, "f10", [F10]>, DwarfRegNum<[42]>; + def D11_64 : AFPR64<11, "f11", [F11]>, DwarfRegNum<[43]>; + def D12_64 : AFPR64<12, "f12", [F12]>, DwarfRegNum<[44]>; + def D13_64 : AFPR64<13, "f13", [F13]>, DwarfRegNum<[45]>; + def D14_64 : AFPR64<14, "f14", [F14]>, DwarfRegNum<[46]>; + def D15_64 : AFPR64<15, "f15", [F15]>, DwarfRegNum<[47]>; + def D16_64 : AFPR64<16, "f16", [F16]>, DwarfRegNum<[48]>; + def D17_64 : AFPR64<17, "f17", [F17]>, DwarfRegNum<[49]>; + def D18_64 : AFPR64<18, "f18", [F18]>, DwarfRegNum<[50]>; + def D19_64 : AFPR64<19, "f19", [F19]>, DwarfRegNum<[51]>; + def D20_64 : AFPR64<20, "f20", [F20]>, DwarfRegNum<[52]>; + def D21_64 : AFPR64<21, "f21", [F21]>, DwarfRegNum<[53]>; + def D22_64 : AFPR64<22, "f22", [F22]>, DwarfRegNum<[54]>; + def D23_64 : AFPR64<23, "f23", [F23]>, DwarfRegNum<[55]>; + def D24_64 : AFPR64<24, "f24", [F24]>, DwarfRegNum<[56]>; + def D25_64 : AFPR64<25, "f25", [F25]>, DwarfRegNum<[57]>; + def D26_64 : AFPR64<26, "f26", [F26]>, DwarfRegNum<[58]>; + def D27_64 : AFPR64<27, "f27", [F27]>, DwarfRegNum<[59]>; + def D28_64 : AFPR64<28, "f28", 
[F28]>, DwarfRegNum<[60]>; + def D29_64 : AFPR64<29, "f29", [F29]>, DwarfRegNum<[61]>; + def D30_64 : AFPR64<30, "f30", [F30]>, DwarfRegNum<[62]>; + def D31_64 : AFPR64<31, "f31", [F31]>, DwarfRegNum<[63]>; // Hi/Lo registers def HI : Register<"hi">, DwarfRegNum<[64]>; @@ -236,6 +236,9 @@ let Namespace = "Mips" in { // Status flags register def FCR31 : Register<"31">; + // fcc0 register + def FCC0 : Register<"fcc0">; + // Hardware register $29 def HWR29 : Register<"29">; def HWR29_64 : Register<"29">; @@ -246,26 +249,41 @@ let Namespace = "Mips" in { //===----------------------------------------------------------------------===// def CPURegs : RegisterClass<"Mips", [i32], 32, (add + // Reserved + ZERO, AT, // Return Values and Arguments V0, V1, A0, A1, A2, A3, // Not preserved across procedure calls - T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, + T0, T1, T2, T3, T4, T5, T6, T7, // Callee save S0, S1, S2, S3, S4, S5, S6, S7, + // Not preserved across procedure calls + T8, T9, // Reserved - ZERO, AT, K0, K1, GP, SP, FP, RA)>; + K0, K1, GP, SP, FP, RA)>; def CPU64Regs : RegisterClass<"Mips", [i64], 64, (add +// Reserved + ZERO_64, AT_64, // Return Values and Arguments V0_64, V1_64, A0_64, A1_64, A2_64, A3_64, // Not preserved across procedure calls - T0_64, T1_64, T2_64, T3_64, T4_64, T5_64, T6_64, T7_64, T8_64, T9_64, + T0_64, T1_64, T2_64, T3_64, T4_64, T5_64, T6_64, T7_64, // Callee save S0_64, S1_64, S2_64, S3_64, S4_64, S5_64, S6_64, S7_64, + // Not preserved across procedure calls + T8_64, T9_64, // Reserved - ZERO_64, AT_64, K0_64, K1_64, GP_64, SP_64, FP_64, RA_64)> { - let SubRegClasses = [(CPURegs sub_32)]; -} + K0_64, K1_64, GP_64, SP_64, FP_64, RA_64)>; + +def CPU16Regs : RegisterClass<"Mips", [i32], 32, (add + // Return Values and Arguments + V0, V1, A0, A1, A2, A3, + // Callee save + S0, S1)>; + +def CPURAReg : RegisterClass<"Mips", [i32], 32, (add RA)>; + // 64bit fp: // * FGR64 - 32 64-bit registers @@ -278,26 +296,24 @@ def FGR32 : RegisterClass<"Mips", [f32], 32, (sequence "F%u", 0, 31)>; def AFGR64 : RegisterClass<"Mips", [f64], 64, (add // Return Values and Arguments - D0, D1, D6, D7, + D0, D1, + // Not preserved across procedure calls + D2, D3, D4, D5, + // Return Values and Arguments + D6, D7, // Not preserved across procedure calls - D2, D3, D4, D5, D8, D9, + D8, D9, // Callee save - D10, D11, D12, D13, D14, D15)> { - let SubRegClasses = [(FGR32 sub_fpeven, sub_fpodd)]; -} + D10, D11, D12, D13, D14, D15)>; -def FGR64 : RegisterClass<"Mips", [f64], 64, (sequence "D%u_64", 0, 31)> { - let SubRegClasses = [(FGR32 sub_32)]; -} +def FGR64 : RegisterClass<"Mips", [f64], 64, (sequence "D%u_64", 0, 31)>; // Condition Register for floating point operations -def CCR : RegisterClass<"Mips", [i32], 32, (add FCR31)>; +def CCR : RegisterClass<"Mips", [i32], 32, (add FCR31,FCC0)>; // Hi/Lo Registers def HILO : RegisterClass<"Mips", [i32], 32, (add HI, LO)>; -def HILO64 : RegisterClass<"Mips", [i64], 64, (add HI64, LO64)> { - let SubRegClasses = [(HILO sub_32)]; -} +def HILO64 : RegisterClass<"Mips", [i64], 64, (add HI64, LO64)>; // Hardware registers def HWRegs : RegisterClass<"Mips", [i32], 32, (add HWR29)>; diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp index 00347df..11ff809 100644 --- a/lib/Target/Mips/MipsSubtarget.cpp +++ b/lib/Target/Mips/MipsSubtarget.cpp @@ -30,7 +30,7 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU, MipsArchVersion(Mips32), MipsABI(UnknownABI), IsLittle(little), IsSingleFloat(false), 
IsFP64bit(false), IsGP64bit(false), HasVFPU(false), IsLinux(true), HasSEInReg(false), HasCondMov(false), HasMulDivAdd(false), - HasMinMax(false), HasSwap(false), HasBitCount(false) + HasMinMax(false), HasSwap(false), HasBitCount(false), InMips16Mode(false) { std::string CPUName = CPU; if (CPUName.empty()) @@ -58,9 +58,9 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU, bool MipsSubtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel, - TargetSubtargetInfo::AntiDepBreakMode& Mode, - RegClassVector& CriticalPathRCs) const { - Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL; + TargetSubtargetInfo::AntiDepBreakMode &Mode, + RegClassVector &CriticalPathRCs) const { + Mode = TargetSubtargetInfo::ANTIDEP_NONE; CriticalPathRCs.clear(); CriticalPathRCs.push_back(hasMips64() ? &Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass); diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index 7faf77b..3215c44 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -86,6 +86,9 @@ protected: // HasBitCount - Count leading '1' and '0' bits. bool HasBitCount; + // InMips16 -- can process Mips16 instructions + bool InMips16Mode; + InstrItineraryData InstrItins; public: @@ -124,8 +127,11 @@ public: bool isSingleFloat() const { return IsSingleFloat; } bool isNotSingleFloat() const { return !IsSingleFloat; } bool hasVFPU() const { return HasVFPU; } + bool inMips16Mode() const { return InMips16Mode; } bool isLinux() const { return IsLinux; } + bool hasStandardEncoding() const { return !inMips16Mode(); } + /// Features related to the presence of specific instructions. bool hasSEInReg() const { return HasSEInReg; } bool hasCondMov() const { return HasCondMov; } diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index ad02231..dd5d35f 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -105,8 +105,6 @@ public: } virtual bool addInstSelector(); - virtual bool addPreRegAlloc(); - virtual bool addPreSched2(); virtual bool addPreEmitPass(); }; } // namespace @@ -117,31 +115,22 @@ TargetPassConfig *MipsTargetMachine::createPassConfig(PassManagerBase &PM) { // Install an instruction selector pass using // the ISelDag to gen Mips code. -bool MipsPassConfig::addInstSelector() -{ - PM.add(createMipsISelDag(getMipsTargetMachine())); +bool MipsPassConfig::addInstSelector() { + addPass(createMipsISelDag(getMipsTargetMachine())); return false; } // Implemented by targets that want to run passes immediately before // machine code is emitted. return true if -print-machineinstrs should // print out the code after the passes. -bool MipsPassConfig::addPreEmitPass() -{ - PM.add(createMipsDelaySlotFillerPass(getMipsTargetMachine())); - return true; -} +bool MipsPassConfig::addPreEmitPass() { + MipsTargetMachine &TM = getMipsTargetMachine(); + addPass(createMipsDelaySlotFillerPass(TM)); -bool MipsPassConfig::addPreRegAlloc() { - // Do not restore $gp if target is Mips64. - // In N32/64, $gp is a callee-saved register. - if (!getMipsSubtarget().hasMips64()) - PM.add(createMipsEmitGPRestorePass(getMipsTargetMachine())); - return true; -} + // NOTE: long branch has not been implemented for mips16. 
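+  // hasStandardEncoding() is defined as !inMips16Mode() (see MipsSubtarget.h
+  // above), so the long branch pass is only added for functions compiled
+  // with the classic MIPS32/MIPS64 encodings.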
+ if (TM.getSubtarget<MipsSubtarget>().hasStandardEncoding()) + addPass(createMipsLongBranchPass(TM)); -bool MipsPassConfig::addPreSched2() { - PM.add(createMipsExpandPseudoPass(getMipsTargetMachine())); return true; } diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h index 80c00e8..5cbf057 100644 --- a/lib/Target/Mips/MipsTargetMachine.h +++ b/lib/Target/Mips/MipsTargetMachine.h @@ -69,9 +69,7 @@ namespace llvm { // Pass Pipeline Configuration virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); - virtual bool addCodeEmitter(PassManagerBase &PM, - JITCodeEmitter &JCE); - + virtual bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE); }; /// MipsebTargetMachine - Mips32 big endian target machine. diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt new file mode 100644 index 0000000..7cb16b4 --- /dev/null +++ b/lib/Target/NVPTX/CMakeLists.txt @@ -0,0 +1,34 @@ +set(LLVM_TARGET_DEFINITIONS NVPTX.td) + + +tablegen(LLVM NVPTXGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM NVPTXGenInstrInfo.inc -gen-instr-info) +tablegen(LLVM NVPTXGenAsmWriter.inc -gen-asm-writer) +tablegen(LLVM NVPTXGenDAGISel.inc -gen-dag-isel) +tablegen(LLVM NVPTXGenSubtargetInfo.inc -gen-subtarget) +add_public_tablegen_target(NVPTXCommonTableGen) + +set(NVPTXCodeGen_sources + NVPTXFrameLowering.cpp + NVPTXInstrInfo.cpp + NVPTXISelDAGToDAG.cpp + NVPTXISelLowering.cpp + NVPTXRegisterInfo.cpp + NVPTXSubtarget.cpp + NVPTXTargetMachine.cpp + NVPTXSplitBBatBar.cpp + NVPTXLowerAggrCopies.cpp + NVPTXutil.cpp + NVPTXAllocaHoisting.cpp + NVPTXAsmPrinter.cpp + NVPTXUtilities.cpp + VectorElementize.cpp + ) + +add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources}) + +add_dependencies(LLVMNVPTXCodeGen intrinsics_gen) + +add_subdirectory(TargetInfo) +add_subdirectory(InstPrinter) +add_subdirectory(MCTargetDesc) diff --git a/lib/Target/NVPTX/InstPrinter/CMakeLists.txt b/lib/Target/NVPTX/InstPrinter/CMakeLists.txt new file mode 100644 index 0000000..ae4c751 --- /dev/null +++ b/lib/Target/NVPTX/InstPrinter/CMakeLists.txt @@ -0,0 +1,7 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMNVPTXAsmPrinter + NVPTXInstPrinter.cpp + ) + +add_dependencies(LLVMNVPTXAsmPrinter NVPTXCommonTableGen) diff --git a/lib/Target/PTX/InstPrinter/LLVMBuild.txt b/lib/Target/NVPTX/InstPrinter/LLVMBuild.txt index af5d200..032b573 100644 --- a/lib/Target/PTX/InstPrinter/LLVMBuild.txt +++ b/lib/Target/NVPTX/InstPrinter/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/PTX/InstPrinter/LLVMBuild.txt ---------------*- Conf -*--===; +;===- ./lib/Target/NVPTX/InstPrinter/LLVMBuild.txt -------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -17,7 +17,7 @@ [component_0] type = Library -name = PTXAsmPrinter -parent = PTX +name = NVPTXAsmPrinter +parent = NVPTX required_libraries = MC Support -add_to_library_groups = PTX +add_to_library_groups = NVPTX diff --git a/lib/Target/PTX/InstPrinter/Makefile b/lib/Target/NVPTX/InstPrinter/Makefile index 0ccfe44..7b78654 100644 --- a/lib/Target/PTX/InstPrinter/Makefile +++ b/lib/Target/NVPTX/InstPrinter/Makefile @@ -1,4 +1,4 @@ -##===- lib/Target/PTX/AsmPrinter/Makefile ------------------*- Makefile -*-===## +##===- lib/Target/NVPTX/AsmPrinter/Makefile ----------------*- Makefile -*-===## # # The LLVM Compiler Infrastructure # @@ -7,10 +7,9 @@ # ##===----------------------------------------------------------------------===## LEVEL = ../../../.. 
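# (LEVEL climbs four directories, from lib/Target/NVPTX/InstPrinter back to
# the LLVM top level, so that Makefile.common below can be located.)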
-LIBRARYNAME = LLVMPTXAsmPrinter +LIBRARYNAME = LLVMNVPTXAsmPrinter # Hack: we need to include 'main' ptx target directory to grab private headers CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. include $(LEVEL)/Makefile.common - diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp new file mode 100644 index 0000000..10051c7 --- /dev/null +++ b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp @@ -0,0 +1 @@ +// Placeholder diff --git a/lib/Target/PTX/LLVMBuild.txt b/lib/Target/NVPTX/LLVMBuild.txt index 15a1eb5..e2d6ed2 100644 --- a/lib/Target/PTX/LLVMBuild.txt +++ b/lib/Target/NVPTX/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/PTX/LLVMBuild.txt ---------------------------*- Conf -*--===; +;===- ./lib/Target/NVPTX/LLVMBuild.txt -------------------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -20,13 +20,13 @@ subdirectories = InstPrinter MCTargetDesc TargetInfo [component_0] type = TargetGroup -name = PTX +name = NVPTX parent = Target has_asmprinter = 1 [component_1] type = Library -name = PTXCodeGen -parent = PTX -required_libraries = Analysis AsmPrinter CodeGen Core MC PTXDesc PTXInfo SelectionDAG Support Target TransformUtils -add_to_library_groups = PTX +name = NVPTXCodeGen +parent = NVPTX +required_libraries = Analysis AsmPrinter CodeGen Core MC NVPTXDesc NVPTXInfo SelectionDAG Support Target TransformUtils +add_to_library_groups = NVPTX diff --git a/lib/Target/NVPTX/MCTargetDesc/CMakeLists.txt b/lib/Target/NVPTX/MCTargetDesc/CMakeLists.txt new file mode 100644 index 0000000..a030d9f --- /dev/null +++ b/lib/Target/NVPTX/MCTargetDesc/CMakeLists.txt @@ -0,0 +1,9 @@ +add_llvm_library(LLVMNVPTXDesc + NVPTXMCAsmInfo.cpp + NVPTXMCTargetDesc.cpp + ) + +add_dependencies(LLVMNVPTXDesc NVPTXCommonTableGen) + +# Hack: we need to include 'main' target directory to grab private headers +#include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..) diff --git a/lib/Target/PTX/MCTargetDesc/LLVMBuild.txt b/lib/Target/NVPTX/MCTargetDesc/LLVMBuild.txt index 19b80c5..01a051a 100644 --- a/lib/Target/PTX/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/NVPTX/MCTargetDesc/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/PTX/MCTargetDesc/LLVMBuild.txt --------------*- Conf -*--===; +;===- ./lib/Target/NVPTX/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -17,7 +17,7 @@ [component_0] type = Library -name = PTXDesc -parent = PTX -required_libraries = MC PTXAsmPrinter PTXInfo Support -add_to_library_groups = PTX +name = NVPTXDesc +parent = NVPTX +required_libraries = MC NVPTXAsmPrinter NVPTXInfo Support +add_to_library_groups = NVPTX diff --git a/lib/Target/PTX/MCTargetDesc/Makefile b/lib/Target/NVPTX/MCTargetDesc/Makefile index 35f5a7b..31d06cb 100644 --- a/lib/Target/PTX/MCTargetDesc/Makefile +++ b/lib/Target/NVPTX/MCTargetDesc/Makefile @@ -1,4 +1,4 @@ -##===- lib/Target/PTX/TargetDesc/Makefile ------------------*- Makefile -*-===## +##===- lib/Target/NVPTX/TargetDesc/Makefile ----------------*- Makefile -*-===## # # The LLVM Compiler Infrastructure # @@ -8,7 +8,7 @@ ##===----------------------------------------------------------------------===## LEVEL = ../../../.. -LIBRARYNAME = LLVMPTXDesc +LIBRARYNAME = LLVMNVPTXDesc # Hack: we need to include 'main' target directory to grab private headers CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
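# (Both parent directories are needed: tablegen-generated .inc headers land
# in the object tree, while the hand-written private headers stay in the
# source tree.)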
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h new file mode 100644 index 0000000..4545838 --- /dev/null +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h @@ -0,0 +1,88 @@ +//===-- NVPTXBaseInfo.h - Top-level definitions for NVPTX -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains small standalone helper functions and enum definitions for +// the NVPTX target useful for the compiler back-end and the MC libraries. +// As such, it deliberately does not include references to LLVM core +// code gen types, passes, etc.. +// +//===----------------------------------------------------------------------===// + +#ifndef NVPTXBASEINFO_H +#define NVPTXBASEINFO_H + +namespace llvm { + +enum AddressSpace { + ADDRESS_SPACE_GENERIC = 0, + ADDRESS_SPACE_GLOBAL = 1, + ADDRESS_SPACE_CONST_NOT_GEN = 2, // Not part of generic space + ADDRESS_SPACE_SHARED = 3, + ADDRESS_SPACE_CONST = 4, + ADDRESS_SPACE_LOCAL = 5, + + // NVVM Internal + ADDRESS_SPACE_PARAM = 101 +}; + +enum PropertyAnnotation { + PROPERTY_MAXNTID_X = 0, + PROPERTY_MAXNTID_Y, + PROPERTY_MAXNTID_Z, + PROPERTY_REQNTID_X, + PROPERTY_REQNTID_Y, + PROPERTY_REQNTID_Z, + PROPERTY_MINNCTAPERSM, + PROPERTY_ISTEXTURE, + PROPERTY_ISSURFACE, + PROPERTY_ISSAMPLER, + PROPERTY_ISREADONLY_IMAGE_PARAM, + PROPERTY_ISWRITEONLY_IMAGE_PARAM, + PROPERTY_ISKERNEL_FUNCTION, + PROPERTY_ALIGN, + + // last property + PROPERTY_LAST +}; + +const unsigned AnnotationNameLen = 8; // length of each annotation name +const char +PropertyAnnotationNames[PROPERTY_LAST + 1][AnnotationNameLen + 1] = { + "maxntidx", // PROPERTY_MAXNTID_X + "maxntidy", // PROPERTY_MAXNTID_Y + "maxntidz", // PROPERTY_MAXNTID_Z + "reqntidx", // PROPERTY_REQNTID_X + "reqntidy", // PROPERTY_REQNTID_Y + "reqntidz", // PROPERTY_REQNTID_Z + "minctasm", // PROPERTY_MINNCTAPERSM + "texture", // PROPERTY_ISTEXTURE + "surface", // PROPERTY_ISSURFACE + "sampler", // PROPERTY_ISSAMPLER + "rdoimage", // PROPERTY_ISREADONLY_IMAGE_PARAM + "wroimage", // PROPERTY_ISWRITEONLY_IMAGE_PARAM + "kernel", // PROPERTY_ISKERNEL_FUNCTION + "align", // PROPERTY_ALIGN + + // last property + "proplast", // PROPERTY_LAST +}; + +// name of named metadata used for global annotations +#if defined(__GNUC__) +// As this is declared to be static but some of the .cpp files that +// include NVVM.h do not use this array, gcc gives a warning when +// compiling those .cpp files, hence __attribute__((unused)). +__attribute__((unused)) +#endif +static const char* NamedMDForAnnotations = "nvvm.annotations"; + +} + + +#endif diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp new file mode 100644 index 0000000..1d41665 --- /dev/null +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp @@ -0,0 +1,63 @@ +//===-- NVPTXMCAsmInfo.cpp - NVPTX asm properties -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the NVPTXMCAsmInfo properties. 
+// +//===----------------------------------------------------------------------===// + +#include "NVPTXMCAsmInfo.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +bool CompileForDebugging; + +// -debug-compile - Command line option to inform opt and llc passes to +// compile for debugging +static cl::opt<bool, true> +Debug("debug-compile", cl::desc("Compile for debugging"), cl::Hidden, + cl::location(CompileForDebugging), + cl::init(false)); + +void NVPTXMCAsmInfo::anchor() { } + +NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Target &T, const StringRef &TT) { + Triple TheTriple(TT); + if (TheTriple.getArch() == Triple::nvptx64) + PointerSize = 8; + + CommentString = "//"; + + PrivateGlobalPrefix = "$L__"; + + AllowPeriodsInName = false; + + HasSetDirective = false; + + HasSingleParameterDotFile = false; + + InlineAsmStart = " inline asm"; + InlineAsmEnd = " inline asm"; + + SupportsDebugInformation = CompileForDebugging; + HasDotTypeDotSizeDirective = false; + + Data8bitsDirective = " .b8 "; + Data16bitsDirective = " .b16 "; + Data32bitsDirective = " .b32 "; + Data64bitsDirective = " .b64 "; + PrivateGlobalPrefix = ""; + ZeroDirective = " .b8"; + AsciiDirective = " .b8"; + AscizDirective = " .b8"; + + // @TODO: Can we just disable this? + GlobalDirective = "\t// .globl\t"; +} diff --git a/lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h index 32ca069..82097da 100644 --- a/lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.h +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h @@ -1,4 +1,4 @@ -//===-- PTXMCAsmInfo.h - PTX asm properties --------------------*- C++ -*--===// +//===-- NVPTXMCAsmInfo.h - NVPTX asm properties ----------------*- C++ -*--===// // // The LLVM Compiler Infrastructure // @@ -7,24 +7,24 @@ // //===----------------------------------------------------------------------===// // -// This file contains the declaration of the PTXMCAsmInfo class. +// This file contains the declaration of the NVPTXMCAsmInfo class. // //===----------------------------------------------------------------------===// -#ifndef PTX_MCASM_INFO_H -#define PTX_MCASM_INFO_H +#ifndef NVPTX_MCASM_INFO_H +#define NVPTX_MCASM_INFO_H #include "llvm/MC/MCAsmInfo.h" namespace llvm { - class Target; - class StringRef; +class Target; +class StringRef; - class PTXMCAsmInfo : public MCAsmInfo { - virtual void anchor(); - public: - explicit PTXMCAsmInfo(const Target &T, const StringRef &TT); - }; +class NVPTXMCAsmInfo : public MCAsmInfo { + virtual void anchor(); +public: + explicit NVPTXMCAsmInfo(const Target &T, const StringRef &TT); +}; } // namespace llvm -#endif // PTX_MCASM_INFO_H +#endif // NVPTX_MCASM_INFO_H diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp new file mode 100644 index 0000000..44aa01c --- /dev/null +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp @@ -0,0 +1,91 @@ +//===-- NVPTXMCTargetDesc.cpp - NVPTX Target Descriptions -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides NVPTX specific target descriptions. 
+// +//===----------------------------------------------------------------------===// + +#include "NVPTXMCTargetDesc.h" +#include "NVPTXMCAsmInfo.h" +#include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/TargetRegistry.h" + +#define GET_INSTRINFO_MC_DESC +#include "NVPTXGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "NVPTXGenSubtargetInfo.inc" + +#define GET_REGINFO_MC_DESC +#include "NVPTXGenRegisterInfo.inc" + + +using namespace llvm; + +static MCInstrInfo *createNVPTXMCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitNVPTXMCInstrInfo(X); + return X; +} + +static MCRegisterInfo *createNVPTXMCRegisterInfo(StringRef TT) { + MCRegisterInfo *X = new MCRegisterInfo(); + // PTX does not have a return address register. + InitNVPTXMCRegisterInfo(X, 0); + return X; +} + +static MCSubtargetInfo *createNVPTXMCSubtargetInfo(StringRef TT, StringRef CPU, + StringRef FS) { + MCSubtargetInfo *X = new MCSubtargetInfo(); + InitNVPTXMCSubtargetInfo(X, TT, CPU, FS); + return X; +} + +static MCCodeGenInfo *createNVPTXMCCodeGenInfo(StringRef TT, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) { + MCCodeGenInfo *X = new MCCodeGenInfo(); + X->InitMCCodeGenInfo(RM, CM, OL); + return X; +} + + +// Force static initialization. +extern "C" void LLVMInitializeNVPTXTargetMC() { + // Register the MC asm info. + RegisterMCAsmInfo<NVPTXMCAsmInfo> X(TheNVPTXTarget32); + RegisterMCAsmInfo<NVPTXMCAsmInfo> Y(TheNVPTXTarget64); + + // Register the MC codegen info. + TargetRegistry::RegisterMCCodeGenInfo(TheNVPTXTarget32, + createNVPTXMCCodeGenInfo); + TargetRegistry::RegisterMCCodeGenInfo(TheNVPTXTarget64, + createNVPTXMCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(TheNVPTXTarget32, createNVPTXMCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(TheNVPTXTarget64, createNVPTXMCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(TheNVPTXTarget32, + createNVPTXMCRegisterInfo); + TargetRegistry::RegisterMCRegInfo(TheNVPTXTarget64, + createNVPTXMCRegisterInfo); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(TheNVPTXTarget32, + createNVPTXMCSubtargetInfo); + TargetRegistry::RegisterMCSubtargetInfo(TheNVPTXTarget64, + createNVPTXMCSubtargetInfo); + +} diff --git a/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h index 542638a..af95c76 100644 --- a/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.h +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h @@ -1,4 +1,4 @@ -//===-- PTXMCTargetDesc.h - PTX Target Descriptions ------------*- C++ -*-===// +//===-- NVPTXMCTargetDesc.h - NVPTX Target Descriptions ---------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -7,30 +7,30 @@ // //===----------------------------------------------------------------------===// // -// This file provides PTX specific target descriptions. +// This file provides NVPTX specific target descriptions. // //===----------------------------------------------------------------------===// -#ifndef PTXMCTARGETDESC_H -#define PTXMCTARGETDESC_H +#ifndef NVPTXMCTARGETDESC_H +#define NVPTXMCTARGETDESC_H namespace llvm { class Target; -extern Target ThePTX32Target; -extern Target ThePTX64Target; +extern Target TheNVPTXTarget32; +extern Target TheNVPTXTarget64; } // End llvm namespace // Defines symbolic names for PTX registers. 
#define GET_REGINFO_ENUM -#include "PTXGenRegisterInfo.inc" +#include "NVPTXGenRegisterInfo.inc" // Defines symbolic names for the PTX instructions. #define GET_INSTRINFO_ENUM -#include "PTXGenInstrInfo.inc" +#include "NVPTXGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM -#include "PTXGenSubtargetInfo.inc" +#include "NVPTXGenSubtargetInfo.inc" #endif diff --git a/lib/Target/PTX/Makefile b/lib/Target/NVPTX/Makefile index fa09634..8db20eb 100644 --- a/lib/Target/PTX/Makefile +++ b/lib/Target/NVPTX/Makefile @@ -1,4 +1,4 @@ -##===- lib/Target/PTX/Makefile -----------------------------*- Makefile -*-===## +##===- lib/Target/NVPTX/Makefile ---------------------------*- Makefile -*-===## # # The LLVM Compiler Infrastructure # @@ -8,15 +8,15 @@ ##===----------------------------------------------------------------------===## LEVEL = ../../.. -LIBRARYNAME = LLVMPTXCodeGen -TARGET = PTX +LIBRARYNAME = LLVMNVPTXCodeGen +TARGET = NVPTX # Make sure that tblgen is run, first thing. -BUILT_SOURCES = PTXGenAsmWriter.inc \ - PTXGenDAGISel.inc \ - PTXGenInstrInfo.inc \ - PTXGenRegisterInfo.inc \ - PTXGenSubtargetInfo.inc +BUILT_SOURCES = NVPTXGenAsmWriter.inc \ + NVPTXGenDAGISel.inc \ + NVPTXGenInstrInfo.inc \ + NVPTXGenRegisterInfo.inc \ + NVPTXGenSubtargetInfo.inc DIRS = InstPrinter TargetInfo MCTargetDesc diff --git a/lib/Target/NVPTX/ManagedStringPool.h b/lib/Target/NVPTX/ManagedStringPool.h new file mode 100644 index 0000000..b568488 --- /dev/null +++ b/lib/Target/NVPTX/ManagedStringPool.h @@ -0,0 +1,49 @@ +//===-- ManagedStringPool.h - Managed String Pool ---------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The strings allocated from a managed string pool are owned by the string +// pool and will be deleted together with the managed string pool. +// +//===----------------------------------------------------------------------===// + + +#ifndef LLVM_SUPPORT_MANAGED_STRING_H +#define LLVM_SUPPORT_MANAGED_STRING_H + +#include "llvm/ADT/SmallVector.h" +#include <string> + +namespace llvm { + +/// ManagedStringPool - The strings allocated from a managed string pool are +/// owned by the string pool and will be deleted together with the managed +/// string pool. +class ManagedStringPool { + SmallVector<std::string *, 8> Pool; + +public: + ManagedStringPool() {} + ~ManagedStringPool() { + SmallVector<std::string *, 8>::iterator Current = Pool.begin(); + while (Current != Pool.end()) { + delete *Current; + Current++; + } + } + + std::string *getManagedString(const char *S) { + std::string *Str = new std::string(S); + Pool.push_back(Str); + return Str; + } +}; + +} + +#endif diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h new file mode 100644 index 0000000..a8d082a --- /dev/null +++ b/lib/Target/NVPTX/NVPTX.h @@ -0,0 +1,137 @@ +//===-- NVPTX.h - Top-level interface for NVPTX representation --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in +// the LLVM NVPTX back-end. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_NVPTX_H +#define LLVM_TARGET_NVPTX_H + +#include "llvm/Value.h" +#include "llvm/Module.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetMachine.h" +#include "MCTargetDesc/NVPTXBaseInfo.h" +#include <cassert> +#include <iosfwd> + +namespace llvm { +class NVPTXTargetMachine; +class FunctionPass; +class formatted_raw_ostream; + +namespace NVPTXCC { +enum CondCodes { + EQ, + NE, + LT, + LE, + GT, + GE +}; +} + +inline static const char *NVPTXCondCodeToString(NVPTXCC::CondCodes CC) { + switch (CC) { + case NVPTXCC::NE: return "ne"; + case NVPTXCC::EQ: return "eq"; + case NVPTXCC::LT: return "lt"; + case NVPTXCC::LE: return "le"; + case NVPTXCC::GT: return "gt"; + case NVPTXCC::GE: return "ge"; + } + llvm_unreachable("Unknown condition code"); +} + +FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM, + llvm::CodeGenOpt::Level OptLevel); +FunctionPass *createVectorElementizePass(NVPTXTargetMachine &); +FunctionPass *createLowerStructArgsPass(NVPTXTargetMachine &); +FunctionPass *createNVPTXReMatPass(NVPTXTargetMachine &); +FunctionPass *createNVPTXReMatBlockPass(NVPTXTargetMachine &); + +bool isImageOrSamplerVal(const Value *, const Module *); + +extern Target TheNVPTXTarget32; +extern Target TheNVPTXTarget64; + +namespace NVPTX +{ +enum DrvInterface { + NVCL, + CUDA, + TEST +}; + +// A field inside TSFlags needs a shift and a mask. The usage is +// always as follows : +// ((TSFlags & fieldMask) >> fieldShift) +// The enum keeps the mask, the shift, and all valid values of the +// field in one place. +enum VecInstType { + VecInstTypeShift = 0, + VecInstTypeMask = 0xF, + + VecNOP = 0, + VecLoad = 1, + VecStore = 2, + VecBuild = 3, + VecShuffle = 4, + VecExtract = 5, + VecInsert = 6, + VecDest = 7, + VecOther = 15 +}; + +enum SimpleMove { + SimpleMoveMask = 0x10, + SimpleMoveShift = 4 +}; +enum LoadStore { + isLoadMask = 0x20, + isLoadShift = 5, + isStoreMask = 0x40, + isStoreShift = 6 +}; + +namespace PTXLdStInstCode { +enum AddressSpace{ + GENERIC = 0, + GLOBAL = 1, + CONSTANT = 2, + SHARED = 3, + PARAM = 4, + LOCAL = 5 +}; +enum FromType { + Unsigned = 0, + Signed, + Float +}; +enum VecType { + Scalar = 1, + V2 = 2, + V4 = 4 +}; +} +} +} // end namespace llvm; + +// Defines symbolic names for NVPTX registers. This defines a mapping from +// register name to register number. +#define GET_REGINFO_ENUM +#include "NVPTXGenRegisterInfo.inc" + +// Defines symbolic names for the NVPTX instructions. +#define GET_INSTRINFO_ENUM +#include "NVPTXGenInstrInfo.inc" + +#endif diff --git a/lib/Target/NVPTX/NVPTX.td b/lib/Target/NVPTX/NVPTX.td new file mode 100644 index 0000000..ae7710e --- /dev/null +++ b/lib/Target/NVPTX/NVPTX.td @@ -0,0 +1,44 @@ +//===- NVPTX.td - Describe the NVPTX Target Machine -----------*- tblgen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This is the top level entry point for the NVPTX target. 
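+// The single sm_10 processor defined below is what -mcpu selects once the
+// target is registered; assuming the nvptx/nvptx64 arch names wired up by
+// this patch's TargetInfo library, a typical invocation would be:
+//
+//   llc -march=nvptx64 -mcpu=sm_10 kernel.ll -o kernel.ptx
+//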
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+include "NVPTXRegisterInfo.td"
+include "NVPTXInstrInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features.
+// - We use the SM version number instead of an explicit feature table.
+// - We need at least one feature, to avoid TableGen generating a zero-sized
+//   array in NVPTXGenSubtarget.inc.
+//===----------------------------------------------------------------------===//
+def FeatureDummy : SubtargetFeature<"dummy", "dummy", "true", "">;
+
+//===----------------------------------------------------------------------===//
+// NVPTX supported processors.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+  : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"sm_10", [FeatureDummy]>;
+
+
+def NVPTXInstrInfo : InstrInfo {
+}
+
+def NVPTX : Target {
+  let InstructionSet = NVPTXInstrInfo;
+}
diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp b/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
new file mode 100644
index 0000000..668c393
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
@@ -0,0 +1,48 @@
+//===-- AllocaHoisting.cpp - Hoist allocas to the entry block --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Hoist the alloca instructions in the non-entry blocks to the entry blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Constants.h"
+#include "NVPTXAllocaHoisting.h"
+
+namespace llvm {
+
+bool NVPTXAllocaHoisting::runOnFunction(Function &function) {
+  bool functionModified = false;
+  Function::iterator I = function.begin();
+  TerminatorInst *firstTerminatorInst = (I++)->getTerminator();
+
+  for (Function::iterator E = function.end(); I != E; ++I) {
+    for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
+      AllocaInst *allocaInst = dyn_cast<AllocaInst>(BI++);
+      if (allocaInst && isa<ConstantInt>(allocaInst->getArraySize())) {
+        allocaInst->moveBefore(firstTerminatorInst);
+        functionModified = true;
+      }
+    }
+  }
+
+  return functionModified;
+}
+
+char NVPTXAllocaHoisting::ID = 1;
+RegisterPass<NVPTXAllocaHoisting> X("alloca-hoisting",
+                                    "Hoisting alloca instructions in non-entry "
+                                    "blocks to the entry block");
+
+FunctionPass *createAllocaHoisting() {
+  return new NVPTXAllocaHoisting();
+}
+
+} // end namespace llvm
diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.h b/lib/Target/NVPTX/NVPTXAllocaHoisting.h
new file mode 100644
index 0000000..24b3bd5
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.h
@@ -0,0 +1,49 @@
+//===-- AllocaHoisting.h - Hoist allocas to the entry block -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
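+//
+// For example, a constant-sized "%buf = alloca [16 x i32]" sitting in a
+// loop body is moved to just before the entry block's terminator by
+// runOnFunction in NVPTXAllocaHoisting.cpp; variable-length allocas are
+// deliberately left in place.
+//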
+// +//===----------------------------------------------------------------------===// +// +// Hoist the alloca instructions in the non-entry blocks to the entry blocks. +// +//===----------------------------------------------------------------------===// + +#ifndef NVPTX_ALLOCA_HOISTING_H_ +#define NVPTX_ALLOCA_HOISTING_H_ + +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetData.h" + +namespace llvm { + +class FunctionPass; +class Function; + +// Hoisting the alloca instructions in the non-entry blocks to the entry +// block. +class NVPTXAllocaHoisting : public FunctionPass { +public: + static char ID; // Pass ID + NVPTXAllocaHoisting() : FunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetData>(); + AU.addPreserved<MachineFunctionAnalysis>(); + } + + virtual const char *getPassName() const { + return "NVPTX specific alloca hoisting"; + } + + virtual bool runOnFunction(Function &function); +}; + +extern FunctionPass *createAllocaHoisting(); + +} // end namespace llvm + +#endif // NVPTX_ALLOCA_HOISTING_H_ diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp new file mode 100644 index 0000000..f2b9616 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -0,0 +1,2064 @@ +//===-- NVPTXAsmPrinter.cpp - NVPTX LLVM assembly writer ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to NVPTX assembly language. 
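+//
+// For orientation, the emitted text has the overall shape below (abridged;
+// the directives correspond to emitHeader and EmitFunctionEntryLabel in
+// this file, with "foo" a placeholder name):
+//
+//   .version 3.0
+//   .target sm_10
+//   .address_size 64
+//
+//   .entry foo ( .param .u64 foo_param_0 )
+//   {
+//     ...
+//   }
+//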
+// +//===----------------------------------------------------------------------===// + +#include "NVPTXAsmPrinter.h" +#include "NVPTX.h" +#include "NVPTXInstrInfo.h" +#include "NVPTXTargetMachine.h" +#include "NVPTXRegisterInfo.h" +#include "NVPTXUtilities.h" +#include "MCTargetDesc/NVPTXMCAsmInfo.h" +#include "NVPTXNumRegisters.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/DebugInfo.h" +#include "llvm/Function.h" +#include "llvm/GlobalVariable.h" +#include "llvm/Module.h" +#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Target/Mangler.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Support/TimeValue.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Support/Path.h" +#include "llvm/Assembly/Writer.h" +#include "cl_common_defines.h" +#include <sstream> +using namespace llvm; + + +#include "NVPTXGenAsmWriter.inc" + +bool RegAllocNilUsed = true; + +#define DEPOTNAME "__local_depot" + +static cl::opt<bool> +EmitLineNumbers("nvptx-emit-line-numbers", + cl::desc("NVPTX Specific: Emit Line numbers even without -G"), + cl::init(true)); + +namespace llvm { +bool InterleaveSrcInPtx = false; +} + +static cl::opt<bool, true>InterleaveSrc("nvptx-emit-src", + cl::ZeroOrMore, + cl::desc("NVPTX Specific: Emit source line in ptx file"), + cl::location(llvm::InterleaveSrcInPtx)); + + + + +// @TODO: This is a copy from AsmPrinter.cpp. The function is static, so we +// cannot just link to the existing version. +/// LowerConstant - Lower the specified LLVM Constant to an MCExpr. +/// +using namespace nvptx; +const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) { + MCContext &Ctx = AP.OutContext; + + if (CV->isNullValue() || isa<UndefValue>(CV)) + return MCConstantExpr::Create(0, Ctx); + + if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) + return MCConstantExpr::Create(CI->getZExtValue(), Ctx); + + if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV)) + return MCSymbolRefExpr::Create(AP.Mang->getSymbol(GV), Ctx); + + if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV)) + return MCSymbolRefExpr::Create(AP.GetBlockAddressSymbol(BA), Ctx); + + const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV); + if (CE == 0) + llvm_unreachable("Unknown constant value to lower!"); + + + switch (CE->getOpcode()) { + default: + // If the code isn't optimized, there may be outstanding folding + // opportunities. Attempt to fold the expression using TargetData as a + // last resort before giving up. + if (Constant *C = + ConstantFoldConstantExpression(CE, AP.TM.getTargetData())) + if (C != CE) + return LowerConstant(C, AP); + + // Otherwise report the problem to the user. + { + std::string S; + raw_string_ostream OS(S); + OS << "Unsupported expression in static initializer: "; + WriteAsOperand(OS, CE, /*PrintType=*/false, + !AP.MF ? 
0 : AP.MF->getFunction()->getParent()); + report_fatal_error(OS.str()); + } + case Instruction::GetElementPtr: { + const TargetData &TD = *AP.TM.getTargetData(); + // Generate a symbolic expression for the byte address + const Constant *PtrVal = CE->getOperand(0); + SmallVector<Value*, 8> IdxVec(CE->op_begin()+1, CE->op_end()); + int64_t Offset = TD.getIndexedOffset(PtrVal->getType(), IdxVec); + + const MCExpr *Base = LowerConstant(CE->getOperand(0), AP); + if (Offset == 0) + return Base; + + // Truncate/sext the offset to the pointer size. + if (TD.getPointerSizeInBits() != 64) { + int SExtAmount = 64-TD.getPointerSizeInBits(); + Offset = (Offset << SExtAmount) >> SExtAmount; + } + + return MCBinaryExpr::CreateAdd(Base, MCConstantExpr::Create(Offset, Ctx), + Ctx); + } + + case Instruction::Trunc: + // We emit the value and depend on the assembler to truncate the generated + // expression properly. This is important for differences between + // blockaddress labels. Since the two labels are in the same function, it + // is reasonable to treat their delta as a 32-bit value. + // FALL THROUGH. + case Instruction::BitCast: + return LowerConstant(CE->getOperand(0), AP); + + case Instruction::IntToPtr: { + const TargetData &TD = *AP.TM.getTargetData(); + // Handle casts to pointers by changing them into casts to the appropriate + // integer type. This promotes constant folding and simplifies this code. + Constant *Op = CE->getOperand(0); + Op = ConstantExpr::getIntegerCast(Op, TD.getIntPtrType(CV->getContext()), + false/*ZExt*/); + return LowerConstant(Op, AP); + } + + case Instruction::PtrToInt: { + const TargetData &TD = *AP.TM.getTargetData(); + // Support only foldable casts to/from pointers that can be eliminated by + // changing the pointer to the appropriately sized integer type. + Constant *Op = CE->getOperand(0); + Type *Ty = CE->getType(); + + const MCExpr *OpExpr = LowerConstant(Op, AP); + + // We can emit the pointer value into this slot if the slot is an + // integer slot equal to the size of the pointer. + if (TD.getTypeAllocSize(Ty) == TD.getTypeAllocSize(Op->getType())) + return OpExpr; + + // Otherwise the pointer is smaller than the resultant integer, mask off + // the high bits so we are sure to get a proper truncation if the input is + // a constant expr. + unsigned InBits = TD.getTypeAllocSizeInBits(Op->getType()); + const MCExpr *MaskExpr = MCConstantExpr::Create(~0ULL >> (64-InBits), Ctx); + return MCBinaryExpr::CreateAnd(OpExpr, MaskExpr, Ctx); + } + + // The MC library also has a right-shift operator, but it isn't consistently + // signed or unsigned between different targets. 
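+  // For example, an initializer written as
+  //   i64 add (i64 ptrtoint (i32* @gbl to i64), i64 4)
+  // reaches the Add case below and becomes MCBinaryExpr::CreateAdd of a
+  // symbol reference and a constant, which prints as "gbl+4".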
+ case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::Shl: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + const MCExpr *LHS = LowerConstant(CE->getOperand(0), AP); + const MCExpr *RHS = LowerConstant(CE->getOperand(1), AP); + switch (CE->getOpcode()) { + default: llvm_unreachable("Unknown binary operator constant cast expr"); + case Instruction::Add: return MCBinaryExpr::CreateAdd(LHS, RHS, Ctx); + case Instruction::Sub: return MCBinaryExpr::CreateSub(LHS, RHS, Ctx); + case Instruction::Mul: return MCBinaryExpr::CreateMul(LHS, RHS, Ctx); + case Instruction::SDiv: return MCBinaryExpr::CreateDiv(LHS, RHS, Ctx); + case Instruction::SRem: return MCBinaryExpr::CreateMod(LHS, RHS, Ctx); + case Instruction::Shl: return MCBinaryExpr::CreateShl(LHS, RHS, Ctx); + case Instruction::And: return MCBinaryExpr::CreateAnd(LHS, RHS, Ctx); + case Instruction::Or: return MCBinaryExpr::CreateOr (LHS, RHS, Ctx); + case Instruction::Xor: return MCBinaryExpr::CreateXor(LHS, RHS, Ctx); + } + } + } +} + + +void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) +{ + if (!EmitLineNumbers) + return; + if (ignoreLoc(MI)) + return; + + DebugLoc curLoc = MI.getDebugLoc(); + + if (prevDebugLoc.isUnknown() && curLoc.isUnknown()) + return; + + if (prevDebugLoc == curLoc) + return; + + prevDebugLoc = curLoc; + + if (curLoc.isUnknown()) + return; + + + const MachineFunction *MF = MI.getParent()->getParent(); + //const TargetMachine &TM = MF->getTarget(); + + const LLVMContext &ctx = MF->getFunction()->getContext(); + DIScope Scope(curLoc.getScope(ctx)); + + if (!Scope.Verify()) + return; + + StringRef fileName(Scope.getFilename()); + StringRef dirName(Scope.getDirectory()); + SmallString<128> FullPathName = dirName; + if (!dirName.empty() && !sys::path::is_absolute(fileName)) { + sys::path::append(FullPathName, fileName); + fileName = FullPathName.str(); + } + + if (filenameMap.find(fileName.str()) == filenameMap.end()) + return; + + + // Emit the line from the source file. 
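+  // Two things may be emitted here: the original source line interleaved
+  // as a comment (under -nvptx-emit-src), then the .loc directive built
+  // below, e.g. ".loc 2 42 7" for file index 2, line 42, column 7.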
+ if (llvm::InterleaveSrcInPtx) + this->emitSrcInText(fileName.str(), curLoc.getLine()); + + std::stringstream temp; + temp << "\t.loc " << filenameMap[fileName.str()] + << " " << curLoc.getLine() << " " << curLoc.getCol(); + OutStreamer.EmitRawText(Twine(temp.str().c_str())); +} + +void NVPTXAsmPrinter::EmitInstruction(const MachineInstr *MI) { + SmallString<128> Str; + raw_svector_ostream OS(Str); + if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA) + emitLineNumberAsDotLoc(*MI); + printInstruction(MI, OS); + OutStreamer.EmitRawText(OS.str()); +} + +void NVPTXAsmPrinter::printReturnValStr(const Function *F, + raw_ostream &O) +{ + const TargetData *TD = TM.getTargetData(); + const TargetLowering *TLI = TM.getTargetLowering(); + + Type *Ty = F->getReturnType(); + + bool isABI = (nvptxSubtarget.getSmVersion() >= 20); + + if (Ty->getTypeID() == Type::VoidTyID) + return; + + O << " ("; + + if (isABI) { + if (Ty->isPrimitiveType() || Ty->isIntegerTy()) { + unsigned size = 0; + if (const IntegerType *ITy = dyn_cast<IntegerType>(Ty)) { + size = ITy->getBitWidth(); + if (size < 32) size = 32; + } else { + assert(Ty->isFloatingPointTy() && + "Floating point type expected here"); + size = Ty->getPrimitiveSizeInBits(); + } + + O << ".param .b" << size << " func_retval0"; + } + else if (isa<PointerType>(Ty)) { + O << ".param .b" << TLI->getPointerTy().getSizeInBits() + << " func_retval0"; + } else { + if ((Ty->getTypeID() == Type::StructTyID) || + isa<VectorType>(Ty)) { + SmallVector<EVT, 16> vtparts; + ComputeValueVTs(*TLI, Ty, vtparts); + unsigned totalsz = 0; + for (unsigned i=0,e=vtparts.size(); i!=e; ++i) { + unsigned elems = 1; + EVT elemtype = vtparts[i]; + if (vtparts[i].isVector()) { + elems = vtparts[i].getVectorNumElements(); + elemtype = vtparts[i].getVectorElementType(); + } + for (unsigned j=0, je=elems; j!=je; ++j) { + unsigned sz = elemtype.getSizeInBits(); + if (elemtype.isInteger() && (sz < 8)) sz = 8; + totalsz += sz/8; + } + } + unsigned retAlignment = 0; + if (!llvm::getAlign(*F, 0, retAlignment)) + retAlignment = TD->getABITypeAlignment(Ty); + O << ".param .align " + << retAlignment + << " .b8 func_retval0[" + << totalsz << "]"; + } else + assert(false && + "Unknown return type"); + } + } else { + SmallVector<EVT, 16> vtparts; + ComputeValueVTs(*TLI, Ty, vtparts); + unsigned idx = 0; + for (unsigned i=0,e=vtparts.size(); i!=e; ++i) { + unsigned elems = 1; + EVT elemtype = vtparts[i]; + if (vtparts[i].isVector()) { + elems = vtparts[i].getVectorNumElements(); + elemtype = vtparts[i].getVectorElementType(); + } + + for (unsigned j=0, je=elems; j!=je; ++j) { + unsigned sz = elemtype.getSizeInBits(); + if (elemtype.isInteger() && (sz < 32)) sz = 32; + O << ".reg .b" << sz << " func_retval" << idx; + if (j<je-1) O << ", "; + ++idx; + } + if (i < e-1) + O << ", "; + } + } + O << ") "; + return; +} + +void NVPTXAsmPrinter::printReturnValStr(const MachineFunction &MF, + raw_ostream &O) { + const Function *F = MF.getFunction(); + printReturnValStr(F, O); +} + +void NVPTXAsmPrinter::EmitFunctionEntryLabel() { + SmallString<128> Str; + raw_svector_ostream O(Str); + + // Set up + MRI = &MF->getRegInfo(); + F = MF->getFunction(); + emitLinkageDirective(F,O); + if (llvm::isKernelFunction(*F)) + O << ".entry "; + else { + O << ".func "; + printReturnValStr(*MF, O); + } + + O << *CurrentFnSym; + + emitFunctionParamList(*MF, O); + + if (llvm::isKernelFunction(*F)) + emitKernelFunctionDirectives(*F, O); + + OutStreamer.EmitRawText(O.str()); + + prevDebugLoc = DebugLoc(); +} + +void 
NVPTXAsmPrinter::EmitFunctionBodyStart() { + const TargetRegisterInfo &TRI = *TM.getRegisterInfo(); + unsigned numRegClasses = TRI.getNumRegClasses(); + VRidGlobal2LocalMap = new std::map<unsigned, unsigned>[numRegClasses+1]; + OutStreamer.EmitRawText(StringRef("{\n")); + setAndEmitFunctionVirtualRegisters(*MF); + + SmallString<128> Str; + raw_svector_ostream O(Str); + emitDemotedVars(MF->getFunction(), O); + OutStreamer.EmitRawText(O.str()); +} + +void NVPTXAsmPrinter::EmitFunctionBodyEnd() { + OutStreamer.EmitRawText(StringRef("}\n")); + delete []VRidGlobal2LocalMap; +} + + +void +NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function& F, + raw_ostream &O) const { + // If the NVVM IR has some of reqntid* specified, then output + // the reqntid directive, and set the unspecified ones to 1. + // If none of reqntid* is specified, don't output reqntid directive. + unsigned reqntidx, reqntidy, reqntidz; + bool specified = false; + if (llvm::getReqNTIDx(F, reqntidx) == false) reqntidx = 1; + else specified = true; + if (llvm::getReqNTIDy(F, reqntidy) == false) reqntidy = 1; + else specified = true; + if (llvm::getReqNTIDz(F, reqntidz) == false) reqntidz = 1; + else specified = true; + + if (specified) + O << ".reqntid " << reqntidx << ", " + << reqntidy << ", " << reqntidz << "\n"; + + // If the NVVM IR has some of maxntid* specified, then output + // the maxntid directive, and set the unspecified ones to 1. + // If none of maxntid* is specified, don't output maxntid directive. + unsigned maxntidx, maxntidy, maxntidz; + specified = false; + if (llvm::getMaxNTIDx(F, maxntidx) == false) maxntidx = 1; + else specified = true; + if (llvm::getMaxNTIDy(F, maxntidy) == false) maxntidy = 1; + else specified = true; + if (llvm::getMaxNTIDz(F, maxntidz) == false) maxntidz = 1; + else specified = true; + + if (specified) + O << ".maxntid " << maxntidx << ", " + << maxntidy << ", " << maxntidz << "\n"; + + unsigned mincta; + if (llvm::getMinCTASm(F, mincta)) + O << ".minnctapersm " << mincta << "\n"; +} + +void +NVPTXAsmPrinter::getVirtualRegisterName(unsigned vr, bool isVec, + raw_ostream &O) { + const TargetRegisterClass * RC = MRI->getRegClass(vr); + unsigned id = RC->getID(); + + std::map<unsigned, unsigned> ®map = VRidGlobal2LocalMap[id]; + unsigned mapped_vr = regmap[vr]; + + if (!isVec) { + O << getNVPTXRegClassStr(RC) << mapped_vr; + return; + } + // Vector virtual register + if (getNVPTXVectorSize(RC) == 4) + O << "{" + << getNVPTXRegClassStr(RC) << mapped_vr << "_0, " + << getNVPTXRegClassStr(RC) << mapped_vr << "_1, " + << getNVPTXRegClassStr(RC) << mapped_vr << "_2, " + << getNVPTXRegClassStr(RC) << mapped_vr << "_3" + << "}"; + else if (getNVPTXVectorSize(RC) == 2) + O << "{" + << getNVPTXRegClassStr(RC) << mapped_vr << "_0, " + << getNVPTXRegClassStr(RC) << mapped_vr << "_1" + << "}"; + else + llvm_unreachable("Unsupported vector size"); +} + +void +NVPTXAsmPrinter::emitVirtualRegister(unsigned int vr, bool isVec, + raw_ostream &O) { + getVirtualRegisterName(vr, isVec, O); +} + +void NVPTXAsmPrinter::printVecModifiedImmediate(const MachineOperand &MO, + const char *Modifier, + raw_ostream &O) { + static const char vecelem[] = {'0', '1', '2', '3', '0', '1', '2', '3'}; + int Imm = (int)MO.getImm(); + if(0 == strcmp(Modifier, "vecelem")) + O << "_" << vecelem[Imm]; + else if(0 == strcmp(Modifier, "vecv4comm1")) { + if((Imm < 0) || (Imm > 3)) + O << "//"; + } + else if(0 == strcmp(Modifier, "vecv4comm2")) { + if((Imm < 4) || (Imm > 7)) + O << "//"; + } + else if(0 == strcmp(Modifier, 
"vecv4pos")) { + if(Imm < 0) Imm = 0; + O << "_" << vecelem[Imm%4]; + } + else if(0 == strcmp(Modifier, "vecv2comm1")) { + if((Imm < 0) || (Imm > 1)) + O << "//"; + } + else if(0 == strcmp(Modifier, "vecv2comm2")) { + if((Imm < 2) || (Imm > 3)) + O << "//"; + } + else if(0 == strcmp(Modifier, "vecv2pos")) { + if(Imm < 0) Imm = 0; + O << "_" << vecelem[Imm%2]; + } + else + llvm_unreachable("Unknown Modifier on immediate operand"); +} + +void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum, + raw_ostream &O, const char *Modifier) { + const MachineOperand &MO = MI->getOperand(opNum); + switch (MO.getType()) { + case MachineOperand::MO_Register: + if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { + if (MO.getReg() == NVPTX::VRDepot) + O << DEPOTNAME << getFunctionNumber(); + else + O << getRegisterName(MO.getReg()); + } else { + if (!Modifier) + emitVirtualRegister(MO.getReg(), false, O); + else { + if (strcmp(Modifier, "vecfull") == 0) + emitVirtualRegister(MO.getReg(), true, O); + else + llvm_unreachable( + "Don't know how to handle the modifier on virtual register."); + } + } + return; + + case MachineOperand::MO_Immediate: + if (!Modifier) + O << MO.getImm(); + else if (strstr(Modifier, "vec") == Modifier) + printVecModifiedImmediate(MO, Modifier, O); + else + llvm_unreachable("Don't know how to handle modifier on immediate operand"); + return; + + case MachineOperand::MO_FPImmediate: + printFPConstant(MO.getFPImm(), O); + break; + + case MachineOperand::MO_GlobalAddress: + O << *Mang->getSymbol(MO.getGlobal()); + break; + + case MachineOperand::MO_ExternalSymbol: { + const char * symbname = MO.getSymbolName(); + if (strstr(symbname, ".PARAM") == symbname) { + unsigned index; + sscanf(symbname+6, "%u[];", &index); + printParamName(index, O); + } + else if (strstr(symbname, ".HLPPARAM") == symbname) { + unsigned index; + sscanf(symbname+9, "%u[];", &index); + O << *CurrentFnSym << "_param_" << index << "_offset"; + } + else + O << symbname; + break; + } + + case MachineOperand::MO_MachineBasicBlock: + O << *MO.getMBB()->getSymbol(); + return; + + default: + llvm_unreachable("Operand type not supported."); + } +} + +void NVPTXAsmPrinter:: +printImplicitDef(const MachineInstr *MI, raw_ostream &O) const { +#ifndef __OPTIMIZE__ + O << "\t// Implicit def :"; + //printOperand(MI, 0); + O << "\n"; +#endif +} + +void NVPTXAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum, + raw_ostream &O, const char *Modifier) { + printOperand(MI, opNum, O); + + if (Modifier && !strcmp(Modifier, "add")) { + O << ", "; + printOperand(MI, opNum+1, O); + } else { + if (MI->getOperand(opNum+1).isImm() && + MI->getOperand(opNum+1).getImm() == 0) + return; // don't print ',0' or '+0' + O << "+"; + printOperand(MI, opNum+1, O); + } +} + +void NVPTXAsmPrinter::printLdStCode(const MachineInstr *MI, int opNum, + raw_ostream &O, const char *Modifier) +{ + if (Modifier) { + const MachineOperand &MO = MI->getOperand(opNum); + int Imm = (int)MO.getImm(); + if (!strcmp(Modifier, "volatile")) { + if (Imm) + O << ".volatile"; + } else if (!strcmp(Modifier, "addsp")) { + switch (Imm) { + case NVPTX::PTXLdStInstCode::GLOBAL: O << ".global"; break; + case NVPTX::PTXLdStInstCode::SHARED: O << ".shared"; break; + case NVPTX::PTXLdStInstCode::LOCAL: O << ".local"; break; + case NVPTX::PTXLdStInstCode::PARAM: O << ".param"; break; + case NVPTX::PTXLdStInstCode::CONSTANT: O << ".const"; break; + case NVPTX::PTXLdStInstCode::GENERIC: + if (!nvptxSubtarget.hasGenericLdSt()) + O << ".global"; + break; + 
default:
+        llvm_unreachable("wrong value");
+      }
+    }
+    else if (!strcmp(Modifier, "sign")) {
+      if (Imm==NVPTX::PTXLdStInstCode::Signed)
+        O << "s";
+      else if (Imm==NVPTX::PTXLdStInstCode::Unsigned)
+        O << "u";
+      else
+        O << "f";
+    }
+    else if (!strcmp(Modifier, "vec")) {
+      if (Imm==NVPTX::PTXLdStInstCode::V2)
+        O << ".v2";
+      else if (Imm==NVPTX::PTXLdStInstCode::V4)
+        O << ".v4";
+    }
+    else
+      llvm_unreachable("unknown modifier");
+  }
+  else
+    llvm_unreachable("unknown modifier");
+}
+
+void NVPTXAsmPrinter::emitDeclaration (const Function *F, raw_ostream &O) {
+
+  emitLinkageDirective(F,O);
+  if (llvm::isKernelFunction(*F))
+    O << ".entry ";
+  else
+    O << ".func ";
+  printReturnValStr(F, O);
+  O << *CurrentFnSym << "\n";
+  emitFunctionParamList(F, O);
+  O << ";\n";
+}
+
+static bool usedInGlobalVarDef(const Constant *C)
+{
+  if (!C)
+    return false;
+
+  if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+    if (GV->getName().str() == "llvm.used")
+      return false;
+    return true;
+  }
+
+  for (Value::const_use_iterator ui=C->use_begin(), ue=C->use_end();
+      ui!=ue; ++ui) {
+    const Constant *C = dyn_cast<Constant>(*ui);
+    if (usedInGlobalVarDef(C))
+      return true;
+  }
+  return false;
+}
+
+static bool usedInOneFunc(const User *U, Function const *&oneFunc)
+{
+  if (const GlobalVariable *othergv = dyn_cast<GlobalVariable>(U)) {
+    if (othergv->getName().str() == "llvm.used")
+      return true;
+  }
+
+  if (const Instruction *instr = dyn_cast<Instruction>(U)) {
+    if (instr->getParent() && instr->getParent()->getParent()) {
+      const Function *curFunc = instr->getParent()->getParent();
+      if (oneFunc && (curFunc != oneFunc))
+        return false;
+      oneFunc = curFunc;
+      return true;
+    }
+    else
+      return false;
+  }
+
+  if (const MDNode *md = dyn_cast<MDNode>(U))
+    if (md->hasName() && ((md->getName().str() == "llvm.dbg.gv") ||
+        (md->getName().str() == "llvm.dbg.sp")))
+      return true;
+
+
+  for (User::const_use_iterator ui=U->use_begin(), ue=U->use_end();
+      ui!=ue; ++ui) {
+    if (usedInOneFunc(*ui, oneFunc) == false)
+      return false;
+  }
+  return true;
+}
+
+/* Find out if a global variable can be demoted to local scope.
+ * Currently, this is valid for CUDA shared variables, which have local
+ * scope and global lifetime. So the conditions to check are:
+ * 1. Is the global variable in shared address space?
+ * 2. Does it have internal linkage?
+ * 3. Is the global variable referenced only in one function?
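+ * For example, a CUDA __shared__ array declared at module scope but
+ * referenced by a single kernel passes all three checks;
+ * printModuleLevelGV() then parks it in localDecls and emitDemotedVars()
+ * prints it inside that kernel's body.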
+ */ +static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) { + if (gv->hasInternalLinkage() == false) + return false; + const PointerType *Pty = gv->getType(); + if (Pty->getAddressSpace() != llvm::ADDRESS_SPACE_SHARED) + return false; + + const Function *oneFunc = 0; + + bool flag = usedInOneFunc(gv, oneFunc); + if (flag == false) + return false; + if (!oneFunc) + return false; + f = oneFunc; + return true; +} + +static bool useFuncSeen(const Constant *C, + llvm::DenseMap<const Function *, bool> &seenMap) { + for (Value::const_use_iterator ui=C->use_begin(), ue=C->use_end(); + ui!=ue; ++ui) { + if (const Constant *cu = dyn_cast<Constant>(*ui)) { + if (useFuncSeen(cu, seenMap)) + return true; + } else if (const Instruction *I = dyn_cast<Instruction>(*ui)) { + const BasicBlock *bb = I->getParent(); + if (!bb) continue; + const Function *caller = bb->getParent(); + if (!caller) continue; + if (seenMap.find(caller) != seenMap.end()) + return true; + } + } + return false; +} + +void NVPTXAsmPrinter::emitDeclarations (Module &M, raw_ostream &O) { + llvm::DenseMap<const Function *, bool> seenMap; + for (Module::const_iterator FI=M.begin(), FE=M.end(); + FI!=FE; ++FI) { + const Function *F = FI; + + if (F->isDeclaration()) { + if (F->use_empty()) + continue; + if (F->getIntrinsicID()) + continue; + CurrentFnSym = Mang->getSymbol(F); + emitDeclaration(F, O); + continue; + } + for (Value::const_use_iterator iter=F->use_begin(), + iterEnd=F->use_end(); iter!=iterEnd; ++iter) { + if (const Constant *C = dyn_cast<Constant>(*iter)) { + if (usedInGlobalVarDef(C)) { + // The use is in the initialization of a global variable + // that is a function pointer, so print a declaration + // for the original function + CurrentFnSym = Mang->getSymbol(F); + emitDeclaration(F, O); + break; + } + // Emit a declaration of this function if the function that + // uses this constant expr has already been seen. + if (useFuncSeen(C, seenMap)) { + CurrentFnSym = Mang->getSymbol(F); + emitDeclaration(F, O); + break; + } + } + + if (!isa<Instruction>(*iter)) continue; + const Instruction *instr = cast<Instruction>(*iter); + const BasicBlock *bb = instr->getParent(); + if (!bb) continue; + const Function *caller = bb->getParent(); + if (!caller) continue; + + // If a caller has already been seen, then the caller is + // appearing in the module before the callee. so print out + // a declaration for the callee. 
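+      // e.g. if callee foo() is defined after caller bar() in the module,
+      // a ".func foo(...);" prototype is printed ahead of bar's body,
+      // since PTX requires symbols to be declared before they are used.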
+ if (seenMap.find(caller) != seenMap.end()) { + CurrentFnSym = Mang->getSymbol(F); + emitDeclaration(F, O); + break; + } + } + seenMap[F] = true; + } +} + +void NVPTXAsmPrinter::recordAndEmitFilenames(Module &M) { + DebugInfoFinder DbgFinder; + DbgFinder.processModule(M); + + unsigned i=1; + for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(), + E = DbgFinder.compile_unit_end(); I != E; ++I) { + DICompileUnit DIUnit(*I); + StringRef Filename(DIUnit.getFilename()); + StringRef Dirname(DIUnit.getDirectory()); + SmallString<128> FullPathName = Dirname; + if (!Dirname.empty() && !sys::path::is_absolute(Filename)) { + sys::path::append(FullPathName, Filename); + Filename = FullPathName.str(); + } + if (filenameMap.find(Filename.str()) != filenameMap.end()) + continue; + filenameMap[Filename.str()] = i; + OutStreamer.EmitDwarfFileDirective(i, "", Filename.str()); + ++i; + } + + for (DebugInfoFinder::iterator I = DbgFinder.subprogram_begin(), + E = DbgFinder.subprogram_end(); I != E; ++I) { + DISubprogram SP(*I); + StringRef Filename(SP.getFilename()); + StringRef Dirname(SP.getDirectory()); + SmallString<128> FullPathName = Dirname; + if (!Dirname.empty() && !sys::path::is_absolute(Filename)) { + sys::path::append(FullPathName, Filename); + Filename = FullPathName.str(); + } + if (filenameMap.find(Filename.str()) != filenameMap.end()) + continue; + filenameMap[Filename.str()] = i; + ++i; + } +} + +bool NVPTXAsmPrinter::doInitialization (Module &M) { + + SmallString<128> Str1; + raw_svector_ostream OS1(Str1); + + MMI = getAnalysisIfAvailable<MachineModuleInfo>(); + MMI->AnalyzeModule(M); + + // We need to call the parent's one explicitly. + //bool Result = AsmPrinter::doInitialization(M); + + // Initialize TargetLoweringObjectFile. + const_cast<TargetLoweringObjectFile&>(getObjFileLowering()) + .Initialize(OutContext, TM); + + Mang = new Mangler(OutContext, *TM.getTargetData()); + + // Emit header before any dwarf directives are emitted below. + emitHeader(M, OS1); + OutStreamer.EmitRawText(OS1.str()); + + + // Already commented out + //bool Result = AsmPrinter::doInitialization(M); + + + if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA) + recordAndEmitFilenames(M); + + SmallString<128> Str2; + raw_svector_ostream OS2(Str2); + + emitDeclarations(M, OS2); + + // Print out module-level global variables here. + for (Module::global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) + printModuleLevelGV(I, OS2); + + OS2 << '\n'; + + OutStreamer.EmitRawText(OS2.str()); + return false; // success +} + +void NVPTXAsmPrinter::emitHeader (Module &M, raw_ostream &O) { + O << "//\n"; + O << "// Generated by LLVM NVPTX Back-End\n"; + O << "//\n"; + O << "\n"; + + O << ".version 3.0\n"; + + O << ".target "; + O << nvptxSubtarget.getTargetName(); + + if (nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) + O << ", texmode_independent"; + if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA) { + if (!nvptxSubtarget.hasDouble()) + O << ", map_f64_to_f32"; + } + + if (MAI->doesSupportDebugInformation()) + O << ", debug"; + + O << "\n"; + + O << ".address_size "; + if (nvptxSubtarget.is64Bit()) + O << "64"; + else + O << "32"; + O << "\n"; + + O << "\n"; +} + +bool NVPTXAsmPrinter::doFinalization(Module &M) { + // XXX Temproarily remove global variables so that doFinalization() will not + // emit them again (global variables are emitted at beginning). 
+ + Module::GlobalListType &global_list = M.getGlobalList(); + int i, n = global_list.size(); + GlobalVariable **gv_array = new GlobalVariable* [n]; + + // first, back-up GlobalVariable in gv_array + i = 0; + for (Module::global_iterator I = global_list.begin(), E = global_list.end(); + I != E; ++I) + gv_array[i++] = &*I; + + // second, empty global_list + while (!global_list.empty()) + global_list.remove(global_list.begin()); + + // call doFinalization + bool ret = AsmPrinter::doFinalization(M); + + // now we restore global variables + for (i = 0; i < n; i ++) + global_list.insert(global_list.end(), gv_array[i]); + + delete[] gv_array; + return ret; + + + //bool Result = AsmPrinter::doFinalization(M); + // Instead of calling the parents doFinalization, we may + // clone parents doFinalization and customize here. + // Currently, we if NVISA out the EmitGlobals() in + // parent's doFinalization, which is too intrusive. + // + // Same for the doInitialization. + //return Result; +} + +// This function emits appropriate linkage directives for +// functions and global variables. +// +// extern function declaration -> .extern +// extern function definition -> .visible +// external global variable with init -> .visible +// external without init -> .extern +// appending -> not allowed, assert. + +void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue* V, raw_ostream &O) +{ + if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA) { + if (V->hasExternalLinkage()) { + if (isa<GlobalVariable>(V)) { + const GlobalVariable *GVar = cast<GlobalVariable>(V); + if (GVar) { + if (GVar->hasInitializer()) + O << ".visible "; + else + O << ".extern "; + } + } else if (V->isDeclaration()) + O << ".extern "; + else + O << ".visible "; + } else if (V->hasAppendingLinkage()) { + std::string msg; + msg.append("Error: "); + msg.append("Symbol "); + if (V->hasName()) + msg.append(V->getName().str()); + msg.append("has unsupported appending linkage type"); + llvm_unreachable(msg.c_str()); + } + } +} + + +void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable* GVar, raw_ostream &O, + bool processDemoted) { + + // Skip meta data + if (GVar->hasSection()) { + if (GVar->getSection() == "llvm.metadata") + return; + } + + const TargetData *TD = TM.getTargetData(); + + // GlobalVariables are always constant pointers themselves. + const PointerType *PTy = GVar->getType(); + Type *ETy = PTy->getElementType(); + + if (GVar->hasExternalLinkage()) { + if (GVar->hasInitializer()) + O << ".visible "; + else + O << ".extern "; + } + + if (llvm::isTexture(*GVar)) { + O << ".global .texref " << llvm::getTextureName(*GVar) << ";\n"; + return; + } + + if (llvm::isSurface(*GVar)) { + O << ".global .surfref " << llvm::getSurfaceName(*GVar) << ";\n"; + return; + } + + if (GVar->isDeclaration()) { + // (extern) declarations, no definition or initializer + // Currently the only known declaration is for an automatic __local + // (.shared) promoted to global. 
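+    // e.g. (illustrative) an extern 256-byte shared array comes out as:
+    //
+    //   .extern .shared .align 4 .b8 buf[256];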
+  emitPTXGlobalVariable(GVar, O);
+  O << ";\n";
+  return;
+  }
+
+  if (llvm::isSampler(*GVar)) {
+    O << ".global .samplerref " << llvm::getSamplerName(*GVar);
+
+    Constant *Initializer = NULL;
+    if (GVar->hasInitializer())
+      Initializer = GVar->getInitializer();
+    ConstantInt *CI = NULL;
+    if (Initializer)
+      CI = dyn_cast<ConstantInt>(Initializer);
+    if (CI) {
+      unsigned sample = CI->getZExtValue();
+
+      O << " = { ";
+
+      for (int i = 0, addr = ((sample & __CLK_ADDRESS_MASK) >>
+          __CLK_ADDRESS_BASE); i < 3; i++) {
+        O << "addr_mode_" << i << " = ";
+        switch (addr) {
+        case 0: O << "wrap"; break;
+        case 1: O << "clamp_to_border"; break;
+        case 2: O << "clamp_to_edge"; break;
+        case 3: O << "wrap"; break;
+        case 4: O << "mirror"; break;
+        }
+        O << ", ";
+      }
+      O << "filter_mode = ";
+      switch ((sample & __CLK_FILTER_MASK) >> __CLK_FILTER_BASE) {
+      case 0: O << "nearest"; break;
+      case 1: O << "linear"; break;
+      case 2: assert(0 && "Anisotropic filtering is not supported");
+      default: O << "nearest"; break;
+      }
+      if (!((sample & __CLK_NORMALIZED_MASK) >> __CLK_NORMALIZED_BASE)) {
+        O << ", force_unnormalized_coords = 1";
+      }
+      O << " }";
+    }
+
+    O << ";\n";
+    return;
+  }
+
+  if (GVar->hasPrivateLinkage()) {
+
+    if (!strncmp(GVar->getName().data(), "unrollpragma", 12))
+      return;
+
+    // FIXME - need better way (e.g. Metadata) to avoid generating this global
+    if (!strncmp(GVar->getName().data(), "filename", 8))
+      return;
+    if (GVar->use_empty())
+      return;
+  }
+
+  const Function *demotedFunc = 0;
+  if (!processDemoted && canDemoteGlobalVar(GVar, demotedFunc)) {
+    O << "// " << GVar->getName().str() << " has been demoted\n";
+    if (localDecls.find(demotedFunc) != localDecls.end())
+      localDecls[demotedFunc].push_back(GVar);
+    else {
+      std::vector<GlobalVariable *> temp;
+      temp.push_back(GVar);
+      localDecls[demotedFunc] = temp;
+    }
+    return;
+  }
+
+  O << ".";
+  emitPTXAddressSpace(PTy->getAddressSpace(), O);
+  if (GVar->getAlignment() == 0)
+    O << " .align " << (int) TD->getPrefTypeAlignment(ETy);
+  else
+    O << " .align " << GVar->getAlignment();
+
+
+  if (ETy->isPrimitiveType() || ETy->isIntegerTy() || isa<PointerType>(ETy)) {
+    O << " .";
+    O << getPTXFundamentalTypeStr(ETy, false);
+    O << " ";
+    O << *Mang->getSymbol(GVar);
+
+    // PTX allows variable initialization only for constant and global state
+    // spaces.
+    if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
+        (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST_NOT_GEN) ||
+        (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST))
+        && GVar->hasInitializer()) {
+      Constant *Initializer = GVar->getInitializer();
+      if (!Initializer->isNullValue()) {
+        O << " = ";
+        printScalarConstant(Initializer, O);
+      }
+    }
+  } else {
+    unsigned int ElementSize = 0;
+
+    // Although PTX has direct support for struct and array types, and LLVM IR
+    // is very similar to PTX, LLVM CodeGen does not support these high-level
+    // field accesses; structs, arrays and vectors are lowered into arrays of
+    // bytes.
+    switch (ETy->getTypeID()) {
+    case Type::StructTyID:
+    case Type::ArrayTyID:
+    case Type::VectorTyID:
+      ElementSize = TD->getTypeStoreSize(ETy);
+      // PTX allows variable initialization only for constant and
+      // global state spaces.
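+      // e.g. (illustrative, little-endian bytes) a two-element i32 array
+      // {1, 2} in global space prints as:
+      //
+      //   .global .align 4 .b8 arr[8] = {1, 0, 0, 0, 2, 0, 0, 0};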
+ if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) || + (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST_NOT_GEN) || + (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) + && GVar->hasInitializer()) { + Constant *Initializer = GVar->getInitializer(); + if (!isa<UndefValue>(Initializer) && + !Initializer->isNullValue()) { + AggBuffer aggBuffer(ElementSize, O, *this); + bufferAggregateConstant(Initializer, &aggBuffer); + if (aggBuffer.numSymbols) { + if (nvptxSubtarget.is64Bit()) { + O << " .u64 " << *Mang->getSymbol(GVar) <<"[" ; + O << ElementSize/8; + } + else { + O << " .u32 " << *Mang->getSymbol(GVar) <<"[" ; + O << ElementSize/4; + } + O << "]"; + } + else { + O << " .b8 " << *Mang->getSymbol(GVar) <<"[" ; + O << ElementSize; + O << "]"; + } + O << " = {" ; + aggBuffer.print(); + O << "}"; + } + else { + O << " .b8 " << *Mang->getSymbol(GVar) ; + if (ElementSize) { + O <<"[" ; + O << ElementSize; + O << "]"; + } + } + } + else { + O << " .b8 " << *Mang->getSymbol(GVar); + if (ElementSize) { + O <<"[" ; + O << ElementSize; + O << "]"; + } + } + break; + default: + assert( 0 && "type not supported yet"); + } + + } + O << ";\n"; +} + +void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) { + if (localDecls.find(f) == localDecls.end()) + return; + + std::vector<GlobalVariable *> &gvars = localDecls[f]; + + for (unsigned i=0, e=gvars.size(); i!=e; ++i) { + O << "\t// demoted variable\n\t"; + printModuleLevelGV(gvars[i], O, true); + } +} + +void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace, + raw_ostream &O) const { + switch (AddressSpace) { + case llvm::ADDRESS_SPACE_LOCAL: + O << "local" ; + break; + case llvm::ADDRESS_SPACE_GLOBAL: + O << "global" ; + break; + case llvm::ADDRESS_SPACE_CONST: + // This logic should be consistent with that in + // getCodeAddrSpace() (NVPTXISelDATToDAT.cpp) + if (nvptxSubtarget.hasGenericLdSt()) + O << "global" ; + else + O << "const" ; + break; + case llvm::ADDRESS_SPACE_CONST_NOT_GEN: + O << "const" ; + break; + case llvm::ADDRESS_SPACE_SHARED: + O << "shared" ; + break; + default: + llvm_unreachable("unexpected address space"); + } +} + +std::string NVPTXAsmPrinter::getPTXFundamentalTypeStr(const Type *Ty, + bool useB4PTR) const { + switch (Ty->getTypeID()) { + default: + llvm_unreachable("unexpected type"); + break; + case Type::IntegerTyID: { + unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth(); + if (NumBits == 1) + return "pred"; + else if (NumBits <= 64) { + std::string name = "u"; + return name + utostr(NumBits); + } else { + llvm_unreachable("Integer too large"); + break; + } + break; + } + case Type::FloatTyID: + return "f32"; + case Type::DoubleTyID: + return "f64"; + case Type::PointerTyID: + if (nvptxSubtarget.is64Bit()) + if (useB4PTR) return "b64"; + else return "u64"; + else + if (useB4PTR) return "b32"; + else return "u32"; + } + llvm_unreachable("unexpected type"); + return NULL; +} + +void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable* GVar, + raw_ostream &O) { + + const TargetData *TD = TM.getTargetData(); + + // GlobalVariables are always constant pointers themselves. 
+  const PointerType *PTy = GVar->getType();
+  Type *ETy = PTy->getElementType();
+
+  O << ".";
+  emitPTXAddressSpace(PTy->getAddressSpace(), O);
+  if (GVar->getAlignment() == 0)
+    O << " .align " << (int) TD->getPrefTypeAlignment(ETy);
+  else
+    O << " .align " << GVar->getAlignment();
+
+  if (ETy->isPrimitiveType() || ETy->isIntegerTy() || isa<PointerType>(ETy)) {
+    O << " .";
+    O << getPTXFundamentalTypeStr(ETy);
+    O << " ";
+    O << *Mang->getSymbol(GVar);
+    return;
+  }
+
+  int64_t ElementSize = 0;
+
+  // Although PTX has direct support for struct and array types, and LLVM IR
+  // is very similar to PTX, LLVM CodeGen does not support these high-level
+  // field accesses; structs and arrays are lowered into arrays of bytes.
+  switch (ETy->getTypeID()) {
+  case Type::StructTyID:
+  case Type::ArrayTyID:
+  case Type::VectorTyID:
+    ElementSize = TD->getTypeStoreSize(ETy);
+    O << " .b8 " << *Mang->getSymbol(GVar) << "[";
+    if (ElementSize) {
+      O << itostr(ElementSize);
+    }
+    O << "]";
+    break;
+  default:
+    assert( 0 && "type not supported yet");
+  }
+  return;
+}
+
+
+static unsigned int
+getOpenCLAlignment(const TargetData *TD,
+                   Type *Ty) {
+  if (Ty->isPrimitiveType() || Ty->isIntegerTy() || isa<PointerType>(Ty))
+    return TD->getPrefTypeAlignment(Ty);
+
+  const ArrayType *ATy = dyn_cast<ArrayType>(Ty);
+  if (ATy)
+    return getOpenCLAlignment(TD, ATy->getElementType());
+
+  const VectorType *VTy = dyn_cast<VectorType>(Ty);
+  if (VTy) {
+    Type *ETy = VTy->getElementType();
+    unsigned int numE = VTy->getNumElements();
+    unsigned int alignE = TD->getPrefTypeAlignment(ETy);
+    if (numE == 3)
+      return 4*alignE;
+    else
+      return numE*alignE;
+  }
+
+  const StructType *STy = dyn_cast<StructType>(Ty);
+  if (STy) {
+    unsigned int alignStruct = 1;
+    // Go through each element of the struct and find the
+    // largest alignment.
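+    // e.g. for struct { char c; float4 v; } the float4 member dominates:
+    // numE(4) * alignE(4) = 16, so the struct reports alignment 16. The
+    // numE == 3 case above mirrors OpenCL's rule that 3-element vectors
+    // are sized and aligned like 4-element ones.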
+ for (unsigned i=0, e=STy->getNumElements(); i != e; i++) { + Type *ETy = STy->getElementType(i); + unsigned int align = getOpenCLAlignment(TD, ETy); + if (align > alignStruct) + alignStruct = align; + } + return alignStruct; + } + + const FunctionType *FTy = dyn_cast<FunctionType>(Ty); + if (FTy) + return TD->getPointerPrefAlignment(); + return TD->getPrefTypeAlignment(Ty); +} + +void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I, + int paramIndex, raw_ostream &O) { + if ((nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) || + (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)) + O << *CurrentFnSym << "_param_" << paramIndex; + else { + std::string argName = I->getName(); + const char *p = argName.c_str(); + while (*p) { + if (*p == '.') + O << "_"; + else + O << *p; + p++; + } + } +} + +void NVPTXAsmPrinter::printParamName(int paramIndex, raw_ostream &O) { + Function::const_arg_iterator I, E; + int i = 0; + + if ((nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) || + (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)) { + O << *CurrentFnSym << "_param_" << paramIndex; + return; + } + + for (I = F->arg_begin(), E = F->arg_end(); I != E; ++I, i++) { + if (i==paramIndex) { + printParamName(I, paramIndex, O); + return; + } + } + llvm_unreachable("paramIndex out of bound"); +} + +void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, + raw_ostream &O) { + const TargetData *TD = TM.getTargetData(); + const AttrListPtr &PAL = F->getAttributes(); + const TargetLowering *TLI = TM.getTargetLowering(); + Function::const_arg_iterator I, E; + unsigned paramIndex = 0; + bool first = true; + bool isKernelFunc = llvm::isKernelFunction(*F); + bool isABI = (nvptxSubtarget.getSmVersion() >= 20); + MVT thePointerTy = TLI->getPointerTy(); + + O << "(\n"; + + for (I = F->arg_begin(), E = F->arg_end(); I != E; ++I, paramIndex++) { + const Type *Ty = I->getType(); + + if (!first) + O << ",\n"; + + first = false; + + // Handle image/sampler parameters + if (llvm::isSampler(*I) || llvm::isImage(*I)) { + if (llvm::isImage(*I)) { + std::string sname = I->getName(); + if (llvm::isImageWriteOnly(*I)) + O << "\t.param .surfref " << *CurrentFnSym << "_param_" << paramIndex; + else // Default image is read_only + O << "\t.param .texref " << *CurrentFnSym << "_param_" << paramIndex; + } + else // Should be llvm::isSampler(*I) + O << "\t.param .samplerref " << *CurrentFnSym << "_param_" + << paramIndex; + continue; + } + + if (PAL.paramHasAttr(paramIndex+1, Attribute::ByVal) == false) { + // Just a scalar + const PointerType *PTy = dyn_cast<PointerType>(Ty); + if (isKernelFunc) { + if (PTy) { + // Special handling for pointer arguments to kernel + O << "\t.param .u" << thePointerTy.getSizeInBits() << " "; + + if (nvptxSubtarget.getDrvInterface() != NVPTX::CUDA) { + Type *ETy = PTy->getElementType(); + int addrSpace = PTy->getAddressSpace(); + switch(addrSpace) { + default: + O << ".ptr "; + break; + case llvm::ADDRESS_SPACE_CONST_NOT_GEN: + O << ".ptr .const "; + break; + case llvm::ADDRESS_SPACE_SHARED: + O << ".ptr .shared "; + break; + case llvm::ADDRESS_SPACE_GLOBAL: + case llvm::ADDRESS_SPACE_CONST: + O << ".ptr .global "; + break; + } + O << ".align " << (int)getOpenCLAlignment(TD, ETy) << " "; + } + printParamName(I, paramIndex, O); + continue; + } + + // non-pointer scalar to kernel func + O << "\t.param ." 
+ << getPTXFundamentalTypeStr(Ty) << " "; + printParamName(I, paramIndex, O); + continue; + } + // Non-kernel function, just print .param .b<size> for ABI + // and .reg .b<size> for non ABY + unsigned sz = 0; + if (isa<IntegerType>(Ty)) { + sz = cast<IntegerType>(Ty)->getBitWidth(); + if (sz < 32) sz = 32; + } + else if (isa<PointerType>(Ty)) + sz = thePointerTy.getSizeInBits(); + else + sz = Ty->getPrimitiveSizeInBits(); + if (isABI) + O << "\t.param .b" << sz << " "; + else + O << "\t.reg .b" << sz << " "; + printParamName(I, paramIndex, O); + continue; + } + + // param has byVal attribute. So should be a pointer + const PointerType *PTy = dyn_cast<PointerType>(Ty); + assert(PTy && + "Param with byval attribute should be a pointer type"); + Type *ETy = PTy->getElementType(); + + if (isABI || isKernelFunc) { + // Just print .param .b8 .align <a> .param[size]; + // <a> = PAL.getparamalignment + // size = typeallocsize of element type + unsigned align = PAL.getParamAlignment(paramIndex+1); + unsigned sz = TD->getTypeAllocSize(ETy); + O << "\t.param .align " << align + << " .b8 "; + printParamName(I, paramIndex, O); + O << "[" << sz << "]"; + continue; + } else { + // Split the ETy into constituent parts and + // print .param .b<size> <name> for each part. + // Further, if a part is vector, print the above for + // each vector element. + SmallVector<EVT, 16> vtparts; + ComputeValueVTs(*TLI, ETy, vtparts); + for (unsigned i=0,e=vtparts.size(); i!=e; ++i) { + unsigned elems = 1; + EVT elemtype = vtparts[i]; + if (vtparts[i].isVector()) { + elems = vtparts[i].getVectorNumElements(); + elemtype = vtparts[i].getVectorElementType(); + } + + for (unsigned j=0,je=elems; j!=je; ++j) { + unsigned sz = elemtype.getSizeInBits(); + if (elemtype.isInteger() && (sz < 32)) sz = 32; + O << "\t.reg .b" << sz << " "; + printParamName(I, paramIndex, O); + if (j<je-1) O << ",\n"; + ++paramIndex; + } + if (i<e-1) + O << ",\n"; + } + --paramIndex; + continue; + } + } + + O << "\n)\n"; +} + +void NVPTXAsmPrinter::emitFunctionParamList(const MachineFunction &MF, + raw_ostream &O) { + const Function *F = MF.getFunction(); + emitFunctionParamList(F, O); +} + + +void NVPTXAsmPrinter:: +setAndEmitFunctionVirtualRegisters(const MachineFunction &MF) { + SmallString<128> Str; + raw_svector_ostream O(Str); + + // Map the global virtual register number to a register class specific + // virtual register number starting from 1 with that class. + const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo(); + //unsigned numRegClasses = TRI->getNumRegClasses(); + + // Emit the Fake Stack Object + const MachineFrameInfo *MFI = MF.getFrameInfo(); + int NumBytes = (int) MFI->getStackSize(); + if (NumBytes) { + O << "\t.local .align " << MFI->getMaxAlignment() << " .b8 \t" + << DEPOTNAME + << getFunctionNumber() << "[" << NumBytes << "];\n"; + if (nvptxSubtarget.is64Bit()) { + O << "\t.reg .b64 \t%SP;\n"; + O << "\t.reg .b64 \t%SPL;\n"; + } + else { + O << "\t.reg .b32 \t%SP;\n"; + O << "\t.reg .b32 \t%SPL;\n"; + } + } + + // Go through all virtual registers to establish the mapping between the + // global virtual + // register number and the per class virtual register number. + // We use the per class virtual register number in the ptx output. 
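+  // e.g. if global vregs 5 and 7 are the first Int32Regs and Float32Regs
+  // values seen, both map to per-class number 1 and are later printed as
+  // "%r1" and "%f1" by getVirtualRegisterName().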
+ unsigned int numVRs = MRI->getNumVirtRegs(); + for (unsigned i=0; i< numVRs; i++) { + unsigned int vr = TRI->index2VirtReg(i); + const TargetRegisterClass *RC = MRI->getRegClass(vr); + std::map<unsigned, unsigned> ®map = VRidGlobal2LocalMap[RC->getID()]; + int n = regmap.size(); + regmap.insert(std::make_pair(vr, n+1)); + } + + // Emit register declarations + // @TODO: Extract out the real register usage + O << "\t.reg .pred %p<" << NVPTXNumRegisters << ">;\n"; + O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n"; + O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n"; + O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n"; + O << "\t.reg .s64 %rl<" << NVPTXNumRegisters << ">;\n"; + O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n"; + O << "\t.reg .f64 %fl<" << NVPTXNumRegisters << ">;\n"; + + // Emit declaration of the virtual registers or 'physical' registers for + // each register class + //for (unsigned i=0; i< numRegClasses; i++) { + // std::map<unsigned, unsigned> ®map = VRidGlobal2LocalMap[i]; + // const TargetRegisterClass *RC = TRI->getRegClass(i); + // std::string rcname = getNVPTXRegClassName(RC); + // std::string rcStr = getNVPTXRegClassStr(RC); + // //int n = regmap.size(); + // if (!isNVPTXVectorRegClass(RC)) { + // O << "\t.reg " << rcname << " \t" << rcStr << "<" + // << NVPTXNumRegisters << ">;\n"; + // } + + // Only declare those registers that may be used. And do not emit vector + // registers as + // they are all elementized to scalar registers. + //if (n && !isNVPTXVectorRegClass(RC)) { + // if (RegAllocNilUsed) { + // O << "\t.reg " << rcname << " \t" << rcStr << "<" << (n+1) + // << ">;\n"; + // } + // else { + // O << "\t.reg " << rcname << " \t" << StrToUpper(rcStr) + // << "<" << 32 << ">;\n"; + // } + //} + //} + + OutStreamer.EmitRawText(O.str()); +} + + +void NVPTXAsmPrinter::printFPConstant(const ConstantFP *Fp, raw_ostream &O) { + APFloat APF = APFloat(Fp->getValueAPF()); // make a copy + bool ignored; + unsigned int numHex; + const char *lead; + + if (Fp->getType()->getTypeID()==Type::FloatTyID) { + numHex = 8; + lead = "0f"; + APF.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, + &ignored); + } else if (Fp->getType()->getTypeID() == Type::DoubleTyID) { + numHex = 16; + lead = "0d"; + APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, + &ignored); + } else + llvm_unreachable("unsupported fp type"); + + APInt API = APF.bitcastToAPInt(); + std::string hexstr(utohexstr(API.getZExtValue())); + O << lead; + if (hexstr.length() < numHex) + O << std::string(numHex - hexstr.length(), '0'); + O << utohexstr(API.getZExtValue()); +} + +void NVPTXAsmPrinter::printScalarConstant(Constant *CPV, raw_ostream &O) { + if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) { + O << CI->getValue(); + return; + } + if (ConstantFP *CFP = dyn_cast<ConstantFP>(CPV)) { + printFPConstant(CFP, O); + return; + } + if (isa<ConstantPointerNull>(CPV)) { + O << "0"; + return; + } + if (GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) { + O << *Mang->getSymbol(GVar); + return; + } + if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { + Value *v = Cexpr->stripPointerCasts(); + if (GlobalValue *GVar = dyn_cast<GlobalValue>(v)) { + O << *Mang->getSymbol(GVar); + return; + } else { + O << *LowerConstant(CPV, *this); + return; + } + } + llvm_unreachable("Not scalar type found in printScalarConstant()"); +} + + +void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes, + AggBuffer *aggBuffer) { + + const TargetData *TD = TM.getTargetData(); + + if 
(isa<UndefValue>(CPV) || CPV->isNullValue()) { + int s = TD->getTypeAllocSize(CPV->getType()); + if (s<Bytes) + s = Bytes; + aggBuffer->addZeros(s); + return; + } + + unsigned char *ptr; + switch (CPV->getType()->getTypeID()) { + + case Type::IntegerTyID: { + const Type *ETy = CPV->getType(); + if ( ETy == Type::getInt8Ty(CPV->getContext()) ){ + unsigned char c = + (unsigned char)(dyn_cast<ConstantInt>(CPV))->getZExtValue(); + ptr = &c; + aggBuffer->addBytes(ptr, 1, Bytes); + } else if ( ETy == Type::getInt16Ty(CPV->getContext()) ) { + short int16 = + (short)(dyn_cast<ConstantInt>(CPV))->getZExtValue(); + ptr = (unsigned char*)&int16; + aggBuffer->addBytes(ptr, 2, Bytes); + } else if ( ETy == Type::getInt32Ty(CPV->getContext()) ) { + if (ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) { + int int32 =(int)(constInt->getZExtValue()); + ptr = (unsigned char*)&int32; + aggBuffer->addBytes(ptr, 4, Bytes); + break; + } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { + if (ConstantInt *constInt = + dyn_cast<ConstantInt>(ConstantFoldConstantExpression( + Cexpr, TD))) { + int int32 =(int)(constInt->getZExtValue()); + ptr = (unsigned char*)&int32; + aggBuffer->addBytes(ptr, 4, Bytes); + break; + } + if (Cexpr->getOpcode() == Instruction::PtrToInt) { + Value *v = Cexpr->getOperand(0)->stripPointerCasts(); + aggBuffer->addSymbol(v); + aggBuffer->addZeros(4); + break; + } + } + llvm_unreachable("unsupported integer const type"); + } else if (ETy == Type::getInt64Ty(CPV->getContext()) ) { + if (ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) { + long long int64 =(long long)(constInt->getZExtValue()); + ptr = (unsigned char*)&int64; + aggBuffer->addBytes(ptr, 8, Bytes); + break; + } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { + if (ConstantInt *constInt = dyn_cast<ConstantInt>( + ConstantFoldConstantExpression(Cexpr, TD))) { + long long int64 =(long long)(constInt->getZExtValue()); + ptr = (unsigned char*)&int64; + aggBuffer->addBytes(ptr, 8, Bytes); + break; + } + if (Cexpr->getOpcode() == Instruction::PtrToInt) { + Value *v = Cexpr->getOperand(0)->stripPointerCasts(); + aggBuffer->addSymbol(v); + aggBuffer->addZeros(8); + break; + } + } + llvm_unreachable("unsupported integer const type"); + } else + llvm_unreachable("unsupported integer const type"); + break; + } + case Type::FloatTyID: + case Type::DoubleTyID: { + ConstantFP *CFP = dyn_cast<ConstantFP>(CPV); + const Type* Ty = CFP->getType(); + if (Ty == Type::getFloatTy(CPV->getContext())) { + float float32 = (float)CFP->getValueAPF().convertToFloat(); + ptr = (unsigned char*)&float32; + aggBuffer->addBytes(ptr, 4, Bytes); + } else if (Ty == Type::getDoubleTy(CPV->getContext())) { + double float64 = CFP->getValueAPF().convertToDouble(); + ptr = (unsigned char*)&float64; + aggBuffer->addBytes(ptr, 8, Bytes); + } + else { + llvm_unreachable("unsupported fp const type"); + } + break; + } + case Type::PointerTyID: { + if (GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) { + aggBuffer->addSymbol(GVar); + } + else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { + Value *v = Cexpr->stripPointerCasts(); + aggBuffer->addSymbol(v); + } + unsigned int s = TD->getTypeAllocSize(CPV->getType()); + aggBuffer->addZeros(s); + break; + } + + case Type::ArrayTyID: + case Type::VectorTyID: + case Type::StructTyID: { + if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV) || + isa<ConstantStruct>(CPV)) { + int ElementSize = TD->getTypeAllocSize(CPV->getType()); + bufferAggregateConstant(CPV, aggBuffer); + if ( Bytes > 
ElementSize ) + aggBuffer->addZeros(Bytes-ElementSize); + } + else if (isa<ConstantAggregateZero>(CPV)) + aggBuffer->addZeros(Bytes); + else + llvm_unreachable("Unexpected Constant type"); + break; + } + + default: + llvm_unreachable("unsupported type"); + } +} + +void NVPTXAsmPrinter::bufferAggregateConstant(Constant *CPV, + AggBuffer *aggBuffer) { + const TargetData *TD = TM.getTargetData(); + int Bytes; + + // Old constants + if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV)) { + if (CPV->getNumOperands()) + for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i) + bufferLEByte(cast<Constant>(CPV->getOperand(i)), 0, aggBuffer); + return; + } + + if (const ConstantDataSequential *CDS = + dyn_cast<ConstantDataSequential>(CPV)) { + if (CDS->getNumElements()) + for (unsigned i = 0; i < CDS->getNumElements(); ++i) + bufferLEByte(cast<Constant>(CDS->getElementAsConstant(i)), 0, + aggBuffer); + return; + } + + + if (isa<ConstantStruct>(CPV)) { + if (CPV->getNumOperands()) { + StructType *ST = cast<StructType>(CPV->getType()); + for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i) { + if ( i == (e - 1)) + Bytes = TD->getStructLayout(ST)->getElementOffset(0) + + TD->getTypeAllocSize(ST) + - TD->getStructLayout(ST)->getElementOffset(i); + else + Bytes = TD->getStructLayout(ST)->getElementOffset(i+1) - + TD->getStructLayout(ST)->getElementOffset(i); + bufferLEByte(cast<Constant>(CPV->getOperand(i)), Bytes, + aggBuffer); + } + } + return; + } + llvm_unreachable("unsupported constant type in printAggregateConstant()"); +} + +// buildTypeNameMap - Run through symbol table looking for type names. +// + + +bool NVPTXAsmPrinter::isImageType(const Type *Ty) { + + std::map<const Type *, std::string>::iterator PI = TypeNameMap.find(Ty); + + if (PI != TypeNameMap.end() && + (!PI->second.compare("struct._image1d_t") || + !PI->second.compare("struct._image2d_t") || + !PI->second.compare("struct._image3d_t"))) + return true; + + return false; +} + +/// PrintAsmOperand - Print out an operand for an inline asm expression. +/// +bool NVPTXAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &O) { + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. 
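+    // Only single-character modifiers are handled; e.g. the 'r' in an
+    // inline-asm template operand such as "${0:r}" lands in the switch
+    // below, and unrecognized letters go to the generic AsmPrinter handler.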
+ + switch (ExtraCode[0]) { + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); + case 'r': + break; + } + } + + printOperand(MI, OpNo, O); + + return false; +} + +bool NVPTXAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &O) { + if (ExtraCode && ExtraCode[0]) + return true; // Unknown modifier + + O << '['; + printMemOperand(MI, OpNo, O); + O << ']'; + + return false; +} + +bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) +{ + switch(MI.getOpcode()) { + default: + return false; + case NVPTX::CallArgBeginInst: case NVPTX::CallArgEndInst0: + case NVPTX::CallArgEndInst1: case NVPTX::CallArgF32: + case NVPTX::CallArgF64: case NVPTX::CallArgI16: + case NVPTX::CallArgI32: case NVPTX::CallArgI32imm: + case NVPTX::CallArgI64: case NVPTX::CallArgI8: + case NVPTX::CallArgParam: case NVPTX::CallVoidInst: + case NVPTX::CallVoidInstReg: case NVPTX::Callseq_End: + case NVPTX::CallVoidInstReg64: + case NVPTX::DeclareParamInst: case NVPTX::DeclareRetMemInst: + case NVPTX::DeclareRetRegInst: case NVPTX::DeclareRetScalarInst: + case NVPTX::DeclareScalarParamInst: case NVPTX::DeclareScalarRegInst: + case NVPTX::StoreParamF32: case NVPTX::StoreParamF64: + case NVPTX::StoreParamI16: case NVPTX::StoreParamI32: + case NVPTX::StoreParamI64: case NVPTX::StoreParamI8: + case NVPTX::StoreParamS32I8: case NVPTX::StoreParamU32I8: + case NVPTX::StoreParamS32I16: case NVPTX::StoreParamU32I16: + case NVPTX::StoreParamScalar2F32: case NVPTX::StoreParamScalar2F64: + case NVPTX::StoreParamScalar2I16: case NVPTX::StoreParamScalar2I32: + case NVPTX::StoreParamScalar2I64: case NVPTX::StoreParamScalar2I8: + case NVPTX::StoreParamScalar4F32: case NVPTX::StoreParamScalar4I16: + case NVPTX::StoreParamScalar4I32: case NVPTX::StoreParamScalar4I8: + case NVPTX::StoreParamV2F32: case NVPTX::StoreParamV2F64: + case NVPTX::StoreParamV2I16: case NVPTX::StoreParamV2I32: + case NVPTX::StoreParamV2I64: case NVPTX::StoreParamV2I8: + case NVPTX::StoreParamV4F32: case NVPTX::StoreParamV4I16: + case NVPTX::StoreParamV4I32: case NVPTX::StoreParamV4I8: + case NVPTX::StoreRetvalF32: case NVPTX::StoreRetvalF64: + case NVPTX::StoreRetvalI16: case NVPTX::StoreRetvalI32: + case NVPTX::StoreRetvalI64: case NVPTX::StoreRetvalI8: + case NVPTX::StoreRetvalScalar2F32: case NVPTX::StoreRetvalScalar2F64: + case NVPTX::StoreRetvalScalar2I16: case NVPTX::StoreRetvalScalar2I32: + case NVPTX::StoreRetvalScalar2I64: case NVPTX::StoreRetvalScalar2I8: + case NVPTX::StoreRetvalScalar4F32: case NVPTX::StoreRetvalScalar4I16: + case NVPTX::StoreRetvalScalar4I32: case NVPTX::StoreRetvalScalar4I8: + case NVPTX::StoreRetvalV2F32: case NVPTX::StoreRetvalV2F64: + case NVPTX::StoreRetvalV2I16: case NVPTX::StoreRetvalV2I32: + case NVPTX::StoreRetvalV2I64: case NVPTX::StoreRetvalV2I8: + case NVPTX::StoreRetvalV4F32: case NVPTX::StoreRetvalV4I16: + case NVPTX::StoreRetvalV4I32: case NVPTX::StoreRetvalV4I8: + case NVPTX::LastCallArgF32: case NVPTX::LastCallArgF64: + case NVPTX::LastCallArgI16: case NVPTX::LastCallArgI32: + case NVPTX::LastCallArgI32imm: case NVPTX::LastCallArgI64: + case NVPTX::LastCallArgI8: case NVPTX::LastCallArgParam: + case NVPTX::LoadParamMemF32: case NVPTX::LoadParamMemF64: + case NVPTX::LoadParamMemI16: case NVPTX::LoadParamMemI32: + case NVPTX::LoadParamMemI64: case NVPTX::LoadParamMemI8: + case NVPTX::LoadParamRegF32: case NVPTX::LoadParamRegF64: + case NVPTX::LoadParamRegI16: case 
NVPTX::LoadParamRegI32: + case NVPTX::LoadParamRegI64: case NVPTX::LoadParamRegI8: + case NVPTX::LoadParamScalar2F32: case NVPTX::LoadParamScalar2F64: + case NVPTX::LoadParamScalar2I16: case NVPTX::LoadParamScalar2I32: + case NVPTX::LoadParamScalar2I64: case NVPTX::LoadParamScalar2I8: + case NVPTX::LoadParamScalar4F32: case NVPTX::LoadParamScalar4I16: + case NVPTX::LoadParamScalar4I32: case NVPTX::LoadParamScalar4I8: + case NVPTX::LoadParamV2F32: case NVPTX::LoadParamV2F64: + case NVPTX::LoadParamV2I16: case NVPTX::LoadParamV2I32: + case NVPTX::LoadParamV2I64: case NVPTX::LoadParamV2I8: + case NVPTX::LoadParamV4F32: case NVPTX::LoadParamV4I16: + case NVPTX::LoadParamV4I32: case NVPTX::LoadParamV4I8: + case NVPTX::PrototypeInst: case NVPTX::DBG_VALUE: + return true; + } + return false; +} + +// Force static initialization. +extern "C" void LLVMInitializeNVPTXBackendAsmPrinter() { + RegisterAsmPrinter<NVPTXAsmPrinter> X(TheNVPTXTarget32); + RegisterAsmPrinter<NVPTXAsmPrinter> Y(TheNVPTXTarget64); +} + + +void NVPTXAsmPrinter::emitSrcInText(StringRef filename, unsigned line) { + std::stringstream temp; + LineReader * reader = this->getReader(filename.str()); + temp << "\n//"; + temp << filename.str(); + temp << ":"; + temp << line; + temp << " "; + temp << reader->readLine(line); + temp << "\n"; + this->OutStreamer.EmitRawText(Twine(temp.str())); +} + + +LineReader *NVPTXAsmPrinter::getReader(std::string filename) { + if (reader == NULL) { + reader = new LineReader(filename); + } + + if (reader->fileName() != filename) { + delete reader; + reader = new LineReader(filename); + } + + return reader; +} + + +std::string +LineReader::readLine(unsigned lineNum) { + if (lineNum < theCurLine) { + theCurLine = 0; + fstr.seekg(0,std::ios::beg); + } + while (theCurLine < lineNum) { + fstr.getline(buff,500); + theCurLine++; + } + return buff; +} + +// Force static initialization. +extern "C" void LLVMInitializeNVPTXAsmPrinter() { + RegisterAsmPrinter<NVPTXAsmPrinter> X(TheNVPTXTarget32); + RegisterAsmPrinter<NVPTXAsmPrinter> Y(TheNVPTXTarget64); +} diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h new file mode 100644 index 0000000..6488b14 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -0,0 +1,315 @@ +//===-- NVPTXAsmPrinter.h - NVPTX LLVM assembly writer --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to NVPTX assembly language. +// +//===----------------------------------------------------------------------===// + +#ifndef NVPTXASMPRINTER_H +#define NVPTXASMPRINTER_H + +#include "NVPTX.h" +#include "NVPTXTargetMachine.h" +#include "NVPTXSubtarget.h" +#include "llvm/Function.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Target/Mangler.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include <fstream> + +// The ptx syntax and format is very different from that usually seem in a .s +// file, +// therefore we are not able to use the MCAsmStreamer interface here. 
+// +// We are handcrafting the output method here. +// +// A better approach is to clone the MCAsmStreamer to a MCPTXAsmStreamer +// (subclass of MCStreamer). + +// This is defined in AsmPrinter.cpp. +// Used to process the constant expressions in initializers. +namespace nvptx { +const llvm::MCExpr *LowerConstant(const llvm::Constant *CV, + llvm::AsmPrinter &AP) ; +} + +namespace llvm { + +class LineReader { +private: + unsigned theCurLine ; + std::ifstream fstr; + char buff[512]; + std::string theFileName; + SmallVector<unsigned, 32> lineOffset; +public: + LineReader(std::string filename) { + theCurLine = 0; + fstr.open(filename.c_str()); + theFileName = filename; + } + std::string fileName() { return theFileName; } + ~LineReader() { + fstr.close(); + } + std::string readLine(unsigned line); +}; + + + +class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { + + + class AggBuffer { + // Used to buffer the emitted string for initializing global + // aggregates. + // + // Normally an aggregate (array, vector or structure) is emitted + // as a u8[]. However, if one element/field of the aggregate + // is a non-NULL address, then the aggregate is emitted as u32[] + // or u64[]. + // + // We first layout the aggregate in 'buffer' in bytes, except for + // those symbol addresses. For the i-th symbol address in the + //aggregate, its corresponding 4-byte or 8-byte elements in 'buffer' + // are filled with 0s. symbolPosInBuffer[i-1] records its position + // in 'buffer', and Symbols[i-1] records the Value*. + // + // Once we have this AggBuffer setup, we can choose how to print + // it out. + public: + unsigned size; // size of the buffer in bytes + unsigned char *buffer; // the buffer + unsigned numSymbols; // number of symbol addresses + SmallVector<unsigned, 4> symbolPosInBuffer; + SmallVector<Value *, 4> Symbols; + + private: + unsigned curpos; + raw_ostream &O; + NVPTXAsmPrinter &AP; + + public: + AggBuffer(unsigned _size, raw_ostream &_O, NVPTXAsmPrinter &_AP) + :O(_O),AP(_AP) { + buffer = new unsigned char[_size]; + size = _size; + curpos = 0; + numSymbols = 0; + } + ~AggBuffer() { + delete [] buffer; + } + unsigned addBytes(unsigned char *Ptr, int Num, int Bytes) { + assert((curpos+Num) <= size); + assert((curpos+Bytes) <= size); + for ( int i= 0; i < Num; ++i) { + buffer[curpos] = Ptr[i]; + curpos ++; + } + for ( int i=Num; i < Bytes ; ++i) { + buffer[curpos] = 0; + curpos ++; + } + return curpos; + } + unsigned addZeros(int Num) { + assert((curpos+Num) <= size); + for ( int i= 0; i < Num; ++i) { + buffer[curpos] = 0; + curpos ++; + } + return curpos; + } + void addSymbol(Value *GVar) { + symbolPosInBuffer.push_back(curpos); + Symbols.push_back(GVar); + numSymbols++; + } + void print() { + if (numSymbols == 0) { + // print out in bytes + for (unsigned i=0; i<size; i++) { + if (i) + O << ", "; + O << (unsigned int)buffer[i]; + } + } else { + // print out in 4-bytes or 8-bytes + unsigned int pos = 0; + unsigned int nSym = 0; + unsigned int nextSymbolPos = symbolPosInBuffer[nSym]; + unsigned int nBytes = 4; + if (AP.nvptxSubtarget.is64Bit()) + nBytes = 8; + for (pos=0; pos<size; pos+=nBytes) { + if (pos) + O << ", "; + if (pos == nextSymbolPos) { + Value *v = Symbols[nSym]; + if (GlobalValue *GVar = dyn_cast<GlobalValue>(v)) { + MCSymbol *Name = AP.Mang->getSymbol(GVar); + O << *Name; + } + else if (ConstantExpr *Cexpr = + dyn_cast<ConstantExpr>(v)) { + O << *nvptx::LowerConstant(Cexpr, AP); + } else + llvm_unreachable("symbol type unknown"); + nSym++; + if (nSym >= numSymbols) 
+ nextSymbolPos = size+1; + else + nextSymbolPos = symbolPosInBuffer[nSym]; + } else + if (nBytes == 4) + O << *(unsigned int*)(buffer+pos); + else + O << *(unsigned long long*)(buffer+pos); + } + } + } + }; + + friend class AggBuffer; + + virtual void emitSrcInText(StringRef filename, unsigned line); + +private : + virtual const char *getPassName() const { + return "NVPTX Assembly Printer"; + } + + const Function *F; + std::string CurrentFnName; + + void EmitFunctionEntryLabel(); + void EmitFunctionBodyStart(); + void EmitFunctionBodyEnd(); + + void EmitInstruction(const MachineInstr *); + + void EmitAlignment(unsigned NumBits, const GlobalValue *GV = 0) const {} + + void printGlobalVariable(const GlobalVariable *GVar); + void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O, + const char *Modifier=0); + void printLdStCode(const MachineInstr *MI, int opNum, raw_ostream &O, + const char *Modifier=0); + void printVecModifiedImmediate(const MachineOperand &MO, + const char *Modifier, raw_ostream &O); + void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O, + const char *Modifier=0); + void printImplicitDef(const MachineInstr *MI, raw_ostream &O) const; + // definition autogenerated. + void printInstruction(const MachineInstr *MI, raw_ostream &O); + void printModuleLevelGV(GlobalVariable* GVar, raw_ostream &O, + bool=false); + void printParamName(int paramIndex, raw_ostream &O); + void printParamName(Function::const_arg_iterator I, int paramIndex, + raw_ostream &O); + void emitHeader(Module &M, raw_ostream &O); + void emitKernelFunctionDirectives(const Function& F, + raw_ostream &O) const; + void emitVirtualRegister(unsigned int vr, bool isVec, raw_ostream &O); + void emitFunctionExternParamList(const MachineFunction &MF); + void emitFunctionParamList(const Function *, raw_ostream &O); + void emitFunctionParamList(const MachineFunction &MF, raw_ostream &O); + void setAndEmitFunctionVirtualRegisters(const MachineFunction &MF); + void emitFunctionTempData(const MachineFunction &MF, + unsigned &FrameSize); + bool isImageType(const Type *Ty); + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &); + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &); + void printReturnValStr(const Function *, raw_ostream &O); + void printReturnValStr(const MachineFunction &MF, raw_ostream &O); + +protected: + bool doInitialization(Module &M); + bool doFinalization(Module &M); + +private: + std::string CurrentBankselLabelInBasicBlock; + + // This is specific per MachineFunction. + const MachineRegisterInfo *MRI; + // The contents are specific for each + // MachineFunction. But the size of the + // array is not. + std::map<unsigned, unsigned> *VRidGlobal2LocalMap; + // cache the subtarget here. + const NVPTXSubtarget &nvptxSubtarget; + // Build the map between type name and ID based on module's type + // symbol table. + std::map<const Type *, std::string> TypeNameMap; + + // List of variables demoted to a function scope. 
+  std::map<const Function *, std::vector<GlobalVariable *> > localDecls;
+
+  // To record filename to ID mapping
+  std::map<std::string, unsigned> filenameMap;
+  void recordAndEmitFilenames(Module &);
+
+  void emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O);
+  void emitPTXAddressSpace(unsigned int AddressSpace,
+                           raw_ostream &O) const;
+  std::string getPTXFundamentalTypeStr(const Type *Ty, bool=true) const;
+  void printScalarConstant(Constant *CPV, raw_ostream &O);
+  void printFPConstant(const ConstantFP *Fp, raw_ostream &O);
+  void bufferLEByte(Constant *CPV, int Bytes, AggBuffer *aggBuffer);
+  void bufferAggregateConstant(Constant *CV, AggBuffer *aggBuffer);
+
+  void printOperandProper(const MachineOperand &MO);
+
+  void emitLinkageDirective(const GlobalValue* V, raw_ostream &O);
+  void emitDeclarations(Module &, raw_ostream &O);
+  void emitDeclaration(const Function *, raw_ostream &O);
+
+  static const char *getRegisterName(unsigned RegNo);
+  void emitDemotedVars(const Function *, raw_ostream &);
+
+  LineReader *reader;
+  LineReader *getReader(std::string);
+public:
+  NVPTXAsmPrinter(TargetMachine &TM,
+                  MCStreamer &Streamer)
+  : AsmPrinter(TM, Streamer),
+    nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {
+    CurrentBankselLabelInBasicBlock = "";
+    VRidGlobal2LocalMap = NULL;
+    reader = NULL;
+  }
+
+  ~NVPTXAsmPrinter() {
+    if (reader)
+      delete reader;
+  }
+
+  bool ignoreLoc(const MachineInstr &);
+
+  virtual void getVirtualRegisterName(unsigned, bool, raw_ostream &);
+
+  DebugLoc prevDebugLoc;
+  void emitLineNumberAsDotLoc(const MachineInstr &);
+};
+} // end of namespace
+
+#endif
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
new file mode 100644
index 0000000..a9abc00
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -0,0 +1,76 @@
+//=======- NVPTXFrameLowering.cpp - NVPTX Frame Information ---*- C++ -*-=====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetFrameLowering
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXFrameLowering.h"
+#include "NVPTX.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const {
+  return true;
+}
+
+void NVPTXFrameLowering::emitPrologue(MachineFunction &MF) const {
+  if (MF.getFrameInfo()->hasStackObjects()) {
+    MachineBasicBlock &MBB = MF.front();
+    // Insert "mov.u32 %SP, %Depot"
+    MachineBasicBlock::iterator MBBI = MBB.begin();
+    // This instruction really occurs before the first instruction in the
+    // BB, so it is given no debug location.
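+    // With generic ld/st support the prologue built below is roughly
+    //   mov.u64        %SPL, %Depot;
+    //   cvta.local.u64 %SP, %SPL;
+    // and without it, %SP is simply a copy of %Depot.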
+ DebugLoc dl = DebugLoc(); + + if (tm.getSubtargetImpl()->hasGenericLdSt()) { + // mov %SPL, %depot; + // cvta.local %SP, %SPL; + if (is64bit) { + MachineInstr *MI = BuildMI(MBB, MBBI, dl, + tm.getInstrInfo()->get(NVPTX::cvta_local_yes_64), + NVPTX::VRFrame).addReg(NVPTX::VRFrameLocal); + BuildMI(MBB, MI, dl, + tm.getInstrInfo()->get(NVPTX::IMOV64rr), NVPTX::VRFrameLocal) + .addReg(NVPTX::VRDepot); + } else { + MachineInstr *MI = BuildMI(MBB, MBBI, dl, + tm.getInstrInfo()->get(NVPTX::cvta_local_yes), + NVPTX::VRFrame).addReg(NVPTX::VRFrameLocal); + BuildMI(MBB, MI, dl, + tm.getInstrInfo()->get(NVPTX::IMOV32rr), NVPTX::VRFrameLocal) + .addReg(NVPTX::VRDepot); + } + } + else { + // mov %SP, %depot; + if (is64bit) + BuildMI(MBB, MBBI, dl, + tm.getInstrInfo()->get(NVPTX::IMOV64rr), NVPTX::VRFrame) + .addReg(NVPTX::VRDepot); + else + BuildMI(MBB, MBBI, dl, + tm.getInstrInfo()->get(NVPTX::IMOV32rr), NVPTX::VRFrame) + .addReg(NVPTX::VRDepot); + } + } +} + +void NVPTXFrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { +} diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h new file mode 100644 index 0000000..ee87b39 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXFrameLowering.h @@ -0,0 +1,40 @@ +//===--- NVPTXFrameLowering.h - Define frame lowering for NVPTX -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +// +//===----------------------------------------------------------------------===// + +#ifndef NVPTX_FRAMELOWERING_H +#define NVPTX_FRAMELOWERING_H + +#include "llvm/Target/TargetFrameLowering.h" + + +namespace llvm { +class NVPTXTargetMachine; + +class NVPTXFrameLowering : public TargetFrameLowering { + NVPTXTargetMachine &tm; + bool is64bit; + +public: + explicit NVPTXFrameLowering(NVPTXTargetMachine &_tm, bool _is64bit) + : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0), + tm(_tm), is64bit(_is64bit) {} + + virtual bool hasFP(const MachineFunction &MF) const; + virtual void emitPrologue(MachineFunction &MF) const; + virtual void emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const; +}; + +} // End llvm namespace + +#endif diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp new file mode 100644 index 0000000..4e92f0e --- /dev/null +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -0,0 +1,683 @@ +//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines an instruction selector for the NVPTX target. 
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "llvm/Instructions.h"
+#include "llvm/Support/raw_ostream.h"
+#include "NVPTXISelDAGToDAG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/GlobalValue.h"
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "nvptx-isel"
+
+using namespace llvm;
+
+
+static cl::opt<bool>
+UseFMADInstruction("nvptx-mad-enable",
+                   cl::ZeroOrMore,
+                   cl::desc("NVPTX Specific: Enable generating FMAD instructions"),
+                   cl::init(false));
+
+static cl::opt<int>
+FMAContractLevel("nvptx-fma-level",
+                 cl::ZeroOrMore,
+                 cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
+                          " 1: do it, 2: do it aggressively)"),
+                 cl::init(2));
+
+
+static cl::opt<int>
+UsePrecDivF32("nvptx-prec-divf32",
+              cl::ZeroOrMore,
+              cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
+                       " IEEE-compliant F32 div.rnd if available."),
+              cl::init(2));
+
+/// createNVPTXISelDag - This pass converts a legalized DAG into a
+/// NVPTX-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
+                                       llvm::CodeGenOpt::Level OptLevel) {
+  return new NVPTXDAGToDAGISel(TM, OptLevel);
+}
+
+
+NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
+                                     CodeGenOpt::Level OptLevel)
+: SelectionDAGISel(tm, OptLevel),
+  Subtarget(tm.getSubtarget<NVPTXSubtarget>())
+{
+  // Always do fma.f32 fpcontract if the target supports the instruction.
+  // Always do fma.f64 fpcontract if the target supports the instruction.
+  // Do mad.f32 if nvptx-mad-enable is specified and the target does not
+  // support fma.f32.
+
+  doFMADF32 = (OptLevel > 0) && UseFMADInstruction && !Subtarget.hasFMAF32();
+  doFMAF32 = (OptLevel > 0) && Subtarget.hasFMAF32() &&
+      (FMAContractLevel>=1);
+  doFMAF64 = (OptLevel > 0) && Subtarget.hasFMAF64() &&
+      (FMAContractLevel>=1);
+  doFMAF32AGG = (OptLevel > 0) && Subtarget.hasFMAF32() &&
+      (FMAContractLevel==2);
+  doFMAF64AGG = (OptLevel > 0) && Subtarget.hasFMAF64() &&
+      (FMAContractLevel==2);
+
+  allowFMA = (FMAContractLevel >= 1) || UseFMADInstruction;
+
+  UseF32FTZ = false;
+
+  doMulWide = (OptLevel > 0);
+
+  // Decide how to translate f32 div
+  do_DIVF32_PREC = UsePrecDivF32;
+  // sm less than sm_20 does not support div.rnd. Use div.full.
+  if (do_DIVF32_PREC == 2 && !Subtarget.reqPTX20())
+    do_DIVF32_PREC = 1;
+
+}
+
+/// Select - Select instructions not customized! Used for
+/// expanded, promoted and normal instructions.
+SDNode* NVPTXDAGToDAGISel::Select(SDNode *N) {
+
+  if (N->isMachineOpcode())
+    return NULL;   // Already selected.
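+
+  // Loads and stores are selected by hand below so that the PTX-specific
+  // operands (address space, vector arity, type and width) can be attached;
+  // all other opcodes fall through to the auto-generated matcher.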
+ + SDNode *ResNode = NULL; + switch (N->getOpcode()) { + case ISD::LOAD: + ResNode = SelectLoad(N); + break; + case ISD::STORE: + ResNode = SelectStore(N); + break; + } + if (ResNode) + return ResNode; + return SelectCode(N); +} + + +static unsigned int +getCodeAddrSpace(MemSDNode *N, const NVPTXSubtarget &Subtarget) +{ + const Value *Src = N->getSrcValue(); + if (!Src) + return NVPTX::PTXLdStInstCode::LOCAL; + + if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) { + switch (PT->getAddressSpace()) { + case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL; + case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL; + case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED; + case llvm::ADDRESS_SPACE_CONST_NOT_GEN: + return NVPTX::PTXLdStInstCode::CONSTANT; + case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC; + case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM; + case llvm::ADDRESS_SPACE_CONST: + // If the arch supports generic address space, translate it to GLOBAL + // for correctness. + // If the arch does not support generic address space, then the arch + // does not really support ADDRESS_SPACE_CONST, translate it to + // to CONSTANT for better performance. + if (Subtarget.hasGenericLdSt()) + return NVPTX::PTXLdStInstCode::GLOBAL; + else + return NVPTX::PTXLdStInstCode::CONSTANT; + default: break; + } + } + return NVPTX::PTXLdStInstCode::LOCAL; +} + + +SDNode* NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { + DebugLoc dl = N->getDebugLoc(); + LoadSDNode *LD = cast<LoadSDNode>(N); + EVT LoadedVT = LD->getMemoryVT(); + SDNode *NVPTXLD= NULL; + + // do not support pre/post inc/dec + if (LD->isIndexed()) + return NULL; + + if (!LoadedVT.isSimple()) + return NULL; + + // Address Space Setting + unsigned int codeAddrSpace = getCodeAddrSpace(LD, Subtarget); + + // Volatile Setting + // - .volatile is only availalble for .global and .shared + bool isVolatile = LD->isVolatile(); + if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && + codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && + codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) + isVolatile = false; + + // Vector Setting + MVT SimpleVT = LoadedVT.getSimpleVT(); + unsigned vecType = NVPTX::PTXLdStInstCode::Scalar; + if (SimpleVT.isVector()) { + unsigned num = SimpleVT.getVectorNumElements(); + if (num == 2) + vecType = NVPTX::PTXLdStInstCode::V2; + else if (num == 4) + vecType = NVPTX::PTXLdStInstCode::V4; + else + return NULL; + } + + // Type Setting: fromType + fromTypeWidth + // + // Sign : ISD::SEXTLOAD + // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the + // type is integer + // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float + MVT ScalarVT = SimpleVT.getScalarType(); + unsigned fromTypeWidth = ScalarVT.getSizeInBits(); + unsigned int fromType; + if ((LD->getExtensionType() == ISD::SEXTLOAD)) + fromType = NVPTX::PTXLdStInstCode::Signed; + else if (ScalarVT.isFloatingPoint()) + fromType = NVPTX::PTXLdStInstCode::Float; + else + fromType = NVPTX::PTXLdStInstCode::Unsigned; + + // Create the machine instruction DAG + SDValue Chain = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue Addr; + SDValue Offset, Base; + unsigned Opcode; + MVT::SimpleValueType TargetVT = LD->getValueType(0).getSimpleVT().SimpleTy; + + if (SelectDirectAddr(N1, Addr)) { + switch (TargetVT) { + case MVT::i8: Opcode = NVPTX::LD_i8_avar; break; + case MVT::i16: Opcode = NVPTX::LD_i16_avar; break; + case MVT::i32: Opcode = 
NVPTX::LD_i32_avar; break; + case MVT::i64: Opcode = NVPTX::LD_i64_avar; break; + case MVT::f32: Opcode = NVPTX::LD_f32_avar; break; + case MVT::f64: Opcode = NVPTX::LD_f64_avar; break; + case MVT::v2i8: Opcode = NVPTX::LD_v2i8_avar; break; + case MVT::v2i16: Opcode = NVPTX::LD_v2i16_avar; break; + case MVT::v2i32: Opcode = NVPTX::LD_v2i32_avar; break; + case MVT::v2i64: Opcode = NVPTX::LD_v2i64_avar; break; + case MVT::v2f32: Opcode = NVPTX::LD_v2f32_avar; break; + case MVT::v2f64: Opcode = NVPTX::LD_v2f64_avar; break; + case MVT::v4i8: Opcode = NVPTX::LD_v4i8_avar; break; + case MVT::v4i16: Opcode = NVPTX::LD_v4i16_avar; break; + case MVT::v4i32: Opcode = NVPTX::LD_v4i32_avar; break; + case MVT::v4f32: Opcode = NVPTX::LD_v4f32_avar; break; + default: return NULL; + } + SDValue Ops[] = { getI32Imm(isVolatile), + getI32Imm(codeAddrSpace), + getI32Imm(vecType), + getI32Imm(fromType), + getI32Imm(fromTypeWidth), + Addr, Chain }; + NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, + MVT::Other, Ops, 7); + } else if (Subtarget.is64Bit()? + SelectADDRsi64(N1.getNode(), N1, Base, Offset): + SelectADDRsi(N1.getNode(), N1, Base, Offset)) { + switch (TargetVT) { + case MVT::i8: Opcode = NVPTX::LD_i8_asi; break; + case MVT::i16: Opcode = NVPTX::LD_i16_asi; break; + case MVT::i32: Opcode = NVPTX::LD_i32_asi; break; + case MVT::i64: Opcode = NVPTX::LD_i64_asi; break; + case MVT::f32: Opcode = NVPTX::LD_f32_asi; break; + case MVT::f64: Opcode = NVPTX::LD_f64_asi; break; + case MVT::v2i8: Opcode = NVPTX::LD_v2i8_asi; break; + case MVT::v2i16: Opcode = NVPTX::LD_v2i16_asi; break; + case MVT::v2i32: Opcode = NVPTX::LD_v2i32_asi; break; + case MVT::v2i64: Opcode = NVPTX::LD_v2i64_asi; break; + case MVT::v2f32: Opcode = NVPTX::LD_v2f32_asi; break; + case MVT::v2f64: Opcode = NVPTX::LD_v2f64_asi; break; + case MVT::v4i8: Opcode = NVPTX::LD_v4i8_asi; break; + case MVT::v4i16: Opcode = NVPTX::LD_v4i16_asi; break; + case MVT::v4i32: Opcode = NVPTX::LD_v4i32_asi; break; + case MVT::v4f32: Opcode = NVPTX::LD_v4f32_asi; break; + default: return NULL; + } + SDValue Ops[] = { getI32Imm(isVolatile), + getI32Imm(codeAddrSpace), + getI32Imm(vecType), + getI32Imm(fromType), + getI32Imm(fromTypeWidth), + Base, Offset, Chain }; + NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, + MVT::Other, Ops, 8); + } else if (Subtarget.is64Bit()? 
+ SelectADDRri64(N1.getNode(), N1, Base, Offset): + SelectADDRri(N1.getNode(), N1, Base, Offset)) { + switch (TargetVT) { + case MVT::i8: Opcode = NVPTX::LD_i8_ari; break; + case MVT::i16: Opcode = NVPTX::LD_i16_ari; break; + case MVT::i32: Opcode = NVPTX::LD_i32_ari; break; + case MVT::i64: Opcode = NVPTX::LD_i64_ari; break; + case MVT::f32: Opcode = NVPTX::LD_f32_ari; break; + case MVT::f64: Opcode = NVPTX::LD_f64_ari; break; + case MVT::v2i8: Opcode = NVPTX::LD_v2i8_ari; break; + case MVT::v2i16: Opcode = NVPTX::LD_v2i16_ari; break; + case MVT::v2i32: Opcode = NVPTX::LD_v2i32_ari; break; + case MVT::v2i64: Opcode = NVPTX::LD_v2i64_ari; break; + case MVT::v2f32: Opcode = NVPTX::LD_v2f32_ari; break; + case MVT::v2f64: Opcode = NVPTX::LD_v2f64_ari; break; + case MVT::v4i8: Opcode = NVPTX::LD_v4i8_ari; break; + case MVT::v4i16: Opcode = NVPTX::LD_v4i16_ari; break; + case MVT::v4i32: Opcode = NVPTX::LD_v4i32_ari; break; + case MVT::v4f32: Opcode = NVPTX::LD_v4f32_ari; break; + default: return NULL; + } + SDValue Ops[] = { getI32Imm(isVolatile), + getI32Imm(codeAddrSpace), + getI32Imm(vecType), + getI32Imm(fromType), + getI32Imm(fromTypeWidth), + Base, Offset, Chain }; + NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, + MVT::Other, Ops, 8); + } + else { + switch (TargetVT) { + case MVT::i8: Opcode = NVPTX::LD_i8_areg; break; + case MVT::i16: Opcode = NVPTX::LD_i16_areg; break; + case MVT::i32: Opcode = NVPTX::LD_i32_areg; break; + case MVT::i64: Opcode = NVPTX::LD_i64_areg; break; + case MVT::f32: Opcode = NVPTX::LD_f32_areg; break; + case MVT::f64: Opcode = NVPTX::LD_f64_areg; break; + case MVT::v2i8: Opcode = NVPTX::LD_v2i8_areg; break; + case MVT::v2i16: Opcode = NVPTX::LD_v2i16_areg; break; + case MVT::v2i32: Opcode = NVPTX::LD_v2i32_areg; break; + case MVT::v2i64: Opcode = NVPTX::LD_v2i64_areg; break; + case MVT::v2f32: Opcode = NVPTX::LD_v2f32_areg; break; + case MVT::v2f64: Opcode = NVPTX::LD_v2f64_areg; break; + case MVT::v4i8: Opcode = NVPTX::LD_v4i8_areg; break; + case MVT::v4i16: Opcode = NVPTX::LD_v4i16_areg; break; + case MVT::v4i32: Opcode = NVPTX::LD_v4i32_areg; break; + case MVT::v4f32: Opcode = NVPTX::LD_v4f32_areg; break; + default: return NULL; + } + SDValue Ops[] = { getI32Imm(isVolatile), + getI32Imm(codeAddrSpace), + getI32Imm(vecType), + getI32Imm(fromType), + getI32Imm(fromTypeWidth), + N1, Chain }; + NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, + MVT::Other, Ops, 7); + } + + if (NVPTXLD != NULL) { + MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); + MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(NVPTXLD)->setMemRefs(MemRefs0, MemRefs0 + 1); + } + + return NVPTXLD; +} + +SDNode* NVPTXDAGToDAGISel::SelectStore(SDNode *N) { + DebugLoc dl = N->getDebugLoc(); + StoreSDNode *ST = cast<StoreSDNode>(N); + EVT StoreVT = ST->getMemoryVT(); + SDNode *NVPTXST = NULL; + + // do not support pre/post inc/dec + if (ST->isIndexed()) + return NULL; + + if (!StoreVT.isSimple()) + return NULL; + + // Address Space Setting + unsigned int codeAddrSpace = getCodeAddrSpace(ST, Subtarget); + + // Volatile Setting + // - .volatile is only availalble for .global and .shared + bool isVolatile = ST->isVolatile(); + if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && + codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && + codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) + isVolatile = false; + + // Vector Setting + MVT SimpleVT = StoreVT.getSimpleVT(); + unsigned vecType = NVPTX::PTXLdStInstCode::Scalar; + if (SimpleVT.isVector()) { 
+ unsigned num = SimpleVT.getVectorNumElements(); + if (num == 2) + vecType = NVPTX::PTXLdStInstCode::V2; + else if (num == 4) + vecType = NVPTX::PTXLdStInstCode::V4; + else + return NULL; + } + + // Type Setting: toType + toTypeWidth + // - for integer type, always use 'u' + // + MVT ScalarVT = SimpleVT.getScalarType(); + unsigned toTypeWidth = ScalarVT.getSizeInBits(); + unsigned int toType; + if (ScalarVT.isFloatingPoint()) + toType = NVPTX::PTXLdStInstCode::Float; + else + toType = NVPTX::PTXLdStInstCode::Unsigned; + + // Create the machine instruction DAG + SDValue Chain = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue N2 = N->getOperand(2); + SDValue Addr; + SDValue Offset, Base; + unsigned Opcode; + MVT::SimpleValueType SourceVT = + N1.getNode()->getValueType(0).getSimpleVT().SimpleTy; + + if (SelectDirectAddr(N2, Addr)) { + switch (SourceVT) { + case MVT::i8: Opcode = NVPTX::ST_i8_avar; break; + case MVT::i16: Opcode = NVPTX::ST_i16_avar; break; + case MVT::i32: Opcode = NVPTX::ST_i32_avar; break; + case MVT::i64: Opcode = NVPTX::ST_i64_avar; break; + case MVT::f32: Opcode = NVPTX::ST_f32_avar; break; + case MVT::f64: Opcode = NVPTX::ST_f64_avar; break; + case MVT::v2i8: Opcode = NVPTX::ST_v2i8_avar; break; + case MVT::v2i16: Opcode = NVPTX::ST_v2i16_avar; break; + case MVT::v2i32: Opcode = NVPTX::ST_v2i32_avar; break; + case MVT::v2i64: Opcode = NVPTX::ST_v2i64_avar; break; + case MVT::v2f32: Opcode = NVPTX::ST_v2f32_avar; break; + case MVT::v2f64: Opcode = NVPTX::ST_v2f64_avar; break; + case MVT::v4i8: Opcode = NVPTX::ST_v4i8_avar; break; + case MVT::v4i16: Opcode = NVPTX::ST_v4i16_avar; break; + case MVT::v4i32: Opcode = NVPTX::ST_v4i32_avar; break; + case MVT::v4f32: Opcode = NVPTX::ST_v4f32_avar; break; + default: return NULL; + } + SDValue Ops[] = { N1, + getI32Imm(isVolatile), + getI32Imm(codeAddrSpace), + getI32Imm(vecType), + getI32Imm(toType), + getI32Imm(toTypeWidth), + Addr, Chain }; + NVPTXST = CurDAG->getMachineNode(Opcode, dl, + MVT::Other, Ops, 8); + } else if (Subtarget.is64Bit()? + SelectADDRsi64(N2.getNode(), N2, Base, Offset): + SelectADDRsi(N2.getNode(), N2, Base, Offset)) { + switch (SourceVT) { + case MVT::i8: Opcode = NVPTX::ST_i8_asi; break; + case MVT::i16: Opcode = NVPTX::ST_i16_asi; break; + case MVT::i32: Opcode = NVPTX::ST_i32_asi; break; + case MVT::i64: Opcode = NVPTX::ST_i64_asi; break; + case MVT::f32: Opcode = NVPTX::ST_f32_asi; break; + case MVT::f64: Opcode = NVPTX::ST_f64_asi; break; + case MVT::v2i8: Opcode = NVPTX::ST_v2i8_asi; break; + case MVT::v2i16: Opcode = NVPTX::ST_v2i16_asi; break; + case MVT::v2i32: Opcode = NVPTX::ST_v2i32_asi; break; + case MVT::v2i64: Opcode = NVPTX::ST_v2i64_asi; break; + case MVT::v2f32: Opcode = NVPTX::ST_v2f32_asi; break; + case MVT::v2f64: Opcode = NVPTX::ST_v2f64_asi; break; + case MVT::v4i8: Opcode = NVPTX::ST_v4i8_asi; break; + case MVT::v4i16: Opcode = NVPTX::ST_v4i16_asi; break; + case MVT::v4i32: Opcode = NVPTX::ST_v4i32_asi; break; + case MVT::v4f32: Opcode = NVPTX::ST_v4f32_asi; break; + default: return NULL; + } + SDValue Ops[] = { N1, + getI32Imm(isVolatile), + getI32Imm(codeAddrSpace), + getI32Imm(vecType), + getI32Imm(toType), + getI32Imm(toTypeWidth), + Base, Offset, Chain }; + NVPTXST = CurDAG->getMachineNode(Opcode, dl, + MVT::Other, Ops, 9); + } else if (Subtarget.is64Bit()? 
+ SelectADDRri64(N2.getNode(), N2, Base, Offset): + SelectADDRri(N2.getNode(), N2, Base, Offset)) { + switch (SourceVT) { + case MVT::i8: Opcode = NVPTX::ST_i8_ari; break; + case MVT::i16: Opcode = NVPTX::ST_i16_ari; break; + case MVT::i32: Opcode = NVPTX::ST_i32_ari; break; + case MVT::i64: Opcode = NVPTX::ST_i64_ari; break; + case MVT::f32: Opcode = NVPTX::ST_f32_ari; break; + case MVT::f64: Opcode = NVPTX::ST_f64_ari; break; + case MVT::v2i8: Opcode = NVPTX::ST_v2i8_ari; break; + case MVT::v2i16: Opcode = NVPTX::ST_v2i16_ari; break; + case MVT::v2i32: Opcode = NVPTX::ST_v2i32_ari; break; + case MVT::v2i64: Opcode = NVPTX::ST_v2i64_ari; break; + case MVT::v2f32: Opcode = NVPTX::ST_v2f32_ari; break; + case MVT::v2f64: Opcode = NVPTX::ST_v2f64_ari; break; + case MVT::v4i8: Opcode = NVPTX::ST_v4i8_ari; break; + case MVT::v4i16: Opcode = NVPTX::ST_v4i16_ari; break; + case MVT::v4i32: Opcode = NVPTX::ST_v4i32_ari; break; + case MVT::v4f32: Opcode = NVPTX::ST_v4f32_ari; break; + default: return NULL; + } + SDValue Ops[] = { N1, + getI32Imm(isVolatile), + getI32Imm(codeAddrSpace), + getI32Imm(vecType), + getI32Imm(toType), + getI32Imm(toTypeWidth), + Base, Offset, Chain }; + NVPTXST = CurDAG->getMachineNode(Opcode, dl, + MVT::Other, Ops, 9); + } else { + switch (SourceVT) { + case MVT::i8: Opcode = NVPTX::ST_i8_areg; break; + case MVT::i16: Opcode = NVPTX::ST_i16_areg; break; + case MVT::i32: Opcode = NVPTX::ST_i32_areg; break; + case MVT::i64: Opcode = NVPTX::ST_i64_areg; break; + case MVT::f32: Opcode = NVPTX::ST_f32_areg; break; + case MVT::f64: Opcode = NVPTX::ST_f64_areg; break; + case MVT::v2i8: Opcode = NVPTX::ST_v2i8_areg; break; + case MVT::v2i16: Opcode = NVPTX::ST_v2i16_areg; break; + case MVT::v2i32: Opcode = NVPTX::ST_v2i32_areg; break; + case MVT::v2i64: Opcode = NVPTX::ST_v2i64_areg; break; + case MVT::v2f32: Opcode = NVPTX::ST_v2f32_areg; break; + case MVT::v2f64: Opcode = NVPTX::ST_v2f64_areg; break; + case MVT::v4i8: Opcode = NVPTX::ST_v4i8_areg; break; + case MVT::v4i16: Opcode = NVPTX::ST_v4i16_areg; break; + case MVT::v4i32: Opcode = NVPTX::ST_v4i32_areg; break; + case MVT::v4f32: Opcode = NVPTX::ST_v4f32_areg; break; + default: return NULL; + } + SDValue Ops[] = { N1, + getI32Imm(isVolatile), + getI32Imm(codeAddrSpace), + getI32Imm(vecType), + getI32Imm(toType), + getI32Imm(toTypeWidth), + N2, Chain }; + NVPTXST = CurDAG->getMachineNode(Opcode, dl, + MVT::Other, Ops, 8); + } + + if (NVPTXST != NULL) { + MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); + MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(NVPTXST)->setMemRefs(MemRefs0, MemRefs0 + 1); + } + + return NVPTXST; +} + +// SelectDirectAddr - Match a direct address for DAG. +// A direct address could be a globaladdress or externalsymbol. +bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) { + // Return true if TGA or ES. 
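+  // A successful match here lets the access use the _avar instruction
+  // forms, e.g. "ld.global.f32 %f1, [gvar];" for some global gvar, where
+  // the address is a symbol rather than a register.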
+ if (N.getOpcode() == ISD::TargetGlobalAddress + || N.getOpcode() == ISD::TargetExternalSymbol) { + Address = N; + return true; + } + if (N.getOpcode() == NVPTXISD::Wrapper) { + Address = N.getOperand(0); + return true; + } + if (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN) { + unsigned IID = cast<ConstantSDNode>(N.getOperand(0))->getZExtValue(); + if (IID == Intrinsic::nvvm_ptr_gen_to_param) + if (N.getOperand(1).getOpcode() == NVPTXISD::MoveParam) + return (SelectDirectAddr(N.getOperand(1).getOperand(0), Address)); + } + return false; +} + +// symbol+offset +bool NVPTXDAGToDAGISel::SelectADDRsi_imp(SDNode *OpNode, SDValue Addr, + SDValue &Base, SDValue &Offset, + MVT mvt) { + if (Addr.getOpcode() == ISD::ADD) { + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) { + SDValue base=Addr.getOperand(0); + if (SelectDirectAddr(base, Base)) { + Offset = CurDAG->getTargetConstant(CN->getZExtValue(), mvt); + return true; + } + } + } + return false; +} + +// symbol+offset +bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr, + SDValue &Base, SDValue &Offset) { + return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32); +} + +// symbol+offset +bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr, + SDValue &Base, SDValue &Offset) { + return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64); +} + +// register+offset +bool NVPTXDAGToDAGISel::SelectADDRri_imp(SDNode *OpNode, SDValue Addr, + SDValue &Base, SDValue &Offset, + MVT mvt) { + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt); + Offset = CurDAG->getTargetConstant(0, mvt); + return true; + } + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) + return false; // direct calls. + + if (Addr.getOpcode() == ISD::ADD) { + if (SelectDirectAddr(Addr.getOperand(0), Addr)) { + return false; + } + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) { + if (FrameIndexSDNode *FIN = + dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) + // Constant offset from frame ref. + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt); + else + Base = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(CN->getZExtValue(), mvt); + return true; + } + } + return false; +} + +// register+offset +bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr, + SDValue &Base, SDValue &Offset) { + return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32); +} + +// register+offset +bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr, + SDValue &Base, SDValue &Offset) { + return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64); +} + +bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N, + unsigned int spN) const { + const Value *Src = NULL; + // Even though MemIntrinsicSDNode is a subclas of MemSDNode, + // the classof() for MemSDNode does not include MemIntrinsicSDNode + // (See SelectionDAGNodes.h). So we need to check for both. + if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) { + Src = mN->getSrcValue(); + } + else if (MemSDNode *mN = dyn_cast<MemIntrinsicSDNode>(N)) { + Src = mN->getSrcValue(); + } + if (!Src) + return false; + if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) + return (PT->getAddressSpace() == spN); + return false; +} + +/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for +/// inline asm expressions. 
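+/// A PTX "m" operand is returned as a base plus constant offset, which the
+/// printer renders inside square brackets, e.g. "[%SP+8]".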
+bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                                     char ConstraintCode,
+                                                     std::vector<SDValue> &OutOps) {
+  SDValue Op0, Op1;
+  switch (ConstraintCode) {
+  default: return true;
+  case 'm':   // memory
+    if (SelectDirectAddr(Op, Op0)) {
+      OutOps.push_back(Op0);
+      OutOps.push_back(CurDAG->getTargetConstant(0, MVT::i32));
+      return false;
+    }
+    if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
+      OutOps.push_back(Op0);
+      OutOps.push_back(Op1);
+      return false;
+    }
+    break;
+  }
+  return true;
+}
+
+// Return true if N is an undef or a constant.
+// If N was undef, return a (i8imm 0) in Retval
+// If N was imm, convert it to i8imm and return in Retval
+// Note: The conversion to i8imm is required, otherwise the
+// pattern matcher inserts a bunch of IMOVi8rr to convert
+// the imm to i8imm, and this causes instruction selection
+// to fail.
+bool NVPTXDAGToDAGISel::UndefOrImm(SDValue Op, SDValue N,
+                                   SDValue &Retval) {
+  if (!(N.getOpcode() == ISD::UNDEF) &&
+      !(N.getOpcode() == ISD::Constant))
+    return false;
+
+  if (N.getOpcode() == ISD::UNDEF)
+    Retval = CurDAG->getTargetConstant(0, MVT::i8);
+  else {
+    ConstantSDNode *cn = cast<ConstantSDNode>(N.getNode());
+    unsigned retval = cn->getZExtValue();
+    Retval = CurDAG->getTargetConstant(retval, MVT::i8);
+  }
+  return true;
+}
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
new file mode 100644
index 0000000..ccd69b2
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -0,0 +1,105 @@
+//===-- NVPTXISelDAGToDAG.h - A dag to dag inst selector for NVPTX --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the NVPTX target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "nvptx-isel"
+
+#include "NVPTX.h"
+#include "NVPTXISelLowering.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Intrinsics.h"
+using namespace llvm;
+
+namespace {
+
+class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
+
+  // If true, generate corresponding FPCONTRACT. This is
+  // language-dependent (i.e. CUDA and OpenCL work differently).
+  bool doFMADF32;
+  bool doFMAF64;
+  bool doFMAF32;
+  bool doFMAF64AGG;
+  bool doFMAF32AGG;
+  bool allowFMA;
+
+  // 0: use div.approx
+  // 1: use div.full
+  // 2: For sm_20 and later, IEEE-compliant div.rnd.f32 can be generated;
+  //    otherwise, use div.full
+  int do_DIVF32_PREC;
+
+  // If true, add .ftz to f32 instructions.
+  // This is only meaningful for sm_20 and later, as the default
+  // is not ftz.
+  // For sm earlier than sm_20, f32 denorms are always flushed to zero
+  // by the hardware.
+  // We always add the .ftz modifier regardless of the sm value
+  // when UseF32FTZ is true.
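+  // (.ftz flushes subnormal f32 inputs and results to sign-preserving zero,
+  // as in add.rn.ftz.f32.)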
+ bool UseF32FTZ; + + // If true, generate mul.wide from sext and mul + bool doMulWide; + +public: + explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm, + CodeGenOpt::Level OptLevel); + + // Pass Name + virtual const char *getPassName() const { + return "NVPTX DAG->DAG Pattern Instruction Selection"; + } + + const NVPTXSubtarget &Subtarget; + + virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op, + char ConstraintCode, + std::vector<SDValue> &OutOps); +private: + // Include the pieces autogenerated from the target description. +#include "NVPTXGenDAGISel.inc" + + SDNode *Select(SDNode *N); + SDNode* SelectLoad(SDNode *N); + SDNode* SelectStore(SDNode *N); + + inline SDValue getI32Imm(unsigned Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i32); + } + + // Match direct address complex pattern. + bool SelectDirectAddr(SDValue N, SDValue &Address); + + bool SelectADDRri_imp(SDNode *OpNode, SDValue Addr, SDValue &Base, + SDValue &Offset, MVT mvt); + bool SelectADDRri(SDNode *OpNode, SDValue Addr, SDValue &Base, + SDValue &Offset); + bool SelectADDRri64(SDNode *OpNode, SDValue Addr, SDValue &Base, + SDValue &Offset); + + bool SelectADDRsi_imp(SDNode *OpNode, SDValue Addr, SDValue &Base, + SDValue &Offset, MVT mvt); + bool SelectADDRsi(SDNode *OpNode, SDValue Addr, SDValue &Base, + SDValue &Offset); + bool SelectADDRsi64(SDNode *OpNode, SDValue Addr, SDValue &Base, + SDValue &Offset); + + + bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const; + + bool UndefOrImm(SDValue Op, SDValue N, SDValue &Retval); + +}; +} diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp new file mode 100644 index 0000000..6ea10ea --- /dev/null +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -0,0 +1,1291 @@ +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that NVPTX uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + + +#include "NVPTX.h" +#include "NVPTXISelLowering.h" +#include "NVPTXTargetMachine.h" +#include "NVPTXTargetObjectFile.h" +#include "NVPTXUtilities.h" +#include "llvm/Intrinsics.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/DerivedTypes.h" +#include "llvm/GlobalValue.h" +#include "llvm/Module.h" +#include "llvm/Function.h" +#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/MC/MCSectionELF.h" +#include <sstream> + +#undef DEBUG_TYPE +#define DEBUG_TYPE "nvptx-lower" + +using namespace llvm; + +static unsigned int uniqueCallSite = 0; + +static cl::opt<bool> +RetainVectorOperands("nvptx-codegen-vectors", + cl::desc("NVPTX Specific: Retain LLVM's vectors and generate PTX vectors"), + cl::init(true)); + +static cl::opt<bool> +sched4reg("nvptx-sched4reg", + cl::desc("NVPTX Specific: schedule for register pressue"), + cl::init(false)); + +// NVPTXTargetLowering Constructor. 
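+// It sets up the PTX register classes and records, per operation and value
+// type, whether the operation is Legal in PTX, should be Expanded by the
+// generic legalizer, or gets Custom lowering in this file.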
+NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
+: TargetLowering(TM, new NVPTXTargetObjectFile()),
+  nvTM(&TM),
+  nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {
+
+  // Always lower memset, memcpy, and memmove intrinsics to load/store
+  // instructions, rather than generating calls to memset, memcpy or
+  // memmove.
+  maxStoresPerMemset = (unsigned)0xFFFFFFFF;
+  maxStoresPerMemcpy = (unsigned)0xFFFFFFFF;
+  maxStoresPerMemmove = (unsigned)0xFFFFFFFF;
+
+  setBooleanContents(ZeroOrNegativeOneBooleanContent);
+
+  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
+  // condition branches.
+  setJumpIsExpensive(true);
+
+  // By default, use the Source scheduling
+  if (sched4reg)
+    setSchedulingPreference(Sched::RegPressure);
+  else
+    setSchedulingPreference(Sched::Source);
+
+  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
+  addRegisterClass(MVT::i8, &NVPTX::Int8RegsRegClass);
+  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
+  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
+  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
+  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
+  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
+
+  if (RetainVectorOperands) {
+    addRegisterClass(MVT::v2f32, &NVPTX::V2F32RegsRegClass);
+    addRegisterClass(MVT::v4f32, &NVPTX::V4F32RegsRegClass);
+    addRegisterClass(MVT::v2i32, &NVPTX::V2I32RegsRegClass);
+    addRegisterClass(MVT::v4i32, &NVPTX::V4I32RegsRegClass);
+    addRegisterClass(MVT::v2f64, &NVPTX::V2F64RegsRegClass);
+    addRegisterClass(MVT::v2i64, &NVPTX::V2I64RegsRegClass);
+    addRegisterClass(MVT::v2i16, &NVPTX::V2I16RegsRegClass);
+    addRegisterClass(MVT::v4i16, &NVPTX::V4I16RegsRegClass);
+    addRegisterClass(MVT::v2i8, &NVPTX::V2I8RegsRegClass);
+    addRegisterClass(MVT::v4i8, &NVPTX::V4I8RegsRegClass);
+
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i8, Custom);
+
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i16, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i8, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i8, Custom);
+  }
+
+  // Operations not directly supported by NVPTX.
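+  // (Expand rewrites them in terms of supported nodes; a BR_CC branch, for
+  // instance, becomes a SETCC feeding a BRCOND.)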
+ setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + setOperationAction(ISD::BR_CC, MVT::Other, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); + + if (nvptxSubtarget.hasROT64()) { + setOperationAction(ISD::ROTL , MVT::i64, Legal); + setOperationAction(ISD::ROTR , MVT::i64, Legal); + } + else { + setOperationAction(ISD::ROTL , MVT::i64, Expand); + setOperationAction(ISD::ROTR , MVT::i64, Expand); + } + if (nvptxSubtarget.hasROT32()) { + setOperationAction(ISD::ROTL , MVT::i32, Legal); + setOperationAction(ISD::ROTR , MVT::i32, Legal); + } + else { + setOperationAction(ISD::ROTL , MVT::i32, Expand); + setOperationAction(ISD::ROTR , MVT::i32, Expand); + } + + setOperationAction(ISD::ROTL , MVT::i16, Expand); + setOperationAction(ISD::ROTR , MVT::i16, Expand); + setOperationAction(ISD::ROTL , MVT::i8, Expand); + setOperationAction(ISD::ROTR , MVT::i8, Expand); + setOperationAction(ISD::BSWAP , MVT::i16, Expand); + setOperationAction(ISD::BSWAP , MVT::i32, Expand); + setOperationAction(ISD::BSWAP , MVT::i64, Expand); + + // Indirect branch is not supported. + // This also disables Jump Table creation. + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::BRIND, MVT::Other, Expand); + + setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom); + setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom); + + // We want to legalize constant-related memmove and memcpy + // intrinsics. + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); + + // Turn FP extload into load/fextend + setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + // Turn FP truncstore into trunc + store. + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + + // PTX does not support load/store of predicate registers + setOperationAction(ISD::LOAD, MVT::i1, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); + setOperationAction(ISD::STORE, MVT::i1, Expand); + setTruncStoreAction(MVT::i64, MVT::i1, Expand); + setTruncStoreAction(MVT::i32, MVT::i1, Expand); + setTruncStoreAction(MVT::i16, MVT::i1, Expand); + setTruncStoreAction(MVT::i8, MVT::i1, Expand); + + // This is legal in NVPTX + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + + // TRAP can be lowered to PTX trap + setOperationAction(ISD::TRAP, MVT::Other, Legal); + + // By default, CONCAT_VECTORS is implemented via store/load + // through stack. It is slow and uses local memory. We need + // to custom-lower them.
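+// For example, (v4f32 concat_vectors (v2f32 A), (v2f32 B)) is rebuilt as four
+// extract_vector_elt nodes feeding a single v4f32 build_vector; see
+// LowerCONCAT_VECTORS() below.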
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32 , Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32 , Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i16 , Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i8 , Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64 , Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64 , Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i32 , Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f32 , Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i16 , Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i8 , Custom); + + // Expand vector int to float and float to int conversions + // - For SINT_TO_FP and UINT_TO_FP, the src type + // (Node->getOperand(0).getValueType()) + // is used to determine the action, while for FP_TO_UINT and FP_TO_SINT, + // the dest type (Node->getValueType(0)) is used. + // + // See VectorLegalizer::LegalizeOp() (LegalizeVectorOps.cpp) for the vector + // case, and + // SelectionDAGLegalize::LegalizeOp() (LegalizeDAG.cpp) for the scalar case. + // + // That is why v4i32 or v2i32 are used here. + // + // The expansion for vectors happens in VectorLegalizer::LegalizeOp() + // (LegalizeVectorOps.cpp). + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Expand); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Expand); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand); + + // Now deduce the information based on the above mentioned + // actions + computeRegisterProperties(); +} + + +const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (Opcode) { + default: return 0; + case NVPTXISD::CALL: return "NVPTXISD::CALL"; + case NVPTXISD::RET_FLAG: return "NVPTXISD::RET_FLAG"; + case NVPTXISD::Wrapper: return "NVPTXISD::Wrapper"; + case NVPTXISD::NVBuiltin: return "NVPTXISD::NVBuiltin"; + case NVPTXISD::DeclareParam: return "NVPTXISD::DeclareParam"; + case NVPTXISD::DeclareScalarParam: + return "NVPTXISD::DeclareScalarParam"; + case NVPTXISD::DeclareRet: return "NVPTXISD::DeclareRet"; + case NVPTXISD::DeclareRetParam: return "NVPTXISD::DeclareRetParam"; + case NVPTXISD::PrintCall: return "NVPTXISD::PrintCall"; + case NVPTXISD::LoadParam: return "NVPTXISD::LoadParam"; + case NVPTXISD::StoreParam: return "NVPTXISD::StoreParam"; + case NVPTXISD::StoreParamS32: return "NVPTXISD::StoreParamS32"; + case NVPTXISD::StoreParamU32: return "NVPTXISD::StoreParamU32"; + case NVPTXISD::MoveToParam: return "NVPTXISD::MoveToParam"; + case NVPTXISD::CallArgBegin: return "NVPTXISD::CallArgBegin"; + case NVPTXISD::CallArg: return "NVPTXISD::CallArg"; + case NVPTXISD::LastCallArg: return "NVPTXISD::LastCallArg"; + case NVPTXISD::CallArgEnd: return "NVPTXISD::CallArgEnd"; + case NVPTXISD::CallVoid: return "NVPTXISD::CallVoid"; + case NVPTXISD::CallVal: return "NVPTXISD::CallVal"; + case NVPTXISD::CallSymbol: return "NVPTXISD::CallSymbol"; + case NVPTXISD::Prototype: return "NVPTXISD::Prototype"; + case NVPTXISD::MoveParam: return "NVPTXISD::MoveParam"; + case NVPTXISD::MoveRetval: return "NVPTXISD::MoveRetval"; + case NVPTXISD::MoveToRetval: return "NVPTXISD::MoveToRetval"; + case NVPTXISD::StoreRetval: return "NVPTXISD::StoreRetval"; + case 
NVPTXISD::PseudoUseParam: return "NVPTXISD::PseudoUseParam"; + case NVPTXISD::RETURN: return "NVPTXISD::RETURN"; + case NVPTXISD::CallSeqBegin: return "NVPTXISD::CallSeqBegin"; + case NVPTXISD::CallSeqEnd: return "NVPTXISD::CallSeqEnd"; + } +} + + +SDValue +NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + Op = DAG.getTargetGlobalAddress(GV, dl, getPointerTy()); + return DAG.getNode(NVPTXISD::Wrapper, dl, getPointerTy(), Op); +} + +std::string NVPTXTargetLowering::getPrototype(Type *retTy, + const ArgListTy &Args, + const SmallVectorImpl<ISD::OutputArg> &Outs, + unsigned retAlignment) const { + + bool isABI = (nvptxSubtarget.getSmVersion() >= 20); + + std::stringstream O; + O << "prototype_" << uniqueCallSite << " : .callprototype "; + + if (retTy->getTypeID() == Type::VoidTyID) + O << "()"; + else { + O << "("; + if (isABI) { + if (retTy->isPrimitiveType() || retTy->isIntegerTy()) { + unsigned size = 0; + if (const IntegerType *ITy = dyn_cast<IntegerType>(retTy)) { + size = ITy->getBitWidth(); + if (size < 32) size = 32; + } + else { + assert(retTy->isFloatingPointTy() && + "Floating point type expected here"); + size = retTy->getPrimitiveSizeInBits(); + } + + O << ".param .b" << size << " _"; + } + else if (isa<PointerType>(retTy)) + O << ".param .b" << getPointerTy().getSizeInBits() + << " _"; + else { + if ((retTy->getTypeID() == Type::StructTyID) || + isa<VectorType>(retTy)) { + SmallVector<EVT, 16> vtparts; + ComputeValueVTs(*this, retTy, vtparts); + unsigned totalsz = 0; + for (unsigned i=0,e=vtparts.size(); i!=e; ++i) { + unsigned elems = 1; + EVT elemtype = vtparts[i]; + if (vtparts[i].isVector()) { + elems = vtparts[i].getVectorNumElements(); + elemtype = vtparts[i].getVectorElementType(); + } + for (unsigned j=0, je=elems; j!=je; ++j) { + unsigned sz = elemtype.getSizeInBits(); + if (elemtype.isInteger() && (sz < 8)) sz = 8; + totalsz += sz/8; + } + } + O << ".param .align " + << retAlignment + << " .b8 _[" + << totalsz << "]"; + } + else { + assert(false && + "Unknown return type"); + } + } + } + else { + SmallVector<EVT, 16> vtparts; + ComputeValueVTs(*this, retTy, vtparts); + unsigned idx = 0; + for (unsigned i=0,e=vtparts.size(); i!=e; ++i) { + unsigned elems = 1; + EVT elemtype = vtparts[i]; + if (vtparts[i].isVector()) { + elems = vtparts[i].getVectorNumElements(); + elemtype = vtparts[i].getVectorElementType(); + } + + for (unsigned j=0, je=elems; j!=je; ++j) { + unsigned sz = elemtype.getSizeInBits(); + if (elemtype.isInteger() && (sz < 32)) sz = 32; + O << ".reg .b" << sz << " _"; + if (j<je-1) O << ", "; + ++idx; + } + if (i < e-1) + O << ", "; + } + } + O << ") "; + } + O << "_ ("; + + bool first = true; + MVT thePointerTy = getPointerTy(); + + for (unsigned i=0,e=Args.size(); i!=e; ++i) { + const Type *Ty = Args[i].Ty; + if (!first) { + O << ", "; + } + first = false; + + if (Outs[i].Flags.isByVal() == false) { + unsigned sz = 0; + if (isa<IntegerType>(Ty)) { + sz = cast<IntegerType>(Ty)->getBitWidth(); + if (sz < 32) sz = 32; + } + else if (isa<PointerType>(Ty)) + sz = thePointerTy.getSizeInBits(); + else + sz = Ty->getPrimitiveSizeInBits(); + if (isABI) + O << ".param .b" << sz << " "; + else + O << ".reg .b" << sz << " "; + O << "_"; + continue; + } + const PointerType *PTy = dyn_cast<PointerType>(Ty); + assert(PTy && + "Param with byval attribute should be a pointer type"); + Type *ETy = PTy->getElementType(); + + if (isABI) { + 
unsigned align = Outs[i].Flags.getByValAlign(); + unsigned sz = getTargetData()->getTypeAllocSize(ETy); + O << ".param .align " << align + << " .b8 "; + O << "_"; + O << "[" << sz << "]"; + continue; + } + else { + SmallVector<EVT, 16> vtparts; + ComputeValueVTs(*this, ETy, vtparts); + for (unsigned i=0,e=vtparts.size(); i!=e; ++i) { + unsigned elems = 1; + EVT elemtype = vtparts[i]; + if (vtparts[i].isVector()) { + elems = vtparts[i].getVectorNumElements(); + elemtype = vtparts[i].getVectorElementType(); + } + + for (unsigned j=0,je=elems; j!=je; ++j) { + unsigned sz = elemtype.getSizeInBits(); + if (elemtype.isInteger() && (sz < 32)) sz = 32; + O << ".reg .b" << sz << " "; + O << "_"; + if (j<je-1) O << ", "; + } + if (i<e-1) + O << ", "; + } + continue; + } + } + O << ");"; + return O.str(); +} + + +SDValue +NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + ArgListTy &Args = CLI.Args; + Type *retTy = CLI.RetTy; + ImmutableCallSite *CS = CLI.CS; + + bool isABI = (nvptxSubtarget.getSmVersion() >= 20); + + SDValue tempChain = Chain; + Chain = DAG.getCALLSEQ_START(Chain, + DAG.getIntPtrConstant(uniqueCallSite, true)); + SDValue InFlag = Chain.getValue(1); + + assert((Outs.size() == Args.size()) && + "Unexpected number of arguments to function call"); + unsigned paramCount = 0; + // Declare the .param or .reg that we need to pass values + // to the function + for (unsigned i=0, e=Outs.size(); i!=e; ++i) { + EVT VT = Outs[i].VT; + + if (Outs[i].Flags.isByVal() == false) { + // Plain scalar + // for ABI, declare .param .b<size> .param<n>; + // for nonABI, declare .reg .b<size> .param<n>; + unsigned isReg = 1; + if (isABI) + isReg = 0; + unsigned sz = VT.getSizeInBits(); + if (VT.isInteger() && (sz < 32)) sz = 32; + SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue DeclareParamOps[] = { Chain, + DAG.getConstant(paramCount, MVT::i32), + DAG.getConstant(sz, MVT::i32), + DAG.getConstant(isReg, MVT::i32), + InFlag }; + Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, + DeclareParamOps, 5); + InFlag = Chain.getValue(1); + SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32), + DAG.getConstant(0, MVT::i32), OutVals[i], InFlag }; + + unsigned opcode = NVPTXISD::StoreParam; + if (isReg) + opcode = NVPTXISD::MoveToParam; + else { + if (Outs[i].Flags.isZExt()) + opcode = NVPTXISD::StoreParamU32; + else if (Outs[i].Flags.isSExt()) + opcode = NVPTXISD::StoreParamS32; + } + Chain = DAG.getNode(opcode, dl, CopyParamVTs, CopyParamOps, 5); + + InFlag = Chain.getValue(1); + ++paramCount; + continue; + } + // struct or vector + SmallVector<EVT, 16> vtparts; + const PointerType *PTy = dyn_cast<PointerType>(Args[i].Ty); + assert(PTy && + "Type of a byval parameter should be pointer"); + ComputeValueVTs(*this, PTy->getElementType(), vtparts); + + if (isABI) { + // declare .param .align 16 .b8 .param<n>[<size>]; + unsigned sz = Outs[i].Flags.getByValSize(); + SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + // The ByValAlign in the Outs[i].Flags is always set at this point, so we + // don't need to worry about
natural alignment or not. See TargetLowering::LowerCallTo() + SDValue DeclareParamOps[] = { Chain, + DAG.getConstant(Outs[i].Flags.getByValAlign(), MVT::i32), + DAG.getConstant(paramCount, MVT::i32), + DAG.getConstant(sz, MVT::i32), + InFlag }; + Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, + DeclareParamOps, 5); + InFlag = Chain.getValue(1); + unsigned curOffset = 0; + for (unsigned j=0,je=vtparts.size(); j!=je; ++j) { + unsigned elems = 1; + EVT elemtype = vtparts[j]; + if (vtparts[j].isVector()) { + elems = vtparts[j].getVectorNumElements(); + elemtype = vtparts[j].getVectorElementType(); + } + for (unsigned k=0,ke=elems; k!=ke; ++k) { + unsigned sz = elemtype.getSizeInBits(); + if (elemtype.isInteger() && (sz < 8)) sz = 8; + SDValue srcAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), + OutVals[i], + DAG.getConstant(curOffset, + getPointerTy())); + SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, + MachinePointerInfo(), false, false, false, 0); + SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, + MVT::i32), + DAG.getConstant(curOffset, MVT::i32), + theVal, InFlag }; + Chain = DAG.getNode(NVPTXISD::StoreParam, dl, CopyParamVTs, + CopyParamOps, 5); + InFlag = Chain.getValue(1); + curOffset += sz/8; + } + } + ++paramCount; + continue; + } + // Non-ABI, struct or vector + // Declare a bunch of .reg .b<size> .param<n> + unsigned curOffset = 0; + for (unsigned j=0,je=vtparts.size(); j!=je; ++j) { + unsigned elems = 1; + EVT elemtype = vtparts[j]; + if (vtparts[j].isVector()) { + elems = vtparts[j].getVectorNumElements(); + elemtype = vtparts[j].getVectorElementType(); + } + for (unsigned k=0,ke=elems; k!=ke; ++k) { + unsigned sz = elemtype.getSizeInBits(); + if (elemtype.isInteger() && (sz < 32)) sz = 32; + SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue DeclareParamOps[] = { Chain, DAG.getConstant(paramCount, + MVT::i32), + DAG.getConstant(sz, MVT::i32), + DAG.getConstant(1, MVT::i32), + InFlag }; + Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, + DeclareParamOps, 5); + InFlag = Chain.getValue(1); + SDValue srcAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[i], + DAG.getConstant(curOffset, + getPointerTy())); + SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, + MachinePointerInfo(), false, false, false, 0); + SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32), + DAG.getConstant(0, MVT::i32), theVal, + InFlag }; + Chain = DAG.getNode(NVPTXISD::MoveToParam, dl, CopyParamVTs, + CopyParamOps, 5); + InFlag = Chain.getValue(1); + ++paramCount; + } + } + } + + GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); + unsigned retAlignment = 0; + + // Handle Result + unsigned retCount = 0; + if (Ins.size() > 0) { + SmallVector<EVT, 16> resvtparts; + ComputeValueVTs(*this, retTy, resvtparts); + + // Declare one .param .align 16 .b8 func_retval0[<size>] for ABI or + // individual .reg .b<size> func_retval<0..> for non-ABI + unsigned resultsz = 0; + for (unsigned i=0,e=resvtparts.size(); i!=e; ++i) { + unsigned elems = 1; + EVT elemtype = resvtparts[i]; + if (resvtparts[i].isVector()) { + elems = resvtparts[i].getVectorNumElements(); + elemtype = resvtparts[i].getVectorElementType(); + } + for (unsigned j=0,je=elems; j!=je; ++j) { + unsigned sz = elemtype.getSizeInBits(); + if (isABI == false) { + if 
(elemtype.isInteger() && (sz < 32)) sz = 32; + } + else { + if (elemtype.isInteger() && (sz < 8)) sz = 8; + } + if (isABI == false) { + SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue DeclareRetOps[] = { Chain, DAG.getConstant(2, MVT::i32), + DAG.getConstant(sz, MVT::i32), + DAG.getConstant(retCount, MVT::i32), + InFlag }; + Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, + DeclareRetOps, 5); + InFlag = Chain.getValue(1); + ++retCount; + } + resultsz += sz; + } + } + if (isABI) { + if (retTy->isPrimitiveType() || retTy->isIntegerTy() || + retTy->isPointerTy() ) { + // Scalar needs to be at least 32-bit wide + if (resultsz < 32) + resultsz = 32; + SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, MVT::i32), + DAG.getConstant(resultsz, MVT::i32), + DAG.getConstant(0, MVT::i32), InFlag }; + Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, + DeclareRetOps, 5); + InFlag = Chain.getValue(1); + } + else { + if (Func) { // direct call + if (!llvm::getAlign(*(CS->getCalledFunction()), 0, retAlignment)) + retAlignment = getTargetData()->getABITypeAlignment(retTy); + } else { // indirect call + const CallInst *CallI = dyn_cast<CallInst>(CS->getInstruction()); + if (!llvm::getAlign(*CallI, 0, retAlignment)) + retAlignment = getTargetData()->getABITypeAlignment(retTy); + } + SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue DeclareRetOps[] = { Chain, DAG.getConstant(retAlignment, + MVT::i32), + DAG.getConstant(resultsz/8, MVT::i32), + DAG.getConstant(0, MVT::i32), InFlag }; + Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs, + DeclareRetOps, 5); + InFlag = Chain.getValue(1); + } + } + } + + if (!Func) { + // This is the indirect function call case: PTX requires a prototype of the + // form + // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _); + // to be emitted, and the label has to be used as the last arg of the call + // instruction. + // The prototype is embedded in a string and put as the operand for an + // INLINEASM SDNode. + SDVTList InlineAsmVTs = DAG.getVTList(MVT::Other, MVT::Glue); + std::string proto_string = getPrototype(retTy, Args, Outs, retAlignment); + const char *asmstr = nvTM->getManagedStrPool()-> + getManagedString(proto_string.c_str())->c_str(); + SDValue InlineAsmOps[] = { Chain, + DAG.getTargetExternalSymbol(asmstr, + getPointerTy()), + DAG.getMDNode(0), + DAG.getTargetConstant(0, MVT::i32), InFlag }; + Chain = DAG.getNode(ISD::INLINEASM, dl, InlineAsmVTs, InlineAsmOps, 5); + InFlag = Chain.getValue(1); + } + // Op to just print "call" + SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue PrintCallOps[] = { Chain, + DAG.getConstant(isABI ? ((Ins.size()==0) ? 
0 : 1) + : retCount, MVT::i32), + InFlag }; + Chain = DAG.getNode(Func?(NVPTXISD::PrintCallUni):(NVPTXISD::PrintCall), dl, + PrintCallVTs, PrintCallOps, 3); + InFlag = Chain.getValue(1); + + // Ops to print out the function name + SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CallVoidOps[] = { Chain, Callee, InFlag }; + Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps, 3); + InFlag = Chain.getValue(1); + + // Ops to print out the param list + SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CallArgBeginOps[] = { Chain, InFlag }; + Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs, + CallArgBeginOps, 2); + InFlag = Chain.getValue(1); + + for (unsigned i=0, e=paramCount; i!=e; ++i) { + unsigned opcode; + if (i==(e-1)) + opcode = NVPTXISD::LastCallArg; + else + opcode = NVPTXISD::CallArg; + SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CallArgOps[] = { Chain, DAG.getConstant(1, MVT::i32), + DAG.getConstant(i, MVT::i32), + InFlag }; + Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps, 4); + InFlag = Chain.getValue(1); + } + SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CallArgEndOps[] = { Chain, + DAG.getConstant(Func ? 1 : 0, MVT::i32), + InFlag }; + Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps, + 3); + InFlag = Chain.getValue(1); + + if (!Func) { + SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue PrototypeOps[] = { Chain, + DAG.getConstant(uniqueCallSite, MVT::i32), + InFlag }; + Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps, 3); + InFlag = Chain.getValue(1); + } + + // Generate loads from param memory/moves from registers for result + if (Ins.size() > 0) { + if (isABI) { + unsigned resoffset = 0; + for (unsigned i=0,e=Ins.size(); i!=e; ++i) { + unsigned sz = Ins[i].VT.getSizeInBits(); + if (Ins[i].VT.isInteger() && (sz < 8)) sz = 8; + std::vector<EVT> LoadRetVTs; + LoadRetVTs.push_back(Ins[i].VT); + LoadRetVTs.push_back(MVT::Other); LoadRetVTs.push_back(MVT::Glue); + std::vector<SDValue> LoadRetOps; + LoadRetOps.push_back(Chain); + LoadRetOps.push_back(DAG.getConstant(1, MVT::i32)); + LoadRetOps.push_back(DAG.getConstant(resoffset, MVT::i32)); + LoadRetOps.push_back(InFlag); + SDValue retval = DAG.getNode(NVPTXISD::LoadParam, dl, LoadRetVTs, + &LoadRetOps[0], LoadRetOps.size()); + Chain = retval.getValue(1); + InFlag = retval.getValue(2); + InVals.push_back(retval); + resoffset += sz/8; + } + } + else { + SmallVector<EVT, 16> resvtparts; + ComputeValueVTs(*this, retTy, resvtparts); + + assert(Ins.size() == resvtparts.size() && + "Unexpected number of return values in non-ABI case"); + unsigned paramNum = 0; + for (unsigned i=0,e=Ins.size(); i!=e; ++i) { + assert(EVT(Ins[i].VT) == resvtparts[i] && + "Unexpected EVT type in non-ABI case"); + unsigned numelems = 1; + EVT elemtype = Ins[i].VT; + if (Ins[i].VT.isVector()) { + numelems = Ins[i].VT.getVectorNumElements(); + elemtype = Ins[i].VT.getVectorElementType(); + } + std::vector<SDValue> tempRetVals; + for (unsigned j=0; j<numelems; ++j) { + std::vector<EVT> MoveRetVTs; + MoveRetVTs.push_back(elemtype); + MoveRetVTs.push_back(MVT::Other); MoveRetVTs.push_back(MVT::Glue); + std::vector<SDValue> MoveRetOps; + MoveRetOps.push_back(Chain); + MoveRetOps.push_back(DAG.getConstant(0, MVT::i32)); + MoveRetOps.push_back(DAG.getConstant(paramNum, MVT::i32)); + MoveRetOps.push_back(InFlag); + SDValue retval = 
DAG.getNode(NVPTXISD::LoadParam, dl, MoveRetVTs, + &MoveRetOps[0], MoveRetOps.size()); + Chain = retval.getValue(1); + InFlag = retval.getValue(2); + tempRetVals.push_back(retval); + ++paramNum; + } + if (Ins[i].VT.isVector()) + InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, Ins[i].VT, + &tempRetVals[0], tempRetVals.size())); + else + InVals.push_back(tempRetVals[0]); + } + } + } + Chain = DAG.getCALLSEQ_END(Chain, + DAG.getIntPtrConstant(uniqueCallSite, true), + DAG.getIntPtrConstant(uniqueCallSite+1, true), + InFlag); + uniqueCallSite++; + + // set isTailCall to false for now, until we figure out how to express + // tail call optimization in PTX + isTailCall = false; + return Chain; +} + +// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() +// (see LegalizeDAG.cpp). This is slow and uses local memory. +// We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5 +SDValue NVPTXTargetLowering:: +LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { + SDNode *Node = Op.getNode(); + DebugLoc dl = Node->getDebugLoc(); + SmallVector<SDValue, 8> Ops; + unsigned NumOperands = Node->getNumOperands(); + for (unsigned i=0; i < NumOperands; ++i) { + SDValue SubOp = Node->getOperand(i); + EVT VVT = SubOp.getNode()->getValueType(0); + EVT EltVT = VVT.getVectorElementType(); + unsigned NumSubElem = VVT.getVectorNumElements(); + for (unsigned j=0; j < NumSubElem; ++j) { + Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp, + DAG.getIntPtrConstant(j))); + } + } + return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), + &Ops[0], Ops.size()); +} + +SDValue NVPTXTargetLowering:: +LowerOperation(SDValue Op, SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + case ISD::RETURNADDR: return SDValue(); + case ISD::FRAMEADDR: return SDValue(); + case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); + case ISD::INTRINSIC_W_CHAIN: return Op; + case ISD::BUILD_VECTOR: + case ISD::EXTRACT_SUBVECTOR: + return Op; + case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + default: + llvm_unreachable("Custom lowering not defined for operation"); + } +} + +SDValue +NVPTXTargetLowering::getExtSymb(SelectionDAG &DAG, const char *inname, int idx, + EVT v) const { + std::string *name = nvTM->getManagedStrPool()->getManagedString(inname); + std::stringstream suffix; + suffix << idx; + *name += suffix.str(); + return DAG.getTargetExternalSymbol(name->c_str(), v); +} + +SDValue +NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const { + return getExtSymb(DAG, ".PARAM", idx, v); +} + +SDValue +NVPTXTargetLowering::getParamHelpSymbol(SelectionDAG &DAG, int idx) { + return getExtSymb(DAG, ".HLPPARAM", idx); +} + +// Check to see if the kernel argument is image*_t or sampler_t + +bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) { + static const char *const specialTypes[] = { + "struct._image2d_t", + "struct._image3d_t", + "struct._sampler_t" + }; + + const Type *Ty = arg->getType(); + const PointerType *PTy = dyn_cast<PointerType>(Ty); + + if (!PTy) + return false; + + if (!context) + return false; + + const StructType *STy = dyn_cast<StructType>(PTy->getElementType()); + const std::string TypeName = STy ? 
STy->getName() : ""; + + for (int i = 0, e = array_lengthof(specialTypes); i != e; ++i) + if (TypeName == specialTypes[i]) + return true; + + return false; +} + +SDValue +NVPTXTargetLowering::LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + MachineFunction &MF = DAG.getMachineFunction(); + const TargetData *TD = getTargetData(); + + const Function *F = MF.getFunction(); + const AttrListPtr &PAL = F->getAttributes(); + + SDValue Root = DAG.getRoot(); + std::vector<SDValue> OutChains; + + bool isKernel = llvm::isKernelFunction(*F); + bool isABI = (nvptxSubtarget.getSmVersion() >= 20); + + std::vector<Type *> argTypes; + std::vector<const Argument *> theArgs; + for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; ++I) { + theArgs.push_back(I); + argTypes.push_back(I->getType()); + } + assert(argTypes.size() == Ins.size() && + "Ins types and function types did not match"); + + int idx = 0; + for (unsigned i=0, e=Ins.size(); i!=e; ++i, ++idx) { + Type *Ty = argTypes[i]; + EVT ObjectVT = getValueType(Ty); + assert(ObjectVT == Ins[i].VT && + "Ins type did not match function type"); + + // If the kernel argument is image*_t or sampler_t, convert it to + // a i32 constant holding the parameter position. This can later + // matched in the AsmPrinter to output the correct mangled name. + if (isImageOrSamplerVal(theArgs[i], + (theArgs[i]->getParent() ? + theArgs[i]->getParent()->getParent() : 0))) { + assert(isKernel && "Only kernels can have image/sampler params"); + InVals.push_back(DAG.getConstant(i+1, MVT::i32)); + continue; + } + + if (theArgs[i]->use_empty()) { + // argument is dead + InVals.push_back(DAG.getNode(ISD::UNDEF, dl, ObjectVT)); + continue; + } + + // In the following cases, assign a node order of "idx+1" + // to newly created nodes. The SDNOdes for params have to + // appear in the same order as their order of appearance + // in the original function. "idx+1" holds that order. + if (PAL.paramHasAttr(i+1, Attribute::ByVal) == false) { + // A plain scalar. + if (isABI || isKernel) { + // If ABI, load from the param symbol + SDValue Arg = getParamSymbol(DAG, idx); + Value *srcValue = new Argument(PointerType::get(ObjectVT.getTypeForEVT( + F->getContext()), + llvm::ADDRESS_SPACE_PARAM)); + SDValue p = DAG.getLoad(ObjectVT, dl, Root, Arg, + MachinePointerInfo(srcValue), false, false, + false, + TD->getABITypeAlignment(ObjectVT.getTypeForEVT( + F->getContext()))); + if (p.getNode()) + DAG.AssignOrdering(p.getNode(), idx+1); + InVals.push_back(p); + } + else { + // If no ABI, just move the param symbol + SDValue Arg = getParamSymbol(DAG, idx, ObjectVT); + SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); + if (p.getNode()) + DAG.AssignOrdering(p.getNode(), idx+1); + InVals.push_back(p); + } + continue; + } + + // Param has ByVal attribute + if (isABI || isKernel) { + // Return MoveParam(param symbol). + // Ideally, the param symbol can be returned directly, + // but when SDNode builder decides to use it in a CopyToReg(), + // machine instruction fails because TargetExternalSymbol + // (not lowered) is target dependent, and CopyToReg assumes + // the source is lowered. 
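+// (Illustrative sketch, with a hypothetical kernel name: a byval kernel
+// argument is declared in PTX as something like
+//   .param .align 4 .b8 mykernel_param_0[16]
+// and read with ld.param; the MoveParam node keeps the param symbol in a
+// form that instruction selection can consume.)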
+ SDValue Arg = getParamSymbol(DAG, idx, getPointerTy()); + SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); + if (p.getNode()) + DAG.AssignOrdering(p.getNode(), idx+1); + if (isKernel) + InVals.push_back(p); + else { + SDValue p2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT, + DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, MVT::i32), + p); + InVals.push_back(p2); + } + } else { + // Have to move a set of param symbols to registers and + // store them locally and return the local pointer in InVals + const PointerType *elemPtrType = dyn_cast<PointerType>(argTypes[i]); + assert(elemPtrType && + "Byval parameter should be a pointer type"); + Type *elemType = elemPtrType->getElementType(); + // Compute the constituent parts + SmallVector<EVT, 16> vtparts; + SmallVector<uint64_t, 16> offsets; + ComputeValueVTs(*this, elemType, vtparts, &offsets, 0); + unsigned totalsize = 0; + for (unsigned j=0, je=vtparts.size(); j!=je; ++j) + totalsize += vtparts[j].getStoreSizeInBits(); + SDValue localcopy = DAG.getFrameIndex(MF.getFrameInfo()-> + CreateStackObject(totalsize/8, 16, false), + getPointerTy()); + unsigned sizesofar = 0; + std::vector<SDValue> theChains; + for (unsigned j=0, je=vtparts.size(); j!=je; ++j) { + unsigned numElems = 1; + if (vtparts[j].isVector()) numElems = vtparts[j].getVectorNumElements(); + for (unsigned k=0, ke=numElems; k!=ke; ++k) { + EVT tmpvt = vtparts[j]; + if (tmpvt.isVector()) tmpvt = tmpvt.getVectorElementType(); + SDValue arg = DAG.getNode(NVPTXISD::MoveParam, dl, tmpvt, + getParamSymbol(DAG, idx, tmpvt)); + SDValue addr = DAG.getNode(ISD::ADD, dl, getPointerTy(), localcopy, + DAG.getConstant(sizesofar, getPointerTy())); + theChains.push_back(DAG.getStore(Chain, dl, arg, addr, + MachinePointerInfo(), false, false, 0)); + sizesofar += tmpvt.getStoreSizeInBits()/8; + ++idx; + } + } + --idx; + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &theChains[0], + theChains.size()); + InVals.push_back(localcopy); + } + } + + // Clang will check explicit varargs and issue an error if any are present. + // However, Clang will let code with an implicit vararg prototype, like f(), + // pass. We treat this case as if the arg list is empty. + //if (F.isVarArg()) { + // assert(0 && "VarArg not supported yet!"); + //} + + if (!OutChains.empty()) + DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &OutChains[0], OutChains.size())); + + return Chain; +} + +SDValue +NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + DebugLoc dl, SelectionDAG &DAG) const { + + bool isABI = (nvptxSubtarget.getSmVersion() >= 20); + + unsigned sizesofar = 0; + unsigned idx = 0; + for (unsigned i=0, e=Outs.size(); i!=e; ++i) { + SDValue theVal = OutVals[i]; + EVT theValType = theVal.getValueType(); + unsigned numElems = 1; + if (theValType.isVector()) numElems = theValType.getVectorNumElements(); + for (unsigned j=0,je=numElems; j!=je; ++j) { + SDValue tmpval = theVal; + if (theValType.isVector()) + tmpval = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + theValType.getVectorElementType(), + tmpval, DAG.getIntPtrConstant(j)); + Chain = DAG.getNode(isABI ? NVPTXISD::StoreRetval : NVPTXISD::MoveToRetval, + dl, MVT::Other, + Chain, + DAG.getConstant(isABI ? 
sizesofar : idx, MVT::i32), + tmpval); + if (theValType.isVector()) + sizesofar += theValType.getVectorElementType().getStoreSizeInBits()/8; + else + sizesofar += theValType.getStoreSizeInBits()/8; + ++idx; + } + } + + return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain); +} + +void +NVPTXTargetLowering::LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const +{ + if (Constraint.length() > 1) + return; + else + TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); +} + +// NVPTX supports vectors of legal types of any length in intrinsics, because +// the NVPTX-specific type legalizer will legalize them to the PTX-supported +// length. +bool +NVPTXTargetLowering::isTypeSupportedInIntrinsic(MVT VT) const { + if (isTypeLegal(VT)) + return true; + if (VT.isVector()) { + MVT eVT = VT.getVectorElementType(); + if (isTypeLegal(eVT)) + return true; + } + return false; +} + + +// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as +// TgtMemIntrinsic because we need information that is only available in the +// "Value" type of the destination pointer - in particular, the address space +// information. +bool +NVPTXTargetLowering::getTgtMemIntrinsic(IntrinsicInfo& Info, const CallInst &I, + unsigned Intrinsic) const { + switch (Intrinsic) { + default: + return false; + + case Intrinsic::nvvm_atomic_load_add_f32: + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::f32; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.vol = 0; + Info.readMem = true; + Info.writeMem = true; + Info.align = 0; + return true; + + case Intrinsic::nvvm_atomic_load_inc_32: + case Intrinsic::nvvm_atomic_load_dec_32: + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::i32; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.vol = 0; + Info.readMem = true; + Info.writeMem = true; + Info.align = 0; + return true; + + case Intrinsic::nvvm_ldu_global_i: + case Intrinsic::nvvm_ldu_global_f: + case Intrinsic::nvvm_ldu_global_p: + + Info.opc = ISD::INTRINSIC_W_CHAIN; + if (Intrinsic == Intrinsic::nvvm_ldu_global_i) + Info.memVT = MVT::i32; + else if (Intrinsic == Intrinsic::nvvm_ldu_global_p) + Info.memVT = getPointerTy(); + else + Info.memVT = MVT::f32; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.vol = 0; + Info.readMem = true; + Info.writeMem = false; + Info.align = 0; + return true; + + } + return false; +} + +/// isLegalAddressingMode - Return true if the addressing mode represented +/// by AM is legal for this target, for a load/store of the specified type. +/// Used to guide target-specific optimizations, like loop strength reduction +/// (LoopStrengthReduce.cpp) and memory optimization for address mode +/// (CodeGenPrepare.cpp) +bool +NVPTXTargetLowering::isLegalAddressingMode(const AddrMode &AM, + Type *Ty) const { + + // AddrMode - This represents an addressing mode of: + // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + // + // The legal address modes are + // - [avar] + // - [areg] + // - [areg+immoff] + // - [immAddr] + + if (AM.BaseGV) { + if (AM.BaseOffs || AM.HasBaseReg || AM.Scale) + return false; + return true; + } + + switch (AM.Scale) { + case 0: // "r", "r+i" or "i" is allowed + break; + case 1: + if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed. + return false; + // Otherwise we have r+i. 
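+    // Illustrative PTX for the legal modes listed above (hypothetical
+    // operands):
+    //   [avar]        ld.global.f32 %f0, [gbl];
+    //   [areg]        ld.global.f32 %f0, [%r1];
+    //   [areg+immoff] ld.global.f32 %f0, [%r1+4];
+    //   [immAddr]     ld.global.f32 %f0, [4];
+    // A scaled index register has no PTX equivalent, which is what the
+    // Scale checks here reject.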
+ break; + default: + // No scale > 1 is allowed + return false; + } + return true; +} + +//===----------------------------------------------------------------------===// +// NVPTX Inline Assembly Support +//===----------------------------------------------------------------------===// + +/// getConstraintType - Given a constraint letter, return the type of +/// constraint it is for this target. +NVPTXTargetLowering::ConstraintType +NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: + break; + case 'r': + case 'h': + case 'c': + case 'l': + case 'f': + case 'd': + case '0': + case 'N': + return C_RegisterClass; + } + } + return TargetLowering::getConstraintType(Constraint); +} + + +std::pair<unsigned, const TargetRegisterClass*> +NVPTXTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, + EVT VT) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'c': + return std::make_pair(0U, &NVPTX::Int8RegsRegClass); + case 'h': + return std::make_pair(0U, &NVPTX::Int16RegsRegClass); + case 'r': + return std::make_pair(0U, &NVPTX::Int32RegsRegClass); + case 'l': + case 'N': + return std::make_pair(0U, &NVPTX::Int64RegsRegClass); + case 'f': + return std::make_pair(0U, &NVPTX::Float32RegsRegClass); + case 'd': + return std::make_pair(0U, &NVPTX::Float64RegsRegClass); + } + } + return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); +} + + + +/// getFunctionAlignment - Return the Log2 alignment of this function. +unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const { + return 4; +} diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h new file mode 100644 index 0000000..86246e6 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXISelLowering.h @@ -0,0 +1,144 @@ +//===-- NVPTXISelLowering.h - NVPTX DAG Lowering Interface ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that NVPTX uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + +#ifndef NVPTXISELLOWERING_H +#define NVPTXISELLOWERING_H + +#include "NVPTX.h" +#include "NVPTXSubtarget.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm { +namespace NVPTXISD { +enum NodeType { + // Start the numbering from where ISD NodeType finishes. 
+ FIRST_NUMBER = ISD::BUILTIN_OP_END, + Wrapper, + CALL, + RET_FLAG, + LOAD_PARAM, + NVBuiltin, + DeclareParam, + DeclareScalarParam, + DeclareRetParam, + DeclareRet, + DeclareScalarRet, + LoadParam, + StoreParam, + StoreParamS32, // to sext and store a <32bit value, not used currently + StoreParamU32, // to zext and store a <32bit value, not used currently + MoveToParam, + PrintCall, + PrintCallUni, + CallArgBegin, + CallArg, + LastCallArg, + CallArgEnd, + CallVoid, + CallVal, + CallSymbol, + Prototype, + MoveParam, + MoveRetval, + MoveToRetval, + StoreRetval, + PseudoUseParam, + RETURN, + CallSeqBegin, + CallSeqEnd, + Dummy +}; +} + +//===--------------------------------------------------------------------===// +// TargetLowering Implementation +//===--------------------------------------------------------------------===// +class NVPTXTargetLowering : public TargetLowering { +public: + explicit NVPTXTargetLowering(NVPTXTargetMachine &TM); + virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(const GlobalValue *GV, int64_t Offset, + SelectionDAG &DAG) const; + + virtual const char *getTargetNodeName(unsigned Opcode) const; + + bool isTypeSupportedInIntrinsic(MVT VT) const; + + bool getTgtMemIntrinsic(IntrinsicInfo& Info, const CallInst &I, + unsigned Intrinsic) const; + + /// isLegalAddressingMode - Return true if the addressing mode represented + /// by AM is legal for this target, for a load/store of the specified type + /// Used to guide target specific optimizations, like loop strength + /// reduction (LoopStrengthReduce.cpp) and memory optimization for + /// address mode (CodeGenPrepare.cpp) + virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const; + + /// getFunctionAlignment - Return the Log2 alignment of this function. 
+ virtual unsigned getFunctionAlignment(const Function *F) const; + + virtual EVT getSetCCResultType(EVT VT) const { + return MVT::i1; + } + + ConstraintType getConstraintType(const std::string &Constraint) const; + std::pair<unsigned, const TargetRegisterClass*> + getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const; + + virtual SDValue + LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, + SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + + virtual SDValue + LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; + + std::string getPrototype(Type *, const ArgListTy &, + const SmallVectorImpl<ISD::OutputArg> &, + unsigned retAlignment) const; + + virtual SDValue + LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, + SelectionDAG &DAG) const; + + virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const; + + NVPTXTargetMachine *nvTM; + + // PTX always uses 32-bit shift amounts + virtual MVT getShiftAmountTy(EVT LHSTy) const { + return MVT::i32; + } + +private: + const NVPTXSubtarget &nvptxSubtarget; // cache the subtarget here + + SDValue getExtSymb(SelectionDAG &DAG, const char *name, int idx, EVT = + MVT::i32) const; + SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT = MVT::i32) const; + SDValue getParamHelpSymbol(SelectionDAG &DAG, int idx); + + SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; +}; +} // namespace llvm + +#endif // NVPTXISELLOWERING_H diff --git a/lib/Target/NVPTX/NVPTXInstrFormats.td b/lib/Target/NVPTX/NVPTXInstrFormats.td new file mode 100644 index 0000000..f11f1b8 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXInstrFormats.td @@ -0,0 +1,43 @@ +//===- NVPTXInstrFormats.td - NVPTX Instruction Formats-------*- tblgen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Describe the NVPTX instruction format +// +//===----------------------------------------------------------------------===// + +// Vector instruction type enum +class VecInstTypeEnum<bits<4> val> { + bits<4> Value=val; +} +def VecNOP : VecInstTypeEnum<0>; + +// Generic NVPTX Format + +class NVPTXInst<dag outs, dag ins, string asmstr, list<dag> pattern> + : Instruction { + field bits<14> Inst; + + let Namespace = "NVPTX"; + dag OutOperandList = outs; + dag InOperandList = ins; + let AsmString = asmstr; + let Pattern = pattern; + + // TSFlag fields + bits<4> VecInstType = VecNOP.Value; + bit IsSimpleMove = 0; + bit IsLoad = 0; + bit IsStore = 0; + + let TSFlags{3-0} = VecInstType; + let TSFlags{4-4} = IsSimpleMove; + let TSFlags{5-5} = IsLoad; + let TSFlags{6-6} = IsStore; +} diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp new file mode 100644 index 0000000..cd50deb --- /dev/null +++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -0,0 +1,326 @@ +//===- NVPTXInstrInfo.cpp - NVPTX Instruction Information -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the NVPTX implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "NVPTX.h" +#include "NVPTXInstrInfo.h" +#include "NVPTXTargetMachine.h" +#define GET_INSTRINFO_CTOR +#include "NVPTXGenInstrInfo.inc" +#include "llvm/Function.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include <cstdio> + + +using namespace llvm; + +// FIXME: Add subtarget support to this constructor.
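+// A note on TSFlags: the fields declared in NVPTXInstrFormats.td above
+// (VecInstType, IsSimpleMove, IsLoad, IsStore) are recovered later in this
+// file with mask/shift pairs, e.g. isMoveInstr() computes roughly:
+//   bool isMove = ((MI.getDesc().TSFlags & NVPTX::SimpleMoveMask)
+//                  >> NVPTX::SimpleMoveShift) == 1;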
+NVPTXInstrInfo::NVPTXInstrInfo(NVPTXTargetMachine &tm) +: NVPTXGenInstrInfo(), + TM(tm), + RegInfo(*this, *TM.getSubtargetImpl()) {} + + +void NVPTXInstrInfo::copyPhysReg (MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + if (NVPTX::Int32RegsRegClass.contains(DestReg) && + NVPTX::Int32RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::IMOV32rr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::Int8RegsRegClass.contains(DestReg) && + NVPTX::Int8RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::IMOV8rr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::Int1RegsRegClass.contains(DestReg) && + NVPTX::Int1RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::IMOV1rr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::Float32RegsRegClass.contains(DestReg) && + NVPTX::Float32RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::FMOV32rr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::Int16RegsRegClass.contains(DestReg) && + NVPTX::Int16RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::IMOV16rr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::Int64RegsRegClass.contains(DestReg) && + NVPTX::Int64RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::IMOV64rr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::Float64RegsRegClass.contains(DestReg) && + NVPTX::Float64RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::FMOV64rr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::V4F32RegsRegClass.contains(DestReg) && + NVPTX::V4F32RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::V4f32Mov), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::V4I32RegsRegClass.contains(DestReg) && + NVPTX::V4I32RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::V4i32Mov), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::V2F32RegsRegClass.contains(DestReg) && + NVPTX::V2F32RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::V2f32Mov), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::V2I32RegsRegClass.contains(DestReg) && + NVPTX::V2I32RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::V2i32Mov), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::V4I8RegsRegClass.contains(DestReg) && + NVPTX::V4I8RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::V4i8Mov), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::V2I8RegsRegClass.contains(DestReg) && + NVPTX::V2I8RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::V2i8Mov), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::V4I16RegsRegClass.contains(DestReg) && + NVPTX::V4I16RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::V4i16Mov), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::V2I16RegsRegClass.contains(DestReg) && + NVPTX::V2I16RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::V2i16Mov), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::V2I64RegsRegClass.contains(DestReg) && + NVPTX::V2I64RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::V2i64Mov), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::V2F64RegsRegClass.contains(DestReg) && + 
NVPTX::V2F64RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::V2f64Mov), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else { + llvm_unreachable("Don't know how to copy a register"); + } +} + +bool NVPTXInstrInfo::isMoveInstr(const MachineInstr &MI, + unsigned &SrcReg, + unsigned &DestReg) const { + // Look for the appropriate part of TSFlags + bool isMove = false; + + unsigned TSFlags = (MI.getDesc().TSFlags & NVPTX::SimpleMoveMask) >> + NVPTX::SimpleMoveShift; + isMove = (TSFlags == 1); + + if (isMove) { + MachineOperand dest = MI.getOperand(0); + MachineOperand src = MI.getOperand(1); + assert(dest.isReg() && "dest of a movrr is not a reg"); + assert(src.isReg() && "src of a movrr is not a reg"); + + SrcReg = src.getReg(); + DestReg = dest.getReg(); + return true; + } + + return false; +} + +bool NVPTXInstrInfo::isReadSpecialReg(MachineInstr &MI) const +{ + switch (MI.getOpcode()) { + default: return false; + case NVPTX::INT_PTX_SREG_NTID_X: + case NVPTX::INT_PTX_SREG_NTID_Y: + case NVPTX::INT_PTX_SREG_NTID_Z: + case NVPTX::INT_PTX_SREG_TID_X: + case NVPTX::INT_PTX_SREG_TID_Y: + case NVPTX::INT_PTX_SREG_TID_Z: + case NVPTX::INT_PTX_SREG_CTAID_X: + case NVPTX::INT_PTX_SREG_CTAID_Y: + case NVPTX::INT_PTX_SREG_CTAID_Z: + case NVPTX::INT_PTX_SREG_NCTAID_X: + case NVPTX::INT_PTX_SREG_NCTAID_Y: + case NVPTX::INT_PTX_SREG_NCTAID_Z: + case NVPTX::INT_PTX_SREG_WARPSIZE: + return true; + } +} + + +bool NVPTXInstrInfo::isLoadInstr(const MachineInstr &MI, + unsigned &AddrSpace) const { + bool isLoad = false; + unsigned TSFlags = (MI.getDesc().TSFlags & NVPTX::isLoadMask) >> + NVPTX::isLoadShift; + isLoad = (TSFlags == 1); + if (isLoad) + AddrSpace = getLdStCodeAddrSpace(MI); + return isLoad; +} + +bool NVPTXInstrInfo::isStoreInstr(const MachineInstr &MI, + unsigned &AddrSpace) const { + bool isStore = false; + unsigned TSFlags = (MI.getDesc().TSFlags & NVPTX::isStoreMask) >> + NVPTX::isStoreShift; + isStore = (TSFlags == 1); + if (isStore) + AddrSpace = getLdStCodeAddrSpace(MI); + return isStore; +} + + +bool NVPTXInstrInfo::CanTailMerge(const MachineInstr *MI) const { + unsigned addrspace = 0; + if (MI->getOpcode() == NVPTX::INT_CUDA_SYNCTHREADS) + return false; + if (isLoadInstr(*MI, addrspace)) + if (addrspace == NVPTX::PTXLdStInstCode::SHARED) + return false; + if (isStoreInstr(*MI, addrspace)) + if (addrspace == NVPTX::PTXLdStInstCode::SHARED) + return false; + return true; +} + + +/// AnalyzeBranch - Analyze the branching code at the end of MBB, returning +/// true if it cannot be understood (e.g. it's a switch dispatch or isn't +/// implemented for a target). Upon success, this returns false and returns +/// with the following information in various cases: +/// +/// 1. If this block ends with no branches (it just falls through to its succ) +/// just return false, leaving TBB/FBB null. +/// 2. If this block ends with only an unconditional branch, it sets TBB to be +/// the destination block. +/// 3. If this block ends with a conditional branch and it falls through to +/// a successor block, it sets TBB to be the branch destination block and a +/// list of operands that evaluate the condition. These +/// operands can be passed to other TargetInstrInfo methods to create new +/// branches. +/// 4. If this block ends with a conditional branch and an unconditional +/// branch, it returns the 'true' destination in TBB, the 'false' destination +/// in FBB, and a list of operands that evaluate the condition. 
These +/// operands can be passed to other TargetInstrInfo methods to create new +/// branches. +/// +/// Note that RemoveBranch and InsertBranch must be implemented to support +/// cases where this method returns success. +/// +bool NVPTXInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + return false; + + // Get the last instruction in the block. + MachineInstr *LastInst = I; + + // If there is only one terminator instruction, process it. + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (LastInst->getOpcode() == NVPTX::GOTO) { + TBB = LastInst->getOperand(0).getMBB(); + return false; + } else if (LastInst->getOpcode() == NVPTX::CBranch) { + // Block ends with fall-through condbranch. + TBB = LastInst->getOperand(1).getMBB(); + Cond.push_back(LastInst->getOperand(0)); + return false; + } + // Otherwise, don't know what this is. + return true; + } + + // Get the instruction before it if it's a terminator. + MachineInstr *SecondLastInst = I; + + // If there are three terminators, we don't know what sort of block this is. + if (SecondLastInst && I != MBB.begin() && + isUnpredicatedTerminator(--I)) + return true; + + // If the block ends with NVPTX::GOTO and NVPTX:CBranch, handle it. + if (SecondLastInst->getOpcode() == NVPTX::CBranch && + LastInst->getOpcode() == NVPTX::GOTO) { + TBB = SecondLastInst->getOperand(1).getMBB(); + Cond.push_back(SecondLastInst->getOperand(0)); + FBB = LastInst->getOperand(0).getMBB(); + return false; + } + + // If the block ends with two NVPTX:GOTOs, handle it. The second one is not + // executed, so remove it. + if (SecondLastInst->getOpcode() == NVPTX::GOTO && + LastInst->getOpcode() == NVPTX::GOTO) { + TBB = SecondLastInst->getOperand(0).getMBB(); + I = LastInst; + if (AllowModify) + I->eraseFromParent(); + return false; + } + + // Otherwise, can't handle this. + return true; +} + +unsigned NVPTXInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) return 0; + --I; + if (I->getOpcode() != NVPTX::GOTO && I->getOpcode() != NVPTX::CBranch) + return 0; + + // Remove the branch. + I->eraseFromParent(); + + I = MBB.end(); + + if (I == MBB.begin()) return 1; + --I; + if (I->getOpcode() != NVPTX::CBranch) + return 1; + + // Remove the branch. + I->eraseFromParent(); + return 2; +} + +unsigned +NVPTXInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const { + // Shouldn't be a fall through. + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + assert((Cond.size() == 1 || Cond.size() == 0) && + "NVPTX branch conditions have two components!"); + + // One-way branch. + if (FBB == 0) { + if (Cond.empty()) // Unconditional branch + BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(TBB); + else // Conditional branch + BuildMI(&MBB, DL, get(NVPTX::CBranch)) + .addReg(Cond[0].getReg()).addMBB(TBB); + return 1; + } + + // Two-way Conditional Branch. 
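+  // Assuming CBranch prints as a predicated bra and GOTO as an unconditional
+  // bra.uni, the pair built below should come out as, e.g.:
+  //   @%p1 bra $TBB;
+  //   bra.uni $FBB;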
+ BuildMI(&MBB, DL, get(NVPTX::CBranch)) + .addReg(Cond[0].getReg()).addMBB(TBB); + BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(FBB); + return 2; +} diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h new file mode 100644 index 0000000..7b8e218 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXInstrInfo.h @@ -0,0 +1,83 @@ +//===- NVPTXInstrInfo.h - NVPTX Instruction Information ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the NVPTX implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef NVPTXINSTRUCTIONINFO_H +#define NVPTXINSTRUCTIONINFO_H + +#include "NVPTX.h" +#include "NVPTXRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" + +#define GET_INSTRINFO_HEADER +#include "NVPTXGenInstrInfo.inc" + +namespace llvm { + +class NVPTXInstrInfo : public NVPTXGenInstrInfo +{ + NVPTXTargetMachine &TM; + const NVPTXRegisterInfo RegInfo; +public: + explicit NVPTXInstrInfo(NVPTXTargetMachine &TM); + + virtual const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; } + + /* The following virtual functions are used in register allocation. + * They are not implemented because the existing interface and the logic + * at the caller side do not work for the elementized vector load and store. + * + * virtual unsigned isLoadFromStackSlot(const MachineInstr *MI, + * int &FrameIndex) const; + * virtual unsigned isStoreToStackSlot(const MachineInstr *MI, + * int &FrameIndex) const; + * virtual void storeRegToStackSlot(MachineBasicBlock &MBB, + * MachineBasicBlock::iterator MBBI, + * unsigned SrcReg, bool isKill, int FrameIndex, + * const TargetRegisterClass *RC) const; + * virtual void loadRegFromStackSlot(MachineBasicBlock &MBB, + * MachineBasicBlock::iterator MBBI, + * unsigned DestReg, int FrameIndex, + * const TargetRegisterClass *RC) const; + */ + + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; + virtual bool isMoveInstr(const MachineInstr &MI, + unsigned &SrcReg, + unsigned &DestReg) const; + bool isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const; + bool isStoreInstr(const MachineInstr &MI, unsigned &AddrSpace) const; + bool isReadSpecialReg(MachineInstr &MI) const; + + virtual bool CanTailMerge(const MachineInstr *MI) const; + // Branch analysis. 
+ virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const; + virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; + virtual unsigned InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const; + unsigned getLdStCodeAddrSpace(const MachineInstr &MI) const { + return MI.getOperand(2).getImm(); + } + +}; + +} // namespace llvm + +#endif diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td new file mode 100644 index 0000000..8a410b8 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -0,0 +1,2837 @@ +//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the PTX instructions in TableGen format. +// +//===----------------------------------------------------------------------===// + +include "NVPTXInstrFormats.td" + +// A NOP instruction +def NOP : NVPTXInst<(outs), (ins), "", []>; + +// List of vector specific properties +def isVecLD : VecInstTypeEnum<1>; +def isVecST : VecInstTypeEnum<2>; +def isVecBuild : VecInstTypeEnum<3>; +def isVecShuffle : VecInstTypeEnum<4>; +def isVecExtract : VecInstTypeEnum<5>; +def isVecInsert : VecInstTypeEnum<6>; +def isVecDest : VecInstTypeEnum<7>; +def isVecOther : VecInstTypeEnum<15>; + +//===----------------------------------------------------------------------===// +// NVPTX Operand Definitions. 
+//===----------------------------------------------------------------------===//
+
+def brtarget : Operand<OtherVT>;
+
+//===----------------------------------------------------------------------===//
+// NVPTX Instruction Predicate Definitions
+//===----------------------------------------------------------------------===//
+
+
+def hasAtomRedG32 : Predicate<"Subtarget.hasAtomRedG32()">;
+def hasAtomRedS32 : Predicate<"Subtarget.hasAtomRedS32()">;
+def hasAtomRedGen32 : Predicate<"Subtarget.hasAtomRedGen32()">;
+def useAtomRedG32forGen32 :
+  Predicate<"!Subtarget.hasAtomRedGen32() && Subtarget.hasAtomRedG32()">;
+def hasBrkPt : Predicate<"Subtarget.hasBrkPt()">;
+def hasAtomRedG64 : Predicate<"Subtarget.hasAtomRedG64()">;
+def hasAtomRedS64 : Predicate<"Subtarget.hasAtomRedS64()">;
+def hasAtomRedGen64 : Predicate<"Subtarget.hasAtomRedGen64()">;
+def useAtomRedG64forGen64 :
+  Predicate<"!Subtarget.hasAtomRedGen64() && Subtarget.hasAtomRedG64()">;
+def hasAtomAddF32 : Predicate<"Subtarget.hasAtomAddF32()">;
+def hasVote : Predicate<"Subtarget.hasVote()">;
+def hasDouble : Predicate<"Subtarget.hasDouble()">;
+def reqPTX20 : Predicate<"Subtarget.reqPTX20()">;
+def hasLDU : Predicate<"Subtarget.hasLDU()">;
+def hasGenericLdSt : Predicate<"Subtarget.hasGenericLdSt()">;
+
+def doF32FTZ : Predicate<"UseF32FTZ">;
+
+def doFMAF32 : Predicate<"doFMAF32">;
+def doFMAF32_ftz : Predicate<"(doFMAF32 && UseF32FTZ)">;
+def doFMAF32AGG : Predicate<"doFMAF32AGG">;
+def doFMAF32AGG_ftz : Predicate<"(doFMAF32AGG && UseF32FTZ)">;
+def doFMAF64 : Predicate<"doFMAF64">;
+def doFMAF64AGG : Predicate<"doFMAF64AGG">;
+def doFMADF32 : Predicate<"doFMADF32">;
+def doFMADF32_ftz : Predicate<"(doFMADF32 && UseF32FTZ)">;
+
+def doMulWide : Predicate<"doMulWide">;
+
+def allowFMA : Predicate<"allowFMA">;
+def allowFMA_ftz : Predicate<"(allowFMA && UseF32FTZ)">;
+
+def do_DIVF32_APPROX : Predicate<"do_DIVF32_PREC==0">;
+def do_DIVF32_FULL : Predicate<"do_DIVF32_PREC==1">;
+
+def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">;
+
+def true : Predicate<"1">;
+
+//===----------------------------------------------------------------------===//
+// Special Handling for 8-bit Operands and Operations
+//
+// PTX supports 8-bit signed and unsigned types, but does not support 8-bit
+// operations (like add, shift, etc.) except for ld/st/cvt. SASS does not have
+// 8-bit registers.
+//
+// PTX ld, st and cvt instructions permit source and destination data operands
+// to be wider than the instruction-type size, so that narrow values may be
+// loaded, stored, and converted using regular-width registers.
+//
+// So in PTX generation, we
+// - always use 16-bit registers in place of 8-bit registers.
+//   (8-bit variables should stay as 8-bit as they represent memory layout.)
+// - for the following 8-bit operations, we sign-ext/zero-ext the 8-bit values
+//   before the operation:
+//   . div
+//   . rem
+//   . neg (sign)
+//   . set, setp
+//   . shr
+//
+// We patch these operations by inserting the cvt instructions into the
+// asm strings of the affected instructions.
+//
+// Since vector operations, except for ld/st, are eventually elementized, we
+// do not need to special-case the vector 8-bit operations.
+//
+//
+//===----------------------------------------------------------------------===//
+
+// Generate string block like
+// {
+//   .reg .s16 %temp1;
+//   .reg .s16 %temp2;
+//   cvt.s16.s8 %temp1, %a;
+//   cvt.s16.s8 %temp2, %b;
+//   opc.s16 %dst, %temp1, %temp2;
+// }
+// when OpcStr=opc.s TypeStr=s16 CVTStr=cvt.s16.s8
+class Handle_i8rr<string OpcStr, string TypeStr, string CVTStr> {
+  string s = !strconcat("{{\n\t",
+             !strconcat(".reg .", !strconcat(TypeStr,
+             !strconcat(" \t%temp1;\n\t",
+             !strconcat(".reg .", !strconcat(TypeStr,
+             !strconcat(" \t%temp2;\n\t",
+             !strconcat(CVTStr, !strconcat(" \t%temp1, $a;\n\t",
+             !strconcat(CVTStr, !strconcat(" \t%temp2, $b;\n\t",
+             !strconcat(OpcStr, "16 \t$dst, %temp1, %temp2;\n\t}}"))))))))))));
+}
+
+// Generate string block like
+// {
+//   .reg .s16 %temp1;
+//   .reg .s16 %temp2;
+//   cvt.s16.s8 %temp1, %a;
+//   mov.b16 %temp2, %b;
+//   cvt.s16.s8 %temp2, %temp2;
+//   opc.s16 %dst, %temp1, %temp2;
+// }
+// when OpcStr=opc.s TypeStr=s16 CVTStr=cvt.s16.s8
+class Handle_i8ri<string OpcStr, string TypeStr, string CVTStr> {
+  string s = !strconcat("{{\n\t",
+             !strconcat(".reg .", !strconcat(TypeStr,
+             !strconcat(" \t%temp1;\n\t",
+             !strconcat(".reg .",
+             !strconcat(TypeStr, !strconcat(" \t%temp2;\n\t",
+             !strconcat(CVTStr, !strconcat(" \t%temp1, $a;\n\t",
+             !strconcat("mov.b16 \t%temp2, $b;\n\t",
+             !strconcat(CVTStr, !strconcat(" \t%temp2, %temp2;\n\t",
+             !strconcat(OpcStr, "16 \t$dst, %temp1, %temp2;\n\t}}")))))))))))));
+}
+
+// Generate string block like
+// {
+//   .reg .s16 %temp1;
+//   .reg .s16 %temp2;
+//   mov.b16 %temp1, %a;
+//   cvt.s16.s8 %temp1, %temp1;
+//   cvt.s16.s8 %temp2, %b;
+//   opc.s16 %dst, %temp1, %temp2;
+// }
+// when OpcStr=opc.s TypeStr=s16 CVTStr=cvt.s16.s8
+class Handle_i8ir<string OpcStr, string TypeStr, string CVTStr> {
+  string s = !strconcat("{{\n\t",
+             !strconcat(".reg .", !strconcat(TypeStr,
+             !strconcat(" \t%temp1;\n\t",
+             !strconcat(".reg .", !strconcat(TypeStr,
+             !strconcat(" \t%temp2;\n\t",
+             !strconcat("mov.b16 \t%temp1, $a;\n\t",
+             !strconcat(CVTStr, !strconcat(" \t%temp1, %temp1;\n\t",
+             !strconcat(CVTStr, !strconcat(" \t%temp2, $b;\n\t",
+             !strconcat(OpcStr, "16 \t$dst, %temp1, %temp2;\n\t}}")))))))))))));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Some Common Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+multiclass I3<string OpcStr, SDNode OpNode> {
+  def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+                        !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+                        [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+                          Int64Regs:$b))]>;
+  def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+                        !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+                        [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+  def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+                        !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                        [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+                          Int32Regs:$b))]>;
+  def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+                        !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                        [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+  def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+                        !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                        [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+                          Int16Regs:$b))]>;
+  def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+                        !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                        [(set Int16Regs:$dst,
(OpNode Int16Regs:$a, (imm):$b))]>; + def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int8Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>; + def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int8Regs:$dst, (OpNode Int8Regs:$a, (imm):$b))]>; +} + +multiclass I3_i8<string OpcStr, SDNode OpNode, string TypeStr, string CVTStr> { + def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, + Int64Regs:$b))]>; + def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; + def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, + Int32Regs:$b))]>; + def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; + def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, + Int16Regs:$b))]>; + def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>; + def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), + Handle_i8rr<OpcStr, TypeStr, CVTStr>.s, + [(set Int8Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>; + def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b), + Handle_i8ri<OpcStr, TypeStr, CVTStr>.s, + [(set Int8Regs:$dst, (OpNode Int8Regs:$a, (imm):$b))]>; +} + +multiclass I3_noi8<string OpcStr, SDNode OpNode> { + def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, + Int64Regs:$b))]>; + def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; + def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, + Int32Regs:$b))]>; + def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; + def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, + Int16Regs:$b))]>; + def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>; +} + +multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> { + def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, + Int32Regs:$b), + !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, + Int32Regs:$b))]>; + def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, 
imm:$b))]>; +} + +multiclass F3<string OpcStr, SDNode OpNode> { + def f64rr : NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, Float64Regs:$b), + !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), + [(set Float64Regs:$dst, + (OpNode Float64Regs:$a, Float64Regs:$b))]>, + Requires<[allowFMA]>; + def f64ri : NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, f64imm:$b), + !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), + [(set Float64Regs:$dst, + (OpNode Float64Regs:$a, fpimm:$b))]>, + Requires<[allowFMA]>; + def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, + (OpNode Float32Regs:$a, Float32Regs:$b))]>, + Requires<[allowFMA_ftz]>; + def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, + (OpNode Float32Regs:$a, fpimm:$b))]>, + Requires<[allowFMA_ftz]>; + def f32rr : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, + (OpNode Float32Regs:$a, Float32Regs:$b))]>, + Requires<[allowFMA]>; + def f32ri : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, + (OpNode Float32Regs:$a, fpimm:$b))]>, + Requires<[allowFMA]>; +} + +multiclass F3_rn<string OpcStr, SDNode OpNode> { + def f64rr : NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, Float64Regs:$b), + !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"), + [(set Float64Regs:$dst, + (OpNode Float64Regs:$a, Float64Regs:$b))]>; + def f64ri : NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, f64imm:$b), + !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"), + [(set Float64Regs:$dst, + (OpNode Float64Regs:$a, fpimm:$b))]>; + def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, + (OpNode Float32Regs:$a, Float32Regs:$b))]>, + Requires<[doF32FTZ]>; + def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, + (OpNode Float32Regs:$a, fpimm:$b))]>, + Requires<[doF32FTZ]>; + def f32rr : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, + (OpNode Float32Regs:$a, Float32Regs:$b))]>; + def f32ri : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, + (OpNode Float32Regs:$a, fpimm:$b))]>; +} + +multiclass F2<string OpcStr, SDNode OpNode> { + def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a), + !strconcat(OpcStr, ".f64 \t$dst, $a;"), + [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>; + def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a), + !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>, + Requires<[doF32FTZ]>; + def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a), + !strconcat(OpcStr, ".f32 \t$dst, $a;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>; +} + +//===----------------------------------------------------------------------===// +// NVPTX Instructions. 
+//===----------------------------------------------------------------------===// + +//----------------------------------- +// Integer Arithmetic +//----------------------------------- + +multiclass ADD_SUB_i1<SDNode OpNode> { + def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b), + "xor.pred \t$dst, $a, $b;", + [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>; + def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b), + "xor.pred \t$dst, $a, $b;", + [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>; +} + +defm ADD_i1 : ADD_SUB_i1<add>; +defm SUB_i1 : ADD_SUB_i1<sub>; + + +defm ADD : I3<"add.s", add>; +defm SUB : I3<"sub.s", sub>; + +defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>; +defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>; + +defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>; +defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>; + +//mul.wide PTX instruction +def SInt32Const : PatLeaf<(imm), [{ + const APInt &v = N->getAPIntValue(); + if (v.isSignedIntN(32)) + return true; + return false; +}]>; + +def UInt32Const : PatLeaf<(imm), [{ + const APInt &v = N->getAPIntValue(); + if (v.isIntN(32)) + return true; + return false; +}]>; + +def SInt16Const : PatLeaf<(imm), [{ + const APInt &v = N->getAPIntValue(); + if (v.isSignedIntN(16)) + return true; + return false; +}]>; + +def UInt16Const : PatLeaf<(imm), [{ + const APInt &v = N->getAPIntValue(); + if (v.isIntN(16)) + return true; + return false; +}]>; + +def Int5Const : PatLeaf<(imm), [{ + const APInt &v = N->getAPIntValue(); + // Check if 0 <= v < 32 + // Only then the result from (x << v) will be i32 + if (v.sge(0) && v.slt(32)) + return true; + return false; +}]>; + +def Int4Const : PatLeaf<(imm), [{ + const APInt &v = N->getAPIntValue(); + // Check if 0 <= v < 16 + // Only then the result from (x << v) will be i16 + if (v.sge(0) && v.slt(16)) + return true; + return false; +}]>; + +def SHL2MUL32 : SDNodeXForm<imm, [{ + const APInt &v = N->getAPIntValue(); + APInt temp(32, 1); + return CurDAG->getTargetConstant(temp.shl(v), MVT::i32); +}]>; + +def SHL2MUL16 : SDNodeXForm<imm, [{ + const APInt &v = N->getAPIntValue(); + APInt temp(16, 1); + return CurDAG->getTargetConstant(temp.shl(v), MVT::i16); +}]>; + +def MULWIDES64 : NVPTXInst<(outs Int64Regs:$dst), + (ins Int32Regs:$a, Int32Regs:$b), + "mul.wide.s32 \t$dst, $a, $b;", []>; +def MULWIDES64Imm : NVPTXInst<(outs Int64Regs:$dst), + (ins Int32Regs:$a, i64imm:$b), + "mul.wide.s32 \t$dst, $a, $b;", []>; + +def MULWIDEU64 : NVPTXInst<(outs Int64Regs:$dst), + (ins Int32Regs:$a, Int32Regs:$b), + "mul.wide.u32 \t$dst, $a, $b;", []>; +def MULWIDEU64Imm : NVPTXInst<(outs Int64Regs:$dst), + (ins Int32Regs:$a, i64imm:$b), + "mul.wide.u32 \t$dst, $a, $b;", []>; + +def MULWIDES32 : NVPTXInst<(outs Int32Regs:$dst), + (ins Int16Regs:$a, Int16Regs:$b), + "mul.wide.s16 \t$dst, $a, $b;", []>; +def MULWIDES32Imm : NVPTXInst<(outs Int32Regs:$dst), + (ins Int16Regs:$a, i32imm:$b), + "mul.wide.s16 \t$dst, $a, $b;", []>; + +def MULWIDEU32 : NVPTXInst<(outs Int32Regs:$dst), + (ins Int16Regs:$a, Int16Regs:$b), + "mul.wide.u16 \t$dst, $a, $b;", []>; +def MULWIDEU32Imm : NVPTXInst<(outs Int32Regs:$dst), + (ins Int16Regs:$a, i32imm:$b), + "mul.wide.u16 \t$dst, $a, $b;", []>; + +def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)), + (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>, + Requires<[doMulWide]>; +def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)), + (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>, + Requires<[doMulWide]>; + +def : Pat<(shl (sext 
Int16Regs:$a), (i16 Int4Const:$b)), + (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>, + Requires<[doMulWide]>; +def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)), + (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>, + Requires<[doMulWide]>; + +def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)), + (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)), + (MULWIDES64Imm Int32Regs:$a, (i64 SInt32Const:$b))>, + Requires<[doMulWide]>; + +def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)), + (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, Requires<[doMulWide]>; +def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)), + (MULWIDEU64Imm Int32Regs:$a, (i64 UInt32Const:$b))>, + Requires<[doMulWide]>; + +def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)), + (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>; +def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)), + (MULWIDES32Imm Int16Regs:$a, (i32 SInt16Const:$b))>, + Requires<[doMulWide]>; + +def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)), + (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>; +def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)), + (MULWIDEU32Imm Int16Regs:$a, (i32 UInt16Const:$b))>, + Requires<[doMulWide]>; + +defm MULT : I3<"mul.lo.s", mul>; + +defm MULTHS : I3_noi8<"mul.hi.s", mulhs>; +defm MULTHU : I3_noi8<"mul.hi.u", mulhu>; +def MULTHSi8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), + !strconcat("{{ \n\t", + !strconcat(".reg \t.s16 temp1; \n\t", + !strconcat(".reg \t.s16 temp2; \n\t", + !strconcat("cvt.s16.s8 \ttemp1, $a; \n\t", + !strconcat("cvt.s16.s8 \ttemp2, $b; \n\t", + !strconcat("mul.lo.s16 \t$dst, temp1, temp2; \n\t", + !strconcat("shr.s16 \t$dst, $dst, 8; \n\t", + !strconcat("}}", "")))))))), + [(set Int8Regs:$dst, (mulhs Int8Regs:$a, Int8Regs:$b))]>; +def MULTHSi8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b), + !strconcat("{{ \n\t", + !strconcat(".reg \t.s16 temp1; \n\t", + !strconcat(".reg \t.s16 temp2; \n\t", + !strconcat("cvt.s16.s8 \ttemp1, $a; \n\t", + !strconcat("mov.b16 \ttemp2, $b; \n\t", + !strconcat("cvt.s16.s8 \ttemp2, temp2; \n\t", + !strconcat("mul.lo.s16 \t$dst, temp1, temp2; \n\t", + !strconcat("shr.s16 \t$dst, $dst, 8; \n\t", + !strconcat("}}", ""))))))))), + [(set Int8Regs:$dst, (mulhs Int8Regs:$a, imm:$b))]>; +def MULTHUi8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), + !strconcat("{{ \n\t", + !strconcat(".reg \t.u16 temp1; \n\t", + !strconcat(".reg \t.u16 temp2; \n\t", + !strconcat("cvt.u16.u8 \ttemp1, $a; \n\t", + !strconcat("cvt.u16.u8 \ttemp2, $b; \n\t", + !strconcat("mul.lo.u16 \t$dst, temp1, temp2; \n\t", + !strconcat("shr.u16 \t$dst, $dst, 8; \n\t", + !strconcat("}}", "")))))))), + [(set Int8Regs:$dst, (mulhu Int8Regs:$a, Int8Regs:$b))]>; +def MULTHUi8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b), + !strconcat("{{ \n\t", + !strconcat(".reg \t.u16 temp1; \n\t", + !strconcat(".reg \t.u16 temp2; \n\t", + !strconcat("cvt.u16.u8 \ttemp1, $a; \n\t", + !strconcat("mov.b16 \ttemp2, $b; \n\t", + !strconcat("cvt.u16.u8 \ttemp2, temp2; \n\t", + !strconcat("mul.lo.u16 \t$dst, temp1, temp2; \n\t", + !strconcat("shr.u16 \t$dst, $dst, 8; \n\t", + !strconcat("}}", ""))))))))), + [(set Int8Regs:$dst, (mulhu Int8Regs:$a, imm:$b))]>; + + +defm SDIV : I3_i8<"div.s", sdiv, "s16", "cvt.s16.s8">; +defm UDIV : I3_i8<"div.u", udiv, "u16", "cvt.u16.u8">; + +defm SREM : I3_i8<"rem.s", srem, "s16", 
"cvt.s16.s8">; +// The ri version will not be selected as DAGCombiner::visitSREM will lower it. +defm UREM : I3_i8<"rem.u", urem, "u16", "cvt.u16.u8">; +// The ri version will not be selected as DAGCombiner::visitUREM will lower it. + +def MAD8rrr : NVPTXInst<(outs Int8Regs:$dst), + (ins Int8Regs:$a, Int8Regs:$b, Int8Regs:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int8Regs:$dst, (add (mul Int8Regs:$a, Int8Regs:$b), + Int8Regs:$c))]>; +def MAD8rri : NVPTXInst<(outs Int8Regs:$dst), + (ins Int8Regs:$a, Int8Regs:$b, i8imm:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int8Regs:$dst, (add (mul Int8Regs:$a, Int8Regs:$b), + imm:$c))]>; +def MAD8rir : NVPTXInst<(outs Int8Regs:$dst), + (ins Int8Regs:$a, i8imm:$b, Int8Regs:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int8Regs:$dst, (add (mul Int8Regs:$a, imm:$b), + Int8Regs:$c))]>; +def MAD8rii : NVPTXInst<(outs Int8Regs:$dst), + (ins Int8Regs:$a, i8imm:$b, i8imm:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int8Regs:$dst, (add (mul Int8Regs:$a, imm:$b), + imm:$c))]>; + +def MAD16rrr : NVPTXInst<(outs Int16Regs:$dst), + (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int16Regs:$dst, (add + (mul Int16Regs:$a, Int16Regs:$b), Int16Regs:$c))]>; +def MAD16rri : NVPTXInst<(outs Int16Regs:$dst), + (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int16Regs:$dst, (add + (mul Int16Regs:$a, Int16Regs:$b), imm:$c))]>; +def MAD16rir : NVPTXInst<(outs Int16Regs:$dst), + (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int16Regs:$dst, (add + (mul Int16Regs:$a, imm:$b), Int16Regs:$c))]>; +def MAD16rii : NVPTXInst<(outs Int16Regs:$dst), + (ins Int16Regs:$a, i16imm:$b, i16imm:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int16Regs:$dst, (add (mul Int16Regs:$a, imm:$b), + imm:$c))]>; + +def MAD32rrr : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c), + "mad.lo.s32 \t$dst, $a, $b, $c;", + [(set Int32Regs:$dst, (add + (mul Int32Regs:$a, Int32Regs:$b), Int32Regs:$c))]>; +def MAD32rri : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c), + "mad.lo.s32 \t$dst, $a, $b, $c;", + [(set Int32Regs:$dst, (add + (mul Int32Regs:$a, Int32Regs:$b), imm:$c))]>; +def MAD32rir : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c), + "mad.lo.s32 \t$dst, $a, $b, $c;", + [(set Int32Regs:$dst, (add + (mul Int32Regs:$a, imm:$b), Int32Regs:$c))]>; +def MAD32rii : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, i32imm:$b, i32imm:$c), + "mad.lo.s32 \t$dst, $a, $b, $c;", + [(set Int32Regs:$dst, (add + (mul Int32Regs:$a, imm:$b), imm:$c))]>; + +def MAD64rrr : NVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c), + "mad.lo.s64 \t$dst, $a, $b, $c;", + [(set Int64Regs:$dst, (add + (mul Int64Regs:$a, Int64Regs:$b), Int64Regs:$c))]>; +def MAD64rri : NVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c), + "mad.lo.s64 \t$dst, $a, $b, $c;", + [(set Int64Regs:$dst, (add + (mul Int64Regs:$a, Int64Regs:$b), imm:$c))]>; +def MAD64rir : NVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c), + "mad.lo.s64 \t$dst, $a, $b, $c;", + [(set Int64Regs:$dst, (add + (mul Int64Regs:$a, imm:$b), Int64Regs:$c))]>; +def MAD64rii : NVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$a, i64imm:$b, i64imm:$c), + "mad.lo.s64 \t$dst, $a, $b, $c;", + [(set Int64Regs:$dst, (add + (mul Int64Regs:$a, imm:$b), imm:$c))]>; + 
+
+def INEG8 : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$src),
+                      !strconcat("cvt.s16.s8 \t$dst, $src;\n\t",
+                                 "neg.s16 \t$dst, $dst;"),
+                      [(set Int8Regs:$dst, (ineg Int8Regs:$src))]>;
+def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+                       "neg.s16 \t$dst, $src;",
+                       [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
+def INEG32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+                       "neg.s32 \t$dst, $src;",
+                       [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
+def INEG64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                       "neg.s64 \t$dst, $src;",
+                       [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
+
+//-----------------------------------
+// Floating Point Arithmetic
+//-----------------------------------
+
+// Constant 1.0f
+def FloatConst1 : PatLeaf<(fpimm), [{
+  if (&(N->getValueAPF().getSemantics()) != &llvm::APFloat::IEEEsingle)
+    return false;
+  float f = (float)N->getValueAPF().convertToFloat();
+  return (f==1.0f);
+}]>;
+// Constant (double)1.0
+def DoubleConst1 : PatLeaf<(fpimm), [{
+  if (&(N->getValueAPF().getSemantics()) != &llvm::APFloat::IEEEdouble)
+    return false;
+  double d = (double)N->getValueAPF().convertToDouble();
+  return (d==1.0);
+}]>;
+
+defm FADD : F3<"add", fadd>;
+defm FSUB : F3<"sub", fsub>;
+defm FMUL : F3<"mul", fmul>;
+
+defm FADD_rn : F3_rn<"add", fadd>;
+defm FSUB_rn : F3_rn<"sub", fsub>;
+defm FMUL_rn : F3_rn<"mul", fmul>;
+
+defm FABS : F2<"abs", fabs>;
+defm FNEG : F2<"neg", fneg>;
+defm FSQRT : F2<"sqrt.rn", fsqrt>;
+
+//
+// F64 division
+//
+def FDIV641r : NVPTXInst<(outs Float64Regs:$dst),
+                         (ins f64imm:$a, Float64Regs:$b),
+                         "rcp.rn.f64 \t$dst, $b;",
+                         [(set Float64Regs:$dst,
+                           (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
+def FDIV64rr : NVPTXInst<(outs Float64Regs:$dst),
+                         (ins Float64Regs:$a, Float64Regs:$b),
+                         "div.rn.f64 \t$dst, $a, $b;",
+                         [(set Float64Regs:$dst,
+                           (fdiv Float64Regs:$a, Float64Regs:$b))]>;
+def FDIV64ri : NVPTXInst<(outs Float64Regs:$dst),
+                         (ins Float64Regs:$a, f64imm:$b),
+                         "div.rn.f64 \t$dst, $a, $b;",
+                         [(set Float64Regs:$dst,
+                           (fdiv Float64Regs:$a, fpimm:$b))]>;
+
+//
+// F32 Approximate reciprocal
+//
+def FDIV321r_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                             (ins f32imm:$a, Float32Regs:$b),
+                             "rcp.approx.ftz.f32 \t$dst, $b;",
+                             [(set Float32Regs:$dst,
+                               (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+                   Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV321r : NVPTXInst<(outs Float32Regs:$dst),
+                         (ins f32imm:$a, Float32Regs:$b),
+                         "rcp.approx.f32 \t$dst, $b;",
+                         [(set Float32Regs:$dst,
+                           (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+               Requires<[do_DIVF32_APPROX]>;
+//
+// F32 Approximate division
+//
+def FDIV32approxrr_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                                   (ins Float32Regs:$a, Float32Regs:$b),
+                                   "div.approx.ftz.f32 \t$dst, $a, $b;",
+                                   [(set Float32Regs:$dst,
+                                     (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+                         Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV32approxrr : NVPTXInst<(outs Float32Regs:$dst),
+                               (ins Float32Regs:$a, Float32Regs:$b),
+                               "div.approx.f32 \t$dst, $a, $b;",
+                               [(set Float32Regs:$dst,
+                                 (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+                     Requires<[do_DIVF32_APPROX]>;
+//
+// F32 Semi-accurate reciprocal
+//
+// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
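+// (Note, added for exposition: matching the 1.0 constant operand with
+// FloatConst1 / DoubleConst1 is what lets "1.0 / b" select the rcp.* forms
+// above and below instead of a full div.*.)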
+//
+def FDIV321r_approx_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                                    (ins f32imm:$a, Float32Regs:$b),
+                                    "rcp.approx.ftz.f32 \t$dst, $b;",
+                                    [(set Float32Regs:$dst,
+                                      (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+                          Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV321r_approx : NVPTXInst<(outs Float32Regs:$dst),
+                                (ins f32imm:$a, Float32Regs:$b),
+                                "rcp.approx.f32 \t$dst, $b;",
+                                [(set Float32Regs:$dst,
+                                  (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+                      Requires<[do_DIVF32_FULL]>;
+//
+// F32 Semi-accurate division
+//
+def FDIV32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                             (ins Float32Regs:$a, Float32Regs:$b),
+                             "div.full.ftz.f32 \t$dst, $a, $b;",
+                             [(set Float32Regs:$dst,
+                               (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+                   Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                             (ins Float32Regs:$a, f32imm:$b),
+                             "div.full.ftz.f32 \t$dst, $a, $b;",
+                             [(set Float32Regs:$dst,
+                               (fdiv Float32Regs:$a, fpimm:$b))]>,
+                   Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV32rr : NVPTXInst<(outs Float32Regs:$dst),
+                         (ins Float32Regs:$a, Float32Regs:$b),
+                         "div.full.f32 \t$dst, $a, $b;",
+                         [(set Float32Regs:$dst,
+                           (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+               Requires<[do_DIVF32_FULL]>;
+def FDIV32ri : NVPTXInst<(outs Float32Regs:$dst),
+                         (ins Float32Regs:$a, f32imm:$b),
+                         "div.full.f32 \t$dst, $a, $b;",
+                         [(set Float32Regs:$dst,
+                           (fdiv Float32Regs:$a, fpimm:$b))]>,
+               Requires<[do_DIVF32_FULL]>;
+//
+// F32 Accurate reciprocal
+//
+def FDIV321r_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                                  (ins f32imm:$a, Float32Regs:$b),
+                                  "rcp.rn.ftz.f32 \t$dst, $b;",
+                                  [(set Float32Regs:$dst,
+                                    (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+                        Requires<[reqPTX20, doF32FTZ]>;
+def FDIV321r_prec : NVPTXInst<(outs Float32Regs:$dst),
+                              (ins f32imm:$a, Float32Regs:$b),
+                              "rcp.rn.f32 \t$dst, $b;",
+                              [(set Float32Regs:$dst,
+                                (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+                    Requires<[reqPTX20]>;
+//
+// F32 Accurate division
+//
+def FDIV32rr_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                                  (ins Float32Regs:$a, Float32Regs:$b),
+                                  "div.rn.ftz.f32 \t$dst, $a, $b;",
+                                  [(set Float32Regs:$dst,
+                                    (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+                        Requires<[doF32FTZ, reqPTX20]>;
+def FDIV32ri_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                                  (ins Float32Regs:$a, f32imm:$b),
+                                  "div.rn.ftz.f32 \t$dst, $a, $b;",
+                                  [(set Float32Regs:$dst,
+                                    (fdiv Float32Regs:$a, fpimm:$b))]>,
+                        Requires<[doF32FTZ, reqPTX20]>;
+def FDIV32rr_prec : NVPTXInst<(outs Float32Regs:$dst),
+                              (ins Float32Regs:$a, Float32Regs:$b),
+                              "div.rn.f32 \t$dst, $a, $b;",
+                              [(set Float32Regs:$dst,
+                                (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+                    Requires<[reqPTX20]>;
+def FDIV32ri_prec : NVPTXInst<(outs Float32Regs:$dst),
+                              (ins Float32Regs:$a, f32imm:$b),
+                              "div.rn.f32 \t$dst, $a, $b;",
+                              [(set Float32Regs:$dst,
+                                (fdiv Float32Regs:$a, fpimm:$b))]>,
+                    Requires<[reqPTX20]>;
+
+
+multiclass FPCONTRACT32<string OpcStr, Predicate Pred> {
+  def rrr : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float32Regs:$dst, (fadd
+                        (fmul Float32Regs:$a, Float32Regs:$b),
+                        Float32Regs:$c))]>, Requires<[Pred]>;
+  // This is to work around (WAR) a weird bug in Tablegen that does not
+  // automatically generate the following permuted rule rrr2 from the above
+  // rrr. So we explicitly add it here. This happens to FMA32 only.
+  // See the comments at FMAD32 and FMA32 for more information.
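+  // That is, rrr above matches (fadd (fmul a, b), c), while rrr2 below
+  // hand-writes the commuted (fadd c, (fmul a, b)) form.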
+  def rrr2 : NVPTXInst<(outs Float32Regs:$dst),
+                       (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
+                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                       [(set Float32Regs:$dst, (fadd Float32Regs:$c,
+                         (fmul Float32Regs:$a, Float32Regs:$b)))]>,
+             Requires<[Pred]>;
+  def rri : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b, f32imm:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float32Regs:$dst, (fadd
+                        (fmul Float32Regs:$a, Float32Regs:$b), fpimm:$c))]>,
+            Requires<[Pred]>;
+  def rir : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b, Float32Regs:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float32Regs:$dst, (fadd
+                        (fmul Float32Regs:$a, fpimm:$b), Float32Regs:$c))]>,
+            Requires<[Pred]>;
+  def rii : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b, f32imm:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float32Regs:$dst, (fadd
+                        (fmul Float32Regs:$a, fpimm:$b), fpimm:$c))]>,
+            Requires<[Pred]>;
+}
+
+multiclass FPCONTRACT64<string OpcStr, Predicate Pred> {
+  def rrr : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, Float64Regs:$b, Float64Regs:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float64Regs:$dst, (fadd
+                        (fmul Float64Regs:$a, Float64Regs:$b),
+                        Float64Regs:$c))]>, Requires<[Pred]>;
+  def rri : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, Float64Regs:$b, f64imm:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float64Regs:$dst, (fadd (fmul Float64Regs:$a,
+                        Float64Regs:$b), fpimm:$c))]>, Requires<[Pred]>;
+  def rir : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, f64imm:$b, Float64Regs:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float64Regs:$dst, (fadd
+                        (fmul Float64Regs:$a, fpimm:$b), Float64Regs:$c))]>,
+            Requires<[Pred]>;
+  def rii : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, f64imm:$b, f64imm:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float64Regs:$dst, (fadd
+                        (fmul Float64Regs:$a, fpimm:$b), fpimm:$c))]>,
+            Requires<[Pred]>;
+}
+
+// Due to an unknown reason (most likely a bug in tablegen), tablegen does not
+// automatically generate the rrr2 rule from
+// the rrr rule (see FPCONTRACT32) for FMA32, though it does for FMAD32.
+// If we reverse the order of the following two lines, then the rrr2 rule will
+// be generated for FMA32, but the rrr rule will not.
+// Therefore, we manually write the rrr2 rule in FPCONTRACT32.
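+// For example (exposition only): FMA32rrr, instantiated from FPCONTRACT32
+// with OpcStr = "fma.rn.f32" below, emits "fma.rn.f32 \t$dst, $a, $b, $c;"
+// for the (fadd (fmul a, b), c) pattern, guarded by the doFMAF32 predicate.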
+defm FMAD32_ftz : FPCONTRACT32<"mad.ftz.f32", doFMADF32_ftz>; +defm FMAD32 : FPCONTRACT32<"mad.f32", doFMADF32>; +defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doFMAF32_ftz>; +defm FMA32 : FPCONTRACT32<"fma.rn.f32", doFMAF32>; +defm FMA64 : FPCONTRACT64<"fma.rn.f64", doFMAF64>; + +// b*c-a => fmad(b, c, -a) +multiclass FPCONTRACT32_SUB_PAT_MAD<NVPTXInst Inst, Predicate Pred> { + def : Pat<(fsub (fmul Float32Regs:$b, Float32Regs:$c), Float32Regs:$a), + (Inst Float32Regs:$b, Float32Regs:$c, (FNEGf32 Float32Regs:$a))>, + Requires<[Pred]>; +} + +// a-b*c => fmad(-b,c, a) +// - legal because a-b*c <=> a+(-b*c) <=> a+(-b)*c +// b*c-a => fmad(b, c, -a) +// - legal because b*c-a <=> b*c+(-a) +multiclass FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> { + def : Pat<(fsub Float32Regs:$a, (fmul Float32Regs:$b, Float32Regs:$c)), + (Inst (FNEGf32 Float32Regs:$b), Float32Regs:$c, Float32Regs:$a)>, + Requires<[Pred]>; + def : Pat<(fsub (fmul Float32Regs:$b, Float32Regs:$c), Float32Regs:$a), + (Inst Float32Regs:$b, Float32Regs:$c, (FNEGf32 Float32Regs:$a))>, + Requires<[Pred]>; +} + +// a-b*c => fmad(-b,c, a) +// b*c-a => fmad(b, c, -a) +multiclass FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> { + def : Pat<(fsub Float64Regs:$a, (fmul Float64Regs:$b, Float64Regs:$c)), + (Inst (FNEGf64 Float64Regs:$b), Float64Regs:$c, Float64Regs:$a)>, + Requires<[Pred]>; + + def : Pat<(fsub (fmul Float64Regs:$b, Float64Regs:$c), Float64Regs:$a), + (Inst Float64Regs:$b, Float64Regs:$c, (FNEGf64 Float64Regs:$a))>, + Requires<[Pred]>; +} + +defm FMAF32ext_ftz : FPCONTRACT32_SUB_PAT<FMA32_ftzrrr, doFMAF32AGG_ftz>; +defm FMAF32ext : FPCONTRACT32_SUB_PAT<FMA32rrr, doFMAF32AGG>; +defm FMADF32ext_ftz : FPCONTRACT32_SUB_PAT_MAD<FMAD32_ftzrrr, doFMADF32_ftz>; +defm FMADF32ext : FPCONTRACT32_SUB_PAT_MAD<FMAD32rrr, doFMADF32>; +defm FMAF64ext : FPCONTRACT64_SUB_PAT<FMA64rrr, doFMAF64AGG>; + +def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), + "sin.approx.f32 \t$dst, $src;", + [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>; +def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), + "cos.approx.f32 \t$dst, $src;", + [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>; + +//----------------------------------- +// Logical Arithmetic +//----------------------------------- + +multiclass LOG_FORMAT<string OpcStr, SDNode OpNode> { + def b1rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b), + !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>; + def b1ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b), + !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>; + def b8rr: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), + !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), + [(set Int8Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>; + def b8ri: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b), + !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), + [(set Int8Regs:$dst, (OpNode Int8Regs:$a, imm:$b))]>; + def b16rr: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), + !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, + Int16Regs:$b))]>; + def b16ri: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), + !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>; + def b32rr: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + 
!strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, + Int32Regs:$b))]>; + def b32ri: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; + def b64rr: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), + !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, + Int64Regs:$b))]>; + def b64ri: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), + !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; +} + +defm OR : LOG_FORMAT<"or", or>; +defm AND : LOG_FORMAT<"and", and>; +defm XOR : LOG_FORMAT<"xor", xor>; + +def NOT1: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src), + "not.pred \t$dst, $src;", + [(set Int1Regs:$dst, (not Int1Regs:$src))]>; +def NOT8: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$src), + "not.b16 \t$dst, $src;", + [(set Int8Regs:$dst, (not Int8Regs:$src))]>; +def NOT16: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), + "not.b16 \t$dst, $src;", + [(set Int16Regs:$dst, (not Int16Regs:$src))]>; +def NOT32: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), + "not.b32 \t$dst, $src;", + [(set Int32Regs:$dst, (not Int32Regs:$src))]>; +def NOT64: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), + "not.b64 \t$dst, $src;", + [(set Int64Regs:$dst, (not Int64Regs:$src))]>; + +// For shifts, the second src operand must be 32-bit value +multiclass LSHIFT_FORMAT<string OpcStr, SDNode OpNode> { + def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, + Int32Regs:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, + Int32Regs:$b))]>; + def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, + (i32 imm:$b)))]>; + def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, + Int32Regs:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, + Int32Regs:$b))]>; + def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, + (i32 imm:$b)))]>; + def i32ii : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode (i32 imm:$a), + (i32 imm:$b)))]>; + def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, + Int32Regs:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, + Int32Regs:$b))]>; + def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, + (i32 imm:$b)))]>; + def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int8Regs:$dst, (OpNode Int8Regs:$a, + Int32Regs:$b))]>; + def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i32imm:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int8Regs:$dst, (OpNode Int8Regs:$a, + (i32 imm:$b)))]>; +} + +defm SHL : LSHIFT_FORMAT<"shl.b", shl>; + +// For shifts, the second src operand must be 32-bit value +// Need to add cvt for the 8-bits. 
+multiclass RSHIFT_FORMAT<string OpcStr, SDNode OpNode, string CVTStr> { + def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, + Int32Regs:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, + Int32Regs:$b))]>; + def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, + (i32 imm:$b)))]>; + def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, + Int32Regs:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, + Int32Regs:$b))]>; + def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, + (i32 imm:$b)))]>; + def i32ii : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode (i32 imm:$a), + (i32 imm:$b)))]>; + def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, + Int32Regs:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, + Int32Regs:$b))]>; + def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, + (i32 imm:$b)))]>; + def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int32Regs:$b), + !strconcat(CVTStr, !strconcat(" \t$dst, $a;\n\t", + !strconcat(OpcStr, "16 \t$dst, $dst, $b;"))), + [(set Int8Regs:$dst, (OpNode Int8Regs:$a, + Int32Regs:$b))]>; + def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i32imm:$b), + !strconcat(CVTStr, !strconcat(" \t$dst, $a;\n\t", + !strconcat(OpcStr, "16 \t$dst, $dst, $b;"))), + [(set Int8Regs:$dst, (OpNode Int8Regs:$a, + (i32 imm:$b)))]>; +} + +defm SRA : RSHIFT_FORMAT<"shr.s", sra, "cvt.s16.s8">; +defm SRL : RSHIFT_FORMAT<"shr.u", srl, "cvt.u16.u8">; + +// 32bit +def ROT32imm_sw : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2), + !strconcat("{{\n\t", + !strconcat(".reg .b32 %lhs;\n\t", + !strconcat(".reg .b32 %rhs;\n\t", + !strconcat("shl.b32 \t%lhs, $src, $amt1;\n\t", + !strconcat("shr.b32 \t%rhs, $src, $amt2;\n\t", + !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t", + !strconcat("}}", ""))))))), + []>; + +def SUB_FRM_32 : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(32-N->getZExtValue(), MVT::i32); +}]>; + +def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)), + (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>; +def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)), + (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>; + +def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, + Int32Regs:$amt), + !strconcat("{{\n\t", + !strconcat(".reg .b32 %lhs;\n\t", + !strconcat(".reg .b32 %rhs;\n\t", + !strconcat(".reg .b32 %amt2;\n\t", + !strconcat("shl.b32 \t%lhs, $src, $amt;\n\t", + !strconcat("sub.s32 \t%amt2, 32, $amt;\n\t", + !strconcat("shr.b32 \t%rhs, $src, %amt2;\n\t", + !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t", + !strconcat("}}", ""))))))))), + [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>; + +def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, + Int32Regs:$amt), + !strconcat("{{\n\t", + !strconcat(".reg .b32 %lhs;\n\t", + !strconcat(".reg .b32 %rhs;\n\t", + !strconcat(".reg .b32 %amt2;\n\t", + !strconcat("shr.b32 \t%lhs, $src, $amt;\n\t", + !strconcat("sub.s32 
\t%amt2, 32, $amt;\n\t", + !strconcat("shl.b32 \t%rhs, $src, %amt2;\n\t", + !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t", + !strconcat("}}", ""))))))))), + [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>; + +// 64bit +def ROT64imm_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, + i32imm:$amt1, i32imm:$amt2), + !strconcat("{{\n\t", + !strconcat(".reg .b64 %lhs;\n\t", + !strconcat(".reg .b64 %rhs;\n\t", + !strconcat("shl.b64 \t%lhs, $src, $amt1;\n\t", + !strconcat("shr.b64 \t%rhs, $src, $amt2;\n\t", + !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t", + !strconcat("}}", ""))))))), + []>; + +def SUB_FRM_64 : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(64-N->getZExtValue(), MVT::i32); +}]>; + +def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)), + (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>; +def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)), + (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>; + +def ROTL64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, + Int32Regs:$amt), + !strconcat("{{\n\t", + !strconcat(".reg .b64 %lhs;\n\t", + !strconcat(".reg .b64 %rhs;\n\t", + !strconcat(".reg .u32 %amt2;\n\t", + !strconcat("shl.b64 \t%lhs, $src, $amt;\n\t", + !strconcat("sub.u32 \t%amt2, 64, $amt;\n\t", + !strconcat("shr.b64 \t%rhs, $src, %amt2;\n\t", + !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t", + !strconcat("}}", ""))))))))), + [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>; + +def ROTR64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, + Int32Regs:$amt), + !strconcat("{{\n\t", + !strconcat(".reg .b64 %lhs;\n\t", + !strconcat(".reg .b64 %rhs;\n\t", + !strconcat(".reg .u32 %amt2;\n\t", + !strconcat("shr.b64 \t%lhs, $src, $amt;\n\t", + !strconcat("sub.u32 \t%amt2, 64, $amt;\n\t", + !strconcat("shl.b64 \t%rhs, $src, %amt2;\n\t", + !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t", + !strconcat("}}", ""))))))))), + [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>; + + +//----------------------------------- +// Data Movement (Load / Store, Move) +//----------------------------------- + +def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex], + [SDNPWantRoot]>; +def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex], + [SDNPWantRoot]>; + +def MEMri : Operand<i32> { + let PrintMethod = "printMemOperand"; + let MIOperandInfo = (ops Int32Regs, i32imm); +} +def MEMri64 : Operand<i64> { + let PrintMethod = "printMemOperand"; + let MIOperandInfo = (ops Int64Regs, i64imm); +} + +def imem : Operand<iPTR> { + let PrintMethod = "printOperand"; +} + +def imemAny : Operand<iPTRAny> { + let PrintMethod = "printOperand"; +} + +def LdStCode : Operand<i32> { + let PrintMethod = "printLdStCode"; +} + +def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; +def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>; + +def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a), + "mov.u32 \t$dst, $a;", + [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>; + +def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a), + "mov.u64 \t$dst, $a;", + [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>; + +// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp +let IsSimpleMove=1 in { +def IMOV1rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss), + "mov.pred \t$dst, $sss;", []>; +def IMOV8rr: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$sss), + "mov.u16 \t$dst, $sss;", []>; +def IMOV16rr: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss), + "mov.u16 \t$dst, $sss;", 
[]>;
+def IMOV32rr: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
+                        "mov.u32 \t$dst, $sss;", []>;
+def IMOV64rr: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
+                        "mov.u64 \t$dst, $sss;", []>;
+
+def FMOV32rr: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+                        "mov.f32 \t$dst, $src;", []>;
+def FMOV64rr: NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
+                        "mov.f64 \t$dst, $src;", []>;
+}
+def IMOV1ri: NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
+                       "mov.pred \t$dst, $src;",
+                       [(set Int1Regs:$dst, imm:$src)]>;
+def IMOV8ri: NVPTXInst<(outs Int8Regs:$dst), (ins i8imm:$src),
+                       "mov.u16 \t$dst, $src;",
+                       [(set Int8Regs:$dst, imm:$src)]>;
+def IMOV16ri: NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
+                        "mov.u16 \t$dst, $src;",
+                        [(set Int16Regs:$dst, imm:$src)]>;
+def IMOV32ri: NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
+                        "mov.u32 \t$dst, $src;",
+                        [(set Int32Regs:$dst, imm:$src)]>;
+def IMOV64i: NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
+                       "mov.u64 \t$dst, $src;",
+                       [(set Int64Regs:$dst, imm:$src)]>;
+
+def FMOV32ri: NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
+                        "mov.f32 \t$dst, $src;",
+                        [(set Float32Regs:$dst, fpimm:$src)]>;
+def FMOV64ri: NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
+                        "mov.f64 \t$dst, $src;",
+                        [(set Float64Regs:$dst, fpimm:$src)]>;
+
+def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
+
+//---- Copy Frame Index ----
+def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
+                          "add.u32 \t$dst, ${addr:add};",
+                          [(set Int32Regs:$dst, ADDRri:$addr)]>;
+def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
+                            "add.u64 \t$dst, ${addr:add};",
+                            [(set Int64Regs:$dst, ADDRri64:$addr)]>;
+
+//-----------------------------------
+// Comparison and Selection
+//-----------------------------------
+
+// Generate string block like
+// {
+//   .reg .pred p;
+//   setp.gt.s16 p, %a, %b;
+//   selp.s16 %dst, -1, 0, p;
+// }
+// when OpcStr=setp.gt.s sz1=16 sz2=16 d=%dst a=%a b=%b
+class Set_Str<string OpcStr, string sz1, string sz2, string d, string a,
+              string b> {
+  string t1 = "{{\n\t.reg .pred p;\n\t";
+  string t2 = !strconcat(t1, OpcStr);
+  string t3 = !strconcat(t2, sz1);
+  string t4 = !strconcat(t3, " \tp, ");
+  string t5 = !strconcat(t4, a);
+  string t6 = !strconcat(t5, ", ");
+  string t7 = !strconcat(t6, b);
+  string t8 = !strconcat(t7, ";\n\tselp.s");
+  string t9 = !strconcat(t8, sz2);
+  string t10 = !strconcat(t9, " \t");
+  string t11 = !strconcat(t10, d);
+  string s = !strconcat(t11, ", -1, 0, p;\n\t}}");
+}
+
+// Generate string block like
+// {
+//   .reg .pred p;
+//   .reg .s16 %temp1;
+//   .reg .s16 %temp2;
+//   cvt.s16.s8 %temp1, %a;
+//   cvt.s16.s8 %temp2, %b;
+//   setp.gt.s16 p, %temp1, %temp2;
+//   selp.s16 %dst, -1, 0, p;
+// }
+// when OpcStr=setp.gt.s d=%dst a=%a b=%b type=s16 cvt=cvt.s16.s8
+class Set_Stri8<string OpcStr, string d, string a, string b, string type,
+                string cvt> {
+  string t1 = "{{\n\t.reg .pred p;\n\t";
+  string t2 = !strconcat(t1, ".reg .");
+  string t3 = !strconcat(t2, type);
+  string t4 = !strconcat(t3, " %temp1;\n\t");
+  string t5 = !strconcat(t4, ".reg .");
+  string t6 = !strconcat(t5, type);
+  string t7 = !strconcat(t6, " %temp2;\n\t");
+  string t8 = !strconcat(t7, cvt);
+  string t9 = !strconcat(t8, " \t%temp1, ");
+  string t10 = !strconcat(t9, a);
+  string t11 = !strconcat(t10, ";\n\t");
+  string t12 = !strconcat(t11, cvt);
+  string t13 = !strconcat(t12, " \t%temp2, ");
+  string t14 = !strconcat(t13, b);
+  string t15 =
!strconcat(t14, ";\n\t"); + string t16 = !strconcat(t15, OpcStr); + string t17 = !strconcat(t16, "16"); + string t18 = !strconcat(t17, " \tp, %temp1, %temp2;\n\t"); + string t19 = !strconcat(t18, "selp.s16 \t"); + string t20 = !strconcat(t19, d); + string s = !strconcat(t20, ", -1, 0, p;\n\t}}"); +} + +multiclass ISET_FORMAT<string OpcStr, string OpcStr_u32, PatFrag OpNode, + string TypeStr, string CVTStr> { + def i8rr_toi8: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), + Set_Stri8<OpcStr, "$dst", "$a", "$b", TypeStr, CVTStr>.s, + []>; + def i16rr_toi16: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, + Int16Regs:$b), + Set_Str<OpcStr, "16", "16", "$dst", "$a", "$b">.s, + []>; + def i32rr_toi32: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, + Int32Regs:$b), + Set_Str<OpcStr, "32", "32", "$dst", "$a", "$b">.s, + []>; + def i64rr_toi64: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, + Int64Regs:$b), + Set_Str<OpcStr, "64", "64", "$dst", "$a", "$b">.s, + []>; + + def i8rr_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), + Handle_i8rr<OpcStr, TypeStr, CVTStr>.s, + [(set Int1Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>; + def i8ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int8Regs:$a, i8imm:$b), + Handle_i8ri<OpcStr, TypeStr, CVTStr>.s, + [(set Int1Regs:$dst, (OpNode Int8Regs:$a, imm:$b))]>; + def i8ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins i8imm:$a, Int8Regs:$b), + Handle_i8ir<OpcStr, TypeStr, CVTStr>.s, + [(set Int1Regs:$dst, (OpNode imm:$a, Int8Regs:$b))]>; + def i16rr_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>; + def i16ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int16Regs:$a, i16imm:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>; + def i16ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins i16imm:$a, Int16Regs:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode imm:$a, Int16Regs:$b))]>; + def i32rr_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; + def i32ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; + def i32ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins i32imm:$a, Int32Regs:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode imm:$a, Int32Regs:$b))]>; + def i64rr_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>; + def i64ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int64Regs:$a, i64imm:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; + def i64ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins i64imm:$a, Int64Regs:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode imm:$a, Int64Regs:$b))]>; + + def i8rr_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), + Handle_i8rr<OpcStr_u32, TypeStr, CVTStr>.s, + [(set Int32Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>; + def i8ri_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int8Regs:$a, i8imm:$b), + Handle_i8ri<OpcStr_u32, TypeStr, CVTStr>.s, + [(set Int32Regs:$dst, (OpNode Int8Regs:$a, imm:$b))]>; + def 
i8ir_u32: NVPTXInst<(outs Int32Regs:$dst), (ins i8imm:$a, Int8Regs:$b), + Handle_i8ir<OpcStr_u32, TypeStr, CVTStr>.s, + [(set Int32Regs:$dst, (OpNode imm:$a, Int8Regs:$b))]>; + def i16rr_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, + Int16Regs:$b), + !strconcat(OpcStr_u32, "16 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>; + def i16ri_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b), + !strconcat(OpcStr_u32, "16 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>; + def i16ir_u32: NVPTXInst<(outs Int32Regs:$dst), (ins i16imm:$a, Int16Regs:$b), + !strconcat(OpcStr_u32, "16 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode imm:$a, Int16Regs:$b))]>; + def i32rr_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, + Int32Regs:$b), + !strconcat(OpcStr_u32, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; + def i32ri_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr_u32, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; + def i32ir_u32: NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, Int32Regs:$b), + !strconcat(OpcStr_u32, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode imm:$a, Int32Regs:$b))]>; + def i64rr_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$a, + Int64Regs:$b), + !strconcat(OpcStr_u32, "64 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>; + def i64ri_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$a, i64imm:$b), + !strconcat(OpcStr_u32, "64 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; + def i64ir_u32: NVPTXInst<(outs Int32Regs:$dst), (ins i64imm:$a, Int64Regs:$b), + !strconcat(OpcStr_u32, "64 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode imm:$a, Int64Regs:$b))]>; +} + +multiclass FSET_FORMAT<string OpcStr, string OpcStr_u32, PatFrag OpNode> { + def f32rr_toi32_ftz: NVPTXInst<(outs Int32Regs:$dst), (ins Float32Regs:$a, + Float32Regs:$b), + Set_Str<OpcStr, "ftz.f32", "32", "$dst", "$a", "$b">.s, + []>, Requires<[doF32FTZ]>; + def f32rr_toi32: NVPTXInst<(outs Int32Regs:$dst), (ins Float32Regs:$a, + Float32Regs:$b), + Set_Str<OpcStr, "f32", "32", "$dst", "$a", "$b">.s, + []>; + def f64rr_toi64: NVPTXInst<(outs Int64Regs:$dst), (ins Float64Regs:$a, + Float64Regs:$b), + Set_Str<OpcStr, "f64", "64", "$dst", "$a", "$b">.s, + []>; + def f64rr_toi32: NVPTXInst<(outs Int32Regs:$dst), (ins Float64Regs:$a, + Float64Regs:$b), + Set_Str<OpcStr, "f64", "32", "$dst", "$a", "$b">.s, + []>; + + def f32rr_p_ftz: NVPTXInst<(outs Int1Regs:$dst), (ins Float32Regs:$a + , Float32Regs:$b), + !strconcat(OpcStr, "ftz.f32 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]> + , Requires<[doF32FTZ]>; + def f32rr_p: NVPTXInst<(outs Int1Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr, "f32 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>; + def f32ri_p_ftz: NVPTXInst<(outs Int1Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, "ftz.f32 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, + Requires<[doF32FTZ]>; + def f32ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, "f32 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>; + def f32ir_p_ftz: NVPTXInst<(outs Int1Regs:$dst), + (ins f32imm:$a, Float32Regs:$b), + 
!strconcat(OpcStr, "ftz.f32 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode fpimm:$a, Float32Regs:$b))]>, + Requires<[doF32FTZ]>; + def f32ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins f32imm:$a, Float32Regs:$b), + !strconcat(OpcStr, "f32 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode fpimm:$a, Float32Regs:$b))]>; + def f64rr_p: NVPTXInst<(outs Int1Regs:$dst), + (ins Float64Regs:$a, Float64Regs:$b), + !strconcat(OpcStr, "f64 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>; + def f64ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Float64Regs:$a, f64imm:$b), + !strconcat(OpcStr, "f64 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>; + def f64ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins f64imm:$a, Float64Regs:$b), + !strconcat(OpcStr, "f64 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode fpimm:$a, Float64Regs:$b))]>; + + def f32rr_u32_ftz: NVPTXInst<(outs Int32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr_u32, "ftz.f32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>; + def f32rr_u32: NVPTXInst<(outs Int32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr_u32, "f32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>; + def f32ri_u32_ftz: NVPTXInst<(outs Int32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr_u32, "ftz.f32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>; + def f32ri_u32: NVPTXInst<(outs Int32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr_u32, "f32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>; + def f32ir_u32_ftz: NVPTXInst<(outs Int32Regs:$dst), + (ins f32imm:$a, Float32Regs:$b), + !strconcat(OpcStr_u32, "ftz.f32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode fpimm:$a, Float32Regs:$b))]>; + def f32ir_u32: NVPTXInst<(outs Int32Regs:$dst), + (ins f32imm:$a, Float32Regs:$b), + !strconcat(OpcStr_u32, "f32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode fpimm:$a, Float32Regs:$b))]>; + def f64rr_u32: NVPTXInst<(outs Int32Regs:$dst), + (ins Float64Regs:$a, Float64Regs:$b), + !strconcat(OpcStr_u32, "f64 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>; + def f64ri_u32: NVPTXInst<(outs Int32Regs:$dst), + (ins Float64Regs:$a, f64imm:$b), + !strconcat(OpcStr_u32, "f64 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>; + def f64ir_u32: NVPTXInst<(outs Int32Regs:$dst), + (ins f64imm:$a, Float64Regs:$b), + !strconcat(OpcStr_u32, "f64 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode fpimm:$a, Float64Regs:$b))]>; +} + +defm ISetSGT +: ISET_FORMAT<"setp.gt.s", "set.gt.u32.s", setgt, "s16", "cvt.s16.s8">; +defm ISetUGT +: ISET_FORMAT<"setp.gt.u", "set.gt.u32.u", setugt, "u16", "cvt.u16.u8">; +defm ISetSLT +: ISET_FORMAT<"setp.lt.s", "set.lt.u32.s", setlt, "s16", "cvt.s16.s8">; +defm ISetULT +: ISET_FORMAT<"setp.lt.u", "set.lt.u32.u", setult, "u16", "cvt.u16.u8">; +defm ISetSGE +: ISET_FORMAT<"setp.ge.s", "set.ge.u32.s", setge, "s16", "cvt.s16.s8">; +defm ISetUGE +: ISET_FORMAT<"setp.ge.u", "set.ge.u32.u", setuge, "u16", "cvt.u16.u8">; +defm ISetSLE +: ISET_FORMAT<"setp.le.s", "set.le.u32.s", setle, "s16", "cvt.s16.s8">; +defm ISetULE +: ISET_FORMAT<"setp.le.u", "set.le.u32.u", setule, "u16", "cvt.u16.u8">; +defm ISetSEQ +: ISET_FORMAT<"setp.eq.s", "set.eq.u32.s", seteq, "s16", "cvt.s16.s8">; +defm ISetUEQ +: ISET_FORMAT<"setp.eq.u", 
"set.eq.u32.u", setueq, "u16", "cvt.u16.u8">; +defm ISetSNE +: ISET_FORMAT<"setp.ne.s", "set.ne.u32.s", setne, "s16", "cvt.s16.s8">; +defm ISetUNE +: ISET_FORMAT<"setp.ne.u", "set.ne.u32.u", setune, "u16", "cvt.u16.u8">; + +def ISetSNEi1rr_p : NVPTXInst<(outs Int1Regs:$dst), + (ins Int1Regs:$a, Int1Regs:$b), + "xor.pred \t$dst, $a, $b;", + [(set Int1Regs:$dst, (setne Int1Regs:$a, Int1Regs:$b))]>; +def ISetUNEi1rr_p : NVPTXInst<(outs Int1Regs:$dst), + (ins Int1Regs:$a, Int1Regs:$b), + "xor.pred \t$dst, $a, $b;", + [(set Int1Regs:$dst, (setune Int1Regs:$a, Int1Regs:$b))]>; +def ISetSEQi1rr_p : NVPTXInst<(outs Int1Regs:$dst), + (ins Int1Regs:$a, Int1Regs:$b), + !strconcat("{{\n\t", + !strconcat(".reg .pred temp;\n\t", + !strconcat("xor.pred \ttemp, $a, $b;\n\t", + !strconcat("not.pred \t$dst, temp;\n\t}}","")))), + [(set Int1Regs:$dst, (seteq Int1Regs:$a, Int1Regs:$b))]>; +def ISetUEQi1rr_p : NVPTXInst<(outs Int1Regs:$dst), + (ins Int1Regs:$a, Int1Regs:$b), + !strconcat("{{\n\t", + !strconcat(".reg .pred temp;\n\t", + !strconcat("xor.pred \ttemp, $a, $b;\n\t", + !strconcat("not.pred \t$dst, temp;\n\t}}","")))), + [(set Int1Regs:$dst, (setueq Int1Regs:$a, Int1Regs:$b))]>; + +// Compare 2 i1's and produce a u32 +def ISETSNEi1rr_u32 : NVPTXInst<(outs Int32Regs:$dst), + (ins Int1Regs:$a, Int1Regs:$b), + !strconcat("{{\n\t", + !strconcat(".reg .pred temp;\n\t", + !strconcat("xor.pred \ttemp, $a, $b;\n\t", + !strconcat("selp.u32 \t$dst, -1, 0, temp;", "\n\t}}")))), + [(set Int32Regs:$dst, (setne Int1Regs:$a, Int1Regs:$b))]>; +def ISETSEQi1rr_u32 : NVPTXInst<(outs Int32Regs:$dst), + (ins Int1Regs:$a, Int1Regs:$b), + !strconcat("{{\n\t", + !strconcat(".reg .pred temp;\n\t", + !strconcat("xor.pred \ttemp, $a, $b;\n\t", + !strconcat("selp.u32 \t$dst, 0, -1, temp;", "\n\t}}")))), + [(set Int32Regs:$dst, (seteq Int1Regs:$a, Int1Regs:$b))]>; + +defm FSetGT : FSET_FORMAT<"setp.gt.", "set.gt.u32.", setogt>; +defm FSetLT : FSET_FORMAT<"setp.lt.", "set.lt.u32.", setolt>; +defm FSetGE : FSET_FORMAT<"setp.ge.", "set.ge.u32.", setoge>; +defm FSetLE : FSET_FORMAT<"setp.le.", "set.le.u32.", setole>; +defm FSetEQ : FSET_FORMAT<"setp.eq.", "set.eq.u32.", setoeq>; +defm FSetNE : FSET_FORMAT<"setp.ne.", "set.ne.u32.", setone>; + +defm FSetUGT : FSET_FORMAT<"setp.gtu.", "set.gtu.u32.", setugt>; +defm FSetULT : FSET_FORMAT<"setp.ltu.", "set.ltu.u32.",setult>; +defm FSetUGE : FSET_FORMAT<"setp.geu.", "set.geu.u32.",setuge>; +defm FSetULE : FSET_FORMAT<"setp.leu.", "set.leu.u32.",setule>; +defm FSetUEQ : FSET_FORMAT<"setp.equ.", "set.equ.u32.",setueq>; +defm FSetUNE : FSET_FORMAT<"setp.neu.", "set.neu.u32.",setune>; + +defm FSetNUM : FSET_FORMAT<"setp.num.", "set.num.u32.",seto>; +defm FSetNAN : FSET_FORMAT<"setp.nan.", "set.nan.u32.",setuo>; + +def SELECTi1rr : Pat<(i1 (select Int1Regs:$p, Int1Regs:$a, Int1Regs:$b)), + (ORb1rr (ANDb1rr Int1Regs:$p, Int1Regs:$a), + (ANDb1rr (NOT1 Int1Regs:$p), Int1Regs:$b))>; +def SELECTi8rr : NVPTXInst<(outs Int8Regs:$dst), + (ins Int8Regs:$a, Int8Regs:$b, Int1Regs:$p), + "selp.b16 \t$dst, $a, $b, $p;", + [(set Int8Regs:$dst, (select Int1Regs:$p, Int8Regs:$a, Int8Regs:$b))]>; +def SELECTi8ri : NVPTXInst<(outs Int8Regs:$dst), + (ins Int8Regs:$a, i8imm:$b, Int1Regs:$p), + "selp.b16 \t$dst, $a, $b, $p;", + [(set Int8Regs:$dst, (select Int1Regs:$p, Int8Regs:$a, imm:$b))]>; +def SELECTi8ir : NVPTXInst<(outs Int8Regs:$dst), + (ins i8imm:$a, Int8Regs:$b, Int1Regs:$p), + "selp.b16 \t$dst, $a, $b, $p;", + [(set Int8Regs:$dst, (select Int1Regs:$p, imm:$a, Int8Regs:$b))]>; +def SELECTi8ii : 
NVPTXInst<(outs Int8Regs:$dst), + (ins i8imm:$a, i8imm:$b, Int1Regs:$p), + "selp.b16 \t$dst, $a, $b, $p;", + [(set Int8Regs:$dst, (select Int1Regs:$p, imm:$a, imm:$b))]>; + +def SELECTi16rr : NVPTXInst<(outs Int16Regs:$dst), + (ins Int16Regs:$a, Int16Regs:$b, Int1Regs:$p), + "selp.b16 \t$dst, $a, $b, $p;", + [(set Int16Regs:$dst, (select Int1Regs:$p, Int16Regs:$a, Int16Regs:$b))]>; +def SELECTi16ri : NVPTXInst<(outs Int16Regs:$dst), + (ins Int16Regs:$a, i16imm:$b, Int1Regs:$p), + "selp.b16 \t$dst, $a, $b, $p;", + [(set Int16Regs:$dst, (select Int1Regs:$p, Int16Regs:$a, imm:$b))]>; +def SELECTi16ir : NVPTXInst<(outs Int16Regs:$dst), + (ins i16imm:$a, Int16Regs:$b, Int1Regs:$p), + "selp.b16 \t$dst, $a, $b, $p;", + [(set Int16Regs:$dst, (select Int1Regs:$p, imm:$a, Int16Regs:$b))]>; +def SELECTi16ii : NVPTXInst<(outs Int16Regs:$dst), + (ins i16imm:$a, i16imm:$b, Int1Regs:$p), + "selp.b16 \t$dst, $a, $b, $p;", + [(set Int16Regs:$dst, (select Int1Regs:$p, imm:$a, imm:$b))]>; + +def SELECTi32rr : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, Int32Regs:$b, Int1Regs:$p), + "selp.b32 \t$dst, $a, $b, $p;", + [(set Int32Regs:$dst, (select Int1Regs:$p, Int32Regs:$a, Int32Regs:$b))]>; +def SELECTi32ri : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, i32imm:$b, Int1Regs:$p), + "selp.b32 \t$dst, $a, $b, $p;", + [(set Int32Regs:$dst, (select Int1Regs:$p, Int32Regs:$a, imm:$b))]>; +def SELECTi32ir : NVPTXInst<(outs Int32Regs:$dst), + (ins i32imm:$a, Int32Regs:$b, Int1Regs:$p), + "selp.b32 \t$dst, $a, $b, $p;", + [(set Int32Regs:$dst, (select Int1Regs:$p, imm:$a, Int32Regs:$b))]>; +def SELECTi32ii : NVPTXInst<(outs Int32Regs:$dst), + (ins i32imm:$a, i32imm:$b, Int1Regs:$p), + "selp.b32 \t$dst, $a, $b, $p;", + [(set Int32Regs:$dst, (select Int1Regs:$p, imm:$a, imm:$b))]>; + +def SELECTi64rr : NVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$a, Int64Regs:$b, Int1Regs:$p), + "selp.b64 \t$dst, $a, $b, $p;", + [(set Int64Regs:$dst, (select Int1Regs:$p, Int64Regs:$a, Int64Regs:$b))]>; +def SELECTi64ri : NVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$a, i64imm:$b, Int1Regs:$p), + "selp.b64 \t$dst, $a, $b, $p;", + [(set Int64Regs:$dst, (select Int1Regs:$p, Int64Regs:$a, imm:$b))]>; +def SELECTi64ir : NVPTXInst<(outs Int64Regs:$dst), + (ins i64imm:$a, Int64Regs:$b, Int1Regs:$p), + "selp.b64 \t$dst, $a, $b, $p;", + [(set Int64Regs:$dst, (select Int1Regs:$p, imm:$a, Int64Regs:$b))]>; +def SELECTi64ii : NVPTXInst<(outs Int64Regs:$dst), + (ins i64imm:$a, i64imm:$b, Int1Regs:$p), + "selp.b64 \t$dst, $a, $b, $p;", + [(set Int64Regs:$dst, (select Int1Regs:$p, imm:$a, imm:$b))]>; + +def SELECTf32rr : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b, Int1Regs:$p), + "selp.f32 \t$dst, $a, $b, $p;", + [(set Float32Regs:$dst, + (select Int1Regs:$p, Float32Regs:$a, Float32Regs:$b))]>; +def SELECTf32ri : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b, Int1Regs:$p), + "selp.f32 \t$dst, $a, $b, $p;", + [(set Float32Regs:$dst, (select Int1Regs:$p, Float32Regs:$a, fpimm:$b))]>; +def SELECTf32ir : NVPTXInst<(outs Float32Regs:$dst), + (ins f32imm:$a, Float32Regs:$b, Int1Regs:$p), + "selp.f32 \t$dst, $a, $b, $p;", + [(set Float32Regs:$dst, (select Int1Regs:$p, fpimm:$a, Float32Regs:$b))]>; +def SELECTf32ii : NVPTXInst<(outs Float32Regs:$dst), + (ins f32imm:$a, f32imm:$b, Int1Regs:$p), + "selp.f32 \t$dst, $a, $b, $p;", + [(set Float32Regs:$dst, (select Int1Regs:$p, fpimm:$a, fpimm:$b))]>; + +def SELECTf64rr : NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, 
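// Illustration (hypothetical registers): these SELECT patterns lower a
// generic 'select' to a single predicated move, e.g. 'c = p ? a : b' on i32
// becomes
//   selp.b32 \t%r3, %r1, %r2, %p1;
// The i8 forms reuse selp.b16 because PTX models i8 values in 16-bit
// registers and provides no selp.b8.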
Float64Regs:$b, Int1Regs:$p), + "selp.f64 \t$dst, $a, $b, $p;", + [(set Float64Regs:$dst, + (select Int1Regs:$p, Float64Regs:$a, Float64Regs:$b))]>; +def SELECTf64ri : NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, f64imm:$b, Int1Regs:$p), + "selp.f64 \t$dst, $a, $b, $p;", + [(set Float64Regs:$dst, (select Int1Regs:$p, Float64Regs:$a, fpimm:$b))]>; +def SELECTf64ir : NVPTXInst<(outs Float64Regs:$dst), + (ins f64imm:$a, Float64Regs:$b, Int1Regs:$p), + "selp.f64 \t$dst, $a, $b, $p;", + [(set Float64Regs:$dst, (select Int1Regs:$p, fpimm:$a, Float64Regs:$b))]>; +def SELECTf64ii : NVPTXInst<(outs Float64Regs:$dst), + (ins f64imm:$a, f64imm:$b, Int1Regs:$p), + "selp.f64 \t $dst, $a, $b, $p;", + [(set Float64Regs:$dst, (select Int1Regs:$p, fpimm:$a, fpimm:$b))]>; + +//def ld_param : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad, +// [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +def SDTDeclareParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisInt<2>]>; +def SDTDeclareScalarParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, + SDTCisInt<1>, SDTCisInt<2>]>; +def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; +def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>; +def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>; +def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>; +def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>; +def SDTCallVoidProfile : SDTypeProfile<0, 1, []>; +def SDTCallValProfile : SDTypeProfile<1, 0, []>; +def SDTMoveParamProfile : SDTypeProfile<1, 1, []>; +def SDTMoveRetvalProfile : SDTypeProfile<0, 1, []>; +def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>; +def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>; + +def DeclareParam : SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def DeclareScalarParam : SDNode<"NVPTXISD::DeclareScalarParam", + SDTDeclareScalarParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def DeclareRetParam : SDNode<"NVPTXISD::DeclareRetParam", + SDTDeclareParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def DeclareRet : SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def LoadParam : SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile, + [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; +def PrintCall : SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def PrintCallUni : SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def StoreParam : SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def StoreParamU32 : SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def StoreParamS32 : SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def MoveToParam : SDNode<"NVPTXISD::MoveToParam", SDTStoreParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallArgBegin : SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 
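// Sketch (editorial; symbol, offsets and register names are hypothetical) of
// the PTX call sequence these nodes print once the printer instructions below
// are glued together in order:
//   .param .b32 param0;                  // DeclareScalarParam
//   st.param.b32 \t[param0+0], %r1;      // StoreParam
//   .param .b32 retval0;                 // DeclareRet
//   call.uni (retval0), foo, (param0);   // PrintCallUni + CallVoid + CallArg*
//   ld.param.b32 \t%r2, [retval0+0];     // LoadParam
// The SDNPOutGlue/SDNPInGlue flags keep the fragments contiguous in emission
// order.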
+def CallArg : SDNode<"NVPTXISD::CallArg", SDTCallArgProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def LastCallArg : SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallArgEnd : SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallVoid : SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def Prototype : SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallVal : SDNode<"NVPTXISD::CallVal", SDTCallValProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def MoveParam : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, + []>; +def MoveRetval : SDNode<"NVPTXISD::MoveRetval", SDTMoveRetvalProfile, + [SDNPHasChain, SDNPSideEffect]>; +def StoreRetval : SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile, + [SDNPHasChain, SDNPSideEffect]>; +def MoveToRetval : SDNode<"NVPTXISD::MoveToRetval", SDTStoreRetvalProfile, + [SDNPHasChain, SDNPSideEffect]>; +def PseudoUseParam : SDNode<"NVPTXISD::PseudoUseParam", + SDTPseudoUseParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def RETURNNode : SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile, + [SDNPHasChain, SDNPSideEffect]>; + +class LoadParamMemInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs regclass:$dst), (ins i32imm:$b), + !strconcat(!strconcat("ld.param", opstr), + "\t$dst, [retval0+$b];"), + [(set regclass:$dst, (LoadParam (i32 1), (i32 imm:$b)))]>; + +class LoadParamRegInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs regclass:$dst), (ins i32imm:$b), + !strconcat(!strconcat("mov", opstr), + "\t$dst, retval$b;"), + [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>; + +class StoreParamInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b), + !strconcat(!strconcat("st.param", opstr), + "\t[param$a+$b], $val;"), + [(StoreParam (i32 imm:$a), (i32 imm:$b), regclass:$val)]>; + +class MoveToParamInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b), + !strconcat(!strconcat("mov", opstr), + "\tparam$a, $val;"), + [(MoveToParam (i32 imm:$a), (i32 imm:$b), regclass:$val)]>; + +class StoreRetvalInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), (ins regclass:$val, i32imm:$a), + !strconcat(!strconcat("st.param", opstr), + "\t[func_retval0+$a], $val;"), + [(StoreRetval (i32 imm:$a), regclass:$val)]>; + +class MoveToRetvalInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), (ins i32imm:$num, regclass:$val), + !strconcat(!strconcat("mov", opstr), + "\tfunc_retval$num, $val;"), + [(MoveToRetval (i32 imm:$num), regclass:$val)]>; + +class MoveRetvalInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), (ins regclass:$val), + !strconcat(!strconcat("mov", opstr), + "\tfunc_retval0, $val;"), + [(MoveRetval regclass:$val)]>; + +def PrintCallRetInst1 : NVPTXInst<(outs), (ins), +"call (retval0), ", + [(PrintCall (i32 1))]>; +def PrintCallRetInst2 : NVPTXInst<(outs), (ins), +"call (retval0, retval1), ", + [(PrintCall (i32 2))]>; +def PrintCallRetInst3 : NVPTXInst<(outs), (ins), +"call (retval0, retval1, retval2), ", + [(PrintCall (i32 3))]>; +def PrintCallRetInst4 : NVPTXInst<(outs), (ins), +"call (retval0, retval1, retval2, retval3), ", + [(PrintCall (i32 4))]>; +def 
PrintCallRetInst5 : NVPTXInst<(outs), (ins), +"call (retval0, retval1, retval2, retval3, retval4), ", + [(PrintCall (i32 5))]>; +def PrintCallRetInst6 : NVPTXInst<(outs), (ins), +"call (retval0, retval1, retval2, retval3, retval4, retval5), ", + [(PrintCall (i32 6))]>; +def PrintCallRetInst7 : NVPTXInst<(outs), (ins), +"call (retval0, retval1, retval2, retval3, retval4, retval5, retval6), ", + [(PrintCall (i32 7))]>; +def PrintCallRetInst8 : NVPTXInst<(outs), (ins), +!strconcat("call (retval0, retval1, retval2, retval3, retval4", + ", retval5, retval6, retval7), "), + [(PrintCall (i32 8))]>; + +def PrintCallNoRetInst : NVPTXInst<(outs), (ins), "call ", + [(PrintCall (i32 0))]>; + +def PrintCallUniRetInst1 : NVPTXInst<(outs), (ins), +"call.uni (retval0), ", + [(PrintCallUni (i32 1))]>; +def PrintCallUniRetInst2 : NVPTXInst<(outs), (ins), +"call.uni (retval0, retval1), ", + [(PrintCallUni (i32 2))]>; +def PrintCallUniRetInst3 : NVPTXInst<(outs), (ins), +"call.uni (retval0, retval1, retval2), ", + [(PrintCallUni (i32 3))]>; +def PrintCallUniRetInst4 : NVPTXInst<(outs), (ins), +"call.uni (retval0, retval1, retval2, retval3), ", + [(PrintCallUni (i32 4))]>; +def PrintCallUniRetInst5 : NVPTXInst<(outs), (ins), +"call.uni (retval0, retval1, retval2, retval3, retval4), ", + [(PrintCallUni (i32 5))]>; +def PrintCallUniRetInst6 : NVPTXInst<(outs), (ins), +"call.uni (retval0, retval1, retval2, retval3, retval4, retval5), ", + [(PrintCallUni (i32 6))]>; +def PrintCallUniRetInst7 : NVPTXInst<(outs), (ins), +"call.uni (retval0, retval1, retval2, retval3, retval4, retval5, retval6), ", + [(PrintCallUni (i32 7))]>; +def PrintCallUniRetInst8 : NVPTXInst<(outs), (ins), +!strconcat("call.uni (retval0, retval1, retval2, retval3, retval4", + ", retval5, retval6, retval7), "), + [(PrintCallUni (i32 8))]>; + +def PrintCallUniNoRetInst : NVPTXInst<(outs), (ins), "call.uni ", + [(PrintCallUni (i32 0))]>; + +def LoadParamMemI64 : LoadParamMemInst<Int64Regs, ".b64">; +def LoadParamMemI32 : LoadParamMemInst<Int32Regs, ".b32">; +def LoadParamMemI16 : LoadParamMemInst<Int16Regs, ".b16">; +def LoadParamMemI8 : LoadParamMemInst<Int8Regs, ".b8">; + +//def LoadParamMemI16 : NVPTXInst<(outs Int16Regs:$dst), (ins i32imm:$b), +// !strconcat("ld.param.b32\ttemp_param_reg, [retval0+$b];\n\t", +// "cvt.u16.u32\t$dst, temp_param_reg;"), +// [(set Int16Regs:$dst, (LoadParam (i32 1), (i32 imm:$b)))]>; +//def LoadParamMemI8 : NVPTXInst<(outs Int8Regs:$dst), (ins i32imm:$b), +// !strconcat("ld.param.b32\ttemp_param_reg, [retval0+$b];\n\t", +// "cvt.u16.u32\t$dst, temp_param_reg;"), +// [(set Int8Regs:$dst, (LoadParam (i32 1), (i32 imm:$b)))]>; + +def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".f32">; +def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".f64">; + +def LoadParamRegI64 : LoadParamRegInst<Int64Regs, ".b64">; +def LoadParamRegI32 : LoadParamRegInst<Int32Regs, ".b32">; +def LoadParamRegI16 : NVPTXInst<(outs Int16Regs:$dst), (ins i32imm:$b), + "cvt.u16.u32\t$dst, retval$b;", + [(set Int16Regs:$dst, + (LoadParam (i32 0), (i32 imm:$b)))]>; +def LoadParamRegI8 : NVPTXInst<(outs Int8Regs:$dst), (ins i32imm:$b), + "cvt.u16.u32\t$dst, retval$b;", + [(set Int8Regs:$dst, + (LoadParam (i32 0), (i32 imm:$b)))]>; + +def LoadParamRegF32 : LoadParamRegInst<Float32Regs, ".f32">; +def LoadParamRegF64 : LoadParamRegInst<Float64Regs, ".f64">; + +def StoreParamI64 : StoreParamInst<Int64Regs, ".b64">; +def StoreParamI32 : StoreParamInst<Int32Regs, ".b32">; + +def StoreParamI16 : NVPTXInst<(outs), + (ins Int16Regs:$val, i32imm:$a, 
i32imm:$b), + "st.param.b16\t[param$a+$b], $val;", + [(StoreParam (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>; + +def StoreParamI8 : NVPTXInst<(outs), + (ins Int8Regs:$val, i32imm:$a, i32imm:$b), + "st.param.b8\t[param$a+$b], $val;", + [(StoreParam + (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>; + +def StoreParamS32I16 : NVPTXInst<(outs), + (ins Int16Regs:$val, i32imm:$a, i32imm:$b), + !strconcat("cvt.s32.s16\ttemp_param_reg, $val;\n\t", + "st.param.b32\t[param$a+$b], temp_param_reg;"), + [(StoreParamS32 (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>; +def StoreParamU32I16 : NVPTXInst<(outs), + (ins Int16Regs:$val, i32imm:$a, i32imm:$b), + !strconcat("cvt.u32.u16\ttemp_param_reg, $val;\n\t", + "st.param.b32\t[param$a+$b], temp_param_reg;"), + [(StoreParamU32 (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>; + +def StoreParamU32I8 : NVPTXInst<(outs), + (ins Int8Regs:$val, i32imm:$a, i32imm:$b), + !strconcat("cvt.u32.u8\ttemp_param_reg, $val;\n\t", + "st.param.b32\t[param$a+$b], temp_param_reg;"), + [(StoreParamU32 (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>; +def StoreParamS32I8 : NVPTXInst<(outs), + (ins Int8Regs:$val, i32imm:$a, i32imm:$b), + !strconcat("cvt.s32.s8\ttemp_param_reg, $val;\n\t", + "st.param.b32\t[param$a+$b], temp_param_reg;"), + [(StoreParamS32 (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>; + +def StoreParamF32 : StoreParamInst<Float32Regs, ".f32">; +def StoreParamF64 : StoreParamInst<Float64Regs, ".f64">; + +def MoveToParamI64 : MoveToParamInst<Int64Regs, ".b64">; +def MoveToParamI32 : MoveToParamInst<Int32Regs, ".b32">; +def MoveToParamF64 : MoveToParamInst<Float64Regs, ".f64">; +def MoveToParamF32 : MoveToParamInst<Float32Regs, ".f32">; +def MoveToParamI16 : NVPTXInst<(outs), + (ins Int16Regs:$val, i32imm:$a, i32imm:$b), + !strconcat("cvt.u32.u16\ttemp_param_reg, $val;\n\t", + "mov.b32\tparam$a, temp_param_reg;"), + [(MoveToParam (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>; +def MoveToParamI8 : NVPTXInst<(outs), + (ins Int8Regs:$val, i32imm:$a, i32imm:$b), + !strconcat("cvt.u32.u16\ttemp_param_reg, $val;\n\t", + "mov.b32\tparam$a, temp_param_reg;"), + [(MoveToParam (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>; + +def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">; +def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">; +def StoreRetvalI16 : StoreRetvalInst<Int16Regs, ".b16">; +def StoreRetvalI8 : StoreRetvalInst<Int8Regs, ".b8">; + +//def StoreRetvalI16 : NVPTXInst<(outs), (ins Int16Regs:$val, i32imm:$a), +// !strconcat("\{\n\t", +// !strconcat(".reg .b32 temp_retval_reg;\n\t", +// !strconcat("cvt.u32.u16\ttemp_retval_reg, $val;\n\t", +// "st.param.b32\t[func_retval0+$a], temp_retval_reg;\n\t\}"))), +// [(StoreRetval (i32 imm:$a), Int16Regs:$val)]>; +//def StoreRetvalI8 : NVPTXInst<(outs), (ins Int8Regs:$val, i32imm:$a), +// !strconcat("\{\n\t", +// !strconcat(".reg .b32 temp_retval_reg;\n\t", +// !strconcat("cvt.u32.u16\ttemp_retval_reg, $val;\n\t", +// "st.param.b32\t[func_retval0+$a], temp_retval_reg;\n\t\}"))), +// [(StoreRetval (i32 imm:$a), Int8Regs:$val)]>; + +def StoreRetvalF64 : StoreRetvalInst<Float64Regs, ".f64">; +def StoreRetvalF32 : StoreRetvalInst<Float32Regs, ".f32">; + +def MoveRetvalI64 : MoveRetvalInst<Int64Regs, ".b64">; +def MoveRetvalI32 : MoveRetvalInst<Int32Regs, ".b32">; +def MoveRetvalI16 : MoveRetvalInst<Int16Regs, ".b16">; +def MoveRetvalI8 : MoveRetvalInst<Int8Regs, ".b8">; +def MoveRetvalF64 : MoveRetvalInst<Float64Regs, ".f64">; +def MoveRetvalF32 : MoveRetvalInst<Float32Regs, ".f32">; + +def MoveToRetvalI64 : 
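// Note (editorial): the *S32/*U32 store-param variants above serve arguments
// that the calling convention promotes to 32 bits; they widen through the
// temp_param_reg scratch register (declared by Callseq_Start further below):
//   cvt.s32.s16 \ttemp_param_reg, %rs1;        // sign-extending case
//   st.param.b32 \t[param0+0], temp_param_reg;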
MoveToRetvalInst<Int64Regs, ".b64">; +def MoveToRetvalI32 : MoveToRetvalInst<Int32Regs, ".b32">; +def MoveToRetvalF64 : MoveToRetvalInst<Float64Regs, ".f64">; +def MoveToRetvalF32 : MoveToRetvalInst<Float32Regs, ".f32">; +def MoveToRetvalI16 : NVPTXInst<(outs), (ins i32imm:$num, Int16Regs:$val), + "cvt.u32.u16\tfunc_retval$num, $val;", + [(MoveToRetval (i32 imm:$num), Int16Regs:$val)]>; +def MoveToRetvalI8 : NVPTXInst<(outs), (ins i32imm:$num, Int8Regs:$val), + "cvt.u32.u16\tfunc_retval$num, $val;", + [(MoveToRetval (i32 imm:$num), Int8Regs:$val)]>; + +def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>; +def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>; +def CallArgEndInst0 : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>; +def RETURNInst : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>; + +class CallArgInst<NVPTXRegClass regclass> : + NVPTXInst<(outs), (ins regclass:$a), "$a, ", + [(CallArg (i32 0), regclass:$a)]>; + +class LastCallArgInst<NVPTXRegClass regclass> : + NVPTXInst<(outs), (ins regclass:$a), "$a", + [(LastCallArg (i32 0), regclass:$a)]>; + +def CallArgI64 : CallArgInst<Int64Regs>; +def CallArgI32 : CallArgInst<Int32Regs>; +def CallArgI16 : CallArgInst<Int16Regs>; +def CallArgI8 : CallArgInst<Int8Regs>; + +def CallArgF64 : CallArgInst<Float64Regs>; +def CallArgF32 : CallArgInst<Float32Regs>; + +def LastCallArgI64 : LastCallArgInst<Int64Regs>; +def LastCallArgI32 : LastCallArgInst<Int32Regs>; +def LastCallArgI16 : LastCallArgInst<Int16Regs>; +def LastCallArgI8 : LastCallArgInst<Int8Regs>; + +def LastCallArgF64 : LastCallArgInst<Float64Regs>; +def LastCallArgF32 : LastCallArgInst<Float32Regs>; + +def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ", + [(CallArg (i32 0), (i32 imm:$a))]>; +def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a", + [(LastCallArg (i32 0), (i32 imm:$a))]>; + +def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ", + [(CallArg (i32 1), (i32 imm:$a))]>; +def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a", + [(LastCallArg (i32 1), (i32 imm:$a))]>; + +def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr), + "$addr, ", + [(CallVoid (Wrapper tglobaladdr:$addr))]>; +def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr), + "$addr, ", + [(CallVoid Int32Regs:$addr)]>; +def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), + "$addr, ", + [(CallVoid Int64Regs:$addr)]>; +def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val), + ", prototype_$val;", + [(Prototype (i32 imm:$val))]>; + +def DeclareRetMemInst : NVPTXInst<(outs), + (ins i32imm:$align, i32imm:$size, i32imm:$num), + ".param .align $align .b8 retval$num[$size];", + [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>; +def DeclareRetScalarInst : NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num), + ".param .b$size retval$num;", + [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>; +def DeclareRetRegInst : NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num), + ".reg .b$size retval$num;", + [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>; + +def DeclareParamInst : NVPTXInst<(outs), + (ins i32imm:$align, i32imm:$a, i32imm:$size), + ".param .align $align .b8 param$a[$size];", + [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>; +def DeclareScalarParamInst : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), + ".param .b$size param$a;", + [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>; +def DeclareScalarRegInst : NVPTXInst<(outs), 
(ins i32imm:$a, i32imm:$size), + ".reg .b$size param$a;", + [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>; + +class MoveParamInst<NVPTXRegClass regclass, string asmstr> : + NVPTXInst<(outs regclass:$dst), (ins regclass:$src), + !strconcat(!strconcat("mov", asmstr), "\t$dst, $src;"), + [(set regclass:$dst, (MoveParam regclass:$src))]>; + +def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">; +def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">; +def MoveParamI16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), + "cvt.u16.u32\t$dst, $src;", + [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>; +def MoveParamI8 : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$src), + "cvt.u16.u32\t$dst, $src;", + [(set Int8Regs:$dst, (MoveParam Int8Regs:$src))]>; +def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">; +def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">; + +class PseudoUseParamInst<NVPTXRegClass regclass> : + NVPTXInst<(outs), (ins regclass:$src), + "// Pseudo use of $src", + [(PseudoUseParam regclass:$src)]>; + +def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>; +def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>; +def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>; +def PseudoUseParamI8 : PseudoUseParamInst<Int8Regs>; +def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>; +def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>; + + +// +// Load / Store Handling +// +multiclass LD<NVPTXRegClass regclass> { + def _avar : NVPTXInst<(outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr), +!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t$dst, [$addr];"), []>; + def _areg : NVPTXInst<(outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr), +!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t$dst, [$addr];"), []>; + def _ari : NVPTXInst<(outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), +!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t$dst, [$addr+$offset];"), []>; + def _asi : NVPTXInst<(outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr, i32imm:$offset), +!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t$dst, [$addr+$offset];"), []>; +} + +let mayLoad=1, neverHasSideEffects=1 in { +defm LD_i8 : LD<Int8Regs>; +defm LD_i16 : LD<Int16Regs>; +defm LD_i32 : LD<Int32Regs>; +defm LD_i64 : LD<Int64Regs>; +defm LD_f32 : LD<Float32Regs>; +defm LD_f64 : LD<Float64Regs>; +} + +let VecInstType=isVecLD.Value, mayLoad=1, neverHasSideEffects=1 in { +defm LD_v2i8 : LD<V2I8Regs>; +defm LD_v4i8 : LD<V4I8Regs>; +defm LD_v2i16 : LD<V2I16Regs>; +defm LD_v4i16 : LD<V4I16Regs>; +defm LD_v2i32 : LD<V2I32Regs>; +defm LD_v4i32 : LD<V4I32Regs>; +defm LD_v2f32 : LD<V2F32Regs>; +defm LD_v4f32 : LD<V4F32Regs>; +defm LD_v2i64 : LD<V2I64Regs>; +defm LD_v2f64 : LD<V2F64Regs>; +} + +multiclass ST<NVPTXRegClass regclass> { + def _avar : NVPTXInst<(outs), + (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$toWidth, imem:$addr), +!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth", + " \t[$addr], $src;"), []>; + def _areg : NVPTXInst<(outs), + (ins 
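// Illustration (editorial; 'gbuf' and register names are hypothetical): the
// four address forms of the LD/ST multiclasses print as
//   ld.global.u32 \t%r1, [gbuf];       // _avar: symbol
//   ld.global.u32 \t%r1, [gbuf+4];     // _asi:  symbol + immediate offset
//   ld.global.u32 \t%r1, [%r2];        // _areg: register
//   ld.global.u32 \t%r1, [%r2+4];      // _ari:  register + immediate offset
// with the LdStCode operands filling in the volatile / address-space /
// vector / sign-and-width pieces at print time.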
regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr), +!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth", + " \t[$addr], $src;"), []>; + def _ari : NVPTXInst<(outs), + (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset), +!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth", + " \t[$addr+$offset], $src;"), []>; + def _asi : NVPTXInst<(outs), + (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset), +!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth", + " \t[$addr+$offset], $src;"), []>; +} + +let mayStore=1, neverHasSideEffects=1 in { +defm ST_i8 : ST<Int8Regs>; +defm ST_i16 : ST<Int16Regs>; +defm ST_i32 : ST<Int32Regs>; +defm ST_i64 : ST<Int64Regs>; +defm ST_f32 : ST<Float32Regs>; +defm ST_f64 : ST<Float64Regs>; +} + +let VecInstType=isVecST.Value, mayStore=1, neverHasSideEffects=1 in { +defm ST_v2i8 : ST<V2I8Regs>; +defm ST_v4i8 : ST<V4I8Regs>; +defm ST_v2i16 : ST<V2I16Regs>; +defm ST_v4i16 : ST<V4I16Regs>; +defm ST_v2i32 : ST<V2I32Regs>; +defm ST_v4i32 : ST<V4I32Regs>; +defm ST_v2f32 : ST<V2F32Regs>; +defm ST_v4f32 : ST<V4F32Regs>; +defm ST_v2i64 : ST<V2I64Regs>; +defm ST_v2f64 : ST<V2F64Regs>; +} + +// The following is used only in and after vector elementizations. +// Vector elementization happens at the machine instruction level, so the +// following instruction +// never appears in the DAG. +multiclass LD_VEC<NVPTXRegClass regclass> { + def _v2_avar : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>; + def _v2_areg : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>; + def _v2_ari : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>; + def _v2_asi : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr, i32imm:$offset), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>; + def _v4_avar : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, + regclass:$dst3, regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>; + def _v4_areg : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, + regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth 
\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>; + def _v4_ari : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, + regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"), + []>; + def _v4_asi : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, + regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr, i32imm:$offset), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"), + []>; +} +let mayLoad=1, neverHasSideEffects=1 in { +defm LDV_i8 : LD_VEC<Int8Regs>; +defm LDV_i16 : LD_VEC<Int16Regs>; +defm LDV_i32 : LD_VEC<Int32Regs>; +defm LDV_i64 : LD_VEC<Int64Regs>; +defm LDV_f32 : LD_VEC<Float32Regs>; +defm LDV_f64 : LD_VEC<Float64Regs>; +} + +multiclass ST_VEC<NVPTXRegClass regclass> { + def _v2_avar : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr], {{$src1, $src2}};"), []>; + def _v2_areg : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr], {{$src1, $src2}};"), []>; + def _v2_ari : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, + i32imm:$offset), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>; + def _v2_asi : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, + i32imm:$offset), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>; + def _v4_avar : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>; + def _v4_areg : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>; + def _v4_ari : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"), + []>; + def _v4_asi : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, 
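// Illustration (registers hypothetical): the vector forms print brace-wrapped
// register tuples, e.g.
//   ld.global.v4.f32 \t{%f1, %f2, %f3, %f4}, [%r1+16];
//   st.global.v2.f64 \t[%r1], {%fd1, %fd2};
// The doubled {{ }} in the templates escape to a single literal brace pair in
// the emitted PTX.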
LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr, i32imm:$offset), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"), + []>; +} +let mayStore=1, neverHasSideEffects=1 in { +defm STV_i8 : ST_VEC<Int8Regs>; +defm STV_i16 : ST_VEC<Int16Regs>; +defm STV_i32 : ST_VEC<Int32Regs>; +defm STV_i64 : ST_VEC<Int64Regs>; +defm STV_f32 : ST_VEC<Float32Regs>; +defm STV_f64 : ST_VEC<Float64Regs>; +} + + +//---- Conversion ---- + +multiclass CVT_INT_TO_FP <string OpStr, SDNode OpNode> { +// FIXME: need to add f16 support +// def CVTf16i8 : +// NVPTXInst<(outs Float16Regs:$d), (ins Int8Regs:$a), +// !strconcat(!strconcat("cvt.rn.f16.", OpStr), "8 \t$d, $a;"), +// [(set Float16Regs:$d, (OpNode Int8Regs:$a))]>; +// def CVTf16i16 : +// NVPTXInst<(outs Float16Regs:$d), (ins Int16Regs:$a), +// !strconcat(!strconcat("cvt.rn.f16.", OpStr), "16 \t$d, $a;"), +// [(set Float16Regs:$d, (OpNode Int16Regs:$a))]>; +// def CVTf16i32 : +// NVPTXInst<(outs Float16Regs:$d), (ins Int32Regs:$a), +// !strconcat(!strconcat("cvt.rn.f16.", OpStr), "32 \t$d, $a;"), +// [(set Float16Regs:$d, (OpNode Int32Regs:$a))]>; +// def CVTf16i64: +// NVPTXInst<(outs Float16Regs:$d), (ins Int64Regs:$a), +// !strconcat(!strconcat("cvt.rn.f32.", OpStr), "64 \t$d, $a;"), +// [(set Float32Regs:$d, (OpNode Int64Regs:$a))]>; + + def CVTf32i1 : + NVPTXInst<(outs Float32Regs:$d), (ins Int1Regs:$a), + "selp.f32 \t$d, 1.0, 0.0, $a;", + [(set Float32Regs:$d, (OpNode Int1Regs:$a))]>; + def CVTf32i8 : + NVPTXInst<(outs Float32Regs:$d), (ins Int8Regs:$a), + !strconcat(!strconcat("cvt.rn.f32.", OpStr), "8 \t$d, $a;"), + [(set Float32Regs:$d, (OpNode Int8Regs:$a))]>; + def CVTf32i16 : + NVPTXInst<(outs Float32Regs:$d), (ins Int16Regs:$a), + !strconcat(!strconcat("cvt.rn.f32.", OpStr), "16 \t$d, $a;"), + [(set Float32Regs:$d, (OpNode Int16Regs:$a))]>; + def CVTf32i32 : + NVPTXInst<(outs Float32Regs:$d), (ins Int32Regs:$a), + !strconcat(!strconcat("cvt.rn.f32.", OpStr), "32 \t$d, $a;"), + [(set Float32Regs:$d, (OpNode Int32Regs:$a))]>; + def CVTf32i64: + NVPTXInst<(outs Float32Regs:$d), (ins Int64Regs:$a), + !strconcat(!strconcat("cvt.rn.f32.", OpStr), "64 \t$d, $a;"), + [(set Float32Regs:$d, (OpNode Int64Regs:$a))]>; + + def CVTf64i1 : + NVPTXInst<(outs Float64Regs:$d), (ins Int1Regs:$a), + "selp.f64 \t$d, 1.0, 0.0, $a;", + [(set Float64Regs:$d, (OpNode Int1Regs:$a))]>; + def CVTf64i8 : + NVPTXInst<(outs Float64Regs:$d), (ins Int8Regs:$a), + !strconcat(!strconcat("cvt.rn.f64.", OpStr), "8 \t$d, $a;"), + [(set Float64Regs:$d, (OpNode Int8Regs:$a))]>; + def CVTf64i16 : + NVPTXInst<(outs Float64Regs:$d), (ins Int16Regs:$a), + !strconcat(!strconcat("cvt.rn.f64.", OpStr), "16 \t$d, $a;"), + [(set Float64Regs:$d, (OpNode Int16Regs:$a))]>; + def CVTf64i32 : + NVPTXInst<(outs Float64Regs:$d), (ins Int32Regs:$a), + !strconcat(!strconcat("cvt.rn.f64.", OpStr), "32 \t$d, $a;"), + [(set Float64Regs:$d, (OpNode Int32Regs:$a))]>; + def CVTf64i64: + NVPTXInst<(outs Float64Regs:$d), (ins Int64Regs:$a), + !strconcat(!strconcat("cvt.rn.f64.", OpStr), "64 \t$d, $a;"), + [(set Float64Regs:$d, (OpNode Int64Regs:$a))]>; +} + +defm Sint_to_fp : CVT_INT_TO_FP <"s", sint_to_fp>; +defm Uint_to_fp : CVT_INT_TO_FP <"u", uint_to_fp>; + +multiclass CVT_FP_TO_INT <string OpStr, SDNode OpNode> { +// FIXME: need to add f16 support +// def CVTi8f16: +// NVPTXInst<(outs Int8Regs:$d), (ins Float16Regs:$a), +// !strconcat(!strconcat("cvt.rzi.", OpStr), "8.f16 $d, $a;"), +// 
[(set Int8Regs:$d, (OpNode Float16Regs:$a))]>; + def CVTi8f32_ftz: + NVPTXInst<(outs Int8Regs:$d), (ins Float32Regs:$a), + !strconcat(!strconcat("cvt.rzi.ftz.", OpStr), "16.f32 \t$d, $a;"), + [(set Int8Regs:$d, (OpNode Float32Regs:$a))]>, Requires<[doF32FTZ]>; + def CVTi8f32: + NVPTXInst<(outs Int8Regs:$d), (ins Float32Regs:$a), + !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f32 \t$d, $a;"), + [(set Int8Regs:$d, (OpNode Float32Regs:$a))]>; + def CVTi8f64: + NVPTXInst<(outs Int8Regs:$d), (ins Float64Regs:$a), + !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f64 \t$d, $a;"), + [(set Int8Regs:$d, (OpNode Float64Regs:$a))]>; + +// FIXME: need to add f16 support +// def CVTi16f16: +// NVPTXInst<(outs Int16Regs:$d), (ins Float16Regs:$a), +// !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f16 \t$d, $a;"), +// [(set Int16Regs:$d, (OpNode Float16Regs:$a))]>; + def CVTi16f32_ftz: + NVPTXInst<(outs Int16Regs:$d), (ins Float32Regs:$a), + !strconcat(!strconcat("cvt.rzi.ftz.", OpStr), "16.f32 \t$d, $a;"), + [(set Int16Regs:$d, (OpNode Float32Regs:$a))]>, Requires<[doF32FTZ]>; + def CVTi16f32: + NVPTXInst<(outs Int16Regs:$d), (ins Float32Regs:$a), + !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f32 \t$d, $a;"), + [(set Int16Regs:$d, (OpNode Float32Regs:$a))]>; + def CVTi16f64: + NVPTXInst<(outs Int16Regs:$d), (ins Float64Regs:$a), + !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f64 \t$d, $a;"), + [(set Int16Regs:$d, (OpNode Float64Regs:$a))]>; + +// FIXME: need to add f16 support +// def CVTi32f16: def CVTi32f16: +// NVPTXInst<(outs Int32Regs:$d), (ins Float16Regs:$a), +// !strconcat(!strconcat("cvt.rzi.", OpStr), "32.f16 \t$d, $a;"), +// [(set Int32Regs:$d, (OpNode Float16Regs:$a))]>; + def CVTi32f32_ftz: + NVPTXInst<(outs Int32Regs:$d), (ins Float32Regs:$a), + !strconcat(!strconcat("cvt.rzi.ftz.", OpStr), "32.f32 \t$d, $a;"), + [(set Int32Regs:$d, (OpNode Float32Regs:$a))]>, Requires<[doF32FTZ]>; + def CVTi32f32: + NVPTXInst<(outs Int32Regs:$d), (ins Float32Regs:$a), + !strconcat(!strconcat("cvt.rzi.", OpStr), "32.f32 \t$d, $a;"), + [(set Int32Regs:$d, (OpNode Float32Regs:$a))]>; + def CVTi32f64: + NVPTXInst<(outs Int32Regs:$d), (ins Float64Regs:$a), + !strconcat(!strconcat("cvt.rzi.", OpStr), "32.f64 \t$d, $a;"), + [(set Int32Regs:$d, (OpNode Float64Regs:$a))]>; + +// FIXME: need to add f16 support +// def CVTi64f16: +// NVPTXInst<(outs Int64Regs:$d), (ins Float16Regs:$a), +// !strconcat(!strconcat("cvt.rzi.", OpStr), "64.f16 \t$d, $a;"), +// [(set Int64Regs:$d, (OpNode Float16Regs:$a))]>; + def CVTi64f32_ftz: + NVPTXInst<(outs Int64Regs:$d), (ins Float32Regs:$a), + !strconcat(!strconcat("cvt.rzi.ftz.", OpStr), "64.f32 \t$d, $a;"), + [(set Int64Regs:$d, (OpNode Float32Regs:$a))]>, Requires<[doF32FTZ]>; + def CVTi64f32: + NVPTXInst<(outs Int64Regs:$d), (ins Float32Regs:$a), + !strconcat(!strconcat("cvt.rzi.", OpStr), "64.f32 \t$d, $a;"), + [(set Int64Regs:$d, (OpNode Float32Regs:$a))]>; + def CVTi64f64: + NVPTXInst<(outs Int64Regs:$d), (ins Float64Regs:$a), + !strconcat(!strconcat("cvt.rzi.", OpStr), "64.f64 \t$d, $a;"), + [(set Int64Regs:$d, (OpNode Float64Regs:$a))]>; +} + +defm Fp_to_sint : CVT_FP_TO_INT <"s", fp_to_sint>; +defm Fp_to_uint : CVT_FP_TO_INT <"u", fp_to_uint>; + +multiclass INT_EXTEND_UNSIGNED_1 <SDNode OpNode> { + def ext1to8: + NVPTXInst<(outs Int8Regs:$d), (ins Int1Regs:$a), + "selp.u16 \t$d, 1, 0, $a;", + [(set Int8Regs:$d, (OpNode Int1Regs:$a))]>; + def ext1to16: + NVPTXInst<(outs Int16Regs:$d), (ins Int1Regs:$a), + "selp.u16 \t$d, 1, 0, $a;", + [(set Int16Regs:$d, (OpNode 
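// Note (editorial): int-to-fp conversions above round to nearest even (.rn),
// while fp-to-int truncates toward zero (.rzi), matching C cast semantics:
//   cvt.rn.f32.s32 \t%f1, %r1;    // (float)i
//   cvt.rzi.s32.f32 \t%r2, %f1;   // (int)f
// i8 results are produced at 16-bit width (the "16.f32" suffixes above) since
// PTX keeps 8-bit values in 16-bit registers.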
Int1Regs:$a))]>;
+  def ext1to32:
+    NVPTXInst<(outs Int32Regs:$d), (ins Int1Regs:$a),
+              "selp.u32 \t$d, 1, 0, $a;",
+              [(set Int32Regs:$d, (OpNode Int1Regs:$a))]>;
+  def ext1to64:
+    NVPTXInst<(outs Int64Regs:$d), (ins Int1Regs:$a),
+              "selp.u64 \t$d, 1, 0, $a;",
+              [(set Int64Regs:$d, (OpNode Int1Regs:$a))]>;
+}
+
+multiclass INT_EXTEND_SIGNED_1 <SDNode OpNode> {
+  def ext1to8:
+    NVPTXInst<(outs Int8Regs:$d), (ins Int1Regs:$a),
+              "selp.s16 \t$d, -1, 0, $a;",
+              [(set Int8Regs:$d, (OpNode Int1Regs:$a))]>;
+  def ext1to16:
+    NVPTXInst<(outs Int16Regs:$d), (ins Int1Regs:$a),
+              "selp.s16 \t$d, -1, 0, $a;",
+              [(set Int16Regs:$d, (OpNode Int1Regs:$a))]>;
+  def ext1to32:
+    NVPTXInst<(outs Int32Regs:$d), (ins Int1Regs:$a),
+              "selp.s32 \t$d, -1, 0, $a;",
+              [(set Int32Regs:$d, (OpNode Int1Regs:$a))]>;
+  def ext1to64:
+    NVPTXInst<(outs Int64Regs:$d), (ins Int1Regs:$a),
+              "selp.s64 \t$d, -1, 0, $a;",
+              [(set Int64Regs:$d, (OpNode Int1Regs:$a))]>;
+}
+
+multiclass INT_EXTEND <string OpStr, SDNode OpNode> {
+  // All Int8Regs are emitted as 16-bit registers in PTX,
+  // and there is no selp.u8 in PTX.
+  def ext8to16:
+    NVPTXInst<(outs Int16Regs:$d), (ins Int8Regs:$a),
+              !strconcat("cvt.", !strconcat(OpStr, !strconcat("16.",
+              !strconcat(OpStr, "8 \t$d, $a;")))),
+              [(set Int16Regs:$d, (OpNode Int8Regs:$a))]>;
+  def ext8to32:
+    NVPTXInst<(outs Int32Regs:$d), (ins Int8Regs:$a),
+              !strconcat("cvt.", !strconcat(OpStr, !strconcat("32.",
+              !strconcat(OpStr, "8 \t$d, $a;")))),
+              [(set Int32Regs:$d, (OpNode Int8Regs:$a))]>;
+  def ext8to64:
+    NVPTXInst<(outs Int64Regs:$d), (ins Int8Regs:$a),
+              !strconcat("cvt.", !strconcat(OpStr, !strconcat("64.",
+              !strconcat(OpStr, "8 \t$d, $a;")))),
+              [(set Int64Regs:$d, (OpNode Int8Regs:$a))]>;
+  def ext16to32:
+    NVPTXInst<(outs Int32Regs:$d), (ins Int16Regs:$a),
+              !strconcat("cvt.", !strconcat(OpStr, !strconcat("32.",
+              !strconcat(OpStr, "16 \t$d, $a;")))),
+              [(set Int32Regs:$d, (OpNode Int16Regs:$a))]>;
+  def ext16to64:
+    NVPTXInst<(outs Int64Regs:$d), (ins Int16Regs:$a),
+              !strconcat("cvt.", !strconcat(OpStr, !strconcat("64.",
+              !strconcat(OpStr, "16 \t$d, $a;")))),
+              [(set Int64Regs:$d, (OpNode Int16Regs:$a))]>;
+  def ext32to64:
+    NVPTXInst<(outs Int64Regs:$d), (ins Int32Regs:$a),
+              !strconcat("cvt.", !strconcat(OpStr, !strconcat("64.",
+              !strconcat(OpStr, "32 \t$d, $a;")))),
+              [(set Int64Regs:$d, (OpNode Int32Regs:$a))]>;
+}
+
+defm Sint_extend_1 : INT_EXTEND_SIGNED_1<sext>;
+defm Zint_extend_1 : INT_EXTEND_UNSIGNED_1<zext>;
+defm Aint_extend_1 : INT_EXTEND_UNSIGNED_1<anyext>;
+
+defm Sint_extend : INT_EXTEND <"s", sext>;
+defm Zint_extend : INT_EXTEND <"u", zext>;
+defm Aint_extend : INT_EXTEND <"u", anyext>;
+
+class TRUNC_to1_asm<string sz> {
+  string s = !strconcat("{{\n\t",
+             !strconcat(".reg ",
+             !strconcat(sz,
+             !strconcat(" temp;\n\t",
+             !strconcat("and",
+             !strconcat(sz,
+             !strconcat("\t temp, $a, 1;\n\t",
+             !strconcat("setp",
+             !strconcat(sz, ".eq \t $d, temp, 1;\n\t}}")))))))));
+}
+
+def TRUNC_64to32 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+                             "cvt.u32.u64 \t$d, $a;",
+                             [(set Int32Regs:$d, (trunc Int64Regs:$a))]>;
+def TRUNC_64to16 : NVPTXInst<(outs Int16Regs:$d), (ins Int64Regs:$a),
+                             "cvt.u16.u64 \t$d, $a;",
+                             [(set Int16Regs:$d, (trunc Int64Regs:$a))]>;
+def TRUNC_64to8 : NVPTXInst<(outs Int8Regs:$d), (ins Int64Regs:$a),
+                            "cvt.u8.u64 \t$d, $a;",
+                            [(set Int8Regs:$d, (trunc Int64Regs:$a))]>;
+def TRUNC_32to16 : NVPTXInst<(outs Int16Regs:$d), (ins Int32Regs:$a),
+                             "cvt.u16.u32 \t$d, $a;",
+                             [(set Int16Regs:$d, (trunc Int32Regs:$a))]>;
+def TRUNC_32to8 :
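// Illustration: truncation to i1 cannot be a plain cvt; TRUNC_to1_asm tests
// the low bit instead. For example, TRUNC_32to1 expands to
//   {{ .reg .b32 temp;
//      and.b32 \t temp, $a, 1;
//      setp.b32.eq \t $d, temp, 1; }}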
NVPTXInst<(outs Int8Regs:$d), (ins Int32Regs:$a), + "cvt.u8.u32 \t$d, $a;", + [(set Int8Regs:$d, (trunc Int32Regs:$a))]>; +def TRUNC_16to8 : NVPTXInst<(outs Int8Regs:$d), (ins Int16Regs:$a), + "cvt.u8.u16 \t$d, $a;", + [(set Int8Regs:$d, (trunc Int16Regs:$a))]>; +def TRUNC_64to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + TRUNC_to1_asm<".b64">.s, + [(set Int1Regs:$d, (trunc Int64Regs:$a))]>; +def TRUNC_32to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), + TRUNC_to1_asm<".b32">.s, + [(set Int1Regs:$d, (trunc Int32Regs:$a))]>; +def TRUNC_16to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int16Regs:$a), + TRUNC_to1_asm<".b16">.s, + [(set Int1Regs:$d, (trunc Int16Regs:$a))]>; +def TRUNC_8to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int8Regs:$a), + TRUNC_to1_asm<".b16">.s, + [(set Int1Regs:$d, (trunc Int8Regs:$a))]>; + +// Select instructions +def : Pat<(select Int32Regs:$pred, Int8Regs:$a, Int8Regs:$b), + (SELECTi8rr Int8Regs:$a, Int8Regs:$b, (TRUNC_32to1 Int32Regs:$pred))>; +def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b), + (SELECTi16rr Int16Regs:$a, Int16Regs:$b, + (TRUNC_32to1 Int32Regs:$pred))>; +def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b), + (SELECTi32rr Int32Regs:$a, Int32Regs:$b, + (TRUNC_32to1 Int32Regs:$pred))>; +def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b), + (SELECTi64rr Int64Regs:$a, Int64Regs:$b, + (TRUNC_32to1 Int32Regs:$pred))>; +def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b), + (SELECTf32rr Float32Regs:$a, Float32Regs:$b, + (TRUNC_32to1 Int32Regs:$pred))>; +def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b), + (SELECTf64rr Float64Regs:$a, Float64Regs:$b, + (TRUNC_32to1 Int32Regs:$pred))>; + +class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn, + NVPTXRegClass regclassOut> : + NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a), + !strconcat("mov.b", !strconcat(SzStr, " \t $d, $a;")), + [(set regclassOut:$d, (bitconvert regclassIn:$a))]>; + +def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>; +def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>; +def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>; +def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>; + +// pack a set of smaller int registers to a larger int register +def V4I8toI32 : NVPTXInst<(outs Int32Regs:$d), + (ins Int8Regs:$s1, Int8Regs:$s2, + Int8Regs:$s3, Int8Regs:$s4), + !strconcat("{{\n\t.reg .b8\t%t<4>;", + !strconcat("\n\tcvt.u8.u8\t%t0, $s1;", + !strconcat("\n\tcvt.u8.u8\t%t1, $s2;", + !strconcat("\n\tcvt.u8.u8\t%t2, $s3;", + !strconcat("\n\tcvt.u8.u8\t%t3, $s4;", + "\n\tmov.b32\t$d, {%t0, %t1, %t2, %t3};\n\t}}"))))), + []>; +def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d), + (ins Int16Regs:$s1, Int16Regs:$s2, + Int16Regs:$s3, Int16Regs:$s4), + "mov.b64\t$d, {{$s1, $s2, $s3, $s4}};", + []>; +def V2I8toI16 : NVPTXInst<(outs Int16Regs:$d), + (ins Int8Regs:$s1, Int8Regs:$s2), + !strconcat("{{\n\t.reg .b8\t%t<2>;", + !strconcat("\n\tcvt.u8.u8\t%t0, $s1;", + !strconcat("\n\tcvt.u8.u8\t%t1, $s2;", + "\n\tmov.b16\t$d, {%t0, %t1};\n\t}}"))), + []>; +def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d), + (ins Int16Regs:$s1, Int16Regs:$s2), + "mov.b32\t$d, {{$s1, $s2}};", + []>; +def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d), + (ins Int32Regs:$s1, Int32Regs:$s2), + "mov.b64\t$d, {{$s1, $s2}};", + []>; +def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d), + (ins Float32Regs:$s1, Float32Regs:$s2), + "mov.b64\t$d, {{$s1, $s2}};", + []>; + +// unpack a larger int register to a 
set of smaller int registers
+def I32toV4I8 : NVPTXInst<(outs Int8Regs:$d1, Int8Regs:$d2,
+                                 Int8Regs:$d3, Int8Regs:$d4),
+                          (ins Int32Regs:$s),
+                !strconcat("{{\n\t.reg .b8\t%t<4>;",
+                !strconcat("\n\tmov.b32\t{%t0, %t1, %t2, %t3}, $s;",
+                !strconcat("\n\tcvt.u8.u8\t$d1, %t0;",
+                !strconcat("\n\tcvt.u8.u8\t$d2, %t1;",
+                !strconcat("\n\tcvt.u8.u8\t$d3, %t2;",
+                           "\n\tcvt.u8.u8\t$d4, %t3;\n\t}}"))))),
+                []>;
+def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
+                                  Int16Regs:$d3, Int16Regs:$d4),
+                           (ins Int64Regs:$s),
+                           "mov.b64\t{{$d1, $d2, $d3, $d4}}, $s;",
+                           []>;
+def I16toV2I8 : NVPTXInst<(outs Int8Regs:$d1, Int8Regs:$d2),
+                          (ins Int16Regs:$s),
+                !strconcat("{{\n\t.reg .b8\t%t<2>;",
+                !strconcat("\n\tmov.b16\t{%t0, %t1}, $s;",
+                !strconcat("\n\tcvt.u8.u8\t$d1, %t0;",
+                           "\n\tcvt.u8.u8\t$d2, %t1;\n\t}}"))),
+                []>;
+def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
+                           (ins Int32Regs:$s),
+                           "mov.b32\t{{$d1, $d2}}, $s;",
+                           []>;
+def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
+                           (ins Int64Regs:$s),
+                           "mov.b64\t{{$d1, $d2}}, $s;",
+                           []>;
+def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
+                           (ins Float64Regs:$s),
+                           "mov.b64\t{{$d1, $d2}}, $s;",
+                           []>;
+
+def FPRound_ftz : NVPTXInst<(outs Float32Regs:$d), (ins Float64Regs:$a),
+                            "cvt.rn.ftz.f32.f64 \t$d, $a;",
+                            [(set Float32Regs:$d, (fround Float64Regs:$a))]>,
+                  Requires<[doF32FTZ]>;
+
+def FPRound : NVPTXInst<(outs Float32Regs:$d), (ins Float64Regs:$a),
+                        "cvt.rn.f32.f64 \t$d, $a;",
+                        [(set Float32Regs:$d, (fround Float64Regs:$a))]>;
+
+def FPExtend_ftz : NVPTXInst<(outs Float64Regs:$d), (ins Float32Regs:$a),
+                             "cvt.ftz.f64.f32 \t$d, $a;",
+                             [(set Float64Regs:$d, (fextend Float32Regs:$a))]>,
+                   Requires<[doF32FTZ]>;
+
+def FPExtend : NVPTXInst<(outs Float64Regs:$d), (ins Float32Regs:$a),
+                         "cvt.f64.f32 \t$d, $a;",
+                         [(set Float64Regs:$d, (fextend Float32Regs:$a))]>;
+
+def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
+                     [SDNPHasChain, SDNPOptInGlue]>;
+
+//-----------------------------------
+// Control-flow
+//-----------------------------------
+
+let isTerminator=1 in {
+  let isReturn=1, isBarrier=1 in
+  def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>;
+
+  let isBranch=1 in
+  def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
+                          "@$a bra \t$target;",
+                          [(brcond Int1Regs:$a, bb:$target)]>;
+  let isBranch=1 in
+  def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
+                               "@!$a bra \t$target;",
+                               []>;
+
+  let isBranch=1, isBarrier=1 in
+  def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
+             "bra.uni \t$target;",
+             [(br bb:$target)]>;
+}
+
+def : Pat<(brcond Int32Regs:$a, bb:$target), (CBranch
+          (ISetUNEi32ri_p Int32Regs:$a, 0), bb:$target)>;
+
+// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
+// conditional branch if the target block is the next block, so that the code
+// can fall through to the target block.
+// The inversion is done by 'xor condition, 1', which will be translated to
+// (setne condition, -1).
+// Since PTX supports '@!pred bra target', we should use it.
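// Illustration (block label hypothetical): with the pattern below, the
// inverted condition folds into the predicate guard, emitting
//   @!%p1 bra \tLBB0_2;
// directly, instead of first materializing the negated predicate in a fresh
// register.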
+def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
+          (CBranchOther Int1Regs:$a, bb:$target)>;
+
+// Call
+def SDT_NVPTXCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                         SDTCisVT<1, i32> ]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
+                           [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
+                         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                          SDNPSideEffect]>;
+
+def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+def call : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
+                  [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def calltarget : Operand<i32>;
+let isCall=1 in {
+  def CALL : NVPTXInst<(outs), (ins calltarget:$dst),
+                       "call \t$dst, (1);", []>;
+}
+
+def : Pat<(call tglobaladdr:$dst),
+          (CALL tglobaladdr:$dst)>;
+def : Pat<(call texternalsym:$dst),
+          (CALL texternalsym:$dst)>;
+
+// Pseudo instructions.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : NVPTXInst<outs, ins, asmstr, pattern>;
+
+// @TODO: We use some tricks here to emit curly braces. Can we clean this up
+// a bit without TableGen modifications?
+def Callseq_Start : NVPTXInst<(outs), (ins i32imm:$amt),
+  "// Callseq Start $amt\n\t{{\n\t.reg .b32 temp_param_reg;\n\t// <end>}}",
+  [(callseq_start timm:$amt)]>;
+def Callseq_End : NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+  "\n\t//{{\n\t}}// Callseq End $amt1",
+  [(callseq_end timm:$amt1, timm:$amt2)]>;
+
+// trap instruction
+
+def trapinst : NVPTXInst<(outs), (ins),
+                         "trap;",
+                         [(trap)]>;
+
+include "NVPTXVector.td"
+
+include "NVPTXIntrinsics.td"
+
+
+//-----------------------------------
+// Notes
+//-----------------------------------
+// BSWAP is currently expanded. The following would be a more efficient
+// lowering:
+// - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit
+//   registers
+// - for sm_20, use prmt (use vector scalar mov to get the pack and
+//   unpack). sm_20 supports native 32-bit registers, but not native 16-bit
+//   registers.
diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td
new file mode 100644
index 0000000..028a94b
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -0,0 +1,1675 @@
+//===- NVPTXIntrinsics.td - PTX Intrinsics Instructions -------*- tblgen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +def immFloat0 : PatLeaf<(fpimm), [{ + float f = (float)N->getValueAPF().convertToFloat(); + return (f==0.0f); +}]>; + +def immFloat1 : PatLeaf<(fpimm), [{ + float f = (float)N->getValueAPF().convertToFloat(); + return (f==1.0f); +}]>; + +def immDouble0 : PatLeaf<(fpimm), [{ + double d = (double)N->getValueAPF().convertToDouble(); + return (d==0.0); +}]>; + +def immDouble1 : PatLeaf<(fpimm), [{ + double d = (double)N->getValueAPF().convertToDouble(); + return (d==1.0); +}]>; + + + +//----------------------------------- +// Synchronization Functions +//----------------------------------- +def INT_CUDA_SYNCTHREADS : NVPTXInst<(outs), (ins), + "bar.sync \t0;", + [(int_cuda_syncthreads)]>; +def INT_BARRIER0 : NVPTXInst<(outs), (ins), + "bar.sync \t0;", + [(int_nvvm_barrier0)]>; +def INT_BARRIER0_POPC : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred), + !strconcat("{{ \n\t", + !strconcat(".reg .pred \t%p1; \n\t", + !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t", + !strconcat("bar.red.popc.u32 \t$dst, 0, %p1; \n\t", + !strconcat("}}", ""))))), + [(set Int32Regs:$dst, (int_nvvm_barrier0_popc Int32Regs:$pred))]>; +def INT_BARRIER0_AND : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred), + !strconcat("{{ \n\t", + !strconcat(".reg .pred \t%p1; \n\t", + !strconcat(".reg .pred \t%p2; \n\t", + !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t", + !strconcat("bar.red.and.pred \t%p2, 0, %p1; \n\t", + !strconcat("selp.u32 \t$dst, 1, 0, %p2; \n\t", + !strconcat("}}", ""))))))), + [(set Int32Regs:$dst, (int_nvvm_barrier0_and Int32Regs:$pred))]>; +def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred), + !strconcat("{{ \n\t", + !strconcat(".reg .pred \t%p1; \n\t", + !strconcat(".reg .pred \t%p2; \n\t", + !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t", + !strconcat("bar.red.or.pred \t%p2, 0, %p1; \n\t", + !strconcat("selp.u32 \t$dst, 1, 0, %p2; \n\t", + !strconcat("}}", ""))))))), + [(set Int32Regs:$dst, (int_nvvm_barrier0_or Int32Regs:$pred))]>; + + +//----------------------------------- +// Explicit Memory Fence Functions +//----------------------------------- +class MEMBAR<string StrOp, Intrinsic IntOP> : + NVPTXInst<(outs), (ins), + StrOp, [(IntOP)]>; + +def INT_MEMBAR_CTA : MEMBAR<"membar.cta;", int_nvvm_membar_cta>; +def INT_MEMBAR_GL : MEMBAR<"membar.gl;", int_nvvm_membar_gl>; +def INT_MEMBAR_SYS : MEMBAR<"membar.sys;", int_nvvm_membar_sys>; + + +//----------------------------------- +// Math Functions +//----------------------------------- + +// Map min(1.0, max(0.0, x)) to sat(x) +multiclass SAT<NVPTXRegClass regclass, Operand fimm, Intrinsic IntMinOp, + Intrinsic IntMaxOp, PatLeaf f0, PatLeaf f1, string OpStr> { + + // fmin(1.0, fmax(0.0, x)) => sat(x) + def SAT11 : NVPTXInst<(outs regclass:$dst), + (ins fimm:$srcf0, fimm:$srcf1, regclass:$src), + OpStr, + [(set regclass:$dst, (IntMinOp f1:$srcf0 , + (IntMaxOp f0:$srcf1, regclass:$src)))]>; + + // fmin(1.0, fmax(x, 0.0)) => sat(x) + def SAT12 : NVPTXInst<(outs regclass:$dst), + (ins fimm:$srcf0, fimm:$srcf1, regclass:$src), + OpStr, + [(set regclass:$dst, (IntMinOp f1:$srcf0 , + (IntMaxOp regclass:$src, f0:$srcf1)))]>; + + // fmin(fmax(0.0, x), 1.0) => sat(x) + def SAT13 : NVPTXInst<(outs regclass:$dst), + (ins fimm:$srcf0, fimm:$srcf1, regclass:$src), + OpStr, + [(set regclass:$dst, (IntMinOp + (IntMaxOp f0:$srcf0, regclass:$src), f1:$srcf1))]>; + + // fmin(fmax(x, 0.0), 1.0) => sat(x) + def SAT14 : NVPTXInst<(outs regclass:$dst), + 
(ins fimm:$srcf0, fimm:$srcf1, regclass:$src),
+ OpStr,
+ [(set regclass:$dst, (IntMinOp
+ (IntMaxOp regclass:$src, f0:$srcf0), f1:$srcf1))]>;
+
+}
+// Note that max(0.0, min(x, 1.0)) cannot be mapped to sat(x), because when x
+// is NaN, max(0.0, min(x, 1.0)) is 1.0 while sat(x) is 0.0.
+// The same holds for fmax/fmin.
+
+defm SAT_fmin_fmax_f : SAT<Float32Regs, f32imm, int_nvvm_fmin_f,
+ int_nvvm_fmax_f, immFloat0, immFloat1,
+ "cvt.sat.f32.f32 \t$dst, $src; \n">;
+defm SAT_fmin_fmax_d : SAT<Float64Regs, f64imm, int_nvvm_fmin_d,
+ int_nvvm_fmax_d, immDouble0, immDouble1,
+ "cvt.sat.f64.f64 \t$dst, $src; \n">;
+
+
+// We need a full string for OpcStr here because we need to deal with cases
+// like INT_PTX_RECIP.
+class F_MATH_1<string OpcStr, NVPTXRegClass target_regclass,
+ NVPTXRegClass src_regclass, Intrinsic IntOP>
+ : NVPTXInst<(outs target_regclass:$dst), (ins src_regclass:$src0),
+ OpcStr,
+ [(set target_regclass:$dst, (IntOP src_regclass:$src0))]>;
+
+// We need a full string for OpcStr here because we need to deal with cases
+// like INT_PTX_NATIVE_POWR_F.
+class F_MATH_2<string OpcStr, NVPTXRegClass t_regclass,
+ NVPTXRegClass s0_regclass, NVPTXRegClass s1_regclass, Intrinsic IntOP>
+ : NVPTXInst<(outs t_regclass:$dst),
+ (ins s0_regclass:$src0, s1_regclass:$src1),
+ OpcStr,
+ [(set t_regclass:$dst, (IntOP s0_regclass:$src0, s1_regclass:$src1))]>;
+
+class F_MATH_3<string OpcStr, NVPTXRegClass t_regclass,
+ NVPTXRegClass s0_regclass, NVPTXRegClass s1_regclass,
+ NVPTXRegClass s2_regclass, Intrinsic IntOP>
+ : NVPTXInst<(outs t_regclass:$dst),
+ (ins s0_regclass:$src0, s1_regclass:$src1, s2_regclass:$src2),
+ OpcStr,
+ [(set t_regclass:$dst,
+ (IntOP s0_regclass:$src0, s1_regclass:$src1, s2_regclass:$src2))]>;
+
+//
+// MISC
+//
+
+def INT_NVVM_CLZ_I : F_MATH_1<"clz.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
+ int_nvvm_clz_i>;
+def INT_NVVM_CLZ_LL : F_MATH_1<"clz.b64 \t$dst, $src0;", Int32Regs, Int64Regs,
+ int_nvvm_clz_ll>;
+
+def INT_NVVM_POPC_I : F_MATH_1<"popc.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
+ int_nvvm_popc_i>;
+def INT_NVVM_POPC_LL : F_MATH_1<"popc.b64 \t$dst, $src0;", Int32Regs, Int64Regs,
+ int_nvvm_popc_ll>;
+
+def INT_NVVM_PRMT : F_MATH_3<"prmt.b32 \t$dst, $src0, $src1, $src2;", Int32Regs,
+ Int32Regs, Int32Regs, Int32Regs, int_nvvm_prmt>;
+
+//
+// Min Max
+//
+
+def INT_NVVM_MIN_I : F_MATH_2<"min.s32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_min_i>;
+def INT_NVVM_MIN_UI : F_MATH_2<"min.u32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_min_ui>;
+
+def INT_NVVM_MIN_LL : F_MATH_2<"min.s64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_min_ll>;
+def INT_NVVM_MIN_ULL : F_MATH_2<"min.u64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_min_ull>;
+
+def INT_NVVM_MAX_I : F_MATH_2<"max.s32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_max_i>;
+def INT_NVVM_MAX_UI : F_MATH_2<"max.u32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_max_ui>;
+
+def INT_NVVM_MAX_LL : F_MATH_2<"max.s64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_max_ll>;
+def INT_NVVM_MAX_ULL : F_MATH_2<"max.u64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_max_ull>;
+
+def INT_NVVM_FMIN_F : F_MATH_2<"min.f32 \t$dst, $src0, $src1;", Float32Regs,
+ Float32Regs, Float32Regs, int_nvvm_fmin_f>;
+def INT_NVVM_FMIN_FTZ_F : F_MATH_2<"min.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_f>;
+
+def
INT_NVVM_FMAX_F : F_MATH_2<"max.f32 \t$dst, $src0, $src1;", Float32Regs, + Float32Regs, Float32Regs, int_nvvm_fmax_f>; +def INT_NVVM_FMAX_FTZ_F : F_MATH_2<"max.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_f>; + +def INT_NVVM_FMIN_D : F_MATH_2<"min.f64 \t$dst, $src0, $src1;", Float64Regs, + Float64Regs, Float64Regs, int_nvvm_fmin_d>; +def INT_NVVM_FMAX_D : F_MATH_2<"max.f64 \t$dst, $src0, $src1;", Float64Regs, + Float64Regs, Float64Regs, int_nvvm_fmax_d>; + +// +// Multiplication +// + +def INT_NVVM_MULHI_I : F_MATH_2<"mul.hi.s32 \t$dst, $src0, $src1;", Int32Regs, + Int32Regs, Int32Regs, int_nvvm_mulhi_i>; +def INT_NVVM_MULHI_UI : F_MATH_2<"mul.hi.u32 \t$dst, $src0, $src1;", Int32Regs, + Int32Regs, Int32Regs, int_nvvm_mulhi_ui>; + +def INT_NVVM_MULHI_LL : F_MATH_2<"mul.hi.s64 \t$dst, $src0, $src1;", Int64Regs, + Int64Regs, Int64Regs, int_nvvm_mulhi_ll>; +def INT_NVVM_MULHI_ULL : F_MATH_2<"mul.hi.u64 \t$dst, $src0, $src1;", Int64Regs, + Int64Regs, Int64Regs, int_nvvm_mulhi_ull>; + +def INT_NVVM_MUL_RN_FTZ_F : F_MATH_2<"mul.rn.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rn_ftz_f>; +def INT_NVVM_MUL_RN_F : F_MATH_2<"mul.rn.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rn_f>; +def INT_NVVM_MUL_RZ_FTZ_F : F_MATH_2<"mul.rz.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rz_ftz_f>; +def INT_NVVM_MUL_RZ_F : F_MATH_2<"mul.rz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rz_f>; +def INT_NVVM_MUL_RM_FTZ_F : F_MATH_2<"mul.rm.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rm_ftz_f>; +def INT_NVVM_MUL_RM_F : F_MATH_2<"mul.rm.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rm_f>; +def INT_NVVM_MUL_RP_FTZ_F : F_MATH_2<"mul.rp.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rp_ftz_f>; +def INT_NVVM_MUL_RP_F : F_MATH_2<"mul.rp.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rp_f>; + +def INT_NVVM_MUL_RN_D : F_MATH_2<"mul.rn.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rn_d>; +def INT_NVVM_MUL_RZ_D : F_MATH_2<"mul.rz.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rz_d>; +def INT_NVVM_MUL_RM_D : F_MATH_2<"mul.rm.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rm_d>; +def INT_NVVM_MUL_RP_D : F_MATH_2<"mul.rp.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rp_d>; + +def INT_NVVM_MUL24_I : F_MATH_2<"mul24.lo.s32 \t$dst, $src0, $src1;", + Int32Regs, Int32Regs, Int32Regs, int_nvvm_mul24_i>; +def INT_NVVM_MUL24_UI : F_MATH_2<"mul24.lo.u32 \t$dst, $src0, $src1;", + Int32Regs, Int32Regs, Int32Regs, int_nvvm_mul24_ui>; + +// +// Div +// + +def INT_NVVM_DIV_APPROX_FTZ_F + : F_MATH_2<"div.approx.ftz.f32 \t$dst, $src0, $src1;", Float32Regs, + Float32Regs, Float32Regs, int_nvvm_div_approx_ftz_f>; +def INT_NVVM_DIV_APPROX_F : F_MATH_2<"div.approx.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_approx_f>; + +def INT_NVVM_DIV_RN_FTZ_F : F_MATH_2<"div.rn.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rn_ftz_f>; +def INT_NVVM_DIV_RN_F : F_MATH_2<"div.rn.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rn_f>; +def INT_NVVM_DIV_RZ_FTZ_F : 
F_MATH_2<"div.rz.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rz_ftz_f>; +def INT_NVVM_DIV_RZ_F : F_MATH_2<"div.rz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rz_f>; +def INT_NVVM_DIV_RM_FTZ_F : F_MATH_2<"div.rm.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rm_ftz_f>; +def INT_NVVM_DIV_RM_F : F_MATH_2<"div.rm.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rm_f>; +def INT_NVVM_DIV_RP_FTZ_F : F_MATH_2<"div.rp.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rp_ftz_f>; +def INT_NVVM_DIV_RP_F : F_MATH_2<"div.rp.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rp_f>; + +def INT_NVVM_DIV_RN_D : F_MATH_2<"div.rn.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rn_d>; +def INT_NVVM_DIV_RZ_D : F_MATH_2<"div.rz.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rz_d>; +def INT_NVVM_DIV_RM_D : F_MATH_2<"div.rm.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rm_d>; +def INT_NVVM_DIV_RP_D : F_MATH_2<"div.rp.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rp_d>; + +// +// Brev +// + +def INT_NVVM_BREV32 : F_MATH_1<"brev.b32 \t$dst, $src0;", Int32Regs, Int32Regs, + int_nvvm_brev32>; +def INT_NVVM_BREV64 : F_MATH_1<"brev.b64 \t$dst, $src0;", Int64Regs, Int64Regs, + int_nvvm_brev64>; + +// +// Sad +// + +def INT_NVVM_SAD_I : F_MATH_3<"sad.s32 \t$dst, $src0, $src1, $src2;", + Int32Regs, Int32Regs, Int32Regs, Int32Regs, int_nvvm_sad_i>; +def INT_NVVM_SAD_UI : F_MATH_3<"sad.u32 \t$dst, $src0, $src1, $src2;", + Int32Regs, Int32Regs, Int32Regs, Int32Regs, int_nvvm_sad_ui>; + +// +// Floor Ceil +// + +def INT_NVVM_FLOOR_FTZ_F : F_MATH_1<"cvt.rmi.ftz.f32.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_floor_ftz_f>; +def INT_NVVM_FLOOR_F : F_MATH_1<"cvt.rmi.f32.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_floor_f>; +def INT_NVVM_FLOOR_D : F_MATH_1<"cvt.rmi.f64.f64 \t$dst, $src0;", + Float64Regs, Float64Regs, int_nvvm_floor_d>; + +def INT_NVVM_CEIL_FTZ_F : F_MATH_1<"cvt.rpi.ftz.f32.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_ceil_ftz_f>; +def INT_NVVM_CEIL_F : F_MATH_1<"cvt.rpi.f32.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_ceil_f>; +def INT_NVVM_CEIL_D : F_MATH_1<"cvt.rpi.f64.f64 \t$dst, $src0;", + Float64Regs, Float64Regs, int_nvvm_ceil_d>; + +// +// Abs +// + +def INT_NVVM_ABS_I : F_MATH_1<"abs.s32 \t$dst, $src0;", Int32Regs, Int32Regs, + int_nvvm_abs_i>; +def INT_NVVM_ABS_LL : F_MATH_1<"abs.s64 \t$dst, $src0;", Int64Regs, Int64Regs, + int_nvvm_abs_ll>; + +def INT_NVVM_FABS_FTZ_F : F_MATH_1<"abs.ftz.f32 \t$dst, $src0;", Float32Regs, + Float32Regs, int_nvvm_fabs_ftz_f>; +def INT_NVVM_FABS_F : F_MATH_1<"abs.f32 \t$dst, $src0;", Float32Regs, + Float32Regs, int_nvvm_fabs_f>; + +def INT_NVVM_FABS_D : F_MATH_1<"abs.f64 \t$dst, $src0;", Float64Regs, + Float64Regs, int_nvvm_fabs_d>; + +// +// Round +// + +def INT_NVVM_ROUND_FTZ_F : F_MATH_1<"cvt.rni.ftz.f32.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_round_ftz_f>; +def INT_NVVM_ROUND_F : F_MATH_1<"cvt.rni.f32.f32 \t$dst, $src0;", Float32Regs, + Float32Regs, int_nvvm_round_f>; + +def INT_NVVM_ROUND_D : F_MATH_1<"cvt.rni.f64.f64 \t$dst, $src0;", Float64Regs, + Float64Regs, int_nvvm_round_d>; + +// +// Trunc +// + +def INT_NVVM_TRUNC_FTZ_F : 
F_MATH_1<"cvt.rzi.ftz.f32.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_trunc_ftz_f>; +def INT_NVVM_TRUNC_F : F_MATH_1<"cvt.rzi.f32.f32 \t$dst, $src0;", Float32Regs, + Float32Regs, int_nvvm_trunc_f>; + +def INT_NVVM_TRUNC_D : F_MATH_1<"cvt.rzi.f64.f64 \t$dst, $src0;", Float64Regs, + Float64Regs, int_nvvm_trunc_d>; + +// +// Saturate +// + +def INT_NVVM_SATURATE_FTZ_F : F_MATH_1<"cvt.sat.ftz.f32.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_saturate_ftz_f>; +def INT_NVVM_SATURATE_F : F_MATH_1<"cvt.sat.f32.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_saturate_f>; + +def INT_NVVM_SATURATE_D : F_MATH_1<"cvt.sat.f64.f64 \t$dst, $src0;", + Float64Regs, Float64Regs, int_nvvm_saturate_d>; + +// +// Exp2 Log2 +// + +def INT_NVVM_EX2_APPROX_FTZ_F : F_MATH_1<"ex2.approx.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_ex2_approx_ftz_f>; +def INT_NVVM_EX2_APPROX_F : F_MATH_1<"ex2.approx.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_ex2_approx_f>; +def INT_NVVM_EX2_APPROX_D : F_MATH_1<"ex2.approx.f64 \t$dst, $src0;", + Float64Regs, Float64Regs, int_nvvm_ex2_approx_d>; + +def INT_NVVM_LG2_APPROX_FTZ_F : F_MATH_1<"lg2.approx.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_lg2_approx_ftz_f>; +def INT_NVVM_LG2_APPROX_F : F_MATH_1<"lg2.approx.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_lg2_approx_f>; +def INT_NVVM_LG2_APPROX_D : F_MATH_1<"lg2.approx.f64 \t$dst, $src0;", + Float64Regs, Float64Regs, int_nvvm_lg2_approx_d>; + +// +// Sin Cos +// + +def INT_NVVM_SIN_APPROX_FTZ_F : F_MATH_1<"sin.approx.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_sin_approx_ftz_f>; +def INT_NVVM_SIN_APPROX_F : F_MATH_1<"sin.approx.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_sin_approx_f>; + +def INT_NVVM_COS_APPROX_FTZ_F : F_MATH_1<"cos.approx.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_cos_approx_ftz_f>; +def INT_NVVM_COS_APPROX_F : F_MATH_1<"cos.approx.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_cos_approx_f>; + +// +// Fma +// + +def INT_NVVM_FMA_RN_FTZ_F + : F_MATH_3<"fma.rn.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs, + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rn_ftz_f>; +def INT_NVVM_FMA_RN_F : F_MATH_3<"fma.rn.f32 \t$dst, $src0, $src1, $src2;", + Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rn_f>; +def INT_NVVM_FMA_RZ_FTZ_F + : F_MATH_3<"fma.rz.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs, + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rz_ftz_f>; +def INT_NVVM_FMA_RZ_F : F_MATH_3<"fma.rz.f32 \t$dst, $src0, $src1, $src2;", + Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rz_f>; +def INT_NVVM_FMA_RM_FTZ_F + : F_MATH_3<"fma.rm.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs, + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rm_ftz_f>; +def INT_NVVM_FMA_RM_F : F_MATH_3<"fma.rm.f32 \t$dst, $src0, $src1, $src2;", + Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rm_f>; +def INT_NVVM_FMA_RP_FTZ_F + : F_MATH_3<"fma.rp.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs, + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rp_ftz_f>; +def INT_NVVM_FMA_RP_F : F_MATH_3<"fma.rp.f32 \t$dst, $src0, $src1, $src2;", + Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rp_f>; + +def INT_NVVM_FMA_RN_D : F_MATH_3<"fma.rn.f64 \t$dst, $src0, $src1, $src2;", + Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rn_d>; +def INT_NVVM_FMA_RZ_D : F_MATH_3<"fma.rz.f64 \t$dst, 
$src0, $src1, $src2;", + Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rz_d>; +def INT_NVVM_FMA_RM_D : F_MATH_3<"fma.rm.f64 \t$dst, $src0, $src1, $src2;", + Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rm_d>; +def INT_NVVM_FMA_RP_D : F_MATH_3<"fma.rp.f64 \t$dst, $src0, $src1, $src2;", + Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rp_d>; + +// +// Rcp +// + +def INT_NVVM_RCP_RN_FTZ_F : F_MATH_1<"rcp.rn.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_rcp_rn_ftz_f>; +def INT_NVVM_RCP_RN_F : F_MATH_1<"rcp.rn.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_rcp_rn_f>; +def INT_NVVM_RCP_RZ_FTZ_F : F_MATH_1<"rcp.rz.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_rcp_rz_ftz_f>; +def INT_NVVM_RCP_RZ_F : F_MATH_1<"rcp.rz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_rcp_rz_f>; +def INT_NVVM_RCP_RM_FTZ_F : F_MATH_1<"rcp.rm.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_rcp_rm_ftz_f>; +def INT_NVVM_RCP_RM_F : F_MATH_1<"rcp.rm.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_rcp_rm_f>; +def INT_NVVM_RCP_RP_FTZ_F : F_MATH_1<"rcp.rp.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_rcp_rp_ftz_f>; +def INT_NVVM_RCP_RP_F : F_MATH_1<"rcp.rp.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_rcp_rp_f>; + +def INT_NVVM_RCP_RN_D : F_MATH_1<"rcp.rn.f64 \t$dst, $src0;", Float64Regs, + Float64Regs, int_nvvm_rcp_rn_d>; +def INT_NVVM_RCP_RZ_D : F_MATH_1<"rcp.rz.f64 \t$dst, $src0;", Float64Regs, + Float64Regs, int_nvvm_rcp_rz_d>; +def INT_NVVM_RCP_RM_D : F_MATH_1<"rcp.rm.f64 \t$dst, $src0;", Float64Regs, + Float64Regs, int_nvvm_rcp_rm_d>; +def INT_NVVM_RCP_RP_D : F_MATH_1<"rcp.rp.f64 \t$dst, $src0;", Float64Regs, + Float64Regs, int_nvvm_rcp_rp_d>; + +def INT_NVVM_RCP_APPROX_FTZ_D : F_MATH_1<"rcp.approx.ftz.f64 \t$dst, $src0;", + Float64Regs, Float64Regs, int_nvvm_rcp_approx_ftz_d>; + +// +// Sqrt +// + +def INT_NVVM_SQRT_RN_FTZ_F : F_MATH_1<"sqrt.rn.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_sqrt_rn_ftz_f>; +def INT_NVVM_SQRT_RN_F : F_MATH_1<"sqrt.rn.f32 \t$dst, $src0;", Float32Regs, + Float32Regs, int_nvvm_sqrt_rn_f>; +def INT_NVVM_SQRT_RZ_FTZ_F : F_MATH_1<"sqrt.rz.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_sqrt_rz_ftz_f>; +def INT_NVVM_SQRT_RZ_F : F_MATH_1<"sqrt.rz.f32 \t$dst, $src0;", Float32Regs, + Float32Regs, int_nvvm_sqrt_rz_f>; +def INT_NVVM_SQRT_RM_FTZ_F : F_MATH_1<"sqrt.rm.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_sqrt_rm_ftz_f>; +def INT_NVVM_SQRT_RM_F : F_MATH_1<"sqrt.rm.f32 \t$dst, $src0;", Float32Regs, + Float32Regs, int_nvvm_sqrt_rm_f>; +def INT_NVVM_SQRT_RP_FTZ_F : F_MATH_1<"sqrt.rp.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_sqrt_rp_ftz_f>; +def INT_NVVM_SQRT_RP_F : F_MATH_1<"sqrt.rp.f32 \t$dst, $src0;", Float32Regs, + Float32Regs, int_nvvm_sqrt_rp_f>; +def INT_NVVM_SQRT_APPROX_FTZ_F : F_MATH_1<"sqrt.approx.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_sqrt_approx_ftz_f>; +def INT_NVVM_SQRT_APPROX_F : F_MATH_1<"sqrt.approx.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_sqrt_approx_f>; + +def INT_NVVM_SQRT_RN_D : F_MATH_1<"sqrt.rn.f64 \t$dst, $src0;", Float64Regs, + Float64Regs, int_nvvm_sqrt_rn_d>; +def INT_NVVM_SQRT_RZ_D : F_MATH_1<"sqrt.rz.f64 \t$dst, $src0;", Float64Regs, + Float64Regs, int_nvvm_sqrt_rz_d>; +def INT_NVVM_SQRT_RM_D : F_MATH_1<"sqrt.rm.f64 \t$dst, $src0;", Float64Regs, + Float64Regs, int_nvvm_sqrt_rm_d>; +def INT_NVVM_SQRT_RP_D : 
F_MATH_1<"sqrt.rp.f64 \t$dst, $src0;", Float64Regs, + Float64Regs, int_nvvm_sqrt_rp_d>; + +// +// Rsqrt +// + +def INT_NVVM_RSQRT_APPROX_FTZ_F + : F_MATH_1<"rsqrt.approx.ftz.f32 \t$dst, $src0;", Float32Regs, Float32Regs, + int_nvvm_rsqrt_approx_ftz_f>; +def INT_NVVM_RSQRT_APPROX_F : F_MATH_1<"rsqrt.approx.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_rsqrt_approx_f>; +def INT_NVVM_RSQRT_APPROX_D : F_MATH_1<"rsqrt.approx.f64 \t$dst, $src0;", + Float64Regs, Float64Regs, int_nvvm_rsqrt_approx_d>; + +// +// Add +// + +def INT_NVVM_ADD_RN_FTZ_F : F_MATH_2<"add.rn.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rn_ftz_f>; +def INT_NVVM_ADD_RN_F : F_MATH_2<"add.rn.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rn_f>; +def INT_NVVM_ADD_RZ_FTZ_F : F_MATH_2<"add.rz.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rz_ftz_f>; +def INT_NVVM_ADD_RZ_F : F_MATH_2<"add.rz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rz_f>; +def INT_NVVM_ADD_RM_FTZ_F : F_MATH_2<"add.rm.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rm_ftz_f>; +def INT_NVVM_ADD_RM_F : F_MATH_2<"add.rm.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rm_f>; +def INT_NVVM_ADD_RP_FTZ_F : F_MATH_2<"add.rp.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rp_ftz_f>; +def INT_NVVM_ADD_RP_F : F_MATH_2<"add.rp.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rp_f>; + +def INT_NVVM_ADD_RN_D : F_MATH_2<"add.rn.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rn_d>; +def INT_NVVM_ADD_RZ_D : F_MATH_2<"add.rz.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rz_d>; +def INT_NVVM_ADD_RM_D : F_MATH_2<"add.rm.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rm_d>; +def INT_NVVM_ADD_RP_D : F_MATH_2<"add.rp.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rp_d>; + +// +// Convert +// + +def INT_NVVM_D2F_RN_FTZ : F_MATH_1<"cvt.rn.ftz.f32.f64 \t$dst, $src0;", + Float32Regs, Float64Regs, int_nvvm_d2f_rn_ftz>; +def INT_NVVM_D2F_RN : F_MATH_1<"cvt.rn.f32.f64 \t$dst, $src0;", + Float32Regs, Float64Regs, int_nvvm_d2f_rn>; +def INT_NVVM_D2F_RZ_FTZ : F_MATH_1<"cvt.rz.ftz.f32.f64 \t$dst, $src0;", + Float32Regs, Float64Regs, int_nvvm_d2f_rz_ftz>; +def INT_NVVM_D2F_RZ : F_MATH_1<"cvt.rz.f32.f64 \t$dst, $src0;", + Float32Regs, Float64Regs, int_nvvm_d2f_rz>; +def INT_NVVM_D2F_RM_FTZ : F_MATH_1<"cvt.rm.ftz.f32.f64 \t$dst, $src0;", + Float32Regs, Float64Regs, int_nvvm_d2f_rm_ftz>; +def INT_NVVM_D2F_RM : F_MATH_1<"cvt.rm.f32.f64 \t$dst, $src0;", + Float32Regs, Float64Regs, int_nvvm_d2f_rm>; +def INT_NVVM_D2F_RP_FTZ : F_MATH_1<"cvt.rp.ftz.f32.f64 \t$dst, $src0;", + Float32Regs, Float64Regs, int_nvvm_d2f_rp_ftz>; +def INT_NVVM_D2F_RP : F_MATH_1<"cvt.rp.f32.f64 \t$dst, $src0;", + Float32Regs, Float64Regs, int_nvvm_d2f_rp>; + +def INT_NVVM_D2I_RN : F_MATH_1<"cvt.rni.s32.f64 \t$dst, $src0;", + Int32Regs, Float64Regs, int_nvvm_d2i_rn>; +def INT_NVVM_D2I_RZ : F_MATH_1<"cvt.rzi.s32.f64 \t$dst, $src0;", + Int32Regs, Float64Regs, int_nvvm_d2i_rz>; +def INT_NVVM_D2I_RM : F_MATH_1<"cvt.rmi.s32.f64 \t$dst, $src0;", + Int32Regs, Float64Regs, int_nvvm_d2i_rm>; +def INT_NVVM_D2I_RP : F_MATH_1<"cvt.rpi.s32.f64 \t$dst, $src0;", + Int32Regs, Float64Regs, int_nvvm_d2i_rp>; 
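+// As an illustration of the cvt rounding suffixes used above and below (a
+// sketch of the standard IEEE rounding modes; the sample values are ours,
+// not from this patch):
+//   cvt.rni.s32.f64:  2.5 ->  2   -2.5 -> -2   (round to nearest even)
+//   cvt.rzi.s32.f64:  2.5 ->  2   -2.5 -> -2   (round toward zero)
+//   cvt.rmi.s32.f64:  2.5 ->  2   -2.5 -> -3   (round toward -infinity)
+//   cvt.rpi.s32.f64:  2.5 ->  3   -2.5 -> -2   (round toward +infinity)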
+ +def INT_NVVM_D2UI_RN : F_MATH_1<"cvt.rni.u32.f64 \t$dst, $src0;", + Int32Regs, Float64Regs, int_nvvm_d2ui_rn>; +def INT_NVVM_D2UI_RZ : F_MATH_1<"cvt.rzi.u32.f64 \t$dst, $src0;", + Int32Regs, Float64Regs, int_nvvm_d2ui_rz>; +def INT_NVVM_D2UI_RM : F_MATH_1<"cvt.rmi.u32.f64 \t$dst, $src0;", + Int32Regs, Float64Regs, int_nvvm_d2ui_rm>; +def INT_NVVM_D2UI_RP : F_MATH_1<"cvt.rpi.u32.f64 \t$dst, $src0;", + Int32Regs, Float64Regs, int_nvvm_d2ui_rp>; + +def INT_NVVM_I2D_RN : F_MATH_1<"cvt.rn.f64.s32 \t$dst, $src0;", + Float64Regs, Int32Regs, int_nvvm_i2d_rn>; +def INT_NVVM_I2D_RZ : F_MATH_1<"cvt.rz.f64.s32 \t$dst, $src0;", + Float64Regs, Int32Regs, int_nvvm_i2d_rz>; +def INT_NVVM_I2D_RM : F_MATH_1<"cvt.rm.f64.s32 \t$dst, $src0;", + Float64Regs, Int32Regs, int_nvvm_i2d_rm>; +def INT_NVVM_I2D_RP : F_MATH_1<"cvt.rp.f64.s32 \t$dst, $src0;", + Float64Regs, Int32Regs, int_nvvm_i2d_rp>; + +def INT_NVVM_UI2D_RN : F_MATH_1<"cvt.rn.f64.u32 \t$dst, $src0;", + Float64Regs, Int32Regs, int_nvvm_ui2d_rn>; +def INT_NVVM_UI2D_RZ : F_MATH_1<"cvt.rz.f64.u32 \t$dst, $src0;", + Float64Regs, Int32Regs, int_nvvm_ui2d_rz>; +def INT_NVVM_UI2D_RM : F_MATH_1<"cvt.rm.f64.u32 \t$dst, $src0;", + Float64Regs, Int32Regs, int_nvvm_ui2d_rm>; +def INT_NVVM_UI2D_RP : F_MATH_1<"cvt.rp.f64.u32 \t$dst, $src0;", + Float64Regs, Int32Regs, int_nvvm_ui2d_rp>; + +def INT_NVVM_F2I_RN_FTZ : F_MATH_1<"cvt.rni.ftz.s32.f32 \t$dst, $src0;", + Int32Regs, Float32Regs, int_nvvm_f2i_rn_ftz>; +def INT_NVVM_F2I_RN : F_MATH_1<"cvt.rni.s32.f32 \t$dst, $src0;", Int32Regs, + Float32Regs, int_nvvm_f2i_rn>; +def INT_NVVM_F2I_RZ_FTZ : F_MATH_1<"cvt.rzi.ftz.s32.f32 \t$dst, $src0;", + Int32Regs, Float32Regs, int_nvvm_f2i_rz_ftz>; +def INT_NVVM_F2I_RZ : F_MATH_1<"cvt.rzi.s32.f32 \t$dst, $src0;", Int32Regs, + Float32Regs, int_nvvm_f2i_rz>; +def INT_NVVM_F2I_RM_FTZ : F_MATH_1<"cvt.rmi.ftz.s32.f32 \t$dst, $src0;", + Int32Regs, Float32Regs, int_nvvm_f2i_rm_ftz>; +def INT_NVVM_F2I_RM : F_MATH_1<"cvt.rmi.s32.f32 \t$dst, $src0;", Int32Regs, + Float32Regs, int_nvvm_f2i_rm>; +def INT_NVVM_F2I_RP_FTZ : F_MATH_1<"cvt.rpi.ftz.s32.f32 \t$dst, $src0;", + Int32Regs, Float32Regs, int_nvvm_f2i_rp_ftz>; +def INT_NVVM_F2I_RP : F_MATH_1<"cvt.rpi.s32.f32 \t$dst, $src0;", Int32Regs, + Float32Regs, int_nvvm_f2i_rp>; + +def INT_NVVM_F2UI_RN_FTZ : F_MATH_1<"cvt.rni.ftz.u32.f32 \t$dst, $src0;", + Int32Regs, Float32Regs, int_nvvm_f2ui_rn_ftz>; +def INT_NVVM_F2UI_RN : F_MATH_1<"cvt.rni.u32.f32 \t$dst, $src0;", Int32Regs, + Float32Regs, int_nvvm_f2ui_rn>; +def INT_NVVM_F2UI_RZ_FTZ : F_MATH_1<"cvt.rzi.ftz.u32.f32 \t$dst, $src0;", + Int32Regs, Float32Regs, int_nvvm_f2ui_rz_ftz>; +def INT_NVVM_F2UI_RZ : F_MATH_1<"cvt.rzi.u32.f32 \t$dst, $src0;", Int32Regs, + Float32Regs, int_nvvm_f2ui_rz>; +def INT_NVVM_F2UI_RM_FTZ : F_MATH_1<"cvt.rmi.ftz.u32.f32 \t$dst, $src0;", + Int32Regs, Float32Regs, int_nvvm_f2ui_rm_ftz>; +def INT_NVVM_F2UI_RM : F_MATH_1<"cvt.rmi.u32.f32 \t$dst, $src0;", Int32Regs, + Float32Regs, int_nvvm_f2ui_rm>; +def INT_NVVM_F2UI_RP_FTZ : F_MATH_1<"cvt.rpi.ftz.u32.f32 \t$dst, $src0;", + Int32Regs, Float32Regs, int_nvvm_f2ui_rp_ftz>; +def INT_NVVM_F2UI_RP : F_MATH_1<"cvt.rpi.u32.f32 \t$dst, $src0;", Int32Regs, + Float32Regs, int_nvvm_f2ui_rp>; + +def INT_NVVM_I2F_RN : F_MATH_1<"cvt.rn.f32.s32 \t$dst, $src0;", Float32Regs, + Int32Regs, int_nvvm_i2f_rn>; +def INT_NVVM_I2F_RZ : F_MATH_1<"cvt.rz.f32.s32 \t$dst, $src0;", Float32Regs, + Int32Regs, int_nvvm_i2f_rz>; +def INT_NVVM_I2F_RM : F_MATH_1<"cvt.rm.f32.s32 \t$dst, $src0;", Float32Regs, + Int32Regs, int_nvvm_i2f_rm>; +def 
INT_NVVM_I2F_RP : F_MATH_1<"cvt.rp.f32.s32 \t$dst, $src0;", Float32Regs, + Int32Regs, int_nvvm_i2f_rp>; + +def INT_NVVM_UI2F_RN : F_MATH_1<"cvt.rn.f32.u32 \t$dst, $src0;", Float32Regs, + Int32Regs, int_nvvm_ui2f_rn>; +def INT_NVVM_UI2F_RZ : F_MATH_1<"cvt.rz.f32.u32 \t$dst, $src0;", Float32Regs, + Int32Regs, int_nvvm_ui2f_rz>; +def INT_NVVM_UI2F_RM : F_MATH_1<"cvt.rm.f32.u32 \t$dst, $src0;", Float32Regs, + Int32Regs, int_nvvm_ui2f_rm>; +def INT_NVVM_UI2F_RP : F_MATH_1<"cvt.rp.f32.u32 \t$dst, $src0;", Float32Regs, + Int32Regs, int_nvvm_ui2f_rp>; + +def INT_NVVM_LOHI_I2D : F_MATH_2<"mov.b64 \t$dst, {{$src0, $src1}};", + Float64Regs, Int32Regs, Int32Regs, int_nvvm_lohi_i2d>; + +def INT_NVVM_D2I_LO : F_MATH_1<!strconcat("{{\n\t", + !strconcat(".reg .b32 %temp; \n\t", + !strconcat("mov.b64 \t{$dst, %temp}, $src0;\n\t", + "}}"))), + Int32Regs, Float64Regs, int_nvvm_d2i_lo>; +def INT_NVVM_D2I_HI : F_MATH_1<!strconcat("{{\n\t", + !strconcat(".reg .b32 %temp; \n\t", + !strconcat("mov.b64 \t{%temp, $dst}, $src0;\n\t", + "}}"))), + Int32Regs, Float64Regs, int_nvvm_d2i_hi>; + +def INT_NVVM_F2LL_RN_FTZ : F_MATH_1<"cvt.rni.ftz.s64.f32 \t$dst, $src0;", + Int64Regs, Float32Regs, int_nvvm_f2ll_rn_ftz>; +def INT_NVVM_F2LL_RN : F_MATH_1<"cvt.rni.s64.f32 \t$dst, $src0;", Int64Regs, + Float32Regs, int_nvvm_f2ll_rn>; +def INT_NVVM_F2LL_RZ_FTZ : F_MATH_1<"cvt.rzi.ftz.s64.f32 \t$dst, $src0;", + Int64Regs, Float32Regs, int_nvvm_f2ll_rz_ftz>; +def INT_NVVM_F2LL_RZ : F_MATH_1<"cvt.rzi.s64.f32 \t$dst, $src0;", Int64Regs, + Float32Regs, int_nvvm_f2ll_rz>; +def INT_NVVM_F2LL_RM_FTZ : F_MATH_1<"cvt.rmi.ftz.s64.f32 \t$dst, $src0;", + Int64Regs, Float32Regs, int_nvvm_f2ll_rm_ftz>; +def INT_NVVM_F2LL_RM : F_MATH_1<"cvt.rmi.s64.f32 \t$dst, $src0;", Int64Regs, + Float32Regs, int_nvvm_f2ll_rm>; +def INT_NVVM_F2LL_RP_FTZ : F_MATH_1<"cvt.rpi.ftz.s64.f32 \t$dst, $src0;", + Int64Regs, Float32Regs, int_nvvm_f2ll_rp_ftz>; +def INT_NVVM_F2LL_RP : F_MATH_1<"cvt.rpi.s64.f32 \t$dst, $src0;", Int64Regs, + Float32Regs, int_nvvm_f2ll_rp>; + +def INT_NVVM_F2ULL_RN_FTZ : F_MATH_1<"cvt.rni.ftz.u64.f32 \t$dst, $src0;", + Int64Regs, Float32Regs, int_nvvm_f2ull_rn_ftz>; +def INT_NVVM_F2ULL_RN : F_MATH_1<"cvt.rni.u64.f32 \t$dst, $src0;", Int64Regs, + Float32Regs, int_nvvm_f2ull_rn>; +def INT_NVVM_F2ULL_RZ_FTZ : F_MATH_1<"cvt.rzi.ftz.u64.f32 \t$dst, $src0;", + Int64Regs, Float32Regs, int_nvvm_f2ull_rz_ftz>; +def INT_NVVM_F2ULL_RZ : F_MATH_1<"cvt.rzi.u64.f32 \t$dst, $src0;", Int64Regs, + Float32Regs, int_nvvm_f2ull_rz>; +def INT_NVVM_F2ULL_RM_FTZ : F_MATH_1<"cvt.rmi.ftz.u64.f32 \t$dst, $src0;", + Int64Regs, Float32Regs, int_nvvm_f2ull_rm_ftz>; +def INT_NVVM_F2ULL_RM : F_MATH_1<"cvt.rmi.u64.f32 \t$dst, $src0;", Int64Regs, + Float32Regs, int_nvvm_f2ull_rm>; +def INT_NVVM_F2ULL_RP_FTZ : F_MATH_1<"cvt.rpi.ftz.u64.f32 \t$dst, $src0;", + Int64Regs, Float32Regs, int_nvvm_f2ull_rp_ftz>; +def INT_NVVM_F2ULL_RP : F_MATH_1<"cvt.rpi.u64.f32 \t$dst, $src0;", Int64Regs, + Float32Regs, int_nvvm_f2ull_rp>; + +def INT_NVVM_D2LL_RN : F_MATH_1<"cvt.rni.s64.f64 \t$dst, $src0;", Int64Regs, + Float64Regs, int_nvvm_d2ll_rn>; +def INT_NVVM_D2LL_RZ : F_MATH_1<"cvt.rzi.s64.f64 \t$dst, $src0;", Int64Regs, + Float64Regs, int_nvvm_d2ll_rz>; +def INT_NVVM_D2LL_RM : F_MATH_1<"cvt.rmi.s64.f64 \t$dst, $src0;", Int64Regs, + Float64Regs, int_nvvm_d2ll_rm>; +def INT_NVVM_D2LL_RP : F_MATH_1<"cvt.rpi.s64.f64 \t$dst, $src0;", Int64Regs, + Float64Regs, int_nvvm_d2ll_rp>; + +def INT_NVVM_D2ULL_RN : F_MATH_1<"cvt.rni.u64.f64 \t$dst, $src0;", Int64Regs, + Float64Regs, int_nvvm_d2ull_rn>; 
+def INT_NVVM_D2ULL_RZ : F_MATH_1<"cvt.rzi.u64.f64 \t$dst, $src0;", Int64Regs, + Float64Regs, int_nvvm_d2ull_rz>; +def INT_NVVM_D2ULL_RM : F_MATH_1<"cvt.rmi.u64.f64 \t$dst, $src0;", Int64Regs, + Float64Regs, int_nvvm_d2ull_rm>; +def INT_NVVM_D2ULL_RP : F_MATH_1<"cvt.rpi.u64.f64 \t$dst, $src0;", Int64Regs, + Float64Regs, int_nvvm_d2ull_rp>; + +def INT_NVVM_LL2F_RN : F_MATH_1<"cvt.rn.f32.s64 \t$dst, $src0;", Float32Regs, + Int64Regs, int_nvvm_ll2f_rn>; +def INT_NVVM_LL2F_RZ : F_MATH_1<"cvt.rz.f32.s64 \t$dst, $src0;", Float32Regs, + Int64Regs, int_nvvm_ll2f_rz>; +def INT_NVVM_LL2F_RM : F_MATH_1<"cvt.rm.f32.s64 \t$dst, $src0;", Float32Regs, + Int64Regs, int_nvvm_ll2f_rm>; +def INT_NVVM_LL2F_RP : F_MATH_1<"cvt.rp.f32.s64 \t$dst, $src0;", Float32Regs, + Int64Regs, int_nvvm_ll2f_rp>; +def INT_NVVM_ULL2F_RN : F_MATH_1<"cvt.rn.f32.u64 \t$dst, $src0;", Float32Regs, + Int64Regs, int_nvvm_ull2f_rn>; +def INT_NVVM_ULL2F_RZ : F_MATH_1<"cvt.rz.f32.u64 \t$dst, $src0;", Float32Regs, + Int64Regs, int_nvvm_ull2f_rz>; +def INT_NVVM_ULL2F_RM : F_MATH_1<"cvt.rm.f32.u64 \t$dst, $src0;", Float32Regs, + Int64Regs, int_nvvm_ull2f_rm>; +def INT_NVVM_ULL2F_RP : F_MATH_1<"cvt.rp.f32.u64 \t$dst, $src0;", Float32Regs, + Int64Regs, int_nvvm_ull2f_rp>; + +def INT_NVVM_LL2D_RN : F_MATH_1<"cvt.rn.f64.s64 \t$dst, $src0;", Float64Regs, + Int64Regs, int_nvvm_ll2d_rn>; +def INT_NVVM_LL2D_RZ : F_MATH_1<"cvt.rz.f64.s64 \t$dst, $src0;", Float64Regs, + Int64Regs, int_nvvm_ll2d_rz>; +def INT_NVVM_LL2D_RM : F_MATH_1<"cvt.rm.f64.s64 \t$dst, $src0;", Float64Regs, + Int64Regs, int_nvvm_ll2d_rm>; +def INT_NVVM_LL2D_RP : F_MATH_1<"cvt.rp.f64.s64 \t$dst, $src0;", Float64Regs, + Int64Regs, int_nvvm_ll2d_rp>; +def INT_NVVM_ULL2D_RN : F_MATH_1<"cvt.rn.f64.u64 \t$dst, $src0;", Float64Regs, + Int64Regs, int_nvvm_ull2d_rn>; +def INT_NVVM_ULL2D_RZ : F_MATH_1<"cvt.rz.f64.u64 \t$dst, $src0;", Float64Regs, + Int64Regs, int_nvvm_ull2d_rz>; +def INT_NVVM_ULL2D_RM : F_MATH_1<"cvt.rm.f64.u64 \t$dst, $src0;", Float64Regs, + Int64Regs, int_nvvm_ull2d_rm>; +def INT_NVVM_ULL2D_RP : F_MATH_1<"cvt.rp.f64.u64 \t$dst, $src0;", Float64Regs, + Int64Regs, int_nvvm_ull2d_rp>; + +def INT_NVVM_F2H_RN_FTZ : F_MATH_1<!strconcat("{{\n\t", + !strconcat(".reg .b16 %temp;\n\t", + !strconcat("cvt.rn.ftz.f16.f32 \t%temp, $src0;\n\t", + !strconcat("mov.b16 \t$dst, %temp;\n", + "}}")))), + Int16Regs, Float32Regs, int_nvvm_f2h_rn_ftz>; +def INT_NVVM_F2H_RN : F_MATH_1<!strconcat("{{\n\t", + !strconcat(".reg .b16 %temp;\n\t", + !strconcat("cvt.rn.f16.f32 \t%temp, $src0;\n\t", + !strconcat("mov.b16 \t$dst, %temp;\n", + "}}")))), + Int16Regs, Float32Regs, int_nvvm_f2h_rn>; + +def INT_NVVM_H2F : F_MATH_1<!strconcat("{{\n\t", + !strconcat(".reg .b16 %temp;\n\t", + !strconcat("mov.b16 \t%temp, $src0;\n\t", + !strconcat("cvt.f32.f16 \t$dst, %temp;\n\t", + "}}")))), + Float32Regs, Int16Regs, int_nvvm_h2f>; + +// +// Bitcast +// + +def INT_NVVM_BITCAST_F2I : F_MATH_1<"mov.b32 \t$dst, $src0;", Int32Regs, + Float32Regs, int_nvvm_bitcast_f2i>; +def INT_NVVM_BITCAST_I2F : F_MATH_1<"mov.b32 \t$dst, $src0;", Float32Regs, + Int32Regs, int_nvvm_bitcast_i2f>; + +def INT_NVVM_BITCAST_LL2D : F_MATH_1<"mov.b64 \t$dst, $src0;", Float64Regs, + Int64Regs, int_nvvm_bitcast_ll2d>; +def INT_NVVM_BITCAST_D2LL : F_MATH_1<"mov.b64 \t$dst, $src0;", Int64Regs, + Float64Regs, int_nvvm_bitcast_d2ll>; + +//----------------------------------- +// Atomic Functions +//----------------------------------- + +class ATOMIC_GLOBAL_CHK <dag ops, dag frag> + : PatFrag<ops, frag, [{ + return ChkMemSDNodeAddressSpace(N, 
llvm::ADDRESS_SPACE_GLOBAL); +}]>; +class ATOMIC_SHARED_CHK <dag ops, dag frag> + : PatFrag<ops, frag, [{ + return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED); +}]>; +class ATOMIC_GENERIC_CHK <dag ops, dag frag> + : PatFrag<ops, frag, [{ + return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC); +}]>; + +multiclass F_ATOMIC_2_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass, + string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp, + Operand IMMType, SDNode IMM, Predicate Pred> { + def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b), + !strconcat("atom", + !strconcat(SpaceStr, + !strconcat(OpcStr, + !strconcat(TypeStr, + !strconcat(" \t$dst, [$addr], $b;", ""))))), + [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>, + Requires<[Pred]>; + def imm : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b), + !strconcat("atom", + !strconcat(SpaceStr, + !strconcat(OpcStr, + !strconcat(TypeStr, + !strconcat(" \t$dst, [$addr], $b;", ""))))), + [(set regclass:$dst, (IntOp ptrclass:$addr, IMM:$b))]>, + Requires<[Pred]>; +} +multiclass F_ATOMIC_2<NVPTXRegClass regclass, string SpaceStr, string TypeStr, + string OpcStr, PatFrag IntOp, Operand IMMType, SDNode IMM, Predicate Pred> { + defm p32 : F_ATOMIC_2_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr, + IntOp, IMMType, IMM, Pred>; + defm p64 : F_ATOMIC_2_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr, + IntOp, IMMType, IMM, Pred>; +} + +// has 2 operands, neg the second one +multiclass F_ATOMIC_2_NEG_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass, + string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp, + Operand IMMType, Predicate Pred> { + def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b), + !strconcat("{{ \n\t", + !strconcat(".reg \t.s", + !strconcat(TypeStr, + !strconcat(" temp; \n\t", + !strconcat("neg.s", + !strconcat(TypeStr, + !strconcat(" \ttemp, $b; \n\t", + !strconcat("atom", + !strconcat(SpaceStr, + !strconcat(OpcStr, + !strconcat(".u", + !strconcat(TypeStr, + !strconcat(" \t$dst, [$addr], temp; \n\t", + !strconcat("}}", "")))))))))))))), + [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>, + Requires<[Pred]>; +} +multiclass F_ATOMIC_2_NEG<NVPTXRegClass regclass, string SpaceStr, + string TypeStr, string OpcStr, PatFrag IntOp, Operand IMMType, + Predicate Pred> { + defm p32: F_ATOMIC_2_NEG_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr, + IntOp, IMMType, Pred> ; + defm p64: F_ATOMIC_2_NEG_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr, + IntOp, IMMType, Pred> ; +} + +// has 3 operands +multiclass F_ATOMIC_3_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass, + string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp, + Operand IMMType, Predicate Pred> { + def reg : NVPTXInst<(outs regclass:$dst), + (ins ptrclass:$addr, regclass:$b, regclass:$c), + !strconcat("atom", + !strconcat(SpaceStr, + !strconcat(OpcStr, + !strconcat(TypeStr, + !strconcat(" \t$dst, [$addr], $b, $c;", ""))))), + [(set regclass:$dst, + (IntOp ptrclass:$addr, regclass:$b, regclass:$c))]>, + Requires<[Pred]>; + def imm1 : NVPTXInst<(outs regclass:$dst), + (ins ptrclass:$addr, IMMType:$b, regclass:$c), + !strconcat("atom", + !strconcat(SpaceStr, + !strconcat(OpcStr, + !strconcat(TypeStr, + !strconcat(" \t$dst, [$addr], $b, $c;", ""))))), + [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, regclass:$c))]>, + Requires<[Pred]>; + def imm2 : NVPTXInst<(outs regclass:$dst), + (ins ptrclass:$addr, regclass:$b, IMMType:$c), + 
!strconcat("atom", + !strconcat(SpaceStr, + !strconcat(OpcStr, + !strconcat(TypeStr, + !strconcat(" \t$dst, [$addr], $b, $c;", ""))))), + [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b, imm:$c))]>, + Requires<[Pred]>; + def imm3 : NVPTXInst<(outs regclass:$dst), + (ins ptrclass:$addr, IMMType:$b, IMMType:$c), + !strconcat("atom", + !strconcat(SpaceStr, + !strconcat(OpcStr, + !strconcat(TypeStr, + !strconcat(" \t$dst, [$addr], $b, $c;", ""))))), + [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, imm:$c))]>, + Requires<[Pred]>; +} +multiclass F_ATOMIC_3<NVPTXRegClass regclass, string SpaceStr, string TypeStr, + string OpcStr, PatFrag IntOp, Operand IMMType, Predicate Pred> { + defm p32 : F_ATOMIC_3_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr, + IntOp, IMMType, Pred>; + defm p64 : F_ATOMIC_3_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr, + IntOp, IMMType, Pred>; +} + +// atom_add + +def atomic_load_add_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_add_32 node:$a, node:$b)>; +def atomic_load_add_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_add_32 node:$a, node:$b)>; +def atomic_load_add_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_add_32 node:$a, node:$b)>; +def atomic_load_add_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_add_64 node:$a, node:$b)>; +def atomic_load_add_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_add_64 node:$a, node:$b)>; +def atomic_load_add_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_add_64 node:$a, node:$b)>; +def atomic_load_add_f32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>; +def atomic_load_add_f32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>; +def atomic_load_add_f32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>; + +defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".add", + atomic_load_add_32_g, i32imm, imm, hasAtomRedG32>; +defm INT_PTX_ATOM_ADD_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".add", + atomic_load_add_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_ADD_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".add", + atomic_load_add_32_gen, i32imm, imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_ADD_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32", + ".add", atomic_load_add_32_gen, i32imm, imm, useAtomRedG32forGen32>; + +defm INT_PTX_ATOM_ADD_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64", ".add", + atomic_load_add_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_ADD_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64", ".add", + atomic_load_add_64_s, i64imm, imm, hasAtomRedS64>; +defm INT_PTX_ATOM_ADD_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".add", + atomic_load_add_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_ADD_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".u64", + ".add", atomic_load_add_64_gen, i64imm, imm, useAtomRedG64forGen64>; + +defm INT_PTX_ATOM_ADD_G_F32 : F_ATOMIC_2<Float32Regs, ".global", ".f32", ".add", + atomic_load_add_f32_g, f32imm, fpimm, hasAtomAddF32>; +defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2<Float32Regs, ".shared", ".f32", ".add", + atomic_load_add_f32_s, f32imm, fpimm, hasAtomAddF32>; +defm INT_PTX_ATOM_ADD_GEN_F32 : F_ATOMIC_2<Float32Regs, "", ".f32", ".add", + atomic_load_add_f32_gen, f32imm, fpimm, hasAtomAddF32>; + +// atom_sub + +def atomic_load_sub_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + 
(atomic_load_sub_32 node:$a, node:$b)>; +def atomic_load_sub_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_sub_32 node:$a, node:$b)>; +def atomic_load_sub_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_sub_32 node:$a, node:$b)>; +def atomic_load_sub_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_sub_64 node:$a, node:$b)>; +def atomic_load_sub_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_sub_64 node:$a, node:$b)>; +def atomic_load_sub_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_sub_64 node:$a, node:$b)>; + +defm INT_PTX_ATOM_SUB_G_32 : F_ATOMIC_2_NEG<Int32Regs, ".global", "32", ".add", + atomic_load_sub_32_g, i32imm, hasAtomRedG32>; +defm INT_PTX_ATOM_SUB_G_64 : F_ATOMIC_2_NEG<Int64Regs, ".global", "64", ".add", + atomic_load_sub_64_g, i64imm, hasAtomRedG64>; +defm INT_PTX_ATOM_SUB_GEN_32 : F_ATOMIC_2_NEG<Int32Regs, "", "32", ".add", + atomic_load_sub_32_gen, i32imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_SUB_GEN_32_USE_G : F_ATOMIC_2_NEG<Int32Regs, ".global", "32", + ".add", atomic_load_sub_32_gen, i32imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_SUB_S_32 : F_ATOMIC_2_NEG<Int32Regs, ".shared", "32", ".add", + atomic_load_sub_32_s, i32imm, hasAtomRedS32>; +defm INT_PTX_ATOM_SUB_S_64 : F_ATOMIC_2_NEG<Int64Regs, ".shared", "64", ".add", + atomic_load_sub_64_s, i64imm, hasAtomRedS64>; +defm INT_PTX_ATOM_SUB_GEN_64 : F_ATOMIC_2_NEG<Int64Regs, "", "64", ".add", + atomic_load_sub_64_gen, i64imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_SUB_GEN_64_USE_G : F_ATOMIC_2_NEG<Int64Regs, ".global", "64", + ".add", atomic_load_sub_64_gen, i64imm, useAtomRedG64forGen64>; + +// atom_swap + +def atomic_swap_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_swap_32 node:$a, node:$b)>; +def atomic_swap_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_swap_32 node:$a, node:$b)>; +def atomic_swap_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_swap_32 node:$a, node:$b)>; +def atomic_swap_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_swap_64 node:$a, node:$b)>; +def atomic_swap_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_swap_64 node:$a, node:$b)>; +def atomic_swap_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_swap_64 node:$a, node:$b)>; + +defm INT_PTX_ATOM_SWAP_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".exch", + atomic_swap_32_g, i32imm, imm, hasAtomRedG32>; +defm INT_PTX_ATOM_SWAP_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".exch", + atomic_swap_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_SWAP_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".exch", + atomic_swap_32_gen, i32imm, imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_SWAP_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32", + ".exch", atomic_swap_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_SWAP_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".exch", + atomic_swap_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_SWAP_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".exch", + atomic_swap_64_s, i64imm, imm, hasAtomRedS64>; +defm INT_PTX_ATOM_SWAP_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".exch", + atomic_swap_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_SWAP_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64", + ".exch", atomic_swap_64_gen, i64imm, imm, useAtomRedG64forGen64>; + +// atom_max + +def atomic_load_max_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b) + , (atomic_load_max_32 node:$a, node:$b)>; +def atomic_load_max_32_s: 
ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_max_32 node:$a, node:$b)>; +def atomic_load_max_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_max_32 node:$a, node:$b)>; +def atomic_load_umax_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_umax_32 node:$a, node:$b)>; +def atomic_load_umax_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_umax_32 node:$a, node:$b)>; +def atomic_load_umax_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_umax_32 node:$a, node:$b)>; + +defm INT_PTX_ATOM_LOAD_MAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32", + ".max", atomic_load_max_32_g, i32imm, imm, hasAtomRedG32>; +defm INT_PTX_ATOM_LOAD_MAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".s32", + ".max", atomic_load_max_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".max", + atomic_load_max_32_gen, i32imm, imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", + ".s32", ".max", atomic_load_max_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", + ".max", atomic_load_umax_32_g, i32imm, imm, hasAtomRedG32>; +defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", + ".max", atomic_load_umax_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_LOAD_UMAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".max", + atomic_load_umax_32_gen, i32imm, imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", + ".u32", ".max", atomic_load_umax_32_gen, i32imm, imm, useAtomRedG32forGen32>; + +// atom_min + +def atomic_load_min_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_min_32 node:$a, node:$b)>; +def atomic_load_min_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_min_32 node:$a, node:$b)>; +def atomic_load_min_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_min_32 node:$a, node:$b)>; +def atomic_load_umin_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_umin_32 node:$a, node:$b)>; +def atomic_load_umin_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_umin_32 node:$a, node:$b)>; +def atomic_load_umin_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_umin_32 node:$a, node:$b)>; + +defm INT_PTX_ATOM_LOAD_MIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32", + ".min", atomic_load_min_32_g, i32imm, imm, hasAtomRedG32>; +defm INT_PTX_ATOM_LOAD_MIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".s32", + ".min", atomic_load_min_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".min", + atomic_load_min_32_gen, i32imm, imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", + ".s32", ".min", atomic_load_min_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", + ".min", atomic_load_umin_32_g, i32imm, imm, hasAtomRedG32>; +defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", + ".min", atomic_load_umin_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_LOAD_UMIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".min", + atomic_load_umin_32_gen, i32imm, imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", + ".u32", ".min", atomic_load_umin_32_gen, i32imm, imm, useAtomRedG32forGen32>; + +// atom_inc atom_dec + +def 
atomic_load_inc_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>; +def atomic_load_inc_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>; +def atomic_load_inc_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>; +def atomic_load_dec_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>; +def atomic_load_dec_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>; +def atomic_load_dec_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>; + +defm INT_PTX_ATOM_INC_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".inc", + atomic_load_inc_32_g, i32imm, imm, hasAtomRedG32>; +defm INT_PTX_ATOM_INC_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".inc", + atomic_load_inc_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_INC_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".inc", + atomic_load_inc_32_gen, i32imm, imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_INC_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32", + ".inc", atomic_load_inc_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_DEC_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".dec", + atomic_load_dec_32_g, i32imm, imm, hasAtomRedG32>; +defm INT_PTX_ATOM_DEC_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".dec", + atomic_load_dec_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_DEC_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".dec", + atomic_load_dec_32_gen, i32imm, imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_DEC_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32", + ".dec", atomic_load_dec_32_gen, i32imm, imm, useAtomRedG32forGen32>; + +// atom_and + +def atomic_load_and_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_and_32 node:$a, node:$b)>; +def atomic_load_and_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_and_32 node:$a, node:$b)>; +def atomic_load_and_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_and_32 node:$a, node:$b)>; + +defm INT_PTX_ATOM_AND_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".and", + atomic_load_and_32_g, i32imm, imm, hasAtomRedG32>; +defm INT_PTX_ATOM_AND_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".and", + atomic_load_and_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".and", + atomic_load_and_32_gen, i32imm, imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_AND_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32", + ".and", atomic_load_and_32_gen, i32imm, imm, useAtomRedG32forGen32>; + +// atom_or + +def atomic_load_or_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_or_32 node:$a, node:$b)>; +def atomic_load_or_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_or_32 node:$a, node:$b)>; +def atomic_load_or_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_or_32 node:$a, node:$b)>; + +defm INT_PTX_ATOM_OR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".or", + atomic_load_or_32_g, i32imm, imm, hasAtomRedG32>; +defm INT_PTX_ATOM_OR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".or", + atomic_load_or_32_gen, i32imm, imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32", + ".or", atomic_load_or_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_OR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".or", + 
atomic_load_or_32_s, i32imm, imm, hasAtomRedS32>; + +// atom_xor + +def atomic_load_xor_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_xor_32 node:$a, node:$b)>; +def atomic_load_xor_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_xor_32 node:$a, node:$b)>; +def atomic_load_xor_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_xor_32 node:$a, node:$b)>; + +defm INT_PTX_ATOM_XOR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".xor", + atomic_load_xor_32_g, i32imm, imm, hasAtomRedG32>; +defm INT_PTX_ATOM_XOR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".xor", + atomic_load_xor_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".xor", + atomic_load_xor_32_gen, i32imm, imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_XOR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32", + ".xor", atomic_load_xor_32_gen, i32imm, imm, useAtomRedG32forGen32>; + +// atom_cas + +def atomic_cmp_swap_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c), + (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>; +def atomic_cmp_swap_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c), + (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>; +def atomic_cmp_swap_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c), + (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>; +def atomic_cmp_swap_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c), + (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>; +def atomic_cmp_swap_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c), + (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>; +def atomic_cmp_swap_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c), + (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>; + +defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<Int32Regs, ".global", ".b32", ".cas", + atomic_cmp_swap_32_g, i32imm, hasAtomRedG32>; +defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<Int32Regs, ".shared", ".b32", ".cas", + atomic_cmp_swap_32_s, i32imm, hasAtomRedS32>; +defm INT_PTX_ATOM_CAS_GEN_32 : F_ATOMIC_3<Int32Regs, "", ".b32", ".cas", + atomic_cmp_swap_32_gen, i32imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_CAS_GEN_32_USE_G : F_ATOMIC_3<Int32Regs, ".global", ".b32", + ".cas", atomic_cmp_swap_32_gen, i32imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_CAS_G_64 : F_ATOMIC_3<Int64Regs, ".global", ".b64", ".cas", + atomic_cmp_swap_64_g, i64imm, hasAtomRedG64>; +defm INT_PTX_ATOM_CAS_S_64 : F_ATOMIC_3<Int64Regs, ".shared", ".b64", ".cas", + atomic_cmp_swap_64_s, i64imm, hasAtomRedS64>; +defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3<Int64Regs, "", ".b64", ".cas", + atomic_cmp_swap_64_gen, i64imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<Int64Regs, ".global", ".b64", + ".cas", atomic_cmp_swap_64_gen, i64imm, useAtomRedG64forGen64>; + + +//----------------------------------- +// Read Special Registers +//----------------------------------- +class F_SREG<string OpStr, NVPTXRegClass regclassOut, Intrinsic IntOp> : + NVPTXInst<(outs regclassOut:$dst), (ins), + OpStr, + [(set regclassOut:$dst, (IntOp))]>; + +def INT_PTX_SREG_TID_X : F_SREG<"mov.u32 \t$dst, %tid.x;", Int32Regs, + int_nvvm_read_ptx_sreg_tid_x>; +def INT_PTX_SREG_TID_Y : F_SREG<"mov.u32 \t$dst, %tid.y;", Int32Regs, + int_nvvm_read_ptx_sreg_tid_y>; +def INT_PTX_SREG_TID_Z : F_SREG<"mov.u32 \t$dst, %tid.z;", Int32Regs, + int_nvvm_read_ptx_sreg_tid_z>; + +def INT_PTX_SREG_NTID_X : F_SREG<"mov.u32 \t$dst, %ntid.x;", Int32Regs, + int_nvvm_read_ptx_sreg_ntid_x>; +def INT_PTX_SREG_NTID_Y : F_SREG<"mov.u32 \t$dst, 
%ntid.y;", Int32Regs, + int_nvvm_read_ptx_sreg_ntid_y>; +def INT_PTX_SREG_NTID_Z : F_SREG<"mov.u32 \t$dst, %ntid.z;", Int32Regs, + int_nvvm_read_ptx_sreg_ntid_z>; + +def INT_PTX_SREG_CTAID_X : F_SREG<"mov.u32 \t$dst, %ctaid.x;", Int32Regs, + int_nvvm_read_ptx_sreg_ctaid_x>; +def INT_PTX_SREG_CTAID_Y : F_SREG<"mov.u32 \t$dst, %ctaid.y;", Int32Regs, + int_nvvm_read_ptx_sreg_ctaid_y>; +def INT_PTX_SREG_CTAID_Z : F_SREG<"mov.u32 \t$dst, %ctaid.z;", Int32Regs, + int_nvvm_read_ptx_sreg_ctaid_z>; + +def INT_PTX_SREG_NCTAID_X : F_SREG<"mov.u32 \t$dst, %nctaid.x;", Int32Regs, + int_nvvm_read_ptx_sreg_nctaid_x>; +def INT_PTX_SREG_NCTAID_Y : F_SREG<"mov.u32 \t$dst, %nctaid.y;", Int32Regs, + int_nvvm_read_ptx_sreg_nctaid_y>; +def INT_PTX_SREG_NCTAID_Z : F_SREG<"mov.u32 \t$dst, %nctaid.z;", Int32Regs, + int_nvvm_read_ptx_sreg_nctaid_z>; + +def INT_PTX_SREG_WARPSIZE : F_SREG<"mov.u32 \t$dst, WARP_SZ;", Int32Regs, + int_nvvm_read_ptx_sreg_warpsize>; + + +//----------------------------------- +// Support for ldu on sm_20 or later +//----------------------------------- + +// Scalar +// @TODO: Revisit this, Changed imemAny to imem +multiclass LDU_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp> { + def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), + !strconcat("ldu.global.", TyStr), + [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDU]>; + def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src), + !strconcat("ldu.global.", TyStr), + [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDU]>; + def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), + !strconcat("ldu.global.", TyStr), + [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>, + Requires<[hasLDU]>; + def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), + !strconcat("ldu.global.", TyStr), + [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDU]>; + def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), + !strconcat("ldu.global.", TyStr), + [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDU]>; +} + +defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8 \t$result, [$src];", Int8Regs, +int_nvvm_ldu_global_i>; +defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs, +int_nvvm_ldu_global_i>; +defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs, +int_nvvm_ldu_global_i>; +defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs, +int_nvvm_ldu_global_i>; +defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs, +int_nvvm_ldu_global_f>; +defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs, +int_nvvm_ldu_global_f>; +defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs, +int_nvvm_ldu_global_p>; +defm INT_PTX_LDU_GLOBAL_p64 : LDU_G<"u64 \t$result, [$src];", Int64Regs, +int_nvvm_ldu_global_p>; + +// vector + +// Elementized vector ldu +multiclass VLDU_G_ELE_V2<string TyStr, NVPTXRegClass regclass> { + def _32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + (ins Int32Regs:$src), + !strconcat("ldu.global.", TyStr), []>; + def _64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + (ins Int64Regs:$src), + !strconcat("ldu.global.", TyStr), []>; +} + +multiclass VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> { + def _32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, + regclass:$dst4), (ins Int32Regs:$src), + !strconcat("ldu.global.", TyStr), []>; + def _64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, 
regclass:$dst3, + regclass:$dst4), (ins Int64Regs:$src), + !strconcat("ldu.global.", TyStr), []>; +} + +defm INT_PTX_LDU_G_v2i8_ELE + : VLDU_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int8Regs>; +defm INT_PTX_LDU_G_v2i16_ELE + : VLDU_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>; +defm INT_PTX_LDU_G_v2i32_ELE + : VLDU_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>; +defm INT_PTX_LDU_G_v2f32_ELE + : VLDU_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>; +defm INT_PTX_LDU_G_v2i64_ELE + : VLDU_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>; +defm INT_PTX_LDU_G_v2f64_ELE + : VLDU_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>; +defm INT_PTX_LDU_G_v4i8_ELE + : VLDU_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int8Regs>; +defm INT_PTX_LDU_G_v4i16_ELE + : VLDU_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", + Int16Regs>; +defm INT_PTX_LDU_G_v4i32_ELE + : VLDU_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", + Int32Regs>; +defm INT_PTX_LDU_G_v4f32_ELE + : VLDU_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", + Float32Regs>; + +// Vector ldu +multiclass VLDU_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp, + NVPTXInst eleInst, NVPTXInst eleInst64> { + def _32: NVPTXVecInst<(outs regclass:$result), (ins Int32Regs:$src), + !strconcat("ldu.global.", TyStr), + [(set regclass:$result, (IntOp Int32Regs:$src))], eleInst>, + Requires<[hasLDU]>; + def _64: NVPTXVecInst<(outs regclass:$result), (ins Int64Regs:$src), + !strconcat("ldu.global.", TyStr), + [(set regclass:$result, (IntOp Int64Regs:$src))], eleInst64>, + Requires<[hasLDU]>; +} + +let VecInstType=isVecLD.Value in { +defm INT_PTX_LDU_G_v2i8 : VLDU_G<"v2.u8 \t${result:vecfull}, [$src];", + V2I8Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v2i8_ELE_32, + INT_PTX_LDU_G_v2i8_ELE_64>; +defm INT_PTX_LDU_G_v4i8 : VLDU_G<"v4.u8 \t${result:vecfull}, [$src];", + V4I8Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v4i8_ELE_32, + INT_PTX_LDU_G_v4i8_ELE_64>; +defm INT_PTX_LDU_G_v2i16 : VLDU_G<"v2.u16 \t${result:vecfull}, [$src];", + V2I16Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v2i16_ELE_32, + INT_PTX_LDU_G_v2i16_ELE_64>; +defm INT_PTX_LDU_G_v4i16 : VLDU_G<"v4.u16 \t${result:vecfull}, [$src];", + V4I16Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v4i16_ELE_32, + INT_PTX_LDU_G_v4i16_ELE_64>; +defm INT_PTX_LDU_G_v2i32 : VLDU_G<"v2.u32 \t${result:vecfull}, [$src];", + V2I32Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v2i32_ELE_32, + INT_PTX_LDU_G_v2i32_ELE_64>; +defm INT_PTX_LDU_G_v4i32 : VLDU_G<"v4.u32 \t${result:vecfull}, [$src];", + V4I32Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v4i32_ELE_32, + INT_PTX_LDU_G_v4i32_ELE_64>; +defm INT_PTX_LDU_G_v2f32 : VLDU_G<"v2.f32 \t${result:vecfull}, [$src];", + V2F32Regs, int_nvvm_ldu_global_f, INT_PTX_LDU_G_v2f32_ELE_32, + INT_PTX_LDU_G_v2f32_ELE_64>; +defm INT_PTX_LDU_G_v4f32 : VLDU_G<"v4.f32 \t${result:vecfull}, [$src];", + V4F32Regs, int_nvvm_ldu_global_f, INT_PTX_LDU_G_v4f32_ELE_32, + INT_PTX_LDU_G_v4f32_ELE_64>; +defm INT_PTX_LDU_G_v2i64 : VLDU_G<"v2.u64 \t${result:vecfull}, [$src];", + V2I64Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v2i64_ELE_32, + INT_PTX_LDU_G_v2i64_ELE_64>; +defm INT_PTX_LDU_G_v2f64 : VLDU_G<"v2.f64 \t${result:vecfull}, [$src];", + V2F64Regs, int_nvvm_ldu_global_f, INT_PTX_LDU_G_v2f64_ELE_32, + INT_PTX_LDU_G_v2f64_ELE_64>; +} + + + +multiclass NG_TO_G<string Str, Intrinsic Intrin> { + def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), + !strconcat("cvta.", !strconcat(Str, 
".u32 \t$result, $src;")), + [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>, + Requires<[hasGenericLdSt]>; + def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), + !strconcat("cvta.", !strconcat(Str, ".u64 \t$result, $src;")), + [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>, + Requires<[hasGenericLdSt]>; + +// @TODO: Are these actually needed? I believe global addresses will be copied +// to register values anyway. + /*def __addr_yes : NVPTXInst<(outs Int32Regs:$result), (ins imemAny:$src), + !strconcat("cvta.", !strconcat(Str, ".u32 \t$result, $src;")), + [(set Int32Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>, + Requires<[hasGenericLdSt]>; + def __addr_yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins imemAny:$src), + !strconcat("cvta.", !strconcat(Str, ".u64 \t$result, $src;")), + [(set Int64Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>, + Requires<[hasGenericLdSt]>;*/ + + def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), + "mov.u32 \t$result, $src;", + [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>; + def _no_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), + "mov.u64 \t$result, $src;", + [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>; + +// @TODO: Are these actually needed? I believe global addresses will be copied +// to register values anyway. + /*def _addr_no : NVPTXInst<(outs Int32Regs:$result), (ins imem:$src), + "mov.u32 \t$result, $src;", + [(set Int32Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>; + def _addr_no_64 : NVPTXInst<(outs Int64Regs:$result), (ins imem:$src), + "mov.u64 \t$result, $src;", + [(set Int64Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>;*/ +} + +multiclass G_TO_NG<string Str, Intrinsic Intrin> { + def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), + !strconcat("cvta.to.", !strconcat(Str, ".u32 \t$result, $src;")), + [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>, + Requires<[hasGenericLdSt]>; + def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), + !strconcat("cvta.to.", !strconcat(Str, ".u64 \t$result, $src;")), + [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>, + Requires<[hasGenericLdSt]>; + def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), + "mov.u32 \t$result, $src;", + [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>; + def _no_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), + "mov.u64 \t$result, $src;", + [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>; +} + +defm cvta_local : NG_TO_G<"local", int_nvvm_ptr_local_to_gen>; +defm cvta_shared : NG_TO_G<"shared", int_nvvm_ptr_shared_to_gen>; +defm cvta_global : NG_TO_G<"global", int_nvvm_ptr_global_to_gen>; + +defm cvta_to_local : G_TO_NG<"local", int_nvvm_ptr_gen_to_local>; +defm cvta_to_shared : G_TO_NG<"shared", int_nvvm_ptr_gen_to_shared>; +defm cvta_to_global : G_TO_NG<"global", int_nvvm_ptr_gen_to_global>; + +def cvta_const : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), + "mov.u32 \t$result, $src;", + [(set Int32Regs:$result, (int_nvvm_ptr_constant_to_gen Int32Regs:$src))]>; +def cvta_const_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), + "mov.u64 \t$result, $src;", + [(set Int64Regs:$result, (int_nvvm_ptr_constant_to_gen Int64Regs:$src))]>; + + + +// @TODO: Revisit this. There is a type +// contradiction between iPTRAny and iPTR for the def. 
+/*def cvta_const_addr : NVPTXInst<(outs Int32Regs:$result), (ins imemAny:$src), + "mov.u32 \t$result, $src;", + [(set Int32Regs:$result, (int_nvvm_ptr_constant_to_gen + (Wrapper tglobaladdr:$src)))]>; +def cvta_const_addr_64 : NVPTXInst<(outs Int64Regs:$result), (ins imemAny:$src), + "mov.u64 \t$result, $src;", + [(set Int64Regs:$result, (int_nvvm_ptr_constant_to_gen + (Wrapper tglobaladdr:$src)))]>;*/ + + +def cvta_to_const : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), + "mov.u32 \t$result, $src;", + [(set Int32Regs:$result, (int_nvvm_ptr_gen_to_constant Int32Regs:$src))]>; +def cvta_to_const_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), + "mov.u64 \t$result, $src;", + [(set Int64Regs:$result, (int_nvvm_ptr_gen_to_constant Int64Regs:$src))]>; + + +// nvvm.ptr.gen.to.param +def nvvm_ptr_gen_to_param : NVPTXInst<(outs Int32Regs:$result), + (ins Int32Regs:$src), + "mov.u32 \t$result, $src;", + [(set Int32Regs:$result, + (int_nvvm_ptr_gen_to_param Int32Regs:$src))]>; +def nvvm_ptr_gen_to_param_64 : NVPTXInst<(outs Int64Regs:$result), + (ins Int64Regs:$src), + "mov.u64 \t$result, $src;", + [(set Int64Regs:$result, + (int_nvvm_ptr_gen_to_param Int64Regs:$src))]>; + + +// nvvm.move intrinsicc +def nvvm_move_i8 : NVPTXInst<(outs Int8Regs:$r), (ins Int8Regs:$s), + "mov.b16 \t$r, $s;", + [(set Int8Regs:$r, + (int_nvvm_move_i8 Int8Regs:$s))]>; +def nvvm_move_i16 : NVPTXInst<(outs Int16Regs:$r), (ins Int16Regs:$s), + "mov.b16 \t$r, $s;", + [(set Int16Regs:$r, + (int_nvvm_move_i16 Int16Regs:$s))]>; +def nvvm_move_i32 : NVPTXInst<(outs Int32Regs:$r), (ins Int32Regs:$s), + "mov.b32 \t$r, $s;", + [(set Int32Regs:$r, + (int_nvvm_move_i32 Int32Regs:$s))]>; +def nvvm_move_i64 : NVPTXInst<(outs Int64Regs:$r), (ins Int64Regs:$s), + "mov.b64 \t$r, $s;", + [(set Int64Regs:$r, + (int_nvvm_move_i64 Int64Regs:$s))]>; +def nvvm_move_float : NVPTXInst<(outs Float32Regs:$r), (ins Float32Regs:$s), + "mov.f32 \t$r, $s;", + [(set Float32Regs:$r, + (int_nvvm_move_float Float32Regs:$s))]>; +def nvvm_move_double : NVPTXInst<(outs Float64Regs:$r), (ins Float64Regs:$s), + "mov.f64 \t$r, $s;", + [(set Float64Regs:$r, + (int_nvvm_move_double Float64Regs:$s))]>; +def nvvm_move_ptr32 : NVPTXInst<(outs Int32Regs:$r), (ins Int32Regs:$s), + "mov.u32 \t$r, $s;", + [(set Int32Regs:$r, + (int_nvvm_move_ptr Int32Regs:$s))]>; +def nvvm_move_ptr64 : NVPTXInst<(outs Int64Regs:$r), (ins Int64Regs:$s), + "mov.u64 \t$r, $s;", + [(set Int64Regs:$r, + (int_nvvm_move_ptr Int64Regs:$s))]>; + +// @TODO: Are these actually needed, or will we always just see symbols +// copied to registers first? +/*def nvvm_move_sym32 : NVPTXInst<(outs Int32Regs:$r), (ins imem:$s), + "mov.u32 \t$r, $s;", + [(set Int32Regs:$r, + (int_nvvm_move_ptr texternalsym:$s))]>; +def nvvm_move_sym64 : NVPTXInst<(outs Int64Regs:$r), (ins imem:$s), + "mov.u64 \t$r, $s;", + [(set Int64Regs:$r, + (int_nvvm_move_ptr texternalsym:$s))]>;*/ + + +// MoveParam %r1, param +// ptr_local_to_gen %r2, %r1 +// ptr_gen_to_local %r3, %r2 +// -> +// mov %r1, param + +// @TODO: Revisit this. There is a type +// contradiction between iPTRAny and iPTR for the addr defs, so the move_sym +// instructions are not currently defined. However, we can use the ptr +// variants and the asm printer will do the right thing. 
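+// The two patterns below implement that fold for 64-bit and 32-bit pointers:
+// a local->generic->local round trip on a parameter symbol collapses into a
+// single mov of the symbol, so no cvta pair is emitted for it.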
+def : Pat<(i64 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen + (MoveParam texternalsym:$src)))), + (nvvm_move_ptr64 texternalsym:$src)>; +def : Pat<(i32 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen + (MoveParam texternalsym:$src)))), + (nvvm_move_ptr32 texternalsym:$src)>; + + +//----------------------------------- +// Compiler Error Warn +// - Just ignore them in codegen +//----------------------------------- + +def INT_NVVM_COMPILER_WARN_32 : NVPTXInst<(outs), (ins Int32Regs:$a), + "// llvm.nvvm.compiler.warn()", + [(int_nvvm_compiler_warn Int32Regs:$a)]>; +def INT_NVVM_COMPILER_WARN_64 : NVPTXInst<(outs), (ins Int64Regs:$a), + "// llvm.nvvm.compiler.warn()", + [(int_nvvm_compiler_warn Int64Regs:$a)]>; +def INT_NVVM_COMPILER_ERROR_32 : NVPTXInst<(outs), (ins Int32Regs:$a), + "// llvm.nvvm.compiler.error()", + [(int_nvvm_compiler_error Int32Regs:$a)]>; +def INT_NVVM_COMPILER_ERROR_64 : NVPTXInst<(outs), (ins Int64Regs:$a), + "// llvm.nvvm.compiler.error()", + [(int_nvvm_compiler_error Int64Regs:$a)]>; + + + +//===-- Old PTX Back-end Intrinsics ---------------------------------------===// + +// These intrinsics are handled to retain compatibility with the old backend. + +// PTX Special Purpose Register Accessor Intrinsics + +class PTX_READ_SPECIAL_REGISTER_R64<string regname, Intrinsic intop> + : NVPTXInst<(outs Int64Regs:$d), (ins), + !strconcat(!strconcat("mov.u64\t$d, %", regname), ";"), + [(set Int64Regs:$d, (intop))]>; + +class PTX_READ_SPECIAL_REGISTER_R32<string regname, Intrinsic intop> + : NVPTXInst<(outs Int32Regs:$d), (ins), + !strconcat(!strconcat("mov.u32\t$d, %", regname), ";"), + [(set Int32Regs:$d, (intop))]>; + +// TODO Add read vector-version of special registers + +def PTX_READ_TID_X : PTX_READ_SPECIAL_REGISTER_R32<"tid.x", + int_ptx_read_tid_x>; +def PTX_READ_TID_Y : PTX_READ_SPECIAL_REGISTER_R32<"tid.y", + int_ptx_read_tid_y>; +def PTX_READ_TID_Z : PTX_READ_SPECIAL_REGISTER_R32<"tid.z", + int_ptx_read_tid_z>; +def PTX_READ_TID_W : PTX_READ_SPECIAL_REGISTER_R32<"tid.w", + int_ptx_read_tid_w>; + +def PTX_READ_NTID_X : PTX_READ_SPECIAL_REGISTER_R32<"ntid.x", + int_ptx_read_ntid_x>; +def PTX_READ_NTID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ntid.y", + int_ptx_read_ntid_y>; +def PTX_READ_NTID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ntid.z", + int_ptx_read_ntid_z>; +def PTX_READ_NTID_W : PTX_READ_SPECIAL_REGISTER_R32<"ntid.w", + int_ptx_read_ntid_w>; + +def PTX_READ_LANEID : PTX_READ_SPECIAL_REGISTER_R32<"laneid", + int_ptx_read_laneid>; +def PTX_READ_WARPID : PTX_READ_SPECIAL_REGISTER_R32<"warpid", + int_ptx_read_warpid>; +def PTX_READ_NWARPID : PTX_READ_SPECIAL_REGISTER_R32<"nwarpid", + int_ptx_read_nwarpid>; + +def PTX_READ_CTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.x", + int_ptx_read_ctaid_x>; +def PTX_READ_CTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.y", + int_ptx_read_ctaid_y>; +def PTX_READ_CTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.z", + int_ptx_read_ctaid_z>; +def PTX_READ_CTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.w", + int_ptx_read_ctaid_w>; + +def PTX_READ_NCTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.x", + int_ptx_read_nctaid_x>; +def PTX_READ_NCTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.y", + int_ptx_read_nctaid_y>; +def PTX_READ_NCTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.z", + int_ptx_read_nctaid_z>; +def PTX_READ_NCTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.w", + int_ptx_read_nctaid_w>; + +def PTX_READ_SMID : PTX_READ_SPECIAL_REGISTER_R32<"smid", + int_ptx_read_smid>; +def PTX_READ_NSMID : 
PTX_READ_SPECIAL_REGISTER_R32<"nsmid", + int_ptx_read_nsmid>; +def PTX_READ_GRIDID : PTX_READ_SPECIAL_REGISTER_R32<"gridid", + int_ptx_read_gridid>; + +def PTX_READ_LANEMASK_EQ + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_eq", int_ptx_read_lanemask_eq>; +def PTX_READ_LANEMASK_LE + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_le", int_ptx_read_lanemask_le>; +def PTX_READ_LANEMASK_LT + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_lt", int_ptx_read_lanemask_lt>; +def PTX_READ_LANEMASK_GE + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_ge", int_ptx_read_lanemask_ge>; +def PTX_READ_LANEMASK_GT + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_gt", int_ptx_read_lanemask_gt>; + +def PTX_READ_CLOCK + : PTX_READ_SPECIAL_REGISTER_R32<"clock", int_ptx_read_clock>; +def PTX_READ_CLOCK64 + : PTX_READ_SPECIAL_REGISTER_R64<"clock64", int_ptx_read_clock64>; + +def PTX_READ_PM0 : PTX_READ_SPECIAL_REGISTER_R32<"pm0", int_ptx_read_pm0>; +def PTX_READ_PM1 : PTX_READ_SPECIAL_REGISTER_R32<"pm1", int_ptx_read_pm1>; +def PTX_READ_PM2 : PTX_READ_SPECIAL_REGISTER_R32<"pm2", int_ptx_read_pm2>; +def PTX_READ_PM3 : PTX_READ_SPECIAL_REGISTER_R32<"pm3", int_ptx_read_pm3>; + +// PTX Parallel Synchronization and Communication Intrinsics + +def PTX_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync\t$i;", + [(int_ptx_bar_sync imm:$i)]>; diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp new file mode 100644 index 0000000..56b2372 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp @@ -0,0 +1,208 @@ +//===- NVPTXLowerAggrCopies.cpp - ------------------------------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Lower aggregate copies, memset, memcpy, memmov intrinsics into loops when +// the size is large or is not a compile-time constant. +// +//===----------------------------------------------------------------------===// + +#include "NVPTXLowerAggrCopies.h" +#include "llvm/Constants.h" +#include "llvm/Function.h" +#include "llvm/IRBuilder.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Intrinsics.h" +#include "llvm/LLVMContext.h" +#include "llvm/Module.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/Target/TargetData.h" + +using namespace llvm; + +namespace llvm { +FunctionPass *createLowerAggrCopies(); +} + +char NVPTXLowerAggrCopies::ID = 0; + +// Lower MemTransferInst or load-store pair to loop +static void convertTransferToLoop(Instruction *splitAt, Value *srcAddr, + Value *dstAddr, Value *len, + //unsigned numLoads, + bool srcVolatile, bool dstVolatile, + LLVMContext &Context, Function &F) { + Type *indType = len->getType(); + + BasicBlock *origBB = splitAt->getParent(); + BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split"); + BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB); + + origBB->getTerminator()->setSuccessor(0, loopBB); + IRBuilder<> builder(origBB, origBB->getTerminator()); + + // srcAddr and dstAddr are expected to be pointer types, + // so no check is made here. 
+  unsigned srcAS = cast<PointerType>(srcAddr->getType())->getAddressSpace();
+  unsigned dstAS = cast<PointerType>(dstAddr->getType())->getAddressSpace();
+
+  // Cast pointers to (char *)
+  srcAddr = builder.CreateBitCast(srcAddr, Type::getInt8PtrTy(Context, srcAS));
+  dstAddr = builder.CreateBitCast(dstAddr, Type::getInt8PtrTy(Context, dstAS));
+
+  IRBuilder<> loop(loopBB);
+  // The loop index (ind) is a phi node.
+  PHINode *ind = loop.CreatePHI(indType, 0);
+  // Incoming value for ind is 0
+  ind->addIncoming(ConstantInt::get(indType, 0), origBB);
+
+  // load from srcAddr+ind
+  Value *val = loop.CreateLoad(loop.CreateGEP(srcAddr, ind), srcVolatile);
+  // store at dstAddr+ind
+  loop.CreateStore(val, loop.CreateGEP(dstAddr, ind), dstVolatile);
+
+  // The value for ind coming from backedge is (ind + 1)
+  Value *newind = loop.CreateAdd(ind, ConstantInt::get(indType, 1));
+  ind->addIncoming(newind, loopBB);
+
+  loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB);
+}
+
+// Lower MemSetInst to loop
+static void convertMemSetToLoop(Instruction *splitAt, Value *dstAddr,
+                                Value *len, Value *val, LLVMContext &Context,
+                                Function &F) {
+  BasicBlock *origBB = splitAt->getParent();
+  BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split");
+  BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB);
+
+  origBB->getTerminator()->setSuccessor(0, loopBB);
+  IRBuilder<> builder(origBB, origBB->getTerminator());
+
+  unsigned dstAS = cast<PointerType>(dstAddr->getType())->getAddressSpace();
+
+  // Cast pointer to the type of value getting stored
+  dstAddr = builder.CreateBitCast(dstAddr,
+                                  PointerType::get(val->getType(), dstAS));
+
+  IRBuilder<> loop(loopBB);
+  PHINode *ind = loop.CreatePHI(len->getType(), 0);
+  ind->addIncoming(ConstantInt::get(len->getType(), 0), origBB);
+
+  loop.CreateStore(val, loop.CreateGEP(dstAddr, ind), false);
+
+  Value *newind = loop.CreateAdd(ind, ConstantInt::get(len->getType(), 1));
+  ind->addIncoming(newind, loopBB);
+
+  loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB);
+}
+
+bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
+  SmallVector<LoadInst *, 4> aggrLoads;
+  SmallVector<MemTransferInst *, 4> aggrMemcpys;
+  SmallVector<MemSetInst *, 4> aggrMemsets;
+
+  TargetData *TD = &getAnalysis<TargetData>();
+  LLVMContext &Context = F.getParent()->getContext();
+
+  //
+  // Collect all the aggrLoads, aggrMemcpys and aggrMemsets.
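+  // A load qualifies only when it is at least MaxAggrCopySize bytes wide,
+  // has a single use, and that use is a store of the loaded value;
+  // memcpy/memmove and memset calls qualify when their length is either
+  // non-constant or at least MaxAggrCopySize.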
+  //
+  //const BasicBlock *firstBB = &F.front();  // first BB in F
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+    //BasicBlock *bb = BI;
+    for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
+         ++II) {
+      if (LoadInst *load = dyn_cast<LoadInst>(II)) {
+
+        if (!load->hasOneUse()) continue;
+
+        if (TD->getTypeStoreSize(load->getType()) < MaxAggrCopySize) continue;
+
+        User *use = *(load->use_begin());
+        if (StoreInst *store = dyn_cast<StoreInst>(use)) {
+          if (store->getOperand(0) != load) //getValueOperand
+            continue;
+          aggrLoads.push_back(load);
+        }
+      } else if (MemTransferInst *intr = dyn_cast<MemTransferInst>(II)) {
+        Value *len = intr->getLength();
+        // If the number of elements being copied is at least
+        // MaxAggrCopySize, lower it to a loop
+        if (ConstantInt *len_int = dyn_cast<ConstantInt>(len)) {
+          if (len_int->getZExtValue() >= MaxAggrCopySize) {
+            aggrMemcpys.push_back(intr);
+          }
+        } else {
+          // turn variable length memcpy/memmove into loop
+          aggrMemcpys.push_back(intr);
+        }
+      } else if (MemSetInst *memsetintr = dyn_cast<MemSetInst>(II)) {
+        Value *len = memsetintr->getLength();
+        if (ConstantInt *len_int = dyn_cast<ConstantInt>(len)) {
+          if (len_int->getZExtValue() >= MaxAggrCopySize) {
+            aggrMemsets.push_back(memsetintr);
+          }
+        } else {
+          // turn variable length memset into loop
+          aggrMemsets.push_back(memsetintr);
+        }
+      }
+    }
+  }
+  if (aggrLoads.empty() && aggrMemcpys.empty() && aggrMemsets.empty())
+    return false;
+
+  //
+  // Do the transformation of an aggr load/copy/set to a loop
+  //
+  for (unsigned i = 0, e = aggrLoads.size(); i != e; ++i) {
+    LoadInst *load = aggrLoads[i];
+    StoreInst *store = dyn_cast<StoreInst>(*load->use_begin());
+    Value *srcAddr = load->getOperand(0);
+    Value *dstAddr = store->getOperand(1);
+    unsigned numLoads = TD->getTypeStoreSize(load->getType());
+    Value *len = ConstantInt::get(Type::getInt32Ty(Context), numLoads);
+
+    convertTransferToLoop(store, srcAddr, dstAddr, len, load->isVolatile(),
+                          store->isVolatile(), Context, F);
+
+    store->eraseFromParent();
+    load->eraseFromParent();
+  }
+
+  for (unsigned i = 0, e = aggrMemcpys.size(); i != e; ++i) {
+    MemTransferInst *cpy = aggrMemcpys[i];
+    Value *len = cpy->getLength();
+    // The LLVM 2.7 form of memcpy does not carry a volatile operand yet,
+    // so we optimistically treat every transfer as non-volatile to avoid
+    // emitting unnecessary st.volatile in the PTX output.
+    convertTransferToLoop(cpy, cpy->getSource(), cpy->getDest(), len, false,
+                          false, Context, F);
+    cpy->eraseFromParent();
+  }
+
+  for (unsigned i = 0, e = aggrMemsets.size(); i != e; ++i) {
+    MemSetInst *memsetinst = aggrMemsets[i];
+    Value *len = memsetinst->getLength();
+    Value *val = memsetinst->getValue();
+    convertMemSetToLoop(memsetinst, memsetinst->getDest(), len, val, Context,
+                        F);
+    memsetinst->eraseFromParent();
+  }
+
+  return true;
+}
+
+FunctionPass *llvm::createLowerAggrCopies() {
+  return new NVPTXLowerAggrCopies();
+}
diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h new file mode 100644 index 0000000..ac7f150 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h @@ -0,0 +1,47 @@
+//===-- llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the NVIDIA specific lowering of +// aggregate copies +// +//===----------------------------------------------------------------------===// + +#ifndef NVPTX_LOWER_AGGR_COPIES_H +#define NVPTX_LOWER_AGGR_COPIES_H + +#include "llvm/Pass.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/Target/TargetData.h" + +namespace llvm { + +// actual analysis class, which is a functionpass +struct NVPTXLowerAggrCopies : public FunctionPass { + static char ID; + + NVPTXLowerAggrCopies() : FunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetData>(); + AU.addPreserved<MachineFunctionAnalysis>(); + } + + virtual bool runOnFunction(Function &F); + + static const unsigned MaxAggrCopySize = 128; + + virtual const char *getPassName() const { + return "Lower aggregate copies/intrinsics into loops"; + } +}; + +extern FunctionPass *createLowerAggrCopies(); +} + +#endif diff --git a/lib/Target/PTX/PTXMachineFunctionInfo.cpp b/lib/Target/NVPTX/NVPTXNumRegisters.h index 60acfc7..b4a4dbc 100644 --- a/lib/Target/PTX/PTXMachineFunctionInfo.cpp +++ b/lib/Target/NVPTX/NVPTXNumRegisters.h @@ -1,4 +1,5 @@ -//===-- PTXMachineFuctionInfo.cpp - PTX machine function info -------------===// + +//===-- NVPTXNumRegisters.h - PTX Register Info ---------------------------===// // // The LLVM Compiler Infrastructure // @@ -7,8 +8,13 @@ // //===----------------------------------------------------------------------===// -#include "PTXMachineFunctionInfo.h" +#ifndef NVPTX_NUM_REGISTERS_H +#define NVPTX_NUM_REGISTERS_H + +namespace llvm { + +const unsigned NVPTXNumRegisters = 396; -using namespace llvm; +} -void PTXMachineFunctionInfo::anchor() { } +#endif diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp new file mode 100644 index 0000000..e3cd46f --- /dev/null +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp @@ -0,0 +1,325 @@ +//===- NVPTXRegisterInfo.cpp - NVPTX Register Information -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the NVPTX implementation of the TargetRegisterInfo class. 
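+// It also provides helpers that map a register class to its PTX type string
+// (e.g. ".f32") and to its register-name prefix (e.g. "%f").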
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "nvptx-reg-info" + +#include "NVPTX.h" +#include "NVPTXRegisterInfo.h" +#include "NVPTXSubtarget.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/MC/MachineLocation.h" +#include "llvm/Target/TargetInstrInfo.h" + + +using namespace llvm; + +namespace llvm +{ +std::string getNVPTXRegClassName (TargetRegisterClass const *RC) { + if (RC == &NVPTX::Float32RegsRegClass) { + return ".f32"; + } + if (RC == &NVPTX::Float64RegsRegClass) { + return ".f64"; + } + else if (RC == &NVPTX::Int64RegsRegClass) { + return ".s64"; + } + else if (RC == &NVPTX::Int32RegsRegClass) { + return ".s32"; + } + else if (RC == &NVPTX::Int16RegsRegClass) { + return ".s16"; + } + // Int8Regs become 16-bit registers in PTX + else if (RC == &NVPTX::Int8RegsRegClass) { + return ".s16"; + } + else if (RC == &NVPTX::Int1RegsRegClass) { + return ".pred"; + } + else if (RC == &NVPTX::SpecialRegsRegClass) { + return "!Special!"; + } + else if (RC == &NVPTX::V2F32RegsRegClass) { + return ".v2.f32"; + } + else if (RC == &NVPTX::V4F32RegsRegClass) { + return ".v4.f32"; + } + else if (RC == &NVPTX::V2I32RegsRegClass) { + return ".v2.s32"; + } + else if (RC == &NVPTX::V4I32RegsRegClass) { + return ".v4.s32"; + } + else if (RC == &NVPTX::V2F64RegsRegClass) { + return ".v2.f64"; + } + else if (RC == &NVPTX::V2I64RegsRegClass) { + return ".v2.s64"; + } + else if (RC == &NVPTX::V2I16RegsRegClass) { + return ".v2.s16"; + } + else if (RC == &NVPTX::V4I16RegsRegClass) { + return ".v4.s16"; + } + else if (RC == &NVPTX::V2I8RegsRegClass) { + return ".v2.s16"; + } + else if (RC == &NVPTX::V4I8RegsRegClass) { + return ".v4.s16"; + } + else { + return "INTERNAL"; + } + return ""; +} + +std::string getNVPTXRegClassStr (TargetRegisterClass const *RC) { + if (RC == &NVPTX::Float32RegsRegClass) { + return "%f"; + } + if (RC == &NVPTX::Float64RegsRegClass) { + return "%fd"; + } + else if (RC == &NVPTX::Int64RegsRegClass) { + return "%rd"; + } + else if (RC == &NVPTX::Int32RegsRegClass) { + return "%r"; + } + else if (RC == &NVPTX::Int16RegsRegClass) { + return "%rs"; + } + else if (RC == &NVPTX::Int8RegsRegClass) { + return "%rc"; + } + else if (RC == &NVPTX::Int1RegsRegClass) { + return "%p"; + } + else if (RC == &NVPTX::SpecialRegsRegClass) { + return "!Special!"; + } + else if (RC == &NVPTX::V2F32RegsRegClass) { + return "%v2f"; + } + else if (RC == &NVPTX::V4F32RegsRegClass) { + return "%v4f"; + } + else if (RC == &NVPTX::V2I32RegsRegClass) { + return "%v2r"; + } + else if (RC == &NVPTX::V4I32RegsRegClass) { + return "%v4r"; + } + else if (RC == &NVPTX::V2F64RegsRegClass) { + return "%v2fd"; + } + else if (RC == &NVPTX::V2I64RegsRegClass) { + return "%v2rd"; + } + else if (RC == &NVPTX::V2I16RegsRegClass) { + return "%v2s"; + } + else if (RC == &NVPTX::V4I16RegsRegClass) { + return "%v4rs"; + } + else if (RC == &NVPTX::V2I8RegsRegClass) { + return "%v2rc"; + } + else if (RC == &NVPTX::V4I8RegsRegClass) { + return "%v4rc"; + } + else { + return "INTERNAL"; + } + return ""; +} + +bool isNVPTXVectorRegClass(TargetRegisterClass const *RC) { + if (RC->getID() == NVPTX::V2F32RegsRegClassID) + return true; + if (RC->getID() == NVPTX::V2F64RegsRegClassID) + return true; + if (RC->getID() == NVPTX::V2I16RegsRegClassID) + return true; + if (RC->getID() == NVPTX::V2I32RegsRegClassID) + return true; + if (RC->getID() 
== NVPTX::V2I64RegsRegClassID) + return true; + if (RC->getID() == NVPTX::V2I8RegsRegClassID) + return true; + if (RC->getID() == NVPTX::V4F32RegsRegClassID) + return true; + if (RC->getID() == NVPTX::V4I16RegsRegClassID) + return true; + if (RC->getID() == NVPTX::V4I32RegsRegClassID) + return true; + if (RC->getID() == NVPTX::V4I8RegsRegClassID) + return true; + return false; +} + +std::string getNVPTXElemClassName(TargetRegisterClass const *RC) { + if (RC->getID() == NVPTX::V2F32RegsRegClassID) + return getNVPTXRegClassName(&NVPTX::Float32RegsRegClass); + if (RC->getID() == NVPTX::V2F64RegsRegClassID) + return getNVPTXRegClassName(&NVPTX::Float64RegsRegClass); + if (RC->getID() == NVPTX::V2I16RegsRegClassID) + return getNVPTXRegClassName(&NVPTX::Int16RegsRegClass); + if (RC->getID() == NVPTX::V2I32RegsRegClassID) + return getNVPTXRegClassName(&NVPTX::Int32RegsRegClass); + if (RC->getID() == NVPTX::V2I64RegsRegClassID) + return getNVPTXRegClassName(&NVPTX::Int64RegsRegClass); + if (RC->getID() == NVPTX::V2I8RegsRegClassID) + return getNVPTXRegClassName(&NVPTX::Int8RegsRegClass); + if (RC->getID() == NVPTX::V4F32RegsRegClassID) + return getNVPTXRegClassName(&NVPTX::Float32RegsRegClass); + if (RC->getID() == NVPTX::V4I16RegsRegClassID) + return getNVPTXRegClassName(&NVPTX::Int16RegsRegClass); + if (RC->getID() == NVPTX::V4I32RegsRegClassID) + return getNVPTXRegClassName(&NVPTX::Int32RegsRegClass); + if (RC->getID() == NVPTX::V4I8RegsRegClassID) + return getNVPTXRegClassName(&NVPTX::Int8RegsRegClass); + llvm_unreachable("Not a vector register class"); +} + +const TargetRegisterClass *getNVPTXElemClass(TargetRegisterClass const *RC) { + if (RC->getID() == NVPTX::V2F32RegsRegClassID) + return (&NVPTX::Float32RegsRegClass); + if (RC->getID() == NVPTX::V2F64RegsRegClassID) + return (&NVPTX::Float64RegsRegClass); + if (RC->getID() == NVPTX::V2I16RegsRegClassID) + return (&NVPTX::Int16RegsRegClass); + if (RC->getID() == NVPTX::V2I32RegsRegClassID) + return (&NVPTX::Int32RegsRegClass); + if (RC->getID() == NVPTX::V2I64RegsRegClassID) + return (&NVPTX::Int64RegsRegClass); + if (RC->getID() == NVPTX::V2I8RegsRegClassID) + return (&NVPTX::Int8RegsRegClass); + if (RC->getID() == NVPTX::V4F32RegsRegClassID) + return (&NVPTX::Float32RegsRegClass); + if (RC->getID() == NVPTX::V4I16RegsRegClassID) + return (&NVPTX::Int16RegsRegClass); + if (RC->getID() == NVPTX::V4I32RegsRegClassID) + return (&NVPTX::Int32RegsRegClass); + if (RC->getID() == NVPTX::V4I8RegsRegClassID) + return (&NVPTX::Int8RegsRegClass); + llvm_unreachable("Not a vector register class"); +} + +int getNVPTXVectorSize(TargetRegisterClass const *RC) { + if (RC->getID() == NVPTX::V2F32RegsRegClassID) + return 2; + if (RC->getID() == NVPTX::V2F64RegsRegClassID) + return 2; + if (RC->getID() == NVPTX::V2I16RegsRegClassID) + return 2; + if (RC->getID() == NVPTX::V2I32RegsRegClassID) + return 2; + if (RC->getID() == NVPTX::V2I64RegsRegClassID) + return 2; + if (RC->getID() == NVPTX::V2I8RegsRegClassID) + return 2; + if (RC->getID() == NVPTX::V4F32RegsRegClassID) + return 4; + if (RC->getID() == NVPTX::V4I16RegsRegClassID) + return 4; + if (RC->getID() == NVPTX::V4I32RegsRegClassID) + return 4; + if (RC->getID() == NVPTX::V4I8RegsRegClassID) + return 4; + llvm_unreachable("Not a vector register class"); +} +} + +NVPTXRegisterInfo::NVPTXRegisterInfo(const TargetInstrInfo &tii, + const NVPTXSubtarget &st) + : NVPTXGenRegisterInfo(0), + Is64Bit(st.is64Bit()) {} + +#define GET_REGINFO_TARGET_DESC +#include "NVPTXGenRegisterInfo.inc" + +/// NVPTX Callee 
Saved Registers
+const uint16_t* NVPTXRegisterInfo::
+getCalleeSavedRegs(const MachineFunction *MF) const {
+  static const uint16_t CalleeSavedRegs[] = { 0 };
+  return CalleeSavedRegs;
+}
+
+// NVPTX Callee Saved Reg Classes
+const TargetRegisterClass* const*
+NVPTXRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+  static const TargetRegisterClass * const CalleeSavedRegClasses[] = { 0 };
+  return CalleeSavedRegClasses;
+}
+
+BitVector NVPTXRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  return Reserved;
+}
+
+void NVPTXRegisterInfo::
+eliminateFrameIndex(MachineBasicBlock::iterator II,
+                    int SPAdj,
+                    RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  unsigned i = 0;
+  MachineInstr &MI = *II;
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() &&
+           "Instr doesn't have FrameIndex operand!");
+  }
+
+  int FrameIndex = MI.getOperand(i).getIndex();
+
+  MachineFunction &MF = *MI.getParent()->getParent();
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+               MI.getOperand(i+1).getImm();
+
+  // Use VRFrame (printed as %SP) as the frame pointer
+  MI.getOperand(i).ChangeToRegister(NVPTX::VRFrame, false);
+  MI.getOperand(i+1).ChangeToImmediate(Offset);
+}
+
+
+int NVPTXRegisterInfo::
+getDwarfRegNum(unsigned RegNum, bool isEH) const {
+  return 0;
+}
+
+unsigned NVPTXRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  return NVPTX::VRFrame;
+}
+
+unsigned NVPTXRegisterInfo::getRARegister() const {
+  return 0;
+}
+
+// This function eliminates ADJCALLSTACKDOWN,
+// ADJCALLSTACKUP pseudo instructions
+void NVPTXRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  // Simply discard ADJCALLSTACKDOWN,
+  // ADJCALLSTACKUP instructions.
+  MBB.erase(I);
+}
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.h b/lib/Target/NVPTX/NVPTXRegisterInfo.h new file mode 100644 index 0000000..5951783 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.h @@ -0,0 +1,92 @@
+//===- NVPTXRegisterInfo.h - NVPTX Register Information Impl ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXREGISTERINFO_H
+#define NVPTXREGISTERINFO_H
+
+#include "ManagedStringPool.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+
+#define GET_REGINFO_HEADER
+#include "NVPTXGenRegisterInfo.inc"
+#include <sstream>
+
+namespace llvm {
+
+// Forward Declarations.
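+// (TargetInstrInfo and NVPTXSubtarget are referenced only by the constructor
+// signature below, so forward declarations are sufficient here.)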
+class TargetInstrInfo; +class NVPTXSubtarget; + +class NVPTXRegisterInfo : public NVPTXGenRegisterInfo { +private: + bool Is64Bit; + // Hold Strings that can be free'd all together with NVPTXRegisterInfo + ManagedStringPool ManagedStrPool; + +public: + NVPTXRegisterInfo(const TargetInstrInfo &tii, + const NVPTXSubtarget &st); + + + //------------------------------------------------------ + // Pure virtual functions from TargetRegisterInfo + //------------------------------------------------------ + + // NVPTX callee saved registers + virtual const uint16_t* + getCalleeSavedRegs(const MachineFunction *MF = 0) const; + + // NVPTX callee saved register classes + virtual const TargetRegisterClass* const * + getCalleeSavedRegClasses(const MachineFunction *MF) const; + + virtual BitVector getReservedRegs(const MachineFunction &MF) const; + + virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, + RegScavenger *RS=NULL) const; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + + virtual int getDwarfRegNum(unsigned RegNum, bool isEH) const; + virtual unsigned getFrameRegister(const MachineFunction &MF) const; + virtual unsigned getRARegister() const; + + ManagedStringPool *getStrPool() const { + return const_cast<ManagedStringPool *>(&ManagedStrPool); + } + + const char *getName(unsigned RegNo) const { + std::stringstream O; + O << "reg" << RegNo; + return getStrPool()->getManagedString(O.str().c_str())->c_str(); + } + +}; + + +std::string getNVPTXRegClassName (const TargetRegisterClass *RC); +std::string getNVPTXRegClassStr (const TargetRegisterClass *RC); +bool isNVPTXVectorRegClass (const TargetRegisterClass *RC); +std::string getNVPTXElemClassName (const TargetRegisterClass *RC); +int getNVPTXVectorSize (const TargetRegisterClass *RC); +const TargetRegisterClass *getNVPTXElemClass(const TargetRegisterClass *RC); + +} // end namespace llvm + + +#endif diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.td b/lib/Target/NVPTX/NVPTXRegisterInfo.td new file mode 100644 index 0000000..ba15825 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.td @@ -0,0 +1,108 @@ +//===-- NVPTXRegisterInfo.td - NVPTX Register defs ---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Declarations that describe the PTX register file +//===----------------------------------------------------------------------===// + +class NVPTXReg<string n> : Register<n> { + let Namespace = "NVPTX"; +} + +class NVPTXRegClass<list<ValueType> regTypes, int alignment, dag regList> + : RegisterClass <"NVPTX", regTypes, alignment, regList>; + +//===----------------------------------------------------------------------===// +// Registers +//===----------------------------------------------------------------------===// + +// Special Registers used as stack pointer +def VRFrame : NVPTXReg<"%SP">; +def VRFrameLocal : NVPTXReg<"%SPL">; + +// Special Registers used as the stack +def VRDepot : NVPTXReg<"%Depot">; + +foreach i = 0-395 in { + def P#i : NVPTXReg<"%p"#i>; // Predicate + def RC#i : NVPTXReg<"%rc"#i>; // 8-bit + def RS#i : NVPTXReg<"%rs"#i>; // 16-bit + def R#i : NVPTXReg<"%r"#i>; // 32-bit + def RL#i : NVPTXReg<"%rl"#i>; // 64-bit + def F#i : NVPTXReg<"%f"#i>; // 32-bit float + def FL#i : NVPTXReg<"%fl"#i>; // 64-bit float + // Vectors + foreach s = [ "2b8", "2b16", "2b32", "2b64", "4b8", "4b16", "4b32" ] in + def v#s#_#i : NVPTXReg<"%v"#s#"_"#i>; + + // Arguments + def ia#i : NVPTXReg<"%ia"#i>; + def la#i : NVPTXReg<"%la"#i>; + def fa#i : NVPTXReg<"%fa"#i>; + def da#i : NVPTXReg<"%da"#i>; +} + +//===----------------------------------------------------------------------===// +// Register classes +//===----------------------------------------------------------------------===// +def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 395))>; +def Int8Regs : NVPTXRegClass<[i8], 8, (add (sequence "RC%u", 0, 395))>; +def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%u", 0, 395))>; +def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%u", 0, 395))>; +def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 395))>; +def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 395))>; +def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%u", 0, 395))>; +def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%u", 0, 395))>; +def Int64ArgRegs : NVPTXRegClass<[i64], 64, (add (sequence "la%u", 0, 395))>; +def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%u", 0, 395))>; +def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 395))>; + +// Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used. 
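+// (eliminateFrameIndex there rewrites a frame-index operand to VRFrame plus
+// an immediate offset, so stack accesses are addressed off %SP.)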
+def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot)>; + +class NVPTXVecRegClass<list<ValueType> regTypes, int alignment, dag regList, + NVPTXRegClass sClass, + int e, + string n> + : NVPTXRegClass<regTypes, alignment, regList> +{ + NVPTXRegClass scalarClass=sClass; + int elems=e; + string name=n; +} +def V2F32Regs + : NVPTXVecRegClass<[v2f32], 64, (add (sequence "v2b32_%u", 0, 395)), + Float32Regs, 2, ".v2.f32">; +def V4F32Regs + : NVPTXVecRegClass<[v4f32], 128, (add (sequence "v4b32_%u", 0, 395)), + Float32Regs, 4, ".v4.f32">; +def V2I32Regs + : NVPTXVecRegClass<[v2i32], 64, (add (sequence "v2b32_%u", 0, 395)), + Int32Regs, 2, ".v2.u32">; +def V4I32Regs + : NVPTXVecRegClass<[v4i32], 128, (add (sequence "v4b32_%u", 0, 395)), + Int32Regs, 4, ".v4.u32">; +def V2F64Regs + : NVPTXVecRegClass<[v2f64], 128, (add (sequence "v2b64_%u", 0, 395)), + Float64Regs, 2, ".v2.f64">; +def V2I64Regs + : NVPTXVecRegClass<[v2i64], 128, (add (sequence "v2b64_%u", 0, 395)), + Int64Regs, 2, ".v2.u64">; +def V2I16Regs + : NVPTXVecRegClass<[v2i16], 32, (add (sequence "v2b16_%u", 0, 395)), + Int16Regs, 2, ".v2.u16">; +def V4I16Regs + : NVPTXVecRegClass<[v4i16], 64, (add (sequence "v4b16_%u", 0, 395)), + Int16Regs, 4, ".v4.u16">; +def V2I8Regs + : NVPTXVecRegClass<[v2i8], 16, (add (sequence "v2b8_%u", 0, 395)), + Int8Regs, 2, ".v2.u8">; +def V4I8Regs + : NVPTXVecRegClass<[v4i8], 32, (add (sequence "v4b8_%u", 0, 395)), + Int8Regs, 4, ".v4.u8">; diff --git a/lib/Target/NVPTX/NVPTXSection.h b/lib/Target/NVPTX/NVPTXSection.h new file mode 100644 index 0000000..f1ca466 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXSection.h @@ -0,0 +1,45 @@ +//===- NVPTXSection.h - NVPTX-specific section representation -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the NVPTXSection class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_NVPTXSECTION_H +#define LLVM_NVPTXSECTION_H + +#include "llvm/MC/MCSection.h" +#include "llvm/GlobalVariable.h" +#include <vector> + +namespace llvm { +/// NVPTXSection - Represents a section in PTX +/// PTX does not have sections. We create this class in order to use +/// the ASMPrint interface. +/// +class NVPTXSection : public MCSection { + +public: + NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K) {} + ~NVPTXSection() {} + + /// Override this as NVPTX has its own way of printing switching + /// to a section. + virtual void PrintSwitchToSection(const MCAsmInfo &MAI, + raw_ostream &OS) const {} + + /// Base address of PTX sections is zero. + virtual bool isBaseAddressKnownZero() const { return true; } + virtual bool UseCodeAlign() const { return false; } + virtual bool isVirtualSection() const { return false; } +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp b/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp new file mode 100644 index 0000000..2836cad --- /dev/null +++ b/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp @@ -0,0 +1,77 @@ +//===- NVPTXSplitBBatBar.cpp - Split BB at Barrier --*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+// Split basic blocks so that a basic block that contains a barrier instruction
+// only contains the barrier instruction.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Support/InstIterator.h"
+#include "NVPTXUtilities.h"
+#include "NVPTXSplitBBatBar.h"
+
+using namespace llvm;
+
+namespace llvm {
+FunctionPass *createSplitBBatBarPass();
+}
+
+char NVPTXSplitBBatBar::ID = 0;
+
+bool NVPTXSplitBBatBar::runOnFunction(Function &F) {
+
+  SmallVector<Instruction *, 4> SplitPoints;
+  bool changed = false;
+
+  // Collect all the split points in SplitPoints
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+    BasicBlock::iterator IB = BI->begin();
+    BasicBlock::iterator II = IB;
+    BasicBlock::iterator IE = BI->end();
+
+    // Skip the first instruction. No splitting is needed at this
+    // point even if this is a bar.
+    while (II != IE) {
+      if (IntrinsicInst *inst = dyn_cast<IntrinsicInst>(II)) {
+        Intrinsic::ID id = inst->getIntrinsicID();
+        // If this is a barrier, split at this instruction
+        // and the next instruction.
+        if (llvm::isBarrierIntrinsic(id)) {
+          if (II != IB)
+            SplitPoints.push_back(II);
+          II++;
+          if ((II != IE) && (!II->isTerminator())) {
+            SplitPoints.push_back(II);
+            II++;
+          }
+          continue;
+        }
+      }
+      II++;
+    }
+  }
+
+  for (unsigned i = 0; i != SplitPoints.size(); i++) {
+    changed = true;
+    Instruction *inst = SplitPoints[i];
+    inst->getParent()->splitBasicBlock(inst, "bar_split");
+  }
+
+  return changed;
+}
+
+// This interface will most likely not be necessary, because this pass will
+// not be invoked by the driver, but will be used as a prerequisite to
+// another pass.
+FunctionPass *llvm::createSplitBBatBarPass() {
+  return new NVPTXSplitBBatBar();
+}
diff --git a/lib/Target/NVPTX/NVPTXSplitBBatBar.h b/lib/Target/NVPTX/NVPTXSplitBBatBar.h new file mode 100644 index 0000000..9e4d5a0 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXSplitBBatBar.h @@ -0,0 +1,41 @@
+//===-- llvm/lib/Target/NVPTX/NVPTXSplitBBatBar.h ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVIDIA-specific declarations for splitting basic
+// blocks at barrier instructions.
+// +//===----------------------------------------------------------------------===// + +#ifndef NVPTX_SPLIT_BB_AT_BAR_H +#define NVPTX_SPLIT_BB_AT_BAR_H + +#include "llvm/Pass.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" + +namespace llvm { + +// actual analysis class, which is a functionpass +struct NVPTXSplitBBatBar : public FunctionPass { + static char ID; + + NVPTXSplitBBatBar() : FunctionPass(ID) {} + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved<MachineFunctionAnalysis>(); + } + virtual bool runOnFunction(Function &F); + + virtual const char *getPassName() const { + return "Split basic blocks at barrier"; + } +}; + +extern FunctionPass *createSplitBBatBarPass(); +} + +#endif //NVPTX_SPLIT_BB_AT_BAR_H diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp new file mode 100644 index 0000000..6aadd43 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -0,0 +1,57 @@ +//===- NVPTXSubtarget.cpp - NVPTX Subtarget Information -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the NVPTX specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#include "NVPTXSubtarget.h" +#define GET_SUBTARGETINFO_ENUM +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#include "NVPTXGenSubtargetInfo.inc" + +using namespace llvm; + +// Select Driver Interface +#include "llvm/Support/CommandLine.h" +namespace { +cl::opt<NVPTX::DrvInterface> +DriverInterface(cl::desc("Choose driver interface:"), + cl::values( + clEnumValN(NVPTX::NVCL, "drvnvcl", "Nvidia OpenCL driver"), + clEnumValN(NVPTX::CUDA, "drvcuda", "Nvidia CUDA driver"), + clEnumValN(NVPTX::TEST, "drvtest", "Plain Test"), + clEnumValEnd), + cl::init(NVPTX::NVCL)); +} + +NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, bool is64Bit) +:NVPTXGenSubtargetInfo(TT, "", FS), // Don't pass CPU to subtarget, + // because we don't register all + // nvptx targets. + Is64Bit(is64Bit) { + + drvInterface = DriverInterface; + + // Provide the default CPU if none + std::string defCPU = "sm_10"; + + // Get the TargetName from the FS if available + if (FS.empty() && CPU.empty()) + TargetName = defCPU; + else if (!CPU.empty()) + TargetName = CPU; + else + llvm_unreachable("we are not using FeatureStr"); + + // Set up the SmVersion + SmVersion = atoi(TargetName.c_str()+3); +} diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h new file mode 100644 index 0000000..8f2a629 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXSubtarget.h @@ -0,0 +1,92 @@ +//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the NVPTX specific subclass of TargetSubtarget. 
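+// The SM version parsed from the CPU name (e.g. "sm_20" -> 20) drives all of
+// the feature predicates below; hasLDU() and hasGenericLdSt(), for example,
+// both require sm_20 or later.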
+// +//===----------------------------------------------------------------------===// + +#ifndef NVPTXSUBTARGET_H +#define NVPTXSUBTARGET_H + +#include "llvm/Target/TargetSubtargetInfo.h" +#include "NVPTX.h" + +#define GET_SUBTARGETINFO_HEADER +#include "NVPTXGenSubtargetInfo.inc" + +#include <string> + +namespace llvm { + +class NVPTXSubtarget : public NVPTXGenSubtargetInfo { + + unsigned int SmVersion; + std::string TargetName; + NVPTX::DrvInterface drvInterface; + bool dummy; // For the 'dummy' feature, see NVPTX.td + bool Is64Bit; + +public: + /// This constructor initializes the data members to match that + /// of the specified module. + /// + NVPTXSubtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, bool is64Bit); + + bool hasBrkPt() const { return SmVersion >= 11; } + bool hasAtomRedG32() const { return SmVersion >= 11; } + bool hasAtomRedS32() const { return SmVersion >= 12; } + bool hasAtomRedG64() const { return SmVersion >= 12; } + bool hasAtomRedS64() const { return SmVersion >= 20; } + bool hasAtomRedGen32() const { return SmVersion >= 20; } + bool hasAtomRedGen64() const { return SmVersion >= 20; } + bool hasAtomAddF32() const { return SmVersion >= 20; } + bool hasVote() const { return SmVersion >= 12; } + bool hasDouble() const { return SmVersion >= 13; } + bool reqPTX20() const { return SmVersion >= 20; } + bool hasF32FTZ() const { return SmVersion >= 20; } + bool hasFMAF32() const { return SmVersion >= 20; } + bool hasFMAF64() const { return SmVersion >= 13; } + bool hasLDU() const { return SmVersion >= 20; } + bool hasGenericLdSt() const { return SmVersion >= 20; } + inline bool hasHWROT32() const { return false; } + inline bool hasSWROT32() const { + return true; + } + inline bool hasROT32() const { return hasHWROT32() || hasSWROT32() ; } + inline bool hasROT64() const { return SmVersion >= 20; } + + + bool is64Bit() const { return Is64Bit; } + + unsigned int getSmVersion() const { return SmVersion; } + NVPTX::DrvInterface getDrvInterface() const { return drvInterface; } + std::string getTargetName() const { return TargetName; } + + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + std::string getDataLayout() const { + const char *p; + if (is64Bit()) + p = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-" + "f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-" + "n16:32:64"; + else + p = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-" + "f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-" + "n16:32:64"; + + return std::string(p); + } + +}; + +} // End llvm namespace + +#endif // NVPTXSUBTARGET_H diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp new file mode 100644 index 0000000..433f415 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -0,0 +1,133 @@ +//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Top-level implementation for the NVPTX target. 
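+// It registers the 32-bit and 64-bit target machines and schedules the
+// NVPTX-specific IR passes (aggregate-copy lowering, splitting basic blocks
+// at barriers, alloca hoisting) to run ahead of instruction selection.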
+// +//===----------------------------------------------------------------------===// + +#include "NVPTXTargetMachine.h" +#include "NVPTX.h" +#include "NVPTXSplitBBatBar.h" +#include "NVPTXLowerAggrCopies.h" +#include "MCTargetDesc/NVPTXMCAsmInfo.h" +#include "NVPTXAllocaHoisting.h" +#include "llvm/PassManager.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/Assembly/PrintModulePass.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/TargetRegistry.h" + + +using namespace llvm; + + +extern "C" void LLVMInitializeNVPTXTarget() { + // Register the target. + RegisterTargetMachine<NVPTXTargetMachine32> X(TheNVPTXTarget32); + RegisterTargetMachine<NVPTXTargetMachine64> Y(TheNVPTXTarget64); + + RegisterMCAsmInfo<NVPTXMCAsmInfo> A(TheNVPTXTarget32); + RegisterMCAsmInfo<NVPTXMCAsmInfo> B(TheNVPTXTarget64); + +} + +NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, + StringRef TT, + StringRef CPU, + StringRef FS, + const TargetOptions& Options, + Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL, + bool is64bit) +: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, is64bit), + DataLayout(Subtarget.getDataLayout()), + InstrInfo(*this), TLInfo(*this), TSInfo(*this), FrameLowering(*this,is64bit) +/*FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0)*/ { +} + + + +void NVPTXTargetMachine32::anchor() {} + +NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) +: NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) { +} + +void NVPTXTargetMachine64::anchor() {} + +NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) +: NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) { +} + + +namespace llvm { +class NVPTXPassConfig : public TargetPassConfig { +public: + NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM) + : TargetPassConfig(TM, PM) {} + + NVPTXTargetMachine &getNVPTXTargetMachine() const { + return getTM<NVPTXTargetMachine>(); + } + + virtual bool addInstSelector(); + virtual bool addPreRegAlloc(); +}; +} + +TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { + NVPTXPassConfig *PassConfig = new NVPTXPassConfig(this, PM); + return PassConfig; +} + +bool NVPTXPassConfig::addInstSelector() { + addPass(createLowerAggrCopies()); + addPass(createSplitBBatBarPass()); + addPass(createAllocaHoisting()); + 
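+  // Note the ordering: the three IR-level passes above must run before the
+  // selector created below, so that instruction selection sees the
+  // normalized form.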
addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel())); + addPass(createVectorElementizePass(getNVPTXTargetMachine())); + return false; +} + +bool NVPTXPassConfig::addPreRegAlloc() { + return false; +} diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h new file mode 100644 index 0000000..b3f9cac --- /dev/null +++ b/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -0,0 +1,125 @@ +//===-- NVPTXTargetMachine.h - Define TargetMachine for NVPTX ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the NVPTX specific subclass of TargetMachine. +// +//===----------------------------------------------------------------------===// + + +#ifndef NVPTX_TARGETMACHINE_H +#define NVPTX_TARGETMACHINE_H + +#include "NVPTXInstrInfo.h" +#include "NVPTXISelLowering.h" +#include "NVPTXRegisterInfo.h" +#include "NVPTXSubtarget.h" +#include "NVPTXFrameLowering.h" +#include "ManagedStringPool.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSelectionDAGInfo.h" + +namespace llvm { + +/// NVPTXTargetMachine +/// +class NVPTXTargetMachine : public LLVMTargetMachine { + NVPTXSubtarget Subtarget; + const TargetData DataLayout; // Calculates type size & alignment + NVPTXInstrInfo InstrInfo; + NVPTXTargetLowering TLInfo; + TargetSelectionDAGInfo TSInfo; + + // NVPTX does not have any call stack frame, but need a NVPTX specific + // FrameLowering class because TargetFrameLowering is abstract. + NVPTXFrameLowering FrameLowering; + + // Hold Strings that can be free'd all together with NVPTXTargetMachine + ManagedStringPool ManagedStrPool; + + //bool addCommonCodeGenPasses(PassManagerBase &, CodeGenOpt::Level, + // bool DisableVerify, MCContext *&OutCtx); + +public: + NVPTXTargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OP, + bool is64bit); + + virtual const TargetFrameLowering *getFrameLowering() const { + return &FrameLowering; + } + virtual const NVPTXInstrInfo *getInstrInfo() const { return &InstrInfo; } + virtual const TargetData *getTargetData() const { return &DataLayout;} + virtual const NVPTXSubtarget *getSubtargetImpl() const { return &Subtarget;} + + virtual const NVPTXRegisterInfo *getRegisterInfo() const { + return &(InstrInfo.getRegisterInfo()); + } + + virtual NVPTXTargetLowering *getTargetLowering() const { + return const_cast<NVPTXTargetLowering*>(&TLInfo); + } + + virtual const TargetSelectionDAGInfo *getSelectionDAGInfo() const { + return &TSInfo; + } + + //virtual bool addInstSelector(PassManagerBase &PM, + // CodeGenOpt::Level OptLevel); + + //virtual bool addPreRegAlloc(PassManagerBase &, CodeGenOpt::Level); + + ManagedStringPool *getManagedStrPool() const { + return const_cast<ManagedStringPool*>(&ManagedStrPool); + } + + virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); + + // Emission of machine code through JITCodeEmitter is not supported. + virtual bool addPassesToEmitMachineCode(PassManagerBase &, + JITCodeEmitter &, + bool = true) { + return true; + } + + // Emission of machine code through MCJIT is not supported. 
+  virtual bool addPassesToEmitMC(PassManagerBase &,
+                                 MCContext *&,
+                                 raw_ostream &,
+                                 bool = true) {
+    return true;
+  }
+
+}; // NVPTXTargetMachine.
+
+class NVPTXTargetMachine32 : public NVPTXTargetMachine {
+  virtual void anchor();
+public:
+  NVPTXTargetMachine32(const Target &T, StringRef TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+};
+
+class NVPTXTargetMachine64 : public NVPTXTargetMachine {
+  virtual void anchor();
+public:
+  NVPTXTargetMachine64(const Target &T, StringRef TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
new file mode 100644
index 0000000..b5698a2
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -0,0 +1,105 @@
+//===-- NVPTXTargetObjectFile.h - NVPTX Object Info -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_NVPTX_TARGETOBJECTFILE_H
+#define LLVM_TARGET_NVPTX_TARGETOBJECTFILE_H
+
+#include "NVPTXSection.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include <string>
+
+namespace llvm {
+class GlobalVariable;
+class Module;
+
+class NVPTXTargetObjectFile : public TargetLoweringObjectFile {
+
+public:
+  NVPTXTargetObjectFile() {}
+  ~NVPTXTargetObjectFile() {
+    delete TextSection;
+    delete DataSection;
+    delete BSSSection;
+    delete ReadOnlySection;
+
+    delete StaticCtorSection;
+    delete StaticDtorSection;
+    delete LSDASection;
+    delete EHFrameSection;
+    delete DwarfAbbrevSection;
+    delete DwarfInfoSection;
+    delete DwarfLineSection;
+    delete DwarfFrameSection;
+    delete DwarfPubTypesSection;
+    delete DwarfDebugInlineSection;
+    delete DwarfStrSection;
+    delete DwarfLocSection;
+    delete DwarfARangesSection;
+    delete DwarfRangesSection;
+    delete DwarfMacroInfoSection;
+  }
+
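+  // Initialize() below replaces the base class's usual object-file sections
+  // with dummy NVPTXSection placeholders: PTX output is plain text with no
+  // real sections, but common AsmPrinter code still expects these pointers
+  // to be non-null.  The destructor above then frees the placeholders.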
+  virtual void Initialize(MCContext &ctx, const TargetMachine &TM) {
+    TextSection = new NVPTXSection(MCSection::SV_ELF,
+                                   SectionKind::getText());
+    DataSection = new NVPTXSection(MCSection::SV_ELF,
+                                   SectionKind::getDataRel());
+    BSSSection = new NVPTXSection(MCSection::SV_ELF,
+                                  SectionKind::getBSS());
+    ReadOnlySection = new NVPTXSection(MCSection::SV_ELF,
+                                       SectionKind::getReadOnly());
+
+    StaticCtorSection = new NVPTXSection(MCSection::SV_ELF,
+                                         SectionKind::getMetadata());
+    StaticDtorSection = new NVPTXSection(MCSection::SV_ELF,
+                                         SectionKind::getMetadata());
+    LSDASection = new NVPTXSection(MCSection::SV_ELF,
+                                   SectionKind::getMetadata());
+    EHFrameSection = new NVPTXSection(MCSection::SV_ELF,
+                                      SectionKind::getMetadata());
+    DwarfAbbrevSection = new NVPTXSection(MCSection::SV_ELF,
+                                          SectionKind::getMetadata());
+    DwarfInfoSection = new NVPTXSection(MCSection::SV_ELF,
+                                        SectionKind::getMetadata());
+    DwarfLineSection = new NVPTXSection(MCSection::SV_ELF,
+                                        SectionKind::getMetadata());
+    DwarfFrameSection = new NVPTXSection(MCSection::SV_ELF,
+                                         SectionKind::getMetadata());
+    DwarfPubTypesSection = new NVPTXSection(MCSection::SV_ELF,
+                                            SectionKind::getMetadata());
+    DwarfDebugInlineSection = new NVPTXSection(MCSection::SV_ELF,
+                                               SectionKind::getMetadata());
+    DwarfStrSection = new NVPTXSection(MCSection::SV_ELF,
+                                       SectionKind::getMetadata());
+    DwarfLocSection = new NVPTXSection(MCSection::SV_ELF,
+                                       SectionKind::getMetadata());
+    DwarfARangesSection = new NVPTXSection(MCSection::SV_ELF,
+                                           SectionKind::getMetadata());
+    DwarfRangesSection = new NVPTXSection(MCSection::SV_ELF,
+                                          SectionKind::getMetadata());
+    DwarfMacroInfoSection = new NVPTXSection(MCSection::SV_ELF,
+                                             SectionKind::getMetadata());
+  }
+
+  virtual const MCSection *getSectionForConstant(SectionKind Kind) const {
+    return ReadOnlySection;
+  }
+
+  virtual const MCSection *
+  getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
+                           Mangler *Mang,
+                           const TargetMachine &TM) const {
+    return DataSection;
+  }
+
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/NVPTX/NVPTXUtilities.cpp b/lib/Target/NVPTX/NVPTXUtilities.cpp
new file mode 100644
index 0000000..3f52251
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -0,0 +1,514 @@
+//===- NVPTXUtilities.cpp - Utility Functions -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains miscellaneous utility functions.
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXUtilities.h"
+#include "NVPTX.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#include "llvm/Constants.h"
+#include "llvm/Operator.h"
+#include <algorithm>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+//#include <iostream>
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/InstIterator.h"
+
+using namespace llvm;
+
+typedef std::map<std::string, std::vector<unsigned> > key_val_pair_t;
+typedef std::map<const GlobalValue *, key_val_pair_t> global_val_annot_t;
+typedef std::map<const Module *, global_val_annot_t> per_module_annot_t;
+
+ManagedStatic<per_module_annot_t> annotationCache;
+
+static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
+  assert(md && "Invalid mdnode for annotation");
+  assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands");
+  // start index = 1, to skip the global variable key
+  // increment = 2, to skip the value of each property-value pair
+  for (unsigned i = 1, e = md->getNumOperands(); i != e; i += 2) {
+    // property
+    const MDString *prop = dyn_cast<MDString>(md->getOperand(i));
+    assert(prop && "Annotation property not a string");
+
+    // value
+    ConstantInt *Val = dyn_cast<ConstantInt>(md->getOperand(i+1));
+    assert(Val && "Value operand not a constant int");
+
+    std::string keyname = prop->getString().str();
+    if (retval.find(keyname) != retval.end())
+      retval[keyname].push_back(Val->getZExtValue());
+    else {
+      std::vector<unsigned> tmp;
+      tmp.push_back(Val->getZExtValue());
+      retval[keyname] = tmp;
+    }
+  }
+}
+
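+// For reference, an nvvm.annotations entry has the shape decoded above: the
+// annotated entity first, then property/value pairs.  A hand-written sketch
+// (the kernel name is made up):
+//
+//   !nvvm.annotations = !{!0}
+//   !0 = metadata !{void (float*)* @my_kernel, metadata !"kernel", i32 1}
+//
+// For @my_kernel this would be cached as {"kernel" -> [1]}.
+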
+static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
+  NamedMDNode *NMD = m->getNamedMetadata(llvm::NamedMDForAnnotations);
+  if (!NMD)
+    return;
+  key_val_pair_t tmp;
+  for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+    const MDNode *elem = NMD->getOperand(i);
+
+    Value *entity = elem->getOperand(0);
+    // entity may be null due to DCE
+    if (!entity)
+      continue;
+    if (entity != gv)
+      continue;
+
+    // accumulate annotations for entity in tmp
+    cacheAnnotationFromMD(elem, tmp);
+  }
+
+  if (tmp.empty())   // no annotations for this gv
+    return;
+
+  if ((*annotationCache).find(m) != (*annotationCache).end())
+    (*annotationCache)[m][gv] = tmp;
+  else {
+    global_val_annot_t tmp1;
+    tmp1[gv] = tmp;
+    (*annotationCache)[m] = tmp1;
+  }
+}
+
+bool llvm::findOneNVVMAnnotation(const GlobalValue *gv, std::string prop,
+                                 unsigned &retval) {
+  const Module *m = gv->getParent();
+  if ((*annotationCache).find(m) == (*annotationCache).end())
+    cacheAnnotationFromMD(m, gv);
+  else if ((*annotationCache)[m].find(gv) == (*annotationCache)[m].end())
+    cacheAnnotationFromMD(m, gv);
+  if ((*annotationCache)[m][gv].find(prop) == (*annotationCache)[m][gv].end())
+    return false;
+  retval = (*annotationCache)[m][gv][prop][0];
+  return true;
+}
+
+bool llvm::findAllNVVMAnnotation(const GlobalValue *gv, std::string prop,
+                                 std::vector<unsigned> &retval) {
+  const Module *m = gv->getParent();
+  if ((*annotationCache).find(m) == (*annotationCache).end())
+    cacheAnnotationFromMD(m, gv);
+  else if ((*annotationCache)[m].find(gv) == (*annotationCache)[m].end())
+    cacheAnnotationFromMD(m, gv);
+  if ((*annotationCache)[m][gv].find(prop) == (*annotationCache)[m][gv].end())
+    return false;
+  retval = (*annotationCache)[m][gv][prop];
+  return true;
+}
+
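+// A minimal usage sketch for the two queries above (hypothetical caller; the
+// real callers below obtain the strings via llvm::PropertyAnnotationNames):
+//
+//   unsigned MaxX = 0;
+//   if (llvm::findOneNVVMAnnotation(F, "maxntidx", MaxX))
+//     ;  // use the first recorded value
+//
+//   std::vector<unsigned> Aligns;
+//   if (llvm::findAllNVVMAnnotation(F, "align", Aligns))
+//     ;  // scan every recorded value
+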
+bool llvm::isTexture(const llvm::Value &val) {
+  if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+    unsigned annot;
+    if (llvm::findOneNVVMAnnotation(gv,
+                   llvm::PropertyAnnotationNames[llvm::PROPERTY_ISTEXTURE],
+                                    annot)) {
+      assert((annot == 1) && "Unexpected annotation on a texture symbol");
+      return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::isSurface(const llvm::Value &val) {
+  if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+    unsigned annot;
+    if (llvm::findOneNVVMAnnotation(gv,
+                   llvm::PropertyAnnotationNames[llvm::PROPERTY_ISSURFACE],
+                                    annot)) {
+      assert((annot == 1) && "Unexpected annotation on a surface symbol");
+      return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::isSampler(const llvm::Value &val) {
+  if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+    unsigned annot;
+    if (llvm::findOneNVVMAnnotation(gv,
+                   llvm::PropertyAnnotationNames[llvm::PROPERTY_ISSAMPLER],
+                                    annot)) {
+      assert((annot == 1) && "Unexpected annotation on a sampler symbol");
+      return true;
+    }
+  }
+  if (const Argument *arg = dyn_cast<Argument>(&val)) {
+    const Function *func = arg->getParent();
+    std::vector<unsigned> annot;
+    if (llvm::findAllNVVMAnnotation(func,
+                   llvm::PropertyAnnotationNames[llvm::PROPERTY_ISSAMPLER],
+                                    annot)) {
+      if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
+        return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::isImageReadOnly(const llvm::Value &val) {
+  if (const Argument *arg = dyn_cast<Argument>(&val)) {
+    const Function *func = arg->getParent();
+    std::vector<unsigned> annot;
+    if (llvm::findAllNVVMAnnotation(func,
+          llvm::PropertyAnnotationNames[llvm::PROPERTY_ISREADONLY_IMAGE_PARAM],
+                                    annot)) {
+      if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
+        return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::isImageWriteOnly(const llvm::Value &val) {
+  if (const Argument *arg = dyn_cast<Argument>(&val)) {
+    const Function *func = arg->getParent();
+    std::vector<unsigned> annot;
+    if (llvm::findAllNVVMAnnotation(func,
+          llvm::PropertyAnnotationNames[llvm::PROPERTY_ISWRITEONLY_IMAGE_PARAM],
+                                    annot)) {
+      if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
+        return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::isImage(const llvm::Value &val) {
+  return llvm::isImageReadOnly(val) || llvm::isImageWriteOnly(val);
+}
+
+std::string llvm::getTextureName(const llvm::Value &val) {
+  assert(val.hasName() && "Found texture variable with no name");
+  return val.getName();
+}
+
+std::string llvm::getSurfaceName(const llvm::Value &val) {
+  assert(val.hasName() && "Found surface variable with no name");
+  return val.getName();
+}
+
+std::string llvm::getSamplerName(const llvm::Value &val) {
+  assert(val.hasName() && "Found sampler variable with no name");
+  return val.getName();
+}
+
+bool llvm::getMaxNTIDx(const Function &F, unsigned &x) {
+  return (llvm::findOneNVVMAnnotation(&F,
+              llvm::PropertyAnnotationNames[llvm::PROPERTY_MAXNTID_X], x));
+}
+
+bool llvm::getMaxNTIDy(const Function &F, unsigned &y) {
+  return (llvm::findOneNVVMAnnotation(&F,
+              llvm::PropertyAnnotationNames[llvm::PROPERTY_MAXNTID_Y], y));
+}
+
+bool llvm::getMaxNTIDz(const Function &F, unsigned &z) {
+  return (llvm::findOneNVVMAnnotation(&F,
+              llvm::PropertyAnnotationNames[llvm::PROPERTY_MAXNTID_Z], z));
+}
+
+bool llvm::getReqNTIDx(const Function &F, unsigned &x) {
+  return (llvm::findOneNVVMAnnotation(&F,
+              llvm::PropertyAnnotationNames[llvm::PROPERTY_REQNTID_X], x));
+}
+
+bool llvm::getReqNTIDy(const Function &F, unsigned &y) {
+  return (llvm::findOneNVVMAnnotation(&F,
+              llvm::PropertyAnnotationNames[llvm::PROPERTY_REQNTID_Y], y));
+}
+
+bool llvm::getReqNTIDz(const Function &F, unsigned &z) {
+  return (llvm::findOneNVVMAnnotation(&F,
+              llvm::PropertyAnnotationNames[llvm::PROPERTY_REQNTID_Z], z));
+}
+
+bool llvm::getMinCTASm(const Function &F, unsigned &x) {
+  return (llvm::findOneNVVMAnnotation(&F,
+              llvm::PropertyAnnotationNames[llvm::PROPERTY_MINNCTAPERSM], x));
+}
+
+bool llvm::isKernelFunction(const Function &F) {
+  unsigned x = 0;
+  bool retval = llvm::findOneNVVMAnnotation(&F,
+          llvm::PropertyAnnotationNames[llvm::PROPERTY_ISKERNEL_FUNCTION], x);
+  if (!retval) {
+    // There is no NVVM metadata, check the calling convention
+    if (F.getCallingConv() == llvm::CallingConv::PTX_Kernel)
+      return true;
+    else
+      return false;
+  }
+  return (x == 1);
+}
+
+bool llvm::getAlign(const Function &F, unsigned index, unsigned &align) {
+  std::vector<unsigned> Vs;
+  bool retval = llvm::findAllNVVMAnnotation(&F,
+                    llvm::PropertyAnnotationNames[llvm::PROPERTY_ALIGN], Vs);
+  if (!retval)
+    return false;
+  for (int i = 0, e = Vs.size(); i < e; i++) {
+    unsigned v = Vs[i];
+    if ((v >> 16) == index) {
+      align = v & 0xFFFF;
+      return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::getAlign(const CallInst &I, unsigned index, unsigned &align) {
+  if (MDNode *alignNode = I.getMetadata("callalign")) {
+    for (int i = 0, n = alignNode->getNumOperands(); i < n; i++) {
+      if (const ConstantInt *CI =
+              dyn_cast<ConstantInt>(alignNode->getOperand(i))) {
+        unsigned v = CI->getZExtValue();
+        if ((v >> 16) == index) {
+          align = v & 0xFFFF;
+          return true;
+        }
+        if ((v >> 16) > index) {
+          return false;
+        }
+      }
+    }
+  }
+  return false;
+}
+
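+// Note on the packed encoding decoded by the two getAlign() overloads above:
+// each annotation value is (index << 16) | alignment.  For example, the
+// value 0x00020008 encodes index 2 with alignment 8, so getAlign(F, 2, A)
+// sets A = 8 when it finds that entry.
+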
+bool llvm::isBarrierIntrinsic(Intrinsic::ID id) {
+  if ((id == Intrinsic::nvvm_barrier0) ||
+      (id == Intrinsic::nvvm_barrier0_popc) ||
+      (id == Intrinsic::nvvm_barrier0_and) ||
+      (id == Intrinsic::nvvm_barrier0_or) ||
+      (id == Intrinsic::cuda_syncthreads))
+    return true;
+  return false;
+}
+
+// Interface for checking all memory space transfer related intrinsics
+bool llvm::isMemorySpaceTransferIntrinsic(Intrinsic::ID id) {
+  if (id == Intrinsic::nvvm_ptr_local_to_gen ||
+      id == Intrinsic::nvvm_ptr_shared_to_gen ||
+      id == Intrinsic::nvvm_ptr_global_to_gen ||
+      id == Intrinsic::nvvm_ptr_constant_to_gen ||
+      id == Intrinsic::nvvm_ptr_gen_to_global ||
+      id == Intrinsic::nvvm_ptr_gen_to_shared ||
+      id == Intrinsic::nvvm_ptr_gen_to_local ||
+      id == Intrinsic::nvvm_ptr_gen_to_constant ||
+      id == Intrinsic::nvvm_ptr_gen_to_param) {
+    return true;
+  }
+
+  return false;
+}
+
+// Consider several special intrinsics when stripping pointer casts, and
+// provide an option to ignore GEP indices so that only the base address is
+// found; this can be used in simple alias disambiguation.
+const Value *llvm::skipPointerTransfer(const Value *V,
+                                       bool ignore_GEP_indices) {
+  V = V->stripPointerCasts();
+  while (true) {
+    if (const IntrinsicInst *IS = dyn_cast<IntrinsicInst>(V)) {
+      if (isMemorySpaceTransferIntrinsic(IS->getIntrinsicID())) {
+        V = IS->getArgOperand(0)->stripPointerCasts();
+        continue;
+      }
+    } else if (ignore_GEP_indices)
+      if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+        V = GEP->getPointerOperand()->stripPointerCasts();
+        continue;
+      }
+    break;
+  }
+  return V;
+}
+
+// Consider several special intrinsics when stripping pointer casts, and
+// - ignore GEP indices so that only the base address is found, and
+// - track through PHINodes;
+// this can be used in simple alias disambiguation.
+const Value *llvm::skipPointerTransfer(const Value *V,
+                                       std::set<const Value *> &processed) {
+  if (processed.find(V) != processed.end())
+    return NULL;
+  processed.insert(V);
+
+  const Value *V2 = V->stripPointerCasts();
+  if (V2 != V && processed.find(V2) != processed.end())
+    return NULL;
+  processed.insert(V2);
+
+  V = V2;
+
+  while (true) {
+    if (const IntrinsicInst *IS = dyn_cast<IntrinsicInst>(V)) {
+      if (isMemorySpaceTransferIntrinsic(IS->getIntrinsicID())) {
+        V = IS->getArgOperand(0)->stripPointerCasts();
+        continue;
+      }
+    } else if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+      V = GEP->getPointerOperand()->stripPointerCasts();
+      continue;
+    } else if (const PHINode *PN = dyn_cast<PHINode>(V)) {
+      if (V != V2 && processed.find(V) != processed.end())
+        return NULL;
+      processed.insert(PN);
+      const Value *common = 0;
+      for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
+        const Value *pv = PN->getIncomingValue(i);
+        const Value *base = skipPointerTransfer(pv, processed);
+        if (base) {
+          if (common == 0)
+            common = base;
+          else if (common != base)
+            return PN;
+        }
+      }
+      if (common == 0)
+        return PN;
+      V = common;
+    }
+    break;
+  }
+  return V;
+}
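+
+// Illustrative walk (hand-written): given a value built roughly as
+//
+//   %gen = call i8* @llvm.nvvm.ptr.global.to.gen...(i8 addrspace(1)* @g)
+//   %p   = getelementptr i8* %gen, i32 4
+//
+// skipPointerTransfer(%p, /*ignore_GEP_indices=*/true) strips the GEP down
+// to its pointer operand and then walks through the space-transfer intrinsic
+// to its argument, returning @g as the underlying base object.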
+
+// The following are some useful utilities for debugging
+
+BasicBlock *llvm::getParentBlock(Value *v) {
+  if (BasicBlock *B = dyn_cast<BasicBlock>(v))
+    return B;
+
+  if (Instruction *I = dyn_cast<Instruction>(v))
+    return I->getParent();
+
+  return 0;
+}
+
+Function *llvm::getParentFunction(Value *v) {
+  if (Function *F = dyn_cast<Function>(v))
+    return F;
+
+  if (Instruction *I = dyn_cast<Instruction>(v))
+    return I->getParent()->getParent();
+
+  if (BasicBlock *B = dyn_cast<BasicBlock>(v))
+    return B->getParent();
+
+  return 0;
+}
+
+// Dump a block by name
+void llvm::dumpBlock(Value *v, char *blockName) {
+  Function *F = getParentFunction(v);
+  if (F == 0)
+    return;
+
+  for (Function::iterator it = F->begin(), ie = F->end(); it != ie; ++it) {
+    BasicBlock *B = it;
+    if (strcmp(B->getName().data(), blockName) == 0) {
+      B->dump();
+      return;
+    }
+  }
+}
+
+// Find an instruction by name
+Instruction *llvm::getInst(Value *base, char *instName) {
+  Function *F = getParentFunction(base);
+  if (F == 0)
+    return 0;
+
+  for (inst_iterator it = inst_begin(F), ie = inst_end(F); it != ie; ++it) {
+    Instruction *I = &*it;
+    if (strcmp(I->getName().data(), instName) == 0) {
+      return I;
+    }
+  }
+
+  return 0;
+}
+
+// Dump an instruction by name
+void llvm::dumpInst(Value *base, char *instName) {
+  Instruction *I = getInst(base, instName);
+  if (I)
+    I->dump();
+}
+
+// Dump an instruction and all dependent instructions
+void llvm::dumpInstRec(Value *v, std::set<Instruction *> *visited) {
+  if (Instruction *I = dyn_cast<Instruction>(v)) {
+
+    if (visited->find(I) != visited->end())
+      return;
+
+    visited->insert(I);
+
+    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+      dumpInstRec(I->getOperand(i), visited);
+
+    I->dump();
+  }
+}
+
+// Dump an instruction and all dependent instructions
+void llvm::dumpInstRec(Value *v) {
+  std::set<Instruction *> visited;
+
+  //BasicBlock *B = getParentBlock(v);
+
+  dumpInstRec(v, &visited);
+}
+
+// Dump the parent for Instruction, block or function
+void llvm::dumpParent(Value *v) {
+  if (Instruction *I = dyn_cast<Instruction>(v)) {
+    I->getParent()->dump();
+    return;
+  }
+
+  if (BasicBlock *B = dyn_cast<BasicBlock>(v)) {
+    B->getParent()->dump();
+    return;
+  }
+
+  if (Function *F = dyn_cast<Function>(v)) {
+    F->getParent()->dump();
+    return;
+  }
+}
diff --git a/lib/Target/NVPTX/NVPTXUtilities.h b/lib/Target/NVPTX/NVPTXUtilities.h
new file mode 100644
index 0000000..fe6ad55
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXUtilities.h
@@ -0,0 +1,94 @@
+//===-- NVPTXUtilities - Utilities -----------------------------*- C++ -*-====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the NVVM-specific utility functions.
+// +//===----------------------------------------------------------------------===// + +#ifndef NVPTXUTILITIES_H +#define NVPTXUTILITIES_H + +#include "llvm/Value.h" +#include "llvm/GlobalVariable.h" +#include "llvm/Function.h" +#include "llvm/IntrinsicInst.h" +#include <cstdarg> +#include <set> +#include <string> +#include <vector> + +namespace llvm +{ + +#define NVCL_IMAGE2D_READONLY_FUNCNAME "__is_image2D_readonly" +#define NVCL_IMAGE3D_READONLY_FUNCNAME "__is_image3D_readonly" + +bool findOneNVVMAnnotation(const llvm::GlobalValue *, std::string, unsigned &); +bool findAllNVVMAnnotation(const llvm::GlobalValue *, std::string, + std::vector<unsigned> &); + +bool isTexture(const llvm::Value &); +bool isSurface(const llvm::Value &); +bool isSampler(const llvm::Value &); +bool isImage(const llvm::Value &); +bool isImageReadOnly(const llvm::Value &); +bool isImageWriteOnly(const llvm::Value &); + +std::string getTextureName(const llvm::Value &); +std::string getSurfaceName(const llvm::Value &); +std::string getSamplerName(const llvm::Value &); + +bool getMaxNTIDx(const llvm::Function &, unsigned &); +bool getMaxNTIDy(const llvm::Function &, unsigned &); +bool getMaxNTIDz(const llvm::Function &, unsigned &); + +bool getReqNTIDx(const llvm::Function &, unsigned &); +bool getReqNTIDy(const llvm::Function &, unsigned &); +bool getReqNTIDz(const llvm::Function &, unsigned &); + +bool getMinCTASm(const llvm::Function &, unsigned &); +bool isKernelFunction(const llvm::Function &); + +bool getAlign(const llvm::Function &, unsigned index, unsigned &); +bool getAlign(const llvm::CallInst &, unsigned index, unsigned &); + +bool isBarrierIntrinsic(llvm::Intrinsic::ID); + +/// make_vector - Helper function which is useful for building temporary vectors +/// to pass into type construction of CallInst ctors. This turns a null +/// terminated list of pointers (or other value types) into a real live vector. +/// +template<typename T> +inline std::vector<T> make_vector(T A, ...) { + va_list Args; + va_start(Args, A); + std::vector<T> Result; + Result.push_back(A); + while (T Val = va_arg(Args, T)) + Result.push_back(Val); + va_end(Args); + return Result; +} + +bool isMemorySpaceTransferIntrinsic(Intrinsic::ID id); +const Value *skipPointerTransfer(const Value *V, bool ignore_GEP_indices); +const Value *skipPointerTransfer(const Value *V, + std::set<const Value *> &processed); +BasicBlock *getParentBlock(Value *v); +Function *getParentFunction(Value *v); +void dumpBlock(Value *v, char *blockName); +Instruction *getInst(Value *base, char *instName); +void dumpInst(Value *base, char *instName); +void dumpInstRec(Value *v, std::set<Instruction *> *visited); +void dumpInstRec(Value *v); +void dumpParent(Value *v); + +} + +#endif diff --git a/lib/Target/NVPTX/NVPTXVector.td b/lib/Target/NVPTX/NVPTXVector.td new file mode 100644 index 0000000..775df19 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXVector.td @@ -0,0 +1,1481 @@ +//===- NVPTXVector.td - NVPTX Vector Specific Instruction defs -*- tblgen-*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +//----------------------------------- +// Vector Specific +//----------------------------------- + +// +// All vector instructions derive from NVPTXVecInst +// + +class NVPTXVecInst<dag outs, dag ins, string asmstr, list<dag> pattern, + NVPTXInst sInst=NOP> + : NVPTXInst<outs, ins, asmstr, pattern> { + NVPTXInst scalarInst=sInst; +} + +let isAsCheapAsAMove=1, VecInstType=isVecExtract.Value in { +// Extract v2i16 +def V2i16Extract : NVPTXVecInst<(outs Int16Regs:$dst), + (ins V2I16Regs:$src, i8imm:$c), + "mov.u16 \t$dst, $src${c:vecelem};", + [(set Int16Regs:$dst, (vector_extract + (v2i16 V2I16Regs:$src), imm:$c))], + IMOV16rr>; + +// Extract v4i16 +def V4i16Extract : NVPTXVecInst<(outs Int16Regs:$dst), + (ins V4I16Regs:$src, i8imm:$c), + "mov.u16 \t$dst, $src${c:vecelem};", + [(set Int16Regs:$dst, (vector_extract + (v4i16 V4I16Regs:$src), imm:$c))], + IMOV16rr>; + +// Extract v2i8 +def V2i8Extract : NVPTXVecInst<(outs Int8Regs:$dst), + (ins V2I8Regs:$src, i8imm:$c), + "mov.u16 \t$dst, $src${c:vecelem};", + [(set Int8Regs:$dst, (vector_extract + (v2i8 V2I8Regs:$src), imm:$c))], + IMOV8rr>; + +// Extract v4i8 +def V4i8Extract : NVPTXVecInst<(outs Int8Regs:$dst), + (ins V4I8Regs:$src, i8imm:$c), + "mov.u16 \t$dst, $src${c:vecelem};", + [(set Int8Regs:$dst, (vector_extract + (v4i8 V4I8Regs:$src), imm:$c))], + IMOV8rr>; + +// Extract v2i32 +def V2i32Extract : NVPTXVecInst<(outs Int32Regs:$dst), + (ins V2I32Regs:$src, i8imm:$c), + "mov.u32 \t$dst, $src${c:vecelem};", + [(set Int32Regs:$dst, (vector_extract + (v2i32 V2I32Regs:$src), imm:$c))], + IMOV32rr>; + +// Extract v2f32 +def V2f32Extract : NVPTXVecInst<(outs Float32Regs:$dst), + (ins V2F32Regs:$src, i8imm:$c), + "mov.f32 \t$dst, $src${c:vecelem};", + [(set Float32Regs:$dst, (vector_extract + (v2f32 V2F32Regs:$src), imm:$c))], + FMOV32rr>; + +// Extract v2i64 +def V2i64Extract : NVPTXVecInst<(outs Int64Regs:$dst), + (ins V2I64Regs:$src, i8imm:$c), + "mov.u64 \t$dst, $src${c:vecelem};", + [(set Int64Regs:$dst, (vector_extract + (v2i64 V2I64Regs:$src), imm:$c))], + IMOV64rr>; + +// Extract v2f64 +def V2f64Extract : NVPTXVecInst<(outs Float64Regs:$dst), + (ins V2F64Regs:$src, i8imm:$c), + "mov.f64 \t$dst, $src${c:vecelem};", + [(set Float64Regs:$dst, (vector_extract + (v2f64 V2F64Regs:$src), imm:$c))], + FMOV64rr>; + +// Extract v4i32 +def V4i32Extract : NVPTXVecInst<(outs Int32Regs:$dst), + (ins V4I32Regs:$src, i8imm:$c), + "mov.u32 \t$dst, $src${c:vecelem};", + [(set Int32Regs:$dst, (vector_extract + (v4i32 V4I32Regs:$src), imm:$c))], + IMOV32rr>; + +// Extract v4f32 +def V4f32Extract : NVPTXVecInst<(outs Float32Regs:$dst), + (ins V4F32Regs:$src, i8imm:$c), + "mov.f32 \t$dst, $src${c:vecelem};", + [(set Float32Regs:$dst, (vector_extract + (v4f32 V4F32Regs:$src), imm:$c))], + FMOV32rr>; +} + +let isAsCheapAsAMove=1, VecInstType=isVecInsert.Value in { +// Insert v2i8 +def V2i8Insert : NVPTXVecInst<(outs V2I8Regs:$dst), + (ins V2I8Regs:$src, Int8Regs:$val, i8imm:$c), + "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};" + "\n\tmov.u16 \t$dst${c:vecelem}, $val;", + [(set V2I8Regs:$dst, + (vector_insert V2I8Regs:$src, Int8Regs:$val, imm:$c))], + IMOV8rr>; + +// Insert v4i8 +def V4i8Insert : NVPTXVecInst<(outs V4I8Regs:$dst), + (ins V4I8Regs:$src, Int8Regs:$val, i8imm:$c), + "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};" + "\n\tmov.u16 \t$dst${c:vecelem}, $val;", + [(set V4I8Regs:$dst, + (vector_insert V4I8Regs:$src, Int8Regs:$val, imm:$c))], + IMOV8rr>; + +// Insert 
v2i16 +def V2i16Insert : NVPTXVecInst<(outs V2I16Regs:$dst), + (ins V2I16Regs:$src, Int16Regs:$val, i8imm:$c), + "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};" + "\n\tmov.u16 \t$dst${c:vecelem}, $val;", + [(set V2I16Regs:$dst, + (vector_insert V2I16Regs:$src, Int16Regs:$val, imm:$c))], + IMOV16rr>; + +// Insert v4i16 +def V4i16Insert : NVPTXVecInst<(outs V4I16Regs:$dst), + (ins V4I16Regs:$src, Int16Regs:$val, i8imm:$c), + "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};" + "\n\tmov.u16 \t$dst${c:vecelem}, $val;", + [(set V4I16Regs:$dst, + (vector_insert V4I16Regs:$src, Int16Regs:$val, imm:$c))], + IMOV16rr>; + +// Insert v2i32 +def V2i32Insert : NVPTXVecInst<(outs V2I32Regs:$dst), + (ins V2I32Regs:$src, Int32Regs:$val, i8imm:$c), + "mov.v2.u32 \t${dst:vecfull}, ${src:vecfull};" + "\n\tmov.u32 \t$dst${c:vecelem}, $val;", + [(set V2I32Regs:$dst, + (vector_insert V2I32Regs:$src, Int32Regs:$val, imm:$c))], + IMOV32rr>; + +// Insert v2f32 +def V2f32Insert : NVPTXVecInst<(outs V2F32Regs:$dst), + (ins V2F32Regs:$src, Float32Regs:$val, i8imm:$c), + "mov.v2.f32 \t${dst:vecfull}, ${src:vecfull};" + "\n\tmov.f32 \t$dst${c:vecelem}, $val;", + [(set V2F32Regs:$dst, + (vector_insert V2F32Regs:$src, Float32Regs:$val, imm:$c))], + FMOV32rr>; + +// Insert v2i64 +def V2i64Insert : NVPTXVecInst<(outs V2I64Regs:$dst), + (ins V2I64Regs:$src, Int64Regs:$val, i8imm:$c), + "mov.v2.u64 \t${dst:vecfull}, ${src:vecfull};" + "\n\tmov.u64 \t$dst${c:vecelem}, $val;", + [(set V2I64Regs:$dst, + (vector_insert V2I64Regs:$src, Int64Regs:$val, imm:$c))], + IMOV64rr>; + +// Insert v2f64 +def V2f64Insert : NVPTXVecInst<(outs V2F64Regs:$dst), + (ins V2F64Regs:$src, Float64Regs:$val, i8imm:$c), + "mov.v2.f64 \t${dst:vecfull}, ${src:vecfull};" + "\n\tmov.f64 \t$dst${c:vecelem}, $val;", + [(set V2F64Regs:$dst, + (vector_insert V2F64Regs:$src, Float64Regs:$val, imm:$c))], + FMOV64rr>; + +// Insert v4i32 +def V4i32Insert : NVPTXVecInst<(outs V4I32Regs:$dst), + (ins V4I32Regs:$src, Int32Regs:$val, i8imm:$c), + "mov.v4.u32 \t${dst:vecfull}, ${src:vecfull};" + "\n\tmov.u32 \t$dst${c:vecelem}, $val;", + [(set V4I32Regs:$dst, + (vector_insert V4I32Regs:$src, Int32Regs:$val, imm:$c))], + IMOV32rr>; + +// Insert v4f32 +def V4f32Insert : NVPTXVecInst<(outs V4F32Regs:$dst), + (ins V4F32Regs:$src, Float32Regs:$val, i8imm:$c), + "mov.v4.f32 \t${dst:vecfull}, ${src:vecfull};" + "\n\tmov.f32 \t$dst${c:vecelem}, $val;", + [(set V4F32Regs:$dst, + (vector_insert V4F32Regs:$src, Float32Regs:$val, imm:$c))], + FMOV32rr>; +} + +class BinOpAsmString<string c> { + string s = c; +} + +class V4AsmStr<string opcode> : BinOpAsmString< + !strconcat(!strconcat(!strconcat(!strconcat( + !strconcat(!strconcat(!strconcat( + opcode, " \t${dst}_0, ${a}_0, ${b}_0;\n\t"), + opcode), " \t${dst}_1, ${a}_1, ${b}_1;\n\t"), + opcode), " \t${dst}_2, ${a}_2, ${b}_2;\n\t"), + opcode), " \t${dst}_3, ${a}_3, ${b}_3;")>; + +class V2AsmStr<string opcode> : BinOpAsmString< + !strconcat(!strconcat(!strconcat( + opcode, " \t${dst}_0, ${a}_0, ${b}_0;\n\t"), + opcode), " \t${dst}_1, ${a}_1, ${b}_1;")>; + +class V4MADStr<string opcode> : BinOpAsmString< + !strconcat(!strconcat(!strconcat(!strconcat( + !strconcat(!strconcat(!strconcat( + opcode, " \t${dst}_0, ${a}_0, ${b}_0, ${c}_0;\n\t"), + opcode), " \t${dst}_1, ${a}_1, ${b}_1, ${c}_1;\n\t"), + opcode), " \t${dst}_2, ${a}_2, ${b}_2, ${c}_2;\n\t"), + opcode), " \t${dst}_3, ${a}_3, ${b}_3, ${c}_3;")>; + +class V2MADStr<string opcode> : BinOpAsmString< + !strconcat(!strconcat(!strconcat( + opcode, " \t${dst}_0, ${a}_0, ${b}_0, 
${c}_0;\n\t"), + opcode), " \t${dst}_1, ${a}_1, ${b}_1, ${c}_1;")>; + +class V4UnaryStr<string opcode> : BinOpAsmString< + !strconcat(!strconcat(!strconcat(!strconcat( + !strconcat(!strconcat(!strconcat( + opcode, " \t${dst}_0, ${a}_0;\n\t"), + opcode), " \t${dst}_1, ${a}_1;\n\t"), + opcode), " \t${dst}_2, ${a}_2;\n\t"), + opcode), " \t${dst}_3, ${a}_3;")>; + +class V2UnaryStr<string opcode> : BinOpAsmString< + !strconcat(!strconcat(!strconcat( + opcode, " \t${dst}_0, ${a}_0;\n\t"), + opcode), " \t${dst}_1, ${a}_1;")>; + +class VecBinaryOp<BinOpAsmString asmstr, SDNode OpNode, NVPTXRegClass regclass, + NVPTXInst sInst=NOP> : + NVPTXVecInst<(outs regclass:$dst), (ins regclass:$a, regclass:$b), + asmstr.s, + [(set regclass:$dst, (OpNode regclass:$a, regclass:$b))], + sInst>; + +class VecShiftOp<BinOpAsmString asmstr, SDNode OpNode, NVPTXRegClass regclass1, + NVPTXRegClass regclass2, NVPTXInst sInst=NOP> : + NVPTXVecInst<(outs regclass1:$dst), (ins regclass1:$a, regclass2:$b), + asmstr.s, + [(set regclass1:$dst, (OpNode regclass1:$a, regclass2:$b))], + sInst>; + +class VecUnaryOp<BinOpAsmString asmstr, PatFrag OpNode, NVPTXRegClass regclass, + NVPTXInst sInst=NOP> : + NVPTXVecInst<(outs regclass:$dst), (ins regclass:$a), + asmstr.s, + [(set regclass:$dst, (OpNode regclass:$a))], sInst>; + +multiclass IntBinVOp<string asmstr, SDNode OpNode, + NVPTXInst i64op=NOP, NVPTXInst i32op=NOP, NVPTXInst + i16op=NOP, NVPTXInst i8op=NOP> { + def V2I64 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "64")>, OpNode, V2I64Regs, + i64op>; + def V4I32 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "32")>, OpNode, V4I32Regs, + i32op>; + def V2I32 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "32")>, OpNode, V2I32Regs, + i32op>; + def V4I16 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "16")>, OpNode, V4I16Regs, + i16op>; + def V2I16 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "16")>, OpNode, V2I16Regs, + i16op>; + def V4I8 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "16")>, OpNode, V4I8Regs, + i8op>; + def V2I8 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "16")>, OpNode, V2I8Regs, + i8op>; +} + +multiclass FloatBinVOp<string asmstr, SDNode OpNode, + NVPTXInst f64=NOP, NVPTXInst f32=NOP, + NVPTXInst f32_ftz=NOP> { + def V2F64 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "f64")>, OpNode, + V2F64Regs, f64>; + def V4F32_ftz : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "ftz.f32")>, OpNode, + V4F32Regs, f32_ftz>, Requires<[doF32FTZ]>; + def V2F32_ftz : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "ftz.f32")>, OpNode, + V2F32Regs, f32_ftz>, Requires<[doF32FTZ]>; + def V4F32 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "f32")>, OpNode, + V4F32Regs, f32>; + def V2F32 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "f32")>, OpNode, + V2F32Regs, f32>; +} + +multiclass IntUnaryVOp<string asmstr, PatFrag OpNode, + NVPTXInst i64op=NOP, NVPTXInst i32op=NOP, + NVPTXInst i16op=NOP, NVPTXInst i8op=NOP> { + def V2I64 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "64")>, OpNode, + V2I64Regs, i64op>; + def V4I32 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "32")>, OpNode, + V4I32Regs, i32op>; + def V2I32 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "32")>, OpNode, + V2I32Regs, i32op>; + def V4I16 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "16")>, OpNode, + V4I16Regs, i16op>; + def V2I16 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "16")>, OpNode, + V2I16Regs, i16op>; + def V4I8 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "16")>, OpNode, + V4I8Regs, i8op>; + def V2I8 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "16")>, OpNode, + V2I8Regs, i8op>; +} + + +// Integer Arithmetic +let 
VecInstType=isVecOther.Value in { +defm VAdd : IntBinVOp<"add.s", add, ADDi64rr, ADDi32rr, ADDi16rr, ADDi8rr>; +defm VSub : IntBinVOp<"sub.s", sub, SUBi64rr, SUBi32rr, SUBi16rr, SUBi8rr>; + +def AddCCV4I32 : VecBinaryOp<V4AsmStr<"add.cc.s32">, addc, V4I32Regs, + ADDCCi32rr>; +def AddCCV2I32 : VecBinaryOp<V2AsmStr<"add.cc.s32">, addc, V2I32Regs, + ADDCCi32rr>; +def SubCCV4I32 : VecBinaryOp<V4AsmStr<"sub.cc.s32">, subc, V4I32Regs, + SUBCCi32rr>; +def SubCCV2I32 : VecBinaryOp<V2AsmStr<"sub.cc.s32">, subc, V2I32Regs, + SUBCCi32rr>; +def AddCCCV4I32 : VecBinaryOp<V4AsmStr<"addc.cc.s32">, adde, V4I32Regs, + ADDCCCi32rr>; +def AddCCCV2I32 : VecBinaryOp<V2AsmStr<"addc.cc.s32">, adde, V2I32Regs, + ADDCCCi32rr>; +def SubCCCV4I32 : VecBinaryOp<V4AsmStr<"subc.cc.s32">, sube, V4I32Regs, + SUBCCCi32rr>; +def SubCCCV2I32 : VecBinaryOp<V2AsmStr<"subc.cc.s32">, sube, V2I32Regs, + SUBCCCi32rr>; + +def ShiftLV2I64 : VecShiftOp<V2AsmStr<"shl.b64">, shl, V2I64Regs, V2I32Regs, + SHLi64rr>; +def ShiftLV2I32 : VecShiftOp<V2AsmStr<"shl.b32">, shl, V2I32Regs, V2I32Regs, + SHLi32rr>; +def ShiftLV4I32 : VecShiftOp<V4AsmStr<"shl.b32">, shl, V4I32Regs, V4I32Regs, + SHLi32rr>; +def ShiftLV2I16 : VecShiftOp<V2AsmStr<"shl.b16">, shl, V2I16Regs, V2I32Regs, + SHLi16rr>; +def ShiftLV4I16 : VecShiftOp<V4AsmStr<"shl.b16">, shl, V4I16Regs, V4I32Regs, + SHLi16rr>; +def ShiftLV2I8 : VecShiftOp<V2AsmStr<"shl.b16">, shl, V2I8Regs, V2I32Regs, + SHLi8rr>; +def ShiftLV4I8 : VecShiftOp<V4AsmStr<"shl.b16">, shl, V4I8Regs, V4I32Regs, + SHLi8rr>; +} + +// cvt to v*i32, helpers for shift +class CVTtoVeci32<NVPTXRegClass inclass, NVPTXRegClass outclass, string asmstr, + NVPTXInst sInst=NOP> : + NVPTXVecInst<(outs outclass:$d), (ins inclass:$s), asmstr, [], sInst>; + +class VecCVTStrHelper<string op, string dest, string src> { + string s=!strconcat(op, !strconcat("\t", + !strconcat(dest, !strconcat(", ", !strconcat(src, ";"))))); +} + +class Vec2CVTStr<string op> { + string s=!strconcat(VecCVTStrHelper<op, "${d}_0", "${s}_0">.s, + !strconcat("\n\t", VecCVTStrHelper<op, "${d}_1", "${s}_1">.s)); +} + +class Vec4CVTStr<string op> { + string s=!strconcat(VecCVTStrHelper<op, "${d}_0", "${s}_0">.s, + !strconcat("\n\t", + !strconcat(VecCVTStrHelper<op, "${d}_1", "${s}_1">.s, + !strconcat("\n\t", + !strconcat(VecCVTStrHelper<op, "${d}_2", "${s}_2">.s, + !strconcat("\n\t", VecCVTStrHelper<op, "${d}_3", "${s}_3">.s)))))); +} + +let VecInstType=isVecOther.Value in { +def CVTv2i8tov2i32 : CVTtoVeci32<V2I8Regs, V2I32Regs, + Vec2CVTStr<"cvt.u32.u16">.s, Zint_extendext8to32>; +def CVTv2i16tov2i32 : CVTtoVeci32<V2I16Regs, V2I32Regs, + Vec2CVTStr<"cvt.u32.u16">.s, Zint_extendext16to32>; +def CVTv4i8tov4i32 : CVTtoVeci32<V4I8Regs, V4I32Regs, + Vec4CVTStr<"cvt.u32.u16">.s, Zint_extendext8to32>; +def CVTv4i16tov4i32 : CVTtoVeci32<V4I16Regs, V4I32Regs, + Vec4CVTStr<"cvt.u32.u16">.s, Zint_extendext16to32>; +def CVTv2i64tov2i32 : CVTtoVeci32<V2I64Regs, V2I32Regs, + Vec2CVTStr<"cvt.u32.u64">.s, TRUNC_64to32>; +} + +def : Pat<(shl V2I16Regs:$src1, V2I16Regs:$src2), + (ShiftLV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>; +def : Pat<(shl V2I8Regs:$src1, V2I8Regs:$src2), + (ShiftLV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>; +def : Pat<(shl V2I64Regs:$src1, V2I64Regs:$src2), + (ShiftLV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>; + +def : Pat<(shl V4I16Regs:$src1, V4I16Regs:$src2), + (ShiftLV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>; +def : Pat<(shl V4I8Regs:$src1, V4I8Regs:$src2), + (ShiftLV4I8 V4I8Regs:$src1, 
(CVTv4i8tov4i32 V4I8Regs:$src2))>; + +let VecInstType=isVecOther.Value in { +def ShiftRAV2I64 : VecShiftOp<V2AsmStr<"shr.s64">, sra, V2I64Regs, V2I32Regs, + SRAi64rr>; +def ShiftRAV2I32 : VecShiftOp<V2AsmStr<"shr.s32">, sra, V2I32Regs, V2I32Regs, + SRAi32rr>; +def ShiftRAV4I32 : VecShiftOp<V4AsmStr<"shr.s32">, sra, V4I32Regs, V4I32Regs, + SRAi32rr>; +def ShiftRAV2I16 : VecShiftOp<V2AsmStr<"shr.s16">, sra, V2I16Regs, V2I32Regs, + SRAi16rr>; +def ShiftRAV4I16 : VecShiftOp<V4AsmStr<"shr.s16">, sra, V4I16Regs, V4I32Regs, + SRAi16rr>; +def ShiftRAV2I8 : VecShiftOp<V2AsmStr<"shr.s16">, sra, V2I8Regs, V2I32Regs, + SRAi8rr>; +def ShiftRAV4I8 : VecShiftOp<V4AsmStr<"shr.s16">, sra, V4I8Regs, V4I32Regs, + SRAi8rr>; + +def ShiftRLV2I64 : VecShiftOp<V2AsmStr<"shr.u64">, srl, V2I64Regs, V2I32Regs, + SRLi64rr>; +def ShiftRLV2I32 : VecShiftOp<V2AsmStr<"shr.u32">, srl, V2I32Regs, V2I32Regs, + SRLi32rr>; +def ShiftRLV4I32 : VecShiftOp<V4AsmStr<"shr.u32">, srl, V4I32Regs, V4I32Regs, + SRLi32rr>; +def ShiftRLV2I16 : VecShiftOp<V2AsmStr<"shr.u16">, srl, V2I16Regs, V2I32Regs, + SRLi16rr>; +def ShiftRLV4I16 : VecShiftOp<V4AsmStr<"shr.u16">, srl, V4I16Regs, V4I32Regs, + SRLi16rr>; +def ShiftRLV2I8 : VecShiftOp<V2AsmStr<"shr.u16">, srl, V2I8Regs, V2I32Regs, + SRLi8rr>; +def ShiftRLV4I8 : VecShiftOp<V4AsmStr<"shr.u16">, srl, V4I8Regs, V4I32Regs, + SRLi8rr>; + +defm VMult : IntBinVOp<"mul.lo.s", mul, MULTi64rr, MULTi32rr, MULTi16rr, + MULTi8rr>; +defm VMultHS : IntBinVOp<"mul.hi.s", mulhs, MULTHSi64rr, MULTHSi32rr, + MULTHSi16rr, + MULTHSi8rr>; +defm VMultHU : IntBinVOp<"mul.hi.u", mulhu, MULTHUi64rr, MULTHUi32rr, + MULTHUi16rr, + MULTHUi8rr>; +defm VSDiv : IntBinVOp<"div.s", sdiv, SDIVi64rr, SDIVi32rr, SDIVi16rr, + SDIVi8rr>; +defm VUDiv : IntBinVOp<"div.u", udiv, UDIVi64rr, UDIVi32rr, UDIVi16rr, + UDIVi8rr>; +defm VSRem : IntBinVOp<"rem.s", srem, SREMi64rr, SREMi32rr, SREMi16rr, + SREMi8rr>; +defm VURem : IntBinVOp<"rem.u", urem, UREMi64rr, UREMi32rr, UREMi16rr, + UREMi8rr>; +} + +def : Pat<(sra V2I16Regs:$src1, V2I16Regs:$src2), + (ShiftRAV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>; +def : Pat<(sra V2I8Regs:$src1, V2I8Regs:$src2), + (ShiftRAV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>; +def : Pat<(sra V2I64Regs:$src1, V2I64Regs:$src2), + (ShiftRAV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>; + +def : Pat<(sra V4I16Regs:$src1, V4I16Regs:$src2), + (ShiftRAV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>; +def : Pat<(sra V4I8Regs:$src1, V4I8Regs:$src2), + (ShiftRAV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>; + +def : Pat<(srl V2I16Regs:$src1, V2I16Regs:$src2), + (ShiftRLV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>; +def : Pat<(srl V2I8Regs:$src1, V2I8Regs:$src2), + (ShiftRLV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>; +def : Pat<(srl V2I64Regs:$src1, V2I64Regs:$src2), + (ShiftRLV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>; + +def : Pat<(srl V4I16Regs:$src1, V4I16Regs:$src2), + (ShiftRLV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>; +def : Pat<(srl V4I8Regs:$src1, V4I8Regs:$src2), + (ShiftRLV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>; + +multiclass VMAD<string asmstr, NVPTXRegClass regclassv4, + NVPTXRegClass regclassv2, + SDNode an=add, SDNode mn=mul, NVPTXInst sop=NOP, + Predicate Pred> { + def V4 : NVPTXVecInst<(outs regclassv4:$dst), + (ins regclassv4:$a, regclassv4:$b, regclassv4:$c), + V4MADStr<asmstr>.s, + [(set regclassv4:$dst, + (an (mn regclassv4:$a, regclassv4:$b), regclassv4:$c))], + 
sop>, + Requires<[Pred]>; + def V2 : NVPTXVecInst<(outs regclassv2:$dst), + (ins regclassv2:$a, regclassv2:$b, regclassv2:$c), + V2MADStr<asmstr>.s, + [(set regclassv2:$dst, + (an (mn regclassv2:$a, regclassv2:$b), regclassv2:$c))], + sop>, + Requires<[Pred]>; +} + +multiclass VMADV2Only<string asmstr, NVPTXRegClass regclass, NVPTXInst sop=NOP, + Predicate Pred> { + def V2 : NVPTXVecInst<(outs regclass:$dst), + (ins regclass:$a, regclass:$b, regclass:$c), + V2MADStr<asmstr>.s, + [(set regclass:$dst, (add + (mul regclass:$a, regclass:$b), regclass:$c))], sop>, + Requires<[Pred]>; +} +multiclass VFMADV2Only<string asmstr, NVPTXRegClass regclass, NVPTXInst sop=NOP, + Predicate Pred> { + def V2 : NVPTXVecInst<(outs regclass:$dst), + (ins regclass:$a, regclass:$b, regclass:$c), + V2MADStr<asmstr>.s, + [(set regclass:$dst, (fadd + (fmul regclass:$a, regclass:$b), regclass:$c))], sop>, + Requires<[Pred]>; +} + +let VecInstType=isVecOther.Value in { +defm I8MAD : VMAD<"mad.lo.s16", V4I8Regs, V2I8Regs, add, mul, MAD8rrr, true>; +defm I16MAD : VMAD<"mad.lo.s16", V4I16Regs, V2I16Regs, add, mul, MAD16rrr, + true>; +defm I32MAD : VMAD<"mad.lo.s32", V4I32Regs, V2I32Regs, add, mul, MAD32rrr, + true>; +defm I64MAD : VMADV2Only<"mad.lo.s64", V2I64Regs, MAD64rrr, true>; + +defm VNeg : IntUnaryVOp<"neg.s", ineg, INEG64, INEG32, INEG16, INEG8>; + +defm VAddf : FloatBinVOp<"add.", fadd, FADDf64rr, FADDf32rr, FADDf32rr_ftz>; +defm VSubf : FloatBinVOp<"sub.", fsub, FSUBf64rr, FSUBf32rr, FSUBf32rr_ftz>; +defm VMulf : FloatBinVOp<"mul.", fmul, FMULf64rr, FMULf32rr, FMULf32rr_ftz>; + +defm F32MAD_ftz : VMAD<"mad.ftz.f32", V4F32Regs, V2F32Regs, fadd, fmul, + FMAD32_ftzrrr, doFMADF32_ftz>; +defm F32FMA_ftz : VMAD<"fma.rn.ftz.f32", V4F32Regs, V2F32Regs, fadd, fmul, + FMA32_ftzrrr, doFMAF32_ftz>; +defm F32MAD : VMAD<"mad.f32", V4F32Regs, V2F32Regs, fadd, fmul, FMAD32rrr, + doFMADF32>; +defm F32FMA : VMAD<"fma.rn.f32", V4F32Regs, V2F32Regs, fadd, fmul, FMA32rrr, + doFMAF32>; +defm F64FMA : VFMADV2Only<"fma.rn.f64", V2F64Regs, FMA64rrr, doFMAF64>; +} + +let VecInstType=isVecOther.Value in { +def V4F32Div_prec_ftz : VecBinaryOp<V4AsmStr<"div.rn.ftz.f32">, fdiv, V4F32Regs, + FDIV32rr_prec_ftz>, Requires<[doF32FTZ, reqPTX20]>; +def V2F32Div_prec_ftz : VecBinaryOp<V2AsmStr<"div.rn.ftz.f32">, fdiv, V2F32Regs, + FDIV32rr_prec_ftz>, Requires<[doF32FTZ, reqPTX20]>; +def V4F32Div_prec : VecBinaryOp<V4AsmStr<"div.rn.f32">, fdiv, V4F32Regs, + FDIV32rr_prec>, Requires<[reqPTX20]>; +def V2F32Div_prec : VecBinaryOp<V2AsmStr<"div.rn.f32">, fdiv, V2F32Regs, + FDIV32rr_prec>, Requires<[reqPTX20]>; +def V2F32Div_ftz : VecBinaryOp<V2AsmStr<"div.full.ftz.f32">, fdiv, V2F32Regs, + FDIV32rr_ftz>, Requires<[doF32FTZ]>; +def V4F32Div_ftz : VecBinaryOp<V4AsmStr<"div.full.ftz.f32">, fdiv, V4F32Regs, + FDIV32rr_ftz>, Requires<[doF32FTZ]>; +def V2F32Div : VecBinaryOp<V2AsmStr<"div.full.f32">, fdiv, V2F32Regs, FDIV32rr>; +def V4F32Div : VecBinaryOp<V4AsmStr<"div.full.f32">, fdiv, V4F32Regs, FDIV32rr>; +def V2F64Div : VecBinaryOp<V2AsmStr<"div.rn.f64">, fdiv, V2F64Regs, FDIV64rr>; +} + +def fnegpat : PatFrag<(ops node:$in), (fneg node:$in)>; + +let VecInstType=isVecOther.Value in { +def VNegv2f32_ftz : VecUnaryOp<V2UnaryStr<"neg.ftz.f32">, fnegpat, V2F32Regs, + FNEGf32_ftz>, Requires<[doF32FTZ]>; +def VNegv4f32_ftz : VecUnaryOp<V4UnaryStr<"neg.ftz.f32">, fnegpat, V4F32Regs, + FNEGf32_ftz>, Requires<[doF32FTZ]>; +def VNegv2f32 : VecUnaryOp<V2UnaryStr<"neg.f32">, fnegpat, V2F32Regs, FNEGf32>; +def VNegv4f32 : VecUnaryOp<V4UnaryStr<"neg.f32">, fnegpat, 
V4F32Regs, FNEGf32>; +def VNegv2f64 : VecUnaryOp<V2UnaryStr<"neg.f64">, fnegpat, V2F64Regs, FNEGf64>; + +// Logical Arithmetic +defm VAnd : IntBinVOp<"and.b", and, ANDb64rr, ANDb32rr, ANDb16rr, ANDb8rr>; +defm VOr : IntBinVOp<"or.b", or, ORb64rr, ORb32rr, ORb16rr, ORb8rr>; +defm VXor : IntBinVOp<"xor.b", xor, XORb64rr, XORb32rr, XORb16rr, XORb8rr>; + +defm VNot : IntUnaryVOp<"not.b", not, NOT64, NOT32, NOT16, NOT8>; +} + + +multiclass V2FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> { + def : Pat<(fsub V2F32Regs:$a, (fmul V2F32Regs:$b, V2F32Regs:$c)), + (Inst (VNegv2f32 V2F32Regs:$b), V2F32Regs:$c, V2F32Regs:$a)>, + Requires<[Pred]>; + + def : Pat<(fsub (fmul V2F32Regs:$a, V2F32Regs:$b), V2F32Regs:$c), + (Inst V2F32Regs:$a, V2F32Regs:$b, (VNegv2f32 V2F32Regs:$c))>, + Requires<[Pred]>; +} + +defm V2FMAF32ext_ftz : V2FPCONTRACT32_SUB_PAT<F32FMA_ftzV2, doFMAF32AGG_ftz>; +defm V2FMADF32ext_ftz : V2FPCONTRACT32_SUB_PAT<F32MAD_ftzV2, doFMADF32_ftz>; +defm V2FMAF32ext : V2FPCONTRACT32_SUB_PAT<F32FMAV2, doFMAF32AGG>; +defm V2FMADF32ext : V2FPCONTRACT32_SUB_PAT<F32MADV2, doFMADF32>; + +multiclass V4FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> { + def : Pat<(fsub V4F32Regs:$a, (fmul V4F32Regs:$b, V4F32Regs:$c)), + (Inst (VNegv4f32 V4F32Regs:$b), V4F32Regs:$c, V4F32Regs:$a)>, + Requires<[Pred]>; + + def : Pat<(fsub (fmul V4F32Regs:$a, V4F32Regs:$b), V4F32Regs:$c), + (Inst V4F32Regs:$a, V4F32Regs:$b, (VNegv4f32 V4F32Regs:$c))>, + Requires<[Pred]>; +} + +defm V4FMAF32ext_ftz : V4FPCONTRACT32_SUB_PAT<F32FMA_ftzV4, doFMAF32AGG_ftz>; +defm V4FMADF32ext_ftz : V4FPCONTRACT32_SUB_PAT<F32MAD_ftzV4, doFMADF32_ftz>; +defm V4FMAF32ext : V4FPCONTRACT32_SUB_PAT<F32FMAV4, doFMAF32AGG>; +defm V4FMADF32ext : V4FPCONTRACT32_SUB_PAT<F32MADV4, doFMADF32>; + +multiclass V2FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> { + def : Pat<(fsub V2F64Regs:$a, (fmul V2F64Regs:$b, V2F64Regs:$c)), + (Inst (VNegv2f64 V2F64Regs:$b), V2F64Regs:$c, V2F64Regs:$a)>, + Requires<[Pred]>; + + def : Pat<(fsub (fmul V2F64Regs:$a, V2F64Regs:$b), V2F64Regs:$c), + (Inst V2F64Regs:$a, V2F64Regs:$b, (VNegv2f64 V2F64Regs:$c))>, + Requires<[Pred]>; +} + +defm V2FMAF64ext : V2FPCONTRACT64_SUB_PAT<F64FMAV2, doFMAF64AGG>; + +class VecModStr<string vecsize, string elem, string extra, string l=""> +{ + string t1 = !strconcat("${c", elem); + string t2 = !strconcat(t1, ":vecv"); + string t3 = !strconcat(t2, vecsize); + string t4 = !strconcat(t3, extra); + string t5 = !strconcat(t4, l); + string s = !strconcat(t5, "}"); +} +class ShuffleOneLine<string vecsize, string elem, string type> +{ + string t1 = VecModStr<vecsize, elem, "comm", "1">.s; + string t2 = !strconcat(t1, "mov."); + string t3 = !strconcat(t2, type); + string t4 = !strconcat(t3, " \t${dst}_"); + string t5 = !strconcat(t4, elem); + string t6 = !strconcat(t5, ", $src1"); + string t7 = !strconcat(t6, VecModStr<vecsize, elem, "pos">.s); + string t8 = !strconcat(t7, ";\n\t"); + string t9 = !strconcat(t8, VecModStr<vecsize, elem, "comm", "2">.s); + string t10 = !strconcat(t9, "mov."); + string t11 = !strconcat(t10, type); + string t12 = !strconcat(t11, " \t${dst}_"); + string t13 = !strconcat(t12, elem); + string t14 = !strconcat(t13, ", $src2"); + string t15 = !strconcat(t14, VecModStr<vecsize, elem, "pos">.s); + string s = !strconcat(t15, ";"); +} +class ShuffleAsmStr2<string type> +{ + string t1 = ShuffleOneLine<"2", "0", type>.s; + string t2 = !strconcat(t1, "\n\t"); + string s = !strconcat(t2, ShuffleOneLine<"2", "1", type>.s); +} +class ShuffleAsmStr4<string type> +{ + 
string t1 = ShuffleOneLine<"4", "0", type>.s; + string t2 = !strconcat(t1, "\n\t"); + string t3 = !strconcat(t2, ShuffleOneLine<"4", "1", type>.s); + string t4 = !strconcat(t3, "\n\t"); + string t5 = !strconcat(t4, ShuffleOneLine<"4", "2", type>.s); + string t6 = !strconcat(t5, "\n\t"); + string s = !strconcat(t6, ShuffleOneLine<"4", "3", type>.s); +} + +let neverHasSideEffects=1, VecInstType=isVecShuffle.Value in { +def VecShuffle_v4f32 : NVPTXVecInst<(outs V4F32Regs:$dst), + (ins V4F32Regs:$src1, V4F32Regs:$src2, + i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3), + !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t", + ShuffleAsmStr4<"f32">.s), + [], FMOV32rr>; + +def VecShuffle_v4i32 : NVPTXVecInst<(outs V4I32Regs:$dst), + (ins V4I32Regs:$src1, V4I32Regs:$src2, + i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3), + !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t", + ShuffleAsmStr4<"u32">.s), + [], IMOV32rr>; + +def VecShuffle_v4i16 : NVPTXVecInst<(outs V4I16Regs:$dst), + (ins V4I16Regs:$src1, V4I16Regs:$src2, + i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3), + !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t", + ShuffleAsmStr4<"u16">.s), + [], IMOV16rr>; + +def VecShuffle_v4i8 : NVPTXVecInst<(outs V4I8Regs:$dst), + (ins V4I8Regs:$src1, V4I8Regs:$src2, + i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3), + !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t", + ShuffleAsmStr4<"u16">.s), + [], IMOV8rr>; + +def VecShuffle_v2f32 : NVPTXVecInst<(outs V2F32Regs:$dst), + (ins V2F32Regs:$src1, V2F32Regs:$src2, + i8imm:$c0, i8imm:$c1), + !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t", + ShuffleAsmStr2<"f32">.s), + [], FMOV32rr>; + +def VecShuffle_v2i32 : NVPTXVecInst<(outs V2I32Regs:$dst), + (ins V2I32Regs:$src1, V2I32Regs:$src2, + i8imm:$c0, i8imm:$c1), + !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t", + ShuffleAsmStr2<"u32">.s), + [], IMOV32rr>; + +def VecShuffle_v2i8 : NVPTXVecInst<(outs V2I8Regs:$dst), + (ins V2I8Regs:$src1, V2I8Regs:$src2, + i8imm:$c0, i8imm:$c1), + !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t", + ShuffleAsmStr2<"u16">.s), + [], IMOV8rr>; + +def VecShuffle_v2i16 : NVPTXVecInst<(outs V2I16Regs:$dst), + (ins V2I16Regs:$src1, V2I16Regs:$src2, + i8imm:$c0, i8imm:$c1), + !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t", + ShuffleAsmStr2<"u16">.s), + [], IMOV16rr>; + +def VecShuffle_v2f64 : NVPTXVecInst<(outs V2F64Regs:$dst), + (ins V2F64Regs:$src1, V2F64Regs:$src2, + i8imm:$c0, i8imm:$c1), + !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t", + ShuffleAsmStr2<"f64">.s), + [], FMOV64rr>; + +def VecShuffle_v2i64 : NVPTXVecInst<(outs V2I64Regs:$dst), + (ins V2I64Regs:$src1, V2I64Regs:$src2, + i8imm:$c0, i8imm:$c1), + !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t", + ShuffleAsmStr2<"u64">.s), + [], IMOV64rr>; +} + +def ShuffleMask0 : SDNodeXForm<vector_shuffle, [{ + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + return CurDAG->getTargetConstant(SVOp->getMaskElt(0), MVT::i32); +}]>; +def ShuffleMask1 : SDNodeXForm<vector_shuffle, [{ + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + return CurDAG->getTargetConstant(SVOp->getMaskElt(1), MVT::i32); +}]>; +def ShuffleMask2 : SDNodeXForm<vector_shuffle, [{ + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + return CurDAG->getTargetConstant(SVOp->getMaskElt(2), MVT::i32); +}]>; +def ShuffleMask3 : SDNodeXForm<vector_shuffle, [{ + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + return CurDAG->getTargetConstant(SVOp->getMaskElt(3), 
MVT::i32); +}]>; + +// The spurious call is here to silence a compiler warning about N being +// unused. +def vec_shuf : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), + [{ N->getGluedNode(); return true; }]>; + +def : Pat<(v2f64 (vec_shuf:$op V2F64Regs:$src1, V2F64Regs:$src2)), + (VecShuffle_v2f64 V2F64Regs:$src1, V2F64Regs:$src2, + (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>; + +def : Pat<(v4f32 (vec_shuf:$op V4F32Regs:$src1, V4F32Regs:$src2)), + (VecShuffle_v4f32 V4F32Regs:$src1, V4F32Regs:$src2, + (ShuffleMask0 node:$op), (ShuffleMask1 node:$op), + (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>; + +def : Pat<(v2f32 (vec_shuf:$op V2F32Regs:$src1, V2F32Regs:$src2)), + (VecShuffle_v2f32 V2F32Regs:$src1, V2F32Regs:$src2, + (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>; + +def : Pat<(v2i64 (vec_shuf:$op V2I64Regs:$src1, V2I64Regs:$src2)), + (VecShuffle_v2i64 V2I64Regs:$src1, V2I64Regs:$src2, + (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>; + +def : Pat<(v4i32 (vec_shuf:$op V4I32Regs:$src1, V4I32Regs:$src2)), + (VecShuffle_v4i32 V4I32Regs:$src1, V4I32Regs:$src2, + (ShuffleMask0 node:$op), (ShuffleMask1 node:$op), + (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>; + +def : Pat<(v2i32 (vec_shuf:$op V2I32Regs:$src1, V2I32Regs:$src2)), + (VecShuffle_v2i32 V2I32Regs:$src1, V2I32Regs:$src2, + (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>; + +def : Pat<(v4i16 (vec_shuf:$op V4I16Regs:$src1, V4I16Regs:$src2)), + (VecShuffle_v4i16 V4I16Regs:$src1, V4I16Regs:$src2, + (ShuffleMask0 node:$op), (ShuffleMask1 node:$op), + (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>; + +def : Pat<(v2i16 (vec_shuf:$op V2I16Regs:$src1, V2I16Regs:$src2)), + (VecShuffle_v2i16 V2I16Regs:$src1, V2I16Regs:$src2, + (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>; + +def : Pat<(v4i8 (vec_shuf:$op V4I8Regs:$src1, V4I8Regs:$src2)), + (VecShuffle_v4i8 V4I8Regs:$src1, V4I8Regs:$src2, + (ShuffleMask0 node:$op), (ShuffleMask1 node:$op), + (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>; + +def : Pat<(v2i8 (vec_shuf:$op V2I8Regs:$src1, V2I8Regs:$src2)), + (VecShuffle_v2i8 V2I8Regs:$src1, V2I8Regs:$src2, + (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>; + +class Build_Vector2<string asmstr, NVPTXRegClass vclass, NVPTXRegClass sclass, + NVPTXInst si> + : NVPTXVecInst<(outs vclass:$dst), + (ins sclass:$a1, sclass:$a2), + !strconcat(asmstr, "\t${dst:vecfull}, {{$a1, $a2}};"), + [(set vclass:$dst, (build_vector sclass:$a1, sclass:$a2))], + si>; +class Build_Vector4<string asmstr, NVPTXRegClass vclass, NVPTXRegClass sclass, + NVPTXInst si> + : NVPTXVecInst<(outs vclass:$dst), + (ins sclass:$a1, sclass:$a2, sclass:$a3, sclass:$a4), + !strconcat(asmstr, "\t${dst:vecfull}, {{$a1, $a2, $a3, $a4}};"), + [(set vclass:$dst, + (build_vector sclass:$a1, sclass:$a2, + sclass:$a3, sclass:$a4))], si>; + +let isAsCheapAsAMove=1, VecInstType=isVecBuild.Value in { +def Build_Vector2_f32 : Build_Vector2<"mov.v2.f32", V2F32Regs, Float32Regs, + FMOV32rr>; +def Build_Vector2_f64 : Build_Vector2<"mov.v2.f64", V2F64Regs, Float64Regs, + FMOV64rr>; + +def Build_Vector2_i32 : Build_Vector2<"mov.v2.u32", V2I32Regs, Int32Regs, + IMOV32rr>; +def Build_Vector2_i64 : Build_Vector2<"mov.v2.u64", V2I64Regs, Int64Regs, + IMOV64rr>; +def Build_Vector2_i16 : Build_Vector2<"mov.v2.u16", V2I16Regs, Int16Regs, + IMOV16rr>; +def Build_Vector2_i8 : Build_Vector2<"mov.v2.u16", V2I8Regs, Int8Regs, + IMOV8rr>; + +def Build_Vector4_f32 : Build_Vector4<"mov.v4.f32", V4F32Regs, Float32Regs, + FMOV32rr>; + +def 
Build_Vector4_i32 : Build_Vector4<"mov.v4.u32", V4I32Regs, Int32Regs, + IMOV32rr>; +def Build_Vector4_i16 : Build_Vector4<"mov.v4.u16", V4I16Regs, Int16Regs, + IMOV16rr>; +def Build_Vector4_i8 : Build_Vector4<"mov.v4.u16", V4I8Regs, Int8Regs, + IMOV8rr>; +} + +class Vec_Move<string asmstr, NVPTXRegClass vclass, NVPTXInst sop=NOP> + : NVPTXVecInst<(outs vclass:$dst), (ins vclass:$src), + !strconcat(asmstr, "\t${dst:vecfull}, ${src:vecfull};"), + [], sop>; + +let isAsCheapAsAMove=1, neverHasSideEffects=1, IsSimpleMove=1, + VecInstType=isVecOther.Value in { +def V4f32Mov : Vec_Move<"mov.v4.f32", V4F32Regs, FMOV32rr>; +def V2f32Mov : Vec_Move<"mov.v2.f32", V2F32Regs, FMOV32rr>; + +def V4i32Mov : Vec_Move<"mov.v4.u32", V4I32Regs, IMOV32rr>; +def V2i32Mov : Vec_Move<"mov.v2.u32", V2I32Regs, IMOV32rr>; + +def V4i16Mov : Vec_Move<"mov.v4.u16", V4I16Regs, IMOV16rr>; +def V2i16Mov : Vec_Move<"mov.v2.u16", V2I16Regs, IMOV16rr>; + +def V4i8Mov : Vec_Move<"mov.v4.u16", V4I8Regs, IMOV8rr>; +def V2i8Mov : Vec_Move<"mov.v2.u16", V2I8Regs, IMOV8rr>; + +def V2f64Mov : Vec_Move<"mov.v2.f64", V2F64Regs, FMOV64rr>; +def V2i64Mov : Vec_Move<"mov.v2.u64", V2I64Regs, IMOV64rr>; +} + +// extract subvector patterns +def extract_subvec : SDNode<"ISD::EXTRACT_SUBVECTOR", + SDTypeProfile<1, 2, [SDTCisPtrTy<2>]>>; + +def : Pat<(v2f32 (extract_subvec V4F32Regs:$src, 0)), + (Build_Vector2_f32 (V4f32Extract V4F32Regs:$src, 0), + (V4f32Extract V4F32Regs:$src, 1))>; +def : Pat<(v2f32 (extract_subvec V4F32Regs:$src, 2)), + (Build_Vector2_f32 (V4f32Extract V4F32Regs:$src, 2), + (V4f32Extract V4F32Regs:$src, 3))>; +def : Pat<(v2i32 (extract_subvec V4I32Regs:$src, 0)), + (Build_Vector2_i32 (V4i32Extract V4I32Regs:$src, 0), + (V4i32Extract V4I32Regs:$src, 1))>; +def : Pat<(v2i32 (extract_subvec V4I32Regs:$src, 2)), + (Build_Vector2_i32 (V4i32Extract V4I32Regs:$src, 2), + (V4i32Extract V4I32Regs:$src, 3))>; +def : Pat<(v2i16 (extract_subvec V4I16Regs:$src, 0)), + (Build_Vector2_i16 (V4i16Extract V4I16Regs:$src, 0), + (V4i16Extract V4I16Regs:$src, 1))>; +def : Pat<(v2i16 (extract_subvec V4I16Regs:$src, 2)), + (Build_Vector2_i16 (V4i16Extract V4I16Regs:$src, 2), + (V4i16Extract V4I16Regs:$src, 3))>; +def : Pat<(v2i8 (extract_subvec V4I8Regs:$src, 0)), + (Build_Vector2_i8 (V4i8Extract V4I8Regs:$src, 0), + (V4i8Extract V4I8Regs:$src, 1))>; +def : Pat<(v2i8 (extract_subvec V4I8Regs:$src, 2)), + (Build_Vector2_i8 (V4i8Extract V4I8Regs:$src, 2), + (V4i8Extract V4I8Regs:$src, 3))>; + +// Select instructions +class Select_OneLine<string type, string pos> { + string t1 = !strconcat("selp.", type); + string t2 = !strconcat(t1, " \t${dst}_"); + string t3 = !strconcat(t2, pos); + string t4 = !strconcat(t3, ", ${src1}_"); + string t5 = !strconcat(t4, pos); + string t6 = !strconcat(t5, ", ${src2}_"); + string t7 = !strconcat(t6, pos); + string s = !strconcat(t7, ", $p;"); +} + +class Select_Str2<string type> { + string t1 = Select_OneLine<type, "0">.s; + string t2 = !strconcat(t1, "\n\t"); + string s = !strconcat(t2, Select_OneLine<type, "1">.s); +} + +class Select_Str4<string type> { + string t1 = Select_OneLine<type, "0">.s; + string t2 = !strconcat(t1, "\n\t"); + string t3 = !strconcat(t2, Select_OneLine<type, "1">.s); + string t4 = !strconcat(t3, "\n\t"); + string t5 = !strconcat(t4, Select_OneLine<type, "2">.s); + string t6 = !strconcat(t5, "\n\t"); + string s = !strconcat(t6, Select_OneLine<type, "3">.s); + +} + +class Vec_Select<NVPTXRegClass vclass, string asmstr, NVPTXInst sop> + : NVPTXVecInst<(outs vclass:$dst), + (ins vclass:$src1, 
vclass:$src2, Int1Regs:$p), + asmstr, + [(set vclass:$dst, (select Int1Regs:$p, vclass:$src1, + vclass:$src2))], + sop>; + +let VecInstType=isVecOther.Value in { +def V2I64_Select : Vec_Select<V2I64Regs, Select_Str2<"b64">.s, SELECTi64rr>; +def V4I32_Select : Vec_Select<V4I32Regs, Select_Str4<"b32">.s, SELECTi32rr>; +def V2I32_Select : Vec_Select<V2I32Regs, Select_Str2<"b32">.s, SELECTi32rr>; +def V4I16_Select : Vec_Select<V4I16Regs, Select_Str4<"b16">.s, SELECTi16rr>; +def V2I16_Select : Vec_Select<V2I16Regs, Select_Str2<"b16">.s, SELECTi16rr>; +def V4I8_Select : Vec_Select<V4I8Regs, Select_Str4<"b16">.s, SELECTi8rr>; +def V2I8_Select : Vec_Select<V2I8Regs, Select_Str2<"b16">.s, SELECTi8rr>; + +def V2F64_Select : Vec_Select<V2F64Regs, Select_Str2<"f64">.s, SELECTf64rr>; +def V4F32_Select : Vec_Select<V4F32Regs, Select_Str4<"f32">.s, SELECTf32rr>; +def V2F32_Select : Vec_Select<V2F32Regs, Select_Str2<"f32">.s, SELECTf32rr>; +} + +// Comparison instructions + +// setcc convenience fragments. +def vsetoeq : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETOEQ)>; +def vsetogt : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETOGT)>; +def vsetoge : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETOGE)>; +def vsetolt : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETOLT)>; +def vsetole : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETOLE)>; +def vsetone : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETONE)>; +def vseto : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETO)>; +def vsetuo : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETUO)>; +def vsetueq : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETUEQ)>; +def vsetugt : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETUGT)>; +def vsetuge : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETUGE)>; +def vsetult : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETULT)>; +def vsetule : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETULE)>; +def vsetune : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETUNE)>; +def vseteq : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETEQ)>; +def vsetgt : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETGT)>; +def vsetge : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETGE)>; +def vsetlt : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETLT)>; +def vsetle : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETLE)>; +def vsetne : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETNE)>; + +class Vec_Compare<PatFrag op, NVPTXRegClass outrclass, NVPTXRegClass inrclass, + NVPTXInst sop> + : NVPTXVecInst<(outs outrclass:$dst), + (ins inrclass:$a, inrclass:$b), + "Unsupported", + [(set outrclass:$dst, (op inrclass:$a, inrclass:$b))], + sop>; + +multiclass Vec_Compare_All<PatFrag op, + NVPTXInst inst8, + NVPTXInst inst16, + NVPTXInst inst32, + NVPTXInst inst64> +{ + def V2I8 : Vec_Compare<op, V2I8Regs, V2I8Regs, inst8>; + def V4I8 : Vec_Compare<op, V4I8Regs, V4I8Regs, inst8>; + def V2I16 : Vec_Compare<op, V2I16Regs, V2I16Regs, inst16>; + def V4I16 : Vec_Compare<op, V4I16Regs, V4I16Regs, inst16>; + def V2I32 : Vec_Compare<op, V2I32Regs, V2I32Regs, inst32>; + def V4I32 : Vec_Compare<op, V4I32Regs, 
V4I32Regs, inst32>; + def V2I64 : Vec_Compare<op, V2I64Regs, V2I64Regs, inst64>; +} + +let VecInstType=isVecOther.Value in { + defm VecSGT : Vec_Compare_All<vsetgt, ISetSGTi8rr_toi8, ISetSGTi16rr_toi16, + ISetSGTi32rr_toi32, ISetSGTi64rr_toi64>; + defm VecUGT : Vec_Compare_All<vsetugt, ISetUGTi8rr_toi8, ISetUGTi16rr_toi16, + ISetUGTi32rr_toi32, ISetUGTi64rr_toi64>; + defm VecSLT : Vec_Compare_All<vsetlt, ISetSLTi8rr_toi8, ISetSLTi16rr_toi16, + ISetSLTi32rr_toi32, ISetSLTi64rr_toi64>; + defm VecULT : Vec_Compare_All<vsetult, ISetULTi8rr_toi8, ISetULTi16rr_toi16, + ISetULTi32rr_toi32, ISetULTi64rr_toi64>; + defm VecSGE : Vec_Compare_All<vsetge, ISetSGEi8rr_toi8, ISetSGEi16rr_toi16, + ISetSGEi32rr_toi32, ISetSGEi64rr_toi64>; + defm VecUGE : Vec_Compare_All<vsetuge, ISetUGEi8rr_toi8, ISetUGEi16rr_toi16, + ISetUGEi32rr_toi32, ISetUGEi64rr_toi64>; + defm VecSLE : Vec_Compare_All<vsetle, ISetSLEi8rr_toi8, ISetSLEi16rr_toi16, + ISetSLEi32rr_toi32, ISetSLEi64rr_toi64>; + defm VecULE : Vec_Compare_All<vsetule, ISetULEi8rr_toi8, ISetULEi16rr_toi16, + ISetULEi32rr_toi32, ISetULEi64rr_toi64>; + defm VecSEQ : Vec_Compare_All<vseteq, ISetSEQi8rr_toi8, ISetSEQi16rr_toi16, + ISetSEQi32rr_toi32, ISetSEQi64rr_toi64>; + defm VecUEQ : Vec_Compare_All<vsetueq, ISetUEQi8rr_toi8, ISetUEQi16rr_toi16, + ISetUEQi32rr_toi32, ISetUEQi64rr_toi64>; + defm VecSNE : Vec_Compare_All<vsetne, ISetSNEi8rr_toi8, ISetSNEi16rr_toi16, + ISetSNEi32rr_toi32, ISetSNEi64rr_toi64>; + defm VecUNE : Vec_Compare_All<vsetune, ISetUNEi8rr_toi8, ISetUNEi16rr_toi16, + ISetUNEi32rr_toi32, ISetUNEi64rr_toi64>; +} + +multiclass FVec_Compare_All<PatFrag op, + NVPTXInst instf32, + NVPTXInst instf64> +{ + def V2F32 : Vec_Compare<op, V2I32Regs, V2F32Regs, instf32>; + def V4F32 : Vec_Compare<op, V4I32Regs, V4F32Regs, instf32>; + def V2F64 : Vec_Compare<op, V2I64Regs, V2F64Regs, instf64>; +} + +let VecInstType=isVecOther.Value in { + defm FVecGT : FVec_Compare_All<vsetogt, FSetGTf32rr_toi32, + FSetGTf64rr_toi64>; + defm FVecLT : FVec_Compare_All<vsetolt, FSetLTf32rr_toi32, + FSetLTf64rr_toi64>; + defm FVecGE : FVec_Compare_All<vsetoge, FSetGEf32rr_toi32, + FSetGEf64rr_toi64>; + defm FVecLE : FVec_Compare_All<vsetole, FSetLEf32rr_toi32, + FSetLEf64rr_toi64>; + defm FVecEQ : FVec_Compare_All<vsetoeq, FSetEQf32rr_toi32, + FSetEQf64rr_toi64>; + defm FVecNE : FVec_Compare_All<vsetone, FSetNEf32rr_toi32, + FSetNEf64rr_toi64>; + + defm FVecUGT : FVec_Compare_All<vsetugt, FSetUGTf32rr_toi32, + FSetUGTf64rr_toi64>; + defm FVecULT : FVec_Compare_All<vsetult, FSetULTf32rr_toi32, + FSetULTf64rr_toi64>; + defm FVecUGE : FVec_Compare_All<vsetuge, FSetUGEf32rr_toi32, + FSetUGEf64rr_toi64>; + defm FVecULE : FVec_Compare_All<vsetule, FSetULEf32rr_toi32, + FSetULEf64rr_toi64>; + defm FVecUEQ : FVec_Compare_All<vsetueq, FSetUEQf32rr_toi32, + FSetUEQf64rr_toi64>; + defm FVecUNE : FVec_Compare_All<vsetune, FSetUNEf32rr_toi32, + FSetUNEf64rr_toi64>; + + defm FVecNUM : FVec_Compare_All<vseto, FSetNUMf32rr_toi32, + FSetNUMf64rr_toi64>; + defm FVecNAN : FVec_Compare_All<vsetuo, FSetNANf32rr_toi32, + FSetNANf64rr_toi64>; +} + +class LoadParamScalar4Inst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs regclass:$d1, regclass:$d2, regclass:$d3, regclass:$d4), + (ins i32imm:$a, i32imm:$b), + !strconcat(!strconcat("ld.param", opstr), + "\t{{$d1, $d2, $d3, $d4}}, [retval0+$b];"), []>; + +class LoadParamScalar2Inst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs regclass:$d1, regclass:$d2), + (ins i32imm:$a, i32imm:$b), + !strconcat(!strconcat("ld.param", opstr), 
+ "\t{{$d1, $d2}}, [retval0+$b];"), []>; + + +class StoreParamScalar4Inst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), + (ins regclass:$s1, regclass:$s2, regclass:$s3, regclass:$s4, + i32imm:$a, i32imm:$b), + !strconcat(!strconcat("st.param", opstr), + "\t[param$a+$b], {{$s1, $s2, $s3, $s4}};"), []>; + +class StoreParamScalar2Inst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), + (ins regclass:$s1, regclass:$s2, i32imm:$a, i32imm:$b), + !strconcat(!strconcat("st.param", opstr), + "\t[param$a+$b], {{$s1, $s2}};"), []>; + +class StoreRetvalScalar4Inst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), + (ins regclass:$s1, regclass:$s2, regclass:$s3, regclass:$s4, + i32imm:$a), + !strconcat(!strconcat("st.param", opstr), + "\t[func_retval+$a], {{$s1, $s2, $s3, $s4}};"), []>; + +class StoreRetvalScalar2Inst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), + (ins regclass:$s1, regclass:$s2, i32imm:$a), + !strconcat(!strconcat("st.param", opstr), + "\t[func_retval+$a], {{$s1, $s2}};"), []>; + +def LoadParamScalar4I32 : LoadParamScalar4Inst<Int32Regs, ".v4.b32">; +def LoadParamScalar4I16 : LoadParamScalar4Inst<Int16Regs, ".v4.b16">; +def LoadParamScalar4I8 : LoadParamScalar4Inst<Int8Regs, ".v4.b8">; + +def LoadParamScalar2I64 : LoadParamScalar2Inst<Int32Regs, ".v2.b64">; +def LoadParamScalar2I32 : LoadParamScalar2Inst<Int32Regs, ".v2.b32">; +def LoadParamScalar2I16 : LoadParamScalar2Inst<Int32Regs, ".v2.b16">; +def LoadParamScalar2I8 : LoadParamScalar2Inst<Int32Regs, ".v2.b8">; + +def LoadParamScalar4F32 : LoadParamScalar4Inst<Float32Regs, ".v4.f32">; +def LoadParamScalar2F32 : LoadParamScalar2Inst<Float32Regs, ".v2.f32">; +def LoadParamScalar2F64 : LoadParamScalar2Inst<Float64Regs, ".v2.f64">; + +def StoreParamScalar4I32 : StoreParamScalar4Inst<Int32Regs, ".v4.b32">; +def StoreParamScalar4I16 : StoreParamScalar4Inst<Int16Regs, ".v4.b16">; +def StoreParamScalar4I8 : StoreParamScalar4Inst<Int8Regs, ".v4.b8">; + +def StoreParamScalar2I64 : StoreParamScalar2Inst<Int64Regs, ".v2.b64">; +def StoreParamScalar2I32 : StoreParamScalar2Inst<Int32Regs, ".v2.b32">; +def StoreParamScalar2I16 : StoreParamScalar2Inst<Int16Regs, ".v2.b16">; +def StoreParamScalar2I8 : StoreParamScalar2Inst<Int8Regs, ".v2.b8">; + +def StoreParamScalar4F32 : StoreParamScalar4Inst<Float32Regs, ".v4.f32">; +def StoreParamScalar2F32 : StoreParamScalar2Inst<Float32Regs, ".v2.f32">; +def StoreParamScalar2F64 : StoreParamScalar2Inst<Float64Regs, ".v2.f64">; + +def StoreRetvalScalar4I32 : StoreRetvalScalar4Inst<Int32Regs, ".v4.b32">; +def StoreRetvalScalar4I16 : StoreRetvalScalar4Inst<Int16Regs, ".v4.b16">; +def StoreRetvalScalar4I8 : StoreRetvalScalar4Inst<Int8Regs, ".v4.b8">; + +def StoreRetvalScalar2I64 : StoreRetvalScalar2Inst<Int64Regs, ".v2.b64">; +def StoreRetvalScalar2I32 : StoreRetvalScalar2Inst<Int32Regs, ".v2.b32">; +def StoreRetvalScalar2I16 : StoreRetvalScalar2Inst<Int16Regs, ".v2.b16">; +def StoreRetvalScalar2I8 : StoreRetvalScalar2Inst<Int8Regs, ".v2.b8">; + +def StoreRetvalScalar4F32 : StoreRetvalScalar4Inst<Float32Regs, ".v4.f32">; +def StoreRetvalScalar2F32 : StoreRetvalScalar2Inst<Float32Regs, ".v2.f32">; +def StoreRetvalScalar2F64 : StoreRetvalScalar2Inst<Float64Regs, ".v2.f64">; + +class LoadParamVecInst<NVPTXRegClass regclass, string opstr, NVPTXInst sop=NOP>: + NVPTXVecInst<(outs regclass:$dst), (ins i32imm:$a, i32imm:$b), + "loadparam : $dst <- [$a, $b]", + [(set regclass:$dst, (LoadParam (i32 imm:$a), (i32 imm:$b)))], + sop>; + +class StoreParamVecInst<NVPTXRegClass 
regclass, string opstr, NVPTXInst sop=NOP> + : NVPTXVecInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b), + "storeparam : [$a, $b] <- $val", + [(StoreParam (i32 imm:$a), (i32 imm:$b), regclass:$val)], sop>; + +class StoreRetvalVecInst<NVPTXRegClass regclass, string opstr, + NVPTXInst sop=NOP> + : NVPTXVecInst<(outs), (ins regclass:$val, i32imm:$a), + "storeretval : retval[$a] <- $val", + [(StoreRetval (i32 imm:$a), regclass:$val)], sop>; + +let VecInstType=isVecLD.Value in { +def LoadParamV4I32 : LoadParamVecInst<V4I32Regs, ".v4.b32", + LoadParamScalar4I32>; +def LoadParamV4I16 : LoadParamVecInst<V4I16Regs, ".v4.b16", + LoadParamScalar4I16>; +def LoadParamV4I8 : LoadParamVecInst<V4I8Regs, ".v4.b8", + LoadParamScalar4I8>; + +def LoadParamV2I64 : LoadParamVecInst<V2I64Regs, ".v2.b64", + LoadParamScalar2I64>; +def LoadParamV2I32 : LoadParamVecInst<V2I32Regs, ".v2.b32", + LoadParamScalar2I32>; +def LoadParamV2I16 : LoadParamVecInst<V2I16Regs, ".v2.b16", + LoadParamScalar2I16>; +def LoadParamV2I8 : LoadParamVecInst<V2I8Regs, ".v2.b8", + LoadParamScalar2I8>; + +def LoadParamV4F32 : LoadParamVecInst<V4F32Regs, ".v4.f32", + LoadParamScalar4F32>; +def LoadParamV2F32 : LoadParamVecInst<V2F32Regs, ".v2.f32", + LoadParamScalar2F32>; +def LoadParamV2F64 : LoadParamVecInst<V2F64Regs, ".v2.f64", + LoadParamScalar2F64>; +} + +let VecInstType=isVecST.Value in { +def StoreParamV4I32 : StoreParamVecInst<V4I32Regs, ".v4.b32", + StoreParamScalar4I32>; +def StoreParamV4I16 : StoreParamVecInst<V4I16Regs, ".v4.b16", + StoreParamScalar4I16>; +def StoreParamV4I8 : StoreParamVecInst<V4I8Regs, ".v4.b8", + StoreParamScalar4I8>; + +def StoreParamV2I64 : StoreParamVecInst<V2I64Regs, ".v2.b64", + StoreParamScalar2I64>; +def StoreParamV2I32 : StoreParamVecInst<V2I32Regs, ".v2.b32", + StoreParamScalar2I32>; +def StoreParamV2I16 : StoreParamVecInst<V2I16Regs, ".v2.b16", + StoreParamScalar2I16>; +def StoreParamV2I8 : StoreParamVecInst<V2I8Regs, ".v2.b8", + StoreParamScalar2I8>; + +def StoreParamV4F32 : StoreParamVecInst<V4F32Regs, ".v4.f32", + StoreParamScalar4F32>; +def StoreParamV2F32 : StoreParamVecInst<V2F32Regs, ".v2.f32", + StoreParamScalar2F32>; +def StoreParamV2F64 : StoreParamVecInst<V2F64Regs, ".v2.f64", + StoreParamScalar2F64>; + +def StoreRetvalV4I32 : StoreRetvalVecInst<V4I32Regs, ".v4.b32", + StoreRetvalScalar4I32>; +def StoreRetvalV4I16 : StoreRetvalVecInst<V4I16Regs, ".v4.b16", + StoreRetvalScalar4I16>; +def StoreRetvalV4I8 : StoreRetvalVecInst<V4I8Regs, ".v4.b8", + StoreRetvalScalar4I8>; + +def StoreRetvalV2I64 : StoreRetvalVecInst<V2I64Regs, ".v2.b64", + StoreRetvalScalar2I64>; +def StoreRetvalV2I32 : StoreRetvalVecInst<V2I32Regs, ".v2.b32", + StoreRetvalScalar2I32>; +def StoreRetvalV2I16 : StoreRetvalVecInst<V2I16Regs, ".v2.b16", + StoreRetvalScalar2I16>; +def StoreRetvalV2I8 : StoreRetvalVecInst<V2I8Regs, ".v2.b8", + StoreRetvalScalar2I8>; + +def StoreRetvalV4F32 : StoreRetvalVecInst<V4F32Regs, ".v4.f32", + StoreRetvalScalar4F32>; +def StoreRetvalV2F32 : StoreRetvalVecInst<V2F32Regs, ".v2.f32", + StoreRetvalScalar2F32>; +def StoreRetvalV2F64 : StoreRetvalVecInst<V2F64Regs, ".v2.f64", + StoreRetvalScalar2F64>; + +} + + +// Int vector to int scalar bit convert +// v4i8 -> i32 +def : Pat<(i32 (bitconvert V4I8Regs:$s)), + (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1), + (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3))>; +// v4i16 -> i64 +def : Pat<(i64 (bitconvert V4I16Regs:$s)), + (V4I16toI64 (V4i16Extract V4I16Regs:$s,0), + (V4i16Extract V4I16Regs:$s,1), + (V4i16Extract 
V4I16Regs:$s,2), + (V4i16Extract V4I16Regs:$s,3))>; +// v2i8 -> i16 +def : Pat<(i16 (bitconvert V2I8Regs:$s)), + (V2I8toI16 (V2i8Extract V2I8Regs:$s,0), (V2i8Extract V2I8Regs:$s,1))>; +// v2i16 -> i32 +def : Pat<(i32 (bitconvert V2I16Regs:$s)), + (V2I16toI32 (V2i16Extract V2I16Regs:$s,0), + (V2i16Extract V2I16Regs:$s,1))>; +// v2i32 -> i64 +def : Pat<(i64 (bitconvert V2I32Regs:$s)), + (V2I32toI64 (V2i32Extract V2I32Regs:$s,0), + (V2i32Extract V2I32Regs:$s,1))>; + +// Int scalar to int vector bit convert +let VecInstType=isVecDest.Value in { +// i32 -> v4i8 +def VecI32toV4I8 : NVPTXVecInst<(outs V4I8Regs:$d), (ins Int32Regs:$s), + "Error!", + [(set V4I8Regs:$d, (bitconvert Int32Regs:$s))], + I32toV4I8>; +// i64 -> v4i16 +def VecI64toV4I16 : NVPTXVecInst<(outs V4I16Regs:$d), (ins Int64Regs:$s), + "Error!", + [(set V4I16Regs:$d, (bitconvert Int64Regs:$s))], + I64toV4I16>; +// i16 -> v2i8 +def VecI16toV2I8 : NVPTXVecInst<(outs V2I8Regs:$d), (ins Int16Regs:$s), + "Error!", + [(set V2I8Regs:$d, (bitconvert Int16Regs:$s))], + I16toV2I8>; +// i32 -> v2i16 +def VecI32toV2I16 : NVPTXVecInst<(outs V2I16Regs:$d), (ins Int32Regs:$s), + "Error!", + [(set V2I16Regs:$d, (bitconvert Int32Regs:$s))], + I32toV2I16>; +// i64 -> v2i32 +def VecI64toV2I32 : NVPTXVecInst<(outs V2I32Regs:$d), (ins Int64Regs:$s), + "Error!", + [(set V2I32Regs:$d, (bitconvert Int64Regs:$s))], + I64toV2I32>; +} + +// Int vector to int vector bit convert +// v4i8 -> v2i16 +def : Pat<(v2i16 (bitconvert V4I8Regs:$s)), + (VecI32toV2I16 + (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1), + (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3)))>; +// v4i16 -> v2i32 +def : Pat<(v2i32 (bitconvert V4I16Regs:$s)), + (VecI64toV2I32 + (V4I16toI64 (V4i16Extract V4I16Regs:$s,0), (V4i16Extract V4I16Regs:$s,1), + (V4i16Extract V4I16Regs:$s,2), (V4i16Extract V4I16Regs:$s,3)))>; +// v2i16 -> v4i8 +def : Pat<(v4i8 (bitconvert V2I16Regs:$s)), + (VecI32toV4I8 + (V2I16toI32 (V2i16Extract V2I16Regs:$s,0), (V2i16Extract V2I16Regs:$s,1)))>; +// v2i32 -> v4i16 +def : Pat<(v4i16 (bitconvert V2I32Regs:$s)), + (VecI64toV4I16 + (V2I32toI64 (V2i32Extract V2I32Regs:$s,0), (V2i32Extract V2I32Regs:$s,1)))>; +// v2i64 -> v4i32 +def : Pat<(v4i32 (bitconvert V2I64Regs:$s)), + (Build_Vector4_i32 + (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 0)), 0), + (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 0)), 1), + (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 1)), 0), + (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 1)), 1))>; +// v4i32 -> v2i64 +def : Pat<(v2i64 (bitconvert V4I32Regs:$s)), + (Build_Vector2_i64 + (V2I32toI64 (V4i32Extract V4I32Regs:$s,0), (V4i32Extract V4I32Regs:$s,1)), + (V2I32toI64 (V4i32Extract V4I32Regs:$s,2), (V4i32Extract V4I32Regs:$s,3)))>; + +// Fp scalar to fp vector convert +// f64 -> v2f32 +let VecInstType=isVecDest.Value in { +def VecF64toV2F32 : NVPTXVecInst<(outs V2F32Regs:$d), (ins Float64Regs:$s), + "Error!", + [(set V2F32Regs:$d, (bitconvert Float64Regs:$s))], + F64toV2F32>; +} + +// Fp vector to fp scalar convert +// v2f32 -> f64 +def : Pat<(f64 (bitconvert V2F32Regs:$s)), + (V2F32toF64 (V2f32Extract V2F32Regs:$s,0), (V2f32Extract V2F32Regs:$s,1))>; + +// Fp scalar to int vector convert +// f32 -> v4i8 +def : Pat<(v4i8 (bitconvert Float32Regs:$s)), + (VecI32toV4I8 (BITCONVERT_32_F2I Float32Regs:$s))>; +// f32 -> v2i16 +def : Pat<(v2i16 (bitconvert Float32Regs:$s)), + (VecI32toV2I16 (BITCONVERT_32_F2I Float32Regs:$s))>; +// f64 -> v4i16 +def : Pat<(v4i16 (bitconvert 
Float64Regs:$s)), + (VecI64toV4I16 (BITCONVERT_64_F2I Float64Regs:$s))>; +// f64 -> v2i32 +def : Pat<(v2i32 (bitconvert Float64Regs:$s)), + (VecI64toV2I32 (BITCONVERT_64_F2I Float64Regs:$s))>; + +// Int vector to fp scalar convert +// v4i8 -> f32 +def : Pat<(f32 (bitconvert V4I8Regs:$s)), + (BITCONVERT_32_I2F + (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1), + (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3)))>; +// v4i16 -> f64 +def : Pat<(f64 (bitconvert V4I16Regs:$s)), + (BITCONVERT_64_I2F + (V4I16toI64 (V4i16Extract V4I16Regs:$s,0), (V4i16Extract V4I16Regs:$s,1), + (V4i16Extract V4I16Regs:$s,2), (V4i16Extract V4I16Regs:$s,3)))>; +// v2i16 -> f32 +def : Pat<(f32 (bitconvert V2I16Regs:$s)), + (BITCONVERT_32_I2F + (V2I16toI32 (V2i16Extract V2I16Regs:$s,0), (V2i16Extract V2I16Regs:$s,1)))>; +// v2i32 -> f64 +def : Pat<(f64 (bitconvert V2I32Regs:$s)), + (BITCONVERT_64_I2F + (V2I32toI64 (V2i32Extract V2I32Regs:$s,0), (V2i32Extract V2I32Regs:$s,1)))>; + +// Int scalar to fp vector convert +// i64 -> v2f32 +def : Pat<(v2f32 (bitconvert Int64Regs:$s)), + (VecF64toV2F32 (BITCONVERT_64_I2F Int64Regs:$s))>; + +// Fp vector to int scalar convert +// v2f32 -> i64 +def : Pat<(i64 (bitconvert V2F32Regs:$s)), + (BITCONVERT_64_F2I + (V2F32toF64 (V2f32Extract V2F32Regs:$s,0), (V2f32Extract V2F32Regs:$s,1)))>; + +// Int vector to fp vector convert +// v2i64 -> v4f32 +def : Pat<(v4f32 (bitconvert V2I64Regs:$s)), + (Build_Vector4_f32 + (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32 + (V2i64Extract V2I64Regs:$s, 0)), 0)), + (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32 + (V2i64Extract V2I64Regs:$s, 0)), 1)), + (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32 + (V2i64Extract V2I64Regs:$s, 1)), 0)), + (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32 + (V2i64Extract V2I64Regs:$s, 1)), 1)))>; +// v2i64 -> v2f64 +def : Pat<(v2f64 (bitconvert V2I64Regs:$s)), + (Build_Vector2_f64 + (BITCONVERT_64_I2F (V2i64Extract V2I64Regs:$s,0)), + (BITCONVERT_64_I2F (V2i64Extract V2I64Regs:$s,1)))>; +// v2i32 -> v2f32 +def : Pat<(v2f32 (bitconvert V2I32Regs:$s)), + (Build_Vector2_f32 + (BITCONVERT_32_I2F (V2i32Extract V2I32Regs:$s,0)), + (BITCONVERT_32_I2F (V2i32Extract V2I32Regs:$s,1)))>; +// v4i32 -> v2f64 +def : Pat<(v2f64 (bitconvert V4I32Regs:$s)), + (Build_Vector2_f64 + (BITCONVERT_64_I2F (V2I32toI64 (V4i32Extract V4I32Regs:$s,0), + (V4i32Extract V4I32Regs:$s,1))), + (BITCONVERT_64_I2F (V2I32toI64 (V4i32Extract V4I32Regs:$s,2), + (V4i32Extract V4I32Regs:$s,3))))>; +// v4i32 -> v4f32 +def : Pat<(v4f32 (bitconvert V4I32Regs:$s)), + (Build_Vector4_f32 + (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,0)), + (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,1)), + (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,2)), + (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,3)))>; +// v4i16 -> v2f32 +def : Pat<(v2f32 (bitconvert V4I16Regs:$s)), + (VecF64toV2F32 (BITCONVERT_64_I2F + (V4I16toI64 (V4i16Extract V4I16Regs:$s,0), + (V4i16Extract V4I16Regs:$s,1), + (V4i16Extract V4I16Regs:$s,2), + (V4i16Extract V4I16Regs:$s,3))))>; + +// Fp vector to int vector convert +// v2i64 <- v4f32 +def : Pat<(v2i64 (bitconvert V4F32Regs:$s)), + (Build_Vector2_i64 + (BITCONVERT_64_F2I (V2F32toF64 (V4f32Extract V4F32Regs:$s,0), + (V4f32Extract V4F32Regs:$s,1))), + (BITCONVERT_64_F2I (V2F32toF64 (V4f32Extract V4F32Regs:$s,2), + (V4f32Extract V4F32Regs:$s,3))))>; +// v2i64 <- v2f64 +def : Pat<(v2i64 (bitconvert V2F64Regs:$s)), + (Build_Vector2_i64 + (BITCONVERT_64_F2I (V2f64Extract V2F64Regs:$s,0)), + (BITCONVERT_64_F2I (V2f64Extract 
V2F64Regs:$s,1)))>; +// v2i32 <- v2f32 +def : Pat<(v2i32 (bitconvert V2F32Regs:$s)), + (Build_Vector2_i32 + (BITCONVERT_32_F2I (V2f32Extract V2F32Regs:$s,0)), + (BITCONVERT_32_F2I (V2f32Extract V2F32Regs:$s,1)))>; +// v4i32 <- v2f64 +def : Pat<(v4i32 (bitconvert V2F64Regs:$s)), + (Build_Vector4_i32 + (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32 + (V2f64Extract V2F64Regs:$s, 0)), 0)), + (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32 + (V2f64Extract V2F64Regs:$s, 0)), 1)), + (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32 + (V2f64Extract V2F64Regs:$s, 1)), 0)), + (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32 + (V2f64Extract V2F64Regs:$s, 1)), 1)))>; +// v4i32 <- v4f32 +def : Pat<(v4i32 (bitconvert V4F32Regs:$s)), + (Build_Vector4_i32 + (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,0)), + (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,1)), + (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,2)), + (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,3)))>; +// v4i16 <- v2f32 +def : Pat<(v4i16 (bitconvert V2F32Regs:$s)), + (VecI64toV4I16 (BITCONVERT_64_F2I + (V2F32toF64 (V2f32Extract V2F32Regs:$s,0), + (V2f32Extract V2F32Regs:$s,1))))>; diff --git a/lib/Target/NVPTX/NVPTXutil.cpp b/lib/Target/NVPTX/NVPTXutil.cpp new file mode 100644 index 0000000..6a0e532 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXutil.cpp @@ -0,0 +1,92 @@ +//===-- NVPTXutil.cpp - Functions exported to CodeGen --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the functions that can be used in CodeGen. +// +//===----------------------------------------------------------------------===// + +#include "NVPTXutil.h" +#include "NVPTX.h" + +using namespace llvm; + +namespace llvm { + +bool isParamLoad(const MachineInstr *MI) +{ + if ((MI->getOpcode() != NVPTX::LD_i32_avar) && + (MI->getOpcode() != NVPTX::LD_i64_avar)) + return false; + if (MI->getOperand(2).isImm() == false) + return false; + if (MI->getOperand(2).getImm() != NVPTX::PTXLdStInstCode::PARAM) + return false; + return true; +} + +#define DATA_MASK 0x7f +#define DIGIT_WIDTH 7 +#define MORE_BYTES 0x80 + +static int encode_leb128(uint64_t val, int *nbytes, + char *space, int splen) +{ + char *a; + char *end = space + splen; + + a = space; + do { + unsigned char uc; + + if (a >= end) + return 1; + uc = val & DATA_MASK; + val >>= DIGIT_WIDTH; + if (val != 0) + uc |= MORE_BYTES; + *a = uc; + a++; + } while (val); + *nbytes = a - space; + return 0; +} + +#undef DATA_MASK +#undef DIGIT_WIDTH +#undef MORE_BYTES + +uint64_t encode_leb128(const char *str) +{ + union { uint64_t x; char a[8]; } temp64; + + temp64.x = 0; + + for (unsigned i=0,e=strlen(str); i!=e; ++i) + temp64.a[i] = str[e-1-i]; + + char encoded[16]; + int nbytes; + + int retval = encode_leb128(temp64.x, &nbytes, encoded, 16); + + (void)retval; + assert(retval == 0 && + "Encoding to leb128 failed"); + + assert(nbytes <= 8 && + "Cannot support register names with leb128 encoding > 8 bytes"); + + temp64.x = 0; + for (int i=0; i<nbytes; ++i) + temp64.a[i] = encoded[i]; + + return temp64.x; +} + +} // end namespace llvm diff --git a/lib/Target/NVPTX/NVPTXutil.h b/lib/Target/NVPTX/NVPTXutil.h new file mode 100644 index 0000000..d1d1171 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXutil.h @@ -0,0 +1,25 @@ +//===-- NVPTXutil.h - Functions exported to CodeGen --*- C++ -*-===// +// +// The LLVM 
Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the functions that can be used in CodeGen. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_NVPTX_UTIL_H +#define LLVM_TARGET_NVPTX_UTIL_H + +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" + +namespace llvm { +bool isParamLoad(const MachineInstr *); +uint64_t encode_leb128(const char *str); +} + +#endif diff --git a/lib/Target/NVPTX/TargetInfo/CMakeLists.txt b/lib/Target/NVPTX/TargetInfo/CMakeLists.txt new file mode 100644 index 0000000..0bf1334 --- /dev/null +++ b/lib/Target/NVPTX/TargetInfo/CMakeLists.txt @@ -0,0 +1,7 @@ +#include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMNVPTXInfo + NVPTXTargetInfo.cpp + ) + +add_dependencies(LLVMNVPTXInfo NVPTXCommonTableGen) diff --git a/lib/Target/PTX/TargetInfo/LLVMBuild.txt b/lib/Target/NVPTX/TargetInfo/LLVMBuild.txt index 2cc30c4..ef12b0e 100644 --- a/lib/Target/PTX/TargetInfo/LLVMBuild.txt +++ b/lib/Target/NVPTX/TargetInfo/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/PTX/TargetInfo/LLVMBuild.txt ----------------*- Conf -*--===; +;===- ./lib/Target/NVPTX/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -17,7 +17,7 @@ [component_0] type = Library -name = PTXInfo -parent = PTX +name = NVPTXInfo +parent = NVPTX required_libraries = MC Support Target -add_to_library_groups = PTX +add_to_library_groups = NVPTX diff --git a/lib/Target/PTX/TargetInfo/Makefile b/lib/Target/NVPTX/TargetInfo/Makefile index 8619785..8622315 100644 --- a/lib/Target/PTX/TargetInfo/Makefile +++ b/lib/Target/NVPTX/TargetInfo/Makefile @@ -1,4 +1,4 @@ -##===- lib/Target/PTX/TargetInfo/Makefile ------------------*- Makefile -*-===## +##===- lib/Target/NVPTX/TargetInfo/Makefile ----------------*- Makefile -*-===## # # The LLVM Compiler Infrastructure # @@ -7,7 +7,7 @@ # ##===----------------------------------------------------------------------===## LEVEL = ../../../.. -LIBRARYNAME = LLVMPTXInfo +LIBRARYNAME = LLVMNVPTXInfo # Hack: we need to include 'main' target directory to grab private headers CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. diff --git a/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp b/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp new file mode 100644 index 0000000..f3624b9 --- /dev/null +++ b/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp @@ -0,0 +1,23 @@ +//===-- NVPTXTargetInfo.cpp - NVPTX Target Implementation -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheNVPTXTarget32;
+Target llvm::TheNVPTXTarget64;
+
+extern "C" void LLVMInitializeNVPTXTargetInfo() {
+  RegisterTarget<Triple::nvptx> X(TheNVPTXTarget32, "nvptx",
+                                  "NVIDIA PTX 32-bit");
+  RegisterTarget<Triple::nvptx64> Y(TheNVPTXTarget64, "nvptx64",
+                                    "NVIDIA PTX 64-bit");
+}
diff --git a/lib/Target/NVPTX/VectorElementize.cpp b/lib/Target/NVPTX/VectorElementize.cpp
new file mode 100644
index 0000000..8043e2d
--- /dev/null
+++ b/lib/Target/NVPTX/VectorElementize.cpp
@@ -0,0 +1,1248 @@
+//===-- VectorElementize.cpp - Convert vector ops to scalar ops ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass converts operations on vector types to operations on their
+// element types.
+//
+// For generic binary and unary vector instructions, the conversion is simple.
+// Suppose we have
+// av = bv Vop cv
+// where av, bv, and cv are vector virtual registers, and Vop is a vector op.
+// This gets converted to the following:
+// a1 = b1 Sop c1
+// a2 = b2 Sop c2
+//
+// VectorToScalarMap maintains the vector vreg to scalar vreg mapping.
+// For the above example, the map will look as follows:
+// av => [a1, a2]
+// bv => [b1, b2]
+//
+// In addition, getScalarVersion (below) implements the opcode->opcode map.
+// Vop => Sop
+// OtherVop => OtherSop
+// ...
+//
+// For vector-specific instructions like vecbuild, vecshuffle, etc., the
+// conversion is different. Look at the comments near the functions with
+// prefix createVec<...>.
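+//
+// As a concrete instance of the opcode map, a v2i32 add
+// av = VAddV2I32 bv, cv
+// is scalarized (see getScalarVersion) into two ADDi32rr instructions:
+// a1 = ADDi32rr b1, c1
+// a2 = ADDi32rr b2, c2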
+// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/Passes.h" +#include "llvm/Constant.h" +#include "llvm/Instructions.h" +#include "llvm/Function.h" +#include "llvm/Pass.h" +#include "llvm/Type.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "NVPTX.h" +#include "NVPTXTargetMachine.h" + +using namespace llvm; + +namespace { + +class LLVM_LIBRARY_VISIBILITY VectorElementize : public MachineFunctionPass { + virtual bool runOnMachineFunction(MachineFunction &F); + + NVPTXTargetMachine &TM; + MachineRegisterInfo *MRI; + const NVPTXRegisterInfo *RegInfo; + const NVPTXInstrInfo *InstrInfo; + + llvm::DenseMap<const TargetRegisterClass *, const TargetRegisterClass *> + RegClassMap; + llvm::DenseMap<unsigned, bool> SimpleMoveMap; + + llvm::DenseMap<unsigned, SmallVector<unsigned, 4> > VectorToScalarMap; + + bool isVectorInstr(MachineInstr *); + + SmallVector<unsigned, 4> getScalarRegisters(unsigned); + unsigned getScalarVersion(unsigned); + unsigned getScalarVersion(MachineInstr *); + + bool isVectorRegister(unsigned); + const TargetRegisterClass *getScalarRegClass(const TargetRegisterClass *RC); + unsigned numCopiesNeeded(MachineInstr *); + + void createLoadCopy(MachineFunction&, MachineInstr *, + std::vector<MachineInstr *>&); + void createStoreCopy(MachineFunction&, MachineInstr *, + std::vector<MachineInstr *>&); + + void createVecDest(MachineFunction&, MachineInstr *, + std::vector<MachineInstr *>&); + + void createCopies(MachineFunction&, MachineInstr *, + std::vector<MachineInstr *>&); + + unsigned copyProp(MachineFunction&); + unsigned removeDeadMoves(MachineFunction&); + + void elementize(MachineFunction&); + + bool isSimpleMove(MachineInstr *); + + void createVecShuffle(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies); + + void createVecExtract(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies); + + void createVecInsert(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies); + + void createVecBuild(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies); + +public: + + static char ID; // Pass identification, replacement for typeid + VectorElementize(NVPTXTargetMachine &tm) + : MachineFunctionPass(ID), TM(tm) {} + + virtual const char *getPassName() const { + return "Convert LLVM vector types to their element types"; + } +}; + +char VectorElementize::ID = 1; +} + +static cl::opt<bool> +RemoveRedundantMoves("nvptx-remove-redundant-moves", + cl::desc("NVPTX: Remove redundant moves introduced by vector lowering"), + cl::init(true)); + +#define VECINST(x) ((((x)->getDesc().TSFlags) & NVPTX::VecInstTypeMask) \ + >> NVPTX::VecInstTypeShift) +#define ISVECINST(x) (VECINST(x) != NVPTX::VecNOP) +#define ISVECLOAD(x) (VECINST(x) == NVPTX::VecLoad) +#define ISVECSTORE(x) (VECINST(x) == NVPTX::VecStore) +#define ISVECBUILD(x) (VECINST(x) == NVPTX::VecBuild) +#define ISVECSHUFFLE(x) (VECINST(x) == NVPTX::VecShuffle) +#define ISVECEXTRACT(x) (VECINST(x) == NVPTX::VecExtract) +#define ISVECINSERT(x) (VECINST(x) == NVPTX::VecInsert) +#define ISVECDEST(x) (VECINST(x) == 
NVPTX::VecDest)
+
+bool VectorElementize::isSimpleMove(MachineInstr *mi) {
+  if (mi->isCopy())
+    return true;
+  unsigned TSFlags = (mi->getDesc().TSFlags & NVPTX::SimpleMoveMask)
+      >> NVPTX::SimpleMoveShift;
+  return (TSFlags == 1);
+}
+
+bool VectorElementize::isVectorInstr(MachineInstr *mi) {
+  if ((mi->getOpcode() == NVPTX::PHI) ||
+      (mi->getOpcode() == NVPTX::IMPLICIT_DEF) || mi->isCopy()) {
+    MachineOperand dest = mi->getOperand(0);
+    return isVectorRegister(dest.getReg());
+  }
+  return ISVECINST(mi);
+}
+
+unsigned VectorElementize::getScalarVersion(MachineInstr *mi) {
+  return getScalarVersion(mi->getOpcode());
+}
+
+///=============================================================================
+///Instr is assumed to be a vector instruction. For most vector instructions,
+///the size of the destination vector register gives the number of scalar
+///copies needed. For VecStore, the size of getOperand(0), the stored vector,
+///gives the number of scalar copies needed. For VecExtract, the dest is a
+///scalar, so the size of getOperand(1), the source vector, gives the number
+///of scalar copies needed.
+///=============================================================================
+unsigned VectorElementize::numCopiesNeeded(MachineInstr *Instr) {
+  unsigned numDefs=0;
+  unsigned def;
+  for (unsigned i=0, e=Instr->getNumOperands(); i!=e; ++i) {
+    MachineOperand oper = Instr->getOperand(i);
+
+    if (!oper.isReg()) continue;
+    if (!oper.isDef()) continue;
+    def = i;
+    numDefs++;
+  }
+  assert((numDefs <= 1) && "Only 0 or 1 defs supported");
+
+  if (numDefs == 1) {
+    unsigned regnum = Instr->getOperand(def).getReg();
+    if (ISVECEXTRACT(Instr))
+      regnum = Instr->getOperand(1).getReg();
+    return getNVPTXVectorSize(MRI->getRegClass(regnum));
+  }
+  else if (numDefs == 0) {
+    assert(ISVECSTORE(Instr)
+           && "Only 0 def instruction supported is vector store");
+
+    unsigned regnum = Instr->getOperand(0).getReg();
+    return getNVPTXVectorSize(MRI->getRegClass(regnum));
+  }
+  return 1;
+}
+
+const TargetRegisterClass *VectorElementize::
+getScalarRegClass(const TargetRegisterClass *RC) {
+  assert(isNVPTXVectorRegClass(RC) &&
+         "Not a vector register class");
+  return getNVPTXElemClass(RC);
+}
+
+bool VectorElementize::isVectorRegister(unsigned reg) {
+  const TargetRegisterClass *RC=MRI->getRegClass(reg);
+  return isNVPTXVectorRegClass(RC);
+}
+
+///=============================================================================
+///For every vector register 'v' that is not already in the VectorToScalarMap,
+///create n scalar registers of the corresponding element type, where n
+///is 2 or 4 (getNVPTXVectorSize), and add them to VectorToScalarMap.
+///=============================================================================
+SmallVector<unsigned, 4> VectorElementize::getScalarRegisters(unsigned regnum) {
+  assert(isVectorRegister(regnum) && "Expecting a vector register here");
+  // Create the scalar registers and put them in the map, if not already there.
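+  // For example, the first time a V4F32Regs vreg reaches this point, four
+  // fresh Float32Regs vregs are created and cached; any later query for the
+  // same vector vreg returns the same four scalars, so every user of the
+  // vector sees a consistent mapping.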
+ if (VectorToScalarMap.find(regnum) == VectorToScalarMap.end()) { + const TargetRegisterClass *vecClass = MRI->getRegClass(regnum); + const TargetRegisterClass *scalarClass = getScalarRegClass(vecClass); + + SmallVector<unsigned, 4> temp; + + for (unsigned i=0, e=getNVPTXVectorSize(vecClass); i!=e; ++i) + temp.push_back(MRI->createVirtualRegister(scalarClass)); + + VectorToScalarMap[regnum] = temp; + } + return VectorToScalarMap[regnum]; +} + +///============================================================================= +///For a vector load of the form +///va <= ldv2 [addr] +///the following multi output instruction is created : +///[v1, v2] <= LD [addr] +///Look at NVPTXVector.td for the definitions of multi output loads. +///============================================================================= +void VectorElementize::createLoadCopy(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies) { + copies.push_back(F.CloneMachineInstr(Instr)); + + MachineInstr *copy=copies[0]; + copy->setDesc(InstrInfo->get(getScalarVersion(copy))); + + // Remove the dest, that should be a vector operand. + MachineOperand dest = copy->getOperand(0); + unsigned regnum = dest.getReg(); + + SmallVector<unsigned, 4> scalarRegs = getScalarRegisters(regnum); + copy->RemoveOperand(0); + + std::vector<MachineOperand> otherOperands; + for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) + otherOperands.push_back(copy->getOperand(i)); + + for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) + copy->RemoveOperand(0); + + for (unsigned i=0, e=scalarRegs.size(); i!=e; ++i) { + copy->addOperand(MachineOperand::CreateReg(scalarRegs[i], true)); + } + + for (unsigned i=0, e=otherOperands.size(); i!=e; ++i) + copy->addOperand(otherOperands[i]); + +} + +///============================================================================= +///For a vector store of the form +///stv2 va, [addr] +///the following multi input instruction is created : +///ST v1, v2, [addr] +///Look at NVPTXVector.td for the definitions of multi input stores. +///============================================================================= +void VectorElementize::createStoreCopy(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies) { + copies.push_back(F.CloneMachineInstr(Instr)); + + MachineInstr *copy=copies[0]; + copy->setDesc(InstrInfo->get(getScalarVersion(copy))); + + MachineOperand src = copy->getOperand(0); + unsigned regnum = src.getReg(); + + SmallVector<unsigned, 4> scalarRegs = getScalarRegisters(regnum); + copy->RemoveOperand(0); + + std::vector<MachineOperand> otherOperands; + for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) + otherOperands.push_back(copy->getOperand(i)); + + for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) + copy->RemoveOperand(0); + + for (unsigned i=0, e=scalarRegs.size(); i!=e; ++i) + copy->addOperand(MachineOperand::CreateReg(scalarRegs[i], false)); + + for (unsigned i=0, e=otherOperands.size(); i!=e; ++i) + copy->addOperand(otherOperands[i]); +} + +///============================================================================= +///va <= shufflev2 vb, vc, <i1>, <i2> +///gets converted to 2 moves into a1 and a2. The source of the moves depend on +///i1 and i2. i1, i2 can belong to the set {0, 1, 2, 3} for shufflev2. For +///shufflev4 the set is {0,..7}. 
For example, if i1=3, i2=0, the move +///instructions will be +///a1 <= c2 +///a2 <= b1 +///============================================================================= +void VectorElementize::createVecShuffle(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies) { + unsigned numcopies=numCopiesNeeded(Instr); + + unsigned destregnum = Instr->getOperand(0).getReg(); + unsigned src1regnum = Instr->getOperand(1).getReg(); + unsigned src2regnum = Instr->getOperand(2).getReg(); + + SmallVector<unsigned, 4> dest = getScalarRegisters(destregnum); + SmallVector<unsigned, 4> src1 = getScalarRegisters(src1regnum); + SmallVector<unsigned, 4> src2 = getScalarRegisters(src2regnum); + + DebugLoc DL = Instr->getDebugLoc(); + + for (unsigned i=0; i<numcopies; i++) { + MachineInstr *copy = BuildMI(F, DL, + InstrInfo->get(getScalarVersion(Instr)), dest[i]); + MachineOperand which=Instr->getOperand(3+i); + assert(which.isImm() && "Shuffle operand not a constant"); + + int src=which.getImm(); + int elem=src%numcopies; + + if (which.getImm() < numcopies) + copy->addOperand(MachineOperand::CreateReg(src1[elem], false)); + else + copy->addOperand(MachineOperand::CreateReg(src2[elem], false)); + copies.push_back(copy); + } +} + +///============================================================================= +///a <= extractv2 va, <i1> +///gets turned into a simple move to the scalar register a. The source depends +///on i1. +///============================================================================= +void VectorElementize::createVecExtract(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies) { + unsigned srcregnum = Instr->getOperand(1).getReg(); + + SmallVector<unsigned, 4> src = getScalarRegisters(srcregnum); + + MachineOperand which = Instr->getOperand(2); + assert(which.isImm() && "Extract operand not a constant"); + + DebugLoc DL = Instr->getDebugLoc(); + + MachineInstr *copy = BuildMI(F, DL, InstrInfo->get(getScalarVersion(Instr)), + Instr->getOperand(0).getReg()); + copy->addOperand(MachineOperand::CreateReg(src[which.getImm()], false)); + + copies.push_back(copy); +} + +///============================================================================= +///va <= vecinsertv2 vb, c, <i1> +///This instruction copies all elements of vb to va, except the 'i1'th element. +///The scalar value c becomes the 'i1'th element of va. +///This gets translated to 2 (4 for vecinsertv4) moves. 
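+///For example,
+///va <= vecinsertv2 vb, c, <1>
+///becomes
+///a1 <= b1
+///a2 <= c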
+///============================================================================= +void VectorElementize::createVecInsert(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies) { + unsigned numcopies=numCopiesNeeded(Instr); + + unsigned destregnum = Instr->getOperand(0).getReg(); + unsigned srcregnum = Instr->getOperand(1).getReg(); + + SmallVector<unsigned, 4> dest = getScalarRegisters(destregnum); + SmallVector<unsigned, 4> src = getScalarRegisters(srcregnum); + + MachineOperand which=Instr->getOperand(3); + assert(which.isImm() && "Insert operand not a constant"); + unsigned int elem=which.getImm(); + + DebugLoc DL = Instr->getDebugLoc(); + + for (unsigned i=0; i<numcopies; i++) { + MachineInstr *copy = BuildMI(F, DL, + InstrInfo->get(getScalarVersion(Instr)), dest[i]); + + if (i != elem) + copy->addOperand(MachineOperand::CreateReg(src[i], false)); + else + copy->addOperand(Instr->getOperand(2)); + + copies.push_back(copy); + } + +} + +///============================================================================= +///va <= buildv2 b1, b2 +///gets translated to +///a1 <= b1 +///a2 <= b2 +///============================================================================= +void VectorElementize::createVecBuild(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies) { + unsigned numcopies=numCopiesNeeded(Instr); + + unsigned destregnum = Instr->getOperand(0).getReg(); + + SmallVector<unsigned, 4> dest = getScalarRegisters(destregnum); + + DebugLoc DL = Instr->getDebugLoc(); + + for (unsigned i=0; i<numcopies; i++) { + MachineInstr *copy = BuildMI(F, DL, + InstrInfo->get(getScalarVersion(Instr)), dest[i]); + + copy->addOperand(Instr->getOperand(1+i)); + + copies.push_back(copy); + } + +} + +///============================================================================= +///For a tex inst of the form +///va <= op [scalar operands] +///the following multi output instruction is created : +///[v1, v2] <= op' [scalar operands] +///============================================================================= +void VectorElementize::createVecDest(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies) { + copies.push_back(F.CloneMachineInstr(Instr)); + + MachineInstr *copy=copies[0]; + copy->setDesc(InstrInfo->get(getScalarVersion(copy))); + + // Remove the dest, that should be a vector operand. + MachineOperand dest = copy->getOperand(0); + unsigned regnum = dest.getReg(); + + SmallVector<unsigned, 4> scalarRegs = getScalarRegisters(regnum); + copy->RemoveOperand(0); + + std::vector<MachineOperand> otherOperands; + for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) + otherOperands.push_back(copy->getOperand(i)); + + for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) + copy->RemoveOperand(0); + + for (unsigned i=0, e=scalarRegs.size(); i!=e; ++i) + copy->addOperand(MachineOperand::CreateReg(scalarRegs[i], true)); + + for (unsigned i=0, e=otherOperands.size(); i!=e; ++i) + copy->addOperand(otherOperands[i]); +} + +///============================================================================= +///Look at the vector instruction type and dispatch to the createVec<...> +///function that creates the scalar copies. 
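+///All other vector instructions take the generic path at the end: the
+///instruction is cloned numCopiesNeeded times, each clone is switched to the
+///scalar opcode, and every vector register operand is replaced by the
+///matching scalar register from getScalarRegisters.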
+///============================================================================= +void VectorElementize::createCopies(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies) { + if (ISVECLOAD(Instr)) { + createLoadCopy(F, Instr, copies); + return; + } + if (ISVECSTORE(Instr)) { + createStoreCopy(F, Instr, copies); + return; + } + if (ISVECSHUFFLE(Instr)) { + createVecShuffle(F, Instr, copies); + return; + } + if (ISVECEXTRACT(Instr)) { + createVecExtract(F, Instr, copies); + return; + } + if (ISVECINSERT(Instr)) { + createVecInsert(F, Instr, copies); + return; + } + if (ISVECDEST(Instr)) { + createVecDest(F, Instr, copies); + return; + } + if (ISVECBUILD(Instr)) { + createVecBuild(F, Instr, copies); + return; + } + + unsigned numcopies=numCopiesNeeded(Instr); + + for (unsigned i=0; i<numcopies; ++i) + copies.push_back(F.CloneMachineInstr(Instr)); + + for (unsigned i=0; i<numcopies; ++i) { + MachineInstr *copy = copies[i]; + + std::vector<MachineOperand> allOperands; + std::vector<bool> isDef; + + for (unsigned j=0, e=copy->getNumOperands(); j!=e; ++j) { + MachineOperand oper = copy->getOperand(j); + allOperands.push_back(oper); + if (oper.isReg()) + isDef.push_back(oper.isDef()); + else + isDef.push_back(false); + } + + for (unsigned j=0, e=copy->getNumOperands(); j!=e; ++j) + copy->RemoveOperand(0); + + copy->setDesc(InstrInfo->get(getScalarVersion(Instr))); + + for (unsigned j=0, e=allOperands.size(); j!=e; ++j) { + MachineOperand oper=allOperands[j]; + if (oper.isReg()) { + unsigned regnum = oper.getReg(); + if (isVectorRegister(regnum)) { + + SmallVector<unsigned, 4> scalarRegs = getScalarRegisters(regnum); + copy->addOperand(MachineOperand::CreateReg(scalarRegs[i], isDef[j])); + } + else + copy->addOperand(oper); + } + else + copy->addOperand(oper); + } + } +} + +///============================================================================= +///Scan through all basic blocks, looking for vector instructions. +///For each vector instruction I, insert the scalar copies before I, and +///add I into toRemove vector. Finally remove all instructions in toRemove. +///============================================================================= +void VectorElementize::elementize(MachineFunction &F) { + for (MachineFunction::reverse_iterator BI=F.rbegin(), BE=F.rend(); + BI!=BE; ++BI) { + MachineBasicBlock *BB = &*BI; + + std::vector<MachineInstr *> copies; + std::vector<MachineInstr *> toRemove; + + for (MachineBasicBlock::iterator II=BB->begin(), IE=BB->end(); + II!=IE; ++II) { + MachineInstr *Instr = &*II; + + if (!isVectorInstr(Instr)) + continue; + + copies.clear(); + createCopies(F, Instr, copies); + for (unsigned i=0, e=copies.size(); i!=e; ++i) + BB->insert(II, copies[i]); + + assert((copies.size() > 0) && "Problem in createCopies"); + toRemove.push_back(Instr); + } + for (unsigned i=0, e=toRemove.size(); i!=e; ++i) + F.DeleteMachineInstr(toRemove[i]->getParent()->remove(toRemove[i])); + } +} + +///============================================================================= +///a <= b +///... +///... +///x <= op(a, ...) +///gets converted to +/// +///x <= op(b, ...) +///The original move is still present. This works on SSA form machine code. +///Note that a <= b should be a simple vreg-to-vreg move instruction. +///TBD : I didn't find a function that can do replaceOperand, so I remove +///all operands and add all of them again, replacing the one while adding. 
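+///Once all uses have been rewritten, the original move becomes dead and is
+///cleaned up by removeDeadMoves, so copyProp plus removeDeadMoves together
+///act as copy propagation followed by dead-move elimination.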
+///=============================================================================
+unsigned VectorElementize::copyProp(MachineFunction &F) {
+  unsigned numReplacements = 0;
+
+  for (MachineFunction::reverse_iterator BI=F.rbegin(), BE=F.rend(); BI!=BE;
+      ++BI) {
+    MachineBasicBlock *BB = &*BI;
+
+    for (MachineBasicBlock::iterator II=BB->begin(), IE=BB->end(); II!=IE;
+        ++II) {
+      MachineInstr *Instr = &*II;
+
+      // Don't do copy propagation on PHI as it will cause unnecessary
+      // live range overlap.
+      if ((Instr->getOpcode() == TargetOpcode::PHI) ||
+          (Instr->getOpcode() == TargetOpcode::DBG_VALUE))
+        continue;
+
+      bool needsReplacement = false;
+
+      for (unsigned i=0, e=Instr->getNumOperands(); i!=e; ++i) {
+        MachineOperand oper = Instr->getOperand(i);
+        if (!oper.isReg()) continue;
+        if (oper.isDef()) continue;
+        if (!RegInfo->isVirtualRegister(oper.getReg())) continue;
+
+        MachineInstr *defInstr = MRI->getVRegDef(oper.getReg());
+
+        if (!defInstr) continue;
+
+        if (!isSimpleMove(defInstr)) continue;
+
+        MachineOperand defSrc = defInstr->getOperand(1);
+        if (!defSrc.isReg()) continue;
+        if (!RegInfo->isVirtualRegister(defSrc.getReg())) continue;
+
+        needsReplacement = true;
+
+      }
+      if (!needsReplacement) continue;
+
+      numReplacements++;
+
+      std::vector<MachineOperand> operands;
+
+      for (unsigned i=0, e=Instr->getNumOperands(); i!=e; ++i) {
+        MachineOperand oper = Instr->getOperand(i);
+        bool flag = false;
+        do {
+          if (!(oper.isReg()))
+            break;
+          if (oper.isDef())
+            break;
+          if (!(RegInfo->isVirtualRegister(oper.getReg())))
+            break;
+          MachineInstr *defInstr = MRI->getVRegDef(oper.getReg());
+          // Guard against uses without a def, as in the scan loop above.
+          if (!defInstr)
+            break;
+          if (!(isSimpleMove(defInstr)))
+            break;
+          MachineOperand defSrc = defInstr->getOperand(1);
+          if (!(defSrc.isReg()))
+            break;
+          if (!(RegInfo->isVirtualRegister(defSrc.getReg())))
+            break;
+          operands.push_back(defSrc);
+          flag = true;
+        } while (0);
+        if (flag == false)
+          operands.push_back(oper);
+      }
+
+      for (unsigned i=0, e=Instr->getNumOperands(); i!=e; ++i)
+        Instr->RemoveOperand(0);
+      for (unsigned i=0, e=operands.size(); i!=e; ++i)
+        Instr->addOperand(operands[i]);
+
+    }
+  }
+  return numReplacements;
+}
+
+///=============================================================================
+///Look for simple vreg-to-vreg moves whose destination register has no uses
+///(use_empty()), add them to the deadMoves vector, then remove all
+///instructions in deadMoves.
+///=============================================================================
+unsigned VectorElementize::removeDeadMoves(MachineFunction &F) {
+  std::vector<MachineInstr *> deadMoves;
+  for (MachineFunction::reverse_iterator BI=F.rbegin(), BE=F.rend(); BI!=BE;
+      ++BI) {
+    MachineBasicBlock *BB = &*BI;
+
+    for (MachineBasicBlock::iterator II=BB->begin(), IE=BB->end(); II!=IE;
+        ++II) {
+      MachineInstr *Instr = &*II;
+
+      if (!isSimpleMove(Instr)) continue;
+
+      MachineOperand dest = Instr->getOperand(0);
+      assert(dest.isReg() && "dest of move not a register");
+      assert(RegInfo->isVirtualRegister(dest.getReg()) &&
+             "dest of move not a virtual register");
+
+      if (MRI->use_empty(dest.getReg())) {
+        deadMoves.push_back(Instr);
+      }
+    }
+  }
+
+  for (unsigned i=0, e=deadMoves.size(); i!=e; ++i)
+    F.DeleteMachineInstr(deadMoves[i]->getParent()->remove(deadMoves[i]));
+
+  return deadMoves.size();
+}
+
+///=============================================================================
+///Main function for this pass.
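+///elementize() scalarizes all vector instructions. Then, if the default-on
+///-nvptx-remove-redundant-moves option is enabled, copyProp() and
+///removeDeadMoves() alternate until copyProp() finds nothing to rewrite.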
+///============================================================================= +bool VectorElementize::runOnMachineFunction(MachineFunction &F) { + MRI = &F.getRegInfo(); + + RegInfo = TM.getRegisterInfo(); + InstrInfo = TM.getInstrInfo(); + + VectorToScalarMap.clear(); + + elementize(F); + + if (RemoveRedundantMoves) + while (1) { + if (copyProp(F) == 0) break; + removeDeadMoves(F); + } + + return true; +} + +FunctionPass *llvm::createVectorElementizePass(NVPTXTargetMachine &tm) { + return new VectorElementize(tm); +} + +unsigned VectorElementize::getScalarVersion(unsigned opcode) { + if (opcode == NVPTX::PHI) + return opcode; + if (opcode == NVPTX::IMPLICIT_DEF) + return opcode; + switch(opcode) { + default: llvm_unreachable("Scalar version not set, fix NVPTXVector.td"); + case TargetOpcode::COPY: return TargetOpcode::COPY; + case NVPTX::AddCCCV2I32: return NVPTX::ADDCCCi32rr; + case NVPTX::AddCCCV4I32: return NVPTX::ADDCCCi32rr; + case NVPTX::AddCCV2I32: return NVPTX::ADDCCi32rr; + case NVPTX::AddCCV4I32: return NVPTX::ADDCCi32rr; + case NVPTX::Build_Vector2_f32: return NVPTX::FMOV32rr; + case NVPTX::Build_Vector2_f64: return NVPTX::FMOV64rr; + case NVPTX::Build_Vector2_i16: return NVPTX::IMOV16rr; + case NVPTX::Build_Vector2_i32: return NVPTX::IMOV32rr; + case NVPTX::Build_Vector2_i64: return NVPTX::IMOV64rr; + case NVPTX::Build_Vector2_i8: return NVPTX::IMOV8rr; + case NVPTX::Build_Vector4_f32: return NVPTX::FMOV32rr; + case NVPTX::Build_Vector4_i16: return NVPTX::IMOV16rr; + case NVPTX::Build_Vector4_i32: return NVPTX::IMOV32rr; + case NVPTX::Build_Vector4_i8: return NVPTX::IMOV8rr; + case NVPTX::CVTv2i16tov2i32: return NVPTX::Zint_extendext16to32; + case NVPTX::CVTv2i64tov2i32: return NVPTX::TRUNC_64to32; + case NVPTX::CVTv2i8tov2i32: return NVPTX::Zint_extendext8to32; + case NVPTX::CVTv4i16tov4i32: return NVPTX::Zint_extendext16to32; + case NVPTX::CVTv4i8tov4i32: return NVPTX::Zint_extendext8to32; + case NVPTX::F32MAD_ftzV2: return NVPTX::FMAD32_ftzrrr; + case NVPTX::F32MADV2: return NVPTX::FMAD32rrr; + case NVPTX::F32MAD_ftzV4: return NVPTX::FMAD32_ftzrrr; + case NVPTX::F32MADV4: return NVPTX::FMAD32rrr; + case NVPTX::F32FMA_ftzV2: return NVPTX::FMA32_ftzrrr; + case NVPTX::F32FMAV2: return NVPTX::FMA32rrr; + case NVPTX::F32FMA_ftzV4: return NVPTX::FMA32_ftzrrr; + case NVPTX::F32FMAV4: return NVPTX::FMA32rrr; + case NVPTX::F64FMAV2: return NVPTX::FMA64rrr; + case NVPTX::FVecEQV2F32: return NVPTX::FSetEQf32rr_toi32; + case NVPTX::FVecEQV2F64: return NVPTX::FSetEQf64rr_toi64; + case NVPTX::FVecEQV4F32: return NVPTX::FSetEQf32rr_toi32; + case NVPTX::FVecGEV2F32: return NVPTX::FSetGEf32rr_toi32; + case NVPTX::FVecGEV2F64: return NVPTX::FSetGEf64rr_toi64; + case NVPTX::FVecGEV4F32: return NVPTX::FSetGEf32rr_toi32; + case NVPTX::FVecGTV2F32: return NVPTX::FSetGTf32rr_toi32; + case NVPTX::FVecGTV2F64: return NVPTX::FSetGTf64rr_toi64; + case NVPTX::FVecGTV4F32: return NVPTX::FSetGTf32rr_toi32; + case NVPTX::FVecLEV2F32: return NVPTX::FSetLEf32rr_toi32; + case NVPTX::FVecLEV2F64: return NVPTX::FSetLEf64rr_toi64; + case NVPTX::FVecLEV4F32: return NVPTX::FSetLEf32rr_toi32; + case NVPTX::FVecLTV2F32: return NVPTX::FSetLTf32rr_toi32; + case NVPTX::FVecLTV2F64: return NVPTX::FSetLTf64rr_toi64; + case NVPTX::FVecLTV4F32: return NVPTX::FSetLTf32rr_toi32; + case NVPTX::FVecNANV2F32: return NVPTX::FSetNANf32rr_toi32; + case NVPTX::FVecNANV2F64: return NVPTX::FSetNANf64rr_toi64; + case NVPTX::FVecNANV4F32: return NVPTX::FSetNANf32rr_toi32; + case NVPTX::FVecNEV2F32: return 
NVPTX::FSetNEf32rr_toi32; + case NVPTX::FVecNEV2F64: return NVPTX::FSetNEf64rr_toi64; + case NVPTX::FVecNEV4F32: return NVPTX::FSetNEf32rr_toi32; + case NVPTX::FVecNUMV2F32: return NVPTX::FSetNUMf32rr_toi32; + case NVPTX::FVecNUMV2F64: return NVPTX::FSetNUMf64rr_toi64; + case NVPTX::FVecNUMV4F32: return NVPTX::FSetNUMf32rr_toi32; + case NVPTX::FVecUEQV2F32: return NVPTX::FSetUEQf32rr_toi32; + case NVPTX::FVecUEQV2F64: return NVPTX::FSetUEQf64rr_toi64; + case NVPTX::FVecUEQV4F32: return NVPTX::FSetUEQf32rr_toi32; + case NVPTX::FVecUGEV2F32: return NVPTX::FSetUGEf32rr_toi32; + case NVPTX::FVecUGEV2F64: return NVPTX::FSetUGEf64rr_toi64; + case NVPTX::FVecUGEV4F32: return NVPTX::FSetUGEf32rr_toi32; + case NVPTX::FVecUGTV2F32: return NVPTX::FSetUGTf32rr_toi32; + case NVPTX::FVecUGTV2F64: return NVPTX::FSetUGTf64rr_toi64; + case NVPTX::FVecUGTV4F32: return NVPTX::FSetUGTf32rr_toi32; + case NVPTX::FVecULEV2F32: return NVPTX::FSetULEf32rr_toi32; + case NVPTX::FVecULEV2F64: return NVPTX::FSetULEf64rr_toi64; + case NVPTX::FVecULEV4F32: return NVPTX::FSetULEf32rr_toi32; + case NVPTX::FVecULTV2F32: return NVPTX::FSetULTf32rr_toi32; + case NVPTX::FVecULTV2F64: return NVPTX::FSetULTf64rr_toi64; + case NVPTX::FVecULTV4F32: return NVPTX::FSetULTf32rr_toi32; + case NVPTX::FVecUNEV2F32: return NVPTX::FSetUNEf32rr_toi32; + case NVPTX::FVecUNEV2F64: return NVPTX::FSetUNEf64rr_toi64; + case NVPTX::FVecUNEV4F32: return NVPTX::FSetUNEf32rr_toi32; + case NVPTX::I16MADV2: return NVPTX::MAD16rrr; + case NVPTX::I16MADV4: return NVPTX::MAD16rrr; + case NVPTX::I32MADV2: return NVPTX::MAD32rrr; + case NVPTX::I32MADV4: return NVPTX::MAD32rrr; + case NVPTX::I64MADV2: return NVPTX::MAD64rrr; + case NVPTX::I8MADV2: return NVPTX::MAD8rrr; + case NVPTX::I8MADV4: return NVPTX::MAD8rrr; + case NVPTX::ShiftLV2I16: return NVPTX::SHLi16rr; + case NVPTX::ShiftLV2I32: return NVPTX::SHLi32rr; + case NVPTX::ShiftLV2I64: return NVPTX::SHLi64rr; + case NVPTX::ShiftLV2I8: return NVPTX::SHLi8rr; + case NVPTX::ShiftLV4I16: return NVPTX::SHLi16rr; + case NVPTX::ShiftLV4I32: return NVPTX::SHLi32rr; + case NVPTX::ShiftLV4I8: return NVPTX::SHLi8rr; + case NVPTX::ShiftRAV2I16: return NVPTX::SRAi16rr; + case NVPTX::ShiftRAV2I32: return NVPTX::SRAi32rr; + case NVPTX::ShiftRAV2I64: return NVPTX::SRAi64rr; + case NVPTX::ShiftRAV2I8: return NVPTX::SRAi8rr; + case NVPTX::ShiftRAV4I16: return NVPTX::SRAi16rr; + case NVPTX::ShiftRAV4I32: return NVPTX::SRAi32rr; + case NVPTX::ShiftRAV4I8: return NVPTX::SRAi8rr; + case NVPTX::ShiftRLV2I16: return NVPTX::SRLi16rr; + case NVPTX::ShiftRLV2I32: return NVPTX::SRLi32rr; + case NVPTX::ShiftRLV2I64: return NVPTX::SRLi64rr; + case NVPTX::ShiftRLV2I8: return NVPTX::SRLi8rr; + case NVPTX::ShiftRLV4I16: return NVPTX::SRLi16rr; + case NVPTX::ShiftRLV4I32: return NVPTX::SRLi32rr; + case NVPTX::ShiftRLV4I8: return NVPTX::SRLi8rr; + case NVPTX::SubCCCV2I32: return NVPTX::SUBCCCi32rr; + case NVPTX::SubCCCV4I32: return NVPTX::SUBCCCi32rr; + case NVPTX::SubCCV2I32: return NVPTX::SUBCCi32rr; + case NVPTX::SubCCV4I32: return NVPTX::SUBCCi32rr; + case NVPTX::V2F32Div_prec_ftz: return NVPTX::FDIV32rr_prec_ftz; + case NVPTX::V2F32Div_prec: return NVPTX::FDIV32rr_prec; + case NVPTX::V2F32Div_ftz: return NVPTX::FDIV32rr_ftz; + case NVPTX::V2F32Div: return NVPTX::FDIV32rr; + case NVPTX::V2F32_Select: return NVPTX::SELECTf32rr; + case NVPTX::V2F64Div: return NVPTX::FDIV64rr; + case NVPTX::V2F64_Select: return NVPTX::SELECTf64rr; + case NVPTX::V2I16_Select: return NVPTX::SELECTi16rr; + case NVPTX::V2I32_Select: return 
NVPTX::SELECTi32rr; + case NVPTX::V2I64_Select: return NVPTX::SELECTi64rr; + case NVPTX::V2I8_Select: return NVPTX::SELECTi8rr; + case NVPTX::V2f32Extract: return NVPTX::FMOV32rr; + case NVPTX::V2f32Insert: return NVPTX::FMOV32rr; + case NVPTX::V2f32Mov: return NVPTX::FMOV32rr; + case NVPTX::V2f64Extract: return NVPTX::FMOV64rr; + case NVPTX::V2f64Insert: return NVPTX::FMOV64rr; + case NVPTX::V2f64Mov: return NVPTX::FMOV64rr; + case NVPTX::V2i16Extract: return NVPTX::IMOV16rr; + case NVPTX::V2i16Insert: return NVPTX::IMOV16rr; + case NVPTX::V2i16Mov: return NVPTX::IMOV16rr; + case NVPTX::V2i32Extract: return NVPTX::IMOV32rr; + case NVPTX::V2i32Insert: return NVPTX::IMOV32rr; + case NVPTX::V2i32Mov: return NVPTX::IMOV32rr; + case NVPTX::V2i64Extract: return NVPTX::IMOV64rr; + case NVPTX::V2i64Insert: return NVPTX::IMOV64rr; + case NVPTX::V2i64Mov: return NVPTX::IMOV64rr; + case NVPTX::V2i8Extract: return NVPTX::IMOV8rr; + case NVPTX::V2i8Insert: return NVPTX::IMOV8rr; + case NVPTX::V2i8Mov: return NVPTX::IMOV8rr; + case NVPTX::V4F32Div_prec_ftz: return NVPTX::FDIV32rr_prec_ftz; + case NVPTX::V4F32Div_prec: return NVPTX::FDIV32rr_prec; + case NVPTX::V4F32Div_ftz: return NVPTX::FDIV32rr_ftz; + case NVPTX::V4F32Div: return NVPTX::FDIV32rr; + case NVPTX::V4F32_Select: return NVPTX::SELECTf32rr; + case NVPTX::V4I16_Select: return NVPTX::SELECTi16rr; + case NVPTX::V4I32_Select: return NVPTX::SELECTi32rr; + case NVPTX::V4I8_Select: return NVPTX::SELECTi8rr; + case NVPTX::V4f32Extract: return NVPTX::FMOV32rr; + case NVPTX::V4f32Insert: return NVPTX::FMOV32rr; + case NVPTX::V4f32Mov: return NVPTX::FMOV32rr; + case NVPTX::V4i16Extract: return NVPTX::IMOV16rr; + case NVPTX::V4i16Insert: return NVPTX::IMOV16rr; + case NVPTX::V4i16Mov: return NVPTX::IMOV16rr; + case NVPTX::V4i32Extract: return NVPTX::IMOV32rr; + case NVPTX::V4i32Insert: return NVPTX::IMOV32rr; + case NVPTX::V4i32Mov: return NVPTX::IMOV32rr; + case NVPTX::V4i8Extract: return NVPTX::IMOV8rr; + case NVPTX::V4i8Insert: return NVPTX::IMOV8rr; + case NVPTX::V4i8Mov: return NVPTX::IMOV8rr; + case NVPTX::VAddV2I16: return NVPTX::ADDi16rr; + case NVPTX::VAddV2I32: return NVPTX::ADDi32rr; + case NVPTX::VAddV2I64: return NVPTX::ADDi64rr; + case NVPTX::VAddV2I8: return NVPTX::ADDi8rr; + case NVPTX::VAddV4I16: return NVPTX::ADDi16rr; + case NVPTX::VAddV4I32: return NVPTX::ADDi32rr; + case NVPTX::VAddV4I8: return NVPTX::ADDi8rr; + case NVPTX::VAddfV2F32: return NVPTX::FADDf32rr; + case NVPTX::VAddfV2F32_ftz: return NVPTX::FADDf32rr_ftz; + case NVPTX::VAddfV2F64: return NVPTX::FADDf64rr; + case NVPTX::VAddfV4F32: return NVPTX::FADDf32rr; + case NVPTX::VAddfV4F32_ftz: return NVPTX::FADDf32rr_ftz; + case NVPTX::VAndV2I16: return NVPTX::ANDb16rr; + case NVPTX::VAndV2I32: return NVPTX::ANDb32rr; + case NVPTX::VAndV2I64: return NVPTX::ANDb64rr; + case NVPTX::VAndV2I8: return NVPTX::ANDb8rr; + case NVPTX::VAndV4I16: return NVPTX::ANDb16rr; + case NVPTX::VAndV4I32: return NVPTX::ANDb32rr; + case NVPTX::VAndV4I8: return NVPTX::ANDb8rr; + case NVPTX::VMulfV2F32_ftz: return NVPTX::FMULf32rr_ftz; + case NVPTX::VMulfV2F32: return NVPTX::FMULf32rr; + case NVPTX::VMulfV2F64: return NVPTX::FMULf64rr; + case NVPTX::VMulfV4F32_ftz: return NVPTX::FMULf32rr_ftz; + case NVPTX::VMulfV4F32: return NVPTX::FMULf32rr; + case NVPTX::VMultHSV2I16: return NVPTX::MULTHSi16rr; + case NVPTX::VMultHSV2I32: return NVPTX::MULTHSi32rr; + case NVPTX::VMultHSV2I64: return NVPTX::MULTHSi64rr; + case NVPTX::VMultHSV2I8: return NVPTX::MULTHSi8rr; + case NVPTX::VMultHSV4I16: return 
NVPTX::MULTHSi16rr; + case NVPTX::VMultHSV4I32: return NVPTX::MULTHSi32rr; + case NVPTX::VMultHSV4I8: return NVPTX::MULTHSi8rr; + case NVPTX::VMultHUV2I16: return NVPTX::MULTHUi16rr; + case NVPTX::VMultHUV2I32: return NVPTX::MULTHUi32rr; + case NVPTX::VMultHUV2I64: return NVPTX::MULTHUi64rr; + case NVPTX::VMultHUV2I8: return NVPTX::MULTHUi8rr; + case NVPTX::VMultHUV4I16: return NVPTX::MULTHUi16rr; + case NVPTX::VMultHUV4I32: return NVPTX::MULTHUi32rr; + case NVPTX::VMultHUV4I8: return NVPTX::MULTHUi8rr; + case NVPTX::VMultV2I16: return NVPTX::MULTi16rr; + case NVPTX::VMultV2I32: return NVPTX::MULTi32rr; + case NVPTX::VMultV2I64: return NVPTX::MULTi64rr; + case NVPTX::VMultV2I8: return NVPTX::MULTi8rr; + case NVPTX::VMultV4I16: return NVPTX::MULTi16rr; + case NVPTX::VMultV4I32: return NVPTX::MULTi32rr; + case NVPTX::VMultV4I8: return NVPTX::MULTi8rr; + case NVPTX::VNegV2I16: return NVPTX::INEG16; + case NVPTX::VNegV2I32: return NVPTX::INEG32; + case NVPTX::VNegV2I64: return NVPTX::INEG64; + case NVPTX::VNegV2I8: return NVPTX::INEG8; + case NVPTX::VNegV4I16: return NVPTX::INEG16; + case NVPTX::VNegV4I32: return NVPTX::INEG32; + case NVPTX::VNegV4I8: return NVPTX::INEG8; + case NVPTX::VNegv2f32: return NVPTX::FNEGf32; + case NVPTX::VNegv2f32_ftz: return NVPTX::FNEGf32_ftz; + case NVPTX::VNegv2f64: return NVPTX::FNEGf64; + case NVPTX::VNegv4f32: return NVPTX::FNEGf32; + case NVPTX::VNegv4f32_ftz: return NVPTX::FNEGf32_ftz; + case NVPTX::VNotV2I16: return NVPTX::NOT16; + case NVPTX::VNotV2I32: return NVPTX::NOT32; + case NVPTX::VNotV2I64: return NVPTX::NOT64; + case NVPTX::VNotV2I8: return NVPTX::NOT8; + case NVPTX::VNotV4I16: return NVPTX::NOT16; + case NVPTX::VNotV4I32: return NVPTX::NOT32; + case NVPTX::VNotV4I8: return NVPTX::NOT8; + case NVPTX::VOrV2I16: return NVPTX::ORb16rr; + case NVPTX::VOrV2I32: return NVPTX::ORb32rr; + case NVPTX::VOrV2I64: return NVPTX::ORb64rr; + case NVPTX::VOrV2I8: return NVPTX::ORb8rr; + case NVPTX::VOrV4I16: return NVPTX::ORb16rr; + case NVPTX::VOrV4I32: return NVPTX::ORb32rr; + case NVPTX::VOrV4I8: return NVPTX::ORb8rr; + case NVPTX::VSDivV2I16: return NVPTX::SDIVi16rr; + case NVPTX::VSDivV2I32: return NVPTX::SDIVi32rr; + case NVPTX::VSDivV2I64: return NVPTX::SDIVi64rr; + case NVPTX::VSDivV2I8: return NVPTX::SDIVi8rr; + case NVPTX::VSDivV4I16: return NVPTX::SDIVi16rr; + case NVPTX::VSDivV4I32: return NVPTX::SDIVi32rr; + case NVPTX::VSDivV4I8: return NVPTX::SDIVi8rr; + case NVPTX::VSRemV2I16: return NVPTX::SREMi16rr; + case NVPTX::VSRemV2I32: return NVPTX::SREMi32rr; + case NVPTX::VSRemV2I64: return NVPTX::SREMi64rr; + case NVPTX::VSRemV2I8: return NVPTX::SREMi8rr; + case NVPTX::VSRemV4I16: return NVPTX::SREMi16rr; + case NVPTX::VSRemV4I32: return NVPTX::SREMi32rr; + case NVPTX::VSRemV4I8: return NVPTX::SREMi8rr; + case NVPTX::VSubV2I16: return NVPTX::SUBi16rr; + case NVPTX::VSubV2I32: return NVPTX::SUBi32rr; + case NVPTX::VSubV2I64: return NVPTX::SUBi64rr; + case NVPTX::VSubV2I8: return NVPTX::SUBi8rr; + case NVPTX::VSubV4I16: return NVPTX::SUBi16rr; + case NVPTX::VSubV4I32: return NVPTX::SUBi32rr; + case NVPTX::VSubV4I8: return NVPTX::SUBi8rr; + case NVPTX::VSubfV2F32_ftz: return NVPTX::FSUBf32rr_ftz; + case NVPTX::VSubfV2F32: return NVPTX::FSUBf32rr; + case NVPTX::VSubfV2F64: return NVPTX::FSUBf64rr; + case NVPTX::VSubfV4F32_ftz: return NVPTX::FSUBf32rr_ftz; + case NVPTX::VSubfV4F32: return NVPTX::FSUBf32rr; + case NVPTX::VUDivV2I16: return NVPTX::UDIVi16rr; + case NVPTX::VUDivV2I32: return NVPTX::UDIVi32rr; + case NVPTX::VUDivV2I64: return NVPTX::UDIVi64rr; 
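The switch above, which continues below, is a pure opcode-to-opcode table: every vector pseudo-instruction names the scalar opcode used for each extracted element, so the v2 and v4 variants of one operation collapse onto the same scalar instruction. Below is a minimal, self-contained sketch of the same table shape; the enum values are hypothetical stand-ins, since the real NVPTX opcode numbers are assigned by TableGen. The mapping table itself resumes immediately after the sketch.

#include <cassert>

// Hypothetical opcode numbers; the real NVPTX opcodes are TableGen-generated
// and are not stable constants.
enum Opcode { VAddV2I32, VAddV4I32, ADDi32rr, Unknown };

// Same structure as getScalarVersion(): both vector widths of an operation
// map to one scalar opcode, which the pass then emits once per element.
static unsigned scalarVersion(unsigned Op) {
  switch (Op) {
  case VAddV2I32:
  case VAddV4I32: return ADDi32rr;
  default:        return Unknown;
  }
}

int main() {
  // v2 and v4 integer adds both scalarize to the same 32-bit add.
  assert(scalarVersion(VAddV2I32) == ADDi32rr);
  assert(scalarVersion(VAddV4I32) == ADDi32rr);
  return 0;
}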
+ case NVPTX::VUDivV2I8: return NVPTX::UDIVi8rr; + case NVPTX::VUDivV4I16: return NVPTX::UDIVi16rr; + case NVPTX::VUDivV4I32: return NVPTX::UDIVi32rr; + case NVPTX::VUDivV4I8: return NVPTX::UDIVi8rr; + case NVPTX::VURemV2I16: return NVPTX::UREMi16rr; + case NVPTX::VURemV2I32: return NVPTX::UREMi32rr; + case NVPTX::VURemV2I64: return NVPTX::UREMi64rr; + case NVPTX::VURemV2I8: return NVPTX::UREMi8rr; + case NVPTX::VURemV4I16: return NVPTX::UREMi16rr; + case NVPTX::VURemV4I32: return NVPTX::UREMi32rr; + case NVPTX::VURemV4I8: return NVPTX::UREMi8rr; + case NVPTX::VXorV2I16: return NVPTX::XORb16rr; + case NVPTX::VXorV2I32: return NVPTX::XORb32rr; + case NVPTX::VXorV2I64: return NVPTX::XORb64rr; + case NVPTX::VXorV2I8: return NVPTX::XORb8rr; + case NVPTX::VXorV4I16: return NVPTX::XORb16rr; + case NVPTX::VXorV4I32: return NVPTX::XORb32rr; + case NVPTX::VXorV4I8: return NVPTX::XORb8rr; + case NVPTX::VecSEQV2I16: return NVPTX::ISetSEQi16rr_toi16; + case NVPTX::VecSEQV2I32: return NVPTX::ISetSEQi32rr_toi32; + case NVPTX::VecSEQV2I64: return NVPTX::ISetSEQi64rr_toi64; + case NVPTX::VecSEQV2I8: return NVPTX::ISetSEQi8rr_toi8; + case NVPTX::VecSEQV4I16: return NVPTX::ISetSEQi16rr_toi16; + case NVPTX::VecSEQV4I32: return NVPTX::ISetSEQi32rr_toi32; + case NVPTX::VecSEQV4I8: return NVPTX::ISetSEQi8rr_toi8; + case NVPTX::VecSGEV2I16: return NVPTX::ISetSGEi16rr_toi16; + case NVPTX::VecSGEV2I32: return NVPTX::ISetSGEi32rr_toi32; + case NVPTX::VecSGEV2I64: return NVPTX::ISetSGEi64rr_toi64; + case NVPTX::VecSGEV2I8: return NVPTX::ISetSGEi8rr_toi8; + case NVPTX::VecSGEV4I16: return NVPTX::ISetSGEi16rr_toi16; + case NVPTX::VecSGEV4I32: return NVPTX::ISetSGEi32rr_toi32; + case NVPTX::VecSGEV4I8: return NVPTX::ISetSGEi8rr_toi8; + case NVPTX::VecSGTV2I16: return NVPTX::ISetSGTi16rr_toi16; + case NVPTX::VecSGTV2I32: return NVPTX::ISetSGTi32rr_toi32; + case NVPTX::VecSGTV2I64: return NVPTX::ISetSGTi64rr_toi64; + case NVPTX::VecSGTV2I8: return NVPTX::ISetSGTi8rr_toi8; + case NVPTX::VecSGTV4I16: return NVPTX::ISetSGTi16rr_toi16; + case NVPTX::VecSGTV4I32: return NVPTX::ISetSGTi32rr_toi32; + case NVPTX::VecSGTV4I8: return NVPTX::ISetSGTi8rr_toi8; + case NVPTX::VecSLEV2I16: return NVPTX::ISetSLEi16rr_toi16; + case NVPTX::VecSLEV2I32: return NVPTX::ISetSLEi32rr_toi32; + case NVPTX::VecSLEV2I64: return NVPTX::ISetSLEi64rr_toi64; + case NVPTX::VecSLEV2I8: return NVPTX::ISetSLEi8rr_toi8; + case NVPTX::VecSLEV4I16: return NVPTX::ISetSLEi16rr_toi16; + case NVPTX::VecSLEV4I32: return NVPTX::ISetSLEi32rr_toi32; + case NVPTX::VecSLEV4I8: return NVPTX::ISetSLEi8rr_toi8; + case NVPTX::VecSLTV2I16: return NVPTX::ISetSLTi16rr_toi16; + case NVPTX::VecSLTV2I32: return NVPTX::ISetSLTi32rr_toi32; + case NVPTX::VecSLTV2I64: return NVPTX::ISetSLTi64rr_toi64; + case NVPTX::VecSLTV2I8: return NVPTX::ISetSLTi8rr_toi8; + case NVPTX::VecSLTV4I16: return NVPTX::ISetSLTi16rr_toi16; + case NVPTX::VecSLTV4I32: return NVPTX::ISetSLTi32rr_toi32; + case NVPTX::VecSLTV4I8: return NVPTX::ISetSLTi8rr_toi8; + case NVPTX::VecSNEV2I16: return NVPTX::ISetSNEi16rr_toi16; + case NVPTX::VecSNEV2I32: return NVPTX::ISetSNEi32rr_toi32; + case NVPTX::VecSNEV2I64: return NVPTX::ISetSNEi64rr_toi64; + case NVPTX::VecSNEV2I8: return NVPTX::ISetSNEi8rr_toi8; + case NVPTX::VecSNEV4I16: return NVPTX::ISetSNEi16rr_toi16; + case NVPTX::VecSNEV4I32: return NVPTX::ISetSNEi32rr_toi32; + case NVPTX::VecSNEV4I8: return NVPTX::ISetSNEi8rr_toi8; + case NVPTX::VecShuffle_v2f32: return NVPTX::FMOV32rr; + case NVPTX::VecShuffle_v2f64: return NVPTX::FMOV64rr; + case 
NVPTX::VecShuffle_v2i16: return NVPTX::IMOV16rr; + case NVPTX::VecShuffle_v2i32: return NVPTX::IMOV32rr; + case NVPTX::VecShuffle_v2i64: return NVPTX::IMOV64rr; + case NVPTX::VecShuffle_v2i8: return NVPTX::IMOV8rr; + case NVPTX::VecShuffle_v4f32: return NVPTX::FMOV32rr; + case NVPTX::VecShuffle_v4i16: return NVPTX::IMOV16rr; + case NVPTX::VecShuffle_v4i32: return NVPTX::IMOV32rr; + case NVPTX::VecShuffle_v4i8: return NVPTX::IMOV8rr; + case NVPTX::VecUEQV2I16: return NVPTX::ISetUEQi16rr_toi16; + case NVPTX::VecUEQV2I32: return NVPTX::ISetUEQi32rr_toi32; + case NVPTX::VecUEQV2I64: return NVPTX::ISetUEQi64rr_toi64; + case NVPTX::VecUEQV2I8: return NVPTX::ISetUEQi8rr_toi8; + case NVPTX::VecUEQV4I16: return NVPTX::ISetUEQi16rr_toi16; + case NVPTX::VecUEQV4I32: return NVPTX::ISetUEQi32rr_toi32; + case NVPTX::VecUEQV4I8: return NVPTX::ISetUEQi8rr_toi8; + case NVPTX::VecUGEV2I16: return NVPTX::ISetUGEi16rr_toi16; + case NVPTX::VecUGEV2I32: return NVPTX::ISetUGEi32rr_toi32; + case NVPTX::VecUGEV2I64: return NVPTX::ISetUGEi64rr_toi64; + case NVPTX::VecUGEV2I8: return NVPTX::ISetUGEi8rr_toi8; + case NVPTX::VecUGEV4I16: return NVPTX::ISetUGEi16rr_toi16; + case NVPTX::VecUGEV4I32: return NVPTX::ISetUGEi32rr_toi32; + case NVPTX::VecUGEV4I8: return NVPTX::ISetUGEi8rr_toi8; + case NVPTX::VecUGTV2I16: return NVPTX::ISetUGTi16rr_toi16; + case NVPTX::VecUGTV2I32: return NVPTX::ISetUGTi32rr_toi32; + case NVPTX::VecUGTV2I64: return NVPTX::ISetUGTi64rr_toi64; + case NVPTX::VecUGTV2I8: return NVPTX::ISetUGTi8rr_toi8; + case NVPTX::VecUGTV4I16: return NVPTX::ISetUGTi16rr_toi16; + case NVPTX::VecUGTV4I32: return NVPTX::ISetUGTi32rr_toi32; + case NVPTX::VecUGTV4I8: return NVPTX::ISetUGTi8rr_toi8; + case NVPTX::VecULEV2I16: return NVPTX::ISetULEi16rr_toi16; + case NVPTX::VecULEV2I32: return NVPTX::ISetULEi32rr_toi32; + case NVPTX::VecULEV2I64: return NVPTX::ISetULEi64rr_toi64; + case NVPTX::VecULEV2I8: return NVPTX::ISetULEi8rr_toi8; + case NVPTX::VecULEV4I16: return NVPTX::ISetULEi16rr_toi16; + case NVPTX::VecULEV4I32: return NVPTX::ISetULEi32rr_toi32; + case NVPTX::VecULEV4I8: return NVPTX::ISetULEi8rr_toi8; + case NVPTX::VecULTV2I16: return NVPTX::ISetULTi16rr_toi16; + case NVPTX::VecULTV2I32: return NVPTX::ISetULTi32rr_toi32; + case NVPTX::VecULTV2I64: return NVPTX::ISetULTi64rr_toi64; + case NVPTX::VecULTV2I8: return NVPTX::ISetULTi8rr_toi8; + case NVPTX::VecULTV4I16: return NVPTX::ISetULTi16rr_toi16; + case NVPTX::VecULTV4I32: return NVPTX::ISetULTi32rr_toi32; + case NVPTX::VecULTV4I8: return NVPTX::ISetULTi8rr_toi8; + case NVPTX::VecUNEV2I16: return NVPTX::ISetUNEi16rr_toi16; + case NVPTX::VecUNEV2I32: return NVPTX::ISetUNEi32rr_toi32; + case NVPTX::VecUNEV2I64: return NVPTX::ISetUNEi64rr_toi64; + case NVPTX::VecUNEV2I8: return NVPTX::ISetUNEi8rr_toi8; + case NVPTX::VecUNEV4I16: return NVPTX::ISetUNEi16rr_toi16; + case NVPTX::VecUNEV4I32: return NVPTX::ISetUNEi32rr_toi32; + case NVPTX::VecUNEV4I8: return NVPTX::ISetUNEi8rr_toi8; + case NVPTX::INT_PTX_LDU_G_v2i8_32: return NVPTX::INT_PTX_LDU_G_v2i8_ELE_32; + case NVPTX::INT_PTX_LDU_G_v4i8_32: return NVPTX::INT_PTX_LDU_G_v4i8_ELE_32; + case NVPTX::INT_PTX_LDU_G_v2i16_32: return NVPTX::INT_PTX_LDU_G_v2i16_ELE_32; + case NVPTX::INT_PTX_LDU_G_v4i16_32: return NVPTX::INT_PTX_LDU_G_v4i16_ELE_32; + case NVPTX::INT_PTX_LDU_G_v2i32_32: return NVPTX::INT_PTX_LDU_G_v2i32_ELE_32; + case NVPTX::INT_PTX_LDU_G_v4i32_32: return NVPTX::INT_PTX_LDU_G_v4i32_ELE_32; + case NVPTX::INT_PTX_LDU_G_v2f32_32: return NVPTX::INT_PTX_LDU_G_v2f32_ELE_32; + case 
NVPTX::INT_PTX_LDU_G_v4f32_32: return NVPTX::INT_PTX_LDU_G_v4f32_ELE_32; + case NVPTX::INT_PTX_LDU_G_v2i64_32: return NVPTX::INT_PTX_LDU_G_v2i64_ELE_32; + case NVPTX::INT_PTX_LDU_G_v2f64_32: return NVPTX::INT_PTX_LDU_G_v2f64_ELE_32; + case NVPTX::INT_PTX_LDU_G_v2i8_64: return NVPTX::INT_PTX_LDU_G_v2i8_ELE_64; + case NVPTX::INT_PTX_LDU_G_v4i8_64: return NVPTX::INT_PTX_LDU_G_v4i8_ELE_64; + case NVPTX::INT_PTX_LDU_G_v2i16_64: return NVPTX::INT_PTX_LDU_G_v2i16_ELE_64; + case NVPTX::INT_PTX_LDU_G_v4i16_64: return NVPTX::INT_PTX_LDU_G_v4i16_ELE_64; + case NVPTX::INT_PTX_LDU_G_v2i32_64: return NVPTX::INT_PTX_LDU_G_v2i32_ELE_64; + case NVPTX::INT_PTX_LDU_G_v4i32_64: return NVPTX::INT_PTX_LDU_G_v4i32_ELE_64; + case NVPTX::INT_PTX_LDU_G_v2f32_64: return NVPTX::INT_PTX_LDU_G_v2f32_ELE_64; + case NVPTX::INT_PTX_LDU_G_v4f32_64: return NVPTX::INT_PTX_LDU_G_v4f32_ELE_64; + case NVPTX::INT_PTX_LDU_G_v2i64_64: return NVPTX::INT_PTX_LDU_G_v2i64_ELE_64; + case NVPTX::INT_PTX_LDU_G_v2f64_64: return NVPTX::INT_PTX_LDU_G_v2f64_ELE_64; + + case NVPTX::LoadParamV4I32: return NVPTX::LoadParamScalar4I32; + case NVPTX::LoadParamV4I16: return NVPTX::LoadParamScalar4I16; + case NVPTX::LoadParamV4I8: return NVPTX::LoadParamScalar4I8; + case NVPTX::LoadParamV2I64: return NVPTX::LoadParamScalar2I64; + case NVPTX::LoadParamV2I32: return NVPTX::LoadParamScalar2I32; + case NVPTX::LoadParamV2I16: return NVPTX::LoadParamScalar2I16; + case NVPTX::LoadParamV2I8: return NVPTX::LoadParamScalar2I8; + case NVPTX::LoadParamV4F32: return NVPTX::LoadParamScalar4F32; + case NVPTX::LoadParamV2F32: return NVPTX::LoadParamScalar2F32; + case NVPTX::LoadParamV2F64: return NVPTX::LoadParamScalar2F64; + case NVPTX::StoreParamV4I32: return NVPTX::StoreParamScalar4I32; + case NVPTX::StoreParamV4I16: return NVPTX::StoreParamScalar4I16; + case NVPTX::StoreParamV4I8: return NVPTX::StoreParamScalar4I8; + case NVPTX::StoreParamV2I64: return NVPTX::StoreParamScalar2I64; + case NVPTX::StoreParamV2I32: return NVPTX::StoreParamScalar2I32; + case NVPTX::StoreParamV2I16: return NVPTX::StoreParamScalar2I16; + case NVPTX::StoreParamV2I8: return NVPTX::StoreParamScalar2I8; + case NVPTX::StoreParamV4F32: return NVPTX::StoreParamScalar4F32; + case NVPTX::StoreParamV2F32: return NVPTX::StoreParamScalar2F32; + case NVPTX::StoreParamV2F64: return NVPTX::StoreParamScalar2F64; + case NVPTX::StoreRetvalV4I32: return NVPTX::StoreRetvalScalar4I32; + case NVPTX::StoreRetvalV4I16: return NVPTX::StoreRetvalScalar4I16; + case NVPTX::StoreRetvalV4I8: return NVPTX::StoreRetvalScalar4I8; + case NVPTX::StoreRetvalV2I64: return NVPTX::StoreRetvalScalar2I64; + case NVPTX::StoreRetvalV2I32: return NVPTX::StoreRetvalScalar2I32; + case NVPTX::StoreRetvalV2I16: return NVPTX::StoreRetvalScalar2I16; + case NVPTX::StoreRetvalV2I8: return NVPTX::StoreRetvalScalar2I8; + case NVPTX::StoreRetvalV4F32: return NVPTX::StoreRetvalScalar4F32; + case NVPTX::StoreRetvalV2F32: return NVPTX::StoreRetvalScalar2F32; + case NVPTX::StoreRetvalV2F64: return NVPTX::StoreRetvalScalar2F64; + case NVPTX::VecI32toV4I8: return NVPTX::I32toV4I8; + case NVPTX::VecI64toV4I16: return NVPTX::I64toV4I16; + case NVPTX::VecI16toV2I8: return NVPTX::I16toV2I8; + case NVPTX::VecI32toV2I16: return NVPTX::I32toV2I16; + case NVPTX::VecI64toV2I32: return NVPTX::I64toV2I32; + case NVPTX::VecF64toV2F32: return NVPTX::F64toV2F32; + + case NVPTX::LD_v2i8_avar: return NVPTX::LDV_i8_v2_avar; + case NVPTX::LD_v2i8_areg: return NVPTX::LDV_i8_v2_areg; + case NVPTX::LD_v2i8_ari: return NVPTX::LDV_i8_v2_ari; + case 
NVPTX::LD_v2i8_asi: return NVPTX::LDV_i8_v2_asi; + case NVPTX::LD_v4i8_avar: return NVPTX::LDV_i8_v4_avar; + case NVPTX::LD_v4i8_areg: return NVPTX::LDV_i8_v4_areg; + case NVPTX::LD_v4i8_ari: return NVPTX::LDV_i8_v4_ari; + case NVPTX::LD_v4i8_asi: return NVPTX::LDV_i8_v4_asi; + + case NVPTX::LD_v2i16_avar: return NVPTX::LDV_i16_v2_avar; + case NVPTX::LD_v2i16_areg: return NVPTX::LDV_i16_v2_areg; + case NVPTX::LD_v2i16_ari: return NVPTX::LDV_i16_v2_ari; + case NVPTX::LD_v2i16_asi: return NVPTX::LDV_i16_v2_asi; + case NVPTX::LD_v4i16_avar: return NVPTX::LDV_i16_v4_avar; + case NVPTX::LD_v4i16_areg: return NVPTX::LDV_i16_v4_areg; + case NVPTX::LD_v4i16_ari: return NVPTX::LDV_i16_v4_ari; + case NVPTX::LD_v4i16_asi: return NVPTX::LDV_i16_v4_asi; + + case NVPTX::LD_v2i32_avar: return NVPTX::LDV_i32_v2_avar; + case NVPTX::LD_v2i32_areg: return NVPTX::LDV_i32_v2_areg; + case NVPTX::LD_v2i32_ari: return NVPTX::LDV_i32_v2_ari; + case NVPTX::LD_v2i32_asi: return NVPTX::LDV_i32_v2_asi; + case NVPTX::LD_v4i32_avar: return NVPTX::LDV_i32_v4_avar; + case NVPTX::LD_v4i32_areg: return NVPTX::LDV_i32_v4_areg; + case NVPTX::LD_v4i32_ari: return NVPTX::LDV_i32_v4_ari; + case NVPTX::LD_v4i32_asi: return NVPTX::LDV_i32_v4_asi; + + case NVPTX::LD_v2f32_avar: return NVPTX::LDV_f32_v2_avar; + case NVPTX::LD_v2f32_areg: return NVPTX::LDV_f32_v2_areg; + case NVPTX::LD_v2f32_ari: return NVPTX::LDV_f32_v2_ari; + case NVPTX::LD_v2f32_asi: return NVPTX::LDV_f32_v2_asi; + case NVPTX::LD_v4f32_avar: return NVPTX::LDV_f32_v4_avar; + case NVPTX::LD_v4f32_areg: return NVPTX::LDV_f32_v4_areg; + case NVPTX::LD_v4f32_ari: return NVPTX::LDV_f32_v4_ari; + case NVPTX::LD_v4f32_asi: return NVPTX::LDV_f32_v4_asi; + + case NVPTX::LD_v2i64_avar: return NVPTX::LDV_i64_v2_avar; + case NVPTX::LD_v2i64_areg: return NVPTX::LDV_i64_v2_areg; + case NVPTX::LD_v2i64_ari: return NVPTX::LDV_i64_v2_ari; + case NVPTX::LD_v2i64_asi: return NVPTX::LDV_i64_v2_asi; + case NVPTX::LD_v2f64_avar: return NVPTX::LDV_f64_v2_avar; + case NVPTX::LD_v2f64_areg: return NVPTX::LDV_f64_v2_areg; + case NVPTX::LD_v2f64_ari: return NVPTX::LDV_f64_v2_ari; + case NVPTX::LD_v2f64_asi: return NVPTX::LDV_f64_v2_asi; + + case NVPTX::ST_v2i8_avar: return NVPTX::STV_i8_v2_avar; + case NVPTX::ST_v2i8_areg: return NVPTX::STV_i8_v2_areg; + case NVPTX::ST_v2i8_ari: return NVPTX::STV_i8_v2_ari; + case NVPTX::ST_v2i8_asi: return NVPTX::STV_i8_v2_asi; + case NVPTX::ST_v4i8_avar: return NVPTX::STV_i8_v4_avar; + case NVPTX::ST_v4i8_areg: return NVPTX::STV_i8_v4_areg; + case NVPTX::ST_v4i8_ari: return NVPTX::STV_i8_v4_ari; + case NVPTX::ST_v4i8_asi: return NVPTX::STV_i8_v4_asi; + + case NVPTX::ST_v2i16_avar: return NVPTX::STV_i16_v2_avar; + case NVPTX::ST_v2i16_areg: return NVPTX::STV_i16_v2_areg; + case NVPTX::ST_v2i16_ari: return NVPTX::STV_i16_v2_ari; + case NVPTX::ST_v2i16_asi: return NVPTX::STV_i16_v2_asi; + case NVPTX::ST_v4i16_avar: return NVPTX::STV_i16_v4_avar; + case NVPTX::ST_v4i16_areg: return NVPTX::STV_i16_v4_areg; + case NVPTX::ST_v4i16_ari: return NVPTX::STV_i16_v4_ari; + case NVPTX::ST_v4i16_asi: return NVPTX::STV_i16_v4_asi; + + case NVPTX::ST_v2i32_avar: return NVPTX::STV_i32_v2_avar; + case NVPTX::ST_v2i32_areg: return NVPTX::STV_i32_v2_areg; + case NVPTX::ST_v2i32_ari: return NVPTX::STV_i32_v2_ari; + case NVPTX::ST_v2i32_asi: return NVPTX::STV_i32_v2_asi; + case NVPTX::ST_v4i32_avar: return NVPTX::STV_i32_v4_avar; + case NVPTX::ST_v4i32_areg: return NVPTX::STV_i32_v4_areg; + case NVPTX::ST_v4i32_ari: return NVPTX::STV_i32_v4_ari; + case NVPTX::ST_v4i32_asi: 
return NVPTX::STV_i32_v4_asi; + + case NVPTX::ST_v2f32_avar: return NVPTX::STV_f32_v2_avar; + case NVPTX::ST_v2f32_areg: return NVPTX::STV_f32_v2_areg; + case NVPTX::ST_v2f32_ari: return NVPTX::STV_f32_v2_ari; + case NVPTX::ST_v2f32_asi: return NVPTX::STV_f32_v2_asi; + case NVPTX::ST_v4f32_avar: return NVPTX::STV_f32_v4_avar; + case NVPTX::ST_v4f32_areg: return NVPTX::STV_f32_v4_areg; + case NVPTX::ST_v4f32_ari: return NVPTX::STV_f32_v4_ari; + case NVPTX::ST_v4f32_asi: return NVPTX::STV_f32_v4_asi; + + case NVPTX::ST_v2i64_avar: return NVPTX::STV_i64_v2_avar; + case NVPTX::ST_v2i64_areg: return NVPTX::STV_i64_v2_areg; + case NVPTX::ST_v2i64_ari: return NVPTX::STV_i64_v2_ari; + case NVPTX::ST_v2i64_asi: return NVPTX::STV_i64_v2_asi; + case NVPTX::ST_v2f64_avar: return NVPTX::STV_f64_v2_avar; + case NVPTX::ST_v2f64_areg: return NVPTX::STV_f64_v2_areg; + case NVPTX::ST_v2f64_ari: return NVPTX::STV_f64_v2_ari; + case NVPTX::ST_v2f64_asi: return NVPTX::STV_f64_v2_asi; + } + return 0; +} diff --git a/lib/Target/NVPTX/cl_common_defines.h b/lib/Target/NVPTX/cl_common_defines.h new file mode 100644 index 0000000..a7347ef --- /dev/null +++ b/lib/Target/NVPTX/cl_common_defines.h @@ -0,0 +1,125 @@ +#ifndef __CL_COMMON_DEFINES_H__ +#define __CL_COMMON_DEFINES_H__ +// This file includes defines that are common to both kernel code and +// the NVPTX back-end. + +// +// Common defines for Image intrinsics +// Channel order +enum { + CLK_R = 0x10B0, + CLK_A = 0x10B1, + CLK_RG = 0x10B2, + CLK_RA = 0x10B3, + CLK_RGB = 0x10B4, + CLK_RGBA = 0x10B5, + CLK_BGRA = 0x10B6, + CLK_ARGB = 0x10B7, + +#if (__NV_CL_C_VERSION == __NV_CL_C_VERSION_1_0) + CLK_xRGB = 0x10B7, +#endif + + CLK_INTENSITY = 0x10B8, + CLK_LUMINANCE = 0x10B9 + +#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1) + , + CLK_Rx = 0x10BA, + CLK_RGx = 0x10BB, + CLK_RGBx = 0x10BC +#endif +}; + + +typedef enum clk_channel_type { + // valid formats for float return types + CLK_SNORM_INT8 = 0x10D0, // four channel RGBA snorm8 + CLK_SNORM_INT16 = 0x10D1, // four channel RGBA snorm16 + CLK_UNORM_INT8 = 0x10D2, // four channel RGBA unorm8 + CLK_UNORM_INT16 = 0x10D3, // four channel RGBA unorm16 + CLK_HALF_FLOAT = 0x10DD, // four channel RGBA half + CLK_FLOAT = 0x10DE, // four channel RGBA float + +#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1) + CLK_UNORM_SHORT_565 = 0x10D4, + CLK_UNORM_SHORT_555 = 0x10D5, + CLK_UNORM_INT_101010 = 0x10D6, +#endif + + // valid only for integer return types + CLK_SIGNED_INT8 = 0x10D7, + CLK_SIGNED_INT16 = 0x10D8, + CLK_SIGNED_INT32 = 0x10D9, + CLK_UNSIGNED_INT8 = 0x10DA, + CLK_UNSIGNED_INT16 = 0x10DB, + CLK_UNSIGNED_INT32 = 0x10DC, + + // CI SPI for CPU + __CLK_UNORM_INT8888 , // four channel ARGB unorm8 + __CLK_UNORM_INT8888R, // four channel BGRA unorm8 + + __CLK_VALID_IMAGE_TYPE_COUNT, + __CLK_INVALID_IMAGE_TYPE = __CLK_VALID_IMAGE_TYPE_COUNT, + __CLK_VALID_IMAGE_TYPE_MASK_BITS = 4, // number of bits required to + // represent any image type + __CLK_VALID_IMAGE_TYPE_MASK = ( 1 << __CLK_VALID_IMAGE_TYPE_MASK_BITS ) - 1 +} clk_channel_type; + +typedef enum clk_sampler_type { + __CLK_ADDRESS_BASE = 0, + CLK_ADDRESS_NONE = 0 << __CLK_ADDRESS_BASE, + CLK_ADDRESS_CLAMP = 1 << __CLK_ADDRESS_BASE, + CLK_ADDRESS_CLAMP_TO_EDGE = 2 << __CLK_ADDRESS_BASE, + CLK_ADDRESS_REPEAT = 3 << __CLK_ADDRESS_BASE, + CLK_ADDRESS_MIRROR = 4 << __CLK_ADDRESS_BASE, + +#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1) + CLK_ADDRESS_MIRRORED_REPEAT = CLK_ADDRESS_MIRROR, +#endif + __CLK_ADDRESS_MASK = CLK_ADDRESS_NONE | CLK_ADDRESS_CLAMP | 
CLK_ADDRESS_CLAMP_TO_EDGE | + CLK_ADDRESS_REPEAT | CLK_ADDRESS_MIRROR, + __CLK_ADDRESS_BITS = 3, // number of bits required to + // represent address info + + __CLK_NORMALIZED_BASE = __CLK_ADDRESS_BITS, + CLK_NORMALIZED_COORDS_FALSE = 0, + CLK_NORMALIZED_COORDS_TRUE = 1 << __CLK_NORMALIZED_BASE, + __CLK_NORMALIZED_MASK = CLK_NORMALIZED_COORDS_FALSE | + CLK_NORMALIZED_COORDS_TRUE, + __CLK_NORMALIZED_BITS = 1, // number of bits required to + // represent normalization + + __CLK_FILTER_BASE = __CLK_NORMALIZED_BASE + + __CLK_NORMALIZED_BITS, + CLK_FILTER_NEAREST = 0 << __CLK_FILTER_BASE, + CLK_FILTER_LINEAR = 1 << __CLK_FILTER_BASE, + CLK_FILTER_ANISOTROPIC = 2 << __CLK_FILTER_BASE, + __CLK_FILTER_MASK = CLK_FILTER_NEAREST | CLK_FILTER_LINEAR | + CLK_FILTER_ANISOTROPIC, + __CLK_FILTER_BITS = 2, // number of bits required to + // represent filter info + + __CLK_MIP_BASE = __CLK_FILTER_BASE + __CLK_FILTER_BITS, + CLK_MIP_NEAREST = 0 << __CLK_MIP_BASE, + CLK_MIP_LINEAR = 1 << __CLK_MIP_BASE, + CLK_MIP_ANISOTROPIC = 2 << __CLK_MIP_BASE, + __CLK_MIP_MASK = CLK_MIP_NEAREST | CLK_MIP_LINEAR | + CLK_MIP_ANISOTROPIC, + __CLK_MIP_BITS = 2, + + __CLK_SAMPLER_BITS = __CLK_MIP_BASE + __CLK_MIP_BITS, + __CLK_SAMPLER_MASK = __CLK_MIP_MASK | __CLK_FILTER_MASK | + __CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK, + + __CLK_ANISOTROPIC_RATIO_BITS = 5, + __CLK_ANISOTROPIC_RATIO_MASK = (int) 0x80000000 >> + (__CLK_ANISOTROPIC_RATIO_BITS-1) +} clk_sampler_type; + +// Memory synchronization +#define CLK_LOCAL_MEM_FENCE (1 << 0) +#define CLK_GLOBAL_MEM_FENCE (1 << 1) + +#endif // __CL_COMMON_DEFINES_H__ diff --git a/lib/Target/NVPTX/gen-register-defs.py b/lib/Target/NVPTX/gen-register-defs.py new file mode 100644 index 0000000..ed06668 --- /dev/null +++ b/lib/Target/NVPTX/gen-register-defs.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python + +num_regs = 396 + +outFile = open('NVPTXRegisterInfo.td', 'w') + +outFile.write(''' +//===-- NVPTXRegisterInfo.td - NVPTX Register defs ---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
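The clk_sampler_type enum in cl_common_defines.h above packs three independent fields into one integer: the address mode in bits [2:0], the normalized-coordinates flag in bit 3, and the filter mode in bits [5:4], with the mip mode and anisotropic ratio sitting above those. A self-contained check of that layout, with the constants copied from the header above:

#include <cassert>

// Constants reproduced from cl_common_defines.h.
enum {
  CLK_ADDRESS_CLAMP          = 1,      // bits [2:0]
  __CLK_ADDRESS_MASK         = 0x7,
  CLK_NORMALIZED_COORDS_TRUE = 1 << 3, // bit 3
  CLK_FILTER_LINEAR          = 1 << 4, // bits [5:4]
  __CLK_FILTER_MASK          = 0x30
};

int main() {
  // A sampler word is built by OR-ing one value from each field.
  unsigned Sampler = CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE |
                     CLK_FILTER_LINEAR;              // == 0x19

  // Each field decodes independently through its mask.
  assert((Sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP);
  assert((Sampler & __CLK_FILTER_MASK) == CLK_FILTER_LINEAR);
  assert(Sampler & CLK_NORMALIZED_COORDS_TRUE);
  return 0;
}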
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Declarations that describe the PTX register file +//===----------------------------------------------------------------------===// + +class NVPTXReg<string n> : Register<n> { + let Namespace = "NVPTX"; +} + +class NVPTXRegClass<list<ValueType> regTypes, int alignment, dag regList> + : RegisterClass <"NVPTX", regTypes, alignment, regList>; + +//===----------------------------------------------------------------------===// +// Registers +//===----------------------------------------------------------------------===// + +// Special Registers used as stack pointer +def VRFrame : NVPTXReg<"%SP">; +def VRFrameLocal : NVPTXReg<"%SPL">; + +// Special Registers used as the stack +def VRDepot : NVPTXReg<"%Depot">; +''') + +# Predicates +outFile.write(''' +//===--- Predicate --------------------------------------------------------===// +''') +for i in range(0, num_regs): + outFile.write('def P%d : NVPTXReg<"%%p%d">;\n' % (i, i)) + +# Int8 +outFile.write(''' +//===--- 8-bit ------------------------------------------------------------===// +''') +for i in range(0, num_regs): + outFile.write('def RC%d : NVPTXReg<"%%rc%d">;\n' % (i, i)) + +# Int16 +outFile.write(''' +//===--- 16-bit -----------------------------------------------------------===// +''') +for i in range(0, num_regs): + outFile.write('def RS%d : NVPTXReg<"%%rs%d">;\n' % (i, i)) + +# Int32 +outFile.write(''' +//===--- 32-bit -----------------------------------------------------------===// +''') +for i in range(0, num_regs): + outFile.write('def R%d : NVPTXReg<"%%r%d">;\n' % (i, i)) + +# Int64 +outFile.write(''' +//===--- 64-bit -----------------------------------------------------------===// +''') +for i in range(0, num_regs): + outFile.write('def RL%d : NVPTXReg<"%%rl%d">;\n' % (i, i)) + +# F32 +outFile.write(''' +//===--- 32-bit float -----------------------------------------------------===// +''') +for i in range(0, num_regs): + outFile.write('def F%d : NVPTXReg<"%%f%d">;\n' % (i, i)) + +# F64 +outFile.write(''' +//===--- 64-bit float -----------------------------------------------------===// +''') +for i in range(0, num_regs): + outFile.write('def FL%d : NVPTXReg<"%%fl%d">;\n' % (i, i)) + +# Vector registers +outFile.write(''' +//===--- Vector -----------------------------------------------------------===// +''') +for i in range(0, num_regs): + outFile.write('def v2b8_%d : NVPTXReg<"%%v2b8_%d">;\n' % (i, i)) +for i in range(0, num_regs): + outFile.write('def v2b16_%d : NVPTXReg<"%%v2b16_%d">;\n' % (i, i)) +for i in range(0, num_regs): + outFile.write('def v2b32_%d : NVPTXReg<"%%v2b32_%d">;\n' % (i, i)) +for i in range(0, num_regs): + outFile.write('def v2b64_%d : NVPTXReg<"%%v2b64_%d">;\n' % (i, i)) + +for i in range(0, num_regs): + outFile.write('def v4b8_%d : NVPTXReg<"%%v4b8_%d">;\n' % (i, i)) +for i in range(0, num_regs): + outFile.write('def v4b16_%d : NVPTXReg<"%%v4b16_%d">;\n' % (i, i)) +for i in range(0, num_regs): + outFile.write('def v4b32_%d : NVPTXReg<"%%v4b32_%d">;\n' % (i, i)) + +# Argument registers +outFile.write(''' +//===--- Arguments --------------------------------------------------------===// +''') +for i in range(0, num_regs): + outFile.write('def ia%d : NVPTXReg<"%%ia%d">;\n' % (i, i)) +for i in range(0, num_regs): + outFile.write('def la%d : NVPTXReg<"%%la%d">;\n' % (i, i)) +for i in range(0, num_regs): + outFile.write('def fa%d : 
NVPTXReg<"%%fa%d">;\n' % (i, i)) +for i in range(0, num_regs): + outFile.write('def da%d : NVPTXReg<"%%da%d">;\n' % (i, i)) + +outFile.write(''' +//===----------------------------------------------------------------------===// +// Register classes +//===----------------------------------------------------------------------===// +''') + +outFile.write('def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%%u", 0, %d))>;\n' % (num_regs-1)) +outFile.write('def Int8Regs : NVPTXRegClass<[i8], 8, (add (sequence "RC%%u", 0, %d))>;\n' % (num_regs-1)) +outFile.write('def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%%u", 0, %d))>;\n' % (num_regs-1)) +outFile.write('def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%%u", 0, %d))>;\n' % (num_regs-1)) +outFile.write('def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%%u", 0, %d))>;\n' % (num_regs-1)) + +outFile.write('def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%%u", 0, %d))>;\n' % (num_regs-1)) +outFile.write('def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%%u", 0, %d))>;\n' % (num_regs-1)) + +outFile.write('def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%%u", 0, %d))>;\n' % (num_regs-1)) +outFile.write('def Int64ArgRegs : NVPTXRegClass<[i64], 64, (add (sequence "la%%u", 0, %d))>;\n' % (num_regs-1)) +outFile.write('def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%%u", 0, %d))>;\n' % (num_regs-1)) +outFile.write('def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%%u", 0, %d))>;\n' % (num_regs-1)) + +outFile.write(''' +// Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used. +def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot)>; +''') + +outFile.write(''' +class NVPTXVecRegClass<list<ValueType> regTypes, int alignment, dag regList, + NVPTXRegClass sClass, + int e, + string n> + : NVPTXRegClass<regTypes, alignment, regList> +{ + NVPTXRegClass scalarClass=sClass; + int elems=e; + string name=n; +} +''') + + +outFile.write('def V2F32Regs\n : NVPTXVecRegClass<[v2f32], 64, (add (sequence "v2b32_%%u", 0, %d)),\n Float32Regs, 2, ".v2.f32">;\n' % (num_regs-1)) +outFile.write('def V4F32Regs\n : NVPTXVecRegClass<[v4f32], 128, (add (sequence "v4b32_%%u", 0, %d)),\n Float32Regs, 4, ".v4.f32">;\n' % (num_regs-1)) + +outFile.write('def V2I32Regs\n : NVPTXVecRegClass<[v2i32], 64, (add (sequence "v2b32_%%u", 0, %d)),\n Int32Regs, 2, ".v2.u32">;\n' % (num_regs-1)) +outFile.write('def V4I32Regs\n : NVPTXVecRegClass<[v4i32], 128, (add (sequence "v4b32_%%u", 0, %d)),\n Int32Regs, 4, ".v4.u32">;\n' % (num_regs-1)) + +outFile.write('def V2F64Regs\n : NVPTXVecRegClass<[v2f64], 128, (add (sequence "v2b64_%%u", 0, %d)),\n Float64Regs, 2, ".v2.f64">;\n' % (num_regs-1)) +outFile.write('def V2I64Regs\n : NVPTXVecRegClass<[v2i64], 128, (add (sequence "v2b64_%%u", 0, %d)),\n Int64Regs, 2, ".v2.u64">;\n' % (num_regs-1)) + +outFile.write('def V2I16Regs\n : NVPTXVecRegClass<[v2i16], 32, (add (sequence "v2b16_%%u", 0, %d)),\n Int16Regs, 2, ".v2.u16">;\n' % (num_regs-1)) +outFile.write('def V4I16Regs\n : NVPTXVecRegClass<[v4i16], 64, (add (sequence "v4b16_%%u", 0, %d)),\n Int16Regs, 4, ".v4.u16">;\n' % (num_regs-1)) + +outFile.write('def V2I8Regs\n : NVPTXVecRegClass<[v2i8], 16, (add (sequence "v2b8_%%u", 0, %d)),\n Int8Regs, 2, ".v2.u8">;\n' % (num_regs-1)) +outFile.write('def V4I8Regs\n : NVPTXVecRegClass<[v4i8], 32, (add (sequence "v4b8_%%u", 0, %d)),\n Int8Regs, 4, ".v4.u8">;\n' % (num_regs-1)) + +outFile.close() + + +outFile = 
open('NVPTXNumRegisters.h', 'w') +outFile.write(''' +//===-- NVPTXNumRegisters.h - PTX Register Info ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef NVPTX_NUM_REGISTERS_H +#define NVPTX_NUM_REGISTERS_H + +namespace llvm { + +const unsigned NVPTXNumRegisters = %d; + +} + +#endif +''' % num_regs) + +outFile.close() diff --git a/lib/Target/PTX/CMakeLists.txt b/lib/Target/PTX/CMakeLists.txt deleted file mode 100644 index a3be342..0000000 --- a/lib/Target/PTX/CMakeLists.txt +++ /dev/null @@ -1,32 +0,0 @@ -set(LLVM_TARGET_DEFINITIONS PTX.td) - -tablegen(LLVM PTXGenAsmWriter.inc -gen-asm-writer) -tablegen(LLVM PTXGenDAGISel.inc -gen-dag-isel) -tablegen(LLVM PTXGenInstrInfo.inc -gen-instr-info) -tablegen(LLVM PTXGenRegisterInfo.inc -gen-register-info) -tablegen(LLVM PTXGenSubtargetInfo.inc -gen-subtarget) -add_public_tablegen_target(PTXCommonTableGen) - -add_llvm_target(PTXCodeGen - PTXAsmPrinter.cpp - PTXISelDAGToDAG.cpp - PTXISelLowering.cpp - PTXInstrInfo.cpp - PTXFPRoundingModePass.cpp - PTXFrameLowering.cpp - PTXMCAsmStreamer.cpp - PTXMCInstLower.cpp - PTXMFInfoExtract.cpp - PTXMachineFunctionInfo.cpp - PTXParamManager.cpp - PTXRegAlloc.cpp - PTXRegisterInfo.cpp - PTXSelectionDAGInfo.cpp - PTXSubtarget.cpp - PTXTargetMachine.cpp - ) - -add_subdirectory(TargetInfo) -add_subdirectory(InstPrinter) -add_subdirectory(MCTargetDesc) - diff --git a/lib/Target/PTX/InstPrinter/CMakeLists.txt b/lib/Target/PTX/InstPrinter/CMakeLists.txt deleted file mode 100644 index b252893..0000000 --- a/lib/Target/PTX/InstPrinter/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMPTXAsmPrinter - PTXInstPrinter.cpp - ) - -add_dependencies(LLVMPTXAsmPrinter PTXCommonTableGen) - diff --git a/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp b/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp deleted file mode 100644 index 1830213..0000000 --- a/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp +++ /dev/null @@ -1,249 +0,0 @@ -//===-- PTXInstPrinter.cpp - Convert PTX MCInst to assembly syntax --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints a PTX MCInst to a .ptx file. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "asm-printer" -#include "PTXInstPrinter.h" -#include "MCTargetDesc/PTXBaseInfo.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/ADT/APFloat.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -#include "PTXGenAsmWriter.inc" - -PTXInstPrinter::PTXInstPrinter(const MCAsmInfo &MAI, - const MCInstrInfo &MII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) : - MCInstPrinter(MAI, MII, MRI) { - // Initialize the set of available features. 
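A detail of gen-register-defs.py above that is easy to misread: TableGen's sequence operator itself takes a format string such as "R%u", so the script has to emit a literal percent sign and therefore doubles it as %%u inside its own %-formatted strings. The same two-layer escaping, sketched in C++ (printf stands in for the script's outFile.write; num_regs is the script's own constant):

#include <cstdio>

int main() {
  int num_regs = 396;
  // "%%u" survives formatting as the literal "%u" that TableGen's
  // (sequence "R%u", 0, N) expects; "%d" is expanded here, to 395.
  std::printf("def Int32Regs : NVPTXRegClass<[i32], 32, "
              "(add (sequence \"R%%u\", 0, %d))>;\n",
              num_regs - 1);
  return 0;
}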
- setAvailableFeatures(STI.getFeatureBits()); -} - -void PTXInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - // Decode the register number into type and offset - unsigned RegSpace = RegNo & 0x7; - unsigned RegType = (RegNo >> 3) & 0x7; - unsigned RegOffset = RegNo >> 6; - - // Print the register - OS << "%"; - - switch (RegSpace) { - default: - llvm_unreachable("Unknown register space!"); - case PTXRegisterSpace::Reg: - switch (RegType) { - default: - llvm_unreachable("Unknown register type!"); - case PTXRegisterType::Pred: - OS << "p"; - break; - case PTXRegisterType::B16: - OS << "rh"; - break; - case PTXRegisterType::B32: - OS << "r"; - break; - case PTXRegisterType::B64: - OS << "rd"; - break; - case PTXRegisterType::F32: - OS << "f"; - break; - case PTXRegisterType::F64: - OS << "fd"; - break; - } - break; - case PTXRegisterSpace::Return: - OS << "ret"; - break; - case PTXRegisterSpace::Argument: - OS << "arg"; - break; - } - - OS << RegOffset; -} - -void PTXInstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot) { - printPredicate(MI, O); - switch (MI->getOpcode()) { - default: - printInstruction(MI, O); - break; - case PTX::CALL: - printCall(MI, O); - } - O << ";"; - printAnnotation(O, Annot); -} - -void PTXInstPrinter::printPredicate(const MCInst *MI, raw_ostream &O) { - // The last two operands are the predicate operands - int RegIndex; - int OpIndex; - - if (MI->getOpcode() == PTX::CALL) { - RegIndex = 0; - OpIndex = 1; - } else { - RegIndex = MI->getNumOperands()-2; - OpIndex = MI->getNumOperands()-1; - } - - int PredOp = MI->getOperand(OpIndex).getImm(); - if (PredOp == PTXPredicate::None) - return; - - if (PredOp == PTXPredicate::Negate) - O << '!'; - else - O << '@'; - - printOperand(MI, RegIndex, O); -} - -void PTXInstPrinter::printCall(const MCInst *MI, raw_ostream &O) { - O << "\tcall.uni\t"; - // The first two operands are the predicate slot - unsigned Index = 2; - unsigned NumRets = MI->getOperand(Index++).getImm(); - - if (NumRets > 0) { - O << "("; - printOperand(MI, Index++, O); - for (unsigned i = 1; i < NumRets; ++i) { - O << ", "; - printOperand(MI, Index++, O); - } - O << "), "; - } - - const MCExpr* Expr = MI->getOperand(Index++).getExpr(); - unsigned NumArgs = MI->getOperand(Index++).getImm(); - - // if the function call is to printf or puts, change to vprintf - if (const MCSymbolRefExpr *SymRefExpr = dyn_cast<MCSymbolRefExpr>(Expr)) { - const MCSymbol &Sym = SymRefExpr->getSymbol(); - if (Sym.getName() == "printf" || Sym.getName() == "puts") { - O << "vprintf"; - } else { - O << Sym.getName(); - } - } else { - O << *Expr; - } - - O << ", ("; - - if (NumArgs > 0) { - printOperand(MI, Index++, O); - for (unsigned i = 1; i < NumArgs; ++i) { - O << ", "; - printOperand(MI, Index++, O); - } - } - O << ")"; -} - -void PTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isImm()) { - O << Op.getImm(); - } else if (Op.isFPImm()) { - double Imm = Op.getFPImm(); - APFloat FPImm(Imm); - APInt FPIntImm = FPImm.bitcastToAPInt(); - O << "0D"; - // PTX requires us to output the full 64 bits, even if the number is zero - if (FPIntImm.getZExtValue() > 0) { - O << FPIntImm.toString(16, false); - } else { - O << "0000000000000000"; - } - } else if (Op.isReg()) { - printRegName(O, Op.getReg()); - } else { - assert(Op.isExpr() && "unknown operand kind in printOperand"); - const MCExpr *Expr = Op.getExpr(); - if (const MCSymbolRefExpr *SymRefExpr = 
dyn_cast<MCSymbolRefExpr>(Expr)) { - const MCSymbol &Sym = SymRefExpr->getSymbol(); - O << Sym.getName(); - } else { - O << *Op.getExpr(); - } - } -} - -void PTXInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - // By definition, operand OpNo+1 is an i32imm - const MCOperand &Op2 = MI->getOperand(OpNo+1); - printOperand(MI, OpNo, O); - if (Op2.getImm() == 0) - return; // don't print "+0" - O << "+" << Op2.getImm(); -} - -void PTXInstPrinter::printRoundingMode(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - assert (Op.isImm() && "Rounding modes must be immediate values"); - switch (Op.getImm()) { - default: - llvm_unreachable("Unknown rounding mode!"); - case PTXRoundingMode::RndDefault: - llvm_unreachable("FP rounding-mode pass did not handle instruction!"); - case PTXRoundingMode::RndNone: - // Do not print anything. - break; - case PTXRoundingMode::RndNearestEven: - O << ".rn"; - break; - case PTXRoundingMode::RndTowardsZero: - O << ".rz"; - break; - case PTXRoundingMode::RndNegInf: - O << ".rm"; - break; - case PTXRoundingMode::RndPosInf: - O << ".rp"; - break; - case PTXRoundingMode::RndApprox: - O << ".approx"; - break; - case PTXRoundingMode::RndNearestEvenInt: - O << ".rni"; - break; - case PTXRoundingMode::RndTowardsZeroInt: - O << ".rzi"; - break; - case PTXRoundingMode::RndNegInfInt: - O << ".rmi"; - break; - case PTXRoundingMode::RndPosInfInt: - O << ".rpi"; - break; - } -} - diff --git a/lib/Target/PTX/InstPrinter/PTXInstPrinter.h b/lib/Target/PTX/InstPrinter/PTXInstPrinter.h deleted file mode 100644 index ea4d504..0000000 --- a/lib/Target/PTX/InstPrinter/PTXInstPrinter.h +++ /dev/null @@ -1,45 +0,0 @@ -//===- PTXInstPrinter.h - Convert PTX MCInst to assembly syntax -*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints n PTX MCInst to a .ptx file. -// -//===----------------------------------------------------------------------===// - -#ifndef PTXINSTPRINTER_H -#define PTXINSTPRINTER_H - -#include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCSubtargetInfo.h" - -namespace llvm { - -class MCOperand; - -class PTXInstPrinter : public MCInstPrinter { -public: - PTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); - - virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); - virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; - - // Autogenerated by tblgen. 
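printOperand() above prints a double-precision immediate as "0D" followed by the full 64-bit IEEE-754 bit pattern in hex, special-casing zero because APInt::toString() emits no leading zeros. A self-contained sketch of the same encoding; zero-padded formatting also covers bit patterns whose leading hex digits are zero (e.g. denormals), which the toString() path above would print short:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Print a double the way the PTX printer above does: '0D' plus all
// sixteen hex digits of its bit pattern.
static void printPTXDouble(double D) {
  std::uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof Bits);  // bit-cast, no value conversion
  std::printf("0D%016llX\n", static_cast<unsigned long long>(Bits));
}

int main() {
  printPTXDouble(1.0);  // 0D3FF0000000000000
  printPTXDouble(0.0);  // 0D0000000000000000, no special case needed
  return 0;
}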
- void printInstruction(const MCInst *MI, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo); - - void printPredicate(const MCInst *MI, raw_ostream &O); - void printCall(const MCInst *MI, raw_ostream &O); - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printRoundingMode(const MCInst *MI, unsigned OpNo, raw_ostream &O); -}; -} - -#endif - diff --git a/lib/Target/PTX/MCTargetDesc/CMakeLists.txt b/lib/Target/PTX/MCTargetDesc/CMakeLists.txt deleted file mode 100644 index d1fd74c..0000000 --- a/lib/Target/PTX/MCTargetDesc/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -add_llvm_library(LLVMPTXDesc - PTXMCTargetDesc.cpp - PTXMCAsmInfo.cpp - ) - -add_dependencies(LLVMPTXDesc PTXCommonTableGen) diff --git a/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h b/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h deleted file mode 100644 index a3e0f32..0000000 --- a/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h +++ /dev/null @@ -1,134 +0,0 @@ -//===-- PTXBaseInfo.h - Top level definitions for PTX -------- --*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains small standalone helper functions and enum definitions for -// the PTX target useful for the compiler back-end and the MC libraries. -// As such, it deliberately does not include references to LLVM core -// code gen types, passes, etc.. -// -//===----------------------------------------------------------------------===// - -#ifndef PTXBASEINFO_H -#define PTXBASEINFO_H - -#include "PTXMCTargetDesc.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" - -namespace llvm { - namespace PTXStateSpace { - enum { - Global = 0, // default to global state space - Constant = 1, - Local = 2, - Parameter = 3, - Shared = 4 - }; - } // namespace PTXStateSpace - - namespace PTXPredicate { - enum { - Normal = 0, - Negate = 1, - None = 2 - }; - } // namespace PTXPredicate - - /// Namespace to hold all target-specific flags. 
- namespace PTXRoundingMode { - // Instruction Flags - enum { - // Rounding Mode Flags - RndMask = 15, - RndDefault = 0, // --- - RndNone = 1, // <NONE> - RndNearestEven = 2, // .rn - RndTowardsZero = 3, // .rz - RndNegInf = 4, // .rm - RndPosInf = 5, // .rp - RndApprox = 6, // .approx - RndNearestEvenInt = 7, // .rni - RndTowardsZeroInt = 8, // .rzi - RndNegInfInt = 9, // .rmi - RndPosInfInt = 10 // .rpi - }; - } // namespace PTXII - - namespace PTXRegisterType { - // Register type encoded in MCOperands - enum { - Pred = 0, - B16, - B32, - B64, - F32, - F64 - }; - } // namespace PTXRegisterType - - namespace PTXRegisterSpace { - // Register space encoded in MCOperands - enum { - Reg = 0, - Local, - Param, - Argument, - Return - }; - } - - inline static void decodeRegisterName(raw_ostream &OS, - unsigned EncodedReg) { - OS << "%"; - - unsigned RegSpace = EncodedReg & 0x7; - unsigned RegType = (EncodedReg >> 3) & 0x7; - unsigned RegOffset = EncodedReg >> 6; - - switch (RegSpace) { - default: - llvm_unreachable("Unknown register space!"); - case PTXRegisterSpace::Reg: - switch (RegType) { - default: - llvm_unreachable("Unknown register type!"); - case PTXRegisterType::Pred: - OS << "p"; - break; - case PTXRegisterType::B16: - OS << "rh"; - break; - case PTXRegisterType::B32: - OS << "r"; - break; - case PTXRegisterType::B64: - OS << "rd"; - break; - case PTXRegisterType::F32: - OS << "f"; - break; - case PTXRegisterType::F64: - OS << "fd"; - break; - } - break; - case PTXRegisterSpace::Return: - OS << "ret"; - break; - case PTXRegisterSpace::Argument: - OS << "arg"; - break; - } - - OS << RegOffset; - } -} // namespace llvm - -#endif - diff --git a/lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.cpp b/lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.cpp deleted file mode 100644 index cdfbc80..0000000 --- a/lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.cpp +++ /dev/null @@ -1,37 +0,0 @@ -//===-- PTXMCAsmInfo.cpp - PTX asm properties -----------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the declarations of the PTXMCAsmInfo properties. -// -//===----------------------------------------------------------------------===// - -#include "PTXMCAsmInfo.h" -#include "llvm/ADT/Triple.h" - -using namespace llvm; - -void PTXMCAsmInfo::anchor() { } - -PTXMCAsmInfo::PTXMCAsmInfo(const Target &T, const StringRef &TT) { - Triple TheTriple(TT); - if (TheTriple.getArch() == Triple::ptx64) - PointerSize = 8; - - CommentString = "//"; - - PrivateGlobalPrefix = "$L__"; - - AllowPeriodsInName = false; - - HasSetDirective = false; - - HasDotTypeDotSizeDirective = false; - - HasSingleParameterDotFile = false; -} diff --git a/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp b/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp deleted file mode 100644 index 08fb970..0000000 --- a/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp +++ /dev/null @@ -1,98 +0,0 @@ -//===-- PTXMCTargetDesc.cpp - PTX Target Descriptions ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file provides PTX specific target descriptions. 
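decodeRegisterName() above, like printRegName() in the deleted PTXInstPrinter.cpp, unpacks one encoding: register space in bits [2:0], register type in bits [5:3], and the register index in the remaining high bits. A sketch of the matching encoder; the patch only ever shows the decode side, so encodeReg() here is a hypothetical helper:

#include <cassert>

// Pack space/type/offset the way the PTX decoders above expect.
static unsigned encodeReg(unsigned Space, unsigned Type, unsigned Offset) {
  return (Space & 0x7) | ((Type & 0x7) << 3) | (Offset << 6);
}

int main() {
  // Space 0 (Reg), type 2 (B32), offset 7 decodes as "%r7".
  unsigned R = encodeReg(0, 2, 7);
  assert((R & 0x7) == 0);         // RegSpace
  assert(((R >> 3) & 0x7) == 2);  // RegType
  assert((R >> 6) == 7);          // RegOffset
  return 0;
}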
-// -//===----------------------------------------------------------------------===// - -#include "PTXMCTargetDesc.h" -#include "PTXMCAsmInfo.h" -#include "InstPrinter/PTXInstPrinter.h" -#include "llvm/MC/MCCodeGenInfo.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/TargetRegistry.h" - -#define GET_INSTRINFO_MC_DESC -#include "PTXGenInstrInfo.inc" - -#define GET_SUBTARGETINFO_MC_DESC -#include "PTXGenSubtargetInfo.inc" - -#define GET_REGINFO_MC_DESC -#include "PTXGenRegisterInfo.inc" - -using namespace llvm; - -static MCInstrInfo *createPTXMCInstrInfo() { - MCInstrInfo *X = new MCInstrInfo(); - InitPTXMCInstrInfo(X); - return X; -} - -static MCRegisterInfo *createPTXMCRegisterInfo(StringRef TT) { - MCRegisterInfo *X = new MCRegisterInfo(); - // PTX does not have a return address register. - InitPTXMCRegisterInfo(X, 0); - return X; -} - -static MCSubtargetInfo *createPTXMCSubtargetInfo(StringRef TT, StringRef CPU, - StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitPTXMCSubtargetInfo(X, TT, CPU, FS); - return X; -} - -static MCCodeGenInfo *createPTXMCCodeGenInfo(StringRef TT, Reloc::Model RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) { - MCCodeGenInfo *X = new MCCodeGenInfo(); - X->InitMCCodeGenInfo(RM, CM, OL); - return X; -} - -static MCInstPrinter *createPTXMCInstPrinter(const Target &T, - unsigned SyntaxVariant, - const MCAsmInfo &MAI, - const MCInstrInfo &MII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) { - assert(SyntaxVariant == 0 && "We only have one syntax variant"); - return new PTXInstPrinter(MAI, MII, MRI, STI); -} - -extern "C" void LLVMInitializePTXTargetMC() { - // Register the MC asm info. - RegisterMCAsmInfo<PTXMCAsmInfo> X(ThePTX32Target); - RegisterMCAsmInfo<PTXMCAsmInfo> Y(ThePTX64Target); - - // Register the MC codegen info. - TargetRegistry::RegisterMCCodeGenInfo(ThePTX32Target, createPTXMCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(ThePTX64Target, createPTXMCCodeGenInfo); - - // Register the MC instruction info. - TargetRegistry::RegisterMCInstrInfo(ThePTX32Target, createPTXMCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(ThePTX64Target, createPTXMCInstrInfo); - - // Register the MC register info. - TargetRegistry::RegisterMCRegInfo(ThePTX32Target, createPTXMCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(ThePTX64Target, createPTXMCRegisterInfo); - - // Register the MC subtarget info. - TargetRegistry::RegisterMCSubtargetInfo(ThePTX32Target, - createPTXMCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(ThePTX64Target, - createPTXMCSubtargetInfo); - - // Register the MCInstPrinter. - TargetRegistry::RegisterMCInstPrinter(ThePTX32Target, createPTXMCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(ThePTX64Target, createPTXMCInstPrinter); -} diff --git a/lib/Target/PTX/PTX.h b/lib/Target/PTX/PTX.h deleted file mode 100644 index ffb92cb..0000000 --- a/lib/Target/PTX/PTX.h +++ /dev/null @@ -1,43 +0,0 @@ -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the entry points for global functions defined in the LLVM -// PTX back-end. 
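LLVMInitializePTXTargetMC() above registers one set of factory functions under both target handles (ThePTX32Target and ThePTX64Target), so the 32- and 64-bit triples share every MC component. A toy model of that registration pattern; the map-based registry below is illustrative and is not LLVM's TargetRegistry API:

#include <cassert>
#include <map>
#include <string>

using Factory = int (*)();
static int createMCInstrInfo() { return 42; }  // stand-in factory

int main() {
  std::map<std::string, Factory> Registry;
  // One factory, registered under two target keys.
  Registry["ptx32"] = createMCInstrInfo;
  Registry["ptx64"] = createMCInstrInfo;
  assert(Registry["ptx32"]() == Registry["ptx64"]());
  return 0;
}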
-// -//===----------------------------------------------------------------------===// - -#ifndef PTX_H -#define PTX_H - -#include "MCTargetDesc/PTXBaseInfo.h" -#include "llvm/Target/TargetMachine.h" - -namespace llvm { - class MachineInstr; - class MCInst; - class PTXAsmPrinter; - class PTXTargetMachine; - class FunctionPass; - - FunctionPass *createPTXISelDag(PTXTargetMachine &TM, - CodeGenOpt::Level OptLevel); - - FunctionPass *createPTXMFInfoExtract(PTXTargetMachine &TM, - CodeGenOpt::Level OptLevel); - - FunctionPass *createPTXFPRoundingModePass(PTXTargetMachine &TM, - CodeGenOpt::Level OptLevel); - - FunctionPass *createPTXRegisterAllocator(); - - void LowerPTXMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, - PTXAsmPrinter &AP); - -} // namespace llvm; - -#endif // PTX_H diff --git a/lib/Target/PTX/PTX.td b/lib/Target/PTX/PTX.td deleted file mode 100644 index 994a68e..0000000 --- a/lib/Target/PTX/PTX.td +++ /dev/null @@ -1,141 +0,0 @@ -//===-- PTX.td - Describe the PTX Target Machine -----------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// This is the top level entry point for the PTX target. -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Target-independent interfaces -//===----------------------------------------------------------------------===// - -include "llvm/Target/Target.td" - -//===----------------------------------------------------------------------===// -// Subtarget Features -//===----------------------------------------------------------------------===// - -//===- Architectural Features ---------------------------------------------===// - -def FeatureDouble : SubtargetFeature<"double", "SupportsDouble", "true", - "Do not demote .f64 to .f32">; - -def FeatureNoFMA : SubtargetFeature<"no-fma","SupportsFMA", "false", - "Disable Fused-Multiply Add">; - -//===- PTX Version --------------------------------------------------------===// - -def FeaturePTX20 : SubtargetFeature<"ptx20", "PTXVersion", "PTX_VERSION_2_0", - "Use PTX Language Version 2.0">; - -def FeaturePTX21 : SubtargetFeature<"ptx21", "PTXVersion", "PTX_VERSION_2_1", - "Use PTX Language Version 2.1">; - -def FeaturePTX22 : SubtargetFeature<"ptx22", "PTXVersion", "PTX_VERSION_2_2", - "Use PTX Language Version 2.2">; - -def FeaturePTX23 : SubtargetFeature<"ptx23", "PTXVersion", "PTX_VERSION_2_3", - "Use PTX Language Version 2.3">; - -//===- PTX Target ---------------------------------------------------------===// - -def FeatureSM10 : SubtargetFeature<"sm10", "PTXTarget", "PTX_SM_1_0", - "Use Shader Model 1.0">; -def FeatureSM11 : SubtargetFeature<"sm11", "PTXTarget", "PTX_SM_1_1", - "Use Shader Model 1.1">; -def FeatureSM12 : SubtargetFeature<"sm12", "PTXTarget", "PTX_SM_1_2", - "Use Shader Model 1.2">; -def FeatureSM13 : SubtargetFeature<"sm13", "PTXTarget", "PTX_SM_1_3", - "Use Shader Model 1.3">; -def FeatureSM20 : SubtargetFeature<"sm20", "PTXTarget", "PTX_SM_2_0", - "Use Shader Model 2.0", [FeatureDouble]>; -def FeatureSM21 : SubtargetFeature<"sm21", "PTXTarget", "PTX_SM_2_1", - "Use Shader Model 2.1", [FeatureDouble]>; -def FeatureSM22 : SubtargetFeature<"sm22", "PTXTarget", "PTX_SM_2_2", - "Use Shader Model 2.2", [FeatureDouble]>; -def FeatureSM23 : 
SubtargetFeature<"sm23", "PTXTarget", "PTX_SM_2_3", - "Use Shader Model 2.3", [FeatureDouble]>; - -def FeatureCOMPUTE10 : SubtargetFeature<"compute10", "PTXTarget", - "PTX_COMPUTE_1_0", - "Use Compute Compatibility 1.0">; -def FeatureCOMPUTE11 : SubtargetFeature<"compute11", "PTXTarget", - "PTX_COMPUTE_1_1", - "Use Compute Compatibility 1.1">; -def FeatureCOMPUTE12 : SubtargetFeature<"compute12", "PTXTarget", - "PTX_COMPUTE_1_2", - "Use Compute Compatibility 1.2">; -def FeatureCOMPUTE13 : SubtargetFeature<"compute13", "PTXTarget", - "PTX_COMPUTE_1_3", - "Use Compute Compatibility 1.3">; -def FeatureCOMPUTE20 : SubtargetFeature<"compute20", "PTXTarget", - "PTX_COMPUTE_2_0", - "Use Compute Compatibility 2.0", - [FeatureDouble]>; - -//===----------------------------------------------------------------------===// -// PTX supported processors -//===----------------------------------------------------------------------===// - -class Proc<string Name, list<SubtargetFeature> Features> - : Processor<Name, NoItineraries, Features>; - -def : Proc<"generic", []>; - -// Processor definitions for compute/shader models -def : Proc<"compute_10", [FeatureCOMPUTE10]>; -def : Proc<"compute_11", [FeatureCOMPUTE11]>; -def : Proc<"compute_12", [FeatureCOMPUTE12]>; -def : Proc<"compute_13", [FeatureCOMPUTE13]>; -def : Proc<"compute_20", [FeatureCOMPUTE20]>; -def : Proc<"sm_10", [FeatureSM10]>; -def : Proc<"sm_11", [FeatureSM11]>; -def : Proc<"sm_12", [FeatureSM12]>; -def : Proc<"sm_13", [FeatureSM13]>; -def : Proc<"sm_20", [FeatureSM20]>; -def : Proc<"sm_21", [FeatureSM21]>; -def : Proc<"sm_22", [FeatureSM22]>; -def : Proc<"sm_23", [FeatureSM23]>; - -// Processor definitions for common GPU architectures -def : Proc<"g80", [FeatureSM10]>; -def : Proc<"gt200", [FeatureSM13]>; -def : Proc<"gf100", [FeatureSM20, FeatureDouble]>; -def : Proc<"fermi", [FeatureSM20, FeatureDouble]>; - -//===----------------------------------------------------------------------===// -// Register File Description -//===----------------------------------------------------------------------===// - -include "PTXRegisterInfo.td" - -//===----------------------------------------------------------------------===// -// Instruction Descriptions -//===----------------------------------------------------------------------===// - -include "PTXInstrInfo.td" - -def PTXInstrInfo : InstrInfo; - -//===----------------------------------------------------------------------===// -// Assembly printer -//===----------------------------------------------------------------------===// -// PTX uses the MC printer for asm output, so make sure the TableGen -// AsmWriter bits get associated with the correct class. -def PTXAsmWriter : AsmWriter { - string AsmWriterClassName = "InstPrinter"; - bit isMCAsmWriter = 1; -} - -//===----------------------------------------------------------------------===// -// Target Declaration -//===----------------------------------------------------------------------===// - -def PTX : Target { - let InstructionSet = PTXInstrInfo; - let AssemblyWriters = [PTXAsmWriter]; -} diff --git a/lib/Target/PTX/PTXAsmPrinter.cpp b/lib/Target/PTX/PTXAsmPrinter.cpp deleted file mode 100644 index 0b6ac7b..0000000 --- a/lib/Target/PTX/PTXAsmPrinter.cpp +++ /dev/null @@ -1,561 +0,0 @@ -//===-- PTXAsmPrinter.cpp - PTX LLVM assembly writer ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
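In PTX.td above, FeatureSM20 and the later shader-model features list [FeatureDouble], so selecting sm_20 or newer implicitly enables double support; on older targets .f64 is demoted to .f32. A minimal model of implied-feature resolution; the real logic is generated by TableGen from those SubtargetFeature records, and the bit names here are illustrative:

#include <cassert>

enum : unsigned {
  FeatureDouble = 1u << 0,
  FeatureSM13   = 1u << 1,
  FeatureSM20   = 1u << 2
};

// Enable every feature implied by the requested set, as the
// [FeatureDouble] list on FeatureSM20 does in the .td file above.
static unsigned resolveFeatures(unsigned Bits) {
  if (Bits & FeatureSM20)
    Bits |= FeatureDouble;
  return Bits;
}

int main() {
  assert(resolveFeatures(FeatureSM20) & FeatureDouble);    // implied
  assert(!(resolveFeatures(FeatureSM13) & FeatureDouble)); // sm13 demotes f64
  return 0;
}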
-// -//===----------------------------------------------------------------------===// -// -// This file contains a printer that converts from our internal representation -// of machine-dependent LLVM code to PTX assembly language. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "ptx-asm-printer" - -#include "PTXAsmPrinter.h" -#include "PTX.h" -#include "PTXMachineFunctionInfo.h" -#include "PTXParamManager.h" -#include "PTXRegisterInfo.h" -#include "PTXTargetMachine.h" -#include "llvm/Argument.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Function.h" -#include "llvm/Module.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/Twine.h" -#include "llvm/Analysis/DebugInfo.h" -#include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Target/Mangler.h" -#include "llvm/Target/TargetLoweringObjectFile.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/Path.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -static const char PARAM_PREFIX[] = "__param_"; -static const char RETURN_PREFIX[] = "__ret_"; - -static const char *getRegisterTypeName(unsigned RegType) { - switch (RegType) { - default: - llvm_unreachable("Unknown register type"); - case PTXRegisterType::Pred: - return ".pred"; - case PTXRegisterType::B16: - return ".b16"; - case PTXRegisterType::B32: - return ".b32"; - case PTXRegisterType::B64: - return ".b64"; - case PTXRegisterType::F32: - return ".f32"; - case PTXRegisterType::F64: - return ".f64"; - } -} - -static const char *getStateSpaceName(unsigned addressSpace) { - switch (addressSpace) { - default: llvm_unreachable("Unknown state space"); - case PTXStateSpace::Global: return "global"; - case PTXStateSpace::Constant: return "const"; - case PTXStateSpace::Local: return "local"; - case PTXStateSpace::Parameter: return "param"; - case PTXStateSpace::Shared: return "shared"; - } -} - -static const char *getTypeName(Type* type) { - while (true) { - switch (type->getTypeID()) { - default: llvm_unreachable("Unknown type"); - case Type::FloatTyID: return ".f32"; - case Type::DoubleTyID: return ".f64"; - case Type::IntegerTyID: - switch (type->getPrimitiveSizeInBits()) { - default: llvm_unreachable("Unknown integer bit-width"); - case 16: return ".u16"; - case 32: return ".u32"; - case 64: return ".u64"; - } - case Type::ArrayTyID: - case Type::PointerTyID: - type = dyn_cast<SequentialType>(type)->getElementType(); - break; - } - } - return NULL; -} - -bool PTXAsmPrinter::doFinalization(Module &M) { - // XXX Temproarily remove global variables so that doFinalization() will not - // emit them again (global variables are emitted at beginning). 
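The getTypeName() helper deleted above loops until it reaches a scalar: arrays and pointers are repeatedly stripped to their element type, and only scalar types map to a PTX suffix. A minimal standalone sketch of that shape, where the Ty struct is an invented stand-in for llvm::Type:

#include <cstdio>
#include <cstdlib>

struct Ty {
  enum Kind { F32, F64, Int, Array, Pointer } K;
  unsigned Bits;      // valid for Int
  const Ty *Element;  // valid for Array/Pointer
};

const char *getTypeName(const Ty *T) {
  for (;;) {
    switch (T->K) {
    case Ty::F32: return ".f32";
    case Ty::F64: return ".f64";
    case Ty::Int:
      if (T->Bits == 16) return ".u16";
      if (T->Bits == 32) return ".u32";
      if (T->Bits == 64) return ".u64";
      std::fprintf(stderr, "unknown integer bit-width\n");
      std::abort();
    case Ty::Array:
    case Ty::Pointer:
      T = T->Element;  // strip one level and retry
      break;
    }
  }
}

int main() {
  Ty I32{Ty::Int, 32, nullptr};
  Ty Arr{Ty::Array, 0, &I32};
  std::printf("%s\n", getTypeName(&Arr));  // prints .u32
}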
- - Module::GlobalListType &global_list = M.getGlobalList(); - int i, n = global_list.size(); - GlobalVariable **gv_array = new GlobalVariable* [n]; - - // first, back-up GlobalVariable in gv_array - i = 0; - for (Module::global_iterator I = global_list.begin(), E = global_list.end(); - I != E; ++I) - gv_array[i++] = &*I; - - // second, empty global_list - while (!global_list.empty()) - global_list.remove(global_list.begin()); - - // call doFinalization - bool ret = AsmPrinter::doFinalization(M); - - // now we restore global variables - for (i = 0; i < n; i ++) - global_list.insert(global_list.end(), gv_array[i]); - - delete[] gv_array; - return ret; -} - -void PTXAsmPrinter::EmitStartOfAsmFile(Module &M) -{ - const PTXSubtarget& ST = TM.getSubtarget<PTXSubtarget>(); - - // Emit the PTX .version and .target attributes - OutStreamer.EmitRawText(Twine("\t.version ") + ST.getPTXVersionString()); - OutStreamer.EmitRawText(Twine("\t.target ") + ST.getTargetString() + - (ST.supportsDouble() ? "" - : ", map_f64_to_f32")); - // .address_size directive is optional, but it must immediately follow - // the .target directive if present within a module - if (ST.supportsPTX23()) { - const char *addrSize = ST.is64Bit() ? "64" : "32"; - OutStreamer.EmitRawText(Twine("\t.address_size ") + addrSize); - } - - OutStreamer.AddBlankLine(); - - // Define any .file directives - DebugInfoFinder DbgFinder; - DbgFinder.processModule(M); - - for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(), - E = DbgFinder.compile_unit_end(); I != E; ++I) { - DICompileUnit DIUnit(*I); - StringRef FN = DIUnit.getFilename(); - StringRef Dir = DIUnit.getDirectory(); - GetOrCreateSourceID(FN, Dir); - } - - OutStreamer.AddBlankLine(); - - // declare external functions - for (Module::const_iterator i = M.begin(), e = M.end(); - i != e; ++i) - EmitFunctionDeclaration(i); - - // declare global variables - for (Module::const_global_iterator i = M.global_begin(), e = M.global_end(); - i != e; ++i) - EmitVariableDeclaration(i); -} - -void PTXAsmPrinter::EmitFunctionBodyStart() { - OutStreamer.EmitRawText(Twine("{")); - - const PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>(); - const PTXParamManager &PM = MFI->getParamManager(); - - // Print register definitions - SmallString<128> regDefs; - raw_svector_ostream os(regDefs); - unsigned numRegs; - - // pred - numRegs = MFI->countRegisters(PTXRegisterType::Pred, PTXRegisterSpace::Reg); - if(numRegs > 0) - os << "\t.reg .pred %p<" << numRegs << ">;\n"; - - // i16 - numRegs = MFI->countRegisters(PTXRegisterType::B16, PTXRegisterSpace::Reg); - if(numRegs > 0) - os << "\t.reg .b16 %rh<" << numRegs << ">;\n"; - - // i32 - numRegs = MFI->countRegisters(PTXRegisterType::B32, PTXRegisterSpace::Reg); - if(numRegs > 0) - os << "\t.reg .b32 %r<" << numRegs << ">;\n"; - - // i64 - numRegs = MFI->countRegisters(PTXRegisterType::B64, PTXRegisterSpace::Reg); - if(numRegs > 0) - os << "\t.reg .b64 %rd<" << numRegs << ">;\n"; - - // f32 - numRegs = MFI->countRegisters(PTXRegisterType::F32, PTXRegisterSpace::Reg); - if(numRegs > 0) - os << "\t.reg .f32 %f<" << numRegs << ">;\n"; - - // f64 - numRegs = MFI->countRegisters(PTXRegisterType::F64, PTXRegisterSpace::Reg); - if(numRegs > 0) - os << "\t.reg .f64 %fd<" << numRegs << ">;\n"; - - // Local params - for (PTXParamManager::param_iterator i = PM.local_begin(), e = PM.local_end(); - i != e; ++i) - os << "\t.param .b" << PM.getParamSize(*i) << ' ' << PM.getParamName(*i) - << ";\n"; - - OutStreamer.EmitRawText(os.str()); - - - 
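The block above emits one declaration per register bank actually used, in the PTX "%prefix<N>" form, and skips empty banks. A self-contained sketch of the same loop, with the Bank struct standing in for the MFI->countRegisters() queries and the output stream for OutStreamer:

#include <iostream>
#include <sstream>
#include <string>

struct Bank { const char *Type; const char *Prefix; unsigned Count; };

std::string emitRegisterDecls(const Bank *Banks, unsigned N) {
  std::ostringstream OS;
  for (unsigned i = 0; i != N; ++i)
    if (Banks[i].Count > 0)  // skip banks with no virtual registers
      OS << "\t.reg " << Banks[i].Type << " %" << Banks[i].Prefix
         << '<' << Banks[i].Count << ">;\n";
  return OS.str();
}

int main() {
  Bank Banks[] = {
    { ".pred", "p", 2 },  // predicate registers
    { ".b32",  "r", 5 },  // 32-bit integer registers
    { ".f32",  "f", 0 },  // unused bank: emits nothing
  };
  std::cout << emitRegisterDecls(Banks, 3);
}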
const MachineFrameInfo* FrameInfo = MF->getFrameInfo(); - DEBUG(dbgs() << "Have " << FrameInfo->getNumObjects() - << " frame object(s)\n"); - for (unsigned i = 0, e = FrameInfo->getNumObjects(); i != e; ++i) { - DEBUG(dbgs() << "Size of object: " << FrameInfo->getObjectSize(i) << "\n"); - if (FrameInfo->getObjectSize(i) > 0) { - OutStreamer.EmitRawText("\t.local .align " + - Twine(FrameInfo->getObjectAlignment(i)) + - " .b8 __local" + - Twine(i) + - "[" + - Twine(FrameInfo->getObjectSize(i)) + - "];"); - } - } - - //unsigned Index = 1; - // Print parameter passing params - //for (PTXMachineFunctionInfo::param_iterator - // i = MFI->paramBegin(), e = MFI->paramEnd(); i != e; ++i) { - // std::string def = "\t.param .b"; - // def += utostr(*i); - // def += " __ret_"; - // def += utostr(Index); - // Index++; - // def += ";"; - // OutStreamer.EmitRawText(Twine(def)); - //} -} - -void PTXAsmPrinter::EmitFunctionBodyEnd() { - OutStreamer.EmitRawText(Twine("}")); -} - -void PTXAsmPrinter::EmitInstruction(const MachineInstr *MI) { - MCInst TmpInst; - LowerPTXMachineInstrToMCInst(MI, TmpInst, *this); - OutStreamer.EmitInstruction(TmpInst); -} - -void PTXAsmPrinter::EmitVariableDeclaration(const GlobalVariable *gv) { - // Check to see if this is a special global used by LLVM, if so, emit it. - if (EmitSpecialLLVMGlobal(gv)) - return; - - MCSymbol *gvsym = Mang->getSymbol(gv); - - assert(gvsym->isUndefined() && "Cannot define a symbol twice!"); - - SmallString<128> decl; - raw_svector_ostream os(decl); - - // check if it is defined in some other translation unit - if (gv->isDeclaration()) - os << ".extern "; - - // state space: e.g., .global - os << '.' << getStateSpaceName(gv->getType()->getAddressSpace()) << ' '; - - // alignment (optional) - unsigned alignment = gv->getAlignment(); - if (alignment != 0) - os << ".align " << gv->getAlignment() << ' '; - - - if (PointerType::classof(gv->getType())) { - PointerType* pointerTy = dyn_cast<PointerType>(gv->getType()); - Type* elementTy = pointerTy->getElementType(); - - if (elementTy->isArrayTy()) { - assert(elementTy->isArrayTy() && "Only pointers to arrays are supported"); - - ArrayType* arrayTy = dyn_cast<ArrayType>(elementTy); - elementTy = arrayTy->getElementType(); - - unsigned numElements = arrayTy->getNumElements(); - - while (elementTy->isArrayTy()) { - arrayTy = dyn_cast<ArrayType>(elementTy); - elementTy = arrayTy->getElementType(); - - numElements *= arrayTy->getNumElements(); - } - - // FIXME: isPrimitiveType() == false for i16? - assert(elementTy->isSingleValueType() && - "Non-primitive types are not handled"); - - // Find the size of the element in bits - unsigned elementSize = elementTy->getPrimitiveSizeInBits(); - - os << ".b" << elementSize << ' ' << gvsym->getName() - << '[' << numElements << ']'; - } else { - os << ".b8" << gvsym->getName() << "[]"; - } - - // handle string constants (assume ConstantArray means string) - if (gv->hasInitializer()) { - const Constant *C = gv->getInitializer(); - if (const ConstantArray *CA = dyn_cast<ConstantArray>(C)) { - os << " = {"; - - for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) { - if (i > 0) - os << ','; - - os << "0x"; - os.write_hex(cast<ConstantInt>(CA->getOperand(i))->getZExtValue()); - } - - os << '}'; - } - } - } else { - // Note: this is currently the fall-through case and most likely generates - // incorrect code. 
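EmitVariableDeclaration() above flattens nested array types by multiplying out every dimension, so a [4 x [8 x i32]] global is printed as a single .b32 array of 32 cells. A standalone sketch of the flattening, where ArrTy is an invented stand-in for llvm::ArrayType:

#include <cstdio>

struct ArrTy { unsigned NumElements; const ArrTy *Inner; unsigned ElemBits; };

void flatten(const ArrTy *T, unsigned &NumElements, unsigned &ElemBits) {
  NumElements = T->NumElements;
  while (T->Inner) {  // multiply out every nested dimension
    T = T->Inner;
    NumElements *= T->NumElements;
  }
  ElemBits = T->ElemBits;
}

int main() {
  ArrTy Inner{8, nullptr, 32};   // innermost: [8 x i32]
  ArrTy Outer{4, &Inner, 0};     // outer: [4 x [8 x i32]]
  unsigned N, Bits;
  flatten(&Outer, N, Bits);
  std::printf(".b%u arr[%u];\n", Bits, N);  // prints .b32 arr[32];
}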
- os << getTypeName(gv->getType()) << ' ' << gvsym->getName(); - - if (isa<ArrayType>(gv->getType()) || isa<PointerType>(gv->getType())) - os << "[]"; - } - - os << ';'; - - OutStreamer.EmitRawText(os.str()); - OutStreamer.AddBlankLine(); -} - -void PTXAsmPrinter::EmitFunctionEntryLabel() { - // The function label could have already been emitted if two symbols end up - // conflicting due to asm renaming. Detect this and emit an error. - if (!CurrentFnSym->isUndefined()) - report_fatal_error("'" + Twine(CurrentFnSym->getName()) + - "' label emitted multiple times to assembly file"); - - const PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>(); - const PTXParamManager &PM = MFI->getParamManager(); - const bool isKernel = MFI->isKernel(); - const PTXSubtarget& ST = TM.getSubtarget<PTXSubtarget>(); - - SmallString<128> decl; - raw_svector_ostream os(decl); - os << (isKernel ? ".entry" : ".func"); - - if (!isKernel) { - os << " ("; - if (ST.useParamSpaceForDeviceArgs()) { - for (PTXParamManager::param_iterator i = PM.ret_begin(), e = PM.ret_end(), - b = i; i != e; ++i) { - if (i != b) - os << ", "; - - os << ".param .b" << PM.getParamSize(*i) << ' ' << PM.getParamName(*i); - } - } else { - for (PTXMachineFunctionInfo::reg_iterator - i = MFI->retreg_begin(), e = MFI->retreg_end(), b = i; - i != e; ++i) { - if (i != b) - os << ", "; - - os << ".reg " << getRegisterTypeName(MFI->getRegisterType(*i)) << ' ' - << MFI->getRegisterName(*i); - } - } - os << ')'; - } - - // Print function name - os << ' ' << CurrentFnSym->getName() << " ("; - - const Function *F = MF->getFunction(); - - // Print parameters - if (isKernel || ST.useParamSpaceForDeviceArgs()) { - /*for (PTXParamManager::param_iterator i = PM.arg_begin(), e = PM.arg_end(), - b = i; i != e; ++i) { - if (i != b) - os << ", "; - - os << ".param .b" << PM.getParamSize(*i) << ' ' << PM.getParamName(*i); - }*/ - int Counter = 1; - for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end(), - b = i; i != e; ++i) { - if (i != b) - os << ", "; - const Type *ArgType = (*i).getType(); - os << ".param .b"; - if (ArgType->isPointerTy()) { - if (ST.is64Bit()) - os << "64"; - else - os << "32"; - } else { - os << ArgType->getPrimitiveSizeInBits(); - } - if (ArgType->isPointerTy() && ST.emitPtrAttribute()) { - const PointerType *PtrType = dyn_cast<const PointerType>(ArgType); - os << " .ptr"; - switch (PtrType->getAddressSpace()) { - default: - llvm_unreachable("Unknown address space in argument"); - case PTXStateSpace::Global: - os << " .global"; - break; - case PTXStateSpace::Shared: - os << " .shared"; - break; - } - } - os << " __param_" << Counter++; - } - } else { - for (PTXMachineFunctionInfo::reg_iterator - i = MFI->argreg_begin(), e = MFI->argreg_end(), b = i; - i != e; ++i) { - if (i != b) - os << ", "; - - os << ".reg " << getRegisterTypeName(MFI->getRegisterType(*i)) << ' ' - << MFI->getRegisterName(*i); - } - } - os << ')'; - - OutStreamer.EmitRawText(os.str()); -} - -void PTXAsmPrinter::EmitFunctionDeclaration(const Function* func) -{ - const PTXSubtarget& ST = TM.getSubtarget<PTXSubtarget>(); - - std::string decl = ""; - - // hard-coded emission of extern vprintf function - - if (func->getName() == "printf" || func->getName() == "puts") { - decl += ".extern .func (.param .b32 __param_1) vprintf (.param .b"; - if (ST.is64Bit()) - decl += "64"; - else - decl += "32"; - decl += " __param_2, .param .b"; - if (ST.is64Bit()) - decl += "64"; - else - decl += "32"; - decl += " __param_3)\n"; - } - - 
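The entry-label code above sizes each .param by the argument type: pointer arguments take the subtarget's address size (32 or 64 bits), everything else its primitive bit width. A sketch of that selection, assuming an invented ArgInfo in place of the Function argument iterator and a bool in place of PTXSubtarget::is64Bit():

#include <iostream>
#include <sstream>
#include <string>

struct ArgInfo { bool IsPointer; unsigned PrimitiveBits; };

std::string buildParamList(const ArgInfo *Args, unsigned N, bool Is64Bit) {
  std::ostringstream OS;
  for (unsigned i = 0; i != N; ++i) {
    if (i) OS << ", ";
    unsigned Bits = Args[i].IsPointer ? (Is64Bit ? 64u : 32u)
                                      : Args[i].PrimitiveBits;
    OS << ".param .b" << Bits << " __param_" << (i + 1);
  }
  return OS.str();
}

int main() {
  ArgInfo Args[] = { { true, 0 }, { false, 32 } };
  std::cout << buildParamList(Args, 2, /*Is64Bit=*/true) << "\n";
  // prints: .param .b64 __param_1, .param .b32 __param_2
}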
OutStreamer.EmitRawText(Twine(decl)); -} - -unsigned PTXAsmPrinter::GetOrCreateSourceID(StringRef FileName, - StringRef DirName) { - // If FE did not provide a file name, then assume stdin. - if (FileName.empty()) - return GetOrCreateSourceID("<stdin>", StringRef()); - - // MCStream expects full path name as filename. - if (!DirName.empty() && !sys::path::is_absolute(FileName)) { - SmallString<128> FullPathName = DirName; - sys::path::append(FullPathName, FileName); - // Here FullPathName will be copied into StringMap by GetOrCreateSourceID. - return GetOrCreateSourceID(StringRef(FullPathName), StringRef()); - } - - StringMapEntry<unsigned> &Entry = SourceIdMap.GetOrCreateValue(FileName); - if (Entry.getValue()) - return Entry.getValue(); - - unsigned SrcId = SourceIdMap.size(); - Entry.setValue(SrcId); - - // Print out a .file directive to specify files for .loc directives. - OutStreamer.EmitDwarfFileDirective(SrcId, "", Entry.getKey()); - - return SrcId; -} - -MCOperand PTXAsmPrinter::GetSymbolRef(const MachineOperand &MO, - const MCSymbol *Symbol) { - const MCExpr *Expr; - Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, OutContext); - return MCOperand::CreateExpr(Expr); -} - -MCOperand PTXAsmPrinter::lowerOperand(const MachineOperand &MO) { - MCOperand MCOp; - const PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>(); - unsigned EncodedReg; - switch (MO.getType()) { - default: - llvm_unreachable("Unknown operand type"); - case MachineOperand::MO_Register: - if (MO.getReg() > 0) { - // Encode the register - EncodedReg = MFI->getEncodedRegister(MO.getReg()); - } else { - EncodedReg = 0; - } - MCOp = MCOperand::CreateReg(EncodedReg); - break; - case MachineOperand::MO_Immediate: - MCOp = MCOperand::CreateImm(MO.getImm()); - break; - case MachineOperand::MO_MachineBasicBlock: - MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create( - MO.getMBB()->getSymbol(), OutContext)); - break; - case MachineOperand::MO_GlobalAddress: - MCOp = GetSymbolRef(MO, Mang->getSymbol(MO.getGlobal())); - break; - case MachineOperand::MO_ExternalSymbol: - MCOp = GetSymbolRef(MO, GetExternalSymbolSymbol(MO.getSymbolName())); - break; - case MachineOperand::MO_FPImmediate: - APFloat Val = MO.getFPImm()->getValueAPF(); - bool ignored; - Val.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored); - MCOp = MCOperand::CreateFPImm(Val.convertToDouble()); - break; - } - - return MCOp; -} - -// Force static initialization. -extern "C" void LLVMInitializePTXAsmPrinter() { - RegisterAsmPrinter<PTXAsmPrinter> X(ThePTX32Target); - RegisterAsmPrinter<PTXAsmPrinter> Y(ThePTX64Target); -} diff --git a/lib/Target/PTX/PTXAsmPrinter.h b/lib/Target/PTX/PTXAsmPrinter.h deleted file mode 100644 index 74c8d58..0000000 --- a/lib/Target/PTX/PTXAsmPrinter.h +++ /dev/null @@ -1,57 +0,0 @@ -//===-- PTXAsmPrinter.h - Print machine code to a PTX file ------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// PTX Assembly printer class. 
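GetOrCreateSourceID() above interns file names into dense 1-based IDs so each .file directive is emitted exactly once per file. The same scheme in a standalone sketch, with std::map standing in for llvm::StringMap:

#include <cstdio>
#include <map>
#include <string>

unsigned getOrCreateSourceID(std::map<std::string, unsigned> &Ids,
                             const std::string &FileName) {
  unsigned &Id = Ids[FileName];  // inserts 0 on first sight
  if (Id == 0)
    Id = static_cast<unsigned>(Ids.size());  // first unseen file: new ID
  return Id;
}

int main() {
  std::map<std::string, unsigned> Ids;
  std::printf("%u\n", getOrCreateSourceID(Ids, "a.cu"));  // 1
  std::printf("%u\n", getOrCreateSourceID(Ids, "b.cu"));  // 2
  std::printf("%u\n", getOrCreateSourceID(Ids, "a.cu"));  // 1 again
}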
-// -//===----------------------------------------------------------------------===// - -#ifndef PTXASMPRINTER_H -#define PTXASMPRINTER_H - -#include "PTX.h" -#include "PTXTargetMachine.h" -#include "llvm/ADT/StringMap.h" -#include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/Support/Compiler.h" - -namespace llvm { - -class MCOperand; - -class LLVM_LIBRARY_VISIBILITY PTXAsmPrinter : public AsmPrinter { -public: - explicit PTXAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer) {} - - const char *getPassName() const { return "PTX Assembly Printer"; } - - bool doFinalization(Module &M); - - virtual void EmitStartOfAsmFile(Module &M); - virtual void EmitFunctionBodyStart(); - virtual void EmitFunctionBodyEnd(); - virtual void EmitFunctionEntryLabel(); - virtual void EmitInstruction(const MachineInstr *MI); - - unsigned GetOrCreateSourceID(StringRef FileName, - StringRef DirName); - - MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol); - MCOperand lowerOperand(const MachineOperand &MO); - -private: - void EmitVariableDeclaration(const GlobalVariable *gv); - void EmitFunctionDeclaration(const Function* func); - - StringMap<unsigned> SourceIdMap; -}; // class PTXAsmPrinter -} // namespace llvm - -#endif - diff --git a/lib/Target/PTX/PTXFPRoundingModePass.cpp b/lib/Target/PTX/PTXFPRoundingModePass.cpp deleted file mode 100644 index a21d172..0000000 --- a/lib/Target/PTX/PTXFPRoundingModePass.cpp +++ /dev/null @@ -1,181 +0,0 @@ -//===-- PTXFPRoundingModePass.cpp - Assign rounding modes pass ------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines a machine function pass that sets appropriate FP rounding -// modes for all relevant instructions. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "ptx-fp-rounding-mode" - -#include "PTX.h" -#include "PTXTargetMachine.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -// NOTE: PTXFPRoundingModePass should be executed just before emission. - -namespace { - /// PTXFPRoundingModePass - Pass to assign appropriate FP rounding modes to - /// all FP instructions. Essentially, this pass just looks for all FP - /// instructions that have a rounding mode set to RndDefault, and sets an - /// appropriate rounding mode based on the target device. 
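As the comment above describes, the pass is table-driven: initializeMap() records, per FP opcode, which operand holds the rounding mode and which concrete mode should replace the RndDefault sentinel. A compact standalone sketch of that mechanism; opcode values and the Instr type are invented stand-ins for MachineInstr:

#include <cstdio>
#include <unordered_map>
#include <utility>
#include <vector>

enum Mode { RndDefault = 0, RndNearestEven = 1, RndNone = 2 };
struct Instr { unsigned Opcode; std::vector<int> Operands; };

using RndModeDesc = std::pair<unsigned, Mode>;  // (operand index, mode)

void processInstruction(Instr &MI,
                        const std::unordered_map<unsigned, RndModeDesc> &Map) {
  auto It = Map.find(MI.Opcode);
  if (It == Map.end())
    return;                // instruction needs no rounding mode
  int &Op = MI.Operands[It->second.first];
  if (Op == RndDefault)    // only patch the sentinel, keep explicit modes
    Op = It->second.second;
}

int main() {
  std::unordered_map<unsigned, RndModeDesc> Map;
  enum { FADD = 100 };
  Map[FADD] = { 1, RndNearestEven };  // operand 1 holds the mode
  Instr MI{ FADD, { /*dst*/ 0, RndDefault } };
  processInstruction(MI, Map);
  std::printf("mode operand = %d\n", MI.Operands[1]);  // 1 (RndNearestEven)
}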
- /// - class PTXFPRoundingModePass : public MachineFunctionPass { - private: - static char ID; - - typedef std::pair<unsigned, unsigned> RndModeDesc; - - PTXTargetMachine& TargetMachine; - DenseMap<unsigned, RndModeDesc> Instrs; - - public: - PTXFPRoundingModePass(PTXTargetMachine &TM, CodeGenOpt::Level OptLevel) - : MachineFunctionPass(ID), - TargetMachine(TM) { - initializeMap(); - } - - virtual bool runOnMachineFunction(MachineFunction &MF); - - virtual const char *getPassName() const { - return "PTX FP Rounding Mode Pass"; - } - - private: - - void initializeMap(); - void processInstruction(MachineInstr &MI); - }; // class PTXFPRoundingModePass -} // end anonymous namespace - -using namespace llvm; - -char PTXFPRoundingModePass::ID = 0; - -bool PTXFPRoundingModePass::runOnMachineFunction(MachineFunction &MF) { - // Look at each basic block - for (MachineFunction::iterator bbi = MF.begin(), bbe = MF.end(); bbi != bbe; - ++bbi) { - MachineBasicBlock &MBB = *bbi; - // Look at each instruction - for (MachineBasicBlock::iterator ii = MBB.begin(), ie = MBB.end(); - ii != ie; ++ii) { - MachineInstr &MI = *ii; - processInstruction(MI); - } - } - return false; -} - -void PTXFPRoundingModePass::initializeMap() { - using namespace PTXRoundingMode; - const PTXSubtarget& ST = TargetMachine.getSubtarget<PTXSubtarget>(); - - // Build a map of default rounding mode for all instructions that need a - // rounding mode. - Instrs[PTX::FADDrr32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FADDri32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FADDrr64] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FADDri64] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FSUBrr32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FSUBri32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FSUBrr64] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FSUBri64] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FMULrr32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FMULri32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FMULrr64] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FMULri64] = std::make_pair(1U, (unsigned)RndNearestEven); - - Instrs[PTX::FNEGrr32] = std::make_pair(1U, (unsigned)RndNone); - Instrs[PTX::FNEGri32] = std::make_pair(1U, (unsigned)RndNone); - Instrs[PTX::FNEGrr64] = std::make_pair(1U, (unsigned)RndNone); - Instrs[PTX::FNEGri64] = std::make_pair(1U, (unsigned)RndNone); - - unsigned FDivRndMode = ST.fdivNeedsRoundingMode() ? RndNearestEven : RndNone; - Instrs[PTX::FDIVrr32] = std::make_pair(1U, FDivRndMode); - Instrs[PTX::FDIVri32] = std::make_pair(1U, FDivRndMode); - Instrs[PTX::FDIVrr64] = std::make_pair(1U, FDivRndMode); - Instrs[PTX::FDIVri64] = std::make_pair(1U, FDivRndMode); - - unsigned FMADRndMode = ST.fmadNeedsRoundingMode() ? 
RndNearestEven : RndNone; - Instrs[PTX::FMADrrr32] = std::make_pair(1U, FMADRndMode); - Instrs[PTX::FMADrri32] = std::make_pair(1U, FMADRndMode); - Instrs[PTX::FMADrii32] = std::make_pair(1U, FMADRndMode); - Instrs[PTX::FMADrrr64] = std::make_pair(1U, FMADRndMode); - Instrs[PTX::FMADrri64] = std::make_pair(1U, FMADRndMode); - Instrs[PTX::FMADrii64] = std::make_pair(1U, FMADRndMode); - - Instrs[PTX::FSQRTrr32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FSQRTri32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FSQRTrr64] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FSQRTri64] = std::make_pair(1U, (unsigned)RndNearestEven); - - Instrs[PTX::FSINrr32] = std::make_pair(1U, (unsigned)RndApprox); - Instrs[PTX::FSINri32] = std::make_pair(1U, (unsigned)RndApprox); - Instrs[PTX::FSINrr64] = std::make_pair(1U, (unsigned)RndApprox); - Instrs[PTX::FSINri64] = std::make_pair(1U, (unsigned)RndApprox); - Instrs[PTX::FCOSrr32] = std::make_pair(1U, (unsigned)RndApprox); - Instrs[PTX::FCOSri32] = std::make_pair(1U, (unsigned)RndApprox); - Instrs[PTX::FCOSrr64] = std::make_pair(1U, (unsigned)RndApprox); - Instrs[PTX::FCOSri64] = std::make_pair(1U, (unsigned)RndApprox); - - Instrs[PTX::CVTu16f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - Instrs[PTX::CVTs16f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - Instrs[PTX::CVTu16f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - Instrs[PTX::CVTs16f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - Instrs[PTX::CVTu32f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - Instrs[PTX::CVTs32f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - Instrs[PTX::CVTu32f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - Instrs[PTX::CVTs32f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - Instrs[PTX::CVTu64f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - Instrs[PTX::CVTs64f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - Instrs[PTX::CVTu64f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - Instrs[PTX::CVTs64f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - - Instrs[PTX::CVTf32u16] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf32s16] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf32u32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf32s32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf32u64] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf32s64] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf32f64] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf64u16] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf64s16] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf64u32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf64s32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf64u64] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf64s64] = std::make_pair(1U, (unsigned)RndNearestEven); -} - -void PTXFPRoundingModePass::processInstruction(MachineInstr &MI) { - // Is this an instruction that needs a rounding mode? 
- if (Instrs.count(MI.getOpcode())) { - const RndModeDesc &Desc = Instrs[MI.getOpcode()]; - // Get the rounding mode operand - MachineOperand &Op = MI.getOperand(Desc.first); - // Update the rounding mode if needed - if (Op.getImm() == PTXRoundingMode::RndDefault) { - Op.setImm(Desc.second); - } - } -} - -FunctionPass *llvm::createPTXFPRoundingModePass(PTXTargetMachine &TM, - CodeGenOpt::Level OptLevel) { - return new PTXFPRoundingModePass(TM, OptLevel); -} - diff --git a/lib/Target/PTX/PTXFrameLowering.cpp b/lib/Target/PTX/PTXFrameLowering.cpp deleted file mode 100644 index e6e268e..0000000 --- a/lib/Target/PTX/PTXFrameLowering.cpp +++ /dev/null @@ -1,24 +0,0 @@ -//===-- PTXFrameLowering.cpp - PTX Frame Information ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the PTX implementation of TargetFrameLowering class. -// -//===----------------------------------------------------------------------===// - -#include "PTXFrameLowering.h" -#include "llvm/CodeGen/MachineFunction.h" - -using namespace llvm; - -void PTXFrameLowering::emitPrologue(MachineFunction &MF) const { -} - -void PTXFrameLowering::emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const { -} diff --git a/lib/Target/PTX/PTXFrameLowering.h b/lib/Target/PTX/PTXFrameLowering.h deleted file mode 100644 index 831e818..0000000 --- a/lib/Target/PTX/PTXFrameLowering.h +++ /dev/null @@ -1,44 +0,0 @@ -//===-- PTXFrameLowering.h - Define frame lowering for PTX -----*- C++ -*--===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// -// -//===----------------------------------------------------------------------===// - -#ifndef PTX_FRAMEINFO_H -#define PTX_FRAMEINFO_H - -#include "PTX.h" -#include "PTXSubtarget.h" -#include "llvm/Target/TargetFrameLowering.h" - -namespace llvm { - class PTXSubtarget; - -class PTXFrameLowering : public TargetFrameLowering { -protected: - const PTXSubtarget &STI; - -public: - explicit PTXFrameLowering(const PTXSubtarget &sti) - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2), - STI(sti) { - } - - /// emitProlog/emitEpilog - These methods insert prolog and epilog code into - /// the function. - void emitPrologue(MachineFunction &MF) const; - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; - - bool hasFP(const MachineFunction &MF) const { return false; } -}; - -} // End llvm namespace - -#endif diff --git a/lib/Target/PTX/PTXISelDAGToDAG.cpp b/lib/Target/PTX/PTXISelDAGToDAG.cpp deleted file mode 100644 index 5c7ee29..0000000 --- a/lib/Target/PTX/PTXISelDAGToDAG.cpp +++ /dev/null @@ -1,356 +0,0 @@ -//===-- PTXISelDAGToDAG.cpp - A dag to dag inst selector for PTX ----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines an instruction selector for the PTX target. 
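The frame-lowering hooks deleted above are deliberately empty: PTX has no conventional register stack, so there is no prologue or epilogue code to insert, and frame objects instead surface as the .local arrays printed earlier by the asm printer. A minimal sketch of that shape, where FrameLoweringBase is an invented stand-in for TargetFrameLowering:

#include <cstdio>

struct FrameLoweringBase {
  virtual ~FrameLoweringBase() = default;
  virtual void emitPrologue() const = 0;
  virtual void emitEpilogue() const = 0;
  virtual bool hasFP() const = 0;
};

struct PTXLikeFrameLowering : FrameLoweringBase {
  void emitPrologue() const override {}          // nothing to set up
  void emitEpilogue() const override {}          // nothing to tear down
  bool hasFP() const override { return false; }  // no frame pointer exists
};

int main() {
  PTXLikeFrameLowering FL;
  FL.emitPrologue();
  std::printf("hasFP: %d\n", FL.hasFP());  // 0
}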
-// -//===----------------------------------------------------------------------===// - -#include "PTX.h" -#include "PTXMachineFunctionInfo.h" -#include "PTXTargetMachine.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -namespace { -// PTXDAGToDAGISel - PTX specific code to select PTX machine -// instructions for SelectionDAG operations. -class PTXDAGToDAGISel : public SelectionDAGISel { - public: - PTXDAGToDAGISel(PTXTargetMachine &TM, CodeGenOpt::Level OptLevel); - - virtual const char *getPassName() const { - return "PTX DAG->DAG Pattern Instruction Selection"; - } - - SDNode *Select(SDNode *Node); - - // Complex Pattern Selectors. - bool SelectADDRrr(SDValue &Addr, SDValue &R1, SDValue &R2); - bool SelectADDRri(SDValue &Addr, SDValue &Base, SDValue &Offset); - bool SelectADDRii(SDValue &Addr, SDValue &Base, SDValue &Offset); - bool SelectADDRlocal(SDValue &Addr, SDValue &Base, SDValue &Offset); - - // Include the pieces auto'gened from the target description -#include "PTXGenDAGISel.inc" - - private: - // We need this only because we can't match intruction BRAdp - // pattern (PTXbrcond bb:$d, ...) in PTXInstrInfo.td - SDNode *SelectBRCOND(SDNode *Node); - - SDNode *SelectREADPARAM(SDNode *Node); - SDNode *SelectWRITEPARAM(SDNode *Node); - SDNode *SelectFrameIndex(SDNode *Node); - - bool isImm(const SDValue &operand); - bool SelectImm(const SDValue &operand, SDValue &imm); - - const PTXSubtarget& getSubtarget() const; -}; // class PTXDAGToDAGISel -} // namespace - -// createPTXISelDag - This pass converts a legalized DAG into a -// PTX-specific DAG, ready for instruction scheduling -FunctionPass *llvm::createPTXISelDag(PTXTargetMachine &TM, - CodeGenOpt::Level OptLevel) { - return new PTXDAGToDAGISel(TM, OptLevel); -} - -PTXDAGToDAGISel::PTXDAGToDAGISel(PTXTargetMachine &TM, - CodeGenOpt::Level OptLevel) - : SelectionDAGISel(TM, OptLevel) {} - -SDNode *PTXDAGToDAGISel::Select(SDNode *Node) { - switch (Node->getOpcode()) { - case ISD::BRCOND: - return SelectBRCOND(Node); - case PTXISD::READ_PARAM: - return SelectREADPARAM(Node); - case PTXISD::WRITE_PARAM: - return SelectWRITEPARAM(Node); - case ISD::FrameIndex: - return SelectFrameIndex(Node); - default: - return SelectCode(Node); - } -} - -SDNode *PTXDAGToDAGISel::SelectBRCOND(SDNode *Node) { - assert(Node->getNumOperands() >= 3); - - SDValue Chain = Node->getOperand(0); - SDValue Pred = Node->getOperand(1); - SDValue Target = Node->getOperand(2); // branch target - SDValue PredOp = CurDAG->getTargetConstant(PTXPredicate::Normal, MVT::i32); - DebugLoc dl = Node->getDebugLoc(); - - assert(Target.getOpcode() == ISD::BasicBlock); - assert(Pred.getValueType() == MVT::i1); - - // Emit BRAdp - SDValue Ops[] = { Target, Pred, PredOp, Chain }; - return CurDAG->getMachineNode(PTX::BRAdp, dl, MVT::Other, Ops, 4); -} - -SDNode *PTXDAGToDAGISel::SelectREADPARAM(SDNode *Node) { - SDValue Chain = Node->getOperand(0); - SDValue Index = Node->getOperand(1); - - int OpCode; - - // Get the type of parameter we are reading - EVT VT = Node->getValueType(0); - assert(VT.isSimple() && "READ_PARAM only implemented for MVT types"); - - MVT Type = VT.getSimpleVT(); - - if (Type == MVT::i1) - OpCode = PTX::READPARAMPRED; - else if (Type == MVT::i16) - OpCode = PTX::READPARAMI16; - else if (Type == MVT::i32) - OpCode = PTX::READPARAMI32; - else if (Type == MVT::i64) - OpCode = PTX::READPARAMI64; - 
else if (Type == MVT::f32) - OpCode = PTX::READPARAMF32; - else { - assert(Type == MVT::f64 && "Unexpected type!"); - OpCode = PTX::READPARAMF64; - } - - SDValue Pred = CurDAG->getRegister(PTX::NoRegister, MVT::i1); - SDValue PredOp = CurDAG->getTargetConstant(PTXPredicate::None, MVT::i32); - DebugLoc dl = Node->getDebugLoc(); - - SDValue Ops[] = { Index, Pred, PredOp, Chain }; - return CurDAG->getMachineNode(OpCode, dl, VT, Ops, 4); -} - -SDNode *PTXDAGToDAGISel::SelectWRITEPARAM(SDNode *Node) { - - SDValue Chain = Node->getOperand(0); - SDValue Value = Node->getOperand(1); - - int OpCode; - - //Node->dumpr(CurDAG); - - // Get the type of parameter we are writing - EVT VT = Value->getValueType(0); - assert(VT.isSimple() && "WRITE_PARAM only implemented for MVT types"); - - MVT Type = VT.getSimpleVT(); - - if (Type == MVT::i1) - OpCode = PTX::WRITEPARAMPRED; - else if (Type == MVT::i16) - OpCode = PTX::WRITEPARAMI16; - else if (Type == MVT::i32) - OpCode = PTX::WRITEPARAMI32; - else if (Type == MVT::i64) - OpCode = PTX::WRITEPARAMI64; - else if (Type == MVT::f32) - OpCode = PTX::WRITEPARAMF32; - else if (Type == MVT::f64) - OpCode = PTX::WRITEPARAMF64; - else - llvm_unreachable("Invalid type in SelectWRITEPARAM"); - - SDValue Pred = CurDAG->getRegister(PTX::NoRegister, MVT::i1); - SDValue PredOp = CurDAG->getTargetConstant(PTXPredicate::None, MVT::i32); - DebugLoc dl = Node->getDebugLoc(); - - SDValue Ops[] = { Value, Pred, PredOp, Chain }; - SDNode* Ret = CurDAG->getMachineNode(OpCode, dl, MVT::Other, Ops, 4); - - //dbgs() << "SelectWRITEPARAM produced:\n\t"; - //Ret->dumpr(CurDAG); - - return Ret; -} - -SDNode *PTXDAGToDAGISel::SelectFrameIndex(SDNode *Node) { - int FI = cast<FrameIndexSDNode>(Node)->getIndex(); - //dbgs() << "Selecting FrameIndex at index " << FI << "\n"; - //SDValue TFI = CurDAG->getTargetFrameIndex(FI, Node->getValueType(0)); - - PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>(); - - SDValue FrameSymbol = CurDAG->getTargetExternalSymbol(MFI->getFrameSymbol(FI), - Node->getValueType(0)); - - return FrameSymbol.getNode(); -} - -// Match memory operand of the form [reg+reg] -bool PTXDAGToDAGISel::SelectADDRrr(SDValue &Addr, SDValue &R1, SDValue &R2) { - if (Addr.getOpcode() != ISD::ADD || Addr.getNumOperands() < 2 || - isImm(Addr.getOperand(0)) || isImm(Addr.getOperand(1))) - return false; - - assert(Addr.getValueType().isSimple() && "Type must be simple"); - - R1 = Addr; - R2 = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT()); - - return true; -} - -// Match memory operand of the form [reg], [imm+reg], and [reg+imm] -bool PTXDAGToDAGISel::SelectADDRri(SDValue &Addr, SDValue &Base, - SDValue &Offset) { - // FrameIndex addresses are handled separately - //errs() << "SelectADDRri: "; - //Addr.getNode()->dumpr(); - if (isa<FrameIndexSDNode>(Addr)) { - //errs() << "Failure\n"; - return false; - } - - if (CurDAG->isBaseWithConstantOffset(Addr)) { - Base = Addr.getOperand(0); - if (isa<FrameIndexSDNode>(Base)) { - //errs() << "Failure\n"; - return false; - } - ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)); - Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32); - //errs() << "Success\n"; - return true; - } - - /*if (Addr.getNumOperands() == 1) { - Base = Addr; - Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT()); - errs() << "Success\n"; - return true; - }*/ - - //errs() << "SelectADDRri fails on: "; - //Addr.getNode()->dumpr(); - - if (isImm(Addr)) { - //errs() << "Failure\n"; - return 
false; - } - - Base = Addr; - Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT()); - - //errs() << "Success\n"; - return true; - - /*if (Addr.getOpcode() != ISD::ADD) { - // let SelectADDRii handle the [imm] case - if (isImm(Addr)) - return false; - // it is [reg] - - assert(Addr.getValueType().isSimple() && "Type must be simple"); - Base = Addr; - Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT()); - - return true; - } - - if (Addr.getNumOperands() < 2) - return false; - - // let SelectADDRii handle the [imm+imm] case - if (isImm(Addr.getOperand(0)) && isImm(Addr.getOperand(1))) - return false; - - // try [reg+imm] and [imm+reg] - for (int i = 0; i < 2; i ++) - if (SelectImm(Addr.getOperand(1-i), Offset)) { - Base = Addr.getOperand(i); - return true; - } - - // neither [reg+imm] nor [imm+reg] - return false;*/ -} - -// Match memory operand of the form [imm+imm] and [imm] -bool PTXDAGToDAGISel::SelectADDRii(SDValue &Addr, SDValue &Base, - SDValue &Offset) { - // is [imm+imm]? - if (Addr.getOpcode() == ISD::ADD) { - return SelectImm(Addr.getOperand(0), Base) && - SelectImm(Addr.getOperand(1), Offset); - } - - // is [imm]? - if (SelectImm(Addr, Base)) { - assert(Addr.getValueType().isSimple() && "Type must be simple"); - - Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT()); - - return true; - } - - return false; -} - -// Match memory operand of the form [reg], [imm+reg], and [reg+imm] -bool PTXDAGToDAGISel::SelectADDRlocal(SDValue &Addr, SDValue &Base, - SDValue &Offset) { - //errs() << "SelectADDRlocal: "; - //Addr.getNode()->dumpr(); - if (isa<FrameIndexSDNode>(Addr)) { - Base = Addr; - Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT()); - //errs() << "Success\n"; - return true; - } - - if (CurDAG->isBaseWithConstantOffset(Addr)) { - Base = Addr.getOperand(0); - if (!isa<FrameIndexSDNode>(Base)) { - //errs() << "Failure\n"; - return false; - } - ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)); - Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32); - //errs() << "Offset: "; - //Offset.getNode()->dumpr(); - //errs() << "Success\n"; - return true; - } - - //errs() << "Failure\n"; - return false; -} - -bool PTXDAGToDAGISel::isImm(const SDValue &operand) { - return ConstantSDNode::classof(operand.getNode()); -} - -bool PTXDAGToDAGISel::SelectImm(const SDValue &operand, SDValue &imm) { - SDNode *node = operand.getNode(); - if (!ConstantSDNode::classof(node)) - return false; - - ConstantSDNode *CN = cast<ConstantSDNode>(node); - imm = CurDAG->getTargetConstant(*CN->getConstantIntValue(), - operand.getValueType()); - return true; -} - -const PTXSubtarget& PTXDAGToDAGISel::getSubtarget() const -{ - return TM.getSubtarget<PTXSubtarget>(); -} - diff --git a/lib/Target/PTX/PTXISelLowering.cpp b/lib/Target/PTX/PTXISelLowering.cpp deleted file mode 100644 index ef4455b..0000000 --- a/lib/Target/PTX/PTXISelLowering.cpp +++ /dev/null @@ -1,522 +0,0 @@ -//===-- PTXISelLowering.cpp - PTX DAG Lowering Implementation -------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the PTXTargetLowering class. 
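The three selectors above partition address expressions: SelectADDRii claims pure-immediate forms, SelectADDRlocal claims frame-index forms (with or without a constant offset), and SelectADDRri takes the remaining register-based forms. A standalone sketch of that split, with an invented Addr struct in place of SDValue:

#include <cstdio>

struct Addr {
  bool IsImm;                 // a bare constant
  bool IsFrameIndex;          // a stack slot
  bool IsAddOfBasePlusConst;  // base + constant offset
  const Addr *Base;           // valid when IsAddOfBasePlusConst
};

const char *classify(const Addr &A) {
  if (A.IsImm) return "ADDRii";                // [imm]
  if (A.IsFrameIndex) return "ADDRlocal";      // [frame index]
  if (A.IsAddOfBasePlusConst)
    return A.Base->IsFrameIndex ? "ADDRlocal"  // [fi + imm]
                                : "ADDRri";    // [reg + imm]
  return "ADDRri";                             // plain [reg]
}

int main() {
  Addr FI{false, true, false, nullptr};
  Addr Off{false, false, true, &FI};
  std::printf("%s\n", classify(Off));  // ADDRlocal
}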
-// -//===----------------------------------------------------------------------===// - -#include "PTXISelLowering.h" -#include "PTX.h" -#include "PTXMachineFunctionInfo.h" -#include "PTXRegisterInfo.h" -#include "PTXSubtarget.h" -#include "llvm/Function.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -//===----------------------------------------------------------------------===// -// TargetLowering Implementation -//===----------------------------------------------------------------------===// - -PTXTargetLowering::PTXTargetLowering(TargetMachine &TM) - : TargetLowering(TM, new TargetLoweringObjectFileELF()) { - // Set up the register classes. - addRegisterClass(MVT::i1, PTX::RegPredRegisterClass); - addRegisterClass(MVT::i16, PTX::RegI16RegisterClass); - addRegisterClass(MVT::i32, PTX::RegI32RegisterClass); - addRegisterClass(MVT::i64, PTX::RegI64RegisterClass); - addRegisterClass(MVT::f32, PTX::RegF32RegisterClass); - addRegisterClass(MVT::f64, PTX::RegF64RegisterClass); - - setBooleanContents(ZeroOrOneBooleanContent); - setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct? - setMinFunctionAlignment(2); - - // Let LLVM use loads/stores for all mem* operations - maxStoresPerMemcpy = 4096; - maxStoresPerMemmove = 4096; - maxStoresPerMemset = 4096; - - //////////////////////////////////// - /////////// Expansion ////////////// - //////////////////////////////////// - - // (any/zero/sign) extload => load + (any/zero/sign) extend - - setLoadExtAction(ISD::EXTLOAD, MVT::i16, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Expand); - - // f32 extload => load + fextend - - setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); - - // f64 truncstore => trunc + store - - setTruncStoreAction(MVT::f64, MVT::f32, Expand); - - // sign_extend_inreg => sign_extend - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - - // br_cc => brcond - - setOperationAction(ISD::BR_CC, MVT::Other, Expand); - - // select_cc => setcc - - setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); - setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); - setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); - - //////////////////////////////////// - //////////// Legal ///////////////// - //////////////////////////////////// - - setOperationAction(ISD::ConstantFP, MVT::f32, Legal); - setOperationAction(ISD::ConstantFP, MVT::f64, Legal); - - //////////////////////////////////// - //////////// Custom //////////////// - //////////////////////////////////// - - // customise setcc to use bitwise logic if possible - - //setOperationAction(ISD::SETCC, MVT::i1, Custom); - setOperationAction(ISD::SETCC, MVT::i1, Legal); - - // customize translation of memory addresses - - setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); - setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); - - // Compute derived properties from the register classes - computeRegisterProperties(); -} - -EVT PTXTargetLowering::getSetCCResultType(EVT VT) const { - return MVT::i1; -} - -SDValue PTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { - switch (Op.getOpcode()) { - default: 
- llvm_unreachable("Unimplemented operand"); - case ISD::SETCC: - return LowerSETCC(Op, DAG); - case ISD::GlobalAddress: - return LowerGlobalAddress(Op, DAG); - } -} - -const char *PTXTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch (Opcode) { - default: - llvm_unreachable("Unknown opcode"); - case PTXISD::COPY_ADDRESS: - return "PTXISD::COPY_ADDRESS"; - case PTXISD::LOAD_PARAM: - return "PTXISD::LOAD_PARAM"; - case PTXISD::STORE_PARAM: - return "PTXISD::STORE_PARAM"; - case PTXISD::READ_PARAM: - return "PTXISD::READ_PARAM"; - case PTXISD::WRITE_PARAM: - return "PTXISD::WRITE_PARAM"; - case PTXISD::EXIT: - return "PTXISD::EXIT"; - case PTXISD::RET: - return "PTXISD::RET"; - case PTXISD::CALL: - return "PTXISD::CALL"; - } -} - -//===----------------------------------------------------------------------===// -// Custom Lower Operation -//===----------------------------------------------------------------------===// - -SDValue PTXTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { - assert(Op.getValueType() == MVT::i1 && "SetCC type must be 1-bit integer"); - SDValue Op0 = Op.getOperand(0); - SDValue Op1 = Op.getOperand(1); - SDValue Op2 = Op.getOperand(2); - DebugLoc dl = Op.getDebugLoc(); - //ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); - - // Look for X == 0, X == 1, X != 0, or X != 1 - // We can simplify these to bitwise logic - - //if (Op1.getOpcode() == ISD::Constant && - // (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || - // cast<ConstantSDNode>(Op1)->isNullValue()) && - // (CC == ISD::SETEQ || CC == ISD::SETNE)) { - // - // return DAG.getNode(ISD::AND, dl, MVT::i1, Op0, Op1); - //} - - //ConstantSDNode* COp1 = cast<ConstantSDNode>(Op1); - //if(COp1 && COp1->getZExtValue() == 1) { - // if(CC == ISD::SETNE) { - // return DAG.getNode(PTX::XORripreds, dl, MVT::i1, Op0); - // } - //} - - llvm_unreachable("setcc was not matched by a pattern!"); - - return DAG.getNode(ISD::SETCC, dl, MVT::i1, Op0, Op1, Op2); -} - -SDValue PTXTargetLowering:: -LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); - DebugLoc dl = Op.getDebugLoc(); - const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); - - assert(PtrVT.isSimple() && "Pointer must be to primitive type."); - - SDValue targetGlobal = DAG.getTargetGlobalAddress(GV, dl, PtrVT); - SDValue movInstr = DAG.getNode(PTXISD::COPY_ADDRESS, - dl, - PtrVT.getSimpleVT(), - targetGlobal); - - return movInstr; -} - -//===----------------------------------------------------------------------===// -// Calling Convention Implementation -//===----------------------------------------------------------------------===// - -SDValue PTXTargetLowering:: - LowerFormalArguments(SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, - SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const { - if (isVarArg) llvm_unreachable("PTX does not support varargs"); - - MachineFunction &MF = DAG.getMachineFunction(); - const PTXSubtarget& ST = getTargetMachine().getSubtarget<PTXSubtarget>(); - PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>(); - PTXParamManager &PM = MFI->getParamManager(); - - switch (CallConv) { - default: - llvm_unreachable("Unsupported calling convention"); - case CallingConv::PTX_Kernel: - MFI->setKernel(true); - break; - case CallingConv::PTX_Device: - MFI->setKernel(false); - break; - } - - // We do one of two things here: - // IsKernel || SM >= 2.0 -> Use param space 
for arguments - // SM < 2.0 -> Use registers for arguments - if (MFI->isKernel() || ST.useParamSpaceForDeviceArgs()) { - // We just need to emit the proper LOAD_PARAM ISDs - for (unsigned i = 0, e = Ins.size(); i != e; ++i) { - assert((!MFI->isKernel() || Ins[i].VT != MVT::i1) && - "Kernels cannot take pred operands"); - - unsigned ParamSize = Ins[i].VT.getStoreSizeInBits(); - unsigned Param = PM.addArgumentParam(ParamSize); - const std::string &ParamName = PM.getParamName(Param); - SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(), - MVT::Other); - SDValue ArgValue = DAG.getNode(PTXISD::LOAD_PARAM, dl, Ins[i].VT, Chain, - ParamValue); - InVals.push_back(ArgValue); - } - } - else { - for (unsigned i = 0, e = Ins.size(); i != e; ++i) { - EVT RegVT = Ins[i].VT; - const TargetRegisterClass* TRC = getRegClassFor(RegVT); - unsigned RegType; - - // Determine which register class we need - if (RegVT == MVT::i1) - RegType = PTXRegisterType::Pred; - else if (RegVT == MVT::i16) - RegType = PTXRegisterType::B16; - else if (RegVT == MVT::i32) - RegType = PTXRegisterType::B32; - else if (RegVT == MVT::i64) - RegType = PTXRegisterType::B64; - else if (RegVT == MVT::f32) - RegType = PTXRegisterType::F32; - else if (RegVT == MVT::f64) - RegType = PTXRegisterType::F64; - else - llvm_unreachable("Unknown parameter type"); - - // Use a unique index in the instruction to prevent instruction folding. - // Yes, this is a hack. - SDValue Index = DAG.getTargetConstant(i, MVT::i32); - unsigned Reg = MF.getRegInfo().createVirtualRegister(TRC); - SDValue ArgValue = DAG.getNode(PTXISD::READ_PARAM, dl, RegVT, Chain, - Index); - - InVals.push_back(ArgValue); - - MFI->addRegister(Reg, RegType, PTXRegisterSpace::Argument); - } - } - - return Chain; -} - -SDValue PTXTargetLowering:: - LowerReturn(SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - DebugLoc dl, - SelectionDAG &DAG) const { - if (isVarArg) llvm_unreachable("PTX does not support varargs"); - - switch (CallConv) { - default: - llvm_unreachable("Unsupported calling convention."); - case CallingConv::PTX_Kernel: - assert(Outs.size() == 0 && "Kernel must return void."); - return DAG.getNode(PTXISD::EXIT, dl, MVT::Other, Chain); - case CallingConv::PTX_Device: - assert(Outs.size() <= 1 && "Can at most return one value."); - break; - } - - MachineFunction& MF = DAG.getMachineFunction(); - PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>(); - PTXParamManager &PM = MFI->getParamManager(); - - SDValue Flag; - const PTXSubtarget& ST = getTargetMachine().getSubtarget<PTXSubtarget>(); - - if (ST.useParamSpaceForDeviceArgs()) { - assert(Outs.size() < 2 && "Device functions can return at most one value"); - - if (Outs.size() == 1) { - unsigned ParamSize = OutVals[0].getValueType().getSizeInBits(); - unsigned Param = PM.addReturnParam(ParamSize); - const std::string &ParamName = PM.getParamName(Param); - SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(), - MVT::Other); - Chain = DAG.getNode(PTXISD::STORE_PARAM, dl, MVT::Other, Chain, - ParamValue, OutVals[0]); - } - } else { - for (unsigned i = 0, e = Outs.size(); i != e; ++i) { - EVT RegVT = Outs[i].VT; - const TargetRegisterClass* TRC; - unsigned RegType; - - // Determine which register class we need - if (RegVT == MVT::i1) { - TRC = PTX::RegPredRegisterClass; - RegType = PTXRegisterType::Pred; - } - else if (RegVT == MVT::i16) { - TRC = PTX::RegI16RegisterClass; - RegType 
= PTXRegisterType::B16; - } - else if (RegVT == MVT::i32) { - TRC = PTX::RegI32RegisterClass; - RegType = PTXRegisterType::B32; - } - else if (RegVT == MVT::i64) { - TRC = PTX::RegI64RegisterClass; - RegType = PTXRegisterType::B64; - } - else if (RegVT == MVT::f32) { - TRC = PTX::RegF32RegisterClass; - RegType = PTXRegisterType::F32; - } - else if (RegVT == MVT::f64) { - TRC = PTX::RegF64RegisterClass; - RegType = PTXRegisterType::F64; - } - else { - llvm_unreachable("Unknown parameter type"); - } - - unsigned Reg = MF.getRegInfo().createVirtualRegister(TRC); - - SDValue Copy = DAG.getCopyToReg(Chain, dl, Reg, OutVals[i]/*, Flag*/); - SDValue OutReg = DAG.getRegister(Reg, RegVT); - - Chain = DAG.getNode(PTXISD::WRITE_PARAM, dl, MVT::Other, Copy, OutReg); - - MFI->addRegister(Reg, RegType, PTXRegisterSpace::Return); - } - } - - if (Flag.getNode() == 0) { - return DAG.getNode(PTXISD::RET, dl, MVT::Other, Chain); - } - else { - return DAG.getNode(PTXISD::RET, dl, MVT::Other, Chain, Flag); - } -} - -SDValue -PTXTargetLowering::LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const { - - MachineFunction& MF = DAG.getMachineFunction(); - PTXMachineFunctionInfo *PTXMFI = MF.getInfo<PTXMachineFunctionInfo>(); - PTXParamManager &PM = PTXMFI->getParamManager(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - - assert(getTargetMachine().getSubtarget<PTXSubtarget>().callsAreHandled() && - "Calls are not handled for the target device"); - - // Identify the callee function - const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); - const Function *function = cast<Function>(GV); - - // allow non-device calls only for printf - bool isPrintf = function->getName() == "printf" || function->getName() == "puts"; - - assert((isPrintf || function->getCallingConv() == CallingConv::PTX_Device) && - "PTX function calls must be to PTX device functions"); - - unsigned outSize = isPrintf ? 
2 : Outs.size(); - - std::vector<SDValue> Ops; - // The layout of the ops will be [Chain, #Ins, Ins, Callee, #Outs, Outs] - Ops.resize(outSize + Ins.size() + 4); - - Ops[0] = Chain; - - // Identify the callee function - Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy()); - Ops[Ins.size()+2] = Callee; - - // #Outs - Ops[Ins.size()+3] = DAG.getTargetConstant(outSize, MVT::i32); - - if (isPrintf) { - // first argument is the address of the global string variable in memory - unsigned Param0 = PM.addLocalParam(getPointerTy().getSizeInBits()); - SDValue ParamValue0 = DAG.getTargetExternalSymbol(PM.getParamName(Param0).c_str(), - MVT::Other); - Chain = DAG.getNode(PTXISD::STORE_PARAM, dl, MVT::Other, Chain, - ParamValue0, OutVals[0]); - Ops[Ins.size()+4] = ParamValue0; - - // alignment is the maximum size of all the arguments - unsigned alignment = 0; - for (unsigned i = 1; i < OutVals.size(); ++i) { - alignment = std::max(alignment, - OutVals[i].getValueType().getSizeInBits()); - } - - // size is the alignment multiplied by the number of arguments - unsigned size = alignment * (OutVals.size() - 1); - - // second argument is the address of the stack object (unless no arguments) - unsigned Param1 = PM.addLocalParam(getPointerTy().getSizeInBits()); - SDValue ParamValue1 = DAG.getTargetExternalSymbol(PM.getParamName(Param1).c_str(), - MVT::Other); - Ops[Ins.size()+5] = ParamValue1; - - if (size > 0) - { - // create a local stack object to store the arguments - unsigned StackObject = MFI->CreateStackObject(size / 8, alignment / 8, false); - SDValue FrameIndex = DAG.getFrameIndex(StackObject, getPointerTy()); - - // store each of the arguments to the stack in turn - for (unsigned int i = 1; i != OutVals.size(); i++) { - SDValue FrameAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FrameIndex, DAG.getTargetConstant((i - 1) * 8, getPointerTy())); - Chain = DAG.getStore(Chain, dl, OutVals[i], FrameAddr, - MachinePointerInfo(), - false, false, 0); - } - - // copy the address of the local frame index to get the address in non-local space - SDValue genericAddr = DAG.getNode(PTXISD::COPY_ADDRESS, dl, getPointerTy(), FrameIndex); - - // store this address in the second argument - Chain = DAG.getNode(PTXISD::STORE_PARAM, dl, MVT::Other, Chain, ParamValue1, genericAddr); - } - } - else - { - // Generate STORE_PARAM nodes for each function argument. In PTX, function - // arguments are explicitly stored into .param variables and passed as - // arguments. There is no register/stack-based calling convention in PTX. - for (unsigned i = 0; i != OutVals.size(); ++i) { - unsigned Size = OutVals[i].getValueType().getSizeInBits(); - unsigned Param = PM.addLocalParam(Size); - const std::string &ParamName = PM.getParamName(Param); - SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(), - MVT::Other); - Chain = DAG.getNode(PTXISD::STORE_PARAM, dl, MVT::Other, Chain, - ParamValue, OutVals[i]); - Ops[i+Ins.size()+4] = ParamValue; - } - } - - std::vector<SDValue> InParams; - - // Generate list of .param variables to hold the return value(s). - Ops[1] = DAG.getTargetConstant(Ins.size(), MVT::i32); - for (unsigned i = 0; i < Ins.size(); ++i) { - unsigned Size = Ins[i].VT.getStoreSizeInBits(); - unsigned Param = PM.addLocalParam(Size); - const std::string &ParamName = PM.getParamName(Param); - SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(), - MVT::Other); - Ops[i+2] = ParamValue; - InParams.push_back(ParamValue); - } - - Ops[0] = Chain; - - // Create the CALL node. 
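Before the CALL node below is built, the printf path above has packed the variadic arguments into one local buffer: alignment is the widest argument, the buffer holds one alignment-sized slot per variadic argument, and each value is then stored at an 8-byte stride. A sketch of the size computation only (sizes in bits, mirroring the code; the helper name and types are invented):

#include <algorithm>
#include <cstdio>
#include <vector>

struct BufferLayout { unsigned AlignBytes; unsigned SizeBytes; };

// ArgBits holds the widths of the variadic arguments, i.e. everything
// after the format string (which is passed separately as __param_1).
BufferLayout layoutPrintfArgs(const std::vector<unsigned> &ArgBits) {
  unsigned AlignBits = 0;
  for (unsigned Bits : ArgBits)  // widest argument sets the alignment
    AlignBits = std::max(AlignBits, Bits);
  unsigned SizeBits = AlignBits * (unsigned)ArgBits.size();
  return { AlignBits / 8, SizeBits / 8 };
}

int main() {
  // Two variadic args after the format string: an i32 and an f64.
  BufferLayout L = layoutPrintfArgs({32, 64});
  std::printf("align %u bytes, size %u bytes\n", L.AlignBytes, L.SizeBytes);
  // Each argument is then stored at offset i * 8 bytes, as in the code above.
}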
- Chain = DAG.getNode(PTXISD::CALL, dl, MVT::Other, &Ops[0], Ops.size()); - - // Create the LOAD_PARAM nodes that retrieve the function return value(s). - for (unsigned i = 0; i < Ins.size(); ++i) { - SDValue Load = DAG.getNode(PTXISD::LOAD_PARAM, dl, Ins[i].VT, Chain, - InParams[i]); - InVals.push_back(Load); - } - - return Chain; -} - -unsigned PTXTargetLowering::getNumRegisters(LLVMContext &Context, EVT VT) { - // All arguments consist of one "register," regardless of the type. - return 1; -} - diff --git a/lib/Target/PTX/PTXISelLowering.h b/lib/Target/PTX/PTXISelLowering.h deleted file mode 100644 index 33220f4..0000000 --- a/lib/Target/PTX/PTXISelLowering.h +++ /dev/null @@ -1,82 +0,0 @@ -//===-- PTXISelLowering.h - PTX DAG Lowering Interface ----------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the interfaces that PTX uses to lower LLVM code into a -// selection DAG. -// -//===----------------------------------------------------------------------===// - -#ifndef PTX_ISEL_LOWERING_H -#define PTX_ISEL_LOWERING_H - -#include "llvm/Target/TargetLowering.h" - -namespace llvm { - -namespace PTXISD { - enum NodeType { - FIRST_NUMBER = ISD::BUILTIN_OP_END, - LOAD_PARAM, - STORE_PARAM, - READ_PARAM, - WRITE_PARAM, - EXIT, - RET, - COPY_ADDRESS, - CALL - }; -} // namespace PTXISD - -class PTXTargetLowering : public TargetLowering { - public: - explicit PTXTargetLowering(TargetMachine &TM); - - virtual const char *getTargetNodeName(unsigned Opcode) const; - - virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; - - virtual SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; - - virtual SDValue - LowerFormalArguments(SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, - SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const; - - virtual SDValue - LowerReturn(SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - DebugLoc dl, - SelectionDAG &DAG) const; - - virtual SDValue - LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, - bool isVarArg, bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const; - - virtual EVT getSetCCResultType(EVT VT) const; - - virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT); - - private: - SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; -}; // class PTXTargetLowering -} // namespace llvm - -#endif // PTX_ISEL_LOWERING_H diff --git a/lib/Target/PTX/PTXInstrFormats.td b/lib/Target/PTX/PTXInstrFormats.td deleted file mode 100644 index 267e834..0000000 --- a/lib/Target/PTX/PTXInstrFormats.td +++ /dev/null @@ -1,51 +0,0 @@ -//===-- PTXInstrFormats.td - PTX Instruction Formats -------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - - -// Rounding Mode Specifier -/*class RoundingMode<bits<3> val> { - bits<3> Value = val; -} - -def RndDefault : RoundingMode<0>; -def RndNearestEven : RoundingMode<1>; -def RndNearestZero : RoundingMode<2>; -def RndNegInf : RoundingMode<3>; -def RndPosInf : RoundingMode<4>; -def RndApprox : RoundingMode<5>;*/ - - -// Rounding Mode Operand -def RndMode : Operand<i32> { - let PrintMethod = "printRoundingMode"; -} - -def RndDefault : PatLeaf<(i32 0)>; - -// PTX Predicate operand, default to (0, 0) = (zero-reg, none). -// Leave PrintMethod empty; predicate printing is defined elsewhere. -def pred : PredicateOperand<OtherVT, (ops RegPred, i32imm), - (ops (i1 zero_reg), (i32 2))>; - -def RndModeOperand : Operand<OtherVT> { - let MIOperandInfo = (ops i32imm); -} - -// Instruction Types -let Namespace = "PTX" in { - - class InstPTX<dag oops, dag iops, string asmstr, list<dag> pattern> - : Instruction { - dag OutOperandList = oops; - dag InOperandList = !con(iops, (ins pred:$_p)); - let AsmString = asmstr; // Predicate printing is defined elsewhere. - let Pattern = pattern; - let isPredicable = 1; - } -} diff --git a/lib/Target/PTX/PTXInstrInfo.cpp b/lib/Target/PTX/PTXInstrInfo.cpp deleted file mode 100644 index 443cd54..0000000 --- a/lib/Target/PTX/PTXInstrInfo.cpp +++ /dev/null @@ -1,359 +0,0 @@ -//===-- PTXInstrInfo.cpp - PTX Instruction Information --------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the PTX implementation of the TargetInstrInfo class. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "ptx-instrinfo" - -#include "PTXInstrInfo.h" -#include "PTX.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/SelectionDAGNodes.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/raw_ostream.h" - -#define GET_INSTRINFO_CTOR -#include "PTXGenInstrInfo.inc" - -using namespace llvm; - -PTXInstrInfo::PTXInstrInfo(PTXTargetMachine &_TM) - : PTXGenInstrInfo(), - RI(_TM, *this), TM(_TM) {} - -static const struct map_entry { - const TargetRegisterClass *cls; - const int opcode; -} map[] = { - { &PTX::RegI16RegClass, PTX::MOVU16rr }, - { &PTX::RegI32RegClass, PTX::MOVU32rr }, - { &PTX::RegI64RegClass, PTX::MOVU64rr }, - { &PTX::RegF32RegClass, PTX::MOVF32rr }, - { &PTX::RegF64RegClass, PTX::MOVF64rr }, - { &PTX::RegPredRegClass, PTX::MOVPREDrr } -}; - -void PTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DstReg, unsigned SrcReg, - bool KillSrc) const { - - const MachineRegisterInfo& MRI = MBB.getParent()->getRegInfo(); - //assert(MRI.getRegClass(SrcReg) == MRI.getRegClass(DstReg) && - // "Invalid register copy between two register classes"); - - for (int i = 0, e = sizeof(map)/sizeof(map[0]); i != e; ++i) { - if (map[i].cls == MRI.getRegClass(DstReg)) { - const MCInstrDesc &MCID = get(map[i].opcode); - MachineInstr *MI = BuildMI(MBB, I, DL, MCID, DstReg). 
- addReg(SrcReg, getKillRegState(KillSrc)); - AddDefaultPredicate(MI); - return; - } - } - - llvm_unreachable("Impossible reg-to-reg copy"); -} - -bool PTXInstrInfo::copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg, - const TargetRegisterClass *DstRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const { - if (DstRC != SrcRC) - return false; - - for (int i = 0, e = sizeof(map)/sizeof(map[0]); i != e; ++i) - if (DstRC == map[i].cls) { - const MCInstrDesc &MCID = get(map[i].opcode); - MachineInstr *MI = BuildMI(MBB, I, DL, MCID, DstReg).addReg(SrcReg); - AddDefaultPredicate(MI); - return true; - } - - return false; -} - -bool PTXInstrInfo::isMoveInstr(const MachineInstr& MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const { - switch (MI.getOpcode()) { - default: - return false; - case PTX::MOVU16rr: - case PTX::MOVU32rr: - case PTX::MOVU64rr: - case PTX::MOVF32rr: - case PTX::MOVF64rr: - case PTX::MOVPREDrr: - assert(MI.getNumOperands() >= 2 && - MI.getOperand(0).isReg() && MI.getOperand(1).isReg() && - "Invalid register-register move instruction"); - SrcSubIdx = DstSubIdx = 0; // No sub-registers - DstReg = MI.getOperand(0).getReg(); - SrcReg = MI.getOperand(1).getReg(); - return true; - } -} - -// predicate support - -bool PTXInstrInfo::isPredicated(const MachineInstr *MI) const { - int i = MI->findFirstPredOperandIdx(); - return i != -1 && MI->getOperand(i).getReg() != PTX::NoRegister; -} - -bool PTXInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { - return !isPredicated(MI) && MI->isTerminator(); -} - -bool PTXInstrInfo:: -PredicateInstruction(MachineInstr *MI, - const SmallVectorImpl<MachineOperand> &Pred) const { - if (Pred.size() < 2) - llvm_unreachable("fewer than 2 predicate operands provided"); - - int i = MI->findFirstPredOperandIdx(); - if (i == -1) - llvm_unreachable("missing predicate operand"); - - MI->getOperand(i).setReg(Pred[0].getReg()); - MI->getOperand(i+1).setImm(Pred[1].getImm()); - - return true; -} - -bool PTXInstrInfo:: -SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, - const SmallVectorImpl<MachineOperand> &Pred2) const { - const MachineOperand &PredReg1 = Pred1[0]; - const MachineOperand &PredReg2 = Pred2[0]; - if (PredReg1.getReg() != PredReg2.getReg()) - return false; - - const MachineOperand &PredOp1 = Pred1[1]; - const MachineOperand &PredOp2 = Pred2[1]; - if (PredOp1.getImm() != PredOp2.getImm()) - return false; - - return true; -} - -bool PTXInstrInfo:: -DefinesPredicate(MachineInstr *MI, - std::vector<MachineOperand> &Pred) const { - // If an instruction sets a predicate register, it defines a predicate.
- - // TODO: support the 5-operand format of the setp instruction - - if (MI->getNumOperands() < 1) - return false; - - const MachineOperand &MO = MI->getOperand(0); - - if (!MO.isReg() || RI.getRegClass(MO.getReg()) != &PTX::RegPredRegClass) - return false; - - Pred.push_back(MO); - Pred.push_back(MachineOperand::CreateImm(PTXPredicate::None)); - return true; -} - -// branch support - -bool PTXInstrInfo:: -AnalyzeBranch(MachineBasicBlock &MBB, - MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify) const { - // TODO: implement the cases where AllowModify is true - - if (MBB.empty()) - return true; - - MachineBasicBlock::iterator iter = MBB.end(); - const MachineInstr& instLast1 = *--iter; - // special case: MBB contains only 1 instruction - const bool IsSizeOne = MBB.size() == 1; - // if IsSizeOne is true, *--iter would be invalid, so we put a dummy - // value in instLast2, which is still read below - const MachineInstr& instLast2 = IsSizeOne ? instLast1 : *--iter; - - DEBUG(dbgs() << "\n"); - DEBUG(dbgs() << "AnalyzeBranch: opcode: " << instLast1.getOpcode() << "\n"); - DEBUG(dbgs() << "AnalyzeBranch: MBB: " << MBB.getName().str() << "\n"); - DEBUG(dbgs() << "AnalyzeBranch: TBB: " << TBB << "\n"); - DEBUG(dbgs() << "AnalyzeBranch: FBB: " << FBB << "\n"); - - // this block ends with no branches - if (!IsAnyKindOfBranch(instLast1)) { - DEBUG(dbgs() << "AnalyzeBranch: ends with no branch\n"); - return false; - } - - // this block ends with only an unconditional branch - if (instLast1.isUnconditionalBranch() && - // when IsSizeOne is true, it "absorbs" the evaluation of instLast2 - (IsSizeOne || !IsAnyKindOfBranch(instLast2))) { - DEBUG(dbgs() << "AnalyzeBranch: ends with only uncond branch\n"); - TBB = GetBranchTarget(instLast1); - return false; - } - - // this block ends with a conditional branch and - // it falls through to a successor block - if (instLast1.isConditionalBranch() && - IsAnySuccessorAlsoLayoutSuccessor(MBB)) { - DEBUG(dbgs() << "AnalyzeBranch: ends with cond branch and fall through\n"); - TBB = GetBranchTarget(instLast1); - int i = instLast1.findFirstPredOperandIdx(); - Cond.push_back(instLast1.getOperand(i)); - Cond.push_back(instLast1.getOperand(i+1)); - return false; - } - - // when IsSizeOne is true, we are done - if (IsSizeOne) - return true; - - // this block ends with a conditional branch - // followed by an unconditional branch - if (instLast2.isConditionalBranch() && - instLast1.isUnconditionalBranch()) { - DEBUG(dbgs() << "AnalyzeBranch: ends with cond and uncond branch\n"); - TBB = GetBranchTarget(instLast2); - FBB = GetBranchTarget(instLast1); - int i = instLast2.findFirstPredOperandIdx(); - Cond.push_back(instLast2.getOperand(i)); - Cond.push_back(instLast2.getOperand(i+1)); - return false; - } - - // branch cannot be understood - DEBUG(dbgs() << "AnalyzeBranch: cannot be understood\n"); - return true; -} - -unsigned PTXInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { - unsigned count = 0; - while (!MBB.empty()) - if (IsAnyKindOfBranch(MBB.back())) { - MBB.pop_back(); - ++count; - } else - break; - DEBUG(dbgs() << "RemoveBranch: MBB: " << MBB.getName().str() << "\n"); - DEBUG(dbgs() << "RemoveBranch: removed " << count << " branch instructions\n"); - return count; -} - -unsigned PTXInstrInfo:: -InsertBranch(MachineBasicBlock &MBB, - MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, - DebugLoc DL) const { - DEBUG(dbgs() << "InsertBranch: MBB: " << 
MBB.getName().str() << "\n"); - DEBUG(if (TBB) dbgs() << "InsertBranch: TBB: " << TBB->getName().str() - << "\n"; - else dbgs() << "InsertBranch: TBB: (NULL)\n"); - DEBUG(if (FBB) dbgs() << "InsertBranch: FBB: " << FBB->getName().str() - << "\n"; - else dbgs() << "InsertBranch: FBB: (NULL)\n"); - DEBUG(dbgs() << "InsertBranch: Cond size: " << Cond.size() << "\n"); - - assert(TBB && "TBB is NULL"); - - if (FBB) { - BuildMI(&MBB, DL, get(PTX::BRAdp)) - .addMBB(TBB).addReg(Cond[0].getReg()).addImm(Cond[1].getImm()); - BuildMI(&MBB, DL, get(PTX::BRAd)) - .addMBB(FBB).addReg(PTX::NoRegister).addImm(PTXPredicate::None); - return 2; - } else if (Cond.size()) { - BuildMI(&MBB, DL, get(PTX::BRAdp)) - .addMBB(TBB).addReg(Cond[0].getReg()).addImm(Cond[1].getImm()); - return 1; - } else { - BuildMI(&MBB, DL, get(PTX::BRAd)) - .addMBB(TBB).addReg(PTX::NoRegister).addImm(PTXPredicate::None); - return 1; - } -} - -// Memory operand folding for spills -void PTXInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MII, - unsigned SrcReg, bool isKill, int FrameIdx, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - llvm_unreachable("storeRegToStackSlot should not be called for PTX"); -} - -void PTXInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MII, - unsigned DestReg, int FrameIdx, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - llvm_unreachable("loadRegFromStackSlot should not be called for PTX"); -} - -// static helper routines - -MachineSDNode *PTXInstrInfo:: -GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode, - DebugLoc dl, EVT VT, SDValue Op1) { - SDValue predReg = DAG->getRegister(PTX::NoRegister, MVT::i1); - SDValue predOp = DAG->getTargetConstant(PTXPredicate::None, MVT::i32); - SDValue ops[] = { Op1, predReg, predOp }; - return DAG->getMachineNode(Opcode, dl, VT, ops, array_lengthof(ops)); -} - -MachineSDNode *PTXInstrInfo:: -GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode, - DebugLoc dl, EVT VT, SDValue Op1, SDValue Op2) { - SDValue predReg = DAG->getRegister(PTX::NoRegister, MVT::i1); - SDValue predOp = DAG->getTargetConstant(PTXPredicate::None, MVT::i32); - SDValue ops[] = { Op1, Op2, predReg, predOp }; - return DAG->getMachineNode(Opcode, dl, VT, ops, array_lengthof(ops)); -} - -void PTXInstrInfo::AddDefaultPredicate(MachineInstr *MI) { - if (MI->findFirstPredOperandIdx() == -1) { - MI->addOperand(MachineOperand::CreateReg(PTX::NoRegister, /*IsDef=*/false)); - MI->addOperand(MachineOperand::CreateImm(PTXPredicate::None)); - } -} - -bool PTXInstrInfo::IsAnyKindOfBranch(const MachineInstr& inst) { - return inst.isTerminator() || inst.isBranch() || inst.isIndirectBranch(); -} - -bool PTXInstrInfo:: -IsAnySuccessorAlsoLayoutSuccessor(const MachineBasicBlock& MBB) { - for (MachineBasicBlock::const_succ_iterator - i = MBB.succ_begin(), e = MBB.succ_end(); i != e; ++i) - if (MBB.isLayoutSuccessor((const MachineBasicBlock*) &*i)) - return true; - return false; -} - -MachineBasicBlock *PTXInstrInfo::GetBranchTarget(const MachineInstr& inst) { - // FIXME So far all branch instructions put destination in 1st operand - const MachineOperand& target = inst.getOperand(0); - assert(target.isMBB() && "FIXME: detect branch target operand"); - return target.getMBB(); -} diff --git a/lib/Target/PTX/PTXInstrInfo.h b/lib/Target/PTX/PTXInstrInfo.h deleted file mode 100644 index fba89c0..0000000 --- a/lib/Target/PTX/PTXInstrInfo.h +++ /dev/null @@ -1,133 +0,0 @@ -//===-- PTXInstrInfo.h - 
PTX Instruction Information ------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the PTX implementation of the TargetInstrInfo class. -// -//===----------------------------------------------------------------------===// - -#ifndef PTX_INSTR_INFO_H -#define PTX_INSTR_INFO_H - -#include "PTXRegisterInfo.h" -#include "llvm/Target/TargetInstrInfo.h" - -#define GET_INSTRINFO_HEADER -#include "PTXGenInstrInfo.inc" - -namespace llvm { -class PTXTargetMachine; - -class MachineSDNode; -class SDValue; -class SelectionDAG; - -class PTXInstrInfo : public PTXGenInstrInfo { -private: - const PTXRegisterInfo RI; - PTXTargetMachine &TM; - -public: - explicit PTXInstrInfo(PTXTargetMachine &_TM); - - virtual const PTXRegisterInfo &getRegisterInfo() const { return RI; } - - virtual void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DstReg, unsigned SrcReg, - bool KillSrc) const; - - virtual bool copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg, - const TargetRegisterClass *DstRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const; - - virtual bool isMoveInstr(const MachineInstr& MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const; - - // predicate support - - virtual bool isPredicated(const MachineInstr *MI) const; - - virtual bool isUnpredicatedTerminator(const MachineInstr *MI) const; - - virtual - bool PredicateInstruction(MachineInstr *MI, - const SmallVectorImpl<MachineOperand> &Pred) const; - - virtual - bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, - const SmallVectorImpl<MachineOperand> &Pred2) const; - - virtual bool DefinesPredicate(MachineInstr *MI, - std::vector<MachineOperand> &Pred) const; - - // PTX is fully-predicable - virtual bool isPredicable(MachineInstr *MI) const { return true; } - - // branch support - - virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify = false) const; - - virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; - - virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, - DebugLoc DL) const; - - // Memory operand folding for spills - // TODO: Implement this eventually and get rid of storeRegToStackSlot and - // loadRegFromStackSlot. Doing so will get rid of the "stack" registers - // we currently use to spill, though I doubt the overall effect on ptxas - // output will be large. I have yet to see a case where ptxas is unable - // to see through the "stack" register usage and hence generates - // efficient code anyway. 
- // virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - // MachineInstr* MI, - // const SmallVectorImpl<unsigned> &Ops, - // int FrameIndex) const; - - virtual void storeRegToStackSlot(MachineBasicBlock& MBB, - MachineBasicBlock::iterator MII, - unsigned SrcReg, bool isKill, int FrameIndex, - const TargetRegisterClass* RC, - const TargetRegisterInfo* TRI) const; - virtual void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MII, - unsigned DestReg, int FrameIdx, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; - - // static helper routines - - static MachineSDNode *GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode, - DebugLoc dl, EVT VT, - SDValue Op1); - - static MachineSDNode *GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode, - DebugLoc dl, EVT VT, - SDValue Op1, SDValue Op2); - - static void AddDefaultPredicate(MachineInstr *MI); - - static bool IsAnyKindOfBranch(const MachineInstr& inst); - - static bool IsAnySuccessorAlsoLayoutSuccessor(const MachineBasicBlock& MBB); - - static MachineBasicBlock *GetBranchTarget(const MachineInstr& inst); -}; // class PTXInstrInfo -} // namespace llvm - -#endif // PTX_INSTR_INFO_H diff --git a/lib/Target/PTX/PTXInstrInfo.td b/lib/Target/PTX/PTXInstrInfo.td deleted file mode 100644 index bead428..0000000 --- a/lib/Target/PTX/PTXInstrInfo.td +++ /dev/null @@ -1,1031 +0,0 @@ -//===-- PTXInstrInfo.td - PTX Instruction defs --------------*- tablegen-*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file describes the PTX instructions in TableGen format. 
-// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Instruction format superclass -//===----------------------------------------------------------------------===// - -include "PTXInstrFormats.td" - -//===----------------------------------------------------------------------===// -// Code Generation Predicates -//===----------------------------------------------------------------------===// - -// Shader Model Support -def FDivNeedsRoundingMode : Predicate<"getSubtarget().fdivNeedsRoundingMode()">; -def FDivNoRoundingMode : Predicate<"!getSubtarget().fdivNeedsRoundingMode()">; -def FMadNeedsRoundingMode : Predicate<"getSubtarget().fmadNeedsRoundingMode()">; -def FMadNoRoundingMode : Predicate<"!getSubtarget().fmadNeedsRoundingMode()">; - -// PTX Version Support -def SupportsPTX21 : Predicate<"getSubtarget().supportsPTX21()">; -def DoesNotSupportPTX21 : Predicate<"!getSubtarget().supportsPTX21()">; -def SupportsPTX22 : Predicate<"getSubtarget().supportsPTX22()">; -def DoesNotSupportPTX22 : Predicate<"!getSubtarget().supportsPTX22()">; -def SupportsPTX23 : Predicate<"getSubtarget().supportsPTX23()">; -def DoesNotSupportPTX23 : Predicate<"!getSubtarget().supportsPTX23()">; - -// Fused Multiply-Add -def SupportsFMA : Predicate<"getSubtarget().supportsFMA()">; -def DoesNotSupportFMA : Predicate<"!getSubtarget().supportsFMA()">; - - - -// def SDT_PTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; -// def SDT_PTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; - -// def PTXcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_PTXCallSeqStart, -// [SDNPHasChain, SDNPOutGlue]>; -// def PTXcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_PTXCallSeqEnd, -// [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; - -def PTXcall : SDNode<"PTXISD::CALL", SDTNone, - [SDNPHasChain, SDNPVariadic, SDNPOptInGlue, SDNPOutGlue]>; - - -// Branch & call targets have OtherVT type. -def brtarget : Operand<OtherVT>; -def calltarget : Operand<i32>; - -//===----------------------------------------------------------------------===// -// PTX Specific Node Definitions -//===----------------------------------------------------------------------===// - -// PTX allows generic 3-reg shifts like shl r0, r1, r2 -def PTXshl : SDNode<"ISD::SHL", SDTIntBinOp>; -def PTXsrl : SDNode<"ISD::SRL", SDTIntBinOp>; -def PTXsra : SDNode<"ISD::SRA", SDTIntBinOp>; - -def PTXexit - : SDNode<"PTXISD::EXIT", SDTNone, [SDNPHasChain]>; -def PTXret - : SDNode<"PTXISD::RET", SDTNone, - [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; -def PTXcopyaddress - : SDNode<"PTXISD::COPY_ADDRESS", SDTypeProfile<1, 1, []>, []>; - - - -//===----------------------------------------------------------------------===// -// Instruction Class Templates -//===----------------------------------------------------------------------===// - -// For floating-point instructions, we cannot just embed the pattern into the -// instruction definition since we need to muck around with the rounding mode, -// and I do not know how to insert constants into instructions directly from -// pattern matches.
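Concretely, the split described in the comment above pairs an instruction def that takes an explicit rounding-mode operand (and carries an empty pattern list) with a separate selection pattern that injects the default mode as a constant. A condensed expansion of the FADD case, which the multiclasses below define and the patterns further down in this file match:

// Instruction form: explicit RndMode operand, no embedded pattern.
def FADDrr32 : InstPTX<(outs RegF32:$d),
                       (ins RndMode:$r, RegF32:$a, RegF32:$b),
                       "add$r.f32\t$d, $a, $b", []>;
// Selection pattern, written separately so the default rounding mode
// can be supplied as a constant operand.
def : Pat<(f32 (fadd RegF32:$a, RegF32:$b)),
          (FADDrr32 RndDefault, RegF32:$a, RegF32:$b)>;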
- -//===- Floating-Point Instructions - 2 Operand Form -----------------------===// -multiclass PTX_FLOAT_2OP<string opcstr> { - def rr32 : InstPTX<(outs RegF32:$d), - (ins RndMode:$r, RegF32:$a), - !strconcat(opcstr, "$r.f32\t$d, $a"), []>; - def ri32 : InstPTX<(outs RegF32:$d), - (ins RndMode:$r, f32imm:$a), - !strconcat(opcstr, "$r.f32\t$d, $a"), []>; - def rr64 : InstPTX<(outs RegF64:$d), - (ins RndMode:$r, RegF64:$a), - !strconcat(opcstr, "$r.f64\t$d, $a"), []>; - def ri64 : InstPTX<(outs RegF64:$d), - (ins RndMode:$r, f64imm:$a), - !strconcat(opcstr, "$r.f64\t$d, $a"), []>; -} - -//===- Floating-Point Instructions - 3 Operand Form -----------------------===// -multiclass PTX_FLOAT_3OP<string opcstr> { - def rr32 : InstPTX<(outs RegF32:$d), - (ins RndMode:$r, RegF32:$a, RegF32:$b), - !strconcat(opcstr, "$r.f32\t$d, $a, $b"), []>; - def ri32 : InstPTX<(outs RegF32:$d), - (ins RndMode:$r, RegF32:$a, f32imm:$b), - !strconcat(opcstr, "$r.f32\t$d, $a, $b"), []>; - def rr64 : InstPTX<(outs RegF64:$d), - (ins RndMode:$r, RegF64:$a, RegF64:$b), - !strconcat(opcstr, "$r.f64\t$d, $a, $b"), []>; - def ri64 : InstPTX<(outs RegF64:$d), - (ins RndMode:$r, RegF64:$a, f64imm:$b), - !strconcat(opcstr, "$r.f64\t$d, $a, $b"), []>; -} - -//===- Floating-Point Instructions - 4 Operand Form -----------------------===// -multiclass PTX_FLOAT_4OP<string opcstr> { - def rrr32 : InstPTX<(outs RegF32:$d), - (ins RndMode:$r, RegF32:$a, RegF32:$b, RegF32:$c), - !strconcat(opcstr, "$r.f32\t$d, $a, $b, $c"), []>; - def rri32 : InstPTX<(outs RegF32:$d), - (ins RndMode:$r, RegF32:$a, RegF32:$b, f32imm:$c), - !strconcat(opcstr, "$r.f32\t$d, $a, $b, $c"), []>; - def rii32 : InstPTX<(outs RegF32:$d), - (ins RndMode:$r, RegF32:$a, f32imm:$b, f32imm:$c), - !strconcat(opcstr, "$r.f32\t$d, $a, $b, $c"), []>; - def rrr64 : InstPTX<(outs RegF64:$d), - (ins RndMode:$r, RegF64:$a, RegF64:$b, RegF64:$c), - !strconcat(opcstr, "$r.f64\t$d, $a, $b, $c"), []>; - def rri64 : InstPTX<(outs RegF64:$d), - (ins RndMode:$r, RegF64:$a, RegF64:$b, f64imm:$c), - !strconcat(opcstr, "$r.f64\t$d, $a, $b, $c"), []>; - def rii64 : InstPTX<(outs RegF64:$d), - (ins RndMode:$r, RegF64:$a, f64imm:$b, f64imm:$c), - !strconcat(opcstr, "$r.f64\t$d, $a, $b, $c"), []>; -} - -//===- Integer Instructions - 3 Operand Form ------------------------------===// -multiclass PTX_INT3<string opcstr, SDNode opnode> { - def rr16 : InstPTX<(outs RegI16:$d), - (ins RegI16:$a, RegI16:$b), - !strconcat(opcstr, ".u16\t$d, $a, $b"), - [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>; - def ri16 : InstPTX<(outs RegI16:$d), - (ins RegI16:$a, i16imm:$b), - !strconcat(opcstr, ".u16\t$d, $a, $b"), - [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>; - def rr32 : InstPTX<(outs RegI32:$d), - (ins RegI32:$a, RegI32:$b), - !strconcat(opcstr, ".u32\t$d, $a, $b"), - [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>; - def ri32 : InstPTX<(outs RegI32:$d), - (ins RegI32:$a, i32imm:$b), - !strconcat(opcstr, ".u32\t$d, $a, $b"), - [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>; - def rr64 : InstPTX<(outs RegI64:$d), - (ins RegI64:$a, RegI64:$b), - !strconcat(opcstr, ".u64\t$d, $a, $b"), - [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>; - def ri64 : InstPTX<(outs RegI64:$d), - (ins RegI64:$a, i64imm:$b), - !strconcat(opcstr, ".u64\t$d, $a, $b"), - [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>; -} - -//===- Integer Instructions - 3 Operand Form (Signed) ---------------------===// -multiclass PTX_INT3_SIGNED<string opcstr, SDNode opnode> { - def rr16 : InstPTX<(outs RegI16:$d), - (ins 
RegI16:$a, RegI16:$b), - !strconcat(opcstr, ".s16\t$d, $a, $b"), - [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>; - def ri16 : InstPTX<(outs RegI16:$d), - (ins RegI16:$a, i16imm:$b), - !strconcat(opcstr, ".s16\t$d, $a, $b"), - [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>; - def rr32 : InstPTX<(outs RegI32:$d), - (ins RegI32:$a, RegI32:$b), - !strconcat(opcstr, ".s32\t$d, $a, $b"), - [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>; - def ri32 : InstPTX<(outs RegI32:$d), - (ins RegI32:$a, i32imm:$b), - !strconcat(opcstr, ".s32\t$d, $a, $b"), - [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>; - def rr64 : InstPTX<(outs RegI64:$d), - (ins RegI64:$a, RegI64:$b), - !strconcat(opcstr, ".s64\t$d, $a, $b"), - [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>; - def ri64 : InstPTX<(outs RegI64:$d), - (ins RegI64:$a, i64imm:$b), - !strconcat(opcstr, ".s64\t$d, $a, $b"), - [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>; -} - -//===- Bitwise Logic Instructions - 3 Operand Form ------------------------===// -multiclass PTX_LOGIC<string opcstr, SDNode opnode> { - def ripreds : InstPTX<(outs RegPred:$d), - (ins RegPred:$a, i1imm:$b), - !strconcat(opcstr, ".pred\t$d, $a, $b"), - [(set RegPred:$d, (opnode RegPred:$a, imm:$b))]>; - def rrpreds : InstPTX<(outs RegPred:$d), - (ins RegPred:$a, RegPred:$b), - !strconcat(opcstr, ".pred\t$d, $a, $b"), - [(set RegPred:$d, (opnode RegPred:$a, RegPred:$b))]>; - def rr16 : InstPTX<(outs RegI16:$d), - (ins RegI16:$a, RegI16:$b), - !strconcat(opcstr, ".b16\t$d, $a, $b"), - [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>; - def ri16 : InstPTX<(outs RegI16:$d), - (ins RegI16:$a, i16imm:$b), - !strconcat(opcstr, ".b16\t$d, $a, $b"), - [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>; - def rr32 : InstPTX<(outs RegI32:$d), - (ins RegI32:$a, RegI32:$b), - !strconcat(opcstr, ".b32\t$d, $a, $b"), - [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>; - def ri32 : InstPTX<(outs RegI32:$d), - (ins RegI32:$a, i32imm:$b), - !strconcat(opcstr, ".b32\t$d, $a, $b"), - [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>; - def rr64 : InstPTX<(outs RegI64:$d), - (ins RegI64:$a, RegI64:$b), - !strconcat(opcstr, ".b64\t$d, $a, $b"), - [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>; - def ri64 : InstPTX<(outs RegI64:$d), - (ins RegI64:$a, i64imm:$b), - !strconcat(opcstr, ".b64\t$d, $a, $b"), - [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>; -} - -//===- Integer Shift Instructions - 3 Operand Form ------------------------===// -multiclass PTX_INT3ntnc<string opcstr, SDNode opnode> { - def rr16 : InstPTX<(outs RegI16:$d), - (ins RegI16:$a, RegI16:$b), - !strconcat(opcstr, "16\t$d, $a, $b"), - [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>; - def rr32 : InstPTX<(outs RegI32:$d), - (ins RegI32:$a, RegI32:$b), - !strconcat(opcstr, "32\t$d, $a, $b"), - [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>; - def rr64 : InstPTX<(outs RegI64:$d), - (ins RegI64:$a, RegI64:$b), - !strconcat(opcstr, "64\t$d, $a, $b"), - [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>; - def ri16 : InstPTX<(outs RegI16:$d), - (ins RegI16:$a, i16imm:$b), - !strconcat(opcstr, "16\t$d, $a, $b"), - [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>; - def ri32 : InstPTX<(outs RegI32:$d), - (ins RegI32:$a, i32imm:$b), - !strconcat(opcstr, "32\t$d, $a, $b"), - [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>; - def ri64 : InstPTX<(outs RegI64:$d), - (ins RegI64:$a, i64imm:$b), - !strconcat(opcstr, "64\t$d, $a, $b"), - [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>; - def ir16 : InstPTX<(outs RegI16:$d), - (ins i16imm:$a, 
RegI16:$b), - !strconcat(opcstr, "16\t$d, $a, $b"), - [(set RegI16:$d, (opnode imm:$a, RegI16:$b))]>; - def ir32 : InstPTX<(outs RegI32:$d), - (ins i32imm:$a, RegI32:$b), - !strconcat(opcstr, "32\t$d, $a, $b"), - [(set RegI32:$d, (opnode imm:$a, RegI32:$b))]>; - def ir64 : InstPTX<(outs RegI64:$d), - (ins i64imm:$a, RegI64:$b), - !strconcat(opcstr, "64\t$d, $a, $b"), - [(set RegI64:$d, (opnode imm:$a, RegI64:$b))]>; -} - -//===- Set Predicate Instructions (Int) - 3/4 Operand Forms ---------------===// -multiclass PTX_SETP_I<RegisterClass RC, string regclsname, Operand immcls, - CondCode cmp, string cmpstr> { - // TODO support 5-operand format: p|q, a, b, c - - def rr - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b), - !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"), - [(set RegPred:$p, (setcc RC:$a, RC:$b, cmp))]>; - def ri - : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b), - !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"), - [(set RegPred:$p, (setcc RC:$a, imm:$b, cmp))]>; - - def rr_and_r - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".and.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (and (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>; - def ri_and_r - : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".and.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (and (setcc RC:$a, imm:$b, cmp), - RegPred:$c))]>; - def rr_or_r - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".or.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (or (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>; - def ri_or_r - : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".or.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (or (setcc RC:$a, imm:$b, cmp), RegPred:$c))]>; - def rr_xor_r - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".xor.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>; - def ri_xor_r - : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".xor.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (xor (setcc RC:$a, imm:$b, cmp), - RegPred:$c))]>; - - def rr_and_not_r - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".and.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (and (setcc RC:$a, RC:$b, cmp), - (not RegPred:$c)))]>; - def ri_and_not_r - : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".and.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (and (setcc RC:$a, imm:$b, cmp), - (not RegPred:$c)))]>; - def rr_or_not_r - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".or.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (or (setcc RC:$a, RC:$b, cmp), - (not RegPred:$c)))]>; - def ri_or_not_r - : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".or.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (or (setcc RC:$a, imm:$b, cmp), - (not RegPred:$c)))]>; - def rr_xor_not_r - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".xor.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, cmp), - (not RegPred:$c)))]>; - def ri_xor_not_r - : 
InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".xor.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (xor (setcc RC:$a, imm:$b, cmp), - (not RegPred:$c)))]>; -} - -//===- Set Predicate Instructions (FP) - 3/4 Operand Form -----------------===// -multiclass PTX_SETP_FP<RegisterClass RC, string regclsname, Operand immcls, - CondCode ucmp, CondCode ocmp, string cmpstr> { - // TODO support 5-operand format: p|q, a, b, c - - def rr_u - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b), - !strconcat("setp.", cmpstr, "u.", regclsname, "\t$p, $a, $b"), - [(set RegPred:$p, (setcc RC:$a, RC:$b, ucmp))]>; - def rr_o - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b), - !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"), - [(set RegPred:$p, (setcc RC:$a, RC:$b, ocmp))]>; - - def ri_u - : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b), - !strconcat("setp.", cmpstr, "u.", regclsname, "\t$p, $a, $b"), - [(set RegPred:$p, (setcc RC:$a, fpimm:$b, ucmp))]>; - def ri_o - : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b), - !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"), - [(set RegPred:$p, (setcc RC:$a, fpimm:$b, ocmp))]>; - - def rr_and_r_u - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, "u.and.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ucmp), - RegPred:$c))]>; - def rr_and_r_o - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".and.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ocmp), - RegPred:$c))]>; - - def rr_or_r_u - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, "u.or.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ucmp), RegPred:$c))]>; - def rr_or_r_o - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".or.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ocmp), RegPred:$c))]>; - - def rr_xor_r_u - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, "u.xor.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ucmp), - RegPred:$c))]>; - def rr_xor_r_o - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".xor.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ocmp), - RegPred:$c))]>; - - def rr_and_not_r_u - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, "u.and.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ucmp), - (not RegPred:$c)))]>; - def rr_and_not_r_o - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".and.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ocmp), - (not RegPred:$c)))]>; - - def rr_or_not_r_u - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, "u.or.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ucmp), - (not RegPred:$c)))]>; - def rr_or_not_r_o - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".or.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ocmp), - (not RegPred:$c)))]>; - - def rr_xor_not_r_u - : InstPTX<(outs RegPred:$p), 
(ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, "u.xor.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ucmp), - (not RegPred:$c)))]>; - def rr_xor_not_r_o - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".xor.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ocmp), - (not RegPred:$c)))]>; -} - -//===- Select Predicate Instructions - 4 Operand Form ---------------------===// -multiclass PTX_SELP<RegisterClass RC, string regclsname, Operand immcls, - SDNode immnode> { - def rr - : InstPTX<(outs RC:$r), (ins RegPred:$a, RC:$b, RC:$c), - !strconcat("selp.", regclsname, "\t$r, $b, $c, $a"), - [(set RC:$r, (select RegPred:$a, RC:$b, RC:$c))]>; - def ri - : InstPTX<(outs RC:$r), (ins RegPred:$a, RC:$b, immcls:$c), - !strconcat("selp.", regclsname, "\t$r, $b, $c, $a"), - [(set RC:$r, (select RegPred:$a, RC:$b, immnode:$c))]>; - def ii - : InstPTX<(outs RC:$r), (ins RegPred:$a, immcls:$b, immcls:$c), - !strconcat("selp.", regclsname, "\t$r, $b, $c, $a"), - [(set RC:$r, (select RegPred:$a, immnode:$b, immnode:$c))]>; -} - - - -//===----------------------------------------------------------------------===// -// Instructions -//===----------------------------------------------------------------------===// - -///===- Integer Arithmetic Instructions -----------------------------------===// - -defm ADD : PTX_INT3<"add", add>; -defm SUB : PTX_INT3<"sub", sub>; -defm MUL : PTX_INT3<"mul.lo", mul>; // FIXME: Allow 32x32 -> 64 multiplies -defm DIV : PTX_INT3<"div", udiv>; -defm SDIV : PTX_INT3_SIGNED<"div", sdiv>; -defm REM : PTX_INT3<"rem", urem>; - -///===- Floating-Point Arithmetic Instructions ----------------------------===// - -// FNEG -defm FNEG : PTX_FLOAT_2OP<"neg">; - -// Standard Binary Operations -defm FADD : PTX_FLOAT_3OP<"add">; -defm FSUB : PTX_FLOAT_3OP<"sub">; -defm FMUL : PTX_FLOAT_3OP<"mul">; -defm FDIV : PTX_FLOAT_3OP<"div">; - -// Multi-operation hybrid instructions -defm FMAD : PTX_FLOAT_4OP<"mad">, Requires<[SupportsFMA]>; - - -///===- Floating-Point Intrinsic Instructions -----------------------------===// - -// SQRT -def FSQRTrr32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF32:$a), - "sqrt$r.f32\t$d, $a", []>; -def FSQRTri32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, f32imm:$a), - "sqrt$r.f32\t$d, $a", []>; -def FSQRTrr64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegF64:$a), - "sqrt$r.f64\t$d, $a", []>; -def FSQRTri64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, f64imm:$a), - "sqrt$r.f64\t$d, $a", []>; - -// SIN -def FSINrr32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF32:$a), - "sin$r.f32\t$d, $a", []>; -def FSINri32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, f32imm:$a), - "sin$r.f32\t$d, $a", []>; -def FSINrr64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegF64:$a), - "sin$r.f64\t$d, $a", []>; -def FSINri64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, f64imm:$a), - "sin$r.f64\t$d, $a", []>; - -// COS -def FCOSrr32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF32:$a), - "cos$r.f32\t$d, $a", []>; -def FCOSri32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, f32imm:$a), - "cos$r.f32\t$d, $a", []>; -def FCOSrr64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegF64:$a), - "cos$r.f64\t$d, $a", []>; -def FCOSri64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, f64imm:$a), - "cos$r.f64\t$d, $a", []>; - - - - -///===- Comparison and Selection Instructions -----------------------------===// - -// .setp - -// Compare u16 - -defm SETPEQu16 : 
PTX_SETP_I<RegI16, "u16", i16imm, SETEQ, "eq">; -defm SETPNEu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETNE, "ne">; -defm SETPLTu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETULT, "lt">; -defm SETPLEu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETULE, "le">; -defm SETPGTu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETUGT, "gt">; -defm SETPGEu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETUGE, "ge">; -defm SETPLTs16 : PTX_SETP_I<RegI16, "s16", i16imm, SETLT, "lt">; -defm SETPLEs16 : PTX_SETP_I<RegI16, "s16", i16imm, SETLE, "le">; -defm SETPGTs16 : PTX_SETP_I<RegI16, "s16", i16imm, SETGT, "gt">; -defm SETPGEs16 : PTX_SETP_I<RegI16, "s16", i16imm, SETGE, "ge">; - -// Compare u32 - -defm SETPEQu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETEQ, "eq">; -defm SETPNEu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETNE, "ne">; -defm SETPLTu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETULT, "lt">; -defm SETPLEu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETULE, "le">; -defm SETPGTu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETUGT, "gt">; -defm SETPGEu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETUGE, "ge">; -defm SETPLTs32 : PTX_SETP_I<RegI32, "s32", i32imm, SETLT, "lt">; -defm SETPLEs32 : PTX_SETP_I<RegI32, "s32", i32imm, SETLE, "le">; -defm SETPGTs32 : PTX_SETP_I<RegI32, "s32", i32imm, SETGT, "gt">; -defm SETPGEs32 : PTX_SETP_I<RegI32, "s32", i32imm, SETGE, "ge">; - -// Compare u64 - -defm SETPEQu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETEQ, "eq">; -defm SETPNEu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETNE, "ne">; -defm SETPLTu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETULT, "lt">; -defm SETPLEu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETULE, "le">; -defm SETPGTu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETUGT, "gt">; -defm SETPGEu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETUGE, "ge">; -defm SETPLTs64 : PTX_SETP_I<RegI64, "s64", i64imm, SETLT, "lt">; -defm SETPLEs64 : PTX_SETP_I<RegI64, "s64", i64imm, SETLE, "le">; -defm SETPGTs64 : PTX_SETP_I<RegI64, "s64", i64imm, SETGT, "gt">; -defm SETPGEs64 : PTX_SETP_I<RegI64, "s64", i64imm, SETGE, "ge">; - -// Compare f32 - -defm SETPEQf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUEQ, SETOEQ, "eq">; -defm SETPNEf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUNE, SETONE, "ne">; -defm SETPLTf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETULT, SETOLT, "lt">; -defm SETPLEf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETULE, SETOLE, "le">; -defm SETPGTf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUGT, SETOGT, "gt">; -defm SETPGEf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUGE, SETOGE, "ge">; - -// Compare f64 - -defm SETPEQf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUEQ, SETOEQ, "eq">; -defm SETPNEf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUNE, SETONE, "ne">; -defm SETPLTf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETULT, SETOLT, "lt">; -defm SETPLEf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETULE, SETOLE, "le">; -defm SETPGTf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUGT, SETOGT, "gt">; -defm SETPGEf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUGE, SETOGE, "ge">; - -// .selp - -defm SELPi16 : PTX_SELP<RegI16, "u16", i16imm, imm>; -defm SELPi32 : PTX_SELP<RegI32, "u32", i32imm, imm>; -defm SELPi64 : PTX_SELP<RegI64, "u64", i64imm, imm>; -defm SELPf32 : PTX_SELP<RegF32, "f32", f32imm, fpimm>; -defm SELPf64 : PTX_SELP<RegF64, "f64", f64imm, fpimm>; - -///===- Logic and Shift Instructions --------------------------------------===// - -defm SHL : PTX_INT3ntnc<"shl.b", PTXshl>; -defm SRL : PTX_INT3ntnc<"shr.u", PTXsrl>; -defm SRA : PTX_INT3ntnc<"shr.s", PTXsra>; - -defm AND : PTX_LOGIC<"and", and>; -defm OR : PTX_LOGIC<"or", or>; -defm XOR : 
PTX_LOGIC<"xor", xor>; - -///===- Data Movement and Conversion Instructions -------------------------===// - -// any_extend -// Implement the anyext instruction in terms of the PTX cvt instructions. -//def : Pat<(i32 (anyext RegI16:$a)), (CVT_u32_u16 RegI16:$a)>; -//def : Pat<(i64 (anyext RegI16:$a)), (CVT_u64_u16 RegI16:$a)>; -//def : Pat<(i64 (anyext RegI32:$a)), (CVT_u64_u32 RegI32:$a)>; - -// bitconvert -// These instructions implement the bit-wise conversion between integer and -// floating-point types. -def MOVi32f32 - : InstPTX<(outs RegI32:$d), (ins RegF32:$a), "mov.b32\t$d, $a", []>; -def MOVf32i32 - : InstPTX<(outs RegF32:$d), (ins RegI32:$a), "mov.b32\t$d, $a", []>; -def MOVi64f64 - : InstPTX<(outs RegI64:$d), (ins RegF64:$a), "mov.b64\t$d, $a", []>; -def MOVf64i64 - : InstPTX<(outs RegF64:$d), (ins RegI64:$a), "mov.b64\t$d, $a", []>; - -let neverHasSideEffects = 1 in { - def MOVPREDrr - : InstPTX<(outs RegPred:$d), (ins RegPred:$a), "mov.pred\t$d, $a", []>; - def MOVU16rr - : InstPTX<(outs RegI16:$d), (ins RegI16:$a), "mov.u16\t$d, $a", []>; - def MOVU32rr - : InstPTX<(outs RegI32:$d), (ins RegI32:$a), "mov.u32\t$d, $a", []>; - def MOVU64rr - : InstPTX<(outs RegI64:$d), (ins RegI64:$a), "mov.u64\t$d, $a", []>; - def MOVF32rr - : InstPTX<(outs RegF32:$d), (ins RegF32:$a), "mov.f32\t$d, $a", []>; - def MOVF64rr - : InstPTX<(outs RegF64:$d), (ins RegF64:$a), "mov.f64\t$d, $a", []>; -} - -let isReMaterializable = 1, isAsCheapAsAMove = 1 in { - def MOVPREDri - : InstPTX<(outs RegPred:$d), (ins i1imm:$a), "mov.pred\t$d, $a", - [(set RegPred:$d, imm:$a)]>; - def MOVU16ri - : InstPTX<(outs RegI16:$d), (ins i16imm:$a), "mov.u16\t$d, $a", - [(set RegI16:$d, imm:$a)]>; - def MOVU32ri - : InstPTX<(outs RegI32:$d), (ins i32imm:$a), "mov.u32\t$d, $a", - [(set RegI32:$d, imm:$a)]>; - def MOVU64ri - : InstPTX<(outs RegI64:$d), (ins i64imm:$a), "mov.u64\t$d, $a", - [(set RegI64:$d, imm:$a)]>; - def MOVF32ri - : InstPTX<(outs RegF32:$d), (ins f32imm:$a), "mov.f32\t$d, $a", - [(set RegF32:$d, fpimm:$a)]>; - def MOVF64ri - : InstPTX<(outs RegF64:$d), (ins f64imm:$a), "mov.f64\t$d, $a", - [(set RegF64:$d, fpimm:$a)]>; -} - -let isReMaterializable = 1, isAsCheapAsAMove = 1 in { - def MOVaddr32 - : InstPTX<(outs RegI32:$d), (ins i32imm:$a), "mov.u32\t$d, $a", - [(set RegI32:$d, (PTXcopyaddress tglobaladdr:$a))]>; - def MOVaddr64 - : InstPTX<(outs RegI64:$d), (ins i64imm:$a), "mov.u64\t$d, $a", - [(set RegI64:$d, (PTXcopyaddress tglobaladdr:$a))]>; - def MOVframe32 - : InstPTX<(outs RegI32:$d), (ins i32imm:$a), "cvta.local.u32\t$d, $a", - [(set RegI32:$d, (PTXcopyaddress frameindex:$a))]>; - def MOVframe64 - : InstPTX<(outs RegI64:$d), (ins i64imm:$a), "cvta.local.u64\t$d, $a", - [(set RegI64:$d, (PTXcopyaddress frameindex:$a))]>; -} - -// PTX cvt instructions -// Note all of these may actually be used, we just define all possible patterns -// here (that make sense). -// FIXME: Can we collapse this somehow into a multiclass def? 
- -// To i16 -def CVTu16u32 - : InstPTX<(outs RegI16:$d), (ins RegI32:$a), "cvt.u16.u32\t$d, $a", []>; -def CVTu16u64 - : InstPTX<(outs RegI16:$d), (ins RegI64:$a), "cvt.u16.u64\t$d, $a", []>; -def CVTu16f32 - : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF32:$a), - "cvt$r.u16.f32\t$d, $a", []>; -def CVTs16f32 - : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF32:$a), - "cvt$r.s16.f32\t$d, $a", []>; -def CVTu16f64 - : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF64:$a), - "cvt$r.u16.f64\t$d, $a", []>; -def CVTs16f64 - : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF64:$a), - "cvt$r.s16.f64\t$d, $a", []>; - -// To i32 -def CVTu32u16 - : InstPTX<(outs RegI32:$d), (ins RegI16:$a), "cvt.u32.u16\t$d, $a", []>; -def CVTs32s16 - : InstPTX<(outs RegI32:$d), (ins RegI16:$a), "cvt.s32.s16\t$d, $a", []>; -def CVTu32u64 - : InstPTX<(outs RegI32:$d), (ins RegI64:$a), "cvt.u32.u64\t$d, $a", []>; -def CVTu32f32 - : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF32:$a), - "cvt$r.u32.f32\t$d, $a", []>; -def CVTs32f32 - : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF32:$a), - "cvt$r.s32.f32\t$d, $a", []>; -def CVTu32f64 - : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF64:$a), - "cvt$r.u32.f64\t$d, $a", []>; -def CVTs32f64 - : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF64:$a), - "cvt$r.s32.f64\t$d, $a", []>; - -// To i64 -def CVTu64u16 - : InstPTX<(outs RegI64:$d), (ins RegI16:$a), "cvt.u64.u16\t$d, $a", []>; -def CVTs64s16 - : InstPTX<(outs RegI64:$d), (ins RegI16:$a), "cvt.s64.s16\t$d, $a", []>; -def CVTu64u32 - : InstPTX<(outs RegI64:$d), (ins RegI32:$a), "cvt.u64.u32\t$d, $a", []>; -def CVTs64s32 - : InstPTX<(outs RegI64:$d), (ins RegI32:$a), "cvt.s64.s32\t$d, $a", []>; -def CVTu64f32 - : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF32:$a), - "cvt$r.u64.f32\t$d, $a", []>; -def CVTs64f32 - : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF32:$a), - "cvt$r.s64.f32\t$d, $a", []>; -def CVTu64f64 - : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF64:$a), - "cvt$r.u64.f64\t$d, $a", []>; -def CVTs64f64 - : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF64:$a), - "cvt$r.s64.f64\t$d, $a", []>; - -// To f32 -def CVTf32u16 - : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI16:$a), - "cvt$r.f32.u16\t$d, $a", []>; -def CVTf32s16 - : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI16:$a), - "cvt$r.f32.s16\t$d, $a", []>; -def CVTf32u32 - : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI32:$a), - "cvt$r.f32.u32\t$d, $a", []>; -def CVTf32s32 - : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI32:$a), - "cvt$r.f32.s32\t$d, $a", []>; -def CVTf32u64 - : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI64:$a), - "cvt$r.f32.u64\t$d, $a", []>; -def CVTf32s64 - : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI64:$a), - "cvt$r.f32.s64\t$d, $a", []>; -def CVTf32f64 - : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF64:$a), - "cvt$r.f32.f64\t$d, $a", []>; - -// To f64 -def CVTf64u16 - : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI16:$a), - "cvt$r.f64.u16\t$d, $a", []>; -def CVTf64s16 - : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI16:$a), - "cvt$r.f64.s16\t$d, $a", []>; -def CVTf64u32 - : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI32:$a), - "cvt$r.f64.u32\t$d, $a", []>; -def CVTf64s32 - : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI32:$a), - "cvt$r.f64.s32\t$d, $a", []>; -def CVTf64u64 - : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI64:$a), - "cvt$r.f64.u64\t$d, $a", []>; -def CVTf64s64 - : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI64:$a), - "cvt$r.f64.s64\t$d, $a", []>; -def CVTf64f32 - : 
InstPTX<(outs RegF64:$d), (ins RegF32:$a), "cvt.f64.f32\t$d, $a", []>; - - ///===- Control Flow Instructions -----------------------------------------===// - -let isBranch = 1, isTerminator = 1, isBarrier = 1 in { - def BRAd - : InstPTX<(outs), (ins brtarget:$d), "bra\t$d", [(br bb:$d)]>; -} - -let isBranch = 1, isTerminator = 1 in { - // FIXME: The pattern part is blank because I cannot (or do not yet know - // how to) use the first operand of PredicateOperand (a RegPred register) here - // When this is revisited, make sure to also look at LowerSETCC and try to - // fold it into negated predicates, if possible. - def BRAdp - : InstPTX<(outs), (ins brtarget:$d), "bra\t$d", - [/*(brcond pred:$_p, bb:$d)*/]>; -} - -let isReturn = 1, isTerminator = 1, isBarrier = 1 in { - def EXIT : InstPTX<(outs), (ins), "exit", [(PTXexit)]>; - def RET : InstPTX<(outs), (ins), "ret", [(PTXret)]>; -} - -let hasSideEffects = 1 in { - def CALL : InstPTX<(outs), (ins), "call", [(PTXcall)]>; -} - -///===- Parameter Passing Pseudo-Instructions -----------------------------===// - -def READPARAMPRED : InstPTX<(outs RegPred:$a), (ins i32imm:$b), - "mov.pred\t$a, %arg$b", []>; -def READPARAMI16 : InstPTX<(outs RegI16:$a), (ins i32imm:$b), - "mov.b16\t$a, %arg$b", []>; -def READPARAMI32 : InstPTX<(outs RegI32:$a), (ins i32imm:$b), - "mov.b32\t$a, %arg$b", []>; -def READPARAMI64 : InstPTX<(outs RegI64:$a), (ins i32imm:$b), - "mov.b64\t$a, %arg$b", []>; -def READPARAMF32 : InstPTX<(outs RegF32:$a), (ins i32imm:$b), - "mov.f32\t$a, %arg$b", []>; -def READPARAMF64 : InstPTX<(outs RegF64:$a), (ins i32imm:$b), - "mov.f64\t$a, %arg$b", []>; - -def WRITEPARAMPRED : InstPTX<(outs), (ins RegPred:$a), "//w", []>; -def WRITEPARAMI16 : InstPTX<(outs), (ins RegI16:$a), "//w", []>; -def WRITEPARAMI32 : InstPTX<(outs), (ins RegI32:$a), "//w", []>; -def WRITEPARAMI64 : InstPTX<(outs), (ins RegI64:$a), "//w", []>; -def WRITEPARAMF32 : InstPTX<(outs), (ins RegF32:$a), "//w", []>; -def WRITEPARAMF64 : InstPTX<(outs), (ins RegF64:$a), "//w", []>; - - -//===----------------------------------------------------------------------===// -// Instruction Selection Patterns -//===----------------------------------------------------------------------===// - -// FADD -def : Pat<(f32 (fadd RegF32:$a, RegF32:$b)), - (FADDrr32 RndDefault, RegF32:$a, RegF32:$b)>; -def : Pat<(f32 (fadd RegF32:$a, fpimm:$b)), - (FADDri32 RndDefault, RegF32:$a, fpimm:$b)>; -def : Pat<(f64 (fadd RegF64:$a, RegF64:$b)), - (FADDrr64 RndDefault, RegF64:$a, RegF64:$b)>; -def : Pat<(f64 (fadd RegF64:$a, fpimm:$b)), - (FADDri64 RndDefault, RegF64:$a, fpimm:$b)>; - -// FSUB -def : Pat<(f32 (fsub RegF32:$a, RegF32:$b)), - (FSUBrr32 RndDefault, RegF32:$a, RegF32:$b)>; -def : Pat<(f32 (fsub RegF32:$a, fpimm:$b)), - (FSUBri32 RndDefault, RegF32:$a, fpimm:$b)>; -def : Pat<(f64 (fsub RegF64:$a, RegF64:$b)), - (FSUBrr64 RndDefault, RegF64:$a, RegF64:$b)>; -def : Pat<(f64 (fsub RegF64:$a, fpimm:$b)), - (FSUBri64 RndDefault, RegF64:$a, fpimm:$b)>; - -// FMUL -def : Pat<(f32 (fmul RegF32:$a, RegF32:$b)), - (FMULrr32 RndDefault, RegF32:$a, RegF32:$b)>; -def : Pat<(f32 (fmul RegF32:$a, fpimm:$b)), - (FMULri32 RndDefault, RegF32:$a, fpimm:$b)>; -def : Pat<(f64 (fmul RegF64:$a, RegF64:$b)), - (FMULrr64 RndDefault, RegF64:$a, RegF64:$b)>; -def : Pat<(f64 (fmul RegF64:$a, fpimm:$b)), - (FMULri64 RndDefault, RegF64:$a, fpimm:$b)>; - -// FDIV -def : Pat<(f32 (fdiv RegF32:$a, RegF32:$b)), - (FDIVrr32 RndDefault, RegF32:$a, RegF32:$b)>; -def : Pat<(f32 (fdiv RegF32:$a, fpimm:$b)), - (FDIVri32 
RndDefault, RegF32:$a, fpimm:$b)>; -def : Pat<(f64 (fdiv RegF64:$a, RegF64:$b)), - (FDIVrr64 RndDefault, RegF64:$a, RegF64:$b)>; -def : Pat<(f64 (fdiv RegF64:$a, fpimm:$b)), - (FDIVri64 RndDefault, RegF64:$a, fpimm:$b)>; - -// FMUL+FADD -def : Pat<(f32 (fadd (fmul RegF32:$a, RegF32:$b), RegF32:$c)), - (FMADrrr32 RndDefault, RegF32:$a, RegF32:$b, RegF32:$c)>, - Requires<[SupportsFMA]>; -def : Pat<(f32 (fadd (fmul RegF32:$a, RegF32:$b), fpimm:$c)), - (FMADrri32 RndDefault, RegF32:$a, RegF32:$b, fpimm:$c)>, - Requires<[SupportsFMA]>; -def : Pat<(f32 (fadd (fmul RegF32:$a, fpimm:$b), fpimm:$c)), - (FMADrrr32 RndDefault, RegF32:$a, fpimm:$b, fpimm:$c)>, - Requires<[SupportsFMA]>; -def : Pat<(f32 (fadd (fmul RegF32:$a, RegF32:$b), fpimm:$c)), - (FMADrri32 RndDefault, RegF32:$a, RegF32:$b, fpimm:$c)>, - Requires<[SupportsFMA]>; -def : Pat<(f64 (fadd (fmul RegF64:$a, RegF64:$b), RegF64:$c)), - (FMADrrr64 RndDefault, RegF64:$a, RegF64:$b, RegF64:$c)>, - Requires<[SupportsFMA]>; -def : Pat<(f64 (fadd (fmul RegF64:$a, RegF64:$b), fpimm:$c)), - (FMADrri64 RndDefault, RegF64:$a, RegF64:$b, fpimm:$c)>, - Requires<[SupportsFMA]>; -def : Pat<(f64 (fadd (fmul RegF64:$a, fpimm:$b), fpimm:$c)), - (FMADrri64 RndDefault, RegF64:$a, fpimm:$b, fpimm:$c)>, - Requires<[SupportsFMA]>; - -// FNEG -def : Pat<(f32 (fneg RegF32:$a)), (FNEGrr32 RndDefault, RegF32:$a)>; -def : Pat<(f32 (fneg fpimm:$a)), (FNEGri32 RndDefault, fpimm:$a)>; -def : Pat<(f64 (fneg RegF64:$a)), (FNEGrr64 RndDefault, RegF64:$a)>; -def : Pat<(f64 (fneg fpimm:$a)), (FNEGri64 RndDefault, fpimm:$a)>; - -// FSQRT -def : Pat<(f32 (fsqrt RegF32:$a)), (FSQRTrr32 RndDefault, RegF32:$a)>; -def : Pat<(f32 (fsqrt fpimm:$a)), (FSQRTri32 RndDefault, fpimm:$a)>; -def : Pat<(f64 (fsqrt RegF64:$a)), (FSQRTrr64 RndDefault, RegF64:$a)>; -def : Pat<(f64 (fsqrt fpimm:$a)), (FSQRTri64 RndDefault, fpimm:$a)>; - -// FSIN -def : Pat<(f32 (fsin RegF32:$a)), (FSINrr32 RndDefault, RegF32:$a)>; -def : Pat<(f32 (fsin fpimm:$a)), (FSINri32 RndDefault, fpimm:$a)>; -def : Pat<(f64 (fsin RegF64:$a)), (FSINrr64 RndDefault, RegF64:$a)>; -def : Pat<(f64 (fsin fpimm:$a)), (FSINri64 RndDefault, fpimm:$a)>; - -// FCOS -def : Pat<(f32 (fcos RegF32:$a)), (FCOSrr32 RndDefault, RegF32:$a)>; -def : Pat<(f32 (fcos fpimm:$a)), (FCOSri32 RndDefault, fpimm:$a)>; -def : Pat<(f64 (fcos RegF64:$a)), (FCOSrr64 RndDefault, RegF64:$a)>; -def : Pat<(f64 (fcos fpimm:$a)), (FCOSri64 RndDefault, fpimm:$a)>; - -// Type conversion notes: -// - PTX does not directly support converting a predicate to a value, so we -// use a select instruction to select either 0 or 1 (integer or fp) based -// on the truth value of the predicate. -// - PTX does not directly support converting to a predicate type, so we fake it -// by performing a greater-than test between the value and zero. This follows -// the C convention that any non-zero value is equivalent to 'true'. 
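The two idioms condensed, with the PTX each pattern expands to (per the setp and selp asm strings defined earlier) noted alongside; both entries also appear verbatim in the conversion lists that follow:

// value -> pred: compare against zero, e.g. an i32 truncation to i1,
// which emits "setp.gt.u32 $p, $a, 0".
def : Pat<(i1 (trunc RegI32:$a)), (SETPGTu32ri RegI32:$a, 0)>;
// pred -> value: select between two immediates, e.g. an i1 zext to i32,
// which emits "selp.u32 $r, 1, 0, $a".
def : Pat<(i32 (zext RegPred:$a)), (SELPi32ii RegPred:$a, 1, 0)>;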
- -// Conversion to pred -def : Pat<(i1 (trunc RegI16:$a)), (SETPGTu16ri RegI16:$a, 0)>; -def : Pat<(i1 (trunc RegI32:$a)), (SETPGTu32ri RegI32:$a, 0)>; -def : Pat<(i1 (trunc RegI64:$a)), (SETPGTu64ri RegI64:$a, 0)>; -def : Pat<(i1 (fp_to_uint RegF32:$a)), (SETPGTu32ri (MOVi32f32 RegF32:$a), 0)>; -def : Pat<(i1 (fp_to_uint RegF64:$a)), (SETPGTu64ri (MOVi64f64 RegF64:$a), 0)>; - -// Conversion to u16 -def : Pat<(i16 (anyext RegPred:$a)), (SELPi16ii RegPred:$a, 1, 0)>; -def : Pat<(i16 (sext RegPred:$a)), (SELPi16ii RegPred:$a, 0xFFFF, 0)>; -def : Pat<(i16 (zext RegPred:$a)), (SELPi16ii RegPred:$a, 1, 0)>; -def : Pat<(i16 (trunc RegI32:$a)), (CVTu16u32 RegI32:$a)>; -def : Pat<(i16 (trunc RegI64:$a)), (CVTu16u64 RegI64:$a)>; -def : Pat<(i16 (fp_to_uint RegF32:$a)), (CVTu16f32 RndDefault, RegF32:$a)>; -def : Pat<(i16 (fp_to_sint RegF32:$a)), (CVTs16f32 RndDefault, RegF32:$a)>; -def : Pat<(i16 (fp_to_uint RegF64:$a)), (CVTu16f64 RndDefault, RegF64:$a)>; -def : Pat<(i16 (fp_to_sint RegF64:$a)), (CVTs16f64 RndDefault, RegF64:$a)>; - -// Conversion to u32 -def : Pat<(i32 (anyext RegPred:$a)), (SELPi32ii RegPred:$a, 1, 0)>; -def : Pat<(i32 (sext RegPred:$a)), (SELPi32ii RegPred:$a, 0xFFFFFFFF, 0)>; -def : Pat<(i32 (zext RegPred:$a)), (SELPi32ii RegPred:$a, 1, 0)>; -def : Pat<(i32 (anyext RegI16:$a)), (CVTu32u16 RegI16:$a)>; -def : Pat<(i32 (sext RegI16:$a)), (CVTs32s16 RegI16:$a)>; -def : Pat<(i32 (zext RegI16:$a)), (CVTu32u16 RegI16:$a)>; -def : Pat<(i32 (trunc RegI64:$a)), (CVTu32u64 RegI64:$a)>; -def : Pat<(i32 (fp_to_uint RegF32:$a)), (CVTu32f32 RndDefault, RegF32:$a)>; -def : Pat<(i32 (fp_to_sint RegF32:$a)), (CVTs32f32 RndDefault, RegF32:$a)>; -def : Pat<(i32 (fp_to_uint RegF64:$a)), (CVTu32f64 RndDefault, RegF64:$a)>; -def : Pat<(i32 (fp_to_sint RegF64:$a)), (CVTs32f64 RndDefault, RegF64:$a)>; -def : Pat<(i32 (bitconvert RegF32:$a)), (MOVi32f32 RegF32:$a)>; - -// Conversion to u64 -def : Pat<(i64 (anyext RegPred:$a)), (SELPi64ii RegPred:$a, 1, 0)>; -def : Pat<(i64 (sext RegPred:$a)), (SELPi64ii RegPred:$a, - 0xFFFFFFFFFFFFFFFF, 0)>; -def : Pat<(i64 (zext RegPred:$a)), (SELPi64ii RegPred:$a, 1, 0)>; -def : Pat<(i64 (anyext RegI16:$a)), (CVTu64u16 RegI16:$a)>; -def : Pat<(i64 (sext RegI16:$a)), (CVTs64s16 RegI16:$a)>; -def : Pat<(i64 (zext RegI16:$a)), (CVTu64u16 RegI16:$a)>; -def : Pat<(i64 (anyext RegI32:$a)), (CVTu64u32 RegI32:$a)>; -def : Pat<(i64 (sext RegI32:$a)), (CVTs64s32 RegI32:$a)>; -def : Pat<(i64 (zext RegI32:$a)), (CVTu64u32 RegI32:$a)>; -def : Pat<(i64 (fp_to_uint RegF32:$a)), (CVTu64f32 RndDefault, RegF32:$a)>; -def : Pat<(i64 (fp_to_sint RegF32:$a)), (CVTs64f32 RndDefault, RegF32:$a)>; -def : Pat<(i64 (fp_to_uint RegF64:$a)), (CVTu64f64 RndDefault, RegF64:$a)>; -def : Pat<(i64 (fp_to_sint RegF64:$a)), (CVTs64f64 RndDefault, RegF64:$a)>; -def : Pat<(i64 (bitconvert RegF64:$a)), (MOVi64f64 RegF64:$a)>; - -// Conversion to f32 -def : Pat<(f32 (uint_to_fp RegPred:$a)), (SELPf32rr RegPred:$a, - (MOVf32i32 0x3F800000), (MOVf32i32 0))>; -def : Pat<(f32 (uint_to_fp RegI16:$a)), (CVTf32u16 RndDefault, RegI16:$a)>; -def : Pat<(f32 (sint_to_fp RegI16:$a)), (CVTf32s16 RndDefault, RegI16:$a)>; -def : Pat<(f32 (uint_to_fp RegI32:$a)), (CVTf32u32 RndDefault, RegI32:$a)>; -def : Pat<(f32 (sint_to_fp RegI32:$a)), (CVTf32s32 RndDefault, RegI32:$a)>; -def : Pat<(f32 (uint_to_fp RegI64:$a)), (CVTf32u64 RndDefault, RegI64:$a)>; -def : Pat<(f32 (sint_to_fp RegI64:$a)), (CVTf32s64 RndDefault, RegI64:$a)>; -def : Pat<(f32 (fround RegF64:$a)), (CVTf32f64 RndDefault, RegF64:$a)>; -def : Pat<(f32 
(bitconvert RegI32:$a)), (MOVf32i32 RegI32:$a)>; - -// Conversion to f64 -def : Pat<(f64 (uint_to_fp RegPred:$a)), (SELPf64rr RegPred:$a, - (MOVf64i64 0x3F80000000000000), (MOVf64i64 0))>; -def : Pat<(f64 (uint_to_fp RegI16:$a)), (CVTf64u16 RndDefault, RegI16:$a)>; -def : Pat<(f64 (sint_to_fp RegI16:$a)), (CVTf64s16 RndDefault, RegI16:$a)>; -def : Pat<(f64 (uint_to_fp RegI32:$a)), (CVTf64u32 RndDefault, RegI32:$a)>; -def : Pat<(f64 (sint_to_fp RegI32:$a)), (CVTf64s32 RndDefault, RegI32:$a)>; -def : Pat<(f64 (uint_to_fp RegI64:$a)), (CVTf64u64 RndDefault, RegI64:$a)>; -def : Pat<(f64 (sint_to_fp RegI64:$a)), (CVTf64s64 RndDefault, RegI64:$a)>; -def : Pat<(f64 (fextend RegF32:$a)), (CVTf64f32 RegF32:$a)>; -def : Pat<(f64 (bitconvert RegI64:$a)), (MOVf64i64 RegI64:$a)>; - -// setcc - predicate inversion for branch conditions -def : Pat<(i1 (setcc RegPred:$a, imm:$b, SETNE)), - (XORripreds RegPred:$a, imm:$b)>; - -///===- Intrinsic Instructions --------------------------------------------===// -include "PTXIntrinsicInstrInfo.td" - -///===- Load/Store Instructions -------------------------------------------===// -include "PTXInstrLoadStore.td" - diff --git a/lib/Target/PTX/PTXInstrLoadStore.td b/lib/Target/PTX/PTXInstrLoadStore.td deleted file mode 100644 index 7a62684..0000000 --- a/lib/Target/PTX/PTXInstrLoadStore.td +++ /dev/null @@ -1,278 +0,0 @@ -//===- PTXInstrLoadStore.td - PTX Load/Store Instruction Defs -*- tablegen-*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file describes the PTX load/store instructions in TableGen format. -// -//===----------------------------------------------------------------------===// - - -// Addressing Predicates -// We have to differentiate between 32- and 64-bit pointer types -def Use32BitAddresses : Predicate<"!getSubtarget().is64Bit()">; -def Use64BitAddresses : Predicate<"getSubtarget().is64Bit()">; - -//===----------------------------------------------------------------------===// -// Pattern Fragments for Loads/Stores -//===----------------------------------------------------------------------===// - -def load_global : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - const Value *Src; - const PointerType *PT; - if ((Src = cast<LoadSDNode>(N)->getSrcValue()) && - (PT = dyn_cast<PointerType>(Src->getType()))) - return PT->getAddressSpace() == PTXStateSpace::Global; - return false; -}]>; - -def load_constant : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - const Value *Src; - const PointerType *PT; - if ((Src = cast<LoadSDNode>(N)->getSrcValue()) && - (PT = dyn_cast<PointerType>(Src->getType()))) - return PT->getAddressSpace() == PTXStateSpace::Constant; - return false; -}]>; - -def load_shared : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - const Value *Src; - const PointerType *PT; - if ((Src = cast<LoadSDNode>(N)->getSrcValue()) && - (PT = dyn_cast<PointerType>(Src->getType()))) - return PT->getAddressSpace() == PTXStateSpace::Shared; - return false; -}]>; - -def store_global - : PatFrag<(ops node:$d, node:$ptr), (store node:$d, node:$ptr), [{ - const Value *Src; - const PointerType *PT; - if ((Src = cast<StoreSDNode>(N)->getSrcValue()) && - (PT = dyn_cast<PointerType>(Src->getType()))) - return PT->getAddressSpace() == PTXStateSpace::Global; - return false; -}]>; - -def store_shared - : PatFrag<(ops node:$d, node:$ptr), (store 
node:$d, node:$ptr), [{ - const Value *Src; - const PointerType *PT; - if ((Src = cast<StoreSDNode>(N)->getSrcValue()) && - (PT = dyn_cast<PointerType>(Src->getType()))) - return PT->getAddressSpace() == PTXStateSpace::Shared; - return false; -}]>; - -// Addressing modes. -def ADDRrr32 : ComplexPattern<i32, 2, "SelectADDRrr", [], []>; -def ADDRrr64 : ComplexPattern<i64, 2, "SelectADDRrr", [], []>; -def ADDRri32 : ComplexPattern<i32, 2, "SelectADDRri", [], []>; -def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri", [], []>; -def ADDRii32 : ComplexPattern<i32, 2, "SelectADDRii", [], []>; -def ADDRii64 : ComplexPattern<i64, 2, "SelectADDRii", [], []>; -def ADDRlocal32 : ComplexPattern<i32, 2, "SelectADDRlocal", [], []>; -def ADDRlocal64 : ComplexPattern<i64, 2, "SelectADDRlocal", [], []>; - -// Address operands -def MEMri32 : Operand<i32> { - let PrintMethod = "printMemOperand"; - let MIOperandInfo = (ops RegI32, i32imm); -} -def MEMri64 : Operand<i64> { - let PrintMethod = "printMemOperand"; - let MIOperandInfo = (ops RegI64, i64imm); -} -def LOCALri32 : Operand<i32> { - let PrintMethod = "printMemOperand"; - let MIOperandInfo = (ops i32imm, i32imm); -} -def LOCALri64 : Operand<i64> { - let PrintMethod = "printMemOperand"; - let MIOperandInfo = (ops i64imm, i64imm); -} -def MEMii32 : Operand<i32> { - let PrintMethod = "printMemOperand"; - let MIOperandInfo = (ops i32imm, i32imm); -} -def MEMii64 : Operand<i64> { - let PrintMethod = "printMemOperand"; - let MIOperandInfo = (ops i64imm, i64imm); -} -// The operand here does not correspond to an actual address, so we -// can use i32 in 64-bit address modes. -def MEMpi : Operand<i32> { - let PrintMethod = "printParamOperand"; - let MIOperandInfo = (ops i32imm); -} -def MEMret : Operand<i32> { - let PrintMethod = "printReturnOperand"; - let MIOperandInfo = (ops i32imm); -} - - -// Load/store .param space -def PTXloadparam - : SDNode<"PTXISD::LOAD_PARAM", SDTypeProfile<1, 1, [SDTCisPtrTy<1>]>, - [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>; -def PTXstoreparam - : SDNode<"PTXISD::STORE_PARAM", SDTypeProfile<0, 2, [SDTCisVT<0, i32>]>, - [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>; - -def PTXreadparam - : SDNode<"PTXISD::READ_PARAM", SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>, - [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>; -def PTXwriteparam - : SDNode<"PTXISD::WRITE_PARAM", SDTypeProfile<0, 1, []>, - [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>; - - - -//===----------------------------------------------------------------------===// -// Classes for loads/stores -//===----------------------------------------------------------------------===// -multiclass PTX_LD<string opstr, string typestr, - RegisterClass RC, PatFrag pat_load> { - def rr32 : InstPTX<(outs RC:$d), - (ins MEMri32:$a), - !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), - [(set RC:$d, (pat_load ADDRrr32:$a))]>, - Requires<[Use32BitAddresses]>; - def rr64 : InstPTX<(outs RC:$d), - (ins MEMri64:$a), - !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), - [(set RC:$d, (pat_load ADDRrr64:$a))]>, - Requires<[Use64BitAddresses]>; - def ri32 : InstPTX<(outs RC:$d), - (ins MEMri32:$a), - !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), - [(set RC:$d, (pat_load ADDRri32:$a))]>, - Requires<[Use32BitAddresses]>; - def ri64 : InstPTX<(outs RC:$d), - (ins MEMri64:$a), - !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), - [(set RC:$d, (pat_load ADDRri64:$a))]>, - Requires<[Use64BitAddresses]>; - def ii32 : InstPTX<(outs RC:$d), - (ins MEMii32:$a), - !strconcat(opstr, !strconcat(typestr, 
"\t$d, [$a]")), - [(set RC:$d, (pat_load ADDRii32:$a))]>, - Requires<[Use32BitAddresses]>; - def ii64 : InstPTX<(outs RC:$d), - (ins MEMii64:$a), - !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), - [(set RC:$d, (pat_load ADDRii64:$a))]>, - Requires<[Use64BitAddresses]>; -} - -multiclass PTX_ST<string opstr, string typestr, RegisterClass RC, - PatFrag pat_store> { - def rr32 : InstPTX<(outs), - (ins RC:$d, MEMri32:$a), - !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), - [(pat_store RC:$d, ADDRrr32:$a)]>, - Requires<[Use32BitAddresses]>; - def rr64 : InstPTX<(outs), - (ins RC:$d, MEMri64:$a), - !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), - [(pat_store RC:$d, ADDRrr64:$a)]>, - Requires<[Use64BitAddresses]>; - def ri32 : InstPTX<(outs), - (ins RC:$d, MEMri32:$a), - !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), - [(pat_store RC:$d, ADDRri32:$a)]>, - Requires<[Use32BitAddresses]>; - def ri64 : InstPTX<(outs), - (ins RC:$d, MEMri64:$a), - !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), - [(pat_store RC:$d, ADDRri64:$a)]>, - Requires<[Use64BitAddresses]>; - def ii32 : InstPTX<(outs), - (ins RC:$d, MEMii32:$a), - !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), - [(pat_store RC:$d, ADDRii32:$a)]>, - Requires<[Use32BitAddresses]>; - def ii64 : InstPTX<(outs), - (ins RC:$d, MEMii64:$a), - !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), - [(pat_store RC:$d, ADDRii64:$a)]>, - Requires<[Use64BitAddresses]>; -} - -multiclass PTX_LOCAL_LD_ST<string typestr, RegisterClass RC> { - def LDri32 : InstPTX<(outs RC:$d), (ins LOCALri32:$a), - !strconcat("ld.local", !strconcat(typestr, "\t$d, [$a]")), - [(set RC:$d, (load_global ADDRlocal32:$a))]>; - def LDri64 : InstPTX<(outs RC:$d), (ins LOCALri64:$a), - !strconcat("ld.local", !strconcat(typestr, "\t$d, [$a]")), - [(set RC:$d, (load_global ADDRlocal64:$a))]>; - def STri32 : InstPTX<(outs), (ins RC:$d, LOCALri32:$a), - !strconcat("st.local", !strconcat(typestr, "\t[$a], $d")), - [(store_global RC:$d, ADDRlocal32:$a)]>; - def STri64 : InstPTX<(outs), (ins RC:$d, LOCALri64:$a), - !strconcat("st.local", !strconcat(typestr, "\t[$a], $d")), - [(store_global RC:$d, ADDRlocal64:$a)]>; -} - -multiclass PTX_PARAM_LD_ST<string typestr, RegisterClass RC> { - let hasSideEffects = 1 in { - def LDpi : InstPTX<(outs RC:$d), (ins i32imm:$a), - !strconcat("ld.param", !strconcat(typestr, "\t$d, [$a]")), - [(set RC:$d, (PTXloadparam texternalsym:$a))]>; - def STpi : InstPTX<(outs), (ins i32imm:$d, RC:$a), - !strconcat("st.param", !strconcat(typestr, "\t[$d], $a")), - [(PTXstoreparam texternalsym:$d, RC:$a)]>; - } -} - -multiclass PTX_LD_ALL<string opstr, PatFrag pat_load> { - defm u16 : PTX_LD<opstr, ".u16", RegI16, pat_load>; - defm u32 : PTX_LD<opstr, ".u32", RegI32, pat_load>; - defm u64 : PTX_LD<opstr, ".u64", RegI64, pat_load>; - defm f32 : PTX_LD<opstr, ".f32", RegF32, pat_load>; - defm f64 : PTX_LD<opstr, ".f64", RegF64, pat_load>; -} - -multiclass PTX_ST_ALL<string opstr, PatFrag pat_store> { - defm u16 : PTX_ST<opstr, ".u16", RegI16, pat_store>; - defm u32 : PTX_ST<opstr, ".u32", RegI32, pat_store>; - defm u64 : PTX_ST<opstr, ".u64", RegI64, pat_store>; - defm f32 : PTX_ST<opstr, ".f32", RegF32, pat_store>; - defm f64 : PTX_ST<opstr, ".f64", RegF64, pat_store>; -} - - - -//===----------------------------------------------------------------------===// -// Instruction definitions for loads/stores -//===----------------------------------------------------------------------===// - -// Global/shared stores -defm STg : 
PTX_ST_ALL<"st.global", store_global>; -defm STs : PTX_ST_ALL<"st.shared", store_shared>; - -// Global/shared/constant loads -defm LDg : PTX_LD_ALL<"ld.global", load_global>; -defm LDc : PTX_LD_ALL<"ld.const", load_constant>; -defm LDs : PTX_LD_ALL<"ld.shared", load_shared>; - -// Param loads/stores -defm PARAMPRED : PTX_PARAM_LD_ST<".pred", RegPred>; -defm PARAMU16 : PTX_PARAM_LD_ST<".u16", RegI16>; -defm PARAMU32 : PTX_PARAM_LD_ST<".u32", RegI32>; -defm PARAMU64 : PTX_PARAM_LD_ST<".u64", RegI64>; -defm PARAMF32 : PTX_PARAM_LD_ST<".f32", RegF32>; -defm PARAMF64 : PTX_PARAM_LD_ST<".f64", RegF64>; - -// Local loads/stores -defm LOCALPRED : PTX_LOCAL_LD_ST<".pred", RegPred>; -defm LOCALU16 : PTX_LOCAL_LD_ST<".u16", RegI16>; -defm LOCALU32 : PTX_LOCAL_LD_ST<".u32", RegI32>; -defm LOCALU64 : PTX_LOCAL_LD_ST<".u64", RegI64>; -defm LOCALF32 : PTX_LOCAL_LD_ST<".f32", RegF32>; -defm LOCALF64 : PTX_LOCAL_LD_ST<".f64", RegF64>; - diff --git a/lib/Target/PTX/PTXIntrinsicInstrInfo.td b/lib/Target/PTX/PTXIntrinsicInstrInfo.td deleted file mode 100644 index 3416f1c..0000000 --- a/lib/Target/PTX/PTXIntrinsicInstrInfo.td +++ /dev/null @@ -1,110 +0,0 @@ -//===-- PTXIntrinsicInstrInfo.td - Defines PTX intrinsics --*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines all of the PTX-specific intrinsic instructions. -// -//===----------------------------------------------------------------------===// - -// PTX Special Purpose Register Accessor Intrinsics - -class PTX_READ_SPECIAL_REGISTER_R64<string regname, Intrinsic intop> - : InstPTX<(outs RegI64:$d), (ins), - !strconcat("mov.u64\t$d, %", regname), - [(set RegI64:$d, (intop))]>; - -class PTX_READ_SPECIAL_REGISTER_R32<string regname, Intrinsic intop> - : InstPTX<(outs RegI32:$d), (ins), - !strconcat("mov.u32\t$d, %", regname), - [(set RegI32:$d, (intop))]>; - -// TODO Add read vector-version of special registers - -//def PTX_READ_TID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"tid", -// int_ptx_read_tid_r64>; -def PTX_READ_TID_X : PTX_READ_SPECIAL_REGISTER_R32<"tid.x", - int_ptx_read_tid_x>; -def PTX_READ_TID_Y : PTX_READ_SPECIAL_REGISTER_R32<"tid.y", - int_ptx_read_tid_y>; -def PTX_READ_TID_Z : PTX_READ_SPECIAL_REGISTER_R32<"tid.z", - int_ptx_read_tid_z>; -def PTX_READ_TID_W : PTX_READ_SPECIAL_REGISTER_R32<"tid.w", - int_ptx_read_tid_w>; - -//def PTX_READ_NTID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"ntid", -// int_ptx_read_ntid_r64>; -def PTX_READ_NTID_X : PTX_READ_SPECIAL_REGISTER_R32<"ntid.x", - int_ptx_read_ntid_x>; -def PTX_READ_NTID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ntid.y", - int_ptx_read_ntid_y>; -def PTX_READ_NTID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ntid.z", - int_ptx_read_ntid_z>; -def PTX_READ_NTID_W : PTX_READ_SPECIAL_REGISTER_R32<"ntid.w", - int_ptx_read_ntid_w>; - -def PTX_READ_LANEID : PTX_READ_SPECIAL_REGISTER_R32<"laneid", - int_ptx_read_laneid>; -def PTX_READ_WARPID : PTX_READ_SPECIAL_REGISTER_R32<"warpid", - int_ptx_read_warpid>; -def PTX_READ_NWARPID : PTX_READ_SPECIAL_REGISTER_R32<"nwarpid", - int_ptx_read_nwarpid>; - -//def PTX_READ_CTAID_R64 : -//PTX_READ_SPECIAL_REGISTER_R64<"ctaid", int_ptx_read_ctaid_r64>; -def PTX_READ_CTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.x", - int_ptx_read_ctaid_x>; -def PTX_READ_CTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.y", - int_ptx_read_ctaid_y>; -def PTX_READ_CTAID_Z : 
PTX_READ_SPECIAL_REGISTER_R32<"ctaid.z", - int_ptx_read_ctaid_z>; -def PTX_READ_CTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.w", - int_ptx_read_ctaid_w>; - -//def PTX_READ_NCTAID_R64 : -//PTX_READ_SPECIAL_REGISTER_R64<"nctaid", int_ptx_read_nctaid_r64>; -def PTX_READ_NCTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.x", - int_ptx_read_nctaid_x>; -def PTX_READ_NCTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.y", - int_ptx_read_nctaid_y>; -def PTX_READ_NCTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.z", - int_ptx_read_nctaid_z>; -def PTX_READ_NCTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.w", - int_ptx_read_nctaid_w>; - -def PTX_READ_SMID : PTX_READ_SPECIAL_REGISTER_R32<"smid", - int_ptx_read_smid>; -def PTX_READ_NSMID : PTX_READ_SPECIAL_REGISTER_R32<"nsmid", - int_ptx_read_nsmid>; -def PTX_READ_GRIDID : PTX_READ_SPECIAL_REGISTER_R32<"gridid", - int_ptx_read_gridid>; - -def PTX_READ_LANEMASK_EQ - : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_eq", int_ptx_read_lanemask_eq>; -def PTX_READ_LANEMASK_LE - : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_le", int_ptx_read_lanemask_le>; -def PTX_READ_LANEMASK_LT - : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_lt", int_ptx_read_lanemask_lt>; -def PTX_READ_LANEMASK_GE - : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_ge", int_ptx_read_lanemask_ge>; -def PTX_READ_LANEMASK_GT - : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_gt", int_ptx_read_lanemask_gt>; - -def PTX_READ_CLOCK - : PTX_READ_SPECIAL_REGISTER_R32<"clock", int_ptx_read_clock>; -def PTX_READ_CLOCK64 - : PTX_READ_SPECIAL_REGISTER_R64<"clock64", int_ptx_read_clock64>; - -def PTX_READ_PM0 : PTX_READ_SPECIAL_REGISTER_R32<"pm0", int_ptx_read_pm0>; -def PTX_READ_PM1 : PTX_READ_SPECIAL_REGISTER_R32<"pm1", int_ptx_read_pm1>; -def PTX_READ_PM2 : PTX_READ_SPECIAL_REGISTER_R32<"pm2", int_ptx_read_pm2>; -def PTX_READ_PM3 : PTX_READ_SPECIAL_REGISTER_R32<"pm3", int_ptx_read_pm3>; - -// PTX Parallel Synchronization and Communication Intrinsics - -def PTX_BAR_SYNC : InstPTX<(outs), (ins i32imm:$i), "bar.sync\t$i", - [(int_ptx_bar_sync imm:$i)]>; diff --git a/lib/Target/PTX/PTXMCAsmStreamer.cpp b/lib/Target/PTX/PTXMCAsmStreamer.cpp deleted file mode 100644 index 3ed67a6..0000000 --- a/lib/Target/PTX/PTXMCAsmStreamer.cpp +++ /dev/null @@ -1,556 +0,0 @@ -//===-- PTXMCAsmStreamer.cpp - PTX Text Assembly Output -------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/OwningPtr.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/Twine.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/FormattedStream.h" -#include "llvm/Support/PathV2.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -namespace { -class PTXMCAsmStreamer : public MCStreamer { - formatted_raw_ostream &OS; - const MCAsmInfo &MAI; - OwningPtr<MCInstPrinter> InstPrinter; - OwningPtr<MCCodeEmitter> Emitter; - - SmallString<128> CommentToEmit; - raw_svector_ostream CommentStream; - - unsigned IsVerboseAsm : 1; - unsigned ShowInst : 1; - -public: - PTXMCAsmStreamer(MCContext &Context, - formatted_raw_ostream &os, - bool isVerboseAsm, bool useLoc, - MCInstPrinter *printer, - MCCodeEmitter *emitter, - bool showInst) - : MCStreamer(Context), OS(os), MAI(Context.getAsmInfo()), - InstPrinter(printer), Emitter(emitter), CommentStream(CommentToEmit), - IsVerboseAsm(isVerboseAsm), - ShowInst(showInst) { - if (InstPrinter && IsVerboseAsm) - InstPrinter->setCommentStream(CommentStream); - } - - ~PTXMCAsmStreamer() {} - - inline void EmitEOL() { - // If we don't have any comments, just emit a \n. - if (!IsVerboseAsm) { - OS << '\n'; - return; - } - EmitCommentsAndEOL(); - } - void EmitCommentsAndEOL(); - - /// isVerboseAsm - Return true if this streamer supports verbose assembly at - /// all. - virtual bool isVerboseAsm() const { return IsVerboseAsm; } - - /// hasRawTextSupport - We support EmitRawText. - virtual bool hasRawTextSupport() const { return true; } - - /// AddComment - Add a comment that can be emitted to the generated .s - /// file if applicable as a QoI issue to make the output of the compiler - /// more readable. This only affects the MCAsmStreamer, and only when - /// verbose assembly output is enabled. - virtual void AddComment(const Twine &T); - - /// AddEncodingComment - Add a comment showing the encoding of an instruction. - virtual void AddEncodingComment(const MCInst &Inst); - - /// GetCommentOS - Return a raw_ostream that comments can be written to. - /// Unlike AddComment, you are required to terminate comments with \n if you - /// use this method. - virtual raw_ostream &GetCommentOS() { - if (!IsVerboseAsm) - return nulls(); // Discard comments unless in verbose asm mode. - return CommentStream; - } - - /// AddBlankLine - Emit a blank line to a .s file to pretty it up. 
- virtual void AddBlankLine() { - EmitEOL(); - } - - /// @name MCStreamer Interface - /// @{ - - virtual void ChangeSection(const MCSection *Section); - virtual void InitSections() { /* PTX does not use sections */ } - - virtual void EmitLabel(MCSymbol *Symbol); - - virtual void EmitAssemblerFlag(MCAssemblerFlag Flag); - - virtual void EmitThumbFunc(MCSymbol *Func); - - virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value); - - virtual void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol); - - virtual void EmitDwarfAdvanceLineAddr(int64_t LineDelta, - const MCSymbol *LastLabel, - const MCSymbol *Label, - unsigned PointerSize); - - virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute); - - virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue); - virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol); - virtual void EmitCOFFSymbolStorageClass(int StorageClass); - virtual void EmitCOFFSymbolType(int Type); - virtual void EndCOFFSymbolDef(); - virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value); - virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, - unsigned ByteAlignment); - - /// EmitLocalCommonSymbol - Emit a local common (.lcomm) symbol. - /// - /// @param Symbol - The common symbol to emit. - /// @param Size - The size of the common symbol. - /// @param ByteAlignment - The alignment of the common symbol in bytes. - virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, - unsigned ByteAlignment); - - virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0, - unsigned Size = 0, unsigned ByteAlignment = 0); - - virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, - uint64_t Size, unsigned ByteAlignment = 0); - - virtual void EmitBytes(StringRef Data, unsigned AddrSpace); - - virtual void EmitValueImpl(const MCExpr *Value, unsigned Size, - unsigned AddrSpace); - virtual void EmitULEB128Value(const MCExpr *Value); - virtual void EmitSLEB128Value(const MCExpr *Value); - virtual void EmitGPRel32Value(const MCExpr *Value); - - - virtual void EmitFill(uint64_t NumBytes, uint8_t FillValue, - unsigned AddrSpace); - - virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0, - unsigned ValueSize = 1, - unsigned MaxBytesToEmit = 0); - - virtual void EmitCodeAlignment(unsigned ByteAlignment, - unsigned MaxBytesToEmit = 0); - - virtual bool EmitValueToOffset(const MCExpr *Offset, - unsigned char Value = 0); - - virtual void EmitFileDirective(StringRef Filename); - virtual bool EmitDwarfFileDirective(unsigned FileNo, StringRef Directory, - StringRef Filename); - - virtual void EmitInstruction(const MCInst &Inst); - - /// EmitRawText - If this file is backed by an assembly streamer, this dumps - /// the specified string in the output .s file. This capability is - /// indicated by the hasRawTextSupport() predicate. - virtual void EmitRawText(StringRef String); - - virtual void FinishImpl(); - - /// @} - -}; // class PTXMCAsmStreamer - -} - -/// TODO: Add appropriate implementation of Emit*() methods when needed - -void PTXMCAsmStreamer::AddComment(const Twine &T) { - if (!IsVerboseAsm) return; - - // Make sure that CommentStream is flushed. - CommentStream.flush(); - - T.toVector(CommentToEmit); - // Each comment goes on its own line. - CommentToEmit.push_back('\n'); - - // Tell the comment stream that the vector changed underneath it. 
- CommentStream.resync(); -} - -void PTXMCAsmStreamer::EmitCommentsAndEOL() { - if (CommentToEmit.empty() && CommentStream.GetNumBytesInBuffer() == 0) { - OS << '\n'; - return; - } - - CommentStream.flush(); - StringRef Comments = CommentToEmit.str(); - - assert(Comments.back() == '\n' && - "Comment array not newline terminated"); - do { - // Emit a line of comments. - OS.PadToColumn(MAI.getCommentColumn()); - size_t Position = Comments.find('\n'); - OS << MAI.getCommentString() << ' ' << Comments.substr(0, Position) << '\n'; - - Comments = Comments.substr(Position+1); - } while (!Comments.empty()); - - CommentToEmit.clear(); - // Tell the comment stream that the vector changed underneath it. - CommentStream.resync(); -} - -static inline int64_t truncateToSize(int64_t Value, unsigned Bytes) { - assert(Bytes && "Invalid size!"); - return Value & ((uint64_t) (int64_t) -1 >> (64 - Bytes * 8)); -} - -void PTXMCAsmStreamer::ChangeSection(const MCSection *Section) { - assert(Section && "Cannot switch to a null section!"); -} - -void PTXMCAsmStreamer::EmitLabel(MCSymbol *Symbol) { - assert(Symbol->isUndefined() && "Cannot define a symbol twice!"); - assert(!Symbol->isVariable() && "Cannot emit a variable symbol!"); - assert(getCurrentSection() && "Cannot emit before setting section!"); - - OS << *Symbol << MAI.getLabelSuffix(); - EmitEOL(); - Symbol->setSection(*getCurrentSection()); -} - -void PTXMCAsmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {} - -void PTXMCAsmStreamer::EmitThumbFunc(MCSymbol *Func) {} - -void PTXMCAsmStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) { - OS << *Symbol << " = " << *Value; - EmitEOL(); - - // FIXME: Lift context changes into super class. - Symbol->setVariableValue(Value); -} - -void PTXMCAsmStreamer::EmitWeakReference(MCSymbol *Alias, - const MCSymbol *Symbol) { - OS << ".weakref " << *Alias << ", " << *Symbol; - EmitEOL(); -} - -void PTXMCAsmStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta, - const MCSymbol *LastLabel, - const MCSymbol *Label, - unsigned PointerSize) { - report_fatal_error("Unimplemented."); -} - -void PTXMCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol, - MCSymbolAttr Attribute) {} - -void PTXMCAsmStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {} - -void PTXMCAsmStreamer::BeginCOFFSymbolDef(const MCSymbol *Symbol) {} - -void PTXMCAsmStreamer::EmitCOFFSymbolStorageClass (int StorageClass) {} - -void PTXMCAsmStreamer::EmitCOFFSymbolType (int Type) {} - -void PTXMCAsmStreamer::EndCOFFSymbolDef() {} - -void PTXMCAsmStreamer::EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {} - -void PTXMCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, - unsigned ByteAlignment) {} - -void PTXMCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, - unsigned ByteAlignment) {} - -void PTXMCAsmStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol, - unsigned Size, unsigned ByteAlignment) {} - -void PTXMCAsmStreamer::EmitTBSSSymbol(const MCSection *Section, - MCSymbol *Symbol, - uint64_t Size, unsigned ByteAlignment) {} - -static inline char toOctal(int X) { return (X&7)+'0'; } - -static void PrintQuotedString(StringRef Data, raw_ostream &OS) { - OS << '"'; - - for (unsigned i = 0, e = Data.size(); i != e; ++i) { - unsigned char C = Data[i]; - if (C == '"' || C == '\\') { - OS << '\\' << (char)C; - continue; - } - - if (isprint((unsigned char)C)) { - OS << (char)C; - continue; - } - - switch (C) { - case '\b': OS << "\\b"; break; - case '\f': OS << "\\f"; break; - case '\n': OS << 
"\\n"; break; - case '\r': OS << "\\r"; break; - case '\t': OS << "\\t"; break; - default: - OS << '\\'; - OS << toOctal(C >> 6); - OS << toOctal(C >> 3); - OS << toOctal(C >> 0); - break; - } - } - - OS << '"'; -} - -void PTXMCAsmStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) { - assert(getCurrentSection() && "Cannot emit contents before setting section!"); - if (Data.empty()) return; - - if (Data.size() == 1) { - OS << MAI.getData8bitsDirective(AddrSpace); - OS << (unsigned)(unsigned char)Data[0]; - EmitEOL(); - return; - } - - // If the data ends with 0 and the target supports .asciz, use it, otherwise - // use .ascii - if (MAI.getAscizDirective() && Data.back() == 0) { - OS << MAI.getAscizDirective(); - Data = Data.substr(0, Data.size()-1); - } else { - OS << MAI.getAsciiDirective(); - } - - OS << ' '; - PrintQuotedString(Data, OS); - EmitEOL(); -} - -void PTXMCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, - unsigned AddrSpace) { - assert(getCurrentSection() && "Cannot emit contents before setting section!"); - const char *Directive = 0; - switch (Size) { - default: break; - case 1: Directive = MAI.getData8bitsDirective(AddrSpace); break; - case 2: Directive = MAI.getData16bitsDirective(AddrSpace); break; - case 4: Directive = MAI.getData32bitsDirective(AddrSpace); break; - case 8: - Directive = MAI.getData64bitsDirective(AddrSpace); - // If the target doesn't support 64-bit data, emit as two 32-bit halves. - if (Directive) break; - int64_t IntValue; - if (!Value->EvaluateAsAbsolute(IntValue)) - report_fatal_error("Don't know how to emit this value."); - if (getContext().getAsmInfo().isLittleEndian()) { - EmitIntValue((uint32_t)(IntValue >> 0 ), 4, AddrSpace); - EmitIntValue((uint32_t)(IntValue >> 32), 4, AddrSpace); - } else { - EmitIntValue((uint32_t)(IntValue >> 32), 4, AddrSpace); - EmitIntValue((uint32_t)(IntValue >> 0 ), 4, AddrSpace); - } - return; - } - - assert(Directive && "Invalid size for machine code value!"); - OS << Directive << *Value; - EmitEOL(); -} - -void PTXMCAsmStreamer::EmitULEB128Value(const MCExpr *Value) { - assert(MAI.hasLEB128() && "Cannot print a .uleb"); - OS << ".uleb128 " << *Value; - EmitEOL(); -} - -void PTXMCAsmStreamer::EmitSLEB128Value(const MCExpr *Value) { - assert(MAI.hasLEB128() && "Cannot print a .sleb"); - OS << ".sleb128 " << *Value; - EmitEOL(); -} - -void PTXMCAsmStreamer::EmitGPRel32Value(const MCExpr *Value) { - assert(MAI.getGPRel32Directive() != 0); - OS << MAI.getGPRel32Directive() << *Value; - EmitEOL(); -} - - -/// EmitFill - Emit NumBytes bytes worth of the value specified by -/// FillValue. This implements directives such as '.space'. -void PTXMCAsmStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue, - unsigned AddrSpace) { - if (NumBytes == 0) return; - - if (AddrSpace == 0) - if (const char *ZeroDirective = MAI.getZeroDirective()) { - OS << ZeroDirective << NumBytes; - if (FillValue != 0) - OS << ',' << (int)FillValue; - EmitEOL(); - return; - } - - // Emit a byte at a time. - MCStreamer::EmitFill(NumBytes, FillValue, AddrSpace); -} - -void PTXMCAsmStreamer::EmitValueToAlignment(unsigned ByteAlignment, - int64_t Value, - unsigned ValueSize, - unsigned MaxBytesToEmit) { - // Some assemblers don't support non-power of two alignments, so we always - // emit alignments as a power of two if possible. - if (isPowerOf2_32(ByteAlignment)) { - switch (ValueSize) { - default: llvm_unreachable("Invalid size for machine code value!"); - case 1: OS << MAI.getAlignDirective(); break; - // FIXME: use MAI for this! 
- case 2: OS << ".p2alignw "; break; - case 4: OS << ".p2alignl "; break; - case 8: llvm_unreachable("Unsupported alignment size!"); - } - - if (MAI.getAlignmentIsInBytes()) - OS << ByteAlignment; - else - OS << Log2_32(ByteAlignment); - - if (Value || MaxBytesToEmit) { - OS << ", 0x"; - OS.write_hex(truncateToSize(Value, ValueSize)); - - if (MaxBytesToEmit) - OS << ", " << MaxBytesToEmit; - } - EmitEOL(); - return; - } - - // Non-power of two alignment. This is not widely supported by assemblers. - // FIXME: Parameterize this based on MAI. - switch (ValueSize) { - default: llvm_unreachable("Invalid size for machine code value!"); - case 1: OS << ".balign"; break; - case 2: OS << ".balignw"; break; - case 4: OS << ".balignl"; break; - case 8: llvm_unreachable("Unsupported alignment size!"); - } - - OS << ' ' << ByteAlignment; - OS << ", " << truncateToSize(Value, ValueSize); - if (MaxBytesToEmit) - OS << ", " << MaxBytesToEmit; - EmitEOL(); -} - -void PTXMCAsmStreamer::EmitCodeAlignment(unsigned ByteAlignment, - unsigned MaxBytesToEmit) {} - -bool PTXMCAsmStreamer::EmitValueToOffset(const MCExpr *Offset, - unsigned char Value) {return false;} - - -void PTXMCAsmStreamer::EmitFileDirective(StringRef Filename) { - assert(MAI.hasSingleParameterDotFile()); - OS << "\t.file\t"; - PrintQuotedString(Filename, OS); - EmitEOL(); -} - -// FIXME: should we inherit from MCAsmStreamer? -bool PTXMCAsmStreamer::EmitDwarfFileDirective(unsigned FileNo, - StringRef Directory, - StringRef Filename) { - if (!Directory.empty()) { - if (sys::path::is_absolute(Filename)) - return EmitDwarfFileDirective(FileNo, "", Filename); - SmallString<128> FullPathName = Directory; - sys::path::append(FullPathName, Filename); - return EmitDwarfFileDirective(FileNo, "", FullPathName); - } - - OS << "\t.file\t" << FileNo << ' '; - PrintQuotedString(Filename, OS); - EmitEOL(); - return this->MCStreamer::EmitDwarfFileDirective(FileNo, Directory, Filename); -} - -void PTXMCAsmStreamer::AddEncodingComment(const MCInst &Inst) {} - -void PTXMCAsmStreamer::EmitInstruction(const MCInst &Inst) { - assert(getCurrentSection() && "Cannot emit contents before setting section!"); - - // Show the encoding in a comment if we have a code emitter. - if (Emitter) - AddEncodingComment(Inst); - - // Show the MCInst if enabled. - if (ShowInst) { - Inst.dump_pretty(GetCommentOS(), &MAI, InstPrinter.get(), "\n "); - GetCommentOS() << "\n"; - } - - // If we have an AsmPrinter, use that to print, otherwise print the MCInst. - if (InstPrinter) - InstPrinter->printInst(&Inst, OS, ""); - else - Inst.print(OS, &MAI); - EmitEOL(); -} - -/// EmitRawText - If this file is backed by an assembly streamer, this dumps -/// the specified string in the output .s file. This capability is -/// indicated by the hasRawTextSupport() predicate. 
-void PTXMCAsmStreamer::EmitRawText(StringRef String) {
-  if (!String.empty() && String.back() == '\n')
-    String = String.substr(0, String.size()-1);
-  OS << String;
-  EmitEOL();
-}
-
-void PTXMCAsmStreamer::FinishImpl() {}
-
-namespace llvm {
-  MCStreamer *createPTXAsmStreamer(MCContext &Context,
-                                   formatted_raw_ostream &OS,
-                                   bool isVerboseAsm, bool useLoc, bool useCFI,
-                                   bool useDwarfDirectory,
-                                   MCInstPrinter *IP,
-                                   MCCodeEmitter *CE, MCAsmBackend *MAB,
-                                   bool ShowInst) {
-    return new PTXMCAsmStreamer(Context, OS, isVerboseAsm, useLoc,
-                                IP, CE, ShowInst);
-  }
-}
diff --git a/lib/Target/PTX/PTXMCInstLower.cpp b/lib/Target/PTX/PTXMCInstLower.cpp
deleted file mode 100644
index 142e639..0000000
--- a/lib/Target/PTX/PTXMCInstLower.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-//===-- PTXMCInstLower.cpp - Convert PTX MachineInstr to an MCInst --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains code to lower PTX MachineInstrs to their corresponding
-// MCInst records.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PTX.h"
-#include "PTXAsmPrinter.h"
-#include "llvm/Constants.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/Target/Mangler.h"
-
-void llvm::LowerPTXMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
-                                        PTXAsmPrinter &AP) {
-  OutMI.setOpcode(MI->getOpcode());
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
-    MCOperand MCOp;
-    OutMI.addOperand(AP.lowerOperand(MO));
-  }
-}
-
diff --git a/lib/Target/PTX/PTXMFInfoExtract.cpp b/lib/Target/PTX/PTXMFInfoExtract.cpp
deleted file mode 100644
index 172a0e0..0000000
--- a/lib/Target/PTX/PTXMFInfoExtract.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-//===-- PTXMFInfoExtract.cpp - Extract PTX machine function info ----------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines an information extractor for PTX machine functions.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "ptx-mf-info-extract"
-
-#include "PTX.h"
-#include "PTXTargetMachine.h"
-#include "PTXMachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-// NOTE: PTXMFInfoExtract must be run after register allocation!
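As context for the pass below: the register information it collects is later consumed by the PTX assembly printer when declaring registers. The following C++ sketch shows that consumption using the PTXMachineFunctionInfo interface reproduced further down; the helper function and the exact output format are assumptions for illustration, not the backend's actual emission code:

    #include "PTXMachineFunctionInfo.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    // Hypothetical consumer: declare the 32-bit integer virtual registers
    // that PTXMFInfoExtract recorded, e.g. ".reg .b32 %r<4>;".
    // countRegisters() and the type/space enums come from the real
    // PTXMachineFunctionInfo below; the loop and formatting are illustrative.
    static void emitRegDecls(const PTXMachineFunctionInfo &MFI,
                             raw_ostream &OS) {
      unsigned N = MFI.countRegisters(PTXRegisterType::B32,
                                      PTXRegisterSpace::Reg);
      if (N != 0)
        OS << "\t.reg .b32 %r<" << N << ">;\n";
    }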
-
-namespace {
-  /// PTXMFInfoExtract - PTX-specific code to extract PTX machine
-  /// function information for PTXAsmPrinter
-  ///
-  class PTXMFInfoExtract : public MachineFunctionPass {
-    private:
-      static char ID;
-
-    public:
-      PTXMFInfoExtract(PTXTargetMachine &TM, CodeGenOpt::Level OptLevel)
-        : MachineFunctionPass(ID) {}
-
-      virtual bool runOnMachineFunction(MachineFunction &MF);
-
-      virtual const char *getPassName() const {
-        return "PTX Machine Function Info Extractor";
-      }
-  }; // class PTXMFInfoExtract
-} // end anonymous namespace
-
-using namespace llvm;
-
-char PTXMFInfoExtract::ID = 0;
-
-bool PTXMFInfoExtract::runOnMachineFunction(MachineFunction &MF) {
-  PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>();
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-
-  // Generate list of all virtual registers used in this function
-  for (unsigned i = 0; i < MRI.getNumVirtRegs(); ++i) {
-    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
-    const TargetRegisterClass *TRC = MRI.getRegClass(Reg);
-    unsigned RegType;
-    if (TRC == PTX::RegPredRegisterClass)
-      RegType = PTXRegisterType::Pred;
-    else if (TRC == PTX::RegI16RegisterClass)
-      RegType = PTXRegisterType::B16;
-    else if (TRC == PTX::RegI32RegisterClass)
-      RegType = PTXRegisterType::B32;
-    else if (TRC == PTX::RegI64RegisterClass)
-      RegType = PTXRegisterType::B64;
-    else if (TRC == PTX::RegF32RegisterClass)
-      RegType = PTXRegisterType::F32;
-    else if (TRC == PTX::RegF64RegisterClass)
-      RegType = PTXRegisterType::F64;
-    else
-      llvm_unreachable("Unknown register class.");
-    MFI->addRegister(Reg, RegType, PTXRegisterSpace::Reg);
-  }
-
-  return false;
-}
-
-FunctionPass *llvm::createPTXMFInfoExtract(PTXTargetMachine &TM,
-                                           CodeGenOpt::Level OptLevel) {
-  return new PTXMFInfoExtract(TM, OptLevel);
-}
diff --git a/lib/Target/PTX/PTXMachineFunctionInfo.h b/lib/Target/PTX/PTXMachineFunctionInfo.h
deleted file mode 100644
index bb7574c..0000000
--- a/lib/Target/PTX/PTXMachineFunctionInfo.h
+++ /dev/null
@@ -1,202 +0,0 @@
-//===-- PTXMachineFunctionInfo.h - PTX machine function info -----*- C++ -*-==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares PTX-specific per-machine-function information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef PTX_MACHINE_FUNCTION_INFO_H
-#define PTX_MACHINE_FUNCTION_INFO_H
-
-#include "PTX.h"
-#include "PTXParamManager.h"
-#include "PTXRegisterInfo.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-namespace llvm {
-
-/// PTXMachineFunctionInfo - This class is derived from MachineFunctionInfo and
-/// contains private PTX target-specific information for each MachineFunction.
-/// -class PTXMachineFunctionInfo : public MachineFunctionInfo { - virtual void anchor(); - bool IsKernel; - DenseSet<unsigned> RegArgs; - DenseSet<unsigned> RegRets; - - typedef DenseMap<int, std::string> FrameMap; - - FrameMap FrameSymbols; - - struct RegisterInfo { - unsigned Reg; - unsigned Type; - unsigned Space; - unsigned Offset; - unsigned Encoded; - }; - - typedef DenseMap<unsigned, RegisterInfo> RegisterInfoMap; - - RegisterInfoMap RegInfo; - - PTXParamManager ParamManager; - -public: - typedef DenseSet<unsigned>::const_iterator reg_iterator; - - PTXMachineFunctionInfo(MachineFunction &MF) - : IsKernel(false) { - } - - /// getParamManager - Returns the PTXParamManager instance for this function. - PTXParamManager& getParamManager() { return ParamManager; } - const PTXParamManager& getParamManager() const { return ParamManager; } - - /// setKernel/isKernel - Gets/sets a flag that indicates if this function is - /// a PTX kernel function. - void setKernel(bool _IsKernel=true) { IsKernel = _IsKernel; } - bool isKernel() const { return IsKernel; } - - /// argreg_begin/argreg_end - Returns iterators to the set of registers - /// containing function arguments. - reg_iterator argreg_begin() const { return RegArgs.begin(); } - reg_iterator argreg_end() const { return RegArgs.end(); } - - /// retreg_begin/retreg_end - Returns iterators to the set of registers - /// containing the function return values. - reg_iterator retreg_begin() const { return RegRets.begin(); } - reg_iterator retreg_end() const { return RegRets.end(); } - - /// addRegister - Adds a virtual register to the set of all used registers - void addRegister(unsigned Reg, unsigned RegType, unsigned RegSpace) { - if (!RegInfo.count(Reg)) { - RegisterInfo Info; - Info.Reg = Reg; - Info.Type = RegType; - Info.Space = RegSpace; - - // Determine register offset - Info.Offset = 0; - for(RegisterInfoMap::const_iterator i = RegInfo.begin(), - e = RegInfo.end(); i != e; ++i) { - const RegisterInfo& RI = i->second; - if (RI.Space == RegSpace) - if (RI.Space != PTXRegisterSpace::Reg || RI.Type == Info.Type) - Info.Offset++; - } - - // Encode the register data into a single register number - Info.Encoded = (Info.Offset << 6) | (Info.Type << 3) | Info.Space; - - RegInfo[Reg] = Info; - - if (RegSpace == PTXRegisterSpace::Argument) - RegArgs.insert(Reg); - else if (RegSpace == PTXRegisterSpace::Return) - RegRets.insert(Reg); - } - } - - /// countRegisters - Returns the number of registers of the given type and - /// space. - unsigned countRegisters(unsigned RegType, unsigned RegSpace) const { - unsigned Count = 0; - for(RegisterInfoMap::const_iterator i = RegInfo.begin(), e = RegInfo.end(); - i != e; ++i) { - const RegisterInfo& RI = i->second; - if (RI.Type == RegType && RI.Space == RegSpace) - Count++; - } - return Count; - } - - /// getEncodedRegister - Returns the encoded value of the register. - unsigned getEncodedRegister(unsigned Reg) const { - return RegInfo.lookup(Reg).Encoded; - } - - /// addRetReg - Adds a register to the set of return-value registers. - void addRetReg(unsigned Reg) { - if (!RegRets.count(Reg)) { - RegRets.insert(Reg); - } - } - - /// addArgReg - Adds a register to the set of function argument registers. - void addArgReg(unsigned Reg) { - RegArgs.insert(Reg); - } - - /// getRegisterName - Returns the name of the specified virtual register. This - /// name is used during PTX emission. 
- std::string getRegisterName(unsigned Reg) const { - if (RegInfo.count(Reg)) { - const RegisterInfo& RI = RegInfo.lookup(Reg); - std::string Name; - raw_string_ostream NameStr(Name); - decodeRegisterName(NameStr, RI.Encoded); - NameStr.flush(); - return Name; - } - else if (Reg == PTX::NoRegister) - return "%noreg"; - else - llvm_unreachable("Register not in register name map"); - } - - /// getEncodedRegisterName - Returns the name of the encoded register. - std::string getEncodedRegisterName(unsigned EncodedReg) const { - std::string Name; - raw_string_ostream NameStr(Name); - decodeRegisterName(NameStr, EncodedReg); - NameStr.flush(); - return Name; - } - - /// getRegisterType - Returns the type of the specified virtual register. - unsigned getRegisterType(unsigned Reg) const { - if (RegInfo.count(Reg)) - return RegInfo.lookup(Reg).Type; - else - llvm_unreachable("Unknown register"); - } - - /// getOffsetForRegister - Returns the offset of the virtual register - unsigned getOffsetForRegister(unsigned Reg) const { - if (RegInfo.count(Reg)) - return RegInfo.lookup(Reg).Offset; - else - return 0; - } - - /// getFrameSymbol - Returns the symbol name for the given FrameIndex. - const char* getFrameSymbol(int FrameIndex) { - if (FrameSymbols.count(FrameIndex)) { - return FrameSymbols.lookup(FrameIndex).c_str(); - } else { - std::string Name = "__local"; - Name += utostr(FrameIndex); - // The whole point of caching this name is to ensure the pointer we pass - // to any getExternalSymbol() calls will remain valid for the lifetime of - // the back-end instance. This is to work around an issue in SelectionDAG - // where symbol names are expected to be life-long strings. - FrameSymbols[FrameIndex] = Name; - return FrameSymbols[FrameIndex].c_str(); - } - } -}; // class PTXMachineFunctionInfo -} // namespace llvm - -#endif // PTX_MACHINE_FUNCTION_INFO_H diff --git a/lib/Target/PTX/PTXParamManager.cpp b/lib/Target/PTX/PTXParamManager.cpp deleted file mode 100644 index cc1cc71..0000000 --- a/lib/Target/PTX/PTXParamManager.cpp +++ /dev/null @@ -1,73 +0,0 @@ -//===-- PTXParamManager.cpp - Manager for .param variables ----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the PTXParamManager class. 
-// -//===----------------------------------------------------------------------===// - -#include "PTXParamManager.h" -#include "PTX.h" -#include "llvm/ADT/StringExtras.h" - -using namespace llvm; - -PTXParamManager::PTXParamManager() { -} - -unsigned PTXParamManager::addArgumentParam(unsigned Size) { - PTXParam Param; - Param.Type = PTX_PARAM_TYPE_ARGUMENT; - Param.Size = Size; - - std::string Name; - Name = "__param_"; - Name += utostr(ArgumentParams.size()+1); - Param.Name = Name; - - unsigned Index = AllParams.size(); - AllParams[Index] = Param; - ArgumentParams.push_back(Index); - - return Index; -} - -unsigned PTXParamManager::addReturnParam(unsigned Size) { - PTXParam Param; - Param.Type = PTX_PARAM_TYPE_RETURN; - Param.Size = Size; - - std::string Name; - Name = "__ret_"; - Name += utostr(ReturnParams.size()+1); - Param.Name = Name; - - unsigned Index = AllParams.size(); - AllParams[Index] = Param; - ReturnParams.push_back(Index); - - return Index; -} - -unsigned PTXParamManager::addLocalParam(unsigned Size) { - PTXParam Param; - Param.Type = PTX_PARAM_TYPE_LOCAL; - Param.Size = Size; - - std::string Name; - Name = "__localparam_"; - Name += utostr(LocalParams.size()+1); - Param.Name = Name; - - unsigned Index = AllParams.size(); - AllParams[Index] = Param; - LocalParams.push_back(Index); - - return Index; -} - diff --git a/lib/Target/PTX/PTXParamManager.h b/lib/Target/PTX/PTXParamManager.h deleted file mode 100644 index 92e7728..0000000 --- a/lib/Target/PTX/PTXParamManager.h +++ /dev/null @@ -1,87 +0,0 @@ -//===-- PTXParamManager.h - Manager for .param variables --------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the PTXParamManager class, which manages all defined .param -// variables for a particular function. -// -//===----------------------------------------------------------------------===// - -#ifndef PTX_PARAM_MANAGER_H -#define PTX_PARAM_MANAGER_H - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallVector.h" -#include <string> - -namespace llvm { - -/// PTXParamManager - This class manages all .param variables defined for a -/// particular function. -class PTXParamManager { -private: - - /// PTXParamType - Type of a .param variable - enum PTXParamType { - PTX_PARAM_TYPE_ARGUMENT, - PTX_PARAM_TYPE_RETURN, - PTX_PARAM_TYPE_LOCAL - }; - - /// PTXParam - Definition of a PTX .param variable - struct PTXParam { - PTXParamType Type; - unsigned Size; - std::string Name; - }; - - DenseMap<unsigned, PTXParam> AllParams; - SmallVector<unsigned, 4> ArgumentParams; - SmallVector<unsigned, 4> ReturnParams; - SmallVector<unsigned, 4> LocalParams; - -public: - - typedef SmallVector<unsigned, 4>::const_iterator param_iterator; - - PTXParamManager(); - - param_iterator arg_begin() const { return ArgumentParams.begin(); } - param_iterator arg_end() const { return ArgumentParams.end(); } - param_iterator ret_begin() const { return ReturnParams.begin(); } - param_iterator ret_end() const { return ReturnParams.end(); } - param_iterator local_begin() const { return LocalParams.begin(); } - param_iterator local_end() const { return LocalParams.end(); } - - /// addArgumentParam - Returns a new .param used as an argument. - unsigned addArgumentParam(unsigned Size); - - /// addReturnParam - Returns a new .param used as a return argument. 
- unsigned addReturnParam(unsigned Size); - - /// addLocalParam - Returns a new .param used as a local .param variable. - unsigned addLocalParam(unsigned Size); - - /// getParamName - Returns the name of the parameter as a string. - const std::string &getParamName(unsigned Param) const { - assert(AllParams.count(Param) == 1 && "Param has not been defined!"); - return AllParams.find(Param)->second.Name; - } - - /// getParamSize - Returns the size of the parameter in bits. - unsigned getParamSize(unsigned Param) const { - assert(AllParams.count(Param) == 1 && "Param has not been defined!"); - return AllParams.find(Param)->second.Size; - } - -}; - -} - -#endif - diff --git a/lib/Target/PTX/PTXRegAlloc.cpp b/lib/Target/PTX/PTXRegAlloc.cpp deleted file mode 100644 index 7fd5375..0000000 --- a/lib/Target/PTX/PTXRegAlloc.cpp +++ /dev/null @@ -1,53 +0,0 @@ -//===-- PTXRegAlloc.cpp - PTX Register Allocator --------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a register allocator for PTX code. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "ptx-reg-alloc" - -#include "PTX.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/RegAllocRegistry.h" - -using namespace llvm; - -namespace { - // Special register allocator for PTX. - class PTXRegAlloc : public MachineFunctionPass { - public: - static char ID; - PTXRegAlloc() : MachineFunctionPass(ID) {} - - virtual const char* getPassName() const { - return "PTX Register Allocator"; - } - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - virtual bool runOnMachineFunction(MachineFunction &MF) { - // We do not actually do anything (at least not yet). - return false; - } - }; - - char PTXRegAlloc::ID = 0; - - static RegisterRegAlloc - ptxRegAlloc("ptx", "PTX register allocator", createPTXRegisterAllocator); -} - -FunctionPass *llvm::createPTXRegisterAllocator() { - return new PTXRegAlloc(); -} - diff --git a/lib/Target/PTX/PTXRegisterInfo.cpp b/lib/Target/PTX/PTXRegisterInfo.cpp deleted file mode 100644 index b6ffd38..0000000 --- a/lib/Target/PTX/PTXRegisterInfo.cpp +++ /dev/null @@ -1,38 +0,0 @@ -//===-- PTXRegisterInfo.cpp - PTX Register Information --------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the PTX implementation of the TargetRegisterInfo class. -// -//===----------------------------------------------------------------------===// - -#include "PTXRegisterInfo.h" -#include "PTX.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -#define GET_REGINFO_TARGET_DESC -#include "PTXGenRegisterInfo.inc" - -using namespace llvm; - -PTXRegisterInfo::PTXRegisterInfo(PTXTargetMachine &TM, - const TargetInstrInfo &tii) - // PTX does not have a return address register. 
- : PTXGenRegisterInfo(0), TII(tii) { -} - -void PTXRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator /*II*/, - int /*SPAdj*/, - RegScavenger * /*RS*/) const { - llvm_unreachable("FrameIndex should have been previously eliminated!"); -} diff --git a/lib/Target/PTX/PTXRegisterInfo.h b/lib/Target/PTX/PTXRegisterInfo.h deleted file mode 100644 index 5614ce7..0000000 --- a/lib/Target/PTX/PTXRegisterInfo.h +++ /dev/null @@ -1,56 +0,0 @@ -//===-- PTXRegisterInfo.h - PTX Register Information Impl -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the PTX implementation of the MRegisterInfo class. -// -//===----------------------------------------------------------------------===// - -#ifndef PTX_REGISTER_INFO_H -#define PTX_REGISTER_INFO_H - -#include "llvm/Support/ErrorHandling.h" -#include "llvm/ADT/BitVector.h" - -#define GET_REGINFO_HEADER -#include "PTXGenRegisterInfo.inc" - -namespace llvm { -class PTXTargetMachine; -class MachineFunction; - -struct PTXRegisterInfo : public PTXGenRegisterInfo { -private: - const TargetInstrInfo &TII; - -public: - PTXRegisterInfo(PTXTargetMachine &TM, - const TargetInstrInfo &tii); - - virtual const uint16_t - *getCalleeSavedRegs(const MachineFunction *MF = 0) const { - static const uint16_t CalleeSavedRegs[] = { 0 }; - return CalleeSavedRegs; // save nothing - } - - virtual BitVector getReservedRegs(const MachineFunction &MF) const { - BitVector Reserved(getNumRegs()); - return Reserved; // reserve no regs - } - - virtual void eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, - RegScavenger *RS = NULL) const; - - virtual unsigned getFrameRegister(const MachineFunction &MF) const { - llvm_unreachable("PTX does not have a frame register"); - } -}; // struct PTXRegisterInfo -} // namespace llvm - -#endif // PTX_REGISTER_INFO_H diff --git a/lib/Target/PTX/PTXRegisterInfo.td b/lib/Target/PTX/PTXRegisterInfo.td deleted file mode 100644 index e8b262e..0000000 --- a/lib/Target/PTX/PTXRegisterInfo.td +++ /dev/null @@ -1,36 +0,0 @@ -//===-- PTXRegisterInfo.td - PTX Register defs -------------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Declarations that describe the PTX register file -//===----------------------------------------------------------------------===// - -class PTXReg<string n> : Register<n> { - let Namespace = "PTX"; -} - -//===----------------------------------------------------------------------===// -// Registers -//===----------------------------------------------------------------------===// - -// The generated register info code throws warnings for empty register classes -// (e.g. zero-length arrays), so we use a dummy register here just to prevent -// these warnings. 
-def DUMMY_REG : PTXReg<"R0">; - -//===----------------------------------------------------------------------===// -// Register classes -//===----------------------------------------------------------------------===// -def RegPred : RegisterClass<"PTX", [i1], 8, (add DUMMY_REG)>; -def RegI16 : RegisterClass<"PTX", [i16], 16, (add DUMMY_REG)>; -def RegI32 : RegisterClass<"PTX", [i32], 32, (add DUMMY_REG)>; -def RegI64 : RegisterClass<"PTX", [i64], 64, (add DUMMY_REG)>; -def RegF32 : RegisterClass<"PTX", [f32], 32, (add DUMMY_REG)>; -def RegF64 : RegisterClass<"PTX", [f64], 64, (add DUMMY_REG)>; - diff --git a/lib/Target/PTX/PTXSelectionDAGInfo.cpp b/lib/Target/PTX/PTXSelectionDAGInfo.cpp deleted file mode 100644 index a116fab..0000000 --- a/lib/Target/PTX/PTXSelectionDAGInfo.cpp +++ /dev/null @@ -1,150 +0,0 @@ -//===-- PTXSelectionDAGInfo.cpp - PTX SelectionDAG Info -------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the PTXSelectionDAGInfo class. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "ptx-selectiondag-info" -#include "PTXTargetMachine.h" -#include "llvm/DerivedTypes.h" -#include "llvm/CodeGen/SelectionDAG.h" -using namespace llvm; - -PTXSelectionDAGInfo::PTXSelectionDAGInfo(const TargetMachine &TM) - : TargetSelectionDAGInfo(TM), - Subtarget(&TM.getSubtarget<PTXSubtarget>()) { -} - -PTXSelectionDAGInfo::~PTXSelectionDAGInfo() { -} - -SDValue -PTXSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, - SDValue Chain, - SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, - bool isVolatile, bool AlwaysInline, - MachinePointerInfo DstPtrInfo, - MachinePointerInfo SrcPtrInfo) const { - // Do repeated 4-byte loads and stores. To be improved. - // This requires 4-byte alignment. - if ((Align & 3) != 0) - return SDValue(); - // This requires the copy size to be a constant, preferably - // within a subtarget-specific limit. - ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); - if (!ConstantSize) - return SDValue(); - uint64_t SizeVal = ConstantSize->getZExtValue(); - // Always inline memcpys. In PTX, we do not have a C library that provides - // a memcpy function. - //if (!AlwaysInline) - // return SDValue(); - - unsigned BytesLeft = SizeVal & 3; - unsigned NumMemOps = SizeVal >> 2; - unsigned EmittedNumMemOps = 0; - EVT VT = MVT::i32; - unsigned VTSize = 4; - unsigned i = 0; - const unsigned MAX_LOADS_IN_LDM = 6; - SDValue TFOps[MAX_LOADS_IN_LDM]; - SDValue Loads[MAX_LOADS_IN_LDM]; - uint64_t SrcOff = 0, DstOff = 0; - EVT PointerType = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; - - // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the - // same number of stores. The loads and stores will get combined into - // ldm/stm later on. 
- while (EmittedNumMemOps < NumMemOps) { - for (i = 0; - i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { - Loads[i] = DAG.getLoad(VT, dl, Chain, - DAG.getNode(ISD::ADD, dl, PointerType, Src, - DAG.getConstant(SrcOff, PointerType)), - SrcPtrInfo.getWithOffset(SrcOff), isVolatile, - false, false, 0); - TFOps[i] = Loads[i].getValue(1); - SrcOff += VTSize; - } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); - - for (i = 0; - i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { - TFOps[i] = DAG.getStore(Chain, dl, Loads[i], - DAG.getNode(ISD::ADD, dl, PointerType, Dst, - DAG.getConstant(DstOff, PointerType)), - DstPtrInfo.getWithOffset(DstOff), - isVolatile, false, 0); - DstOff += VTSize; - } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); - - EmittedNumMemOps += i; - } - - if (BytesLeft == 0) - return Chain; - - // Issue loads / stores for the trailing (1 - 3) bytes. - unsigned BytesLeftSave = BytesLeft; - i = 0; - while (BytesLeft) { - if (BytesLeft >= 2) { - VT = MVT::i16; - VTSize = 2; - } else { - VT = MVT::i8; - VTSize = 1; - } - - Loads[i] = DAG.getLoad(VT, dl, Chain, - DAG.getNode(ISD::ADD, dl, PointerType, Src, - DAG.getConstant(SrcOff, PointerType)), - SrcPtrInfo.getWithOffset(SrcOff), false, false, - false, 0); - TFOps[i] = Loads[i].getValue(1); - ++i; - SrcOff += VTSize; - BytesLeft -= VTSize; - } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); - - i = 0; - BytesLeft = BytesLeftSave; - while (BytesLeft) { - if (BytesLeft >= 2) { - VT = MVT::i16; - VTSize = 2; - } else { - VT = MVT::i8; - VTSize = 1; - } - - TFOps[i] = DAG.getStore(Chain, dl, Loads[i], - DAG.getNode(ISD::ADD, dl, PointerType, Dst, - DAG.getConstant(DstOff, PointerType)), - DstPtrInfo.getWithOffset(DstOff), false, false, 0); - ++i; - DstOff += VTSize; - BytesLeft -= VTSize; - } - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); -} - -SDValue PTXSelectionDAGInfo:: -EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, - SDValue Chain, SDValue Dst, - SDValue Src, SDValue Size, - unsigned Align, bool isVolatile, - MachinePointerInfo DstPtrInfo) const { - llvm_unreachable("memset lowering not implemented for PTX yet"); -} - diff --git a/lib/Target/PTX/PTXSelectionDAGInfo.h b/lib/Target/PTX/PTXSelectionDAGInfo.h deleted file mode 100644 index e0c7167..0000000 --- a/lib/Target/PTX/PTXSelectionDAGInfo.h +++ /dev/null @@ -1,53 +0,0 @@ -//===-- PTXSelectionDAGInfo.h - PTX SelectionDAG Info -----------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the PTX subclass for TargetSelectionDAGInfo. -// -//===----------------------------------------------------------------------===// - -#ifndef PTXSELECTIONDAGINFO_H -#define PTXSELECTIONDAGINFO_H - -#include "llvm/Target/TargetSelectionDAGInfo.h" - -namespace llvm { - -/// PTXSelectionDAGInfo - TargetSelectionDAGInfo sub-class for the PTX target. -/// At the moment, this is mostly just a copy of ARMSelectionDAGInfo. -class PTXSelectionDAGInfo : public TargetSelectionDAGInfo { - /// Subtarget - Keep a pointer to the PTXSubtarget around so that we can - /// make the right decision when generating code for different targets. 
- const PTXSubtarget *Subtarget; - -public: - explicit PTXSelectionDAGInfo(const TargetMachine &TM); - ~PTXSelectionDAGInfo(); - - virtual - SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, - SDValue Chain, - SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, - bool isVolatile, bool AlwaysInline, - MachinePointerInfo DstPtrInfo, - MachinePointerInfo SrcPtrInfo) const; - - virtual - SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, - SDValue Chain, - SDValue Op1, SDValue Op2, - SDValue Op3, unsigned Align, - bool isVolatile, - MachinePointerInfo DstPtrInfo) const; -}; - -} - -#endif - diff --git a/lib/Target/PTX/PTXSubtarget.cpp b/lib/Target/PTX/PTXSubtarget.cpp deleted file mode 100644 index 454f64e..0000000 --- a/lib/Target/PTX/PTXSubtarget.cpp +++ /dev/null @@ -1,68 +0,0 @@ -//===-- PTXSubtarget.cpp - PTX Subtarget Information ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the PTX specific subclass of TargetSubtargetInfo. -// -//===----------------------------------------------------------------------===// - -#include "PTXSubtarget.h" -#include "PTX.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/TargetRegistry.h" - -#define GET_SUBTARGETINFO_TARGET_DESC -#define GET_SUBTARGETINFO_CTOR -#include "PTXGenSubtargetInfo.inc" - -using namespace llvm; - -void PTXSubtarget::anchor() { } - -PTXSubtarget::PTXSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool is64Bit) - : PTXGenSubtargetInfo(TT, CPU, FS), - PTXTarget(PTX_COMPUTE_1_0), - PTXVersion(PTX_VERSION_2_0), - SupportsDouble(false), - SupportsFMA(true), - Is64Bit(is64Bit) { - std::string TARGET = CPU; - if (TARGET.empty()) - TARGET = "generic"; - ParseSubtargetFeatures(TARGET, FS); -} - -std::string PTXSubtarget::getTargetString() const { - switch(PTXTarget) { - default: llvm_unreachable("Unknown PTX target"); - case PTX_SM_1_0: return "sm_10"; - case PTX_SM_1_1: return "sm_11"; - case PTX_SM_1_2: return "sm_12"; - case PTX_SM_1_3: return "sm_13"; - case PTX_SM_2_0: return "sm_20"; - case PTX_SM_2_1: return "sm_21"; - case PTX_SM_2_2: return "sm_22"; - case PTX_SM_2_3: return "sm_23"; - case PTX_COMPUTE_1_0: return "compute_10"; - case PTX_COMPUTE_1_1: return "compute_11"; - case PTX_COMPUTE_1_2: return "compute_12"; - case PTX_COMPUTE_1_3: return "compute_13"; - case PTX_COMPUTE_2_0: return "compute_20"; - } -} - -std::string PTXSubtarget::getPTXVersionString() const { - switch(PTXVersion) { - case PTX_VERSION_2_0: return "2.0"; - case PTX_VERSION_2_1: return "2.1"; - case PTX_VERSION_2_2: return "2.2"; - case PTX_VERSION_2_3: return "2.3"; - } - llvm_unreachable("Invalid PTX version"); -} diff --git a/lib/Target/PTX/PTXSubtarget.h b/lib/Target/PTX/PTXSubtarget.h deleted file mode 100644 index ce93fef..0000000 --- a/lib/Target/PTX/PTXSubtarget.h +++ /dev/null @@ -1,131 +0,0 @@ -//===-- PTXSubtarget.h - Define Subtarget for the PTX -----------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares the PTX specific subclass of TargetSubtargetInfo. 
-// -//===----------------------------------------------------------------------===// - -#ifndef PTX_SUBTARGET_H -#define PTX_SUBTARGET_H - -#include "llvm/Target/TargetSubtargetInfo.h" - -#define GET_SUBTARGETINFO_HEADER -#include "PTXGenSubtargetInfo.inc" - -namespace llvm { -class StringRef; - - class PTXSubtarget : public PTXGenSubtargetInfo { - virtual void anchor(); - public: - - /** - * Enumeration of Shader Models supported by the back-end. - */ - enum PTXTargetEnum { - PTX_COMPUTE_1_0, /*< Compute Compatibility 1.0 */ - PTX_COMPUTE_1_1, /*< Compute Compatibility 1.1 */ - PTX_COMPUTE_1_2, /*< Compute Compatibility 1.2 */ - PTX_COMPUTE_1_3, /*< Compute Compatibility 1.3 */ - PTX_COMPUTE_2_0, /*< Compute Compatibility 2.0 */ - PTX_LAST_COMPUTE, - - PTX_SM_1_0, /*< Shader Model 1.0 */ - PTX_SM_1_1, /*< Shader Model 1.1 */ - PTX_SM_1_2, /*< Shader Model 1.2 */ - PTX_SM_1_3, /*< Shader Model 1.3 */ - PTX_SM_2_0, /*< Shader Model 2.0 */ - PTX_SM_2_1, /*< Shader Model 2.1 */ - PTX_SM_2_2, /*< Shader Model 2.2 */ - PTX_SM_2_3, /*< Shader Model 2.3 */ - PTX_LAST_SM - }; - - /** - * Enumeration of PTX versions supported by the back-end. - * - * Currently, PTX 2.0 is the minimum supported version. - */ - enum PTXVersionEnum { - PTX_VERSION_2_0, /*< PTX Version 2.0 */ - PTX_VERSION_2_1, /*< PTX Version 2.1 */ - PTX_VERSION_2_2, /*< PTX Version 2.2 */ - PTX_VERSION_2_3 /*< PTX Version 2.3 */ - }; - - private: - - /// Shader Model supported on the target GPU. - PTXTargetEnum PTXTarget; - - /// PTX Language Version. - PTXVersionEnum PTXVersion; - - // The native .f64 type is supported on the hardware. - bool SupportsDouble; - - // Support the fused-multiply add (FMA) and multiply-add (MAD) - // instructions - bool SupportsFMA; - - // Use .u64 instead of .u32 for addresses. 
- bool Is64Bit; - - public: - - PTXSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool is64Bit); - - // Target architecture accessors - std::string getTargetString() const; - - std::string getPTXVersionString() const; - - bool supportsDouble() const { return SupportsDouble; } - - bool is64Bit() const { return Is64Bit; } - - bool supportsFMA() const { return SupportsFMA; } - - bool supportsPTX21() const { return PTXVersion >= PTX_VERSION_2_1; } - - bool supportsPTX22() const { return PTXVersion >= PTX_VERSION_2_2; } - - bool supportsPTX23() const { return PTXVersion >= PTX_VERSION_2_3; } - - bool fdivNeedsRoundingMode() const { - return (PTXTarget >= PTX_SM_1_3 && PTXTarget < PTX_LAST_SM) || - (PTXTarget >= PTX_COMPUTE_1_3 && PTXTarget < PTX_LAST_COMPUTE); - } - - bool fmadNeedsRoundingMode() const { - return (PTXTarget >= PTX_SM_1_3 && PTXTarget < PTX_LAST_SM) || - (PTXTarget >= PTX_COMPUTE_1_3 && PTXTarget < PTX_LAST_COMPUTE); - } - - bool useParamSpaceForDeviceArgs() const { - return (PTXTarget >= PTX_SM_2_0 && PTXTarget < PTX_LAST_SM) || - (PTXTarget >= PTX_COMPUTE_2_0 && PTXTarget < PTX_LAST_COMPUTE); - } - - bool callsAreHandled() const { - return (PTXTarget >= PTX_SM_2_0 && PTXTarget < PTX_LAST_SM) || - (PTXTarget >= PTX_COMPUTE_2_0 && PTXTarget < PTX_LAST_COMPUTE); - } - - bool emitPtrAttribute() const { - return PTXVersion >= PTX_VERSION_2_2; - } - - void ParseSubtargetFeatures(StringRef CPU, StringRef FS); - }; // class PTXSubtarget -} // namespace llvm - -#endif // PTX_SUBTARGET_H diff --git a/lib/Target/PTX/PTXTargetMachine.cpp b/lib/Target/PTX/PTXTargetMachine.cpp deleted file mode 100644 index c55a658..0000000 --- a/lib/Target/PTX/PTXTargetMachine.cpp +++ /dev/null @@ -1,165 +0,0 @@ -//===-- PTXTargetMachine.cpp - Define TargetMachine for PTX ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Top-level implementation for the PTX target. 
-// -//===----------------------------------------------------------------------===// - -#include "PTXTargetMachine.h" -#include "PTX.h" -#include "llvm/PassManager.h" -#include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/Verifier.h" -#include "llvm/Assembly/PrintModulePass.h" -#include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetLoweringObjectFile.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" -#include "llvm/Transforms/Scalar.h" - - -using namespace llvm; - -namespace llvm { - MCStreamer *createPTXAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, - bool isVerboseAsm, bool useLoc, - bool useCFI, bool useDwarfDirectory, - MCInstPrinter *InstPrint, - MCCodeEmitter *CE, - MCAsmBackend *MAB, - bool ShowInst); -} - -extern "C" void LLVMInitializePTXTarget() { - - RegisterTargetMachine<PTX32TargetMachine> X(ThePTX32Target); - RegisterTargetMachine<PTX64TargetMachine> Y(ThePTX64Target); - - TargetRegistry::RegisterAsmStreamer(ThePTX32Target, createPTXAsmStreamer); - TargetRegistry::RegisterAsmStreamer(ThePTX64Target, createPTXAsmStreamer); -} - -namespace { - const char* DataLayout32 = - "e-p:32:32-i64:32:32-f64:32:32-v128:32:128-v64:32:64-n32:64"; - const char* DataLayout64 = - "e-p:64:64-i64:32:32-f64:32:32-v128:32:128-v64:32:64-n32:64"; -} - -// DataLayout and FrameLowering are filled with dummy data -PTXTargetMachine::PTXTargetMachine(const Target &T, - StringRef TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool is64Bit) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - DataLayout(is64Bit ? DataLayout64 : DataLayout32), - Subtarget(TT, CPU, FS, is64Bit), - FrameLowering(Subtarget), - InstrInfo(*this), - TSInfo(*this), - TLInfo(*this) { -} - -void PTX32TargetMachine::anchor() { } - -PTX32TargetMachine::PTX32TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : PTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) { -} - -void PTX64TargetMachine::anchor() { } - -PTX64TargetMachine::PTX64TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : PTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) { -} - -namespace llvm { -/// PTX Code Generator Pass Configuration Options. 
-class PTXPassConfig : public TargetPassConfig { -public: - PTXPassConfig(PTXTargetMachine *TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} - - PTXTargetMachine &getPTXTargetMachine() const { - return getTM<PTXTargetMachine>(); - } - - bool addInstSelector(); - FunctionPass *createTargetRegisterAllocator(bool); - void addOptimizedRegAlloc(FunctionPass *RegAllocPass); - bool addPostRegAlloc(); - void addMachineLateOptimization(); - bool addPreEmitPass(); -}; -} // namespace - -TargetPassConfig *PTXTargetMachine::createPassConfig(PassManagerBase &PM) { - PTXPassConfig *PassConfig = new PTXPassConfig(this, PM); - PassConfig->disablePass(PrologEpilogCodeInserterID); - return PassConfig; -} - -bool PTXPassConfig::addInstSelector() { - PM.add(createPTXISelDag(getPTXTargetMachine(), getOptLevel())); - return false; -} - -FunctionPass *PTXPassConfig::createTargetRegisterAllocator(bool /*Optimized*/) { - return createPTXRegisterAllocator(); -} - -// Modify the optimized compilation path to bypass optimized register alloction. -void PTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { - addFastRegAlloc(RegAllocPass); -} - -bool PTXPassConfig::addPostRegAlloc() { - // PTXMFInfoExtract must after register allocation! - //PM.add(createPTXMFInfoExtract(getPTXTargetMachine())); - return false; -} - -/// Add passes that optimize machine instructions after register allocation. -void PTXPassConfig::addMachineLateOptimization() { - if (addPass(BranchFolderPassID) != &NoPassID) - printAndVerify("After BranchFolding"); - - if (addPass(TailDuplicateID) != &NoPassID) - printAndVerify("After TailDuplicate"); -} - -bool PTXPassConfig::addPreEmitPass() { - PM.add(createPTXMFInfoExtract(getPTXTargetMachine(), getOptLevel())); - PM.add(createPTXFPRoundingModePass(getPTXTargetMachine(), getOptLevel())); - return true; -} diff --git a/lib/Target/PTX/PTXTargetMachine.h b/lib/Target/PTX/PTXTargetMachine.h deleted file mode 100644 index 278d155..0000000 --- a/lib/Target/PTX/PTXTargetMachine.h +++ /dev/null @@ -1,104 +0,0 @@ -//===-- PTXTargetMachine.h - Define TargetMachine for PTX -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares the PTX specific subclass of TargetMachine. 
-// -//===----------------------------------------------------------------------===// - -#ifndef PTX_TARGET_MACHINE_H -#define PTX_TARGET_MACHINE_H - -#include "PTXISelLowering.h" -#include "PTXInstrInfo.h" -#include "PTXFrameLowering.h" -#include "PTXSelectionDAGInfo.h" -#include "PTXSubtarget.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetFrameLowering.h" -#include "llvm/Target/TargetMachine.h" - -namespace llvm { -class PTXTargetMachine : public LLVMTargetMachine { - private: - const TargetData DataLayout; - PTXSubtarget Subtarget; // has to be initialized before FrameLowering - PTXFrameLowering FrameLowering; - PTXInstrInfo InstrInfo; - PTXSelectionDAGInfo TSInfo; - PTXTargetLowering TLInfo; - - public: - PTXTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool is64Bit); - - virtual const TargetData *getTargetData() const { return &DataLayout; } - - virtual const TargetFrameLowering *getFrameLowering() const { - return &FrameLowering; - } - - virtual const PTXInstrInfo *getInstrInfo() const { return &InstrInfo; } - virtual const TargetRegisterInfo *getRegisterInfo() const { - return &InstrInfo.getRegisterInfo(); } - - virtual const PTXTargetLowering *getTargetLowering() const { - return &TLInfo; } - - virtual const PTXSelectionDAGInfo* getSelectionDAGInfo() const { - return &TSInfo; - } - - virtual const PTXSubtarget *getSubtargetImpl() const { return &Subtarget; } - - // Emission of machine code through JITCodeEmitter is not supported. - virtual bool addPassesToEmitMachineCode(PassManagerBase &, - JITCodeEmitter &, - bool = true) { - return true; - } - - // Emission of machine code through MCJIT is not supported. - virtual bool addPassesToEmitMC(PassManagerBase &, - MCContext *&, - raw_ostream &, - bool = true) { - return true; - } - - // Pass Pipeline Configuration - virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); -}; // class PTXTargetMachine - - -class PTX32TargetMachine : public PTXTargetMachine { - virtual void anchor(); -public: - - PTX32TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL); -}; // class PTX32TargetMachine - -class PTX64TargetMachine : public PTXTargetMachine { - virtual void anchor(); -public: - - PTX64TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL); -}; // class PTX32TargetMachine - -} // namespace llvm - -#endif // PTX_TARGET_MACHINE_H diff --git a/lib/Target/PTX/TargetInfo/CMakeLists.txt b/lib/Target/PTX/TargetInfo/CMakeLists.txt deleted file mode 100644 index d9a5da3..0000000 --- a/lib/Target/PTX/TargetInfo/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. 
) - -add_llvm_library(LLVMPTXInfo - PTXTargetInfo.cpp - ) - -add_dependencies(LLVMPTXInfo PTXCommonTableGen) diff --git a/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp b/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp deleted file mode 100644 index 09a2735..0000000 --- a/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp +++ /dev/null @@ -1,25 +0,0 @@ -//===-- PTXTargetInfo.cpp - PTX Target Implementation ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "PTX.h" -#include "llvm/Module.h" -#include "llvm/Support/TargetRegistry.h" - -using namespace llvm; - -Target llvm::ThePTX32Target; -Target llvm::ThePTX64Target; - -extern "C" void LLVMInitializePTXTargetInfo() { - // see llvm/ADT/Triple.h - RegisterTarget<Triple::ptx32> X32(ThePTX32Target, "ptx32", - "PTX (32-bit) [Experimental]"); - RegisterTarget<Triple::ptx64> X64(ThePTX64Target, "ptx64", - "PTX (64-bit) [Experimental]"); -} diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt index bcd8bd2..192d18d 100644 --- a/lib/Target/PowerPC/CMakeLists.txt +++ b/lib/Target/PowerPC/CMakeLists.txt @@ -14,6 +14,7 @@ add_llvm_target(PowerPCCodeGen PPCAsmPrinter.cpp PPCBranchSelector.cpp PPCCodeEmitter.cpp + PPCCTRLoops.cpp PPCHazardRecognizers.cpp PPCInstrInfo.cpp PPCISelDAGToDAG.cpp @@ -28,6 +29,8 @@ add_llvm_target(PowerPCCodeGen PPCSelectionDAGInfo.cpp ) +add_dependencies(LLVMPowerPCCodeGen intrinsics_gen) + add_subdirectory(InstPrinter) add_subdirectory(TargetInfo) add_subdirectory(MCTargetDesc) diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp index 61d23ce..d175e3e 100644 --- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp +++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp @@ -86,8 +86,33 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, const char *Modifier) { - assert(Modifier && "Must specify 'cc' or 'reg' as predicate op modifier!"); unsigned Code = MI->getOperand(OpNo).getImm(); + if (!Modifier) { + unsigned CCReg = MI->getOperand(OpNo+1).getReg(); + unsigned RegNo; + switch (CCReg) { + default: llvm_unreachable("Unknown CR register"); + case PPC::CR0: RegNo = 0; break; + case PPC::CR1: RegNo = 1; break; + case PPC::CR2: RegNo = 2; break; + case PPC::CR3: RegNo = 3; break; + case PPC::CR4: RegNo = 4; break; + case PPC::CR5: RegNo = 5; break; + case PPC::CR6: RegNo = 6; break; + case PPC::CR7: RegNo = 7; break; + } + + // Print the CR bit number. The Code is ((BI << 5) | BO) for a + // BCC, but we must have the positive form here (BO == 12) + unsigned BI = Code >> 5; + assert((Code & 0xF) == 12 && + "BO in predicate bit must have the positive form"); + + unsigned Value = 4*RegNo + BI; + O << Value; + return; + } + if (StringRef(Modifier) == "cc") { switch ((PPC::Predicate)Code) { case PPC::PRED_ALWAYS: return; // Don't print anything for always. 
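The `4*RegNo + BI` arithmetic in the printer hunk above follows the PPC condition-register layout: the CR is eight 4-bit fields (CR0-CR7), so bit BI (LT=0, GT=1, EQ=2, SO=3) of field CRn is flat bit 4*n + BI. A minimal sketch of that mapping; `crBitNumber` is a hypothetical helper, not code from this patch:

#include <cassert>

// Flat CR bit number for bit BI (LT=0, GT=1, EQ=2, SO=3) of CR field RegNo.
unsigned crBitNumber(unsigned RegNo, unsigned BI) {
  assert(RegNo < 8 && BI < 4 && "eight CR fields of four bits each");
  return 4 * RegNo + BI;
}
// e.g. the EQ bit of CR6 is printed as 26.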
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h index 73fd534..8f1e211 100644 --- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h +++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h @@ -42,7 +42,7 @@ public: void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printPredicateOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O, const char *Modifier); + raw_ostream &O, const char *Modifier = 0); void printS5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index 5a6827f..f652422 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -77,6 +77,7 @@ public: } // end anonymous namespace MCCodeEmitter *llvm::createPPCMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx) { return new PPCMCCodeEmitter(MCII, STI, Ctx); diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h index b7fa064..7162e15 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -22,6 +22,7 @@ class MCCodeEmitter; class MCContext; class MCInstrInfo; class MCObjectWriter; +class MCRegisterInfo; class MCSubtargetInfo; class Target; class StringRef; @@ -31,6 +32,7 @@ extern Target ThePPC32Target; extern Target ThePPC64Target; MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx); diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h index 24a7178..9103e12 100644 --- a/lib/Target/PowerPC/PPC.h +++ b/lib/Target/PowerPC/PPC.h @@ -30,6 +30,7 @@ namespace llvm { class AsmPrinter; class MCInst; + FunctionPass *createPPCCTRLoops(); FunctionPass *createPPCBranchSelectionPass(); FunctionPass *createPPCISelDag(PPCTargetMachine &TM); FunctionPass *createPPCJITCodeEmitterPass(PPCTargetMachine &TM, @@ -50,21 +51,27 @@ namespace llvm { /// and jumps to external functions on Tiger and earlier. MO_DARWIN_STUB = 1, - /// MO_LO16, MO_HA16 - lo16(symbol) and ha16(symbol) - MO_LO16 = 4, MO_HA16 = 8, - /// MO_PIC_FLAG - If this bit is set, the symbol reference is relative to /// the function's picbase, e.g. lo16(symbol-picbase). - MO_PIC_FLAG = 16, + MO_PIC_FLAG = 4, /// MO_NLP_FLAG - If this bit is set, the symbol reference is actually to /// the non_lazy_ptr for the global, e.g. lo16(symbol$non_lazy_ptr-picbase). - MO_NLP_FLAG = 32, + MO_NLP_FLAG = 8, /// MO_NLP_HIDDEN_FLAG - If this bit is set, the symbol reference is to a /// symbol with hidden visibility. This causes a different kind of /// non-lazy-pointer to be generated. - MO_NLP_HIDDEN_FLAG = 64 + MO_NLP_HIDDEN_FLAG = 16, + + /// The next are not flags but distinct values. 
+ MO_ACCESS_MASK = 224, + + /// MO_LO16, MO_HA16 - lo16(symbol) and ha16(symbol) + MO_LO16 = 32, MO_HA16 = 64, + + MO_TPREL16_HA = 96, + MO_TPREL16_LO = 128 }; } // end namespace PPCII diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td index c554d39..b7f1688 100644 --- a/lib/Target/PowerPC/PPC.td +++ b/lib/Target/PowerPC/PPC.td @@ -35,6 +35,8 @@ def Directive970 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_970", "">; def Directive32 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_32", "">; def Directive64 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_64", "">; def DirectiveA2 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_A2", "">; +def DirectivePwr6: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6", "">; +def DirectivePwr7: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR7", "">; def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true", "Enable 64-bit instructions">; @@ -42,12 +44,14 @@ def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true", "Enable 64-bit registers usage for ppc32 [beta]">; def FeatureAltivec : SubtargetFeature<"altivec","HasAltivec", "true", "Enable Altivec instructions">; -def FeatureGPUL : SubtargetFeature<"gpul","IsGigaProcessor", "true", - "Enable GPUL instructions">; +def FeatureMFOCRF : SubtargetFeature<"mfocrf","HasMFOCRF", "true", + "Enable the MFOCRF instruction">; def FeatureFSqrt : SubtargetFeature<"fsqrt","HasFSQRT", "true", "Enable the fsqrt instruction">; def FeatureSTFIWX : SubtargetFeature<"stfiwx","HasSTFIWX", "true", "Enable the stfiwx instruction">; +def FeatureISEL : SubtargetFeature<"isel","HasISEL", "true", + "Enable the isel instruction">; def FeatureBookE : SubtargetFeature<"booke", "IsBookE", "true", "Enable Book E instructions">; @@ -64,8 +68,10 @@ include "PPCInstrInfo.td" // def : Processor<"generic", G3Itineraries, [Directive32]>; -def : Processor<"440", PPC440Itineraries, [Directive440, FeatureBookE]>; -def : Processor<"450", PPC440Itineraries, [Directive440, FeatureBookE]>; +def : Processor<"440", PPC440Itineraries, [Directive440, FeatureISEL, + FeatureBookE]>; +def : Processor<"450", PPC440Itineraries, [Directive440, FeatureISEL, + FeatureBookE]>; def : Processor<"601", G3Itineraries, [Directive601]>; def : Processor<"602", G3Itineraries, [Directive602]>; def : Processor<"603", G3Itineraries, [Directive603]>; @@ -74,28 +80,37 @@ def : Processor<"603ev", G3Itineraries, [Directive603]>; def : Processor<"604", G3Itineraries, [Directive604]>; def : Processor<"604e", G3Itineraries, [Directive604]>; def : Processor<"620", G3Itineraries, [Directive620]>; -def : Processor<"g3", G3Itineraries, [Directive7400]>; +def : Processor<"750", G4Itineraries, [Directive750]>; +def : Processor<"g3", G3Itineraries, [Directive750]>; def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec]>; def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec]>; def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec]>; -def : Processor<"g4+", G4PlusItineraries, [Directive750, FeatureAltivec]>; -def : Processor<"750", G4Itineraries, [Directive750, FeatureAltivec]>; +def : Processor<"g4+", G4PlusItineraries, [Directive7400, FeatureAltivec]>; def : Processor<"970", G5Itineraries, [Directive970, FeatureAltivec, - FeatureGPUL, FeatureFSqrt, FeatureSTFIWX, + FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX, Feature64Bit /*, Feature64BitRegs */]>; def : Processor<"g5", G5Itineraries, [Directive970, FeatureAltivec, - FeatureGPUL, FeatureFSqrt, FeatureSTFIWX, + 
FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX, Feature64Bit /*, Feature64BitRegs */]>; -def : Processor<"a2", PPCA2Itineraries, [DirectiveA2, FeatureBookE, - FeatureFSqrt, FeatureSTFIWX, - Feature64Bit - /*, Feature64BitRegs */]>; +def : Processor<"a2", PPCA2Itineraries, [DirectiveA2, FeatureBookE, + FeatureMFOCRF, FeatureFSqrt, + FeatureSTFIWX, FeatureISEL, + Feature64Bit + /*, Feature64BitRegs */]>; +def : Processor<"pwr6", G5Itineraries, + [DirectivePwr6, FeatureAltivec, + FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX, + Feature64Bit /*, Feature64BitRegs */]>; +def : Processor<"pwr7", G5Itineraries, + [DirectivePwr7, FeatureAltivec, + FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX, + FeatureISEL, Feature64Bit /*, Feature64BitRegs */]>; def : Processor<"ppc", G3Itineraries, [Directive32]>; def : Processor<"ppc64", G5Itineraries, [Directive64, FeatureAltivec, - FeatureGPUL, FeatureFSqrt, FeatureSTFIWX, + FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX, Feature64Bit /*, Feature64BitRegs */]>; diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index fb7aa71..f76b89c 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -22,8 +22,8 @@ #include "PPCSubtarget.h" #include "InstPrinter/PPCInstPrinter.h" #include "MCTargetDesc/PPCPredicates.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" #include "llvm/Module.h" #include "llvm/Assembly/Writer.h" @@ -248,7 +248,9 @@ bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, if (ExtraCode[1] != 0) return true; // Unknown modifier. switch (ExtraCode[0]) { - default: return true; // Unknown modifier. + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); case 'c': // Don't print "$" before a global var name or constant. break; // PPC never has a prefix. case 'L': // Write second word of DImode reference. @@ -451,11 +453,13 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) { "ppc750", "ppc970", "ppcA2", + "power6", + "power7", "ppc64" }; unsigned Directive = Subtarget.getDarwinDirective(); - if (Subtarget.isGigaProcessor() && Directive < PPC::DIR_970) + if (Subtarget.hasMFOCRF() && Directive < PPC::DIR_970) Directive = PPC::DIR_970; if (Subtarget.hasAltivec() && Directive < PPC::DIR_7400) Directive = PPC::DIR_7400; diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp index 5f775e1..21a0fb2 100644 --- a/lib/Target/PowerPC/PPCBranchSelector.cpp +++ b/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -135,21 +135,33 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { MBBStartOffset += 4; continue; } - + // Otherwise, we have to expand it to a long branch. - // The BCC operands are: - // 0. PPC branch predicate - // 1. CR register - // 2. Target MBB - PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm(); - unsigned CRReg = I->getOperand(1).getReg(); - MachineInstr *OldBranch = I; DebugLoc dl = OldBranch->getDebugLoc(); - - // Jump over the uncond branch inst (i.e. $PC+8) on opposite condition. - BuildMI(MBB, I, dl, TII->get(PPC::BCC)) - .addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2); + + if (I->getOpcode() == PPC::BCC) { + // The BCC operands are: + // 0. PPC branch predicate + // 1. CR register + // 2. 
 Target MBB
+      PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm();
+      unsigned CRReg = I->getOperand(1).getReg();
+
+      // Jump over the uncond branch inst (i.e. $PC+8) on opposite condition.
+      BuildMI(MBB, I, dl, TII->get(PPC::BCC))
+        .addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2);
+    } else if (I->getOpcode() == PPC::BDNZ) {
+      BuildMI(MBB, I, dl, TII->get(PPC::BDZ)).addImm(2);
+    } else if (I->getOpcode() == PPC::BDNZ8) {
+      BuildMI(MBB, I, dl, TII->get(PPC::BDZ8)).addImm(2);
+    } else if (I->getOpcode() == PPC::BDZ) {
+      BuildMI(MBB, I, dl, TII->get(PPC::BDNZ)).addImm(2);
+    } else if (I->getOpcode() == PPC::BDZ8) {
+      BuildMI(MBB, I, dl, TII->get(PPC::BDNZ8)).addImm(2);
+    } else {
+      llvm_unreachable("Unhandled branch type!");
+    }

       // Uncond branch to the real destination.
       I = BuildMI(MBB, I, dl, TII->get(PPC::B)).addMBB(Dest);
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
new file mode 100644
index 0000000..f50f9b5
--- /dev/null
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -0,0 +1,721 @@
+//===-- PPCCTRLoops.cpp - Identify and generate CTR loops -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies loops where we can generate the PPC branch instructions
+// that decrement and test the count register (CTR) (bdnz and friends).
+// This pass is based on the HexagonHardwareLoops pass.
+//
+// The pattern that defines the induction variable can change depending on
+// prior optimizations. For example, the IndVarSimplify phase run by 'opt'
+// normalizes induction variables, and the Loop Strength Reduction pass
+// run by 'llc' may also make changes to the induction variable.
+// The pattern detected by this phase is due to running Strength Reduction.
+//
+// Criteria for CTR loops:
+//  - Countable loops (w/ ind. var for a trip count).
+//  - Assumes loops are normalized by IndVarSimplify.
+//  - Try inner-most loops first.
+//  - No nested CTR loops.
+//  - No function calls in loops.
+//
+// Note: As with unconverted loops, PPCBranchSelector must be run after this
+// pass in order to convert long-displacement jumps into jump pairs.
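+//
+// For example (an illustrative sketch, not a test case from this patch),
+// a loop like
+//
+//   int sum(const int *a) {
+//     int s = 0;
+//     for (int i = 0; i != 100; ++i)  // countable trip count, no calls
+//       s += a[i];
+//     return s;
+//   }
+//
+// can be emitted as
+//
+//   li    r0, 100     ; materialize the trip count
+//   mtctr r0          ; move it into the count register
+// loop:
+//   ...loop body...
+//   bdnz  loop        ; decrement CTR, branch while non-zero
+//
+// so no general-purpose register is tied up by the induction variable.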
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ctrloops"
+#include "PPC.h"
+#include "PPCTargetMachine.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "llvm/Constants.h"
+#include "llvm/PassSupport.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include <algorithm>
+
+using namespace llvm;
+
+STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");
+
+namespace {
+  class CountValue;
+  struct PPCCTRLoops : public MachineFunctionPass {
+    MachineLoopInfo *MLI;
+    MachineRegisterInfo *MRI;
+    const TargetInstrInfo *TII;
+
+  public:
+    static char ID;   // Pass identification, replacement for typeid
+
+    PPCCTRLoops() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    const char *getPassName() const { return "PPC CTR Loops"; }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<MachineDominatorTree>();
+      AU.addPreserved<MachineDominatorTree>();
+      AU.addRequired<MachineLoopInfo>();
+      AU.addPreserved<MachineLoopInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    /// getCanonicalInductionVariable - Check to see if the loop has a
+    /// canonical induction variable.
+    /// Should be defined in MachineLoop. Based upon version in class Loop.
+    void getCanonicalInductionVariable(MachineLoop *L,
+                                       SmallVector<MachineInstr *, 4> &IVars,
+                                       SmallVector<MachineInstr *, 4> &IOps) const;
+
+    /// getTripCount - Return a loop-invariant LLVM register indicating the
+    /// number of times the loop will be executed. If the trip-count cannot
+    /// be determined, this returns null.
+    CountValue *getTripCount(MachineLoop *L,
+                             SmallVector<MachineInstr *, 2> &OldInsts) const;
+
+    /// isInductionOperation - Return true if the instruction matches the
+    /// pattern for an operation that defines an induction variable.
+    bool isInductionOperation(const MachineInstr *MI, unsigned IVReg) const;
+
+    /// isInvalidLoopOperation - Return true if the instruction is not valid
+    /// within a CTR loop.
+    bool isInvalidLoopOperation(const MachineInstr *MI) const;
+
+    /// containsInvalidInstruction - Return true if the loop contains an
+    /// instruction that inhibits using the CTR loop.
+    bool containsInvalidInstruction(MachineLoop *L) const;
+
+    /// convertToCTRLoop - Given a loop, check if we can convert it to a
+    /// CTR loop. If so, then perform the conversion and return true.
+    bool convertToCTRLoop(MachineLoop *L);
+
+    /// isDead - Return true if the instruction is now dead.
+    bool isDead(const MachineInstr *MI,
+                SmallVector<MachineInstr *, 1> &DeadPhis) const;
+
+    /// removeIfDead - Remove the instruction if it is now dead.
+    void removeIfDead(MachineInstr *MI);
+  };
+
+  char PPCCTRLoops::ID = 0;
+
+
+  // CountValue class - Abstraction for a trip count of a loop. A
+  // smaller version of the MachineOperand class without the concerns
+  // of changing the operand representation.
+  class CountValue {
+  public:
+    enum CountValueType {
+      CV_Register,
+      CV_Immediate
+    };
+  private:
+    CountValueType Kind;
+    union Values {
+      unsigned RegNum;
+      int64_t ImmVal;
+      Values(unsigned r) : RegNum(r) {}
+      Values(int64_t i) : ImmVal(i) {}
+    } Contents;
+    bool isNegative;
+
+  public:
+    CountValue(unsigned r, bool neg) : Kind(CV_Register), Contents(r),
+                                       isNegative(neg) {}
+    explicit CountValue(int64_t i) : Kind(CV_Immediate), Contents(i),
+                                     isNegative(i < 0) {}
+    CountValueType getType() const { return Kind; }
+    bool isReg() const { return Kind == CV_Register; }
+    bool isImm() const { return Kind == CV_Immediate; }
+    bool isNeg() const { return isNegative; }
+
+    unsigned getReg() const {
+      assert(isReg() && "Wrong CountValue accessor");
+      return Contents.RegNum;
+    }
+    void setReg(unsigned Val) {
+      Contents.RegNum = Val;
+    }
+    int64_t getImm() const {
+      assert(isImm() && "Wrong CountValue accessor");
+      if (isNegative) {
+        return -Contents.ImmVal;
+      }
+      return Contents.ImmVal;
+    }
+    void setImm(int64_t Val) {
+      Contents.ImmVal = Val;
+    }
+
+    void print(raw_ostream &OS, const TargetMachine *TM = 0) const {
+      if (isReg()) { OS << PrintReg(getReg()); }
+      if (isImm()) { OS << getImm(); }
+    }
+  };
+} // end anonymous namespace
+
+
+/// isCompareEqualsImm - Returns true if the instruction is a compare equals
+/// instruction with an immediate operand.
+static bool isCompareEqualsImm(const MachineInstr *MI, bool &SignedCmp) {
+  if (MI->getOpcode() == PPC::CMPWI || MI->getOpcode() == PPC::CMPDI) {
+    SignedCmp = true;
+    return true;
+  } else if (MI->getOpcode() == PPC::CMPLWI || MI->getOpcode() == PPC::CMPLDI) {
+    SignedCmp = false;
+    return true;
+  }
+
+  return false;
+}
+
+
+/// createPPCCTRLoops - Factory for creating
+/// the CTR loop phase.
+FunctionPass *llvm::createPPCCTRLoops() {
+  return new PPCCTRLoops();
+}
+
+
+bool PPCCTRLoops::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********* PPC CTR Loops *********\n");
+
+  bool Changed = false;
+
+  // get the loop information
+  MLI = &getAnalysis<MachineLoopInfo>();
+  // get the register information
+  MRI = &MF.getRegInfo();
+  // get the target-specific instruction info
+  TII = MF.getTarget().getInstrInfo();
+
+  for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end();
+       I != E; ++I) {
+    MachineLoop *L = *I;
+    if (!L->getParentLoop()) {
+      Changed |= convertToCTRLoop(L);
+    }
+  }
+
+  return Changed;
+}
+
+/// getCanonicalInductionVariable - Check to see if the loop has a canonical
+/// induction variable. We check for a simple recurrence pattern - an
+/// integer recurrence that decrements by one each time through the loop and
+/// ends at zero. If so, return the phi node that corresponds to it.
+///
+/// Based upon the similar code in LoopInfo except this code is specific to
+/// the machine.
+/// This method assumes that the IndVarSimplify pass has been run by 'opt'.
+///
+void
+PPCCTRLoops::getCanonicalInductionVariable(MachineLoop *L,
+                                  SmallVector<MachineInstr *, 4> &IVars,
+                                  SmallVector<MachineInstr *, 4> &IOps) const {
+  MachineBasicBlock *TopMBB = L->getTopBlock();
+  MachineBasicBlock::pred_iterator PI = TopMBB->pred_begin();
+  assert(PI != TopMBB->pred_end() &&
+         "Loop must have more than one incoming edge!");
+  MachineBasicBlock *Backedge = *PI++;
+  if (PI == TopMBB->pred_end()) return;  // dead loop
+  MachineBasicBlock *Incoming = *PI++;
+  if (PI != TopMBB->pred_end()) return;  // multiple backedges?
+
+  // Make sure there is one incoming and one backedge and determine which
+  // is which.
+  if (L->contains(Incoming)) {
+    if (L->contains(Backedge))
+      return;
+    std::swap(Incoming, Backedge);
+  } else if (!L->contains(Backedge))
+    return;
+
+  // Loop over all of the PHI nodes, looking for a canonical induction variable:
+  //   - The PHI node is "reg1 = PHI reg2, BB1, reg3, BB2".
+  //   - The recurrence comes from the backedge.
+  //   - The definition is an induction operation.
+  for (MachineBasicBlock::iterator I = TopMBB->begin(), E = TopMBB->end();
+       I != E && I->isPHI(); ++I) {
+    MachineInstr *MPhi = &*I;
+    unsigned DefReg = MPhi->getOperand(0).getReg();
+    for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2) {
+      // Check each operand for the value from the backedge.
+      MachineBasicBlock *MBB = MPhi->getOperand(i+1).getMBB();
+      if (L->contains(MBB)) { // operand comes from the backedge
+        // Check if the definition is an induction operation.
+        MachineInstr *DI = MRI->getVRegDef(MPhi->getOperand(i).getReg());
+        if (isInductionOperation(DI, DefReg)) {
+          IOps.push_back(DI);
+          IVars.push_back(MPhi);
+        }
+      }
+    }
+  }
+  return;
+}
+
+/// getTripCount - Return a loop-invariant LLVM value indicating the
+/// number of times the loop will be executed. The trip count can
+/// be either a register or a constant value. If the trip-count
+/// cannot be determined, this returns null.
+///
+/// We find the trip count from the phi instruction that defines the
+/// induction variable. We follow the links to the CMP instruction
+/// to get the trip count.
+///
+/// Based upon getTripCount in LoopInfo.
+///
+CountValue *PPCCTRLoops::getTripCount(MachineLoop *L,
+                           SmallVector<MachineInstr *, 2> &OldInsts) const {
+  MachineBasicBlock *LastMBB = L->getExitingBlock();
+  // Don't generate a CTR loop if the loop has more than one exit.
+  if (LastMBB == 0)
+    return 0;
+
+  MachineBasicBlock::iterator LastI = LastMBB->getFirstTerminator();
+  if (LastI->getOpcode() != PPC::BCC)
+    return 0;
+
+  // We need to make sure that this compare is defining the condition
+  // register actually used by the terminating branch.
+
+  unsigned PredReg = LastI->getOperand(1).getReg();
+  DEBUG(dbgs() << "Examining loop with first terminator: " << *LastI);
+
+  unsigned PredCond = LastI->getOperand(0).getImm();
+  if (PredCond != PPC::PRED_EQ && PredCond != PPC::PRED_NE)
+    return 0;
+
+  // Check that the loop has an induction variable.
+  SmallVector<MachineInstr *, 4> IVars, IOps;
+  getCanonicalInductionVariable(L, IVars, IOps);
+  for (unsigned i = 0; i < IVars.size(); ++i) {
+    MachineInstr *IOp = IOps[i];
+    MachineInstr *IV_Inst = IVars[i];
+
+    // Canonical loops will end with a 'cmpwi/cmpdi cr, IV, Imm':
+    //   if Imm is 0, get the count from the PHI operand;
+    //   if Imm is -M, then M is the count;
+    //   otherwise, Imm is the count.
+    MachineOperand *IV_Opnd;
+    const MachineOperand *InitialValue;
+    if (!L->contains(IV_Inst->getOperand(2).getMBB())) {
+      InitialValue = &IV_Inst->getOperand(1);
+      IV_Opnd = &IV_Inst->getOperand(3);
+    } else {
+      InitialValue = &IV_Inst->getOperand(3);
+      IV_Opnd = &IV_Inst->getOperand(1);
+    }
+
+    DEBUG(dbgs() << "Considering:\n");
+    DEBUG(dbgs() << "  induction operation: " << *IOp);
+    DEBUG(dbgs() << "  induction variable: " << *IV_Inst);
+    DEBUG(dbgs() << "  initial value: " << *InitialValue << "\n");
+
+    // Look for the cmp instruction to determine if we
+    // can get a useful trip count. The trip count can
+    // be either a register or an immediate. The location
+    // of the value depends upon the type (reg or imm).
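+    // A worked example (illustrative only, not from a real test case):
+    // given
+    //   %init = LI 0                    ; InitialValue
+    //   %iv   = PHI %init, <ph>, %next, <latch>
+    //   %next = ADDI %iv, 1             ; iv_value == 1
+    //   CMPWI %cr0, %next, 100          ; ImmVal == 100
+    // the loop runs (100 - 0) / 1 = 100 times, so the trip count is the
+    // immediate 100.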
+    while ((IV_Opnd = IV_Opnd->getNextOperandForReg())) {
+      bool SignedCmp;
+      MachineInstr *MI = IV_Opnd->getParent();
+      if (L->contains(MI) && isCompareEqualsImm(MI, SignedCmp) &&
+          MI->getOperand(0).getReg() == PredReg) {
+
+        OldInsts.push_back(MI);
+        OldInsts.push_back(IOp);
+
+        DEBUG(dbgs() << "  compare: " << *MI);
+
+        const MachineOperand &MO = MI->getOperand(2);
+        assert(MO.isImm() && "IV Cmp Operand should be an immediate");
+
+        int64_t ImmVal;
+        if (SignedCmp)
+          ImmVal = (short) MO.getImm();
+        else
+          ImmVal = MO.getImm();
+
+        const MachineInstr *IV_DefInstr = MRI->getVRegDef(IV_Opnd->getReg());
+        assert(L->contains(IV_DefInstr->getParent()) &&
+               "IV definition should occur in loop");
+        int64_t iv_value = (short) IV_DefInstr->getOperand(2).getImm();
+
+        assert(InitialValue->isReg() && "Expecting register for init value");
+        unsigned InitialValueReg = InitialValue->getReg();
+
+        const MachineInstr *DefInstr = MRI->getVRegDef(InitialValueReg);
+
+        // Here we need to look for an immediate load (an li or lis/ori pair).
+        if (DefInstr && (DefInstr->getOpcode() == PPC::ORI8 ||
+                         DefInstr->getOpcode() == PPC::ORI)) {
+          int64_t start = (short) DefInstr->getOperand(2).getImm();
+          const MachineInstr *DefInstr2 =
+            MRI->getVRegDef(DefInstr->getOperand(0).getReg());
+          if (DefInstr2 && (DefInstr2->getOpcode() == PPC::LIS8 ||
+                            DefInstr2->getOpcode() == PPC::LIS)) {
+            DEBUG(dbgs() << "  initial constant: " << *DefInstr);
+            DEBUG(dbgs() << "  initial constant: " << *DefInstr2);
+
+            start |= int64_t(short(DefInstr2->getOperand(1).getImm())) << 16;
+
+            int64_t count = ImmVal - start;
+            if ((count % iv_value) != 0) {
+              return 0;
+            }
+            return new CountValue(count/iv_value);
+          }
+        } else if (DefInstr && (DefInstr->getOpcode() == PPC::LI8 ||
+                                DefInstr->getOpcode() == PPC::LI)) {
+          DEBUG(dbgs() << "  initial constant: " << *DefInstr);
+
+          int64_t count = ImmVal -
+            int64_t(short(DefInstr->getOperand(1).getImm()));
+          if ((count % iv_value) != 0) {
+            return 0;
+          }
+          return new CountValue(count/iv_value);
+        } else if (iv_value == 1 || iv_value == -1) {
+          // We can't determine a constant starting value.
+          if (ImmVal == 0) {
+            return new CountValue(InitialValueReg, iv_value > 0);
+          }
+          // FIXME: handle non-zero end value.
+        }
+        // FIXME: handle non-unit increments (we might not want to introduce
+        // division but we can handle some 2^n cases with shifts).
+
+      }
+    }
+  }
+  return 0;
+}
+
+/// isInductionOperation - Return true if the operation matches the
+/// pattern that defines an induction variable:
+///   addi iv, c
+///
+bool
+PPCCTRLoops::isInductionOperation(const MachineInstr *MI,
+                                  unsigned IVReg) const {
+  return ((MI->getOpcode() == PPC::ADDI || MI->getOpcode() == PPC::ADDI8) &&
+          MI->getOperand(1).isReg() && // could be a frame index instead
+          MI->getOperand(1).getReg() == IVReg);
+}
+
+/// isInvalidLoopOperation - Return true if the operation is invalid within
+/// a CTR loop.
+bool +PPCCTRLoops::isInvalidLoopOperation(const MachineInstr *MI) const { + + // call is not allowed because the callee may use a CTR loop + if (MI->getDesc().isCall()) { + return true; + } + // check if the instruction defines a CTR loop register + // (this will also catch nested CTR loops) + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.isDef() && + (MO.getReg() == PPC::CTR || MO.getReg() == PPC::CTR8)) { + return true; + } + } + return false; +} + +/// containsInvalidInstruction - Return true if the loop contains +/// an instruction that inhibits the use of the CTR loop function. +/// +bool PPCCTRLoops::containsInvalidInstruction(MachineLoop *L) const { + const std::vector<MachineBasicBlock*> Blocks = L->getBlocks(); + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + MachineBasicBlock *MBB = Blocks[i]; + for (MachineBasicBlock::iterator + MII = MBB->begin(), E = MBB->end(); MII != E; ++MII) { + const MachineInstr *MI = &*MII; + if (isInvalidLoopOperation(MI)) { + return true; + } + } + } + return false; +} + +/// isDead returns true if the instruction is dead +/// (this was essentially copied from DeadMachineInstructionElim::isDead, but +/// with special cases for inline asm, physical registers and instructions with +/// side effects removed) +bool PPCCTRLoops::isDead(const MachineInstr *MI, + SmallVector<MachineInstr *, 1> &DeadPhis) const { + // Examine each operand. + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.isDef()) { + unsigned Reg = MO.getReg(); + if (!MRI->use_nodbg_empty(Reg)) { + // This instruction has users, but if the only user is the phi node for the + // parent block, and the only use of that phi node is this instruction, then + // this instruction is dead: both it (and the phi node) can be removed. + MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg); + if (llvm::next(I) == MRI->use_end() && + I.getOperand().getParent()->isPHI()) { + MachineInstr *OnePhi = I.getOperand().getParent(); + + for (unsigned j = 0, f = OnePhi->getNumOperands(); j != f; ++j) { + const MachineOperand &OPO = OnePhi->getOperand(j); + if (OPO.isReg() && OPO.isDef()) { + unsigned OPReg = OPO.getReg(); + + MachineRegisterInfo::use_iterator nextJ; + for (MachineRegisterInfo::use_iterator J = MRI->use_begin(OPReg), + E = MRI->use_end(); J!=E; J=nextJ) { + nextJ = llvm::next(J); + MachineOperand& Use = J.getOperand(); + MachineInstr *UseMI = Use.getParent(); + + if (MI != UseMI) { + // The phi node has a user that is not MI, bail... + return false; + } + } + } + } + + DeadPhis.push_back(OnePhi); + } else { + // This def has a non-debug use. Don't delete the instruction! + return false; + } + } + } + } + + // If there are no defs with uses, the instruction is dead. + return true; +} + +void PPCCTRLoops::removeIfDead(MachineInstr *MI) { + // This procedure was essentially copied from DeadMachineInstructionElim + + SmallVector<MachineInstr *, 1> DeadPhis; + if (isDead(MI, DeadPhis)) { + DEBUG(dbgs() << "CTR looping will remove: " << *MI); + + // It is possible that some DBG_VALUE instructions refer to this + // instruction. Examine each def operand for such references; + // if found, mark the DBG_VALUE as undef (but don't delete it). 
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+      unsigned Reg = MO.getReg();
+      MachineRegisterInfo::use_iterator nextI;
+      for (MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg),
+           E = MRI->use_end(); I != E; I = nextI) {
+        nextI = llvm::next(I);  // I is invalidated by the setReg
+        MachineOperand &Use = I.getOperand();
+        MachineInstr *UseMI = Use.getParent();
+        if (UseMI == MI)
+          continue;
+        if (Use.isDebug()) // This might also be an instr -> phi -> instr
+                           // case, which can also be removed.
+          UseMI->getOperand(0).setReg(0U);
+      }
+    }
+
+    MI->eraseFromParent();
+    for (unsigned i = 0; i < DeadPhis.size(); ++i) {
+      DeadPhis[i]->eraseFromParent();
+    }
+  }
+}
+
+/// convertToCTRLoop - Check if the loop is a candidate for
+/// converting to a CTR loop. If so, then perform the
+/// transformation.
+///
+/// This function works on innermost loops first. A loop can
+/// be converted if it is a counting loop; the trip count can be
+/// either a register value or an immediate.
+///
+/// The code makes several assumptions about the representation
+/// of the loop in llvm.
+bool PPCCTRLoops::convertToCTRLoop(MachineLoop *L) {
+  bool Changed = false;
+  // Process nested loops first.
+  for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
+    Changed |= convertToCTRLoop(*I);
+  }
+  // If a nested loop has been converted, then we can't convert this loop.
+  if (Changed) {
+    return Changed;
+  }
+
+  SmallVector<MachineInstr *, 2> OldInsts;
+  // Are we able to determine the trip count for the loop?
+  CountValue *TripCount = getTripCount(L, OldInsts);
+  if (TripCount == 0) {
+    DEBUG(dbgs() << "failed to get trip count!\n");
+    return false;
+  }
+  // Does the loop contain any invalid instructions?
+  if (containsInvalidInstruction(L)) {
+    return false;
+  }
+  MachineBasicBlock *Preheader = L->getLoopPreheader();
+  // No preheader means there's no place for the loop instr.
+  if (Preheader == 0) {
+    return false;
+  }
+  MachineBasicBlock::iterator InsertPos = Preheader->getFirstTerminator();
+
+  DebugLoc dl;
+  if (InsertPos != Preheader->end())
+    dl = InsertPos->getDebugLoc();
+
+  MachineBasicBlock *LastMBB = L->getExitingBlock();
+  // Don't generate a CTR loop if the loop has more than one exit.
+  if (LastMBB == 0) {
+    return false;
+  }
+  MachineBasicBlock::iterator LastI = LastMBB->getFirstTerminator();
+
+  // Determine the loop start.
+  MachineBasicBlock *LoopStart = L->getTopBlock();
+  if (L->getLoopLatch() != LastMBB) {
+    // When the exit and latch are not the same, use the latch block as the
+    // start.
+    // The loop start address is used only after the 1st iteration, and the
+    // loop latch may contain instrs. that need to be executed after the
+    // 1st iter.
+    LoopStart = L->getLoopLatch();
+    // Make sure the latch is a successor of the exit, otherwise it won't work.
+    if (!LastMBB->isSuccessor(LoopStart)) {
+      return false;
+    }
+  }
+
+  // Convert the loop to a CTR loop.
+  DEBUG(dbgs() << "Change to CTR loop at "; L->dump());
+
+  MachineFunction *MF = LastMBB->getParent();
+  const PPCSubtarget &Subtarget = MF->getTarget().getSubtarget<PPCSubtarget>();
+  bool isPPC64 = Subtarget.isPPC64();
+
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *RC = isPPC64 ? G8RC : GPRC;
+
+  unsigned CountReg;
+  if (TripCount->isReg()) {
+    // Create a copy of the loop count register.
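+    // (On PPC64, if the count was computed in a 32-bit GPR, the copy below
+    // is emitted as EXTSW_32_64 so the full 64-bit CTR receives a
+    // sign-extended value.)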
+    const TargetRegisterClass *SrcRC =
+      MF->getRegInfo().getRegClass(TripCount->getReg());
+    CountReg = MF->getRegInfo().createVirtualRegister(RC);
+    unsigned CopyOp = (isPPC64 && SrcRC == GPRC) ?
+                        (unsigned) PPC::EXTSW_32_64 :
+                        (unsigned) TargetOpcode::COPY;
+    BuildMI(*Preheader, InsertPos, dl,
+            TII->get(CopyOp), CountReg).addReg(TripCount->getReg());
+    if (TripCount->isNeg()) {
+      unsigned CountReg1 = CountReg;
+      CountReg = MF->getRegInfo().createVirtualRegister(RC);
+      BuildMI(*Preheader, InsertPos, dl,
+              TII->get(isPPC64 ? PPC::NEG8 : PPC::NEG),
+              CountReg).addReg(CountReg1);
+    }
+  } else {
+    assert(TripCount->isImm() && "Expecting immediate value for trip count");
+    // Put the trip count in a register for transfer into the count register.
+
+    int64_t CountImm = TripCount->getImm();
+    assert(!TripCount->isNeg() && "Constant trip count must be positive");
+
+    CountReg = MF->getRegInfo().createVirtualRegister(RC);
+    if (CountImm > 0xFFFF) {
+      BuildMI(*Preheader, InsertPos, dl,
+              TII->get(isPPC64 ? PPC::LIS8 : PPC::LIS),
+              CountReg).addImm(CountImm >> 16);
+      unsigned CountReg1 = CountReg;
+      CountReg = MF->getRegInfo().createVirtualRegister(RC);
+      BuildMI(*Preheader, InsertPos, dl,
+              TII->get(isPPC64 ? PPC::ORI8 : PPC::ORI),
+              CountReg).addReg(CountReg1).addImm(CountImm & 0xFFFF);
+    } else {
+      BuildMI(*Preheader, InsertPos, dl,
+              TII->get(isPPC64 ? PPC::LI8 : PPC::LI),
+              CountReg).addImm(CountImm);
+    }
+  }
+
+  // Add the mtctr instruction to the beginning of the loop.
+  BuildMI(*Preheader, InsertPos, dl,
+          TII->get(isPPC64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(CountReg,
+            TripCount->isImm() ? RegState::Kill : 0);
+
+  // Make sure the loop start always has a reference in the CFG. We need
+  // to create a BlockAddress operand to get this mechanism to work; both
+  // the MachineBasicBlock and BasicBlock objects need the flag set.
+  LoopStart->setHasAddressTaken();
+  // This line is needed to set the hasAddressTaken flag on the BasicBlock
+  // object.
+  BlockAddress::get(const_cast<BasicBlock *>(LoopStart->getBasicBlock()));
+
+  // Replace the loop branch with a bdnz instruction.
+  dl = LastI->getDebugLoc();
+  const std::vector<MachineBasicBlock*> Blocks = L->getBlocks();
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = Blocks[i];
+    if (MBB != Preheader)
+      MBB->addLiveIn(isPPC64 ? PPC::CTR8 : PPC::CTR);
+  }
+
+  // The loop ends with either:
+  //  - a conditional branch followed by an unconditional branch, or
+  //  - a conditional branch to the loop start.
+  assert(LastI->getOpcode() == PPC::BCC &&
+         "loop end must start with a BCC instruction");
+  // Either the BCC branches to the beginning of the loop, or it
+  // branches out of the loop and there is an unconditional branch
+  // to the start of the loop.
+  MachineBasicBlock *BranchTarget = LastI->getOperand(2).getMBB();
+  BuildMI(*LastMBB, LastI, dl,
+          TII->get((BranchTarget == LoopStart) ?
+                   (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
+                   (isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(BranchTarget);
+
+  // Conditional branch; just delete it.
+  DEBUG(dbgs() << "Removing old branch: " << *LastI);
+  LastMBB->erase(LastI);
+
+  delete TripCount;
+
+  // The induction operation (add) and the comparison (cmpwi) may now be
+  // unneeded. If these are unneeded, then remove them.
+ for (unsigned i = 0; i < OldInsts.size(); ++i) + removeIfDead(OldInsts[i]); + + ++NumCTRLoops; + return true; +} + diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index b77a80b..c24afa9 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -330,6 +330,8 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { BuildMI(MBB, MBBI, dl, TII.get(PPC::MFLR), PPC::R0); if (HasFP) + // FIXME: On PPC32 SVR4, FPOffset is negative and access to negative + // offsets of R1 is not allowed. BuildMI(MBB, MBBI, dl, TII.get(PPC::STW)) .addReg(PPC::R31) .addImm(FPOffset) @@ -366,9 +368,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBFIC) ,PPC::R0) .addReg(PPC::R0, RegState::Kill) .addImm(NegFrameSize); - BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX)) + BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX), PPC::R1) .addReg(PPC::R1, RegState::Kill) - .addReg(PPC::R1, RegState::Define) + .addReg(PPC::R1) .addReg(PPC::R0); } else if (isInt<16>(NegFrameSize)) { BuildMI(MBB, MBBI, dl, TII.get(PPC::STWU), PPC::R1) @@ -381,9 +383,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI), PPC::R0) .addReg(PPC::R0, RegState::Kill) .addImm(NegFrameSize & 0xFFFF); - BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX)) + BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX), PPC::R1) .addReg(PPC::R1, RegState::Kill) - .addReg(PPC::R1, RegState::Define) + .addReg(PPC::R1) .addReg(PPC::R0); } } else { // PPC64. @@ -399,9 +401,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBFIC8), PPC::X0) .addReg(PPC::X0) .addImm(NegFrameSize); - BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX)) + BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX), PPC::X1) .addReg(PPC::X1, RegState::Kill) - .addReg(PPC::X1, RegState::Define) + .addReg(PPC::X1) .addReg(PPC::X0); } else if (isInt<16>(NegFrameSize)) { BuildMI(MBB, MBBI, dl, TII.get(PPC::STDU), PPC::X1) @@ -414,9 +416,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI8), PPC::X0) .addReg(PPC::X0, RegState::Kill) .addImm(NegFrameSize & 0xFFFF); - BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX)) + BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX), PPC::X1) .addReg(PPC::X1, RegState::Kill) - .addReg(PPC::X1, RegState::Define) + .addReg(PPC::X1) .addReg(PPC::X0); } } @@ -492,7 +494,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { // This is a bit of a hack: CR2LT, CR2GT, CR2EQ and CR2UN are just // subregisters of CR2. We just need to emit a move of CR2. 
- if (PPC::CRBITRCRegisterClass->contains(Reg)) + if (PPC::CRBITRCRegClass.contains(Reg)) continue; MachineLocation CSDst(MachineLocation::VirtualFP, Offset); @@ -817,7 +819,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF) for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); - if (PPC::GPRCRegisterClass->contains(Reg)) { + if (PPC::GPRCRegClass.contains(Reg)) { HasGPSaveArea = true; GPRegs.push_back(CSI[i]); @@ -825,7 +827,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF) if (Reg < MinGPR) { MinGPR = Reg; } - } else if (PPC::G8RCRegisterClass->contains(Reg)) { + } else if (PPC::G8RCRegClass.contains(Reg)) { HasG8SaveArea = true; G8Regs.push_back(CSI[i]); @@ -833,7 +835,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF) if (Reg < MinG8R) { MinG8R = Reg; } - } else if (PPC::F8RCRegisterClass->contains(Reg)) { + } else if (PPC::F8RCRegClass.contains(Reg)) { HasFPSaveArea = true; FPRegs.push_back(CSI[i]); @@ -842,12 +844,12 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF) MinFPR = Reg; } // FIXME SVR4: Disable CR save area for now. - } else if (PPC::CRBITRCRegisterClass->contains(Reg) - || PPC::CRRCRegisterClass->contains(Reg)) { + } else if (PPC::CRBITRCRegClass.contains(Reg) || + PPC::CRRCRegClass.contains(Reg)) { // HasCRSaveArea = true; - } else if (PPC::VRSAVERCRegisterClass->contains(Reg)) { + } else if (PPC::VRSAVERCRegClass.contains(Reg)) { HasVRSAVESaveArea = true; - } else if (PPC::VRRCRegisterClass->contains(Reg)) { + } else if (PPC::VRRCRegClass.contains(Reg)) { HasVRSaveArea = true; VRegs.push_back(CSI[i]); @@ -932,8 +934,8 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF) for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); - if (PPC::CRBITRCRegisterClass->contains(Reg) || - PPC::CRRCRegisterClass->contains(Reg)) { + if (PPC::CRBITRCRegClass.contains(Reg) || + PPC::CRRCRegClass.contains(Reg)) { int FI = CSI[i].getFrameIdx(); FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); @@ -950,7 +952,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF) for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); - if (PPC::VRSAVERCRegisterClass->contains(Reg)) { + if (PPC::VRSAVERCRegClass.contains(Reg)) { int FI = CSI[i].getFrameIdx(); FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 5a04888..a00f686 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -111,6 +111,23 @@ namespace { /// immediate field. Because preinc imms have already been validated, just /// accept it. bool SelectAddrImmOffs(SDValue N, SDValue &Out) const { + if (isa<ConstantSDNode>(N) || N.getOpcode() == PPCISD::Lo || + N.getOpcode() == ISD::TargetGlobalAddress) { + Out = N; + return true; + } + + return false; + } + + /// SelectAddrIdxOffs - Return true if the operand is valid for a preinc + /// index field. Because preinc imms have already been validated, just + /// accept it. 
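+ /// This is the complement of SelectAddrImmOffs above: it accepts exactly
+ /// the operands (non-constant, non-Lo, non-global) that the immediate
+ /// form rejects, so the two forms never both match.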
+ bool SelectAddrIdxOffs(SDValue N, SDValue &Out) const { + if (isa<ConstantSDNode>(N) || N.getOpcode() == PPCISD::Lo || + N.getOpcode() == ISD::TargetGlobalAddress) + return false; + Out = N; return true; } @@ -238,11 +255,11 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { DebugLoc dl; if (PPCLowering.getPointerTy() == MVT::i32) { - GlobalBaseReg = RegInfo->createVirtualRegister(PPC::GPRCRegisterClass); + GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR)); BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg); } else { - GlobalBaseReg = RegInfo->createVirtualRegister(PPC::G8RCRegisterClass); + GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::G8RCRegClass); BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8)); BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR8), GlobalBaseReg); } @@ -697,7 +714,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg, InFlag).getValue(1); - if (PPCSubTarget.isGigaProcessor() && OtherCondIdx == -1) + if (PPCSubTarget.hasMFOCRF() && OtherCondIdx == -1) IntCR = SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg, CCReg), 0); else @@ -833,7 +850,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { case PPCISD::MFCR: { SDValue InFlag = N->getOperand(1); // Use MFOCRF if supported. - if (PPCSubTarget.isGigaProcessor()) + if (PPCSubTarget.hasMFOCRF()) return CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, N->getOperand(0), InFlag); else @@ -915,12 +932,44 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { SDValue Chain = LD->getChain(); SDValue Base = LD->getBasePtr(); SDValue Ops[] = { Offset, Base, Chain }; - // FIXME: PPC64 return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0), PPCLowering.getPointerTy(), MVT::Other, Ops, 3); } else { - llvm_unreachable("R+R preindex loads not supported yet!"); + unsigned Opcode; + bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD; + if (LD->getValueType(0) != MVT::i64) { + // Handle PPC32 integer and normal FP loads. + assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load"); + switch (LoadedVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid PPC load type!"); + case MVT::f64: Opcode = PPC::LFDUX; break; + case MVT::f32: Opcode = PPC::LFSUX; break; + case MVT::i32: Opcode = PPC::LWZUX; break; + case MVT::i16: Opcode = isSExt ? PPC::LHAUX : PPC::LHZUX; break; + case MVT::i1: + case MVT::i8: Opcode = PPC::LBZUX; break; + } + } else { + assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!"); + assert((!isSExt || LoadedVT == MVT::i16 || LoadedVT == MVT::i32) && + "Invalid sext update load"); + switch (LoadedVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid PPC load type!"); + case MVT::i64: Opcode = PPC::LDUX; break; + case MVT::i32: Opcode = isSExt ? PPC::LWAUX : PPC::LWZUX8; break; + case MVT::i16: Opcode = isSExt ? 
PPC::LHAUX8 : PPC::LHZUX8; break; + case MVT::i1: + case MVT::i8: Opcode = PPC::LBZUX8; break; + } + } + + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + SDValue Ops[] = { Offset, Base, Chain }; + return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0), + PPCLowering.getPointerTy(), + MVT::Other, Ops, 3); } } diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 3b24951..13250b3 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -51,9 +51,11 @@ static bool CC_PPC_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, ISD::ArgFlagsTy &ArgFlags, CCState &State); -static cl::opt<bool> EnablePPCPreinc("enable-ppc-preinc", -cl::desc("enable preincrement load/store generation on PPC (experimental)"), - cl::Hidden); +static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc", +cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden); + +static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref", +cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden); static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) { if (TM.getSubtargetImpl()->isDarwin()) @@ -64,6 +66,7 @@ static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) { PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) : TargetLowering(TM, CreateTLOF(TM)), PPCSubTarget(*TM.getSubtargetImpl()) { + const PPCSubtarget *Subtarget = &TM.getSubtarget<PPCSubtarget>(); setPow2DivIsCheap(); @@ -73,12 +76,13 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all // arguments are at least 4/8 bytes aligned. - setMinStackArgumentAlignment(TM.getSubtarget<PPCSubtarget>().isPPC64() ? 8:4); + bool isPPC64 = Subtarget->isPPC64(); + setMinStackArgumentAlignment(isPPC64 ? 8:4); // Set up the register classes. 
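+ // (i32/f32/f64 are registered unconditionally; MVT::i64 is added further
+ // down, only when use64BitRegs() is true.)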
- addRegisterClass(MVT::i32, PPC::GPRCRegisterClass); - addRegisterClass(MVT::f32, PPC::F4RCRegisterClass); - addRegisterClass(MVT::f64, PPC::F8RCRegisterClass); + addRegisterClass(MVT::i32, &PPC::GPRCRegClass); + addRegisterClass(MVT::f32, &PPC::F4RCRegClass); + addRegisterClass(MVT::f64, &PPC::F8RCRegClass); // PowerPC has an i16 but no i8 (or i1) SEXTLOAD setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); @@ -130,17 +134,17 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FREM , MVT::f64, Expand); setOperationAction(ISD::FPOW , MVT::f64, Expand); - setOperationAction(ISD::FMA , MVT::f64, Expand); + setOperationAction(ISD::FMA , MVT::f64, Legal); setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FREM , MVT::f32, Expand); setOperationAction(ISD::FPOW , MVT::f32, Expand); - setOperationAction(ISD::FMA , MVT::f32, Expand); + setOperationAction(ISD::FMA , MVT::f32, Legal); setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); // If we're enabling GP optimizations, use hardware square root - if (!TM.getSubtarget<PPCSubtarget>().hasFSQRT()) { + if (!Subtarget->hasFSQRT()) { setOperationAction(ISD::FSQRT, MVT::f64, Expand); setOperationAction(ISD::FSQRT, MVT::f32, Expand); } @@ -226,8 +230,8 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); - if (TM.getSubtarget<PPCSubtarget>().isSVR4ABI()) { - if (TM.getSubtarget<PPCSubtarget>().isPPC64()) { + if (Subtarget->isSVR4ABI()) { + if (isPPC64) { // VAARG always uses double-word chunks, so promote anything smaller. setOperationAction(ISD::VAARG, MVT::i1, Promote); AddPromotedToType (ISD::VAARG, MVT::i1, MVT::i64); @@ -271,7 +275,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setCondCodeAction(ISD::SETONE, MVT::f32, Expand); setCondCodeAction(ISD::SETONE, MVT::f64, Expand); - if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) { + if (Subtarget->has64BitSupport()) { // They also have instructions for converting between i64 and fp. setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand); @@ -290,9 +294,9 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); } - if (TM.getSubtarget<PPCSubtarget>().use64BitRegs()) { + if (Subtarget->use64BitRegs()) { // 64-bit PowerPC implementations can support i64 types directly - addRegisterClass(MVT::i64, PPC::G8RCRegisterClass); + addRegisterClass(MVT::i64, &PPC::G8RCRegClass); // BUILD_PAIR can't be handled natively, and should be expanded to shl/or setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); // 64-bit PowerPC wants to expand i128 shifts itself. @@ -306,7 +310,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); } - if (TM.getSubtarget<PPCSubtarget>().hasAltivec()) { + if (Subtarget->hasAltivec()) { // First set operation action for all vector types to expand. Then we // will selectively turn on ones that can be effectively codegen'd. 
for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; @@ -370,12 +374,13 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::SELECT, MVT::v4i32, Expand); setOperationAction(ISD::STORE , MVT::v4i32, Legal); - addRegisterClass(MVT::v4f32, PPC::VRRCRegisterClass); - addRegisterClass(MVT::v4i32, PPC::VRRCRegisterClass); - addRegisterClass(MVT::v8i16, PPC::VRRCRegisterClass); - addRegisterClass(MVT::v16i8, PPC::VRRCRegisterClass); + addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass); + addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass); + addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass); + addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass); setOperationAction(ISD::MUL, MVT::v4f32, Legal); + setOperationAction(ISD::FMA, MVT::v4f32, Legal); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v8i16, Custom); setOperationAction(ISD::MUL, MVT::v16i8, Custom); @@ -389,7 +394,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); } - if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) + if (Subtarget->has64BitSupport()) setOperationAction(ISD::PREFETCH, MVT::Other, Legal); setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand); @@ -398,7 +403,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setBooleanContents(ZeroOrOneBooleanContent); setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct? - if (TM.getSubtarget<PPCSubtarget>().isPPC64()) { + if (isPPC64) { setStackPointerRegisterToSaveRestore(PPC::X1); setExceptionPointerRegister(PPC::X3); setExceptionSelectorRegister(PPC::X4); @@ -415,7 +420,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setTargetDAGCombine(ISD::BSWAP); // Darwin long double math library functions have $LDBL128 appended. - if (TM.getSubtarget<PPCSubtarget>().isDarwin()) { + if (Subtarget->isDarwin()) { setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128"); setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128"); setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128"); @@ -432,6 +437,11 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) if (PPCSubTarget.isDarwin()) setPrefFunctionAlignment(4); + if (isPPC64 && Subtarget->isJITCodeModel()) + // Temporary workaround for the inability of PPC64 JIT to handle jump + // tables. + setSupportJumpTables(false); + setInsertFencesForAtomic(true); setSchedulingPreference(Sched::Hybrid); @@ -902,10 +912,11 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, return true; // [r+i] } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) { // Match LOAD (ADD (X, Lo(G))). - assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue() + assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue() && "Cannot handle constant offsets yet!"); Disp = N.getOperand(1).getOperand(0); // The global address. 
assert(Disp.getOpcode() == ISD::TargetGlobalAddress || + Disp.getOpcode() == ISD::TargetGlobalTLSAddress || Disp.getOpcode() == ISD::TargetConstantPool || Disp.getOpcode() == ISD::TargetJumpTable); Base = N.getOperand(0); @@ -1006,7 +1017,7 @@ bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp, if (N.getOpcode() == ISD::ADD) { short imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && (imm & 3) == 0) { - Disp = DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32); + Disp = DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32); if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); } else { @@ -1015,7 +1026,7 @@ bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp, return true; // [r+i] } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) { // Match LOAD (ADD (X, Lo(G))). - assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue() + assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue() && "Cannot handle constant offsets yet!"); Disp = N.getOperand(1).getOperand(0); // The global address. assert(Disp.getOpcode() == ISD::TargetGlobalAddress || @@ -1084,8 +1095,7 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { - // Disabled by default for now. - if (!EnablePPCPreinc) return false; + if (DisablePPCPreinc) return false; SDValue Ptr; EVT VT; @@ -1103,7 +1113,10 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, if (VT.isVector()) return false; - // TODO: Check reg+reg first. + if (SelectAddressRegReg(Ptr, Offset, Base, DAG)) { + AM = ISD::PRE_INC; + return true; + } // LDU/STU use reg+imm*4, others use reg+imm. if (VT != MVT::i64) { @@ -1222,6 +1235,30 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op, return LowerLabelRef(TgtBAHi, TgtBALo, isPIC, DAG); } +SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { + + GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + DebugLoc dl = GA->getDebugLoc(); + const GlobalValue *GV = GA->getGlobal(); + EVT PtrVT = getPointerTy(); + bool is64bit = PPCSubTarget.isPPC64(); + + TLSModel::Model model = getTargetMachine().getTLSModel(GV); + + SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, + PPCII::MO_TPREL16_HA); + SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, + PPCII::MO_TPREL16_LO); + + if (model != TLSModel::LocalExec) + llvm_unreachable("only local-exec TLS mode supported"); + SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2, + is64bit ? 
MVT::i64 : MVT::i32); + SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg); + return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi); +} + SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = Op.getValueType(); @@ -1440,13 +1477,16 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, Entry.Node = Nest; Args.push_back(Entry); // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg) - std::pair<SDValue, SDValue> CallResult = - LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()), - false, false, false, false, 0, CallingConv::C, + TargetLowering::CallLoweringInfo CLI(Chain, + Type::getVoidTy(*DAG.getContext()), + false, false, false, false, 0, + CallingConv::C, /*isTailCall=*/false, - /*doesNotRet=*/false, /*isReturnValueUsed=*/true, + /*doesNotRet=*/false, + /*isReturnValueUsed=*/true, DAG.getExternalSymbol("__trampoline_setup", PtrVT), Args, DAG, dl); + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); return CallResult.second; } @@ -1702,7 +1742,7 @@ PPCTargetLowering::LowerFormalArguments_SVR4( // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); // Reserve space for the linkage area on the stack. CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize); @@ -1721,19 +1761,19 @@ PPCTargetLowering::LowerFormalArguments_SVR4( default: llvm_unreachable("ValVT not supported by formal arguments Lowering"); case MVT::i32: - RC = PPC::GPRCRegisterClass; + RC = &PPC::GPRCRegClass; break; case MVT::f32: - RC = PPC::F4RCRegisterClass; + RC = &PPC::F4RCRegClass; break; case MVT::f64: - RC = PPC::F8RCRegisterClass; + RC = &PPC::F8RCRegClass; break; case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v4f32: - RC = PPC::VRRCRegisterClass; + RC = &PPC::VRRCRegClass; break; } @@ -1763,7 +1803,7 @@ PPCTargetLowering::LowerFormalArguments_SVR4( // caller's stack frame, right above the parameter list area. SmallVector<CCValAssign, 16> ByValArgLocs; CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ByValArgLocs, *DAG.getContext()); + getTargetMachine(), ByValArgLocs, *DAG.getContext()); // Reserve stack space for the allocations in CCInfo. CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); @@ -2743,7 +2783,7 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SmallVector<CCValAssign, 16> RVLocs; CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC); // Copy all of the result registers out of their specified physreg. 
@@ -2800,7 +2840,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl, if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_PPC); for (unsigned i = 0; i != RVLocs.size(); ++i) DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); @@ -2864,14 +2904,19 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl, } SDValue -PPCTargetLowering::LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, +PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool isVarArg = CLI.IsVarArg; + if (isTailCall) isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, Ins, DAG); @@ -2921,7 +2966,7 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee, // Assign locations to all of the outgoing arguments. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); // Reserve space for the linkage area on the stack. CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize); @@ -2961,7 +3006,7 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee, // Assign locations to all of the outgoing aggregate by value arguments. SmallVector<CCValAssign, 16> ByValArgLocs; CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ByValArgLocs, *DAG.getContext()); + getTargetMachine(), ByValArgLocs, *DAG.getContext()); // Reserve stack space for the allocations in CCInfo. 
CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); @@ -3485,7 +3530,7 @@ PPCTargetLowering::LowerReturn(SDValue Chain, SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_PPC); // If this is the first return lowered for this function, add the regs to the @@ -4559,7 +4604,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); - case ISD::GlobalTLSAddress: llvm_unreachable("TLS not implemented for PPC"); + case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); @@ -4899,11 +4944,37 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineFunction *F = BB->getParent(); - if (MI->getOpcode() == PPC::SELECT_CC_I4 || - MI->getOpcode() == PPC::SELECT_CC_I8 || - MI->getOpcode() == PPC::SELECT_CC_F4 || - MI->getOpcode() == PPC::SELECT_CC_F8 || - MI->getOpcode() == PPC::SELECT_CC_VRRC) { + if (PPCSubTarget.hasISEL() && (MI->getOpcode() == PPC::SELECT_CC_I4 || + MI->getOpcode() == PPC::SELECT_CC_I8)) { + unsigned OpCode = MI->getOpcode() == PPC::SELECT_CC_I8 ? + PPC::ISEL8 : PPC::ISEL; + unsigned SelectPred = MI->getOperand(4).getImm(); + DebugLoc dl = MI->getDebugLoc(); + + // The SelectPred is ((BI << 5) | BO) for a BCC + unsigned BO = SelectPred & 0xF; + assert((BO == 12 || BO == 4) && "invalid predicate BO field for isel"); + + unsigned TrueOpNo, FalseOpNo; + if (BO == 12) { + TrueOpNo = 2; + FalseOpNo = 3; + } else { + TrueOpNo = 3; + FalseOpNo = 2; + SelectPred = PPC::InvertPredicate((PPC::Predicate)SelectPred); + } + + BuildMI(*BB, MI, dl, TII->get(OpCode), MI->getOperand(0).getReg()) + .addReg(MI->getOperand(TrueOpNo).getReg()) + .addReg(MI->getOperand(FalseOpNo).getReg()) + .addImm(SelectPred).addReg(MI->getOperand(1).getReg()); + } else if (MI->getOpcode() == PPC::SELECT_CC_I4 || + MI->getOpcode() == PPC::SELECT_CC_I8 || + MI->getOpcode() == PPC::SELECT_CC_F4 || + MI->getOpcode() == PPC::SELECT_CC_F8 || + MI->getOpcode() == PPC::SELECT_CC_VRRC) { + // The incoming instruction knows the destination vreg to set, the // condition code register to branch on, the true/false values to @@ -5612,18 +5683,18 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, case 'b': // R1-R31 case 'r': // R0-R31 if (VT == MVT::i64 && PPCSubTarget.isPPC64()) - return std::make_pair(0U, PPC::G8RCRegisterClass); - return std::make_pair(0U, PPC::GPRCRegisterClass); + return std::make_pair(0U, &PPC::G8RCRegClass); + return std::make_pair(0U, &PPC::GPRCRegClass); case 'f': if (VT == MVT::f32) - return std::make_pair(0U, PPC::F4RCRegisterClass); - else if (VT == MVT::f64) - return std::make_pair(0U, PPC::F8RCRegisterClass); + return std::make_pair(0U, &PPC::F4RCRegClass); + if (VT == MVT::f64) + return std::make_pair(0U, &PPC::F8RCRegClass); break; case 'v': - return std::make_pair(0U, PPC::VRRCRegisterClass); + return std::make_pair(0U, &PPC::VRRCRegClass); case 'y': // crrc - return std::make_pair(0U, PPC::CRRCRegisterClass); + return std::make_pair(0U, &PPC::CRRCRegClass); } } @@ -5839,11 +5910,30 @@ 
EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, } } +/// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than +/// a pair of mul and add instructions. fmuladd intrinsics will be expanded to +/// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd +/// is expanded to mul + add. +bool PPCTargetLowering::isFMAFasterThanMulAndAdd(EVT VT) const { + if (!VT.isSimple()) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f32: + case MVT::f64: + case MVT::v4f32: + return true; + default: + break; + } + + return false; +} + Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const { - unsigned Directive = PPCSubTarget.getDarwinDirective(); - if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2) - return Sched::ILP; + if (DisableILPPref) + return TargetLowering::getSchedulingPreference(N); - return TargetLowering::getSchedulingPreference(N); + return Sched::ILP; } diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 18eb072..b0a013b 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -366,6 +366,12 @@ namespace llvm { bool IsZeroVal, bool MemcpyStrSrc, MachineFunction &MF) const; + /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than + /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to + /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd + /// is expanded to mul + add. + virtual bool isFMAFasterThanMulAndAdd(EVT VT) const; + private: SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const; SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const; @@ -389,6 +395,7 @@ namespace llvm { SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; @@ -439,12 +446,7 @@ namespace llvm { SmallVectorImpl<SDValue> &InVals) const; virtual SDValue - LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, - bool isVarArg, bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; virtual bool diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 7f67a41..91c5366 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -68,15 +68,15 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in { // Convenient aliases for call instructions let Uses = [RM] in { def BL8_Darwin : IForm<18, 0, 1, - (outs), (ins calltarget:$func, variable_ops), + (outs), (ins calltarget:$func), "bl $func", BrB, []>; // See Pat patterns below. 
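+// Note: variable_ops is being dropped from the call and tail-call
+// definitions throughout this file; the variadic operands of a call are
+// attached to the MachineInstr dynamically, so declaring them in the ins
+// dag appears to have been redundant.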
def BLA8_Darwin : IForm<18, 1, 1,
- (outs), (ins aaddr:$func, variable_ops),
+ (outs), (ins aaddr:$func),
"bla $func", BrB, [(PPCcall_Darwin (i64 imm:$func))]>;
}
let Uses = [CTR8, RM] in {
def BCTRL8_Darwin : XLForm_2_ext<19, 528, 20, 0, 1,
- (outs), (ins variable_ops),
+ (outs), (ins),
"bctrl", BrB, [(PPCbctrl_Darwin)]>, Requires<[In64BitMode]>;
}
@@ -88,27 +88,27 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in {
// Convenient aliases for call instructions
let Uses = [RM] in {
def BL8_ELF : IForm<18, 0, 1,
- (outs), (ins calltarget:$func, variable_ops),
+ (outs), (ins calltarget:$func),
"bl $func", BrB, []>; // See Pat patterns below.
let isCodeGenOnly = 1 in
def BL8_NOP_ELF : IForm_and_DForm_4_zero<18, 0, 1, 24,
- (outs), (ins calltarget:$func, variable_ops),
+ (outs), (ins calltarget:$func),
"bl $func\n\tnop", BrB, []>;
def BLA8_ELF : IForm<18, 1, 1,
- (outs), (ins aaddr:$func, variable_ops),
+ (outs), (ins aaddr:$func),
"bla $func", BrB, [(PPCcall_SVR4 (i64 imm:$func))]>;
let isCodeGenOnly = 1 in
def BLA8_NOP_ELF : IForm_and_DForm_4_zero<18, 1, 1, 24,
- (outs), (ins aaddr:$func, variable_ops),
+ (outs), (ins aaddr:$func),
"bla $func\n\tnop", BrB,
[(PPCcall_nop_SVR4 (i64 imm:$func))]>;
}
let Uses = [X11, CTR8, RM] in {
def BCTRL8_ELF : XLForm_2_ext<19, 528, 20, 0, 1,
- (outs), (ins variable_ops),
+ (outs), (ins),
"bctrl", BrB, [(PPCbctrl_SVR4)]>, Requires<[In64BitMode]>;
}
@@ -180,17 +180,17 @@ def STDCX : XForm_1<31, 214, (outs), (ins G8RC:$rS, memrr:$dst),
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
def TCRETURNdi8 :Pseudo< (outs),
- (ins calltarget:$dst, i32imm:$offset, variable_ops),
+ (ins calltarget:$dst, i32imm:$offset),
"#TC_RETURNd8 $dst $offset",
[]>;
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
-def TCRETURNai8 :Pseudo<(outs), (ins aaddr:$func, i32imm:$offset, variable_ops),
+def TCRETURNai8 :Pseudo<(outs), (ins aaddr:$func, i32imm:$offset),
"#TC_RETURNa8 $func $offset",
[(PPCtc_return (i64 imm:$func), imm:$offset)]>;
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
-def TCRETURNri8 : Pseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset, variable_ops),
+def TCRETURNri8 : Pseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset),
"#TC_RETURNr8 $dst $offset",
[]>;
@@ -229,6 +229,15 @@ def : Pat<(PPCtc_return (i64 texternalsym:$dst), imm:$imm),
def : Pat<(PPCtc_return CTRRC8:$dst, imm:$imm),
(TCRETURNri8 CTRRC8:$dst, imm:$imm)>;
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
+ let Defs = [CTR8], Uses = [CTR8] in {
+ def BDZ8 : IForm_ext<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
+ "bdz $dst", BrB, []>;
+ def BDNZ8 : IForm_ext<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
+ "bdnz $dst", BrB, []>;
+ }
+}
+
// 64-bit CR instructions
def MTCRF8 : XFXForm_5<31, 144, (outs crbitm:$FXM), (ins G8RC:$rS),
"mtcrf $FXM, $rS", BrMCRX>,
@@ -278,45 +287,37 @@ def MFLR8 : XFXForm_1_ext<31, 339, 8, (outs G8RC:$rT), (ins),
let PPC970_Unit = 1 in { // FXU Operations.
-// Copies, extends, truncates.
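+// The OR-based register-copy idioms below are removed; zext/anyext/trunc
+// between GPRC and G8RC are now expressed with INSERT_SUBREG/EXTRACT_SUBREG
+// patterns (see the Pat rules later in this file).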
-def OR4To8 : XForm_6<31, 444, (outs G8RC:$rA), (ins GPRC:$rS, GPRC:$rB), - "or $rA, $rS, $rB", IntGeneral, - []>; -def OR8To4 : XForm_6<31, 444, (outs GPRC:$rA), (ins G8RC:$rS, G8RC:$rB), - "or $rA, $rS, $rB", IntGeneral, - []>; - def LI8 : DForm_2_r0<14, (outs G8RC:$rD), (ins symbolLo64:$imm), - "li $rD, $imm", IntGeneral, + "li $rD, $imm", IntSimple, [(set G8RC:$rD, immSExt16:$imm)]>; def LIS8 : DForm_2_r0<15, (outs G8RC:$rD), (ins symbolHi64:$imm), - "lis $rD, $imm", IntGeneral, + "lis $rD, $imm", IntSimple, [(set G8RC:$rD, imm16ShiftedSExt:$imm)]>; // Logical ops. def NAND8: XForm_6<31, 476, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB), - "nand $rA, $rS, $rB", IntGeneral, + "nand $rA, $rS, $rB", IntSimple, [(set G8RC:$rA, (not (and G8RC:$rS, G8RC:$rB)))]>; def AND8 : XForm_6<31, 28, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB), - "and $rA, $rS, $rB", IntGeneral, + "and $rA, $rS, $rB", IntSimple, [(set G8RC:$rA, (and G8RC:$rS, G8RC:$rB))]>; def ANDC8: XForm_6<31, 60, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB), - "andc $rA, $rS, $rB", IntGeneral, + "andc $rA, $rS, $rB", IntSimple, [(set G8RC:$rA, (and G8RC:$rS, (not G8RC:$rB)))]>; def OR8 : XForm_6<31, 444, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB), - "or $rA, $rS, $rB", IntGeneral, + "or $rA, $rS, $rB", IntSimple, [(set G8RC:$rA, (or G8RC:$rS, G8RC:$rB))]>; def NOR8 : XForm_6<31, 124, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB), - "nor $rA, $rS, $rB", IntGeneral, + "nor $rA, $rS, $rB", IntSimple, [(set G8RC:$rA, (not (or G8RC:$rS, G8RC:$rB)))]>; def ORC8 : XForm_6<31, 412, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB), - "orc $rA, $rS, $rB", IntGeneral, + "orc $rA, $rS, $rB", IntSimple, [(set G8RC:$rA, (or G8RC:$rS, (not G8RC:$rB)))]>; def EQV8 : XForm_6<31, 284, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB), - "eqv $rA, $rS, $rB", IntGeneral, + "eqv $rA, $rS, $rB", IntSimple, [(set G8RC:$rA, (not (xor G8RC:$rS, G8RC:$rB)))]>; def XOR8 : XForm_6<31, 316, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB), - "xor $rA, $rS, $rB", IntGeneral, + "xor $rA, $rS, $rB", IntSimple, [(set G8RC:$rA, (xor G8RC:$rS, G8RC:$rB))]>; // Logical ops with immediate. 
@@ -329,20 +330,20 @@ def ANDISo8 : DForm_4<29, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2), [(set G8RC:$dst, (and G8RC:$src1,imm16ShiftedZExt:$src2))]>, isDOT; def ORI8 : DForm_4<24, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2), - "ori $dst, $src1, $src2", IntGeneral, + "ori $dst, $src1, $src2", IntSimple, [(set G8RC:$dst, (or G8RC:$src1, immZExt16:$src2))]>; def ORIS8 : DForm_4<25, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2), - "oris $dst, $src1, $src2", IntGeneral, + "oris $dst, $src1, $src2", IntSimple, [(set G8RC:$dst, (or G8RC:$src1, imm16ShiftedZExt:$src2))]>; def XORI8 : DForm_4<26, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2), - "xori $dst, $src1, $src2", IntGeneral, + "xori $dst, $src1, $src2", IntSimple, [(set G8RC:$dst, (xor G8RC:$src1, immZExt16:$src2))]>; def XORIS8 : DForm_4<27, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2), - "xoris $dst, $src1, $src2", IntGeneral, + "xoris $dst, $src1, $src2", IntSimple, [(set G8RC:$dst, (xor G8RC:$src1, imm16ShiftedZExt:$src2))]>; def ADD8 : XOForm_1<31, 266, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB), - "add $rT, $rA, $rB", IntGeneral, + "add $rT, $rA, $rB", IntSimple, [(set G8RC:$rT, (add G8RC:$rA, G8RC:$rB))]>; let Defs = [CARRY] in { @@ -355,10 +356,13 @@ def ADDIC8 : DForm_2<12, (outs G8RC:$rD), (ins G8RC:$rA, s16imm64:$imm), [(set G8RC:$rD, (addc G8RC:$rA, immSExt16:$imm))]>; } def ADDI8 : DForm_2<14, (outs G8RC:$rD), (ins G8RC:$rA, s16imm64:$imm), - "addi $rD, $rA, $imm", IntGeneral, + "addi $rD, $rA, $imm", IntSimple, + [(set G8RC:$rD, (add G8RC:$rA, immSExt16:$imm))]>; +def ADDI8L : DForm_2<14, (outs G8RC:$rD), (ins G8RC:$rA, symbolLo64:$imm), + "addi $rD, $rA, $imm", IntSimple, [(set G8RC:$rD, (add G8RC:$rA, immSExt16:$imm))]>; def ADDIS8 : DForm_2<15, (outs G8RC:$rD), (ins G8RC:$rA, symbolHi64:$imm), - "addis $rD, $rA, $imm", IntGeneral, + "addis $rD, $rA, $imm", IntSimple, [(set G8RC:$rD, (add G8RC:$rA, imm16ShiftedSExt:$imm))]>; let Defs = [CARRY] in { @@ -374,7 +378,7 @@ def SUBF8 : XOForm_1<31, 40, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB), "subf $rT, $rA, $rB", IntGeneral, [(set G8RC:$rT, (sub G8RC:$rB, G8RC:$rA))]>; def NEG8 : XOForm_3<31, 104, 0, (outs G8RC:$rT), (ins G8RC:$rA), - "neg $rT, $rA", IntGeneral, + "neg $rT, $rA", IntSimple, [(set G8RC:$rT, (ineg G8RC:$rA))]>; let Uses = [CARRY], Defs = [CARRY] in { def ADDE8 : XOForm_1<31, 138, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB), @@ -427,21 +431,21 @@ def SRAD : XForm_6<31, 794, (outs G8RC:$rA), (ins G8RC:$rS, GPRC:$rB), } def EXTSB8 : XForm_11<31, 954, (outs G8RC:$rA), (ins G8RC:$rS), - "extsb $rA, $rS", IntGeneral, + "extsb $rA, $rS", IntSimple, [(set G8RC:$rA, (sext_inreg G8RC:$rS, i8))]>; def EXTSH8 : XForm_11<31, 922, (outs G8RC:$rA), (ins G8RC:$rS), - "extsh $rA, $rS", IntGeneral, + "extsh $rA, $rS", IntSimple, [(set G8RC:$rA, (sext_inreg G8RC:$rS, i16))]>; def EXTSW : XForm_11<31, 986, (outs G8RC:$rA), (ins G8RC:$rS), - "extsw $rA, $rS", IntGeneral, + "extsw $rA, $rS", IntSimple, [(set G8RC:$rA, (sext_inreg G8RC:$rS, i32))]>, isPPC64; /// EXTSW_32 - Just like EXTSW, but works on '32-bit' registers. 
def EXTSW_32 : XForm_11<31, 986, (outs GPRC:$rA), (ins GPRC:$rS), - "extsw $rA, $rS", IntGeneral, + "extsw $rA, $rS", IntSimple, [(set GPRC:$rA, (PPCextsw_32 GPRC:$rS))]>, isPPC64; def EXTSW_32_64 : XForm_11<31, 986, (outs G8RC:$rA), (ins GPRC:$rS), - "extsw $rA, $rS", IntGeneral, + "extsw $rA, $rS", IntSimple, [(set G8RC:$rA, (sext GPRC:$rS))]>, isPPC64; let Defs = [CARRY] in { @@ -493,6 +497,10 @@ def RLWINM8 : MForm_2<21, "rlwinm $rA, $rS, $SH, $MB, $ME", IntGeneral, []>; +def ISEL8 : AForm_1<31, 15, + (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB, pred:$cond), + "isel $rT, $rA, $rB, $cond", IntGeneral, + []>; } // End FXU Operations. @@ -529,6 +537,16 @@ def LHAU8 : DForm_1a<43, (outs G8RC:$rD, ptr_rc:$ea_result), (ins symbolLo:$disp NoEncode<"$ea_result">; // NO LWAU! +def LHAUX8 : XForm_1<31, 375, (outs G8RC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "lhaux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">; +def LWAUX : XForm_1<31, 375, (outs G8RC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "lwaux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">, isPPC64; } // Zero extending loads. @@ -568,6 +586,22 @@ def LWZU8 : DForm_1<33, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr), "lwzu $rD, $addr", LdStLoad, []>, RegConstraint<"$addr.reg = $ea_result">, NoEncode<"$ea_result">; + +def LBZUX8 : XForm_1<31, 119, (outs G8RC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "lbzux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">; +def LHZUX8 : XForm_1<31, 331, (outs G8RC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "lhzux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">; +def LWZUX8 : XForm_1<31, 55, (outs G8RC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "lwzux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">; } } @@ -603,6 +637,11 @@ def LDU : DSForm_1<58, 1, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memrix:$addr []>, RegConstraint<"$addr.reg = $ea_result">, isPPC64, NoEncode<"$ea_result">; +def LDUX : XForm_1<31, 53, (outs G8RC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "ldux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">, isPPC64; } def : Pat<(PPCload ixaddr:$src), @@ -660,6 +699,14 @@ def STHU8 : DForm_1a<45, (outs ptr_rc:$ea_res), (ins G8RC:$rS, iaddroff:$ptroff))]>, RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">; +def STWU8 : DForm_1a<37, (outs ptr_rc:$ea_res), (ins G8RC:$rS, + symbolLo:$ptroff, ptr_rc:$ptrreg), + "stwu $rS, $ptroff($ptrreg)", LdStStore, + [(set ptr_rc:$ea_res, + (pre_truncsti32 G8RC:$rS, ptr_rc:$ptrreg, + iaddroff:$ptroff))]>, + RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">; + def STDU : DSForm_1a<62, 1, (outs ptr_rc:$ea_res), (ins G8RC:$rS, s16immX4:$ptroff, ptr_rc:$ptrreg), "stdu $rS, $ptroff($ptrreg)", LdStSTD, @@ -668,10 +715,41 @@ def STDU : DSForm_1a<62, 1, (outs ptr_rc:$ea_res), (ins G8RC:$rS, RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">, isPPC64; -let mayStore = 1 in -def STDUX : XForm_8<31, 181, (outs), (ins G8RC:$rS, memrr:$dst), - "stdux $rS, $dst", LdStSTD, - []>, isPPC64; + +def STBUX8 : XForm_8<31, 247, (outs ptr_rc:$ea_res), + (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), + "stbux $rS, $ptroff, $ptrreg", LdStStore, + [(set ptr_rc:$ea_res, + (pre_truncsti8 G8RC:$rS, + ptr_rc:$ptrreg, xaddroff:$ptroff))]>, + 
RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; + +def STHUX8 : XForm_8<31, 439, (outs ptr_rc:$ea_res), + (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), + "sthux $rS, $ptroff, $ptrreg", LdStStore, + [(set ptr_rc:$ea_res, + (pre_truncsti16 G8RC:$rS, + ptr_rc:$ptrreg, xaddroff:$ptroff))]>, + RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; + +def STWUX8 : XForm_8<31, 183, (outs ptr_rc:$ea_res), + (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), + "stwux $rS, $ptroff, $ptrreg", LdStStore, + [(set ptr_rc:$ea_res, + (pre_truncsti32 G8RC:$rS, + ptr_rc:$ptrreg, xaddroff:$ptroff))]>, + RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; + +def STDUX : XForm_8<31, 181, (outs ptr_rc:$ea_res), + (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), + "stdux $rS, $ptroff, $ptrreg", LdStStore, + [(set ptr_rc:$ea_res, + (pre_store G8RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>, + RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked, isPPC64; // STD_32/STDX_32 - Just like STD/STDX, but uses a '32-bit' input register. def STD_32 : DSForm_1<62, 0, (outs), (ins GPRC:$rT, memrix:$dst), @@ -706,11 +784,12 @@ def FCTIDZ : XForm_26<63, 815, (outs F8RC:$frD), (ins F8RC:$frB), // Extensions and truncates to/from 32-bit regs. def : Pat<(i64 (zext GPRC:$in)), - (RLDICL (OR4To8 GPRC:$in, GPRC:$in), 0, 32)>; + (RLDICL (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPRC:$in, sub_32), + 0, 32)>; def : Pat<(i64 (anyext GPRC:$in)), - (OR4To8 GPRC:$in, GPRC:$in)>; + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPRC:$in, sub_32)>; def : Pat<(i32 (trunc G8RC:$in)), - (OR8To4 G8RC:$in, G8RC:$in)>; + (EXTRACT_SUBREG G8RC:$in, sub_32)>; // Extending loads with i64 targets. def : Pat<(zextloadi1 iaddr:$src), @@ -765,6 +844,10 @@ def : Pat<(PPChi tjumptable:$in , 0), (LIS8 tjumptable:$in)>; def : Pat<(PPClo tjumptable:$in , 0), (LI8 tjumptable:$in)>; def : Pat<(PPChi tblockaddress:$in, 0), (LIS8 tblockaddress:$in)>; def : Pat<(PPClo tblockaddress:$in, 0), (LI8 tblockaddress:$in)>; +def : Pat<(PPChi tglobaltlsaddr:$g, G8RC:$in), + (ADDIS8 G8RC:$in, tglobaltlsaddr:$g)>; +def : Pat<(PPClo tglobaltlsaddr:$g, G8RC:$in), + (ADDI8L G8RC:$in, tglobaltlsaddr:$g)>; def : Pat<(add G8RC:$in, (PPChi tglobaladdr:$g, 0)), (ADDIS8 G8RC:$in, tglobaladdr:$g)>; def : Pat<(add G8RC:$in, (PPChi tconstpool:$g, 0)), diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index 6c0f3d3..b0b8423 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -274,15 +274,11 @@ let PPC970_Unit = 5 in { // VALU Operations. // VA-Form instructions. 3-input AltiVec ops. 
def VMADDFP : VAForm_1<46, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vC, VRRC:$vB), "vmaddfp $vD, $vA, $vC, $vB", VecFP, - [(set VRRC:$vD, (fadd (fmul VRRC:$vA, VRRC:$vC), - VRRC:$vB))]>, - Requires<[FPContractions]>; + [(set VRRC:$vD, (fma VRRC:$vA, VRRC:$vC, VRRC:$vB))]>; def VNMSUBFP: VAForm_1<47, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vC, VRRC:$vB), "vnmsubfp $vD, $vA, $vC, $vB", VecFP, - [(set VRRC:$vD, (fsub V_immneg0, - (fsub (fmul VRRC:$vA, VRRC:$vC), - VRRC:$vB)))]>, - Requires<[FPContractions]>; + [(set VRRC:$vD, (fneg (fma VRRC:$vA, VRRC:$vC, + (fneg VRRC:$vB))))]>; def VMHADDSHS : VA1a_Int<32, "vmhaddshs", int_ppc_altivec_vmhaddshs>; def VMHRADDSHS : VA1a_Int<33, "vmhraddshs", int_ppc_altivec_vmhraddshs>; diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td index d8e4b2b..a41a027 100644 --- a/lib/Target/PowerPC/PPCInstrFormats.td +++ b/lib/Target/PowerPC/PPCInstrFormats.td @@ -94,6 +94,12 @@ class IForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr, let Inst{31} = lk; } +class IForm_ext<bits<6> opcode, bits<5> bo, bit aa, bit lk, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, list<dag> pattern> + : IForm<opcode, aa, lk, OOL, IOL, asmstr, itin, pattern> { + let LI{0-4} = bo; +} + // 1.7.2 B-Form class BForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr> : I<opcode, OOL, IOL, asmstr, BrB> { diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index b45ada9..47f09dc 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -40,6 +40,10 @@ extern cl::opt<bool> DisablePPC64RS; using namespace llvm; +static cl:: +opt<bool> DisableCTRLoopAnal("disable-ppc-ctrloop-analysis", cl::Hidden, + cl::desc("Disable analysis for CTR loops")); + PPCInstrInfo::PPCInstrInfo(PPCTargetMachine &tm) : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP), TM(tm), RI(*TM.getSubtargetImpl(), *this) {} @@ -75,6 +79,22 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer( return new PPCScoreboardHazardRecognizer(II, DAG); } + +// Detect 32 -> 64-bit extensions where we may reuse the low sub-register. +bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const { + switch (MI.getOpcode()) { + default: return false; + case PPC::EXTSW: + case PPC::EXTSW_32_64: + SrcReg = MI.getOperand(1).getReg(); + DstReg = MI.getOperand(0).getReg(); + SubIdx = PPC::sub_32; + return true; + } +} + unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const { switch (MI->getOpcode()) { @@ -186,10 +206,14 @@ void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB, // Branch analysis. +// Note: If the condition register is set to CTR or CTR8 then this is a +// BDNZ (imm == 1) or BDZ (imm == 0) branch. bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const { + bool isPPC64 = TM.getSubtargetImpl()->isPPC64(); + // If the block has no terminators, it just falls into the block after it. 
MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) @@ -221,7 +245,30 @@ bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, Cond.push_back(LastInst->getOperand(0)); Cond.push_back(LastInst->getOperand(1)); return false; + } else if (LastInst->getOpcode() == PPC::BDNZ8 || + LastInst->getOpcode() == PPC::BDNZ) { + if (!LastInst->getOperand(0).isMBB()) + return true; + if (DisableCTRLoopAnal) + return true; + TBB = LastInst->getOperand(0).getMBB(); + Cond.push_back(MachineOperand::CreateImm(1)); + Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR, + true)); + return false; + } else if (LastInst->getOpcode() == PPC::BDZ8 || + LastInst->getOpcode() == PPC::BDZ) { + if (!LastInst->getOperand(0).isMBB()) + return true; + if (DisableCTRLoopAnal) + return true; + TBB = LastInst->getOperand(0).getMBB(); + Cond.push_back(MachineOperand::CreateImm(0)); + Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR, + true)); + return false; } + // Otherwise, don't know what this is. return true; } @@ -245,6 +292,34 @@ bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, Cond.push_back(SecondLastInst->getOperand(1)); FBB = LastInst->getOperand(0).getMBB(); return false; + } else if ((SecondLastInst->getOpcode() == PPC::BDNZ8 || + SecondLastInst->getOpcode() == PPC::BDNZ) && + LastInst->getOpcode() == PPC::B) { + if (!SecondLastInst->getOperand(0).isMBB() || + !LastInst->getOperand(0).isMBB()) + return true; + if (DisableCTRLoopAnal) + return true; + TBB = SecondLastInst->getOperand(0).getMBB(); + Cond.push_back(MachineOperand::CreateImm(1)); + Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR, + true)); + FBB = LastInst->getOperand(0).getMBB(); + return false; + } else if ((SecondLastInst->getOpcode() == PPC::BDZ8 || + SecondLastInst->getOpcode() == PPC::BDZ) && + LastInst->getOpcode() == PPC::B) { + if (!SecondLastInst->getOperand(0).isMBB() || + !LastInst->getOperand(0).isMBB()) + return true; + if (DisableCTRLoopAnal) + return true; + TBB = SecondLastInst->getOperand(0).getMBB(); + Cond.push_back(MachineOperand::CreateImm(0)); + Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR, + true)); + FBB = LastInst->getOperand(0).getMBB(); + return false; } // If the block ends with two PPC:Bs, handle it. The second one is not @@ -273,7 +348,9 @@ unsigned PPCInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { return 0; --I; } - if (I->getOpcode() != PPC::B && I->getOpcode() != PPC::BCC) + if (I->getOpcode() != PPC::B && I->getOpcode() != PPC::BCC && + I->getOpcode() != PPC::BDNZ8 && I->getOpcode() != PPC::BDNZ && + I->getOpcode() != PPC::BDZ8 && I->getOpcode() != PPC::BDZ) return 0; // Remove the branch. @@ -283,7 +360,9 @@ unsigned PPCInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { if (I == MBB.begin()) return 1; --I; - if (I->getOpcode() != PPC::BCC) + if (I->getOpcode() != PPC::BCC && + I->getOpcode() != PPC::BDNZ8 && I->getOpcode() != PPC::BDNZ && + I->getOpcode() != PPC::BDZ8 && I->getOpcode() != PPC::BDZ) return 1; // Remove the branch. @@ -301,10 +380,16 @@ PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, assert((Cond.size() == 2 || Cond.size() == 0) && "PPC branch conditions have two components!"); + bool isPPC64 = TM.getSubtargetImpl()->isPPC64(); + // One-way branch. 
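+ // (When non-empty, Cond is either [predicate, CR-register] for a BCC, or
+ // [1|0, CTR/CTR8] as produced by AnalyzeBranch for bdnz/bdz branches.)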
if (FBB == 0) { if (Cond.empty()) // Unconditional branch BuildMI(&MBB, DL, get(PPC::B)).addMBB(TBB); + else if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8) + BuildMI(&MBB, DL, get(Cond[0].getImm() ? + (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) : + (isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(TBB); else // Conditional branch BuildMI(&MBB, DL, get(PPC::BCC)) .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB); @@ -312,8 +397,13 @@ PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, } // Two-way Conditional Branch. - BuildMI(&MBB, DL, get(PPC::BCC)) - .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB); + if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8) + BuildMI(&MBB, DL, get(Cond[0].getImm() ? + (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) : + (isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(TBB); + else + BuildMI(&MBB, DL, get(PPC::BCC)) + .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB); BuildMI(&MBB, DL, get(PPC::B)).addMBB(FBB); return 2; } @@ -354,7 +444,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF, const TargetRegisterClass *RC, SmallVectorImpl<MachineInstr*> &NewMIs) const{ DebugLoc DL; - if (PPC::GPRCRegisterClass->hasSubClassEq(RC)) { + if (PPC::GPRCRegClass.hasSubClassEq(RC)) { if (SrcReg != PPC::LR) { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STW)) .addReg(SrcReg, @@ -370,7 +460,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF, getKillRegState(isKill)), FrameIdx)); } - } else if (PPC::G8RCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::G8RCRegClass.hasSubClassEq(RC)) { if (SrcReg != PPC::LR8) { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STD)) .addReg(SrcReg, @@ -386,17 +476,17 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF, getKillRegState(isKill)), FrameIdx)); } - } else if (PPC::F8RCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFD)) .addReg(SrcReg, getKillRegState(isKill)), FrameIdx)); - } else if (PPC::F4RCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFS)) .addReg(SrcReg, getKillRegState(isKill)), FrameIdx)); - } else if (PPC::CRRCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) { if ((!DisablePPC32RS && !TM.getSubtargetImpl()->isPPC64()) || (!DisablePPC64RS && TM.getSubtargetImpl()->isPPC64())) { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_CR)) @@ -438,7 +528,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF, getKillRegState(isKill)), FrameIdx)); } - } else if (PPC::CRBITRCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) { // FIXME: We use CRi here because there is no mtcrf on a bit. Since the // backend currently only uses CR1EQ as an individual bit, this should // not cause any bug. If we need other uses of CR bits, the following @@ -470,9 +560,9 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF, Reg = PPC::CR7; return StoreRegToStackSlot(MF, Reg, isKill, FrameIdx, - PPC::CRRCRegisterClass, NewMIs); + &PPC::CRRCRegClass, NewMIs); - } else if (PPC::VRRCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) { // We don't have indexed addressing for vector loads. 
Emit: // R0 = ADDI FI# // STVX VAL, 0, R0 @@ -522,7 +612,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, unsigned DestReg, int FrameIdx, const TargetRegisterClass *RC, SmallVectorImpl<MachineInstr*> &NewMIs)const{ - if (PPC::GPRCRegisterClass->hasSubClassEq(RC)) { + if (PPC::GPRCRegClass.hasSubClassEq(RC)) { if (DestReg != PPC::LR) { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ), DestReg), FrameIdx)); @@ -531,7 +621,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, PPC::R11), FrameIdx)); NewMIs.push_back(BuildMI(MF, DL, get(PPC::MTLR)).addReg(PPC::R11)); } - } else if (PPC::G8RCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::G8RCRegClass.hasSubClassEq(RC)) { if (DestReg != PPC::LR8) { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LD), DestReg), FrameIdx)); @@ -540,13 +630,13 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, PPC::X11), FrameIdx)); NewMIs.push_back(BuildMI(MF, DL, get(PPC::MTLR8)).addReg(PPC::X11)); } - } else if (PPC::F8RCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFD), DestReg), FrameIdx)); - } else if (PPC::F4RCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFS), DestReg), FrameIdx)); - } else if (PPC::CRRCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) { if ((!DisablePPC32RS && !TM.getSubtargetImpl()->isPPC64()) || (!DisablePPC64RS && TM.getSubtargetImpl()->isPPC64())) { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, @@ -578,7 +668,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, PPC::MTCRF8 : PPC::MTCRF), DestReg) .addReg(ScratchReg)); } - } else if (PPC::CRBITRCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) { unsigned Reg = 0; if (DestReg == PPC::CR0LT || DestReg == PPC::CR0GT || @@ -607,9 +697,9 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, Reg = PPC::CR7; return LoadRegFromStackSlot(MF, DL, Reg, FrameIdx, - PPC::CRRCRegisterClass, NewMIs); + &PPC::CRRCRegClass, NewMIs); - } else if (PPC::VRRCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) { // We don't have indexed addressing for vector loads. Emit: // R0 = ADDI FI# // Dest = LVX 0, R0 @@ -665,8 +755,11 @@ PPCInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, bool PPCInstrInfo:: ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { assert(Cond.size() == 2 && "Invalid PPC branch opcode!"); - // Leave the CR# the same, but invert the condition. - Cond[0].setImm(PPC::InvertPredicate((PPC::Predicate)Cond[0].getImm())); + if (Cond[1].getReg() == PPC::CTR8 || Cond[1].getReg() == PPC::CTR) + Cond[0].setImm(Cond[0].getImm() == 0 ? 1 : 0); + else + // Leave the CR# the same, but invert the condition. 
+ Cond[0].setImm(PPC::InvertPredicate((PPC::Predicate)Cond[0].getImm())); return false; } diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index 7d49aa1..374213e 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -92,6 +92,9 @@ public: CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const; + bool isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const; unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; unsigned isStoreToStackSlot(const MachineInstr *MI, diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 748486c..f57f0c9 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -323,7 +323,7 @@ def memri : Operand<iPTR> { } def memrr : Operand<iPTR> { let PrintMethod = "printMemRegReg"; - let MIOperandInfo = (ops ptr_rc, ptr_rc); + let MIOperandInfo = (ops ptr_rc:$offreg, ptr_rc:$ptrreg); } def memrix : Operand<iPTR> { // memri where the imm is shifted 2 bits. let PrintMethod = "printMemRegImmShifted"; @@ -349,10 +349,10 @@ def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmShift", [], []>; // "std" /// This is just the offset part of iaddr, used for preinc. def iaddroff : ComplexPattern<iPTR, 1, "SelectAddrImmOffs", [], []>; +def xaddroff : ComplexPattern<iPTR, 1, "SelectAddrIdxOffs", [], []>; //===----------------------------------------------------------------------===// // PowerPC Instruction Predicate Definitions. -def FPContractions : Predicate<"!TM.Options.NoExcessFPPrecision">; def In32BitMode : Predicate<"!PPCSubTarget.isPPC64()">; def In64BitMode : Predicate<"PPCSubTarget.isPPC64()">; def IsBookE : Predicate<"PPCSubTarget.isBookE()">; @@ -438,6 +438,13 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in { def BCC : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst), "b${cond:cc} ${cond:reg}, $dst" /*[(PPCcondbranch CRRC:$crS, imm:$opc, bb:$dst)]*/>; + + let Defs = [CTR], Uses = [CTR] in { + def BDZ : IForm_ext<16, 18, 0, 0, (outs), (ins condbrtarget:$dst), + "bdz $dst", BrB, []>; + def BDNZ : IForm_ext<16, 16, 0, 0, (outs), (ins condbrtarget:$dst), + "bdnz $dst", BrB, []>; + } } // Darwin ABI Calls. @@ -445,15 +452,15 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR] in { // Convenient aliases for call instructions let Uses = [RM] in { def BL_Darwin : IForm<18, 0, 1, - (outs), (ins calltarget:$func, variable_ops), + (outs), (ins calltarget:$func), "bl $func", BrB, []>; // See Pat patterns below. def BLA_Darwin : IForm<18, 1, 1, - (outs), (ins aaddr:$func, variable_ops), + (outs), (ins aaddr:$func), "bla $func", BrB, [(PPCcall_Darwin (i32 imm:$func))]>; } let Uses = [CTR, RM] in { def BCTRL_Darwin : XLForm_2_ext<19, 528, 20, 0, 1, - (outs), (ins variable_ops), + (outs), (ins), "bctrl", BrB, [(PPCbctrl_Darwin)]>, Requires<[In32BitMode]>; } @@ -464,16 +471,16 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR] in { // Convenient aliases for call instructions let Uses = [RM] in { def BL_SVR4 : IForm<18, 0, 1, - (outs), (ins calltarget:$func, variable_ops), + (outs), (ins calltarget:$func), "bl $func", BrB, []>; // See Pat patterns below. 
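For context on the BDZ/BDNZ branches defined a few hunks above (under Defs = [CTR], Uses = [CTR]): they decrement CTR and branch on the result, so a counted loop needs no separate compare instruction. A standalone C++ sketch of the control flow a single bdnz implements; nothing here is PPC-specific, it only illustrates the semantics:

#include <cstdio>

int main() {
  unsigned ctr = 4;        // mtctr: the trip count is loaded into CTR
  do {
    std::printf("iteration\n");
  } while (--ctr != 0);    // bdnz: decrement CTR, branch while non-zero;
                           // bdz is the inverted sense, branching on zero
  return 0;
}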
def BLA_SVR4 : IForm<18, 1, 1, - (outs), (ins aaddr:$func, variable_ops), + (outs), (ins aaddr:$func), "bla $func", BrB, [(PPCcall_SVR4 (i32 imm:$func))]>; } let Uses = [CTR, RM] in { def BCTRL_SVR4 : XLForm_2_ext<19, 528, 20, 0, 1, - (outs), (ins variable_ops), + (outs), (ins), "bctrl", BrB, [(PPCbctrl_SVR4)]>, Requires<[In32BitMode]>; } @@ -482,18 +489,18 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR] in { let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in def TCRETURNdi :Pseudo< (outs), - (ins calltarget:$dst, i32imm:$offset, variable_ops), + (ins calltarget:$dst, i32imm:$offset), "#TC_RETURNd $dst $offset", []>; let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in -def TCRETURNai :Pseudo<(outs), (ins aaddr:$func, i32imm:$offset, variable_ops), +def TCRETURNai :Pseudo<(outs), (ins aaddr:$func, i32imm:$offset), "#TC_RETURNa $func $offset", [(PPCtc_return (i32 imm:$func), imm:$offset)]>; let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in -def TCRETURNri : Pseudo<(outs), (ins CTRRC:$dst, i32imm:$offset, variable_ops), +def TCRETURNri : Pseudo<(outs), (ins CTRRC:$dst, i32imm:$offset), "#TC_RETURNr $dst $offset", []>; @@ -704,6 +711,44 @@ def LFDU : DForm_1<51, (outs F8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr), "lfd $rD, $addr", LdStLFD, []>, RegConstraint<"$addr.reg = $ea_result">, NoEncode<"$ea_result">; + + +// Indexed (r+r) Loads with Update (preinc). +def LBZUX : XForm_1<31, 119, (outs GPRC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "lbzux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">; + +def LHAUX : XForm_1<31, 375, (outs GPRC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "lhaux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">; + +def LHZUX : XForm_1<31, 331, (outs GPRC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "lhzux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">; + +def LWZUX : XForm_1<31, 55, (outs GPRC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "lwzux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">; + +def LFSUX : XForm_1<31, 567, (outs F4RC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "lfsux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">; + +def LFDUX : XForm_1<31, 631, (outs F8RC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "lfdux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">; } } @@ -815,12 +860,49 @@ def STWX : XForm_8<31, 151, (outs), (ins GPRC:$rS, memrr:$dst), "stwx $rS, $dst", LdStStore, [(store GPRC:$rS, xaddr:$dst)]>, PPC970_DGroup_Cracked; - -let mayStore = 1 in { -def STWUX : XForm_8<31, 183, (outs), (ins GPRC:$rS, GPRC:$rA, GPRC:$rB), - "stwux $rS, $rA, $rB", LdStStore, - []>; -} + +def STBUX : XForm_8<31, 247, (outs ptr_rc:$ea_res), + (ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), + "stbux $rS, $ptroff, $ptrreg", LdStStore, + [(set ptr_rc:$ea_res, + (pre_truncsti8 GPRC:$rS, + ptr_rc:$ptrreg, xaddroff:$ptroff))]>, + RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; + +def STHUX : XForm_8<31, 439, (outs ptr_rc:$ea_res), + (ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), + "sthux $rS, $ptroff, $ptrreg", LdStStore, + [(set ptr_rc:$ea_res, + (pre_truncsti16 GPRC:$rS, + ptr_rc:$ptrreg, xaddroff:$ptroff))]>, + RegConstraint<"$ptroff 
= $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; + +def STWUX : XForm_8<31, 183, (outs ptr_rc:$ea_res), + (ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), + "stwux $rS, $ptroff, $ptrreg", LdStStore, + [(set ptr_rc:$ea_res, + (pre_store GPRC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>, + RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; + +def STFSUX : XForm_8<31, 695, (outs ptr_rc:$ea_res), + (ins F4RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), + "stfsux $rS, $ptroff, $ptrreg", LdStStore, + [(set ptr_rc:$ea_res, + (pre_store F4RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>, + RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; + +def STFDUX : XForm_8<31, 759, (outs ptr_rc:$ea_res), + (ins F8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), + "stfdux $rS, $ptroff, $ptrreg", LdStStore, + [(set ptr_rc:$ea_res, + (pre_store F8RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>, + RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; + def STHBRX: XForm_8<31, 918, (outs), (ins GPRC:$rS, memrr:$dst), "sthbrx $rS, $dst", LdStStore, [(PPCstbrx GPRC:$rS, xoaddr:$dst, i16)]>, @@ -852,7 +934,10 @@ def SYNC : XForm_24_sync<31, 598, (outs), (ins), let PPC970_Unit = 1 in { // FXU Operations. def ADDI : DForm_2<14, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm), - "addi $rD, $rA, $imm", IntGeneral, + "addi $rD, $rA, $imm", IntSimple, + [(set GPRC:$rD, (add GPRC:$rA, immSExt16:$imm))]>; +def ADDIL : DForm_2<14, (outs GPRC:$rD), (ins GPRC:$rA, symbolLo:$imm), + "addi $rD, $rA, $imm", IntSimple, [(set GPRC:$rD, (add GPRC:$rA, immSExt16:$imm))]>; let Defs = [CARRY] in { def ADDIC : DForm_2<12, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm), @@ -864,7 +949,7 @@ def ADDICo : DForm_2<13, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm), []>; } def ADDIS : DForm_2<15, (outs GPRC:$rD), (ins GPRC:$rA, symbolHi:$imm), - "addis $rD, $rA, $imm", IntGeneral, + "addis $rD, $rA, $imm", IntSimple, [(set GPRC:$rD, (add GPRC:$rA, imm16ShiftedSExt:$imm))]>; def LA : DForm_2<14, (outs GPRC:$rD), (ins GPRC:$rA, symbolLo:$sym), "la $rD, $sym($rA)", IntGeneral, @@ -881,10 +966,10 @@ def SUBFIC : DForm_2< 8, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm), let isReMaterializable = 1 in { def LI : DForm_2_r0<14, (outs GPRC:$rD), (ins symbolLo:$imm), - "li $rD, $imm", IntGeneral, + "li $rD, $imm", IntSimple, [(set GPRC:$rD, immSExt16:$imm)]>; def LIS : DForm_2_r0<15, (outs GPRC:$rD), (ins symbolHi:$imm), - "lis $rD, $imm", IntGeneral, + "lis $rD, $imm", IntSimple, [(set GPRC:$rD, imm16ShiftedSExt:$imm)]>; } } @@ -899,18 +984,18 @@ def ANDISo : DForm_4<29, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2), [(set GPRC:$dst, (and GPRC:$src1,imm16ShiftedZExt:$src2))]>, isDOT; def ORI : DForm_4<24, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2), - "ori $dst, $src1, $src2", IntGeneral, + "ori $dst, $src1, $src2", IntSimple, [(set GPRC:$dst, (or GPRC:$src1, immZExt16:$src2))]>; def ORIS : DForm_4<25, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2), - "oris $dst, $src1, $src2", IntGeneral, + "oris $dst, $src1, $src2", IntSimple, [(set GPRC:$dst, (or GPRC:$src1, imm16ShiftedZExt:$src2))]>; def XORI : DForm_4<26, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2), - "xori $dst, $src1, $src2", IntGeneral, + "xori $dst, $src1, $src2", IntSimple, [(set GPRC:$dst, (xor GPRC:$src1, immZExt16:$src2))]>; def XORIS : DForm_4<27, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2), - "xoris $dst, $src1, $src2", IntGeneral, + "xoris $dst, $src1, $src2", IntSimple, [(set GPRC:$dst, (xor 
GPRC:$src1,imm16ShiftedZExt:$src2))]>; -def NOP : DForm_4_zero<24, (outs), (ins), "nop", IntGeneral, +def NOP : DForm_4_zero<24, (outs), (ins), "nop", IntSimple, []>; def CMPWI : DForm_5_ext<11, (outs CRRC:$crD), (ins GPRC:$rA, s16imm:$imm), "cmpwi $crD, $rA, $imm", IntCompare>; @@ -921,28 +1006,28 @@ def CMPLWI : DForm_6_ext<10, (outs CRRC:$dst), (ins GPRC:$src1, u16imm:$src2), let PPC970_Unit = 1 in { // FXU Operations. def NAND : XForm_6<31, 476, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB), - "nand $rA, $rS, $rB", IntGeneral, + "nand $rA, $rS, $rB", IntSimple, [(set GPRC:$rA, (not (and GPRC:$rS, GPRC:$rB)))]>; def AND : XForm_6<31, 28, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB), - "and $rA, $rS, $rB", IntGeneral, + "and $rA, $rS, $rB", IntSimple, [(set GPRC:$rA, (and GPRC:$rS, GPRC:$rB))]>; def ANDC : XForm_6<31, 60, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB), - "andc $rA, $rS, $rB", IntGeneral, + "andc $rA, $rS, $rB", IntSimple, [(set GPRC:$rA, (and GPRC:$rS, (not GPRC:$rB)))]>; def OR : XForm_6<31, 444, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB), - "or $rA, $rS, $rB", IntGeneral, + "or $rA, $rS, $rB", IntSimple, [(set GPRC:$rA, (or GPRC:$rS, GPRC:$rB))]>; def NOR : XForm_6<31, 124, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB), - "nor $rA, $rS, $rB", IntGeneral, + "nor $rA, $rS, $rB", IntSimple, [(set GPRC:$rA, (not (or GPRC:$rS, GPRC:$rB)))]>; def ORC : XForm_6<31, 412, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB), - "orc $rA, $rS, $rB", IntGeneral, + "orc $rA, $rS, $rB", IntSimple, [(set GPRC:$rA, (or GPRC:$rS, (not GPRC:$rB)))]>; def EQV : XForm_6<31, 284, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB), - "eqv $rA, $rS, $rB", IntGeneral, + "eqv $rA, $rS, $rB", IntSimple, [(set GPRC:$rA, (not (xor GPRC:$rS, GPRC:$rB)))]>; def XOR : XForm_6<31, 316, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB), - "xor $rA, $rS, $rB", IntGeneral, + "xor $rA, $rS, $rB", IntSimple, [(set GPRC:$rA, (xor GPRC:$rS, GPRC:$rB))]>; def SLW : XForm_6<31, 24, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB), "slw $rA, $rS, $rB", IntGeneral, @@ -967,10 +1052,10 @@ def CNTLZW : XForm_11<31, 26, (outs GPRC:$rA), (ins GPRC:$rS), "cntlzw $rA, $rS", IntGeneral, [(set GPRC:$rA, (ctlz GPRC:$rS))]>; def EXTSB : XForm_11<31, 954, (outs GPRC:$rA), (ins GPRC:$rS), - "extsb $rA, $rS", IntGeneral, + "extsb $rA, $rS", IntSimple, [(set GPRC:$rA, (sext_inreg GPRC:$rS, i8))]>; def EXTSH : XForm_11<31, 922, (outs GPRC:$rA), (ins GPRC:$rS), - "extsh $rA, $rS", IntGeneral, + "extsh $rA, $rS", IntSimple, [(set GPRC:$rA, (sext_inreg GPRC:$rS, i16))]>; def CMPW : XForm_16_ext<31, 0, (outs CRRC:$crD), (ins GPRC:$rA, GPRC:$rB), @@ -1115,7 +1200,7 @@ def MFCR : XFXForm_3<31, 19, (outs GPRC:$rT), (ins), PPC970_MicroCode, PPC970_Unit_CRU; def MFOCRF: XFXForm_5a<31, 19, (outs GPRC:$rT), (ins crbitm:$FXM), - "mfcr $rT, $FXM", SprMFCR>, + "mfocrf $rT, $FXM", SprMFCR>, PPC970_DGroup_First, PPC970_Unit_CRU; // Instructions to manipulate FPSCR. Only long double handling uses these. @@ -1159,7 +1244,7 @@ let PPC970_Unit = 1 in { // FXU Operations. // XO-Form instructions. 
Arithmetic instructions that can set overflow bit // def ADD4 : XOForm_1<31, 266, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB), - "add $rT, $rA, $rB", IntGeneral, + "add $rT, $rA, $rB", IntSimple, [(set GPRC:$rT, (add GPRC:$rA, GPRC:$rB))]>; let Defs = [CARRY] in { def ADDC : XOForm_1<31, 10, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB), @@ -1194,7 +1279,7 @@ def SUBFC : XOForm_1<31, 8, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB), PPC970_DGroup_Cracked; } def NEG : XOForm_3<31, 104, 0, (outs GPRC:$rT), (ins GPRC:$rA), - "neg $rT, $rA", IntGeneral, + "neg $rT, $rA", IntSimple, [(set GPRC:$rT, (ineg GPRC:$rA))]>; let Uses = [CARRY], Defs = [CARRY] in { def ADDE : XOForm_1<31, 138, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB), @@ -1226,51 +1311,43 @@ let Uses = [RM] in { def FMADD : AForm_1<63, 29, (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB), "fmadd $FRT, $FRA, $FRC, $FRB", FPFused, - [(set F8RC:$FRT, (fadd (fmul F8RC:$FRA, F8RC:$FRC), - F8RC:$FRB))]>, - Requires<[FPContractions]>; + [(set F8RC:$FRT, + (fma F8RC:$FRA, F8RC:$FRC, F8RC:$FRB))]>; def FMADDS : AForm_1<59, 29, (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB), "fmadds $FRT, $FRA, $FRC, $FRB", FPGeneral, - [(set F4RC:$FRT, (fadd (fmul F4RC:$FRA, F4RC:$FRC), - F4RC:$FRB))]>, - Requires<[FPContractions]>; + [(set F4RC:$FRT, + (fma F4RC:$FRA, F4RC:$FRC, F4RC:$FRB))]>; def FMSUB : AForm_1<63, 28, (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB), "fmsub $FRT, $FRA, $FRC, $FRB", FPFused, - [(set F8RC:$FRT, (fsub (fmul F8RC:$FRA, F8RC:$FRC), - F8RC:$FRB))]>, - Requires<[FPContractions]>; + [(set F8RC:$FRT, + (fma F8RC:$FRA, F8RC:$FRC, (fneg F8RC:$FRB)))]>; def FMSUBS : AForm_1<59, 28, (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB), "fmsubs $FRT, $FRA, $FRC, $FRB", FPGeneral, - [(set F4RC:$FRT, (fsub (fmul F4RC:$FRA, F4RC:$FRC), - F4RC:$FRB))]>, - Requires<[FPContractions]>; + [(set F4RC:$FRT, + (fma F4RC:$FRA, F4RC:$FRC, (fneg F4RC:$FRB)))]>; def FNMADD : AForm_1<63, 31, (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB), "fnmadd $FRT, $FRA, $FRC, $FRB", FPFused, - [(set F8RC:$FRT, (fneg (fadd (fmul F8RC:$FRA, F8RC:$FRC), - F8RC:$FRB)))]>, - Requires<[FPContractions]>; + [(set F8RC:$FRT, + (fneg (fma F8RC:$FRA, F8RC:$FRC, F8RC:$FRB)))]>; def FNMADDS : AForm_1<59, 31, (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB), "fnmadds $FRT, $FRA, $FRC, $FRB", FPGeneral, - [(set F4RC:$FRT, (fneg (fadd (fmul F4RC:$FRA, F4RC:$FRC), - F4RC:$FRB)))]>, - Requires<[FPContractions]>; + [(set F4RC:$FRT, + (fneg (fma F4RC:$FRA, F4RC:$FRC, F4RC:$FRB)))]>; def FNMSUB : AForm_1<63, 30, (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB), "fnmsub $FRT, $FRA, $FRC, $FRB", FPFused, - [(set F8RC:$FRT, (fneg (fsub (fmul F8RC:$FRA, F8RC:$FRC), - F8RC:$FRB)))]>, - Requires<[FPContractions]>; + [(set F8RC:$FRT, (fneg (fma F8RC:$FRA, F8RC:$FRC, + (fneg F8RC:$FRB))))]>; def FNMSUBS : AForm_1<59, 30, (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB), "fnmsubs $FRT, $FRA, $FRC, $FRB", FPGeneral, - [(set F4RC:$FRT, (fneg (fsub (fmul F4RC:$FRA, F4RC:$FRC), - F4RC:$FRB)))]>, - Requires<[FPContractions]>; + [(set F4RC:$FRT, (fneg (fma F4RC:$FRA, F4RC:$FRC, + (fneg F4RC:$FRB))))]>; } // FSEL is artificially split into 4 and 8-byte forms for the result. To avoid // having 4 of these, force the comparison to always be an 8-byte double (code @@ -1321,6 +1398,13 @@ let Uses = [RM] in { } let PPC970_Unit = 1 in { // FXU Operations. 
+ def ISEL : AForm_1<31, 15, + (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB, pred:$cond), + "isel $rT, $rA, $rB, $cond", IntGeneral, + []>; +} + +let PPC970_Unit = 1 in { // FXU Operations. // M-Form instructions. rotate and mask instructions. // let isCommutable = 1 in { @@ -1418,6 +1502,10 @@ def : Pat<(PPChi tjumptable:$in, 0), (LIS tjumptable:$in)>; def : Pat<(PPClo tjumptable:$in, 0), (LI tjumptable:$in)>; def : Pat<(PPChi tblockaddress:$in, 0), (LIS tblockaddress:$in)>; def : Pat<(PPClo tblockaddress:$in, 0), (LI tblockaddress:$in)>; +def : Pat<(PPChi tglobaltlsaddr:$g, GPRC:$in), + (ADDIS GPRC:$in, tglobaltlsaddr:$g)>; +def : Pat<(PPClo tglobaltlsaddr:$g, GPRC:$in), + (ADDIL GPRC:$in, tglobaltlsaddr:$g)>; def : Pat<(add GPRC:$in, (PPChi tglobaladdr:$g, 0)), (ADDIS GPRC:$in, tglobaladdr:$g)>; def : Pat<(add GPRC:$in, (PPChi tconstpool:$g, 0)), @@ -1427,14 +1515,6 @@ def : Pat<(add GPRC:$in, (PPChi tjumptable:$g, 0)), def : Pat<(add GPRC:$in, (PPChi tblockaddress:$g, 0)), (ADDIS GPRC:$in, tblockaddress:$g)>; -// Fused negative multiply subtract, alternate pattern -def : Pat<(fsub F8RC:$B, (fmul F8RC:$A, F8RC:$C)), - (FNMSUB F8RC:$A, F8RC:$C, F8RC:$B)>, - Requires<[FPContractions]>; -def : Pat<(fsub F4RC:$B, (fmul F4RC:$A, F4RC:$C)), - (FNMSUBS F4RC:$A, F4RC:$C, F4RC:$B)>, - Requires<[FPContractions]>; - // Standard shifts. These are represented separately from the real shifts above // so that we can distinguish between shifts that allow 5-bit and 6-bit shift // amounts. diff --git a/lib/Target/PowerPC/PPCJITInfo.cpp b/lib/Target/PowerPC/PPCJITInfo.cpp index a6528c0..aba2739 100644 --- a/lib/Target/PowerPC/PPCJITInfo.cpp +++ b/lib/Target/PowerPC/PPCJITInfo.cpp @@ -210,7 +210,7 @@ asm( ".text\n" ".align 2\n" ".globl PPC64CompilationCallback\n" - ".section \".opd\",\"aw\"\n" + ".section \".opd\",\"aw\",@progbits\n" ".align 3\n" "PPC64CompilationCallback:\n" ".quad .L.PPC64CompilationCallback,.TOC.@tocbase,0\n" diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp index 276edcb..19ec993 100644 --- a/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -99,10 +99,22 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, MCContext &Ctx = Printer.OutContext; MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; - if (MO.getTargetFlags() & PPCII::MO_LO16) - RefKind = isDarwin ? MCSymbolRefExpr::VK_PPC_DARWIN_LO16 : MCSymbolRefExpr::VK_PPC_GAS_LO16; - else if (MO.getTargetFlags() & PPCII::MO_HA16) - RefKind = isDarwin ? MCSymbolRefExpr::VK_PPC_DARWIN_HA16 : MCSymbolRefExpr::VK_PPC_GAS_HA16; + unsigned access = MO.getTargetFlags() & PPCII::MO_ACCESS_MASK; + + switch (access) { + case PPCII::MO_HA16: RefKind = isDarwin ? + MCSymbolRefExpr::VK_PPC_DARWIN_HA16 : + MCSymbolRefExpr::VK_PPC_GAS_HA16; + break; + case PPCII::MO_LO16: RefKind = isDarwin ? + MCSymbolRefExpr::VK_PPC_DARWIN_LO16 : + MCSymbolRefExpr::VK_PPC_GAS_LO16; + break; + case PPCII::MO_TPREL16_HA: RefKind = MCSymbolRefExpr::VK_PPC_TPREL16_HA; + break; + case PPCII::MO_TPREL16_LO: RefKind = MCSymbolRefExpr::VK_PPC_TPREL16_LO; + break; + } // FIXME: This isn't right, but we don't have a good way to express this in // the MC Level, see below. 
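The GetSymbolRef change just above stops testing MO_LO16/MO_HA16 as independent bits and instead masks out an access-kind field before dispatching, which matters now that four kinds (the HA16/LO16 pair and the two TPREL16 forms) share the same flag bits. A self-contained sketch of the masking idiom; the flag values here are invented for illustration, the real ones live in PPCII:

#include <cstdio>

// Hypothetical encoding: the access kind is a small integer field inside
// the target flags, so it must be extracted with a mask, not bit-tested.
enum : unsigned {
  MO_LO16        = 1,
  MO_HA16        = 2,
  MO_TPREL16_LO  = 3,
  MO_TPREL16_HA  = 4,
  MO_ACCESS_MASK = 7,
};

static const char *refKindFor(unsigned TargetFlags) {
  switch (TargetFlags & MO_ACCESS_MASK) {  // pull out the whole field
  case MO_LO16:       return "lo16";
  case MO_HA16:       return "ha16";
  case MO_TPREL16_LO: return "tprel@l";
  case MO_TPREL16_HA: return "tprel@ha";
  default:            return "none";
  }
}

int main() {
  std::printf("%s\n", refKindFor(MO_TPREL16_HA));
  return 0;
}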
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index ef13571..ab8bf1f 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -89,10 +89,17 @@ PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST, ImmToIdxMap[PPC::ADDI8] = PPC::ADD8; ImmToIdxMap[PPC::STD_32] = PPC::STDX_32; } +bool +PPCRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { + return requiresRegisterScavenging(MF); +} + + /// getPointerRegClass - Return the register class to use to hold pointers. /// This is used for addressing modes. const TargetRegisterClass * -PPCRegisterInfo::getPointerRegClass(unsigned Kind) const { +PPCRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) + const { if (Subtarget.isPPC64()) return &PPC::G8RCRegClass; return &PPC::GPRCRegClass; @@ -192,6 +199,20 @@ PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, } } +bool +PPCRegisterInfo::avoidWriteAfterWrite(const TargetRegisterClass *RC) const { + switch (RC->getID()) { + case PPC::G8RCRegClassID: + case PPC::GPRCRegClassID: + case PPC::F8RCRegClassID: + case PPC::F4RCRegClassID: + case PPC::VRRCRegClassID: + return true; + default: + return false; + } +} + //===----------------------------------------------------------------------===// // Stack Frame Processing methods //===----------------------------------------------------------------------===// @@ -321,14 +342,14 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II, // address of new allocated space. if (LP64) { if (requiresRegisterScavenging(MF)) // FIXME (64-bit): Use "true" part. - BuildMI(MBB, II, dl, TII.get(PPC::STDUX)) + BuildMI(MBB, II, dl, TII.get(PPC::STDUX), PPC::X1) .addReg(Reg, RegState::Kill) - .addReg(PPC::X1, RegState::Define) + .addReg(PPC::X1) .addReg(MI.getOperand(1).getReg()); else - BuildMI(MBB, II, dl, TII.get(PPC::STDUX)) + BuildMI(MBB, II, dl, TII.get(PPC::STDUX), PPC::X1) .addReg(PPC::X0, RegState::Kill) - .addReg(PPC::X1, RegState::Define) + .addReg(PPC::X1) .addReg(MI.getOperand(1).getReg()); if (!MI.getOperand(1).isKill()) @@ -342,9 +363,9 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II, .addImm(maxCallFrameSize) .addReg(MI.getOperand(1).getReg(), RegState::ImplicitKill); } else { - BuildMI(MBB, II, dl, TII.get(PPC::STWUX)) + BuildMI(MBB, II, dl, TII.get(PPC::STWUX), PPC::R1) .addReg(Reg, RegState::Kill) - .addReg(PPC::R1, RegState::Define) + .addReg(PPC::R1) .addReg(MI.getOperand(1).getReg()); if (!MI.getOperand(1).isKill()) diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h index b1e6a72..152c36d 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/lib/Target/PowerPC/PPCRegisterInfo.h @@ -35,7 +35,8 @@ public: /// getPointerRegClass - Return the register class to use to hold pointers. /// This is used for addressing modes. - virtual const TargetRegisterClass *getPointerRegClass(unsigned Kind=0) const; + virtual const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, unsigned Kind=0) const; unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const; @@ -46,10 +47,14 @@ public: BitVector getReservedRegs(const MachineFunction &MF) const; + virtual bool avoidWriteAfterWrite(const TargetRegisterClass *RC) const; + /// requiresRegisterScavenging - We require a register scavenger. /// FIXME (64-bit): Should be inlined. 
bool requiresRegisterScavenging(const MachineFunction &MF) const; + bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const; + void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td index 0e55313..5ca3876 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/lib/Target/PowerPC/PPCRegisterInfo.td @@ -314,12 +314,18 @@ def CRBITRC : RegisterClass<"PPC", [i32], 32, } def CRRC : RegisterClass<"PPC", [i32], 32, (add CR0, CR1, CR5, CR6, - CR7, CR2, CR3, CR4)> { - let SubRegClasses = [(CRBITRC sub_lt, sub_gt, sub_eq, sub_un)]; + CR7, CR2, CR3, CR4)>; + +// The CTR registers are not allocatable because they're used by the +// decrement-and-branch instructions, and thus need to stay live across +// multiple basic blocks. +def CTRRC : RegisterClass<"PPC", [i32], 32, (add CTR)> { + let isAllocatable = 0; +} +def CTRRC8 : RegisterClass<"PPC", [i64], 64, (add CTR8)> { + let isAllocatable = 0; } -def CTRRC : RegisterClass<"PPC", [i32], 32, (add CTR)>; -def CTRRC8 : RegisterClass<"PPC", [i64], 64, (add CTR8)>; def VRSAVERC : RegisterClass<"PPC", [i32], 32, (add VRSAVE)>; def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY)> { let CopyCost = -1; diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td index 8c0a858..6a6ccb9 100644 --- a/lib/Target/PowerPC/PPCSchedule.td +++ b/lib/Target/PowerPC/PPCSchedule.td @@ -25,6 +25,7 @@ def VFPU : FuncUnit; // vector floating point unit //===----------------------------------------------------------------------===// // Instruction Itinerary classes used for PowerPC // +def IntSimple : InstrItinClass; def IntGeneral : InstrItinClass; def IntCompare : InstrItinClass; def IntDivD : InstrItinClass; @@ -117,17 +118,17 @@ include "PPCScheduleA2.td" // // opcode itinerary class // ====== =============== -// add IntGeneral +// add IntSimple // addc IntGeneral // adde IntGeneral -// addi IntGeneral +// addi IntSimple // addic IntGeneral // addic. IntGeneral -// addis IntGeneral +// addis IntSimple // addme IntGeneral // addze IntGeneral -// and IntGeneral -// andc IntGeneral +// and IntSimple +// andc IntSimple // andi. IntGeneral // andis. 
IntGeneral // b BrB @@ -165,10 +166,10 @@ include "PPCScheduleA2.td" // eciwx LdStLoad // ecowx LdStLoad // eieio LdStLoad -// eqv IntGeneral -// extsb IntGeneral -// extsh IntGeneral -// extsw IntRotateD +// eqv IntSimple +// extsb IntSimple +// extsh IntSimple +// extsw IntSimple // fabs FPGeneral // fadd FPGeneral // fadds FPGeneral @@ -280,13 +281,13 @@ include "PPCScheduleA2.td" // mulld IntMulHD // mulli IntMulLI // mullw IntMulHW -// nand IntGeneral -// neg IntGeneral -// nor IntGeneral -// or IntGeneral -// orc IntGeneral -// ori IntGeneral -// oris IntGeneral +// nand IntSimple +// neg IntSimple +// nor IntSimple +// or IntSimple +// orc IntSimple +// ori IntSimple +// oris IntSimple // rfi SprRFI // rfid IntRFID // rldcl IntRotateD @@ -502,7 +503,7 @@ include "PPCScheduleA2.td" // vupklsb VecPerm // vupklsh VecPerm // vxor VecGeneral -// xor IntGeneral -// xori IntGeneral -// xoris IntGeneral +// xor IntSimple +// xori IntSimple +// xoris IntSimple // diff --git a/lib/Target/PowerPC/PPCSchedule440.td b/lib/Target/PowerPC/PPCSchedule440.td index 419faea..cd0fb70 100644 --- a/lib/Target/PowerPC/PPCSchedule440.td +++ b/lib/Target/PowerPC/PPCSchedule440.td @@ -108,6 +108,15 @@ def PPC440Itineraries : ProcessorItineraries< IRACC, IEXE1, IEXE2, IWB, LRACC, JEXE1, JEXE2, JWB, AGEN, CRD, LWB, FEXE1, FEXE2, FEXE3, FEXE4, FEXE5, FEXE6, FWB, LWARX_Hold], [GPR_Bypass, FPR_Bypass], [ + InstrItinData<IntSimple , [InstrStage<1, [IFTH1, IFTH2]>, + InstrStage<1, [PDCD1, PDCD2]>, + InstrStage<1, [DISS1, DISS2]>, + InstrStage<1, [IRACC, LRACC]>, + InstrStage<1, [IEXE1, JEXE1]>, + InstrStage<1, [IEXE2, JEXE2]>, + InstrStage<1, [IWB, JWB]>], + [6, 4, 4], + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, InstrItinData<IntGeneral , [InstrStage<1, [IFTH1, IFTH2]>, InstrStage<1, [PDCD1, PDCD2]>, InstrStage<1, [DISS1, DISS2]>, @@ -373,26 +382,6 @@ def PPC440Itineraries : ProcessorItineraries< InstrStage<1, [LWB]>], [8, 5], [NoBypass, GPR_Bypass]>, - InstrItinData<LdStSTD , [InstrStage<1, [IFTH1, IFTH2]>, - InstrStage<1, [PDCD1, PDCD2]>, - InstrStage<1, [DISS1, DISS2]>, - InstrStage<1, [LRACC]>, - InstrStage<1, [AGEN]>, - InstrStage<1, [CRD]>, - InstrStage<2, [LWB]>], - [8, 5], - [NoBypass, GPR_Bypass]>, - InstrItinData<LdStSTDCX , [InstrStage<1, [IFTH1, IFTH2]>, - InstrStage<1, [PDCD1, PDCD2]>, - InstrStage<1, [DISS1]>, - InstrStage<1, [IRACC], 0>, - InstrStage<4, [LWARX_Hold], 0>, - InstrStage<1, [LRACC]>, - InstrStage<1, [AGEN]>, - InstrStage<1, [CRD]>, - InstrStage<1, [LWB]>], - [8, 5], - [NoBypass, GPR_Bypass]>, InstrItinData<LdStSTWCX , [InstrStage<1, [IFTH1, IFTH2]>, InstrStage<1, [PDCD1, PDCD2]>, InstrStage<1, [DISS1]>, diff --git a/lib/Target/PowerPC/PPCScheduleA2.td b/lib/Target/PowerPC/PPCScheduleA2.td index 857ba40..4d4a5d0 100644 --- a/lib/Target/PowerPC/PPCScheduleA2.td +++ b/lib/Target/PowerPC/PPCScheduleA2.td @@ -60,6 +60,17 @@ def PPCA2Itineraries : ProcessorItineraries< IU5, IU6, RF0, XRF1, XEX1, XEX2, XEX3, XEX4, XEX5, XEX6, FRF1, FEX1, FEX2, FEX3, FEX4, FEX5, FEX6], [CR_Bypass, GPR_Bypass, FPR_Bypass], [ + InstrItinData<IntSimple , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [10, 7, 7], + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, InstrItinData<IntGeneral , [InstrStage<4, 
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, @@ -159,6 +170,17 @@ def PPCA2Itineraries : ProcessorItineraries< InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], [10, 7, 7], [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntRotateD , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [10, 7, 7], + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, InstrItinData<IntShift , [InstrStage<4, [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, @@ -181,6 +203,17 @@ def PPCA2Itineraries : ProcessorItineraries< InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], [10, 7, 7], [GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntTrapD , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [10, 7, 7], + [GPR_Bypass, GPR_Bypass]>, InstrItinData<BrB , [InstrStage<4, [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, @@ -269,6 +302,17 @@ def PPCA2Itineraries : ProcessorItineraries< InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], [14, 7], [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStLD , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [14, 7], + [GPR_Bypass, GPR_Bypass]>, InstrItinData<LdStStore , [InstrStage<4, [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, @@ -379,28 +423,6 @@ def PPCA2Itineraries : ProcessorItineraries< InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], [26, 7], [NoBypass, GPR_Bypass]>, - InstrItinData<LdStSTD , [InstrStage<4, - [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, - InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, - IU4_4, IU4_5, IU4_6, IU4_7]>, - InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, - InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, - InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, - InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, - InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], - [13, 7], - [GPR_Bypass, GPR_Bypass]>, - InstrItinData<LdStSTDCX , [InstrStage<4, - [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, - InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, - IU4_4, IU4_5, IU4_6, IU4_7]>, - InstrStage<1, [IU5]>, InstrStage<13, [IU6]>, - InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, - InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, - InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, - InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], - [26, 7], - [NoBypass, GPR_Bypass]>, InstrItinData<LdStSTWCX , [InstrStage<4, [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, diff --git a/lib/Target/PowerPC/PPCScheduleG3.td b/lib/Target/PowerPC/PPCScheduleG3.td index bc926f7..61e89ed 100644 --- 
a/lib/Target/PowerPC/PPCScheduleG3.td +++ b/lib/Target/PowerPC/PPCScheduleG3.td @@ -14,6 +14,7 @@ def G3Itineraries : ProcessorItineraries< [IU1, IU2, FPU1, BPU, SRU, SLU], [], [ + InstrItinData<IntSimple , [InstrStage<1, [IU1, IU2]>]>, InstrItinData<IntGeneral , [InstrStage<1, [IU1, IU2]>]>, InstrItinData<IntCompare , [InstrStage<1, [IU1, IU2]>]>, InstrItinData<IntDivW , [InstrStage<19, [IU1]>]>, diff --git a/lib/Target/PowerPC/PPCScheduleG4.td b/lib/Target/PowerPC/PPCScheduleG4.td index f7ec1e0..e19ddfa 100644 --- a/lib/Target/PowerPC/PPCScheduleG4.td +++ b/lib/Target/PowerPC/PPCScheduleG4.td @@ -13,6 +13,7 @@ def G4Itineraries : ProcessorItineraries< [IU1, IU2, SLU, SRU, BPU, FPU1, VIU1, VIU2, VPU, VFPU], [], [ + InstrItinData<IntSimple , [InstrStage<1, [IU1, IU2]>]>, InstrItinData<IntGeneral , [InstrStage<1, [IU1, IU2]>]>, InstrItinData<IntCompare , [InstrStage<1, [IU1, IU2]>]>, InstrItinData<IntDivW , [InstrStage<19, [IU1]>]>, diff --git a/lib/Target/PowerPC/PPCScheduleG4Plus.td b/lib/Target/PowerPC/PPCScheduleG4Plus.td index 37ebfc5..e7446cb 100644 --- a/lib/Target/PowerPC/PPCScheduleG4Plus.td +++ b/lib/Target/PowerPC/PPCScheduleG4Plus.td @@ -16,6 +16,7 @@ def IU4 : FuncUnit; // integer unit 4 (7450 simple) def G4PlusItineraries : ProcessorItineraries< [IU1, IU2, IU3, IU4, BPU, SLU, FPU1, VFPU, VIU1, VIU2, VPU], [], [ + InstrItinData<IntSimple , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>, InstrItinData<IntGeneral , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>, InstrItinData<IntCompare , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>, InstrItinData<IntDivW , [InstrStage<23, [IU2]>]>, diff --git a/lib/Target/PowerPC/PPCScheduleG5.td b/lib/Target/PowerPC/PPCScheduleG5.td index d1e40ce..1371499 100644 --- a/lib/Target/PowerPC/PPCScheduleG5.td +++ b/lib/Target/PowerPC/PPCScheduleG5.td @@ -13,6 +13,7 @@ def G5Itineraries : ProcessorItineraries< [IU1, IU2, SLU, BPU, FPU1, FPU2, VFPU, VIU1, VIU2, VPU], [], [ + InstrItinData<IntSimple , [InstrStage<2, [IU1, IU2]>]>, InstrItinData<IntGeneral , [InstrStage<2, [IU1, IU2]>]>, InstrItinData<IntCompare , [InstrStage<3, [IU1, IU2]>]>, InstrItinData<IntDivD , [InstrStage<68, [IU1]>]>, diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp index f405b47..bb193ac 100644 --- a/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -16,6 +16,7 @@ #include "PPC.h" #include "llvm/GlobalValue.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Support/Host.h" #include "llvm/Support/TargetRegistry.h" #include <cstdlib> @@ -25,56 +26,19 @@ using namespace llvm; -#if defined(__APPLE__) -#include <mach/mach.h> -#include <mach/mach_host.h> -#include <mach/host_info.h> -#include <mach/machine.h> - -/// GetCurrentPowerPCFeatures - Returns the current CPUs features. 
-static const char *GetCurrentPowerPCCPU() { - host_basic_info_data_t hostInfo; - mach_msg_type_number_t infoCount; - - infoCount = HOST_BASIC_INFO_COUNT; - host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, - &infoCount); - - if (hostInfo.cpu_type != CPU_TYPE_POWERPC) return "generic"; - - switch(hostInfo.cpu_subtype) { - case CPU_SUBTYPE_POWERPC_601: return "601"; - case CPU_SUBTYPE_POWERPC_602: return "602"; - case CPU_SUBTYPE_POWERPC_603: return "603"; - case CPU_SUBTYPE_POWERPC_603e: return "603e"; - case CPU_SUBTYPE_POWERPC_603ev: return "603ev"; - case CPU_SUBTYPE_POWERPC_604: return "604"; - case CPU_SUBTYPE_POWERPC_604e: return "604e"; - case CPU_SUBTYPE_POWERPC_620: return "620"; - case CPU_SUBTYPE_POWERPC_750: return "750"; - case CPU_SUBTYPE_POWERPC_7400: return "7400"; - case CPU_SUBTYPE_POWERPC_7450: return "7450"; - case CPU_SUBTYPE_POWERPC_970: return "970"; - default: ; - } - - return "generic"; -} -#endif - - PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU, const std::string &FS, bool is64Bit) : PPCGenSubtargetInfo(TT, CPU, FS) , StackAlignment(16) , DarwinDirective(PPC::DIR_NONE) - , IsGigaProcessor(false) + , HasMFOCRF(false) , Has64BitSupport(false) , Use64BitRegs(false) , IsPPC64(is64Bit) , HasAltivec(false) , HasFSQRT(false) , HasSTFIWX(false) + , HasISEL(false) , IsBookE(false) , HasLazyResolverStubs(false) , IsJITCodeModel(false) @@ -84,9 +48,10 @@ PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU, std::string CPUName = CPU; if (CPUName.empty()) CPUName = "generic"; -#if defined(__APPLE__) +#if (defined(__APPLE__) || defined(__linux__)) && \ + (defined(__ppc__) || defined(__powerpc__)) if (CPUName == "generic") - CPUName = GetCurrentPowerPCCPU(); + CPUName = sys::getHostCPUName(); #endif // Parse features string. @@ -146,10 +111,14 @@ bool PPCSubtarget::enablePostRAScheduler( CodeGenOpt::Level OptLevel, TargetSubtargetInfo::AntiDepBreakMode& Mode, RegClassVector& CriticalPathRCs) const { - if (DarwinDirective == PPC::DIR_440 || DarwinDirective == PPC::DIR_A2) - Mode = TargetSubtargetInfo::ANTIDEP_ALL; - else - Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL; + // FIXME: It would be best to use TargetSubtargetInfo::ANTIDEP_ALL here, + // but we can't because we can't reassign the cr registers. There is a + // dependence between the cr register and the RLWINM instruction used + // to extract its value which the anti-dependency breaker can't currently + // see. Maybe we should make a late-expanded pseudo to encode this dependency. 
+ // (the relevant code is in PPCDAGToDAGISel::SelectSETCC) + + Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL; CriticalPathRCs.clear(); @@ -157,6 +126,9 @@ bool PPCSubtarget::enablePostRAScheduler( CriticalPathRCs.push_back(&PPC::G8RCRegClass); else CriticalPathRCs.push_back(&PPC::GPRCRegClass); + + CriticalPathRCs.push_back(&PPC::F8RCRegClass); + CriticalPathRCs.push_back(&PPC::VRRCRegClass); return OptLevel >= CodeGenOpt::Default; } diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h index a275029..0207c83 100644 --- a/lib/Target/PowerPC/PPCSubtarget.h +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -41,6 +41,8 @@ namespace PPC { DIR_750, DIR_970, DIR_A2, + DIR_PWR6, + DIR_PWR7, DIR_64 }; } @@ -61,13 +63,14 @@ protected: unsigned DarwinDirective; /// Used by the ISel to turn on optimizations for POWER4-derived architectures - bool IsGigaProcessor; + bool HasMFOCRF; bool Has64BitSupport; bool Use64BitRegs; bool IsPPC64; bool HasAltivec; bool HasFSQRT; bool HasSTFIWX; + bool HasISEL; bool IsBookE; bool HasLazyResolverStubs; bool IsJITCodeModel; @@ -138,7 +141,8 @@ public: bool hasFSQRT() const { return HasFSQRT; } bool hasSTFIWX() const { return HasSTFIWX; } bool hasAltivec() const { return HasAltivec; } - bool isGigaProcessor() const { return IsGigaProcessor; } + bool hasMFOCRF() const { return HasMFOCRF; } + bool hasISEL() const { return HasISEL; } bool isBookE() const { return IsBookE; } const Triple &getTargetTriple() const { return TargetTriple; } diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index d113976..9805112 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -17,10 +17,15 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/CodeGen/Passes.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; +static cl:: +opt<bool> DisableCTRLoops("disable-ppc-ctrloops", cl::Hidden, + cl::desc("Disable CTR loops for PPC")); + extern "C" void LLVMInitializePowerPCTarget() { // Register the targets RegisterTargetMachine<PPC32TargetMachine> A(ThePPC32Target); @@ -81,41 +86,37 @@ public: return getTM<PPCTargetMachine>(); } + virtual bool addPreRegAlloc(); virtual bool addInstSelector(); virtual bool addPreEmitPass(); }; } // namespace TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) { - TargetPassConfig *PassConfig = new PPCPassConfig(this, PM); + return new PPCPassConfig(this, PM); +} - // Override this for PowerPC. Tail merging happily breaks up instruction issue - // groups, which typically degrades performance. - PassConfig->setEnableTailMerge(false); +bool PPCPassConfig::addPreRegAlloc() { + if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None) + addPass(createPPCCTRLoops()); - return PassConfig; + return false; } bool PPCPassConfig::addInstSelector() { // Install an instruction selector. - PM.add(createPPCISelDag(getPPCTargetMachine())); + addPass(createPPCISelDag(getPPCTargetMachine())); return false; } bool PPCPassConfig::addPreEmitPass() { // Must run branch selection immediately preceding the asm printer. - PM.add(createPPCBranchSelectionPass()); + addPass(createPPCBranchSelectionPass()); return false; } bool PPCTargetMachine::addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE) { - // FIXME: This should be moved to TargetJITInfo!!
- if (Subtarget.isPPC64()) - // Temporary workaround for the inability of PPC64 JIT to handle jump - // tables. - Options.DisableJumpTables = true; - // Inform the subtarget that we are in JIT mode. FIXME: does this break macho // writing? Subtarget.SetJITMode(); diff --git a/lib/Target/PowerPC/README.txt b/lib/Target/PowerPC/README.txt index 349cd89..b6763aa 100644 --- a/lib/Target/PowerPC/README.txt +++ b/lib/Target/PowerPC/README.txt @@ -2,7 +2,6 @@ TODO: * gpr0 allocation -* implement do-loop -> bdnz transform * lmw/stmw pass a la arm load store optimizer for prolog/epilog ===-------------------------------------------------------------------------=== diff --git a/lib/Target/README.txt b/lib/Target/README.txt index 093255e..cbfa4cf 100644 --- a/lib/Target/README.txt +++ b/lib/Target/README.txt @@ -964,6 +964,12 @@ optimized with "clang -emit-llvm-bc | opt -std-compile-opts". //===---------------------------------------------------------------------===// +unsigned f(unsigned x) { return ((x & 7) + 1) & 15; } +The & 15 part should be optimized away, it doesn't change the result. Currently +not optimized with "clang -emit-llvm-bc | opt -std-compile-opts". + +//===---------------------------------------------------------------------===// + This was noticed in the entryblock for grokdeclarator in 403.gcc: %tmp = icmp eq i32 %decl_context, 4 diff --git a/lib/Target/Sparc/CMakeLists.txt b/lib/Target/Sparc/CMakeLists.txt index ae4af0f..efb10db 100644 --- a/lib/Target/Sparc/CMakeLists.txt +++ b/lib/Target/Sparc/CMakeLists.txt @@ -23,5 +23,7 @@ add_llvm_target(SparcCodeGen SparcSelectionDAGInfo.cpp ) +add_dependencies(LLVMSparcCodeGen intrinsics_gen) + add_subdirectory(TargetInfo) add_subdirectory(MCTargetDesc) diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp index 883aa3a..7bf8c3f 100644 --- a/lib/Target/Sparc/DelaySlotFiller.cpp +++ b/lib/Target/Sparc/DelaySlotFiller.cpp @@ -279,14 +279,11 @@ void Filler::insertDefsUses(MachineBasicBlock::iterator MI, //returns true if the Reg or its alias is in the RegSet. bool Filler::IsRegInSet(SmallSet<unsigned, 32>& RegSet, unsigned Reg) { - if (RegSet.count(Reg)) - return true; - // check Aliased Registers - for (const uint16_t *Alias = TM.getRegisterInfo()->getAliasSet(Reg); - *Alias; ++ Alias) - if (RegSet.count(*Alias)) + // Check Reg and all aliased Registers. + for (MCRegAliasIterator AI(Reg, TM.getRegisterInfo(), true); + AI.isValid(); ++AI) + if (RegSet.count(*AI)) return true; - return false; } diff --git a/lib/Target/Sparc/SparcAsmPrinter.cpp b/lib/Target/Sparc/SparcAsmPrinter.cpp index c14b3d4..2554862 100644 --- a/lib/Target/Sparc/SparcAsmPrinter.cpp +++ b/lib/Target/Sparc/SparcAsmPrinter.cpp @@ -187,7 +187,9 @@ bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, if (ExtraCode[1] != 0) return true; // Unknown modifier. switch (ExtraCode[0]) { - default: return true; // Unknown modifier. 
+ default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); case 'r': break; } diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h index 210705e..6b593c9 100644 --- a/lib/Target/Sparc/SparcFrameLowering.h +++ b/lib/Target/Sparc/SparcFrameLowering.h @@ -22,10 +22,9 @@ namespace llvm { class SparcSubtarget; class SparcFrameLowering : public TargetFrameLowering { - const SparcSubtarget &STI; public: - explicit SparcFrameLowering(const SparcSubtarget &sti) - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8, 0), STI(sti) { + explicit SparcFrameLowering(const SparcSubtarget &/*sti*/) + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8, 0) { } /// emitProlog/emitEpilog - These methods insert prolog and epilog code into diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index c3e6f16..79f7ebd 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -90,7 +90,7 @@ SparcTargetLowering::LowerReturn(SDValue Chain, // CCState - Info about the registers and stack slot. CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - DAG.getTarget(), RVLocs, *DAG.getContext()); + DAG.getTarget(), RVLocs, *DAG.getContext()); // Analyze return values. CCInfo.AnalyzeReturn(Outs, RetCC_Sparc32); @@ -160,7 +160,7 @@ SparcTargetLowering::LowerFormalArguments(SDValue Chain, // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc32); const unsigned StackOffset = 92; @@ -345,21 +345,26 @@ SparcTargetLowering::LowerFormalArguments(SDValue Chain, } SDValue -SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, +SparcTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool isVarArg = CLI.IsVarArg; + // Sparc target does not yet support tail call optimization. isTailCall = false; // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - DAG.getTarget(), ArgLocs, *DAG.getContext()); + DAG.getTarget(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_Sparc32); // Get the size of the outgoing arguments stack space requirement. @@ -590,7 +595,7 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs; CCState RVInfo(CallConv, isVarArg, DAG.getMachineFunction(), - DAG.getTarget(), RVLocs, *DAG.getContext()); + DAG.getTarget(), RVLocs, *DAG.getContext()); RVInfo.AnalyzeCallResult(Ins, RetCC_Sparc32); @@ -689,9 +694,9 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM) : TargetLowering(TM, new TargetLoweringObjectFileELF()) { // Set up the register classes. - addRegisterClass(MVT::i32, SP::IntRegsRegisterClass); - addRegisterClass(MVT::f32, SP::FPRegsRegisterClass); - addRegisterClass(MVT::f64, SP::DFPRegsRegisterClass); + addRegisterClass(MVT::i32, &SP::IntRegsRegClass); + addRegisterClass(MVT::f32, &SP::FPRegsRegClass); + addRegisterClass(MVT::f64, &SP::DFPRegsRegClass); // Turn FP extload into load/fextend setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); @@ -1259,7 +1264,7 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': - return std::make_pair(0U, SP::IntRegsRegisterClass); + return std::make_pair(0U, &SP::IntRegsRegClass); } } diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h index cf43048..09148ea 100644 --- a/lib/Target/Sparc/SparcISelLowering.h +++ b/lib/Target/Sparc/SparcISelLowering.h @@ -76,12 +76,7 @@ namespace llvm { SmallVectorImpl<SDValue> &InVals) const; virtual SDValue - LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, - bool isVarArg, bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; virtual SDValue diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp index faff468..f8674d0 100644 --- a/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/lib/Target/Sparc/SparcInstrInfo.cpp @@ -303,13 +303,13 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, if (I != MBB.end()) DL = I->getDebugLoc(); // On the order of operands here: think "[FrameIdx + 0] = SrcReg". 
- if (RC == SP::IntRegsRegisterClass) + if (RC == &SP::IntRegsRegClass) BuildMI(MBB, I, DL, get(SP::STri)).addFrameIndex(FI).addImm(0) .addReg(SrcReg, getKillRegState(isKill)); - else if (RC == SP::FPRegsRegisterClass) + else if (RC == &SP::FPRegsRegClass) BuildMI(MBB, I, DL, get(SP::STFri)).addFrameIndex(FI).addImm(0) .addReg(SrcReg, getKillRegState(isKill)); - else if (RC == SP::DFPRegsRegisterClass) + else if (RC == &SP::DFPRegsRegClass) BuildMI(MBB, I, DL, get(SP::STDFri)).addFrameIndex(FI).addImm(0) .addReg(SrcReg, getKillRegState(isKill)); else @@ -324,11 +324,11 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); - if (RC == SP::IntRegsRegisterClass) + if (RC == &SP::IntRegsRegClass) BuildMI(MBB, I, DL, get(SP::LDri), DestReg).addFrameIndex(FI).addImm(0); - else if (RC == SP::FPRegsRegisterClass) + else if (RC == &SP::FPRegsRegClass) BuildMI(MBB, I, DL, get(SP::LDFri), DestReg).addFrameIndex(FI).addImm(0); - else if (RC == SP::DFPRegsRegisterClass) + else if (RC == &SP::DFPRegsRegClass) BuildMI(MBB, I, DL, get(SP::LDDFri), DestReg).addFrameIndex(FI).addImm(0); else llvm_unreachable("Can't load this register from stack slot"); diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp index 6f31356..9ee12ed 100644 --- a/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/lib/Target/Sparc/SparcTargetMachine.cpp @@ -34,7 +34,8 @@ SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT, : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), Subtarget(TT, CPU, FS, is64bit), DataLayout(Subtarget.getDataLayout()), - TLInfo(*this), TSInfo(*this), InstrInfo(Subtarget), + InstrInfo(Subtarget), + TLInfo(*this), TSInfo(*this), FrameLowering(Subtarget) { } @@ -59,7 +60,7 @@ TargetPassConfig *SparcTargetMachine::createPassConfig(PassManagerBase &PM) { } bool SparcPassConfig::addInstSelector() { - PM.add(createSparcISelDag(getSparcTargetMachine())); + addPass(createSparcISelDag(getSparcTargetMachine())); return false; } @@ -67,8 +68,8 @@ bool SparcPassConfig::addInstSelector() { /// passes immediately before machine code is emitted. This should return /// true if -print-machineinstrs should print out the code after the passes. 
bool SparcPassConfig::addPreEmitPass(){ - PM.add(createSparcFPMoverPass(getSparcTargetMachine())); - PM.add(createSparcDelaySlotFillerPass(getSparcTargetMachine())); + addPass(createSparcFPMoverPass(getSparcTargetMachine())); + addPass(createSparcDelaySlotFillerPass(getSparcTargetMachine())); return true; } diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h index b203dfa..b2cc624 100644 --- a/lib/Target/Sparc/SparcTargetMachine.h +++ b/lib/Target/Sparc/SparcTargetMachine.h @@ -28,9 +28,9 @@ namespace llvm { class SparcTargetMachine : public LLVMTargetMachine { SparcSubtarget Subtarget; const TargetData DataLayout; // Calculates type size & alignment + SparcInstrInfo InstrInfo; SparcTargetLowering TLInfo; SparcSelectionDAGInfo TSInfo; - SparcInstrInfo InstrInfo; SparcFrameLowering FrameLowering; public: SparcTargetMachine(const Target &T, StringRef TT, diff --git a/lib/Target/TargetData.cpp b/lib/Target/TargetData.cpp index acb7476..cc6dc1e 100644 --- a/lib/Target/TargetData.cpp +++ b/lib/Target/TargetData.cpp @@ -117,8 +117,8 @@ TargetAlignElem::operator==(const TargetAlignElem &rhs) const { && TypeBitWidth == rhs.TypeBitWidth); } -const TargetAlignElem TargetData::InvalidAlignmentElem = - TargetAlignElem::get((AlignTypeEnum) -1, 0, 0, 0); +const TargetAlignElem +TargetData::InvalidAlignmentElem = { (AlignTypeEnum)0xFF, 0, 0, 0 }; //===----------------------------------------------------------------------===// // TargetData Class Implementation diff --git a/lib/Target/TargetInstrInfo.cpp b/lib/Target/TargetInstrInfo.cpp index 440f9ad..f1d1d07 100644 --- a/lib/Target/TargetInstrInfo.cpp +++ b/lib/Target/TargetInstrInfo.cpp @@ -21,20 +21,25 @@ using namespace llvm; //===----------------------------------------------------------------------===// // TargetInstrInfo -//===----------------------------------------------------------------------===// +// +// Methods that depend on CodeGen are implemented in +// TargetInstrInfoImpl.cpp. Invoking them without linking libCodeGen raises a +// link error. +// ===----------------------------------------------------------------------===// TargetInstrInfo::~TargetInstrInfo() { } const TargetRegisterClass* TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum, - const TargetRegisterInfo *TRI) const { + const TargetRegisterInfo *TRI, + const MachineFunction &MF) const { if (OpNum >= MCID.getNumOperands()) return 0; short RegClass = MCID.OpInfo[OpNum].RegClass; if (MCID.OpInfo[OpNum].isLookupPtrRegClass()) - return TRI->getPointerRegClass(RegClass); + return TRI->getPointerRegClass(MF, RegClass); // Instructions like INSERT_SUBREG do not have fixed register classes. if (RegClass < 0) @@ -44,54 +49,6 @@ TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum, return TRI->getRegClass(RegClass); } -unsigned -TargetInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, - const MachineInstr *MI) const { - if (!ItinData || ItinData->isEmpty()) - return 1; - - unsigned Class = MI->getDesc().getSchedClass(); - unsigned UOps = ItinData->Itineraries[Class].NumMicroOps; - if (UOps) - return UOps; - - // The # of u-ops is dynamically determined. The specific target should - // override this function to return the right number. 
- return 1; -} - -int -TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, - const MachineInstr *DefMI, unsigned DefIdx, - const MachineInstr *UseMI, unsigned UseIdx) const { - if (!ItinData || ItinData->isEmpty()) - return -1; - - unsigned DefClass = DefMI->getDesc().getSchedClass(); - unsigned UseClass = UseMI->getDesc().getSchedClass(); - return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx); -} - -int TargetInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, - const MachineInstr *MI, - unsigned *PredCost) const { - if (!ItinData || ItinData->isEmpty()) - return 1; - - return ItinData->getStageLatency(MI->getDesc().getSchedClass()); -} - -bool TargetInstrInfo::hasLowDefLatency(const InstrItineraryData *ItinData, - const MachineInstr *DefMI, - unsigned DefIdx) const { - if (!ItinData || ItinData->isEmpty()) - return false; - - unsigned DefClass = DefMI->getDesc().getSchedClass(); - int DefCycle = ItinData->getOperandCycle(DefClass, DefIdx); - return (DefCycle != -1 && DefCycle <= 1); -} - /// insertNoop - Insert a noop into the instruction stream at the specified /// point. void TargetInstrInfo::insertNoop(MachineBasicBlock &MBB, @@ -99,7 +56,6 @@ void TargetInstrInfo::insertNoop(MachineBasicBlock &MBB, llvm_unreachable("Target didn't implement insertNoop!"); } - /// Measure the specified inline asm to determine an approximation of its /// length. /// Comments (which run till the next SeparatorString or newline) do not diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp index 2570e0d..b74a0bd 100644 --- a/lib/Target/TargetLoweringObjectFile.cpp +++ b/lib/Target/TargetLoweringObjectFile.cpp @@ -152,7 +152,7 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV, // a mergable string section, or general .data if it contains relocations. if (GVar->isConstant()) { // If the initializer for the global contains something that requires a - // relocation, then we may have to drop this into a wriable data section + // relocation, then we may have to drop this into a writable data section // even though it is marked const. switch (C->getRelocationInfo()) { case Constant::NoRelocation: diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index b9b2526..3825719 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -11,7 +11,9 @@ // //===----------------------------------------------------------------------===// +#include "llvm/GlobalAlias.h" #include "llvm/GlobalValue.h" +#include "llvm/GlobalVariable.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCCodeGenInfo.h" #include "llvm/Target/TargetMachine.h" @@ -75,25 +77,58 @@ CodeModel::Model TargetMachine::getCodeModel() const { return CodeGenInfo->getCodeModel(); } +/// Get the IR-specified TLS model for Var. 
+static TLSModel::Model getSelectedTLSModel(const GlobalVariable *Var) { + switch (Var->getThreadLocalMode()) { + case GlobalVariable::NotThreadLocal: + llvm_unreachable("getSelectedTLSModel for non-TLS variable"); + break; + case GlobalVariable::GeneralDynamicTLSModel: + return TLSModel::GeneralDynamic; + case GlobalVariable::LocalDynamicTLSModel: + return TLSModel::LocalDynamic; + case GlobalVariable::InitialExecTLSModel: + return TLSModel::InitialExec; + case GlobalVariable::LocalExecTLSModel: + return TLSModel::LocalExec; + } + llvm_unreachable("invalid TLS model"); +} + TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const { - bool isLocal = GV->hasLocalLinkage(); - bool isDeclaration = GV->isDeclaration(); + // If GV is an alias then use the aliasee for determining + // thread-localness. + if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) + GV = GA->resolveAliasedGlobal(false); + const GlobalVariable *Var = cast<GlobalVariable>(GV); + + bool isLocal = Var->hasLocalLinkage(); + bool isDeclaration = Var->isDeclaration(); + bool isPIC = getRelocationModel() == Reloc::PIC_; + bool isPIE = Options.PositionIndependentExecutable; // FIXME: what should we do for protected and internal visibility? // For variables, is internal different from hidden? - bool isHidden = GV->hasHiddenVisibility(); + bool isHidden = Var->hasHiddenVisibility(); - if (getRelocationModel() == Reloc::PIC_ && - !Options.PositionIndependentExecutable) { + TLSModel::Model Model; + if (isPIC && !isPIE) { if (isLocal || isHidden) - return TLSModel::LocalDynamic; + Model = TLSModel::LocalDynamic; else - return TLSModel::GeneralDynamic; + Model = TLSModel::GeneralDynamic; } else { if (!isDeclaration || isHidden) - return TLSModel::LocalExec; + Model = TLSModel::LocalExec; else - return TLSModel::InitialExec; + Model = TLSModel::InitialExec; } + + // If the user specified a more specific model, use that. + TLSModel::Model SelectedModel = getSelectedTLSModel(Var); + if (SelectedModel > Model) + return SelectedModel; + + return Model; } /// getOptLevel - Returns the optimization level: None, Less, @@ -127,4 +162,3 @@ void TargetMachine::setFunctionSections(bool V) { void TargetMachine::setDataSections(bool V) { DataSections = V; } - diff --git a/lib/Target/TargetRegisterInfo.cpp b/lib/Target/TargetRegisterInfo.cpp index 1716423..2395f2b 100644 --- a/lib/Target/TargetRegisterInfo.cpp +++ b/lib/Target/TargetRegisterInfo.cpp @@ -46,6 +46,50 @@ void PrintReg::print(raw_ostream &OS) const { } } +void PrintRegUnit::print(raw_ostream &OS) const { + // Generic printout when TRI is missing. + if (!TRI) { + OS << "Unit~" << Unit; + return; + } + + // Check for invalid register units. + if (Unit >= TRI->getNumRegUnits()) { + OS << "BadUnit~" << Unit; + return; + } + + // Normal units have at least one root. + MCRegUnitRootIterator Roots(Unit, TRI); + assert(Roots.isValid() && "Unit has no roots."); + OS << TRI->getName(*Roots); + for (++Roots; Roots.isValid(); ++Roots) + OS << '~' << TRI->getName(*Roots); +} + +/// getAllocatableClass - Return the maximal subclass of the given register +/// class that is allocatable, or NULL.
+const TargetRegisterClass * +TargetRegisterInfo::getAllocatableClass(const TargetRegisterClass *RC) const { + if (!RC || RC->isAllocatable()) + return RC; + + const unsigned *SubClass = RC->getSubClassMask(); + for (unsigned Base = 0, BaseE = getNumRegClasses(); + Base < BaseE; Base += 32) { + unsigned Idx = Base; + for (unsigned Mask = *SubClass++; Mask; Mask >>= 1) { + unsigned Offset = CountTrailingZeros_32(Mask); + const TargetRegisterClass *SubRC = getRegClass(Idx + Offset); + if (SubRC->isAllocatable()) + return SubRC; + Mask >>= Offset; + Idx += Offset + 1; + } + } + return NULL; +} + /// getMinimalPhysRegClass - Returns the Register Class of a physical /// register of the given type, picking the most specific sub register class /// of the right type that contains this physreg. @@ -71,6 +115,7 @@ TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, EVT VT) const { /// registers for the specific register class. static void getAllocatableSetForRC(const MachineFunction &MF, const TargetRegisterClass *RC, BitVector &R){ + assert(RC->isAllocatable() && "invalid for nonallocatable sets"); ArrayRef<uint16_t> Order = RC->getRawAllocationOrder(MF); for (unsigned i = 0; i != Order.size(); ++i) R.set(Order[i]); @@ -80,7 +125,10 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF, const TargetRegisterClass *RC) const { BitVector Allocatable(getNumRegs()); if (RC) { - getAllocatableSetForRC(MF, RC, Allocatable); + // A register class with no allocatable subclass returns an empty set. + const TargetRegisterClass *SubClass = getAllocatableClass(RC); + if (SubClass) + getAllocatableSetForRC(MF, SubClass, Allocatable); } else { for (TargetRegisterInfo::regclass_iterator I = regclass_begin(), E = regclass_end(); I != E; ++I) @@ -95,6 +143,16 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF, return Allocatable; } +static inline +const TargetRegisterClass *firstCommonClass(const uint32_t *A, + const uint32_t *B, + const TargetRegisterInfo *TRI) { + for (unsigned I = 0, E = TRI->getNumRegClasses(); I < E; I += 32) + if (unsigned Common = *A++ & *B++) + return TRI->getRegClass(I + CountTrailingZeros_32(Common)); + return 0; +} + const TargetRegisterClass * TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A, const TargetRegisterClass *B) const { @@ -106,15 +164,83 @@ TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A, // Register classes are ordered topologically, so the largest common // sub-class is the common sub-class with the smallest ID. - const unsigned *SubA = A->getSubClassMask(); - const unsigned *SubB = B->getSubClassMask(); + return firstCommonClass(A->getSubClassMask(), B->getSubClassMask(), this); - // We could start the search from max(A.ID, B.ID), but we are only going to - // execute 2-3 iterations anyway. - for (unsigned Base = 0, BaseE = getNumRegClasses(); Base < BaseE; Base += 32) - if (unsigned Common = *SubA++ & *SubB++) - return getRegClass(Base + CountTrailingZeros_32(Common)); +const TargetRegisterClass * +TargetRegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, + const TargetRegisterClass *B, + unsigned Idx) const { + assert(A && B && "Missing register class"); + assert(Idx && "Bad sub-register index"); + + // Find Idx in the list of super-register indices. + for (SuperRegClassIterator RCI(B, this); RCI.isValid(); ++RCI) + if (RCI.getSubReg() == Idx) + // The bit mask contains all register classes that are projected into B + // by Idx. Find a class that is also a sub-class of A.
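+      // Worked example with hypothetical mask words: if the first 32-bit
+      // word of RCI.getMask() is 0x30 and the first word of
+      // A->getSubClassMask() is 0x22, the common bits are 0x20, so
+      // firstCommonClass returns getRegClass(5), because
+      // CountTrailingZeros_32(0x20) == 5.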
+ return firstCommonClass(RCI.getMask(), A->getSubClassMask(), this); + return 0; +} - // No common sub-class exists. - return NULL; +const TargetRegisterClass *TargetRegisterInfo:: +getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA, + const TargetRegisterClass *RCB, unsigned SubB, + unsigned &PreA, unsigned &PreB) const { + assert(RCA && SubA && RCB && SubB && "Invalid arguments"); + + // Search all pairs of sub-register indices that project into RCA and RCB + // respectively. This is quadratic, but usually the sets are very small. On + // most targets like X86, there will only be a single sub-register index + // (e.g., sub_16bit projecting into GR16). + // + // The worst case is a register class like DPR on ARM. + // We have indices dsub_0..dsub_7 projecting into that class. + // + // It is very common that one register class is a sub-register of the other. + // Arrange for RCA to be the larger register so the answer will be found in + // the first iteration. This makes the search linear for the most common + // case. + const TargetRegisterClass *BestRC = 0; + unsigned *BestPreA = &PreA; + unsigned *BestPreB = &PreB; + if (RCA->getSize() < RCB->getSize()) { + std::swap(RCA, RCB); + std::swap(SubA, SubB); + std::swap(BestPreA, BestPreB); + } + + // Also terminate the search once we have found a register class as small as + // RCA. + unsigned MinSize = RCA->getSize(); + + for (SuperRegClassIterator IA(RCA, this, true); IA.isValid(); ++IA) { + unsigned FinalA = composeSubRegIndices(IA.getSubReg(), SubA); + for (SuperRegClassIterator IB(RCB, this, true); IB.isValid(); ++IB) { + // Check if a common super-register class exists for this index pair. + const TargetRegisterClass *RC = + firstCommonClass(IA.getMask(), IB.getMask(), this); + if (!RC || RC->getSize() < MinSize) + continue; + + // The indexes must compose identically: PreA+SubA == PreB+SubB. + unsigned FinalB = composeSubRegIndices(IB.getSubReg(), SubB); + if (FinalA != FinalB) + continue; + + // Is RC a better candidate than BestRC? + if (BestRC && RC->getSize() >= BestRC->getSize()) + continue; + + // Yes, RC is the smallest super-register seen so far. + BestRC = RC; + *BestPreA = IA.getSubReg(); + *BestPreB = IB.getSubReg(); + + // Bail early if we reached MinSize. We won't find a better candidate.
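+        // For instance, in the containment case described above - RCB
+        // reachable from RCA by one sub-register index - the first index
+        // pair can already yield a class of size MinSize, so the quadratic
+        // scan ends after a single outer iteration.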
+ if (BestRC->getSize() == MinSize) + return BestRC; + } + } + return BestRC; } diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 08c732c..95e83ec 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -117,7 +117,7 @@ static unsigned MatchRegisterName(StringRef Name); /// } -static bool isImmSExti16i8Value(uint64_t Value) { +static bool isImmSExti16i8Value(uint64_t Value) { return (( Value <= 0x000000000000007FULL)|| (0x000000000000FF80ULL <= Value && Value <= 0x000000000000FFFFULL)|| (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); @@ -135,12 +135,12 @@ static bool isImmZExtu32u8Value(uint64_t Value) { static bool isImmSExti64i8Value(uint64_t Value) { return (( Value <= 0x000000000000007FULL)|| - (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); } static bool isImmSExti64i32Value(uint64_t Value) { return (( Value <= 0x000000007FFFFFFFULL)|| - (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); } namespace { @@ -187,7 +187,7 @@ struct X86Operand : public MCParsedAsmOperand { SMLoc getStartLoc() const { return StartLoc; } /// getEndLoc - Get the location of the last token of this operand. SMLoc getEndLoc() const { return EndLoc; } - + SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); } virtual void print(raw_ostream &OS) const {} @@ -309,28 +309,45 @@ struct X86Operand : public MCParsedAsmOperand { } bool isMem() const { return Kind == Memory; } - bool isMem8() const { + bool isMem8() const { return Kind == Memory && (!Mem.Size || Mem.Size == 8); } - bool isMem16() const { + bool isMem16() const { return Kind == Memory && (!Mem.Size || Mem.Size == 16); } - bool isMem32() const { + bool isMem32() const { return Kind == Memory && (!Mem.Size || Mem.Size == 32); } - bool isMem64() const { + bool isMem64() const { return Kind == Memory && (!Mem.Size || Mem.Size == 64); } - bool isMem80() const { + bool isMem80() const { return Kind == Memory && (!Mem.Size || Mem.Size == 80); } - bool isMem128() const { + bool isMem128() const { return Kind == Memory && (!Mem.Size || Mem.Size == 128); } - bool isMem256() const { + bool isMem256() const { return Kind == Memory && (!Mem.Size || Mem.Size == 256); } + bool isMemVX32() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32) && + getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15; + } + bool isMemVY32() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32) && + getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15; + } + bool isMemVX64() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64) && + getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15; + } + bool isMemVY64() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64) && + getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15; + } + bool isAbsMem() const { return Kind == Memory && !getMemSegReg() && !getMemBaseReg() && !getMemIndexReg() && getMemScale() == 1; @@ -356,26 +373,38 @@ struct X86Operand : public MCParsedAsmOperand { addExpr(Inst, getImm()); } - void addMem8Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); + void addMem8Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMem16Operands(MCInst &Inst, unsigned N) const { + 
addMemOperands(Inst, N); + } + void addMem32Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); } - void addMem16Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); + void addMem64Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); } - void addMem32Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); + void addMem80Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); } - void addMem64Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); + void addMem128Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); } - void addMem80Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); + void addMem256Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); } - void addMem128Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); + void addMemVX32Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); } - void addMem256Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); + void addMemVY32Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMemVX64Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMemVY64Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); } void addMemOperands(MCInst &Inst, unsigned N) const { @@ -467,7 +496,7 @@ bool X86AsmParser::isSrcOp(X86Operand &Op) { bool X86AsmParser::isDstOp(X86Operand &Op) { unsigned basereg = is64BitMode() ? X86::RDI : X86::EDI; - return Op.isMem() && + return Op.isMem() && (Op.Mem.SegReg == 0 || Op.Mem.SegReg == X86::ES) && isa<MCConstantExpr>(Op.Mem.Disp) && cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && @@ -611,7 +640,7 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, if (getLexer().isNot(AsmToken::LBrac)) return ErrorOperand(Start, "Expected '[' token!"); Parser.Lex(); - + if (getLexer().is(AsmToken::Identifier)) { // Parse BaseReg if (ParseRegister(BaseReg, Start, End)) { @@ -638,11 +667,11 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, // Handle '[' Scale*IndexReg ']' Parser.Lex(); SMLoc IdxRegLoc = Parser.getTok().getLoc(); - if (ParseRegister(IndexReg, IdxRegLoc, End)) - return ErrorOperand(IdxRegLoc, "Expected register"); + if (ParseRegister(IndexReg, IdxRegLoc, End)) + return ErrorOperand(IdxRegLoc, "Expected register"); Scale = Val; } else - return ErrorOperand(Loc, "Unepxeted token"); + return ErrorOperand(Loc, "Unexpected token"); } if (getLexer().is(AsmToken::Plus) || getLexer().is(AsmToken::Minus)) { @@ -655,8 +684,8 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, if (getLexer().is(AsmToken::Star)) { Parser.Lex(); SMLoc IdxRegLoc = Parser.getTok().getLoc(); - if (ParseRegister(IndexReg, IdxRegLoc, End)) - return ErrorOperand(IdxRegLoc, "Expected register"); + if (ParseRegister(IndexReg, IdxRegLoc, End)) + return ErrorOperand(IdxRegLoc, "Expected register"); Scale = Val; } else if (getLexer().is(AsmToken::RBrac)) { const MCExpr *ValExpr = MCConstantExpr::Create(Val, getContext()); @@ -668,7 +697,7 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, End = Parser.getTok().getLoc(); if (!IndexReg) ParseRegister(IndexReg, Start, End); - else if (getParser().ParseExpression(Disp, End)) return 0; + else if (getParser().ParseExpression(Disp, End)) return 0; } } @@ -881,7 +910,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { if 
(getParser().ParseAbsoluteExpression(ScaleVal)){ Error(Loc, "expected scale expression"); return 0; - } + } // Validate the scale amount. if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){ @@ -916,15 +945,18 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { // If we have both a base register and an index register make sure they are // both 64-bit or 32-bit registers. + // To support VSIB, IndexReg can be 128-bit or 256-bit registers. if (BaseReg != 0 && IndexReg != 0) { if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) && - !X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg) && + (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) || + X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)) && IndexReg != X86::RIZ) { Error(IndexLoc, "index register is 32-bit, but base register is 64-bit"); return 0; } if (X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) && - !X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) && + (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) || + X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) && IndexReg != X86::EIZ){ Error(IndexLoc, "index register is 64-bit, but base register is 32-bit"); return 0; @@ -944,7 +976,7 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, if (PatchedName.startswith("set") && PatchedName.endswith("b") && PatchedName != "setb" && PatchedName != "setnb") PatchedName = PatchedName.substr(0, Name.size()-1); - + // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}. const MCExpr *ExtraImmOp = 0; if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) && @@ -1204,20 +1236,20 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, // Intel syntax X86Operand *Op1 = static_cast<X86Operand*>(Operands[2]); if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) && - cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) { - delete Operands[2]; - Operands.pop_back(); + cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) { + delete Operands[2]; + Operands.pop_back(); } } else { X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]); if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) && - cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) { - delete Operands[1]; - Operands.erase(Operands.begin() + 1); + cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) { + delete Operands[1]; + Operands.erase(Operands.begin() + 1); } } } - + // Transforms "int $3" into "int3" as a size optimization. We can't write an // instalias with an immediate operand yet. if (Name == "int" && Operands.size() == 2) { @@ -1520,7 +1552,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, case Match_Success: // Some instructions need post-processing to, for example, tweak which // encoding is selected. Loop on it while changes happen so the - // individual transformations can chain off each other. + // individual transformations can chain off each other. while (processInstruction(Inst, Operands)) ; @@ -1558,12 +1590,12 @@ MatchAndEmitInstruction(SMLoc IDLoc, // Otherwise, we assume that this may be an integer instruction, which comes // in 8/16/32/64-bit forms using the b,w,l,q suffixes respectively. const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0"; - + // Check for the various suffix matches. 
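  // Illustrative walk-through (hypothetical input): a suffix-less
  // "add $1, (%rax)" is retried as addb, addw, addl and addq; the four match
  // results below are then combined so that a single successful match selects
  // the encoding and anything else produces a diagnostic.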
Tmp[Base.size()] = Suffixes[0]; unsigned ErrorInfoIgnore; unsigned Match1, Match2, Match3, Match4; - + Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore); Tmp[Base.size()] = Suffixes[1]; Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore); @@ -1673,10 +1705,10 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { getParser().setAssemblerDialect(1); if (getLexer().isNot(AsmToken::EndOfStatement)) { if(Parser.getTok().getString() == "noprefix") { - // FIXME : Handle noprefix - Parser.Lex(); + // FIXME : Handle noprefix + Parser.Lex(); } else - return true; + return true; } return false; } @@ -1691,19 +1723,19 @@ bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { const MCExpr *Value; if (getParser().ParseExpression(Value)) return true; - + getParser().getStreamer().EmitValue(Value, Size, 0 /*addrspace*/); - + if (getLexer().is(AsmToken::EndOfStatement)) break; - + // FIXME: Improve diagnostic. if (getLexer().isNot(AsmToken::Comma)) return Error(L, "unexpected token in directive"); Parser.Lex(); } } - + Parser.Lex(); return false; } diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index f612e23..b886d46 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -52,6 +52,8 @@ endif() add_llvm_target(X86CodeGen ${sources}) +add_dependencies(LLVMX86CodeGen intrinsics_gen) + add_subdirectory(AsmParser) add_subdirectory(Disassembler) add_subdirectory(InstPrinter) diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index b13a006..4bbfe95 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -356,15 +356,15 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, // Special case those X86 instructions that use the imm8 as a set of // bits, bit count, etc. and are not sign-extend. if (Opcode != X86::BLENDPSrri && Opcode != X86::BLENDPDrri && - Opcode != X86::PBLENDWrri && Opcode != X86::MPSADBWrri && - Opcode != X86::DPPSrri && Opcode != X86::DPPDrri && - Opcode != X86::INSERTPSrr && Opcode != X86::VBLENDPSYrri && - Opcode != X86::VBLENDPSYrmi && Opcode != X86::VBLENDPDYrri && - Opcode != X86::VBLENDPDYrmi && Opcode != X86::VPBLENDWrri && - Opcode != X86::VMPSADBWrri && Opcode != X86::VDPPSYrri && - Opcode != X86::VDPPSYrmi && Opcode != X86::VDPPDrri && - Opcode != X86::VINSERTPSrr) - type = TYPE_MOFFS8; + Opcode != X86::PBLENDWrri && Opcode != X86::MPSADBWrri && + Opcode != X86::DPPSrri && Opcode != X86::DPPDrri && + Opcode != X86::INSERTPSrr && Opcode != X86::VBLENDPSYrri && + Opcode != X86::VBLENDPSYrmi && Opcode != X86::VBLENDPDYrri && + Opcode != X86::VBLENDPDYrmi && Opcode != X86::VPBLENDWrri && + Opcode != X86::VMPSADBWrri && Opcode != X86::VDPPSYrri && + Opcode != X86::VDPPSYrmi && Opcode != X86::VDPPDrri && + Opcode != X86::VINSERTPSrr) + type = TYPE_MOFFS8; break; case ENCODING_IW: type = TYPE_MOFFS16; @@ -498,7 +498,38 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, } else { baseReg = MCOperand::CreateReg(0); } - + + // Check whether we are handling VSIB addressing mode for GATHER. + // If sibIndex was set to SIB_INDEX_NONE, index offset is 4 and + // we should use SIB_INDEX_XMM4|YMM4 for VSIB. + // I don't see a way to get the correct IndexReg in readSIB: + // We can tell whether it is VSIB or SIB after instruction ID is decoded, + // but instruction ID may not be decoded yet when calling readSIB. 
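+  // Illustrative contrast between the two opcode sets built below:
+  // VGATHERDPSrm takes a 128-bit XMM index and lands in IndexIs128, while
+  // VGATHERDPSYrm takes a 256-bit YMM index and lands in IndexIs256, so the
+  // same SIB index field is rebased onto SIB_INDEX_XMM0 or SIB_INDEX_YMM0
+  // depending on the opcode.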
+ uint32_t Opcode = mcInst.getOpcode(); + bool IndexIs128 = (Opcode == X86::VGATHERDPDrm || + Opcode == X86::VGATHERDPDYrm || + Opcode == X86::VGATHERQPDrm || + Opcode == X86::VGATHERDPSrm || + Opcode == X86::VGATHERQPSrm || + Opcode == X86::VPGATHERDQrm || + Opcode == X86::VPGATHERDQYrm || + Opcode == X86::VPGATHERQQrm || + Opcode == X86::VPGATHERDDrm || + Opcode == X86::VPGATHERQDrm); + bool IndexIs256 = (Opcode == X86::VGATHERQPDYrm || + Opcode == X86::VGATHERDPSYrm || + Opcode == X86::VGATHERQPSYrm || + Opcode == X86::VPGATHERQQYrm || + Opcode == X86::VPGATHERDDYrm || + Opcode == X86::VPGATHERQDYrm); + if (IndexIs128 || IndexIs256) { + unsigned IndexOffset = insn.sibIndex - + (insn.addressSize == 8 ? SIB_INDEX_RAX:SIB_INDEX_EAX); + SIBIndex IndexBase = IndexIs256 ? SIB_INDEX_YMM0 : SIB_INDEX_XMM0; + insn.sibIndex = (SIBIndex)(IndexBase + + (insn.sibIndex == SIB_INDEX_NONE ? 4 : IndexOffset)); + } + if (insn.sibIndex != SIB_INDEX_NONE) { switch (insn.sibIndex) { default: @@ -509,6 +540,8 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, indexReg = MCOperand::CreateReg(X86::x); break; EA_BASES_32BIT EA_BASES_64BIT + REGS_XMM + REGS_YMM #undef ENTRY } } else { diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index fae309b..e2caf6a 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -310,11 +310,14 @@ typedef enum { * SIBIndex - All possible values of the SIB index field. * Borrows entries from ALL_EA_BASES with the special case that * sib is synonymous with NONE. + * Vector SIB: index can be XMM or YMM. */ typedef enum { SIB_INDEX_NONE, #define ENTRY(x) SIB_INDEX_##x, ALL_EA_BASES + REGS_XMM + REGS_YMM #undef ENTRY SIB_INDEX_max } SIBIndex; diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index f532019..64ac5e6 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -96,7 +96,17 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::PSHUFHWmi: case X86::VPSHUFHWmi: DestName = getRegName(MI->getOperand(0).getReg()); - DecodePSHUFHWMask(MI->getOperand(MI->getNumOperands()-1).getImm(), + DecodePSHUFHWMask(MVT::v8i16, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + case X86::VPSHUFHWYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPSHUFHWYmi: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePSHUFHWMask(MVT::v16i16, + MI->getOperand(MI->getNumOperands()-1).getImm(), ShuffleMask); break; case X86::PSHUFLWri: @@ -106,7 +116,17 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::PSHUFLWmi: case X86::VPSHUFLWmi: DestName = getRegName(MI->getOperand(0).getReg()); - DecodePSHUFLWMask(MI->getOperand(MI->getNumOperands()-1).getImm(), + DecodePSHUFLWMask(MVT::v8i16, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + case X86::VPSHUFLWYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. 
+ case X86::VPSHUFLWYmi: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePSHUFLWMask(MVT::v16i16, + MI->getOperand(MI->getNumOperands()-1).getImm(), ShuffleMask); break; @@ -487,6 +507,16 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::VPERMQYri: + case X86::VPERMPDYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPERMQYmi: + case X86::VPERMPDYmi: + DecodeVPERMMask(MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; } diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index a0bb6dc..db597fb 100644 --- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -94,40 +94,83 @@ namespace X86II { MO_PLT, /// MO_TLSGD - On a symbol operand this indicates that the immediate is - /// some TLS offset. + /// the offset of the GOT entry with the TLS index structure that contains + /// the module number and variable offset for the symbol. Used in the + /// general dynamic TLS access model. /// /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @TLSGD MO_TLSGD, + /// MO_TLSLD - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the TLS index for the module that + /// contains the symbol. When this index is passed to a call to + /// __tls_get_addr, the function will return the base address of the TLS + /// block for the symbol. Used in the x86-64 local dynamic TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @TLSLD + MO_TLSLD, + + /// MO_TLSLDM - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the TLS index for the module that + /// contains the symbol. When this index is passed to a call to + /// ___tls_get_addr, the function will return the base address of the TLS + /// block for the symbol. Used in the IA32 local dynamic TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @TLSLDM + MO_TLSLDM, + /// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is - /// some TLS offset. + /// the offset of the GOT entry with the thread-pointer offset for the + /// symbol. Used in the x86-64 initial exec TLS access model. /// /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @GOTTPOFF MO_GOTTPOFF, /// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is - /// some TLS offset. + /// the absolute address of the GOT entry with the negative thread-pointer + /// offset for the symbol. Used in the non-PIC IA32 initial exec TLS access + /// model. /// /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @INDNTPOFF MO_INDNTPOFF, /// MO_TPOFF - On a symbol operand this indicates that the immediate is - /// some TLS offset. + /// the thread-pointer offset for the symbol. Used in the x86-64 local + /// exec TLS access model. /// /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @TPOFF MO_TPOFF, + /// MO_DTPOFF - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the TLS offset of the symbol. Used + /// in the local dynamic TLS access model.
+ /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @DTPOFF + MO_DTPOFF, + /// MO_NTPOFF - On a symbol operand this indicates that the immediate is - /// some TLS offset. + /// the negative thread-pointer offset for the symbol. Used in the IA32 + /// local exec TLS access model. /// /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @NTPOFF MO_NTPOFF, + /// MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the negative thread-pointer offset for + /// the symbol. Used in the PIC IA32 initial exec TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @GOTNTPOFF + MO_GOTNTPOFF, + /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the /// reference is actually to the "__imp_FOO" symbol. This is used for /// dllimport linkage on windows. @@ -438,17 +481,17 @@ namespace X86II { // getBaseOpcodeFor - This function returns the "base" X86 opcode for the // specified machine instruction. // - static inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) { + inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) { return TSFlags >> X86II::OpcodeShift; } - static inline bool hasImm(uint64_t TSFlags) { + inline bool hasImm(uint64_t TSFlags) { return (TSFlags & X86II::ImmMask) != 0; } /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field /// of the specified instruction. - static inline unsigned getSizeOfImm(uint64_t TSFlags) { + inline unsigned getSizeOfImm(uint64_t TSFlags) { switch (TSFlags & X86II::ImmMask) { default: llvm_unreachable("Unknown immediate size"); case X86II::Imm8: @@ -463,7 +506,7 @@ namespace X86II { /// isImmPCRel - Return true if the immediate of the specified instruction's /// TSFlags indicates that it is pc relative. - static inline unsigned isImmPCRel(uint64_t TSFlags) { + inline unsigned isImmPCRel(uint64_t TSFlags) { switch (TSFlags & X86II::ImmMask) { default: llvm_unreachable("Unknown immediate size"); case X86II::Imm8PCRel: @@ -486,9 +529,11 @@ namespace X86II { /// is duplicated in the MCInst (e.g. "EAX = addl EAX, [mem]") it is only /// counted as one operand. /// - static inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) { + inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) { switch (TSFlags & X86II::FormMask) { - case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this form"); + case X86II::MRMInitReg: + // FIXME: Remove this form. + return -1; default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!"); case X86II::Pseudo: case X86II::RawFrm: @@ -546,7 +591,7 @@ namespace X86II { /// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or /// higher) register? e.g. r8, xmm8, xmm13, etc. 
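/// For example, %r8d and %xmm12 are extended registers that need the REX or
/// VEX extension bit, while %eax and %xmm7 are not.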
- static inline bool isX86_64ExtendedReg(unsigned RegNo) { + inline bool isX86_64ExtendedReg(unsigned RegNo) { switch (RegNo) { default: break; case X86::R8: case X86::R9: case X86::R10: case X86::R11: @@ -568,7 +613,7 @@ namespace X86II { return false; } - static inline bool isX86_64NonExtLowByteReg(unsigned reg) { + inline bool isX86_64NonExtLowByteReg(unsigned reg) { return (reg == X86::SPL || reg == X86::BPL || reg == X86::SIL || reg == X86::DIL); } diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index afa545c..49c07f3 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -35,19 +35,6 @@ AsmWriterFlavor("x86-asm-syntax", cl::init(ATT), clEnumValEnd)); -static const char *const x86_asm_table[] = { - "{si}", "S", - "{di}", "D", - "{ax}", "a", - "{cx}", "c", - "{memory}", "memory", - "{flags}", "", - "{dirflag}", "", - "{fpsr}", "", - "{fpcr}", "", - "{cc}", "cc", - 0,0}; - void X86MCAsmInfoDarwin::anchor() { } X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { @@ -55,7 +42,6 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { if (is64Bit) PointerSize = 8; - AsmTransCBE = x86_asm_table; AssemblerDialect = AsmWriterFlavor; TextAlignFillValue = 0x90; @@ -88,7 +74,6 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { if (T.getArch() == Triple::x86_64) PointerSize = 8; - AsmTransCBE = x86_asm_table; AssemblerDialect = AsmWriterFlavor; TextAlignFillValue = 0x90; @@ -137,7 +122,6 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { PrivateGlobalPrefix = ".L"; } - AsmTransCBE = x86_asm_table; AssemblerDialect = AsmWriterFlavor; TextAlignFillValue = 0x90; @@ -151,7 +135,6 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { PrivateGlobalPrefix = ".L"; } - AsmTransCBE = x86_asm_table; AssemblerDialect = AsmWriterFlavor; TextAlignFillValue = 0x90; diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 80990e5..4a38324 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -139,6 +139,7 @@ public: MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx) { return new X86MCCodeEmitter(MCII, STI, Ctx); @@ -569,7 +570,17 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, } // Classify VEX_B, VEX_4V, VEX_R, VEX_X + unsigned NumOps = Desc.getNumOperands(); unsigned CurOp = 0; + if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0) + ++CurOp; + else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) { + assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1); + // Special case for GATHER with 2 TIED_TO operands + // Skip the first 2 operands: dst, mask_wb + CurOp += 2; + } + switch (TSFlags & X86II::FormMask) { case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!"); case X86II::MRMDestMem: { @@ -602,11 +613,11 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // FMA4: // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), - if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp++).getReg())) VEX_R = 0x0; if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, 1); + VEX_4V = 
getVEXRegisterEncoding(MI, CurOp); if (X86II::isX86_64ExtendedReg( MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) @@ -616,7 +627,12 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, VEX_X = 0x0; if (HasVEX_4VOp3) - VEX_4V = getVEXRegisterEncoding(MI, X86::AddrNumOperands+1); + // Instruction format for 4VOp3: + // src1(ModR/M), MemAddr, src3(VEX_4V) + // CurOp points to start of the MemoryOperand, + // it skips TIED_TO operands if they exist, then increments past src1. + // CurOp + X86::AddrNumOperands will point to src3. + VEX_4V = getVEXRegisterEncoding(MI, CurOp+X86::AddrNumOperands); break; case X86II::MRM0m: case X86II::MRM1m: case X86II::MRM2m: case X86II::MRM3m: @@ -961,11 +977,14 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, // FIXME: This should be handled during MCInst lowering. unsigned NumOps = Desc.getNumOperands(); unsigned CurOp = 0; - if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1) + if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0) ++CurOp; - else if (NumOps > 2 && Desc.getOperandConstraint(NumOps-1, MCOI::TIED_TO)== 0) - // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32 - --NumOps; + else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) { + assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1); + // Special case for GATHER with 2 TIED_TO operands + // Skip the first 2 operands: dst, mask_wb + CurOp += 2; + } // Keep track of the current byte being emitted. unsigned CurByte = 0; @@ -1037,7 +1056,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, SrcRegNum = CurOp + X86::AddrNumOperands; if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) - SrcRegNum++; + ++SrcRegNum; EmitMemModRMByte(MI, CurOp, GetX86RegNum(MI.getOperand(SrcRegNum)), @@ -1050,15 +1069,15 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, SrcRegNum = CurOp + 1; if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) - SrcRegNum++; + ++SrcRegNum; - if(HasMemOp4) // Skip 2nd src (which is encoded in I8IMM) - SrcRegNum++; + if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM) + ++SrcRegNum; EmitRegModRMByte(MI.getOperand(SrcRegNum), GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS); - // 2 operands skipped with HasMemOp4, comensate accordingly + // 2 operands skipped with HasMemOp4, compensate accordingly CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1; if (HasVEX_4VOp3) ++CurOp; @@ -1071,7 +1090,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, ++AddrOperands; ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). } - if(HasMemOp4) // Skip second register source (encoded in I8IMM) + if (HasMemOp4) // Skip second register source (encoded in I8IMM) ++FirstMemOp; EmitByte(BaseOpcode, CurByte, OS); @@ -1089,7 +1108,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRM4r: case X86II::MRM5r: case X86II::MRM6r: case X86II::MRM7r: if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). - CurOp++; + ++CurOp; EmitByte(BaseOpcode, CurByte, OS); EmitRegModRMByte(MI.getOperand(CurOp++), (TSFlags & X86II::FormMask)-X86II::MRM0r, @@ -1100,7 +1119,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRM4m: case X86II::MRM5m: case X86II::MRM6m: case X86II::MRM7m: if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
- CurOp++; + ++CurOp; EmitByte(BaseOpcode, CurByte, OS); EmitMemModRMByte(MI, CurOp, (TSFlags & X86II::FormMask)-X86II::MRM0m, TSFlags, CurByte, OS, Fixups); @@ -1149,22 +1168,23 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, } // If there is a remaining operand, it must be a trailing immediate. Emit it - // according to the right size for the instruction. - if (CurOp != NumOps) { + // according to the right size for the instruction. Some instructions + // (SSE4a extrq and insertq) have two trailing immediates. + while (CurOp != NumOps && NumOps - CurOp <= 2) { // The last source register of a 4 operand instruction in AVX is encoded // in bits[7:4] of an immediate byte. if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) { const MCOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand - : CurOp); - CurOp++; - bool IsExtReg = X86II::isX86_64ExtendedReg(MO.getReg()); - unsigned RegNum = (IsExtReg ? (1 << 7) : 0); - RegNum |= GetX86RegNum(MO) << 4; + : CurOp); + ++CurOp; + unsigned RegNum = GetX86RegNum(MO) << 4; + if (X86II::isX86_64ExtendedReg(MO.getReg())) + RegNum |= 1 << 7; // If there is an additional 5th operand it must be an immediate, which // is encoded in bits[3:0] - if(CurOp != NumOps) { + if (CurOp != NumOps) { const MCOperand &MIMM = MI.getOperand(CurOp++); - if(MIMM.isImm()) { + if (MIMM.isImm()) { unsigned Val = MIMM.getImm(); assert(Val < 16 && "Immediate operand value out of range"); RegNum |= Val; diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 9896cbe..4650069 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -76,6 +76,7 @@ namespace X86_MC { } MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx); diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index a802333..8b87c1f 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -64,13 +64,13 @@ void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) { /// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*. /// VT indicates the type of the vector allowing it to handle different /// datatypes and vector widths.
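/// As a worked example, Imm = 0x1B on v4i32 decodes to the reversal mask
/// <3,2,1,0>: each two-bit field of the immediate selects one element within
/// a 128-bit lane, and the pattern repeats per lane for wider vectors.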
-void DecodePSHUFMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { +void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits() / 128; unsigned NumLaneElts = NumElts / NumLanes; - int NewImm = Imm; + unsigned NewImm = Imm; for (unsigned l = 0; l != NumElts; l += NumLaneElts) { for (unsigned i = 0; i != NumLaneElts; ++i) { ShuffleMask.push_back(NewImm % NumLaneElts + l); @@ -80,48 +80,55 @@ void DecodePSHUFMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { } } -void DecodePSHUFHWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { - ShuffleMask.push_back(0); - ShuffleMask.push_back(1); - ShuffleMask.push_back(2); - ShuffleMask.push_back(3); - for (unsigned i = 0; i != 4; ++i) { - ShuffleMask.push_back(4+(Imm & 3)); - Imm >>= 2; +void DecodePSHUFHWMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + for (unsigned l = 0; l != NumElts; l += 8) { + unsigned NewImm = Imm; + for (unsigned i = 0, e = 4; i != e; ++i) { + ShuffleMask.push_back(l + i); + } + for (unsigned i = 4, e = 8; i != e; ++i) { + ShuffleMask.push_back(l + 4 + (NewImm & 3)); + NewImm >>= 2; + } } } -void DecodePSHUFLWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { - for (unsigned i = 0; i != 4; ++i) { - ShuffleMask.push_back((Imm & 3)); - Imm >>= 2; +void DecodePSHUFLWMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + for (unsigned l = 0; l != NumElts; l += 8) { + unsigned NewImm = Imm; + for (unsigned i = 0, e = 4; i != e; ++i) { + ShuffleMask.push_back(l + (NewImm & 3)); + NewImm >>= 2; + } + for (unsigned i = 4, e = 8; i != e; ++i) { + ShuffleMask.push_back(l + i); + } } - ShuffleMask.push_back(4); - ShuffleMask.push_back(5); - ShuffleMask.push_back(6); - ShuffleMask.push_back(7); } /// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates /// the type of the vector allowing it to handle different datatypes and vector /// widths. -void DecodeSHUFPMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { +void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits() / 128; unsigned NumLaneElts = NumElts / NumLanes; - int NewImm = Imm; + unsigned NewImm = Imm; for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - // Part that reads from dest. - for (unsigned i = 0; i != NumLaneElts/2; ++i) { - ShuffleMask.push_back(NewImm % NumLaneElts + l); - NewImm /= NumLaneElts; - } - // Part that reads from src. - for (unsigned i = 0; i != NumLaneElts/2; ++i) { - ShuffleMask.push_back(NewImm % NumLaneElts + NumElts + l); - NewImm /= NumLaneElts; + // each half of a lane comes from a different source + for (unsigned s = 0; s != NumElts*2; s += NumElts) { + for (unsigned i = 0; i != NumLaneElts/2; ++i) { + ShuffleMask.push_back(NewImm % NumLaneElts + s + l); + NewImm /= NumLaneElts; + } } if (NumLaneElts == 4) NewImm = Imm; // reload imm } @@ -130,7 +137,7 @@ void DecodeSHUFPMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { /// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd /// and punpckh*. VT indicates the type of the vector allowing it to handle /// different datatypes and vector widths.
-void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) { +void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -150,7 +157,7 @@ void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) { /// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd /// and punpckl*. VT indicates the type of the vector allowing it to handle /// different datatypes and vector widths. -void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) { +void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -167,19 +174,26 @@ void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) { } } -void DecodeVPERM2X128Mask(EVT VT, unsigned Imm, +void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { if (Imm & 0x88) return; // Not a shuffle unsigned HalfSize = VT.getVectorNumElements()/2; - unsigned FstHalfBegin = (Imm & 0x3) * HalfSize; - unsigned SndHalfBegin = ((Imm >> 4) & 0x3) * HalfSize; - for (int i = FstHalfBegin, e = FstHalfBegin+HalfSize; i != e; ++i) - ShuffleMask.push_back(i); - for (int i = SndHalfBegin, e = SndHalfBegin+HalfSize; i != e; ++i) - ShuffleMask.push_back(i); + for (unsigned l = 0; l != 2; ++l) { + unsigned HalfBegin = ((Imm >> (l*4)) & 0x3) * HalfSize; + for (unsigned i = HalfBegin, e = HalfBegin+HalfSize; i != e; ++i) + ShuffleMask.push_back(i); + } +} + +/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD. +/// No VT provided since it only works on 256-bit, 4 element vectors. +void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + for (unsigned i = 0; i != 4; ++i) { + ShuffleMask.push_back((Imm >> (2*i)) & 3); + } } } // llvm namespace diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h index 5b8c6ef..70d8171 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -35,31 +35,35 @@ void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); // <0,2> or <0,1,4,5> void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); -void DecodePSHUFMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); +void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); -void DecodePSHUFHWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); +void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); -void DecodePSHUFLWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); +void DecodePSHUFLWMask(MVT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); /// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates /// the type of the vector allowing it to handle different datatypes and vector /// widths. -void DecodeSHUFPMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); +void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); /// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd /// and punpckh*. VT indicates the type of the vector allowing it to handle /// different datatypes and vector widths. 
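/// As a worked example, on v4i32 the UNPCKL mask is <0,4,1,5> and the UNPCKH
/// mask is <2,6,3,7>; with AVX the same interleave repeats in each 128-bit
/// lane.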
-void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<int> &ShuffleMask); +void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); /// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd /// and punpckl*. VT indicates the type of the vector allowing it to handle /// different datatypes and vector widths. -void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<int> &ShuffleMask); +void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); -void DecodeVPERM2X128Mask(EVT VT, unsigned Imm, +void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); +/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD. +/// No VT provided since it only works on 256-bit, 4 element vectors. +void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + } // llvm namespace #endif diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index ecc7b59..bf05ccf 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -36,6 +36,11 @@ FunctionPass *createX86ISelDag(X86TargetMachine &TM, /// register for PIC on x86-32. FunctionPass* createGlobalBaseRegPass(); +/// createCleanupLocalDynamicTLSPass() - This pass combines multiple accesses +/// to local-dynamic TLS variables so that the TLS base address for the module +/// is only fetched once per execution path through the function. +FunctionPass *createCleanupLocalDynamicTLSPass(); + /// createX86FloatingPointStackifierPass - This function returns a pass which /// converts floating point register references and pseudo instructions into /// floating point stack references and physical instructions. diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index b6591d4..6c1a816 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -86,21 +86,24 @@ def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX", def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2", "Enable AVX2 instructions", [FeatureAVX]>; -def FeatureCLMUL : SubtargetFeature<"clmul", "HasCLMUL", "true", - "Enable carry-less multiplication instructions">; -def FeatureFMA3 : SubtargetFeature<"fma3", "HasFMA3", "true", +def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", + "Enable packed carry-less multiplication instructions", + [FeatureSSE2]>; +def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true", "Enable three-operand fused multiply-add", [FeatureAVX]>; def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true", "Enable four-operand fused multiply-add", - [FeatureAVX]>; + [FeatureAVX, FeatureSSE4A]>; def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true", - "Enable XOP instructions">; + "Enable XOP instructions", + [FeatureAVX, FeatureSSE4A]>; def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem", "HasVectorUAMem", "true", "Allow unaligned memory operands on vector/SIMD instructions">; def FeatureAES : SubtargetFeature<"aes", "HasAES", "true", - "Enable AES instructions">; + "Enable AES instructions", + [FeatureSSE2]>; def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true", "Support MOVBE instruction">; def FeatureRDRAND : SubtargetFeature<"rdrand", "HasRDRAND", "true", @@ -128,10 +131,10 @@ def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom", "Intel Atom processors">; class Proc<string Name, list<SubtargetFeature> Features> - : Processor<Name, GenericItineraries, Features>; + : ProcessorModel<Name, GenericModel, Features>; class AtomProc<string Name, list<SubtargetFeature> Features> - : Processor<Name, AtomItineraries, Features>; +
: ProcessorModel<Name, AtomModel, Features>; def : Proc<"generic", []>; def : Proc<"i386", []>; @@ -169,25 +172,23 @@ def : Proc<"nehalem", [FeatureSSE42, FeatureCMPXCHG16B, // Westmere is the corei3/i5/i7 path from nehalem to sandybridge def : Proc<"westmere", [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem, FeatureFastUAMem, - FeaturePOPCNT, FeatureAES, FeatureCLMUL]>; + FeaturePOPCNT, FeatureAES, FeaturePCLMUL]>; // Sandy Bridge // SSE is not listed here since llvm treats AVX as a reimplementation of SSE, // rather than a superset. -// FIXME: Disabling AVX for now since it's not ready. -def : Proc<"corei7-avx", [FeatureSSE42, FeatureCMPXCHG16B, FeaturePOPCNT, - FeatureAES, FeatureCLMUL]>; +def : Proc<"corei7-avx", [FeatureAVX, FeatureCMPXCHG16B, FeaturePOPCNT, + FeatureAES, FeaturePCLMUL]>; // Ivy Bridge -def : Proc<"core-avx-i", [FeatureSSE42, FeatureCMPXCHG16B, FeaturePOPCNT, - FeatureAES, FeatureCLMUL, +def : Proc<"core-avx-i", [FeatureAVX, FeatureCMPXCHG16B, FeaturePOPCNT, + FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, FeatureFSGSBase]>; // Haswell -// FIXME: Disabling AVX/AVX2/FMA3 for now since it's not ready. -def : Proc<"core-avx2", [FeatureSSE42, FeatureCMPXCHG16B, FeaturePOPCNT, - FeatureAES, FeatureCLMUL, FeatureRDRAND, +def : Proc<"core-avx2", [FeatureAVX2, FeatureCMPXCHG16B, FeaturePOPCNT, + FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, - FeatureBMI2]>; + FeatureBMI2, FeatureFMA]>; def : Proc<"k6", [FeatureMMX]>; def : Proc<"k6-2", [Feature3DNow]>; @@ -211,21 +212,20 @@ def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, FeatureSlowBTMem]>; def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, FeatureSlowBTMem]>; -def : Proc<"amdfam10", [FeatureSSE3, FeatureSSE4A, +def : Proc<"amdfam10", [FeatureSSE4A, Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, FeatureSlowBTMem]>; // Bobcat def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT]>; -// FIXME: Disabling AVX/FMA4 for now since it's not ready. 
// Bulldozer -def : Proc<"bdver1", [FeatureSSE42, FeatureSSE4A, FeatureCMPXCHG16B, - FeatureAES, FeatureCLMUL, - FeatureXOP, FeatureLZCNT, FeaturePOPCNT]>; +def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, + FeatureAES, FeaturePCLMUL, + FeatureLZCNT, FeaturePOPCNT]>; // Enhanced Bulldozer -def : Proc<"bdver2", [FeatureSSE42, FeatureSSE4A, FeatureCMPXCHG16B, - FeatureAES, FeatureCLMUL, - FeatureXOP, FeatureF16C, FeatureLZCNT, +def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, + FeatureAES, FeaturePCLMUL, + FeatureF16C, FeatureLZCNT, FeaturePOPCNT, FeatureBMI]>; def : Proc<"winchip-c6", [FeatureMMX]>; diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index 7db7ccb..db71e27 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -20,10 +20,10 @@ #include "X86TargetMachine.h" #include "InstPrinter/X86ATTInstPrinter.h" #include "llvm/CallingConv.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" #include "llvm/Module.h" #include "llvm/Type.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Assembly/Writer.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -186,10 +186,14 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO, O << '-' << *MF->getPICBaseSymbol(); break; case X86II::MO_TLSGD: O << "@TLSGD"; break; + case X86II::MO_TLSLD: O << "@TLSLD"; break; + case X86II::MO_TLSLDM: O << "@TLSLDM"; break; case X86II::MO_GOTTPOFF: O << "@GOTTPOFF"; break; case X86II::MO_INDNTPOFF: O << "@INDNTPOFF"; break; case X86II::MO_TPOFF: O << "@TPOFF"; break; + case X86II::MO_DTPOFF: O << "@DTPOFF"; break; case X86II::MO_NTPOFF: O << "@NTPOFF"; break; + case X86II::MO_GOTNTPOFF: O << "@GOTNTPOFF"; break; case X86II::MO_GOTPCREL: O << "@GOTPCREL"; break; case X86II::MO_GOT: O << "@GOT"; break; case X86II::MO_GOTOFF: O << "@GOTOFF"; break; @@ -403,7 +407,9 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const MachineOperand &MO = MI->getOperand(OpNo); switch (ExtraCode[0]) { - default: return true; // Unknown modifier. + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); case 'a': // This is an address. Currently only 'i' and 'r' are expected. if (MO.isImm()) { O << MO.getImm(); diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index d148989..a6d2709 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -29,10 +29,13 @@ def RetCC_X86Common : CallingConv<[ // up in AX and AH, which overlap. Front-ends wishing to conform to the ABI // for functions that return two i8 values are currently expected to pack the // values into an i16 (which uses AX, and thus AL:AH). - CCIfType<[i8] , CCAssignToReg<[AL, DL]>>, - CCIfType<[i16], CCAssignToReg<[AX, DX]>>, - CCIfType<[i32], CCAssignToReg<[EAX, EDX]>>, - CCIfType<[i64], CCAssignToReg<[RAX, RDX]>>, + // + // For code that doesn't care about the ABI, we allow returning more than two + // integer values in registers. + CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>, + CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>, + CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>, + CCIfType<[i64], CCAssignToReg<[RAX, RDX, RCX]>>, // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3 // can only be used by ABI non-compliant code. If the target doesn't have XMM @@ -413,7 +416,7 @@ def CC_X86 : CallingConv<[ // Callee-saved Registers. 
//===----------------------------------------------------------------------===// -def CSR_Ghc : CalleeSavedRegs<(add)>; +def CSR_NoRegs : CalleeSavedRegs<(add)>; def CSR_32 : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>; def CSR_64 : CalleeSavedRegs<(add RBX, R12, R13, R14, R15, RBP)>; diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index ee3de9a..d705049 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -53,12 +53,12 @@ namespace { public: static char ID; explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce) - : MachineFunctionPass(ID), II(0), TD(0), TM(tm), + : MachineFunctionPass(ID), II(0), TD(0), TM(tm), MCE(mce), PICBaseOffset(0), Is64BitMode(false), IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} Emitter(X86TargetMachine &tm, CodeEmitter &mce, const X86InstrInfo &ii, const TargetData &td, bool is64) - : MachineFunctionPass(ID), II(&ii), TD(&td), TM(tm), + : MachineFunctionPass(ID), II(&ii), TD(&td), TM(tm), MCE(mce), PICBaseOffset(0), Is64BitMode(is64), IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} @@ -68,8 +68,20 @@ namespace { return "X86 Machine Code Emitter"; } + void emitOpcodePrefix(uint64_t TSFlags, int MemOperand, + const MachineInstr &MI, + const MCInstrDesc *Desc) const; + + void emitVEXOpcodePrefix(uint64_t TSFlags, int MemOperand, + const MachineInstr &MI, + const MCInstrDesc *Desc) const; + + void emitSegmentOverridePrefix(uint64_t TSFlags, + int MemOperand, + const MachineInstr &MI) const; + void emitInstruction(MachineInstr &MI, const MCInstrDesc *Desc); - + void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequired<MachineModuleInfo>(); @@ -115,17 +127,17 @@ template<class CodeEmitter> bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) { MMI = &getAnalysis<MachineModuleInfo>(); MCE.setModuleInfo(MMI); - + II = TM.getInstrInfo(); TD = TM.getTargetData(); Is64BitMode = TM.getSubtarget<X86Subtarget>().is64Bit(); IsPIC = TM.getRelocationModel() == Reloc::PIC_; - + do { - DEBUG(dbgs() << "JITTing function '" + DEBUG(dbgs() << "JITTing function '" << MF.getFunction()->getName() << "'\n"); MCE.startFunction(MF); - for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); + for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); MBB != E; ++MBB) { MCE.StartMachineBasicBlock(MBB); for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); @@ -149,18 +161,18 @@ bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) { static unsigned determineREX(const MachineInstr &MI) { unsigned REX = 0; const MCInstrDesc &Desc = MI.getDesc(); - + // Pseudo instructions do not need REX prefix byte. if ((Desc.TSFlags & X86II::FormMask) == X86II::Pseudo) return 0; if (Desc.TSFlags & X86II::REX_W) REX |= 1 << 3; - + unsigned NumOps = Desc.getNumOperands(); if (NumOps) { bool isTwoAddr = NumOps > 1 && - Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1; - + Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1; + // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix. unsigned i = isTwoAddr ? 
1 : 0; for (unsigned e = NumOps; i != e; ++i) { @@ -171,7 +183,7 @@ static unsigned determineREX(const MachineInstr &MI) { REX |= 0x40; } } - + switch (Desc.TSFlags & X86II::FormMask) { case X86II::MRMInitReg: if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0))) @@ -362,7 +374,7 @@ void Emitter<CodeEmitter>::emitRegModRMByte(unsigned RegOpcodeFld) { } template<class CodeEmitter> -void Emitter<CodeEmitter>::emitSIBByte(unsigned SS, +void Emitter<CodeEmitter>::emitSIBByte(unsigned SS, unsigned Index, unsigned Base) { // SIB byte is in the same format as the ModRMByte... @@ -378,8 +390,8 @@ void Emitter<CodeEmitter>::emitConstant(uint64_t Val, unsigned Size) { } } -/// isDisp8 - Return true if this signed displacement fits in a 8-bit -/// sign-extended field. +/// isDisp8 - Return true if this signed displacement fits in a 8-bit +/// sign-extended field. static bool isDisp8(int Value) { return Value == (signed char)Value; } @@ -388,10 +400,10 @@ static bool gvNeedsNonLazyPtr(const MachineOperand &GVOp, const TargetMachine &TM) { // For Darwin-64, simulate the linktime GOT by using the same non-lazy-pointer // mechanism as 32-bit mode. - if (TM.getSubtarget<X86Subtarget>().is64Bit() && + if (TM.getSubtarget<X86Subtarget>().is64Bit() && !TM.getSubtarget<X86Subtarget>().isTargetDarwin()) return false; - + // Return true if this is a reference to a stub containing the address of the // global, not the global itself. return isGlobalStubReference(GVOp.getTargetFlags()); @@ -417,7 +429,7 @@ void Emitter<CodeEmitter>::emitDisplacementField(const MachineOperand *RelocOp, if (RelocOp->isGlobal()) { // In 64-bit static small code model, we could potentially emit absolute. // But it's probably not beneficial. If the MCE supports using RIP directly - // do it, otherwise fallback to absolute (this is determined by IsPCRel). + // do it, otherwise fallback to absolute (this is determined by IsPCRel). // 89 05 00 00 00 00 mov %eax,0(%rip) # PC-relative // 89 04 25 00 00 00 00 mov %eax,0x0 # Absolute bool Indirect = gvNeedsNonLazyPtr(*RelocOp, TM); @@ -441,7 +453,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI, const MachineOperand &Op3 = MI.getOperand(Op+3); int DispVal = 0; const MachineOperand *DispForReloc = 0; - + // Figure out what sort of displacement we have to handle here. if (Op3.isGlobal()) { DispForReloc = &Op3; @@ -469,7 +481,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI, const MachineOperand &IndexReg = MI.getOperand(Op+2); unsigned BaseReg = Base.getReg(); - + // Handle %rip relative addressing. if (BaseReg == X86::RIP || (Is64BitMode && DispForReloc)) { // [disp32+RIP] in X86-64 mode @@ -486,7 +498,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI, bool IsPCRel = MCE.earlyResolveAddresses() ? true : false; // Is a SIB byte needed? - // If no BaseReg, issue a RIP relative instruction only if the MCE can + // If no BaseReg, issue a RIP relative instruction only if the MCE can // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table // 2-7) and absolute references. unsigned BaseRegNo = -1U; @@ -494,7 +506,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI, BaseRegNo = X86_MC::getX86RegNum(BaseReg); if (// The SIB byte must be used if there is an index register. - IndexReg.getReg() == 0 && + IndexReg.getReg() == 0 && // The SIB byte must be used if the base is ESP/RSP/R12, all of which // encode to an R/M value of 4, which indicates that a SIB byte is // present. 
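For reference while reading the encodings above, a self-contained illustration of the byte layout that ModRMByte() and emitSIBByte() produce (the helper name here is illustrative only, it is not part of the patch):

  #include <cstdint>
  // ModRM and SIB share a 2-3-3 bit split: mod/scale, reg/index, rm/base.
  static inline uint8_t pack233(unsigned Hi2, unsigned Mid3, unsigned Lo3) {
    return uint8_t((Hi2 << 6) | ((Mid3 & 7) << 3) | (Lo3 & 7));
  }
  // "mov %eax,(%rbx)" is 89 /r with mod=0, reg=EAX(0), rm=RBX(3):
  // pack233(0, 0, 3) == 0x03, so the bytes are 89 03. With a disp8 it
  // becomes mod=1, e.g. "mov %eax,8(%rbx)" -> 89 43 08.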
@@ -508,7 +520,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI, emitDisplacementField(DispForReloc, DispVal, PCAdj, true); return; } - + // If the base is not EBP/ESP and there is no displacement, use simple // indirect register encoding, this handles addresses like [EAX]. The // encoding for [EBP] with no displacement means [disp32] so we handle it @@ -517,20 +529,20 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI, MCE.emitByte(ModRMByte(0, RegOpcodeField, BaseRegNo)); return; } - + // Otherwise, if the displacement fits in a byte, encode as [REG+disp8]. if (!DispForReloc && isDisp8(DispVal)) { MCE.emitByte(ModRMByte(1, RegOpcodeField, BaseRegNo)); emitConstant(DispVal, 1); return; } - + // Otherwise, emit the most general non-SIB encoding: [REG+disp32] MCE.emitByte(ModRMByte(2, RegOpcodeField, BaseRegNo)); emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel); return; } - + // Otherwise we need a SIB byte, so start by outputting the ModR/M byte first. assert(IndexReg.getReg() != X86::ESP && IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!"); @@ -563,7 +575,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI, unsigned SS = SSTable[Scale.getImm()]; if (BaseReg == 0) { - // Handle the SIB byte for the case where there is no base, see Intel + // Handle the SIB byte for the case where there is no base, see Intel // Manual 2A, table 2-7. The displacement has already been output. unsigned IndexRegNo; if (IndexReg.getReg()) @@ -596,94 +608,116 @@ static const MCInstrDesc *UpdateOp(MachineInstr &MI, const X86InstrInfo *II, return Desc; } -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, - const MCInstrDesc *Desc) { - DEBUG(dbgs() << MI); - - // If this is a pseudo instruction, lower it. - switch (Desc->getOpcode()) { - case X86::ADD16rr_DB: Desc = UpdateOp(MI, II, X86::OR16rr); break; - case X86::ADD32rr_DB: Desc = UpdateOp(MI, II, X86::OR32rr); break; - case X86::ADD64rr_DB: Desc = UpdateOp(MI, II, X86::OR64rr); break; - case X86::ADD16ri_DB: Desc = UpdateOp(MI, II, X86::OR16ri); break; - case X86::ADD32ri_DB: Desc = UpdateOp(MI, II, X86::OR32ri); break; - case X86::ADD64ri32_DB: Desc = UpdateOp(MI, II, X86::OR64ri32); break; - case X86::ADD16ri8_DB: Desc = UpdateOp(MI, II, X86::OR16ri8); break; - case X86::ADD32ri8_DB: Desc = UpdateOp(MI, II, X86::OR32ri8); break; - case X86::ADD64ri8_DB: Desc = UpdateOp(MI, II, X86::OR64ri8); break; - case X86::ACQUIRE_MOV8rm: Desc = UpdateOp(MI, II, X86::MOV8rm); break; - case X86::ACQUIRE_MOV16rm: Desc = UpdateOp(MI, II, X86::MOV16rm); break; - case X86::ACQUIRE_MOV32rm: Desc = UpdateOp(MI, II, X86::MOV32rm); break; - case X86::ACQUIRE_MOV64rm: Desc = UpdateOp(MI, II, X86::MOV64rm); break; - case X86::RELEASE_MOV8mr: Desc = UpdateOp(MI, II, X86::MOV8mr); break; - case X86::RELEASE_MOV16mr: Desc = UpdateOp(MI, II, X86::MOV16mr); break; - case X86::RELEASE_MOV32mr: Desc = UpdateOp(MI, II, X86::MOV32mr); break; - case X86::RELEASE_MOV64mr: Desc = UpdateOp(MI, II, X86::MOV64mr); break; - } - +/// Is16BitMemOperand - Return true if the specified instruction has +/// a 16-bit memory operand. Op specifies the operand # of the memoperand. 
+static bool Is16BitMemOperand(const MachineInstr &MI, unsigned Op) { + const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); + const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + + if ((BaseReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) || + (IndexReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg()))) + return true; + return false; +} - MCE.processDebugLoc(MI.getDebugLoc(), true); +/// Is32BitMemOperand - Return true if the specified instruction has +/// a 32-bit memory operand. Op specifies the operand # of the memoperand. +static bool Is32BitMemOperand(const MachineInstr &MI, unsigned Op) { + const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); + const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + + if ((BaseReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) || + (IndexReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg()))) + return true; + return false; +} - unsigned Opcode = Desc->Opcode; +/// Is64BitMemOperand - Return true if the specified instruction has +/// a 64-bit memory operand. Op specifies the operand # of the memoperand. +#ifndef NDEBUG +static bool Is64BitMemOperand(const MachineInstr &MI, unsigned Op) { + const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); + const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + + if ((BaseReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) || + (IndexReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg()))) + return true; + return false; +} +#endif +template<class CodeEmitter> +void Emitter<CodeEmitter>::emitOpcodePrefix(uint64_t TSFlags, + int MemOperand, + const MachineInstr &MI, + const MCInstrDesc *Desc) const { // Emit the lock opcode prefix as needed. if (Desc->TSFlags & X86II::LOCK) MCE.emitByte(0xF0); // Emit segment override opcode prefix as needed. - switch (Desc->TSFlags & X86II::SegOvrMask) { - case X86II::FS: - MCE.emitByte(0x64); - break; - case X86II::GS: - MCE.emitByte(0x65); - break; - default: llvm_unreachable("Invalid segment!"); - case 0: break; // No segment override! - } + emitSegmentOverridePrefix(TSFlags, MemOperand, MI); // Emit the repeat opcode prefix as needed. if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP) MCE.emitByte(0xF3); - // Emit the operand size opcode prefix as needed. - if (Desc->TSFlags & X86II::OpSize) - MCE.emitByte(0x66); - // Emit the address size opcode prefix as needed. - if (Desc->TSFlags & X86II::AdSize) + bool need_address_override; + if (TSFlags & X86II::AdSize) { + need_address_override = true; + } else if (MemOperand == -1) { + need_address_override = false; + } else if (Is64BitMode) { + assert(!Is16BitMemOperand(MI, MemOperand)); + need_address_override = Is32BitMemOperand(MI, MemOperand); + } else { + assert(!Is64BitMemOperand(MI, MemOperand)); + need_address_override = Is16BitMemOperand(MI, MemOperand); + } + + if (need_address_override) MCE.emitByte(0x67); + // Emit the operand size opcode prefix as needed. 
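// (Illustration: in 64-bit mode the 0x67 byte emitted above shrinks the
// address size, so "mov (%eax),%ecx" encodes as 67 8B 08 where
// "mov (%rax),%ecx" is plain 8B 08; the 0x66 byte emitted just below
// selects 16-bit operand size, e.g. "mov %cx,%ax" is 66 89 C8.)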
+ if (TSFlags & X86II::OpSize) + MCE.emitByte(0x66); + bool Need0FPrefix = false; switch (Desc->TSFlags & X86II::Op0Mask) { - case X86II::TB: // Two-byte opcode prefix - case X86II::T8: // 0F 38 - case X86II::TA: // 0F 3A - case X86II::A6: // 0F A6 - case X86II::A7: // 0F A7 - Need0FPrefix = true; - break; - case X86II::REP: break; // already handled. - case X86II::T8XS: // F3 0F 38 - case X86II::XS: // F3 0F - MCE.emitByte(0xF3); - Need0FPrefix = true; - break; - case X86II::T8XD: // F2 0F 38 - case X86II::TAXD: // F2 0F 3A - case X86II::XD: // F2 0F - MCE.emitByte(0xF2); - Need0FPrefix = true; - break; - case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB: - case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF: - MCE.emitByte(0xD8+ - (((Desc->TSFlags & X86II::Op0Mask)-X86II::D8) - >> X86II::Op0Shift)); - break; // Two-byte opcode prefix - default: llvm_unreachable("Invalid prefix!"); - case 0: break; // No prefix! + case X86II::TB: // Two-byte opcode prefix + case X86II::T8: // 0F 38 + case X86II::TA: // 0F 3A + case X86II::A6: // 0F A6 + case X86II::A7: // 0F A7 + Need0FPrefix = true; + break; + case X86II::REP: break; // already handled. + case X86II::T8XS: // F3 0F 38 + case X86II::XS: // F3 0F + MCE.emitByte(0xF3); + Need0FPrefix = true; + break; + case X86II::T8XD: // F2 0F 38 + case X86II::TAXD: // F2 0F 3A + case X86II::XD: // F2 0F + MCE.emitByte(0xF2); + Need0FPrefix = true; + break; + case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB: + case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF: + MCE.emitByte(0xD8+ + (((Desc->TSFlags & X86II::Op0Mask)-X86II::D8) + >> X86II::Op0Shift)); + break; // Two-byte opcode prefix + default: llvm_unreachable("Invalid prefix!"); + case 0: break; // No prefix! } // Handle REX prefix. @@ -697,50 +731,446 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, MCE.emitByte(0x0F); switch (Desc->TSFlags & X86II::Op0Mask) { - case X86II::T8XD: // F2 0F 38 - case X86II::T8XS: // F3 0F 38 - case X86II::T8: // 0F 38 - MCE.emitByte(0x38); - break; - case X86II::TAXD: // F2 0F 38 - case X86II::TA: // 0F 3A - MCE.emitByte(0x3A); - break; - case X86II::A6: // 0F A6 - MCE.emitByte(0xA6); - break; - case X86II::A7: // 0F A7 - MCE.emitByte(0xA7); - break; + case X86II::T8XD: // F2 0F 38 + case X86II::T8XS: // F3 0F 38 + case X86II::T8: // 0F 38 + MCE.emitByte(0x38); + break; + case X86II::TAXD: // F2 0F 38 + case X86II::TA: // 0F 3A + MCE.emitByte(0x3A); + break; + case X86II::A6: // 0F A6 + MCE.emitByte(0xA6); + break; + case X86II::A7: // 0F A7 + MCE.emitByte(0xA7); + break; + } +} + +// On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range +// 0-7 and the difference between the 2 groups is given by the REX prefix. +// In the VEX prefix, registers are seen sequentially from 0-15 and encoded +// in 1's complement form, example: +// +// ModRM field => XMM9 => 1 +// VEX.VVVV => XMM9 => ~9 +// +// See table 4-35 of Intel AVX Programming Reference for details. +static unsigned char getVEXRegisterEncoding(const MachineInstr &MI, + unsigned OpNum) { + unsigned SrcReg = MI.getOperand(OpNum).getReg(); + unsigned SrcRegNum = X86_MC::getX86RegNum(MI.getOperand(OpNum).getReg()); + if (X86II::isX86_64ExtendedReg(SrcReg)) + SrcRegNum |= 8; + + // The registers represented through VEX_VVVV should + // be encoded in 1's complement form.
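// (Illustration of the return below: for XMM9, SrcRegNum == 9 and
// (~9) & 0xf == 0b0110 == 6, which is what lands in VEX.VVVV; a decoder
// inverts the four bits again to recover register 9.)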
+ return (~SrcRegNum) & 0xf; +} + +/// emitSegmentOverridePrefix - Emit segment override opcode prefix as needed +template<class CodeEmitter> +void Emitter<CodeEmitter>::emitSegmentOverridePrefix(uint64_t TSFlags, + int MemOperand, + const MachineInstr &MI) const { + switch (TSFlags & X86II::SegOvrMask) { + default: llvm_unreachable("Invalid segment!"); + case 0: + // No segment override, check for explicit one on memory operand. + if (MemOperand != -1) { // If the instruction has a memory operand. + switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) { + default: llvm_unreachable("Unknown segment register!"); + case 0: break; + case X86::CS: MCE.emitByte(0x2E); break; + case X86::SS: MCE.emitByte(0x36); break; + case X86::DS: MCE.emitByte(0x3E); break; + case X86::ES: MCE.emitByte(0x26); break; + case X86::FS: MCE.emitByte(0x64); break; + case X86::GS: MCE.emitByte(0x65); break; + } + } + break; + case X86II::FS: + MCE.emitByte(0x64); + break; + case X86II::GS: + MCE.emitByte(0x65); + break; + } +} + +template<class CodeEmitter> +void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags, + int MemOperand, + const MachineInstr &MI, + const MCInstrDesc *Desc) const { + bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; + bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3; + + // VEX_R: opcode extension equivalent to REX.R in + // 1's complement (inverted) form + // + // 1: Same as REX_R=0 (must be 1 in 32-bit mode) + // 0: Same as REX_R=1 (64 bit mode only) + // + unsigned char VEX_R = 0x1; + + // VEX_X: equivalent to REX.X, only used when a + // register is used for index in SIB Byte. + // + // 1: Same as REX.X=0 (must be 1 in 32-bit mode) + // 0: Same as REX.X=1 (64-bit mode only) + unsigned char VEX_X = 0x1; + + // VEX_B: + // + // 1: Same as REX_B=0 (ignored in 32-bit mode) + // 0: Same as REX_B=1 (64 bit mode only) + // + unsigned char VEX_B = 0x1; + + // VEX_W: opcode specific (use like REX.W, or used for + // opcode extension, or ignored, depending on the opcode byte) + unsigned char VEX_W = 0; + + // XOP: Use XOP prefix byte 0x8f instead of VEX. + unsigned char XOP = 0; + + // VEX_5M (VEX m-mmmmm field): + // + // 0b00000: Reserved for future use + // 0b00001: implied 0F leading opcode + // 0b00010: implied 0F 38 leading opcode bytes + // 0b00011: implied 0F 3A leading opcode bytes + // 0b00100-0b11111: Reserved for future use + // 0b01000: XOP map select - 08h instructions with imm byte + // 0b01001: XOP map select - 09h instructions with no imm byte + unsigned char VEX_5M = 0x1; + + // VEX_4V (VEX vvvv field): a register specifier + // (in 1's complement form) or 1111 if unused. + unsigned char VEX_4V = 0xf; + + // VEX_L (Vector Length): + // + // 0: scalar or 128-bit vector + // 1: 256-bit vector + // + unsigned char VEX_L = 0; + + // VEX_PP: opcode extension providing equivalent + // functionality of a SIMD prefix + // + // 0b00: None + // 0b01: 66 + // 0b10: F3 + // 0b11: F2 + // + unsigned char VEX_PP = 0; + + // Encode the operand size opcode prefix as needed.
+ if (TSFlags & X86II::OpSize) + VEX_PP = 0x01; + + if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W) + VEX_W = 1; + + if ((TSFlags >> X86II::VEXShift) & X86II::XOP) + XOP = 1; + + if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) + VEX_L = 1; + + switch (TSFlags & X86II::Op0Mask) { + default: llvm_unreachable("Invalid prefix!"); + case X86II::T8: // 0F 38 + VEX_5M = 0x2; + break; + case X86II::TA: // 0F 3A + VEX_5M = 0x3; + break; + case X86II::T8XS: // F3 0F 38 + VEX_PP = 0x2; + VEX_5M = 0x2; + break; + case X86II::T8XD: // F2 0F 38 + VEX_PP = 0x3; + VEX_5M = 0x2; + break; + case X86II::TAXD: // F2 0F 3A + VEX_PP = 0x3; + VEX_5M = 0x3; + break; + case X86II::XS: // F3 0F + VEX_PP = 0x2; + break; + case X86II::XD: // F2 0F + VEX_PP = 0x3; + break; + case X86II::XOP8: + VEX_5M = 0x8; + break; + case X86II::XOP9: + VEX_5M = 0x9; + break; + case X86II::A6: // Bypass: Not used by VEX + case X86II::A7: // Bypass: Not used by VEX + case X86II::TB: // Bypass: Not used by VEX + case 0: + break; // No prefix! + } + + + // Set the vector length to 256-bit if YMM0-YMM15 is used + for (unsigned i = 0; i != MI.getNumOperands(); ++i) { + if (!MI.getOperand(i).isReg()) + continue; + if (MI.getOperand(i).isImplicit()) + continue; + unsigned SrcReg = MI.getOperand(i).getReg(); + if (SrcReg >= X86::YMM0 && SrcReg <= X86::YMM15) + VEX_L = 1; + } + + // Classify VEX_B, VEX_4V, VEX_R, VEX_X + unsigned NumOps = Desc->getNumOperands(); + unsigned CurOp = 0; + if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0) + ++CurOp; + else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) { + assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1); + // Special case for GATHER with 2 TIED_TO operands + // Skip the first 2 operands: dst, mask_wb + CurOp += 2; + } + + switch (TSFlags & X86II::FormMask) { + case X86II::MRMInitReg: + // Duplicate register. 
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_R = 0x0; + + if (HasVEX_4V) + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_B = 0x0; + if (HasVEX_4VOp3) + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + break; + case X86II::MRMDestMem: { + // MRMDestMem instruction forms: + // MemAddr, src1(ModR/M) + // MemAddr, src1(VEX_4V), src2(ModR/M) + // MemAddr, src1(ModR/M), imm8 + // + if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg())) + VEX_B = 0x0; + if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg())) + VEX_X = 0x0; + + CurOp = X86::AddrNumOperands; + if (HasVEX_4V) + VEX_4V = getVEXRegisterEncoding(MI, CurOp++); + + const MachineOperand &MO = MI.getOperand(CurOp); + if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg())) + VEX_R = 0x0; + break; + } + case X86II::MRMSrcMem: + // MRMSrcMem instruction forms: + // src1(ModR/M), MemAddr + // src1(ModR/M), src2(VEX_4V), MemAddr + // src1(ModR/M), MemAddr, imm8 + // src1(ModR/M), MemAddr, src2(VEX_I8IMM) + // + // FMA4: + // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) + // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), + if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) + VEX_R = 0x0; + + if (HasVEX_4V) + VEX_4V = getVEXRegisterEncoding(MI, 1); + + if (X86II::isX86_64ExtendedReg( + MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) + VEX_B = 0x0; + if (X86II::isX86_64ExtendedReg( + MI.getOperand(MemOperand+X86::AddrIndexReg).getReg())) + VEX_X = 0x0; + + if (HasVEX_4VOp3) + VEX_4V = getVEXRegisterEncoding(MI, X86::AddrNumOperands+1); + break; + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: { + // MRM[0-9]m instruction forms: + // MemAddr + // src1(VEX_4V), MemAddr + if (HasVEX_4V) + VEX_4V = getVEXRegisterEncoding(MI, 0); + + if (X86II::isX86_64ExtendedReg( + MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) + VEX_B = 0x0; + if (X86II::isX86_64ExtendedReg( + MI.getOperand(MemOperand+X86::AddrIndexReg).getReg())) + VEX_X = 0x0; + break; + } + case X86II::MRMSrcReg: + // MRMSrcReg instruction forms: + // dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) + // dst(ModR/M), src1(ModR/M) + // dst(ModR/M), src1(ModR/M), imm8 + // + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_R = 0x0; + CurOp++; + + if (HasVEX_4V) + VEX_4V = getVEXRegisterEncoding(MI, CurOp++); + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_B = 0x0; + CurOp++; + if (HasVEX_4VOp3) + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + break; + case X86II::MRMDestReg: + // MRMDestReg instruction forms: + // dst(ModR/M), src(ModR/M) + // dst(ModR/M), src(ModR/M), imm8 + if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) + VEX_B = 0x0; + if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg())) + VEX_R = 0x0; + break; + case X86II::MRM0r: case X86II::MRM1r: + case X86II::MRM2r: case X86II::MRM3r: + case X86II::MRM4r: case X86II::MRM5r: + case X86II::MRM6r: case X86II::MRM7r: + // MRM0r-MRM7r instruction forms: + // dst(VEX_4V), src(ModR/M), imm8 + VEX_4V = getVEXRegisterEncoding(MI, 0); + if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg())) + VEX_B = 0x0; + break; + default: // RawFrm + break; + } + + // Emit segment override opcode prefix as needed.
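// (Worked example of how the fields classified above combine, the bytes
// themselves being emitted a few lines below after the segment override:
// for vaddps %ymm2,%ymm1,%ymm0 we get VEX_R=1, VEX_4V=(~1)&0xf=0b1110,
// VEX_L=1, VEX_PP=0, VEX_5M=1, so the two-byte form is C5 F4, followed by
// opcode 58 and ModRM C2.)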
+ emitSegmentOverridePrefix(TSFlags, MemOperand, MI); + + // VEX opcode prefix can have 2 or 3 bytes + // + // 3 bytes: + // +-----+ +--------------+ +-------------------+ + // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp | + // +-----+ +--------------+ +-------------------+ + // 2 bytes: + // +-----+ +-------------------+ + // | C5h | | R | vvvv | L | pp | + // +-----+ +-------------------+ + // + unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3); + + if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix + MCE.emitByte(0xC5); + MCE.emitByte(LastByte | (VEX_R << 7)); + return; + } + + // 3 byte VEX prefix + MCE.emitByte(XOP ? 0x8F : 0xC4); + MCE.emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M); + MCE.emitByte(LastByte | (VEX_W << 7)); +} + +template<class CodeEmitter> +void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, + const MCInstrDesc *Desc) { + DEBUG(dbgs() << MI); + + // If this is a pseudo instruction, lower it. + switch (Desc->getOpcode()) { + case X86::ADD16rr_DB: Desc = UpdateOp(MI, II, X86::OR16rr); break; + case X86::ADD32rr_DB: Desc = UpdateOp(MI, II, X86::OR32rr); break; + case X86::ADD64rr_DB: Desc = UpdateOp(MI, II, X86::OR64rr); break; + case X86::ADD16ri_DB: Desc = UpdateOp(MI, II, X86::OR16ri); break; + case X86::ADD32ri_DB: Desc = UpdateOp(MI, II, X86::OR32ri); break; + case X86::ADD64ri32_DB: Desc = UpdateOp(MI, II, X86::OR64ri32); break; + case X86::ADD16ri8_DB: Desc = UpdateOp(MI, II, X86::OR16ri8); break; + case X86::ADD32ri8_DB: Desc = UpdateOp(MI, II, X86::OR32ri8); break; + case X86::ADD64ri8_DB: Desc = UpdateOp(MI, II, X86::OR64ri8); break; + case X86::ACQUIRE_MOV8rm: Desc = UpdateOp(MI, II, X86::MOV8rm); break; + case X86::ACQUIRE_MOV16rm: Desc = UpdateOp(MI, II, X86::MOV16rm); break; + case X86::ACQUIRE_MOV32rm: Desc = UpdateOp(MI, II, X86::MOV32rm); break; + case X86::ACQUIRE_MOV64rm: Desc = UpdateOp(MI, II, X86::MOV64rm); break; + case X86::RELEASE_MOV8mr: Desc = UpdateOp(MI, II, X86::MOV8mr); break; + case X86::RELEASE_MOV16mr: Desc = UpdateOp(MI, II, X86::MOV16mr); break; + case X86::RELEASE_MOV32mr: Desc = UpdateOp(MI, II, X86::MOV32mr); break; + case X86::RELEASE_MOV64mr: Desc = UpdateOp(MI, II, X86::MOV64mr); break; } + + MCE.processDebugLoc(MI.getDebugLoc(), true); + + unsigned Opcode = Desc->Opcode; + // If this is a two-address instruction, skip one of the register operands. unsigned NumOps = Desc->getNumOperands(); unsigned CurOp = 0; - if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) != -1) + if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0) ++CurOp; - else if (NumOps > 2 && Desc->getOperandConstraint(NumOps-1,MCOI::TIED_TO)== 0) - // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32 - --NumOps; + else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) { + assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1); + // Special case for GATHER with 2 TIED_TO operands + // Skip the first 2 operands: dst, mask_wb + CurOp += 2; + } + + uint64_t TSFlags = Desc->TSFlags; + + // Is this instruction encoded using the AVX VEX prefix? + bool HasVEXPrefix = (TSFlags >> X86II::VEXShift) & X86II::VEX; + // It uses the VEX.VVVV field? 
+ bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; + bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3; + bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; + const unsigned MemOp4_I8IMMOperand = 2; + + // Determine where the memory operand starts, if present. + int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode); + if (MemoryOperand != -1) MemoryOperand += CurOp; + + if (!HasVEXPrefix) + emitOpcodePrefix(TSFlags, MemoryOperand, MI, Desc); + else + emitVEXOpcodePrefix(TSFlags, MemoryOperand, MI, Desc); unsigned char BaseOpcode = X86II::getBaseOpcodeFor(Desc->TSFlags); - switch (Desc->TSFlags & X86II::FormMask) { + switch (TSFlags & X86II::FormMask) { default: llvm_unreachable("Unknown FormMask value in X86 MachineCodeEmitter!"); case X86II::Pseudo: // Remember the current PC offset, this is the PIC relocation // base address. switch (Opcode) { - default: + default: llvm_unreachable("pseudo instructions should be removed before code" " emission"); - break; // Do nothing for Int_MemBarrier - it's just a comment. Add a debug // to make it slightly easier to see. case X86::Int_MemBarrier: DEBUG(dbgs() << "#MEMBARRIER\n"); break; - + case TargetOpcode::INLINEASM: // We allow inline assembler nodes with empty bodies - they can // implicitly define registers, which is ok for JIT. @@ -752,7 +1182,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, case TargetOpcode::EH_LABEL: MCE.emitLabel(MI.getOperand(0).getMCSymbol()); break; - + case TargetOpcode::IMPLICIT_DEF: case TargetOpcode::KILL: break; @@ -774,7 +1204,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, if (CurOp == NumOps) break; - + const MachineOperand &MO = MI.getOperand(CurOp++); DEBUG(dbgs() << "RawFrm CurOp " << CurOp << "\n"); @@ -787,13 +1217,13 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, emitPCRelativeBlockAddress(MO.getMBB()); break; } - + if (MO.isGlobal()) { emitGlobalAddress(MO.getGlobal(), X86::reloc_pcrel_word, MO.getOffset(), 0); break; } - + if (MO.isSymbol()) { emitExternalSymbolAddress(MO.getSymbolName(), X86::reloc_pcrel_word); break; @@ -804,7 +1234,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, emitJumpTableAddress(MO.getIndex(), X86::reloc_pcrel_word); break; } - + assert(MO.isImm() && "Unknown RawFrm operand!"); if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32) { // Fix up immediate operand for pc relative calls. @@ -815,21 +1245,21 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, emitConstant(MO.getImm(), X86II::getSizeOfImm(Desc->TSFlags)); break; } - + case X86II::AddRegFrm: { MCE.emitByte(BaseOpcode + X86_MC::getX86RegNum(MI.getOperand(CurOp++).getReg())); - + if (CurOp == NumOps) break; - + const MachineOperand &MO1 = MI.getOperand(CurOp++); unsigned Size = X86II::getSizeOfImm(Desc->TSFlags); if (MO1.isImm()) { emitConstant(MO1.getImm(), Size); break; } - + unsigned rt = Is64BitMode ? X86::reloc_pcrel_word : (IsPIC ? 
X86::reloc_picrel_word : X86::reloc_absolute_word); if (Opcode == X86::MOV64ri64i32) @@ -855,46 +1285,57 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, emitRegModRMByte(MI.getOperand(CurOp).getReg(), X86_MC::getX86RegNum(MI.getOperand(CurOp+1).getReg())); CurOp += 2; - if (CurOp != NumOps) - emitConstant(MI.getOperand(CurOp++).getImm(), - X86II::getSizeOfImm(Desc->TSFlags)); break; } case X86II::MRMDestMem: { MCE.emitByte(BaseOpcode); + + unsigned SrcRegNum = CurOp + X86::AddrNumOperands; + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) + SrcRegNum++; emitMemModRMByte(MI, CurOp, - X86_MC::getX86RegNum(MI.getOperand(CurOp + X86::AddrNumOperands) - .getReg())); - CurOp += X86::AddrNumOperands + 1; - if (CurOp != NumOps) - emitConstant(MI.getOperand(CurOp++).getImm(), - X86II::getSizeOfImm(Desc->TSFlags)); + X86_MC::getX86RegNum(MI.getOperand(SrcRegNum).getReg())); + CurOp = SrcRegNum + 1; break; } - case X86II::MRMSrcReg: + case X86II::MRMSrcReg: { MCE.emitByte(BaseOpcode); - emitRegModRMByte(MI.getOperand(CurOp+1).getReg(), + + unsigned SrcRegNum = CurOp+1; + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) + ++SrcRegNum; + + if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM) + ++SrcRegNum; + + emitRegModRMByte(MI.getOperand(SrcRegNum).getReg(), X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg())); - CurOp += 2; - if (CurOp != NumOps) - emitConstant(MI.getOperand(CurOp++).getImm(), - X86II::getSizeOfImm(Desc->TSFlags)); + // 2 operands skipped with HasMemOp4, compensate accordingly + CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1; + if (HasVEX_4VOp3) + ++CurOp; break; - + } case X86II::MRMSrcMem: { int AddrOperands = X86::AddrNumOperands; + unsigned FirstMemOp = CurOp+1; + if (HasVEX_4V) { + ++AddrOperands; + ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). + } + if (HasMemOp4) // Skip second register source (encoded in I8IMM) + ++FirstMemOp; + + MCE.emitByte(BaseOpcode); intptr_t PCAdj = (CurOp + AddrOperands + 1 != NumOps) ? X86II::getSizeOfImm(Desc->TSFlags) : 0; - - MCE.emitByte(BaseOpcode); - emitMemModRMByte(MI, CurOp+1, + emitMemModRMByte(MI, FirstMemOp, X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg()),PCAdj); CurOp += AddrOperands + 1; - if (CurOp != NumOps) - emitConstant(MI.getOperand(CurOp++).getImm(), - X86II::getSizeOfImm(Desc->TSFlags)); + if (HasVEX_4VOp3) + ++CurOp; break; } @@ -902,20 +1343,22 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, case X86II::MRM2r: case X86II::MRM3r: case X86II::MRM4r: case X86II::MRM5r: case X86II::MRM6r: case X86II::MRM7r: { + if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). + ++CurOp; MCE.emitByte(BaseOpcode); emitRegModRMByte(MI.getOperand(CurOp++).getReg(), (Desc->TSFlags & X86II::FormMask)-X86II::MRM0r); if (CurOp == NumOps) break; - + const MachineOperand &MO1 = MI.getOperand(CurOp++); unsigned Size = X86II::getSizeOfImm(Desc->TSFlags); if (MO1.isImm()) { emitConstant(MO1.getImm(), Size); break; } - + unsigned rt = Is64BitMode ? X86::reloc_pcrel_word : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word); if (Opcode == X86::MOV64ri32) @@ -937,8 +1380,10 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, case X86II::MRM2m: case X86II::MRM3m: case X86II::MRM4m: case X86II::MRM5m: case X86II::MRM6m: case X86II::MRM7m: { + if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). + ++CurOp; intptr_t PCAdj = (CurOp + X86::AddrNumOperands != NumOps) ? 
- (MI.getOperand(CurOp+X86::AddrNumOperands).isImm() ? + (MI.getOperand(CurOp+X86::AddrNumOperands).isImm() ? X86II::getSizeOfImm(Desc->TSFlags) : 4) : 0; MCE.emitByte(BaseOpcode); @@ -948,14 +1393,14 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, if (CurOp == NumOps) break; - + const MachineOperand &MO = MI.getOperand(CurOp++); unsigned Size = X86II::getSizeOfImm(Desc->TSFlags); if (MO.isImm()) { emitConstant(MO.getImm(), Size); break; } - + unsigned rt = Is64BitMode ? X86::reloc_pcrel_word : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word); if (Opcode == X86::MOV64mi32) @@ -980,7 +1425,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg())); ++CurOp; break; - + case X86II::MRM_C1: MCE.emitByte(BaseOpcode); MCE.emitByte(0xC1); @@ -1003,6 +1448,33 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, break; } + while (CurOp != NumOps && NumOps - CurOp <= 2) { + // The last source register of a 4 operand instruction in AVX is encoded + // in bits[7:4] of a immediate byte. + if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) { + const MachineOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand + : CurOp); + ++CurOp; + unsigned RegNum = X86_MC::getX86RegNum(MO.getReg()) << 4; + if (X86II::isX86_64ExtendedReg(MO.getReg())) + RegNum |= 1 << 7; + // If there is an additional 5th operand it must be an immediate, which + // is encoded in bits[3:0] + if (CurOp != NumOps) { + const MachineOperand &MIMM = MI.getOperand(CurOp++); + if (MIMM.isImm()) { + unsigned Val = MIMM.getImm(); + assert(Val < 16 && "Immediate operand value out of range"); + RegNum |= Val; + } + } + emitConstant(RegNum, 1); + } else { + emitConstant(MI.getOperand(CurOp++).getImm(), + X86II::getSizeOfImm(Desc->TSFlags)); + } + } + if (!MI.isVariadic() && CurOp != NumOps) { #ifndef NDEBUG dbgs() << "Cannot encode all operands of: " << MI << "\n"; diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 69752c5..585b7a5 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -183,37 +183,37 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, case MVT::i1: case MVT::i8: Opc = X86::MOV8rm; - RC = X86::GR8RegisterClass; + RC = &X86::GR8RegClass; break; case MVT::i16: Opc = X86::MOV16rm; - RC = X86::GR16RegisterClass; + RC = &X86::GR16RegClass; break; case MVT::i32: Opc = X86::MOV32rm; - RC = X86::GR32RegisterClass; + RC = &X86::GR32RegClass; break; case MVT::i64: // Must be in x86-64 mode. Opc = X86::MOV64rm; - RC = X86::GR64RegisterClass; + RC = &X86::GR64RegClass; break; case MVT::f32: if (X86ScalarSSEf32) { Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm; - RC = X86::FR32RegisterClass; + RC = &X86::FR32RegClass; } else { Opc = X86::LD_Fp32m; - RC = X86::RFP32RegisterClass; + RC = &X86::RFP32RegClass; } break; case MVT::f64: if (X86ScalarSSEf64) { Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm; - RC = X86::FR64RegisterClass; + RC = &X86::FR64RegClass; } else { Opc = X86::LD_Fp64m; - RC = X86::RFP64RegisterClass; + RC = &X86::RFP64RegClass; } break; case MVT::f80: @@ -240,7 +240,7 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) { default: return false; case MVT::i1: { // Mask out all but lowest bit. 
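// (Illustration: the AND8ri emitted below keeps only bit 0, so an i1 that
// arrives with stray upper bits, say 0xFE, is stored as 0x00, while 0x03
// is stored as 0x01.)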
- unsigned AndResult = createResultReg(X86::GR8RegisterClass); + unsigned AndResult = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::AND8ri), AndResult).addReg(Val).addImm(1); Val = AndResult; @@ -547,13 +547,13 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { if (TLI.getPointerTy() == MVT::i64) { Opc = X86::MOV64rm; - RC = X86::GR64RegisterClass; + RC = &X86::GR64RegClass; if (Subtarget->isPICStyleRIPRel()) StubAM.Base.Reg = X86::RIP; } else { Opc = X86::MOV32rm; - RC = X86::GR32RegisterClass; + RC = &X86::GR32RegClass; } LoadReg = createResultReg(RC); @@ -743,7 +743,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ValLocs; CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, - I->getContext()); + I->getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); const Value *RV = Ret->getOperand(0); @@ -1258,7 +1258,7 @@ bool X86FastISel::X86SelectFPExt(const Instruction *I) { if (V->getType()->isFloatTy()) { unsigned OpReg = getRegForValue(V); if (OpReg == 0) return false; - unsigned ResultReg = createResultReg(X86::FR64RegisterClass); + unsigned ResultReg = createResultReg(&X86::FR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::CVTSS2SDrr), ResultReg) .addReg(OpReg); @@ -1277,7 +1277,7 @@ bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { if (V->getType()->isDoubleTy()) { unsigned OpReg = getRegForValue(V); if (OpReg == 0) return false; - unsigned ResultReg = createResultReg(X86::FR32RegisterClass); + unsigned ResultReg = createResultReg(&X86::FR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::CVTSD2SSrr), ResultReg) .addReg(OpReg); @@ -1314,8 +1314,9 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) { if (!Subtarget->is64Bit()) { // If we're on x86-32; we can't extract an i8 from a general register. // First issue a copy to GR16_ABCD or GR32_ABCD. - const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16) - ? X86::GR16_ABCDRegisterClass : X86::GR32_ABCDRegisterClass; + const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16) ? + (const TargetRegisterClass*)&X86::GR16_ABCDRegClass : + (const TargetRegisterClass*)&X86::GR32_ABCDRegClass; unsigned CopyReg = createResultReg(CopyRC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), CopyReg).addReg(InputReg); @@ -1423,7 +1424,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { return DoSelectCall(&I, "memset"); } case Intrinsic::stackprotector: { - // Emit code inline code to store the stack guard onto the stack. + // Emit code to store the stack guard onto the stack. EVT PtrTy = TLI.getPointerTy(); const Value *Op1 = I.getArgOperand(0); // The guard's value. @@ -1484,7 +1485,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { return false; // The call to CreateRegs builds two sequential registers, to store the - // both the the returned values. + // both the returned values. unsigned ResultReg = FuncInfo.CreateRegs(I.getType()); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpC), ResultReg) .addReg(Reg1).addReg(Reg2); @@ -1548,12 +1549,11 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { // Check whether the function can return without sret-demotion. 
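// (If it cannot, i.e. the value does not fit in the return registers and
// would have to be demoted to a hidden sret pointer argument, fast-isel
// returns false below and leaves the call to the SelectionDAG path.)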
SmallVector<ISD::OutputArg, 4> Outs; - SmallVector<uint64_t, 4> Offsets; GetReturnInfo(I->getType(), CS.getAttributes().getRetAttributes(), - Outs, TLI, &Offsets); + Outs, TLI); bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(), - *FuncInfo.MF, FTy->isVarArg(), - Outs, FTy->getContext()); + *FuncInfo.MF, FTy->isVarArg(), + Outs, FTy->getContext()); if (!CanLowerReturn) return false; @@ -1667,7 +1667,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, ArgLocs, - I->getParent()->getContext()); + I->getParent()->getContext()); // Allocate shadow area for Win64 if (Subtarget->isTargetWin64()) @@ -1693,7 +1693,6 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { // Promote the value if needed. switch (VA.getLocInfo()) { - default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: { assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && @@ -1737,6 +1736,14 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { ArgVT = VA.getLocVT(); break; } + case CCValAssign::VExt: + // VExt has not been implemented, so this should be impossible to reach + // for now. However, fallback to Selection DAG isel once implemented. + return false; + case CCValAssign::Indirect: + // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully + // support this. + return false; } if (VA.isRegLoc()) { @@ -1838,25 +1845,27 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { MIB.addGlobalAddress(GV, 0, OpFlags); } + // Add a register mask with the call-preserved registers. + // Proper defs for return values will be added by setPhysRegsDeadExcept(). + MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv())); + // Add an implicit use GOT pointer in EBX. if (Subtarget->isPICStyleGOT()) - MIB.addReg(X86::EBX); + MIB.addReg(X86::EBX, RegState::Implicit); if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) - MIB.addReg(X86::AL); + MIB.addReg(X86::AL, RegState::Implicit); // Add implicit physical register uses to the call. for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) - MIB.addReg(RegArgs[i]); - - // Add a register mask with the call-preserved registers. - // Proper defs for return values will be added by setPhysRegsDeadExcept(). 
- MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv())); + MIB.addReg(RegArgs[i], RegState::Implicit); // Issue CALLSEQ_END unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); unsigned NumBytesCallee = 0; if (!Subtarget->is64Bit() && !Subtarget->isTargetWindows() && + !(CS.getCallingConv() == CallingConv::Fast || + CS.getCallingConv() == CallingConv::GHC) && CS.paramHasAttr(1, Attribute::StructRet)) NumBytesCallee = 4; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackUp)) @@ -1889,7 +1898,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { SmallVector<unsigned, 4> UsedRegs; SmallVector<CCValAssign, 16> RVLocs; CCState CCRetInfo(CC, false, *FuncInfo.MF, TM, RVLocs, - I->getParent()->getContext()); + I->getParent()->getContext()); unsigned ResultReg = FuncInfo.CreateRegs(I->getType()); CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0; i != RVLocs.size(); ++i) { @@ -1903,7 +1912,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { RVLocs[i].getLocReg() == X86::ST1)) { if (isScalarFPTypeInSSEReg(RVLocs[i].getValVT())) { CopyVT = MVT::f80; - CopyReg = createResultReg(X86::RFP80RegisterClass); + CopyReg = createResultReg(&X86::RFP80RegClass); } BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::FpPOP_RETVAL), CopyReg); @@ -2001,37 +2010,37 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { default: return false; case MVT::i8: Opc = X86::MOV8rm; - RC = X86::GR8RegisterClass; + RC = &X86::GR8RegClass; break; case MVT::i16: Opc = X86::MOV16rm; - RC = X86::GR16RegisterClass; + RC = &X86::GR16RegClass; break; case MVT::i32: Opc = X86::MOV32rm; - RC = X86::GR32RegisterClass; + RC = &X86::GR32RegClass; break; case MVT::i64: // Must be in x86-64 mode. Opc = X86::MOV64rm; - RC = X86::GR64RegisterClass; + RC = &X86::GR64RegClass; break; case MVT::f32: if (X86ScalarSSEf32) { Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm; - RC = X86::FR32RegisterClass; + RC = &X86::FR32RegClass; } else { Opc = X86::LD_Fp32m; - RC = X86::RFP32RegisterClass; + RC = &X86::RFP32RegClass; } break; case MVT::f64: if (X86ScalarSSEf64) { Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm; - RC = X86::FR64RegisterClass; + RC = &X86::FR64RegClass; } else { Opc = X86::LD_Fp64m; - RC = X86::RFP64RegisterClass; + RC = &X86::RFP64RegClass; } break; case MVT::f80: @@ -2124,19 +2133,19 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { case MVT::f32: if (X86ScalarSSEf32) { Opc = X86::FsFLD0SS; - RC = X86::FR32RegisterClass; + RC = &X86::FR32RegClass; } else { Opc = X86::LD_Fp032; - RC = X86::RFP32RegisterClass; + RC = &X86::RFP32RegClass; } break; case MVT::f64: if (X86ScalarSSEf64) { Opc = X86::FsFLD0SD; - RC = X86::FR64RegisterClass; + RC = &X86::FR64RegClass; } else { Opc = X86::LD_Fp064; - RC = X86::RFP64RegisterClass; + RC = &X86::RFP64RegClass; } break; case MVT::f80: diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index ed1707d..711ee41 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -130,7 +130,7 @@ namespace { // The hardware keeps track of how many FP registers are live, so we have // to model that exactly. Usually, each live register corresponds to an // FP<n> register, but when dealing with calls, returns, and inline - // assembly, it is sometimes neccesary to have live scratch registers. + // assembly, it is sometimes necessary to have live scratch registers. 
unsigned Stack[8]; // FP<n> Registers in each stack slot... unsigned StackTop; // The current top of the FP stack. diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 000e375..2238688 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -45,14 +45,14 @@ bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { bool X86FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineModuleInfo &MMI = MF.getMMI(); - const TargetRegisterInfo *RI = TM.getRegisterInfo(); + const TargetRegisterInfo *RegInfo = TM.getRegisterInfo(); return (MF.getTarget().Options.DisableFramePointerElim(MF) || - RI->needsStackRealignment(MF) || + RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || - MMI.callsUnwindInit()); + MMI.callsUnwindInit() || MMI.callsEHReturn()); } static unsigned getSUBriOpcode(unsigned is64Bit, int64_t Imm) { @@ -125,8 +125,8 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, unsigned Reg = MO.getReg(); if (!Reg) continue; - for (const uint16_t *AsI = TRI.getOverlaps(Reg); *AsI; ++AsI) - Uses.insert(*AsI); + for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI) + Uses.insert(*AI); } const uint16_t *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit; @@ -369,7 +369,7 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF, /// getCompactUnwindRegNum - Get the compact unwind number for a given /// register. The number corresponds to the enum lists in /// compact_unwind_encoding.h. -static int getCompactUnwindRegNum(const unsigned *CURegs, unsigned Reg) { +static int getCompactUnwindRegNum(const uint16_t *CURegs, unsigned Reg) { for (int Idx = 1; *CURegs; ++CURegs, ++Idx) if (*CURegs == Reg) return Idx; @@ -398,13 +398,13 @@ encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], // 4 3 // 5 3 // - static const unsigned CU32BitRegs[] = { + static const uint16_t CU32BitRegs[] = { X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 }; - static const unsigned CU64BitRegs[] = { + static const uint16_t CU64BitRegs[] = { X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 }; - const unsigned *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs); + const uint16_t *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs); for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) { int CUReg = getCompactUnwindRegNum(CURegs, SavedRegs[i]); @@ -466,13 +466,13 @@ encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], static uint32_t encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], bool Is64Bit) { - static const unsigned CU32BitRegs[] = { + static const uint16_t CU32BitRegs[] = { X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 }; - static const unsigned CU64BitRegs[] = { + static const uint16_t CU64BitRegs[] = { X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 }; - const unsigned *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs); + const uint16_t *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs); // Encode the registers in the order they were saved, 3-bits per register. The // registers are numbered from 1 to CU_NUM_SAVED_REGS. 
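A standalone sketch of the packing that comment describes (CU_NUM_SAVED_REGS is 6 here; the bit order is illustrative of the scheme, not a byte-for-byte copy of the function):

  #include <cstdint>
  // Pack six 3-bit compact-unwind register indices (1..6, 0 = none).
  static uint32_t packSavedRegs(const int Idx[6]) {
    uint32_t Encoding = 0;
    for (int i = 0; i != 6; ++i)
      Encoding = (Encoding << 3) | uint32_t(Idx[i] & 7);
    return Encoding;
  }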
@@ -650,6 +650,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { unsigned SlotSize = RegInfo->getSlotSize(); unsigned FramePtr = RegInfo->getFrameRegister(MF); unsigned StackPtr = RegInfo->getStackRegister(); + unsigned BasePtr = RegInfo->getBaseRegister(); DebugLoc DL; // If we're forcing a stack realignment we can't rely on just the frame @@ -721,10 +722,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { if (HasFP) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; - if (RegInfo->needsStackRealignment(MF)) - FrameSize = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign; - - NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize(); + if (RegInfo->needsStackRealignment(MF)) { + // Callee-saved registers are pushed on stack before the stack + // is realigned. + FrameSize -= X86FI->getCalleeSavedFrameSize(); + NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign; + } else { + NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize(); + } // Get the offset of the stack slot for the EBP register, which is // guaranteed to be the last slot by processFunctionBeforeFrameFinalized. @@ -781,19 +786,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end(); I != E; ++I) I->addLiveIn(FramePtr); - - // Realign stack - if (RegInfo->needsStackRealignment(MF)) { - MachineInstr *MI = - BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri), StackPtr) - .addReg(StackPtr) - .addImm(-MaxAlign) - .setMIFlag(MachineInstr::FrameSetup); - - // The EFLAGS implicit def is dead. - MI->getOperand(3).setIsDead(); - } } else { NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); } @@ -823,6 +815,27 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { } } + // Realign stack after we pushed callee-saved registers (so that we'll be + // able to calculate their offsets from the frame pointer). + + // NOTE: We push the registers before realigning the stack, so + // vector callee-saved (xmm) registers may be saved w/o proper + // alignment in this way. However, currently these regs are saved in + // stack slots (see X86FrameLowering::spillCalleeSavedRegisters()), so + // this shouldn't be a problem. + if (RegInfo->needsStackRealignment(MF)) { + assert(HasFP && "There should be a frame pointer if stack is realigned."); + MachineInstr *MI = + BuildMI(MBB, MBBI, DL, + TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri), StackPtr) + .addReg(StackPtr) + .addImm(-MaxAlign) + .setMIFlag(MachineInstr::FrameSetup); + + // The EFLAGS implicit def is dead. + MI->getOperand(3).setIsDead(); + } + DL = MBB.findDebugLoc(MBBI); // If there is an SUB32ri of ESP immediately before this instruction, merge @@ -913,6 +926,18 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, UseLEA, TII, *RegInfo); + // If we need a base pointer, set it up here. It's whatever the value + // of the stack pointer is at this point. Any variable size objects + // will be allocated after this, so we can still use the base pointer + // to reference locals. + if (RegInfo->hasBasePointer(MF)) { + // Update the frame pointer with the current stack pointer. + unsigned Opc = Is64Bit ? 
X86::MOV64rr : X86::MOV32rr; + BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + } + if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) { // Mark end of stack pointer adjustment. MCSymbol *Label = MMI.getContext().CreateTempSymbol(); @@ -997,10 +1022,14 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (hasFP(MF)) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; - if (RegInfo->needsStackRealignment(MF)) - FrameSize = (FrameSize + MaxAlign - 1)/MaxAlign*MaxAlign; - - NumBytes = FrameSize - CSSize; + if (RegInfo->needsStackRealignment(MF)) { + // Callee-saved registers were pushed on stack before the stack + // was realigned. + FrameSize -= CSSize; + NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign; + } else { + NumBytes = FrameSize - CSSize; + } // Pop EBP. BuildMI(MBB, MBBI, DL, @@ -1010,7 +1039,6 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } // Skip the callee-saved pop instructions. - MachineBasicBlock::iterator LastCSPop = MBBI; while (MBBI != MBB.begin()) { MachineBasicBlock::iterator PI = prior(MBBI); unsigned Opc = PI->getOpcode(); @@ -1021,6 +1049,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, --MBBI; } + MachineBasicBlock::iterator FirstCSPop = MBBI; DL = MBBI->getDebugLoc(); @@ -1032,28 +1061,16 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // If dynamic alloca is used, then reset esp to point to the last callee-saved // slot before popping them off! Same applies for the case, when stack was // realigned. - if (RegInfo->needsStackRealignment(MF)) { - // We cannot use LEA here, because stack pointer was realigned. We need to - // deallocate local frame back. - if (CSSize) { - emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, UseLEA, TII, - *RegInfo); - MBBI = prior(LastCSPop); - } - - BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), - StackPtr).addReg(FramePtr); - } else if (MFI->hasVarSizedObjects()) { - if (CSSize) { - unsigned Opc = Is64Bit ? X86::LEA64r : X86::LEA32r; - MachineInstr *MI = - addRegOffset(BuildMI(MF, DL, TII.get(Opc), StackPtr), - FramePtr, false, -CSSize); - MBB.insert(MBBI, MI); + if (RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) { + if (RegInfo->needsStackRealignment(MF)) + MBBI = FirstCSPop; + if (CSSize != 0) { + unsigned Opc = getLEArOpcode(Is64Bit); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), + FramePtr, false, -CSSize); } else { - BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), StackPtr) + unsigned Opc = (Is64Bit ? X86::MOV64rr : X86::MOV32rr); + BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(FramePtr); } } else if (NumBytes) { @@ -1124,8 +1141,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } MachineInstr *NewMI = prior(MBBI); - for (unsigned i = 2, e = MBBI->getNumOperands(); i != e; ++i) - NewMI->addOperand(MBBI->getOperand(i)); + NewMI->copyImplicitOps(MBBI); // Delete the pseudo instruction TCRETURN. 
MBB.erase(MBBI); @@ -1142,16 +1158,25 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const { - const X86RegisterInfo *RI = + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo()); const MachineFrameInfo *MFI = MF.getFrameInfo(); int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); uint64_t StackSize = MFI->getStackSize(); - if (RI->needsStackRealignment(MF)) { + if (RegInfo->hasBasePointer(MF)) { + assert (hasFP(MF) && "VLAs and dynamic stack realign, but no FP?!"); if (FI < 0) { // Skip the saved EBP. - Offset += RI->getSlotSize(); + return Offset + RegInfo->getSlotSize(); + } else { + assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); + return Offset + StackSize; + } + } else if (RegInfo->needsStackRealignment(MF)) { + if (FI < 0) { + // Skip the saved EBP. + return Offset + RegInfo->getSlotSize(); } else { assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); return Offset + StackSize; @@ -1162,7 +1187,7 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) con return Offset + StackSize; // Skip the saved EBP. - Offset += RI->getSlotSize(); + Offset += RegInfo->getSlotSize(); // Skip the RETADDR move area const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); @@ -1174,6 +1199,22 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) con return Offset; } +int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const { + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo()); + // We can't calculate offset from frame pointer if the stack is realigned, + // so enforce usage of stack/base pointer. The base pointer is used when we + // have dynamic allocas in addition to dynamic realignment. + if (RegInfo->hasBasePointer(MF)) + FrameReg = RegInfo->getBaseRegister(); + else if (RegInfo->needsStackRealignment(MF)) + FrameReg = RegInfo->getStackRegister(); + else + FrameReg = RegInfo->getFrameRegister(MF); + return getFrameIndexOffset(MF, FI); +} + bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, @@ -1307,6 +1348,10 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, "Slot for EBP register must be last in order to be found!"); (void)FrameIdx; } + + // Spill the BasePtr if it's used. 
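Aside (illustration, not part of the patch): getFrameIndexReference, added above, decides which register frame-index offsets are measured against. The three-way choice in miniature, with hypothetical names:

    #include <iostream>

    enum class Reg { BasePtr, StackPtr, FramePtr };

    // Sketch of the selection in X86FrameLowering::getFrameIndexReference:
    // the base pointer wins when dynamic allocas coexist with realignment
    // (both SP and FP move), the stack pointer when realignment alone makes
    // FP-relative offsets unknown, and the frame pointer otherwise.
    Reg pickFrameReg(bool HasBasePtr, bool NeedsRealign) {
      if (HasBasePtr)   return Reg::BasePtr;
      if (NeedsRealign) return Reg::StackPtr;
      return Reg::FramePtr;
    }

    int main() {
      std::cout << (pickFrameReg(true, true)  == Reg::BasePtr)  << '\n'; // 1
      std::cout << (pickFrameReg(false, true) == Reg::StackPtr) << '\n'; // 1
    }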
+ if (RegInfo->hasBasePointer(MF)) + MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister()); } static bool diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index d55a497..dc515dc 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -60,6 +60,8 @@ public: bool hasReservedCallFrame(const MachineFunction &MF) const; int getFrameIndexOffset(const MachineFunction &MF, int FI) const; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const; uint32_t getCompactUnwindEncoding(MachineFunction &MF) const; }; diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 8e2b1d6..5186482 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -187,6 +187,7 @@ namespace { private: SDNode *Select(SDNode *N); + SDNode *SelectGather(SDNode *N, unsigned Opc); SDNode *SelectAtomic64(SDNode *Node, unsigned Opc); SDNode *SelectAtomicLoadAdd(SDNode *Node, EVT NVT); SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT); @@ -1905,6 +1906,20 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, ChainCheck = true; continue; } + + // Make sure using Op as part of the chain would not cause a cycle here. + // In theory, we could check whether the chain node is a predecessor of + // the load. But that can be very expensive. Instead visit the uses and + // make sure they all have smaller node id than the load. + int LoadId = LoadNode->getNodeId(); + for (SDNode::use_iterator UI = Op.getNode()->use_begin(), + UE = UI->use_end(); UI != UE; ++UI) { + if (UI.getUse().getResNo() != 0) + continue; + if (UI->getNodeId() > LoadId) + return false; + } + ChainOps.push_back(Op); } @@ -1938,6 +1953,38 @@ static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) { llvm_unreachable("unrecognized size for LdVT"); } +/// SelectGather - Customized ISel for GATHER operations. +/// +SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) { + // Operands of Gather: VSrc, Base, VIdx, VMask, Scale + SDValue Chain = Node->getOperand(0); + SDValue VSrc = Node->getOperand(2); + SDValue Base = Node->getOperand(3); + SDValue VIdx = Node->getOperand(4); + SDValue VMask = Node->getOperand(5); + ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6)); + if (!Scale) + return 0; + + SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(), + MVT::Other); + + // Memory Operands: Base, Scale, Index, Disp, Segment + SDValue Disp = CurDAG->getTargetConstant(0, MVT::i32); + SDValue Segment = CurDAG->getRegister(0, MVT::i32); + const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue()), VIdx, + Disp, Segment, VMask, Chain}; + SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(), + VTs, Ops, array_lengthof(Ops)); + // Node has 2 outputs: VDst and MVT::Other. + // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other. + // We replace VDst of Node with VDst of ResNode, and Other of Node with Other + // of ResNode. 
+ ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0)); + ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2)); + return ResNode; +} + SDNode *X86DAGToDAGISel::Select(SDNode *Node) { EVT NVT = Node->getValueType(0); unsigned Opc, MOpc; @@ -1953,23 +2000,82 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { switch (Opcode) { default: break; + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: break; + case Intrinsic::x86_avx2_gather_d_pd: + case Intrinsic::x86_avx2_gather_d_pd_256: + case Intrinsic::x86_avx2_gather_q_pd: + case Intrinsic::x86_avx2_gather_q_pd_256: + case Intrinsic::x86_avx2_gather_d_ps: + case Intrinsic::x86_avx2_gather_d_ps_256: + case Intrinsic::x86_avx2_gather_q_ps: + case Intrinsic::x86_avx2_gather_q_ps_256: + case Intrinsic::x86_avx2_gather_d_q: + case Intrinsic::x86_avx2_gather_d_q_256: + case Intrinsic::x86_avx2_gather_q_q: + case Intrinsic::x86_avx2_gather_q_q_256: + case Intrinsic::x86_avx2_gather_d_d: + case Intrinsic::x86_avx2_gather_d_d_256: + case Intrinsic::x86_avx2_gather_q_d: + case Intrinsic::x86_avx2_gather_q_d_256: { + unsigned Opc; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); + case Intrinsic::x86_avx2_gather_d_pd: Opc = X86::VGATHERDPDrm; break; + case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break; + case Intrinsic::x86_avx2_gather_q_pd: Opc = X86::VGATHERQPDrm; break; + case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break; + case Intrinsic::x86_avx2_gather_d_ps: Opc = X86::VGATHERDPSrm; break; + case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break; + case Intrinsic::x86_avx2_gather_q_ps: Opc = X86::VGATHERQPSrm; break; + case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break; + case Intrinsic::x86_avx2_gather_d_q: Opc = X86::VPGATHERDQrm; break; + case Intrinsic::x86_avx2_gather_d_q_256: Opc = X86::VPGATHERDQYrm; break; + case Intrinsic::x86_avx2_gather_q_q: Opc = X86::VPGATHERQQrm; break; + case Intrinsic::x86_avx2_gather_q_q_256: Opc = X86::VPGATHERQQYrm; break; + case Intrinsic::x86_avx2_gather_d_d: Opc = X86::VPGATHERDDrm; break; + case Intrinsic::x86_avx2_gather_d_d_256: Opc = X86::VPGATHERDDYrm; break; + case Intrinsic::x86_avx2_gather_q_d: Opc = X86::VPGATHERQDrm; break; + case Intrinsic::x86_avx2_gather_q_d_256: Opc = X86::VPGATHERQDYrm; break; + } + SDNode *RetVal = SelectGather(Node, Opc); + if (RetVal) + // We already called ReplaceUses inside SelectGather. 
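Aside (illustration, not part of the patch): the isLoadIncOrDecStore hunk earlier in this file sidesteps an expensive predecessor walk with a node-id comparison: ids are assigned in topological order, so a user of the chain operand numbered after the load could transitively depend on it, and folding is rejected. A toy version of the test (types invented; the real code also skips uses of non-chain results):

    #include <iostream>
    #include <vector>

    struct ToyNode {
      int Id;                         // topological id, as assigned before ISel
      std::vector<ToyNode*> Users;
    };

    // Conservative cycle check: refuse to fold if any user of the candidate
    // chain operand was numbered after the load.
    bool safeToFold(const ToyNode &ChainOp, const ToyNode &Load) {
      for (const ToyNode *U : ChainOp.Users)
        if (U->Id > Load.Id)
          return false;
      return true;
    }

    int main() {
      ToyNode Load{10, {}}, Early{5, {}}, Late{12, {}};
      ToyNode Op{3, {&Early}};
      std::cout << safeToFold(Op, Load) << '\n'; // 1: every user precedes the load
      Op.Users.push_back(&Late);
      std::cout << safeToFold(Op, Load) << '\n'; // 0: a later user may form a cycle
    }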
+ return NULL; + break; + } + } + break; + } case X86ISD::GlobalBaseReg: return getGlobalBaseReg(); + case X86ISD::ATOMOR64_DAG: - return SelectAtomic64(Node, X86::ATOMOR6432); case X86ISD::ATOMXOR64_DAG: - return SelectAtomic64(Node, X86::ATOMXOR6432); case X86ISD::ATOMADD64_DAG: - return SelectAtomic64(Node, X86::ATOMADD6432); case X86ISD::ATOMSUB64_DAG: - return SelectAtomic64(Node, X86::ATOMSUB6432); case X86ISD::ATOMNAND64_DAG: - return SelectAtomic64(Node, X86::ATOMNAND6432); case X86ISD::ATOMAND64_DAG: - return SelectAtomic64(Node, X86::ATOMAND6432); - case X86ISD::ATOMSWAP64_DAG: - return SelectAtomic64(Node, X86::ATOMSWAP6432); + case X86ISD::ATOMSWAP64_DAG: { + unsigned Opc; + switch (Opcode) { + default: llvm_unreachable("Impossible intrinsic"); + case X86ISD::ATOMOR64_DAG: Opc = X86::ATOMOR6432; break; + case X86ISD::ATOMXOR64_DAG: Opc = X86::ATOMXOR6432; break; + case X86ISD::ATOMADD64_DAG: Opc = X86::ATOMADD6432; break; + case X86ISD::ATOMSUB64_DAG: Opc = X86::ATOMSUB6432; break; + case X86ISD::ATOMNAND64_DAG: Opc = X86::ATOMNAND6432; break; + case X86ISD::ATOMAND64_DAG: Opc = X86::ATOMAND6432; break; + case X86ISD::ATOMSWAP64_DAG: Opc = X86::ATOMSWAP6432; break; + } + SDNode *RetVal = SelectAtomic64(Node, Opc); + if (RetVal) + return RetVal; + break; + } case ISD::ATOMIC_LOAD_ADD: { SDNode *RetVal = SelectAtomicLoadAdd(Node, NVT); @@ -2128,7 +2234,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, - N0, SDValue()).getValue(1); + N0, SDValue()).getValue(1); if (foldedLoad) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), @@ -2168,7 +2274,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // Copy the low half of the result, if it is needed. if (!SDValue(Node, 0).use_empty()) { SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, - LoReg, NVT, InFlag); + LoReg, NVT, InFlag); InFlag = Result.getValue(2); ReplaceUses(SDValue(Node, 0), Result); DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 3042386..b88f2fa 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -63,41 +63,33 @@ static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, /// simple subregister reference. Idx is an index in the 128 bits we /// want. It need not be aligned to a 128-bit bounday. That makes /// lowering EXTRACT_VECTOR_ELT operations easier. -static SDValue Extract128BitVector(SDValue Vec, - SDValue Idx, - SelectionDAG &DAG, - DebugLoc dl) { +static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, DebugLoc dl) { EVT VT = Vec.getValueType(); assert(VT.getSizeInBits() == 256 && "Unexpected vector size!"); EVT ElVT = VT.getVectorElementType(); - int Factor = VT.getSizeInBits()/128; + unsigned Factor = VT.getSizeInBits()/128; EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, VT.getVectorNumElements()/Factor); // Extract from UNDEF is UNDEF. if (Vec.getOpcode() == ISD::UNDEF) - return DAG.getNode(ISD::UNDEF, dl, ResultVT); - - if (isa<ConstantSDNode>(Idx)) { - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + return DAG.getUNDEF(ResultVT); - // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR - // we can match to VEXTRACTF128. - unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits(); + // Extract the relevant 128 bits. 
Generate an EXTRACT_SUBVECTOR + // we can match to VEXTRACTF128. + unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits(); - // This is the index of the first element of the 128-bit chunk - // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128) - * ElemsPerChunk); + // This is the index of the first element of the 128-bit chunk + // we want. + unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128) + * ElemsPerChunk); - SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); - SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, - VecIdx); + SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); + SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, + VecIdx); - return Result; - } - - return SDValue(); + return Result; } /// Generate a DAG to put 128-bits into a vector > 128 bits. This @@ -105,34 +97,41 @@ static SDValue Extract128BitVector(SDValue Vec, /// simple superregister reference. Idx is an index in the 128 bits /// we want. It need not be aligned to a 128-bit bounday. That makes /// lowering INSERT_VECTOR_ELT operations easier. -static SDValue Insert128BitVector(SDValue Result, - SDValue Vec, - SDValue Idx, - SelectionDAG &DAG, +static SDValue Insert128BitVector(SDValue Result, SDValue Vec, + unsigned IdxVal, SelectionDAG &DAG, DebugLoc dl) { - if (isa<ConstantSDNode>(Idx)) { - EVT VT = Vec.getValueType(); - assert(VT.getSizeInBits() == 128 && "Unexpected vector size!"); + // Inserting UNDEF is Result + if (Vec.getOpcode() == ISD::UNDEF) + return Result; - EVT ElVT = VT.getVectorElementType(); - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - EVT ResultVT = Result.getValueType(); + EVT VT = Vec.getValueType(); + assert(VT.getSizeInBits() == 128 && "Unexpected vector size!"); - // Insert the relevant 128 bits. - unsigned ElemsPerChunk = 128/ElVT.getSizeInBits(); + EVT ElVT = VT.getVectorElementType(); + EVT ResultVT = Result.getValueType(); - // This is the index of the first element of the 128-bit chunk - // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128) - * ElemsPerChunk); + // Insert the relevant 128 bits. + unsigned ElemsPerChunk = 128/ElVT.getSizeInBits(); - SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); - Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, - VecIdx); - return Result; - } + // This is the index of the first element of the 128-bit chunk + // we want. + unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128) + * ElemsPerChunk); - return SDValue(); + SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, + VecIdx); +} + +/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 +/// instructions. This is used because creating CONCAT_VECTOR nodes of +/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower +/// large BUILD_VECTORS. 
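Aside (illustration, not part of the patch): both refactored helpers above reduce an arbitrary element index to the first element of the 128-bit chunk containing it, the only granularity VEXTRACTF128/VINSERTF128 can address. The arithmetic, standalone:

    #include <iostream>

    // First element of the 128-bit chunk holding element IdxVal of a vector
    // whose elements are EltBits wide (the NormalizedIdxVal computation).
    unsigned normalizedChunkIndex(unsigned IdxVal, unsigned EltBits) {
      unsigned ElemsPerChunk = 128 / EltBits;
      return (IdxVal * EltBits / 128) * ElemsPerChunk;
    }

    int main() {
      std::cout << normalizedChunkIndex(5, 32) << '\n'; // v8f32: 5 -> 4
      std::cout << normalizedChunkIndex(3, 64) << '\n'; // v4i64: 3 -> 2
    }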
+static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, + unsigned NumElems, SelectionDAG &DAG, + DebugLoc dl) { + SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); + return Insert128BitVector(V, V2, NumElems/2, DAG, dl); } static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { @@ -141,10 +140,12 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { if (Subtarget->isTargetEnvMacho()) { if (is64Bit) - return new X8664_MachoTargetObjectFile(); + return new X86_64MachoTargetObjectFile(); return new TargetLoweringObjectFileMachO(); } + if (Subtarget->isTargetLinux()) + return new X86LinuxTargetObjectFile(); if (Subtarget->isTargetELF()) return new TargetLoweringObjectFileELF(); if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) @@ -163,7 +164,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) TD = getTargetData(); // Set up the TargetLowering object. - static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; + static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; // X86 is weird, it always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); @@ -172,11 +173,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // For 64-bit since we have so many registers use the ILP scheduler, for // 32-bit code use the register pressure specific scheduling. - // For 32 bit Atom, use Hybrid (register pressure + latency) scheduling. - if (Subtarget->is64Bit()) + // For Atom, always use ILP scheduling. + if (Subtarget->isAtom()) + setSchedulingPreference(Sched::ILP); + else if (Subtarget->is64Bit()) setSchedulingPreference(Sched::ILP); - else if (Subtarget->isAtom()) - setSchedulingPreference(Sched::Hybrid); else setSchedulingPreference(Sched::RegPressure); setStackPointerRegisterToSaveRestore(X86StackPtr); @@ -216,11 +217,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } // Set up the register classes. - addRegisterClass(MVT::i8, X86::GR8RegisterClass); - addRegisterClass(MVT::i16, X86::GR16RegisterClass); - addRegisterClass(MVT::i32, X86::GR32RegisterClass); + addRegisterClass(MVT::i8, &X86::GR8RegClass); + addRegisterClass(MVT::i16, &X86::GR16RegClass); + addRegisterClass(MVT::i32, &X86::GR32RegClass); if (Subtarget->is64Bit()) - addRegisterClass(MVT::i64, X86::GR64RegisterClass); + addRegisterClass(MVT::i64, &X86::GR64RegClass); setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); @@ -346,7 +347,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // (low) operations are left as Legal, as there are single-result // instructions for this in x86. Using the two-result multiply instructions // when both high and low results are needed must be arranged by dagcombine. - for (unsigned i = 0, e = 4; i != e; ++i) { + for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { MVT VT = IntVTs[i]; setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); @@ -493,7 +494,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setShouldFoldAtomicFences(true); // Expand certain atomics - for (unsigned i = 0, e = 4; i != e; ++i) { + for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { MVT VT = IntVTs[i]; setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); @@ -568,8 +569,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) { // f32 and f64 use SSE. // Set up the FP register classes. 
- addRegisterClass(MVT::f32, X86::FR32RegisterClass); - addRegisterClass(MVT::f64, X86::FR64RegisterClass); + addRegisterClass(MVT::f32, &X86::FR32RegClass); + addRegisterClass(MVT::f64, &X86::FR64RegClass); // Use ANDPD to simulate FABS. setOperationAction(ISD::FABS , MVT::f64, Custom); @@ -600,8 +601,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. - addRegisterClass(MVT::f32, X86::FR32RegisterClass); - addRegisterClass(MVT::f64, X86::RFP64RegisterClass); + addRegisterClass(MVT::f32, &X86::FR32RegClass); + addRegisterClass(MVT::f64, &X86::RFP64RegClass); // Use ANDPS to simulate FABS. setOperationAction(ISD::FABS , MVT::f32, Custom); @@ -633,8 +634,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } else if (!TM.Options.UseSoftFloat) { // f32 and f64 in x87. // Set up the FP register classes. - addRegisterClass(MVT::f64, X86::RFP64RegisterClass); - addRegisterClass(MVT::f32, X86::RFP32RegisterClass); + addRegisterClass(MVT::f64, &X86::RFP64RegClass); + addRegisterClass(MVT::f32, &X86::RFP32RegClass); setOperationAction(ISD::UNDEF, MVT::f64, Expand); setOperationAction(ISD::UNDEF, MVT::f32, Expand); @@ -661,7 +662,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // Long double always uses X87. if (!TM.Options.UseSoftFloat) { - addRegisterClass(MVT::f80, X86::RFP80RegisterClass); + addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); { @@ -706,8 +707,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively // turn on ones that can be effectively codegen'd. - for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { + for (int VT = MVT::FIRST_VECTOR_VALUETYPE; + VT <= MVT::LAST_VECTOR_VALUETYPE; ++VT) { setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand); @@ -765,8 +766,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::VSELECT, (MVT::SimpleValueType)VT, Expand); - for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) + for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE; + InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) setTruncStoreAction((MVT::SimpleValueType)VT, (MVT::SimpleValueType)InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); @@ -777,7 +778,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) { - addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass); + addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); // No operations on x86mmx supported, everything uses intrinsics. 
} @@ -814,7 +815,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) { - addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); + addRegisterClass(MVT::v4f32, &X86::VR128RegClass); setOperationAction(ISD::FADD, MVT::v4f32, Legal); setOperationAction(ISD::FSUB, MVT::v4f32, Legal); @@ -831,14 +832,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) { - addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); + addRegisterClass(MVT::v2f64, &X86::VR128RegClass); // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM // registers cannot be used even for integer operations. - addRegisterClass(MVT::v16i8, X86::VR128RegisterClass); - addRegisterClass(MVT::v8i16, X86::VR128RegisterClass); - addRegisterClass(MVT::v4i32, X86::VR128RegisterClass); - addRegisterClass(MVT::v2i64, X86::VR128RegisterClass); + addRegisterClass(MVT::v16i8, &X86::VR128RegClass); + addRegisterClass(MVT::v8i16, &X86::VR128RegClass); + addRegisterClass(MVT::v4i32, &X86::VR128RegClass); + addRegisterClass(MVT::v2i64, &X86::VR128RegClass); setOperationAction(ISD::ADD, MVT::v16i8, Legal); setOperationAction(ISD::ADD, MVT::v8i16, Legal); @@ -875,7 +876,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); // Custom lower build_vector, vector_shuffle, and extract_vector_elt. - for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) { + for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { EVT VT = (MVT::SimpleValueType)i; // Do not attempt to custom lower non-power-of-2 vectors if (!isPowerOf2_32(VT.getVectorNumElements())) @@ -904,7 +905,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. - for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) { + for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; EVT VT = SVT; @@ -1012,12 +1013,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SETCC, MVT::v2i64, Custom); if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) { - addRegisterClass(MVT::v32i8, X86::VR256RegisterClass); - addRegisterClass(MVT::v16i16, X86::VR256RegisterClass); - addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); - addRegisterClass(MVT::v8f32, X86::VR256RegisterClass); - addRegisterClass(MVT::v4i64, X86::VR256RegisterClass); - addRegisterClass(MVT::v4f64, X86::VR256RegisterClass); + addRegisterClass(MVT::v32i8, &X86::VR256RegClass); + addRegisterClass(MVT::v16i16, &X86::VR256RegClass); + addRegisterClass(MVT::v8i32, &X86::VR256RegClass); + addRegisterClass(MVT::v8f32, &X86::VR256RegClass); + addRegisterClass(MVT::v4i64, &X86::VR256RegClass); + addRegisterClass(MVT::v4f64, &X86::VR256RegClass); setOperationAction(ISD::LOAD, MVT::v8f32, Legal); setOperationAction(ISD::LOAD, MVT::v4f64, Legal); @@ -1122,8 +1123,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } // Custom lower several nodes for 256-bit types. 
- for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { + for (int i = MVT::FIRST_VECTOR_VALUETYPE; + i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; EVT VT = SVT; @@ -1145,7 +1146,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. - for (unsigned i = (unsigned)MVT::v32i8; i != (unsigned)MVT::v4i64; ++i) { + for (int i = MVT::v32i8; i != MVT::v4i64; ++i) { MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; EVT VT = SVT; @@ -1168,14 +1169,15 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion // of this type with custom code. - for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) { + for (int VT = MVT::FIRST_VECTOR_VALUETYPE; + VT != MVT::LAST_VECTOR_VALUETYPE; VT++) { setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Custom); } // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't @@ -1223,13 +1225,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::TRUNCATE); + setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SINT_TO_FP); + setTargetDAGCombine(ISD::SETCC); + setTargetDAGCombine(ISD::FP_TO_SINT); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); - if (Subtarget->hasBMI()) - setTargetDAGCombine(ISD::XOR); + setTargetDAGCombine(ISD::XOR); computeRegisterProperties(); @@ -1244,6 +1249,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setPrefLoopAlignment(4); // 2^4 bytes. benefitFromCodePlacementOpt = true; + // Predictable cmov don't hurt on atom because it's in-order. + predictableSelectIsExpensive = !Subtarget->isAtom(); + setPrefFunctionAlignment(4); // 2^4 bytes. } @@ -1277,7 +1285,6 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { break; } } - return; } /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate @@ -1412,18 +1419,19 @@ X86TargetLowering::findRepresentativeClass(EVT VT) const{ default: return TargetLowering::findRepresentativeClass(VT); case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: - RRC = (Subtarget->is64Bit() - ? X86::GR64RegisterClass : X86::GR32RegisterClass); + RRC = Subtarget->is64Bit() ? 
+ (const TargetRegisterClass*)&X86::GR64RegClass : + (const TargetRegisterClass*)&X86::GR32RegClass; break; case MVT::x86mmx: - RRC = X86::VR64RegisterClass; + RRC = &X86::VR64RegClass; break; case MVT::f32: case MVT::f64: case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: - RRC = X86::VR128RegisterClass; + RRC = &X86::VR128RegClass; break; } return std::make_pair(RRC, Cost); @@ -1458,7 +1466,7 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, bool X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, - MachineFunction &MF, bool isVarArg, + MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { SmallVector<CCValAssign, 16> RVLocs; @@ -1502,6 +1510,16 @@ X86TargetLowering::LowerReturn(SDValue Chain, SDValue ValToCopy = OutVals[i]; EVT ValVT = ValToCopy.getValueType(); + // Promote values to the appropriate types + if (VA.getLocInfo() == CCValAssign::SExt) + ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); + else if (VA.getLocInfo() == CCValAssign::ZExt) + ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); + else if (VA.getLocInfo() == CCValAssign::AExt) + ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); + else if (VA.getLocInfo() == CCValAssign::BCvt) + ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy); + // If this is x86-64, and we disabled SSE, we can't return FP values, // or SSE or MMX vectors. if ((ValVT == MVT::f32 || ValVT == MVT::f64 || @@ -1639,7 +1657,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SmallVector<CCValAssign, 16> RVLocs; bool Is64Bit = Subtarget->is64Bit(); CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. @@ -1656,7 +1674,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SDValue Val; // If this is a call to a function that returns an fp value on the floating - // point stack, we must guarantee the the value is popped from the stack, so + // point stack, we must guarantee the value is popped from the stack, so // a CopyFromReg is not good enough - the copy instruction may be eliminated // if the return value is not used. We use the FpPOP_RETVAL instruction // instead. 
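Aside (illustration, not part of the patch): the LowerReturn hunk above promotes each outgoing value to its assigned location type before copying it into the return register. The dispatch on CCValAssign::LocInfo, modeled with a placeholder enum:

    #include <iostream>

    enum class LocInfo { Full, SExt, ZExt, AExt, BCvt };

    // Which node LowerReturn now wraps the value in for each location kind.
    const char *promotionFor(LocInfo LI) {
      switch (LI) {
      case LocInfo::SExt: return "ISD::SIGN_EXTEND";
      case LocInfo::ZExt: return "ISD::ZERO_EXTEND";
      case LocInfo::AExt: return "ISD::ANY_EXTEND";
      case LocInfo::BCvt: return "ISD::BITCAST";
      case LocInfo::Full: break;        // already the right type
      }
      return "<no promotion>";
    }

    int main() {
      std::cout << promotionFor(LocInfo::ZExt) << '\n'; // ISD::ZERO_EXTEND
    }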
@@ -1851,19 +1869,19 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, EVT RegVT = VA.getLocVT(); const TargetRegisterClass *RC; if (RegVT == MVT::i32) - RC = X86::GR32RegisterClass; + RC = &X86::GR32RegClass; else if (Is64Bit && RegVT == MVT::i64) - RC = X86::GR64RegisterClass; + RC = &X86::GR64RegClass; else if (RegVT == MVT::f32) - RC = X86::FR32RegisterClass; + RC = &X86::FR32RegClass; else if (RegVT == MVT::f64) - RC = X86::FR64RegisterClass; + RC = &X86::FR64RegClass; else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) - RC = X86::VR256RegisterClass; + RC = &X86::VR256RegClass; else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) - RC = X86::VR128RegisterClass; + RC = &X86::VR128RegClass; else if (RegVT == MVT::x86mmx) - RC = X86::VR64RegisterClass; + RC = &X86::VR64RegClass; else llvm_unreachable("Unknown argument type!"); @@ -2005,7 +2023,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, DAG.getIntPtrConstant(Offset)); unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], - X86::GR64RegisterClass); + &X86::GR64RegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, @@ -2021,7 +2039,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SmallVector<SDValue, 11> SaveXMMOps; SaveXMMOps.push_back(Chain); - unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); + unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); SaveXMMOps.push_back(ALVal); @@ -2032,7 +2050,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], - X86::VR128RegisterClass); + &X86::VR128RegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); SaveXMMOps.push_back(Val); } @@ -2128,14 +2146,19 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, } SDValue -X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, +X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + CallingConv::ID CallConv = CLI.CallConv; + bool &isTailCall = CLI.IsTailCall; + bool isVarArg = CLI.IsVarArg; + MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isTargetWin64(); @@ -2283,27 +2306,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOpChains[0], MemOpChains.size()); - // Build a sequence of copy-to-reg nodes chained together with token chain - // and flag operands which copy the outgoing args into registers. - SDValue InFlag; - // Tail call byval lowering might overwrite argument registers so in case of - // tail call optimization the copies to registers are lowered later. 
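Aside (illustration, not part of the patch): the hunk above collapses LowerCall's long parameter list into a single TargetLowering::CallLoweringInfo argument, unpacked by reference at the top of the function. The shape of that refactor in miniature (field set abbreviated, names invented):

    #include <iostream>
    #include <string>
    #include <vector>

    // Miniature CallLoweringInfo: one struct carries what used to be nine
    // parameters; mutable state (IsTailCall) is written back through it.
    struct MiniCLI {
      std::string Callee;
      std::vector<int> Outs;
      bool IsVarArg;
      bool IsTailCall;
    };

    void lowerCall(MiniCLI &CLI) {
      bool &isTailCall = CLI.IsTailCall;  // same aliasing style as the hunk
      if (CLI.IsVarArg)
        isTailCall = false;               // a stand-in rule for illustration
    }

    int main() {
      MiniCLI CLI{"f", {1, 2, 3}, true, true};
      lowerCall(CLI);
      std::cout << CLI.IsTailCall << '\n'; // 0
    }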
- if (!isTailCall) - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, - RegsToPass[i].second, InFlag); - InFlag = Chain.getValue(1); - } - if (Subtarget->isPICStyleGOT()) { // ELF / PIC requires GOT in the EBX register before function calls via PLT // GOT pointer. if (!isTailCall) { - Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, - DAG.getNode(X86ISD::GlobalBaseReg, - DebugLoc(), getPointerTy()), - InFlag); - InFlag = Chain.getValue(1); + RegsToPass.push_back(std::make_pair(unsigned(X86::EBX), + DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()))); } else { // If we are tail calling and generating PIC/GOT style code load the // address of the callee into ECX. The value in ecx is used as target of @@ -2341,12 +2349,10 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, assert((Subtarget->hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); - Chain = DAG.getCopyToReg(Chain, dl, X86::AL, - DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); - InFlag = Chain.getValue(1); + RegsToPass.push_back(std::make_pair(unsigned(X86::AL), + DAG.getConstant(NumXMMRegs, MVT::i8))); } - // For tail calls lower the arguments to the 'real' stack slot. if (isTailCall) { // Force all the incoming stack arguments to be loaded from the stack @@ -2360,8 +2366,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, SmallVector<SDValue, 8> MemOpChains2; SDValue FIN; int FI = 0; - // Do not flag preceding copytoreg stuff together with the following stuff. - InFlag = SDValue(); if (getTargetMachine().Options.GuaranteedTailCallOpt) { for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -2401,19 +2405,20 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOpChains2[0], MemOpChains2.size()); - // Copy arguments to their registers. - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, - RegsToPass[i].second, InFlag); - InFlag = Chain.getValue(1); - } - InFlag =SDValue(); - // Store the return address to the appropriate stack slot. Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, FPDiff, dl); } + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into registers. + SDValue InFlag; + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + if (getTargetMachine().getCodeModel() == CodeModel::Large) { assert(Is64Bit && "Large code model is only legal in 64-bit mode."); // In the 64-bit large code model, we have to make all calls @@ -2515,14 +2520,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); - // Add an implicit use GOT pointer in EBX. - if (!isTailCall && Subtarget->isPICStyleGOT()) - Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); - - // Add an implicit use of AL for non-Windows x86 64-bit vararg functions. - if (Is64Bit && isVarArg && !IsWin64) - Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); - // Add a register mask operand representing the call-preserved registers. 
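Aside (illustration, not part of the patch): the surrounding hunks stop emitting ad-hoc glued CopyToReg nodes for EBX (the PIC GOT pointer) and AL (the x86-64 vararg XMM count); both now ride in RegsToPass so that a single loop, placed after the tail-call stack stores, emits every glued copy. A toy model of that one-pass glue chain:

    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    // Each (reg, value) pair is copied in order, every copy glued to the
    // previous one, mirroring the unified RegsToPass loop.
    void emitCopies(const std::vector<std::pair<std::string, int>> &RegsToPass) {
      int Glue = 0;                      // stands in for the threaded InFlag
      for (const auto &RV : RegsToPass)
        std::cout << "copy " << RV.second << " -> " << RV.first
                  << " (glue " << Glue++ << ")\n";
    }

    int main() {
      emitCopies({{"EDI", 1}, {"EBX", 2 /* GOT */}, {"AL", 3 /* #XMM */}});
    }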
const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); @@ -2744,7 +2741,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_X86); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) @@ -2765,7 +2762,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (Unused) { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; @@ -2779,12 +2776,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (!CCMatch) { SmallVector<CCValAssign, 16> RVLocs1; CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs1, *DAG.getContext()); + getTargetMachine(), RVLocs1, *DAG.getContext()); CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); SmallVector<CCValAssign, 16> RVLocs2; CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs2, *DAG.getContext()); + getTargetMachine(), RVLocs2, *DAG.getContext()); CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); if (RVLocs1.size() != RVLocs2.size()) @@ -2811,7 +2808,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // argument is passed on the stack. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (Subtarget->isTargetWin64()) { @@ -2912,6 +2909,7 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::UNPCKH: case X86ISD::VPERMILP: case X86ISD::VPERM2X128: + case X86ISD::VPERMI: return true; } } @@ -3052,10 +3050,12 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, // X > -1 -> X == 0, jump !sign. RHS = DAG.getConstant(0, RHS.getValueType()); return X86::COND_NS; - } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { + } + if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { // X < 0 -> X == 0, jump on sign. return X86::COND_S; - } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { + } + if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { // X < 1 -> X <= 0 RHS = DAG.getConstant(0, RHS.getValueType()); return X86::COND_LE; @@ -3171,12 +3171,12 @@ static bool isUndefOrEqual(int Val, int CmpVal) { return false; } -/// isSequentialOrUndefInRange - Return true if every element in Mask, begining +/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (L, L+Pos]. or is undef. 
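Aside (illustration, not part of the patch): the predicate whose doc comment is corrected above is small enough to state standalone; -1 plays the role of an undef mask element, exactly as in the shuffle-mask checks that follow:

    #include <iostream>
    #include <vector>

    // True if Mask[Pos..Pos+Size) is Low, Low+1, ... in order, allowing
    // undef (-1) anywhere — the property the PSHUF* matchers rely on.
    bool isSequentialOrUndefInRange(const std::vector<int> &Mask,
                                    unsigned Pos, unsigned Size, int Low) {
      for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
        if (Mask[i] != -1 && Mask[i] != Low)
          return false;
      return true;
    }

    int main() {
      std::vector<int> M = {0, -1, 2, 3, 5, 4, -1, -1};
      std::cout << isSequentialOrUndefInRange(M, 0, 4, 0) << '\n'; // 1
      std::cout << isSequentialOrUndefInRange(M, 4, 4, 4) << '\n'; // 0 (5 != 4)
    }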
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, - int Pos, int Size, int Low) { - for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low) + unsigned Pos, unsigned Size, int Low) { + for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low) if (!isUndefOrEqual(Mask[i], Low)) return false; return true; @@ -3195,8 +3195,8 @@ static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) { /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFHW. -static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) { - if (VT != MVT::v8i16) +static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) { + if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16)) return false; // Lower quadword copied in order or undef. @@ -3205,16 +3205,27 @@ static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) { // Upper quadword shuffled. for (unsigned i = 4; i != 8; ++i) - if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) + if (!isUndefOrInRange(Mask[i], 4, 8)) return false; + if (VT == MVT::v16i16) { + // Lower quadword copied in order or undef. + if (!isSequentialOrUndefInRange(Mask, 8, 4, 8)) + return false; + + // Upper quadword shuffled. + for (unsigned i = 12; i != 16; ++i) + if (!isUndefOrInRange(Mask[i], 12, 16)) + return false; + } + return true; } /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFLW. -static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) { - if (VT != MVT::v8i16) +static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) { + if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16)) return false; // Upper quadword copied in order. @@ -3223,9 +3234,20 @@ static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) { // Lower quadword shuffled. for (unsigned i = 0; i != 4; ++i) - if (Mask[i] >= 4) + if (!isUndefOrInRange(Mask[i], 0, 4)) + return false; + + if (VT == MVT::v16i16) { + // Upper quadword copied in order. + if (!isSequentialOrUndefInRange(Mask, 12, 4, 12)) return false; + // Lower quadword shuffled. + for (unsigned i = 8; i != 12; ++i) + if (!isUndefOrInRange(Mask[i], 8, 12)) + return false; + } + return true; } @@ -3419,11 +3441,11 @@ static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) { if (NumElems != 2 && NumElems != 4) return false; - for (unsigned i = 0; i != NumElems/2; ++i) + for (unsigned i = 0, e = NumElems/2; i != e; ++i) if (!isUndefOrEqual(Mask[i], i + NumElems)) return false; - for (unsigned i = NumElems/2; i != NumElems; ++i) + for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; @@ -3439,17 +3461,63 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) { || VT.getSizeInBits() > 128) return false; - for (unsigned i = 0; i != NumElems/2; ++i) + for (unsigned i = 0, e = NumElems/2; i != e; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; - for (unsigned i = 0; i != NumElems/2; ++i) - if (!isUndefOrEqual(Mask[i + NumElems/2], i + NumElems)) + for (unsigned i = 0, e = NumElems/2; i != e; ++i) + if (!isUndefOrEqual(Mask[i + e], i + NumElems)) return false; return true; } +// +// Some special combinations that can be optimized. +// +static +SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, + SelectionDAG &DAG) { + EVT VT = SVOp->getValueType(0); + DebugLoc dl = SVOp->getDebugLoc(); + + if (VT != MVT::v8i32 && VT != MVT::v8f32) + return SDValue(); + + ArrayRef<int> Mask = SVOp->getMask(); + + // These are the special masks that may be optimized. 
+ static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14}; + static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15}; + bool MatchEvenMask = true; + bool MatchOddMask = true; + for (int i=0; i<8; ++i) { + if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i])) + MatchEvenMask = false; + if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i])) + MatchOddMask = false; + } + static const int CompactionMaskEven[] = {0, 2, -1, -1, 4, 6, -1, -1}; + static const int CompactionMaskOdd [] = {1, 3, -1, -1, 5, 7, -1, -1}; + + const int *CompactionMask; + if (MatchEvenMask) + CompactionMask = CompactionMaskEven; + else if (MatchOddMask) + CompactionMask = CompactionMaskOdd; + else + return SDValue(); + + SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT); + + SDValue Op0 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(0), + UndefNode, CompactionMask); + SDValue Op1 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(1), + UndefNode, CompactionMask); + static const int UnpackMask[] = {0, 8, 1, 9, 4, 12, 5, 13}; + return DAG.getVectorShuffle(VT, dl, Op0, Op1, UnpackMask); +} + /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKL. static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT, @@ -3881,9 +3949,8 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { for (unsigned i = 0; i != NumElts; ++i) { int Elt = N->getMaskElt(i); if (Elt < 0) continue; - Elt %= NumLaneElts; - unsigned ShAmt = i << Shift; - if (ShAmt >= 8) ShAmt -= 8; + Elt &= NumLaneElts - 1; + unsigned ShAmt = (i << Shift) % 8; Mask |= Elt << ShAmt; } @@ -3893,30 +3960,48 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { + EVT VT = N->getValueType(0); + + assert((VT == MVT::v8i16 || VT == MVT::v16i16) && + "Unsupported vector type for PSHUFHW"); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned Mask = 0; - // 8 nodes, but we only care about the last 4. - for (unsigned i = 7; i >= 4; --i) { - int Val = N->getMaskElt(i); - if (Val >= 0) - Mask |= (Val - 4); - if (i != 4) - Mask <<= 2; + for (unsigned l = 0; l != NumElts; l += 8) { + // 8 nodes per lane, but we only care about the last 4. + for (unsigned i = 0; i < 4; ++i) { + int Elt = N->getMaskElt(l+i+4); + if (Elt < 0) continue; + Elt &= 0x3; // only 2-bits. + Mask |= Elt << (i * 2); + } } + return Mask; } /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { + EVT VT = N->getValueType(0); + + assert((VT == MVT::v8i16 || VT == MVT::v16i16) && + "Unsupported vector type for PSHUFHW"); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned Mask = 0; - // 8 nodes, but we only care about the first 4. - for (int i = 3; i >= 0; --i) { - int Val = N->getMaskElt(i); - if (Val >= 0) - Mask |= Val; - if (i != 0) - Mask <<= 2; + for (unsigned l = 0; l != NumElts; l += 8) { + // 8 nodes per lane, but we only care about the first 4. 
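Aside (illustration, not part of the patch): the immediate builders above now loop per 8-element lane, so one 8-bit immediate describes both halves of a v16i16 (PSHUFHW/PSHUFLW under AVX2 apply it to each 128-bit lane independently). The high-half encoding, standalone:

    #include <iostream>
    #include <vector>

    // PSHUFHW immediate: in each 8-element lane, elements 4..7 contribute two
    // bits apiece; undef (-1) contributes nothing. The is*Mask checks ensure
    // all lanes agree, so ORing across lanes is sound.
    unsigned pshufhwImmediate(const std::vector<int> &Mask) {
      unsigned Imm = 0;
      for (unsigned l = 0, e = Mask.size(); l != e; l += 8)
        for (unsigned i = 0; i != 4; ++i) {
          int Elt = Mask[l + i + 4];
          if (Elt < 0) continue;
          Imm |= (Elt & 0x3) << (i * 2);
        }
      return Imm;
    }

    int main() {
      // Identity high half {4,5,6,7} encodes as 0b11100100 = 0xE4.
      std::cout << std::hex << pshufhwImmediate({0,1,2,3,4,5,6,7}) << '\n'; // e4
    }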
+ for (unsigned i = 0; i < 4; ++i) { + int Elt = N->getMaskElt(l+i); + if (Elt < 0) continue; + Elt &= 0x3; // only 2-bits + Mask |= Elt << (i * 2); + } } + return Mask; } @@ -4017,13 +4102,14 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, SmallVector<int, 8> MaskVec; for (unsigned i = 0; i != NumElems; ++i) { - int idx = SVOp->getMaskElt(i); - if (idx < 0) - MaskVec.push_back(idx); - else if (idx < (int)NumElems) - MaskVec.push_back(idx + NumElems); - else - MaskVec.push_back(idx - NumElems); + int Idx = SVOp->getMaskElt(i); + if (Idx >= 0) { + if (Idx < (int)NumElems) + Idx += NumElems; + else + Idx -= NumElems; + } + MaskVec.push_back(Idx); } return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), SVOp->getOperand(0), &MaskVec[0]); @@ -4108,7 +4194,7 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, for (unsigned i = 0, e = NumElems/2; i != e; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; - for (unsigned i = NumElems/2; i != NumElems; ++i) + for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) if (!isUndefOrEqual(Mask[i], i+NumElems)) return false; return true; @@ -4160,11 +4246,12 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) { static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG, DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); + unsigned Size = VT.getSizeInBits(); // Always build SSE zero vectors as <4 x i32> bitcasted // to their dest type. This ensures they get CSE'd. SDValue Vec; - if (VT.getSizeInBits() == 128) { // SSE + if (Size == 128) { // SSE if (Subtarget->hasSSE2()) { // SSE2 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); @@ -4172,7 +4259,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); } - } else if (VT.getSizeInBits() == 256) { // AVX + } else if (Size == 256) { // AVX if (Subtarget->hasAVX2()) { // AVX2 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; @@ -4184,7 +4271,9 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); } - } + } else + llvm_unreachable("Unexpected vector type"); + return DAG.getNode(ISD::BITCAST, dl, VT, Vec); } @@ -4195,25 +4284,22 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG, DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); - assert((VT.is128BitVector() || VT.is256BitVector()) - && "Expected a 128-bit or 256-bit vector type"); + unsigned Size = VT.getSizeInBits(); SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); SDValue Vec; - if (VT.getSizeInBits() == 256) { + if (Size == 256) { if (HasAVX2) { // AVX2 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8); } else { // AVX Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); - SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32), - Vec, DAG.getConstant(0, MVT::i32), DAG, dl); - Vec = Insert128BitVector(InsV, Vec, - DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl); + Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); } - } else { + 
} else if (Size == 128) { Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); - } + } else + llvm_unreachable("Unexpected vector type"); return DAG.getNode(ISD::BITCAST, dl, VT, Vec); } @@ -4256,9 +4342,8 @@ static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); - unsigned Half = NumElems/2; SmallVector<int, 8> Mask; - for (unsigned i = 0; i != Half; ++i) { + for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) { Mask.push_back(i + Half); Mask.push_back(i + NumElems + Half); } @@ -4290,15 +4375,14 @@ static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) { static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { EVT VT = V.getValueType(); DebugLoc dl = V.getDebugLoc(); - assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256) - && "Vector size not supported"); + unsigned Size = VT.getSizeInBits(); - if (VT.getSizeInBits() == 128) { + if (Size == 128) { V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V); int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32), &SplatMask[0]); - } else { + } else if (Size == 256) { // To use VPERMILPS to splat scalars, the second half of indicies must // refer to the higher part, which is a duplication of the lower one, // because VPERMILPS can only handle in-lane permutations. @@ -4308,7 +4392,8 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V); V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32), &SplatMask[0]); - } + } else + llvm_unreachable("Vector size not supported"); return DAG.getNode(ISD::BITCAST, dl, VT, V); } @@ -4329,9 +4414,8 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { // Extract the 128-bit part containing the splat element and update // the splat element index when it refers to the higher register. if (Size == 256) { - unsigned Idx = (EltNo >= NumElems/2) ? NumElems/2 : 0; - V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl); - if (Idx > 0) + V1 = Extract128BitVector(V1, EltNo, DAG, dl); + if (EltNo >= NumElems/2) EltNo -= NumElems/2; } @@ -4347,10 +4431,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { // into the low and high part. This is necessary because we want // to use VPERM* to shuffle the vectors if (Size == 256) { - SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1, - DAG.getConstant(0, MVT::i32), DAG, dl); - V1 = Insert128BitVector(InsV, V1, - DAG.getConstant(NumElems/2, MVT::i32), DAG, dl); + V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1); } return getLegalSplat(DAG, V1, EltNo); @@ -4378,7 +4459,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the /// target specific opcode. Returns true if the Mask could be calculated. /// Sets IsUnary to true if only uses one source. 
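Aside (illustration, not part of the patch): PromoteSplat above now hands the raw element index to Extract128BitVector (which normalizes it to a chunk boundary, per the earlier sketch) and then rebases the index into the extracted half. The rebasing by itself:

    #include <iostream>
    #include <utility>

    // For a 256-bit splat source, pick the 128-bit half holding EltNo and
    // rebase EltNo into it: {half (0 or 1), index within that half}.
    std::pair<unsigned, unsigned> splitSplatIndex(unsigned EltNo,
                                                  unsigned NumElems) {
      unsigned Half = EltNo >= NumElems / 2;
      if (Half)
        EltNo -= NumElems / 2;
      return {Half, EltNo};
    }

    int main() {
      auto R = splitSplatIndex(6, 8);  // v8f32 element 6 -> high half, index 2
      std::cout << R.first << ' ' << R.second << '\n'; // 1 2
    }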
-static bool getTargetShuffleMask(SDNode *N, EVT VT, +static bool getTargetShuffleMask(SDNode *N, MVT VT, SmallVectorImpl<int> &Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); SDValue ImmN; @@ -4409,12 +4490,17 @@ static bool getTargetShuffleMask(SDNode *N, EVT VT, break; case X86ISD::PSHUFHW: ImmN = N->getOperand(N->getNumOperands()-1); - DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFLW: ImmN = N->getOperand(N->getNumOperands()-1); - DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::VPERMI: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::MOVSS: @@ -4474,20 +4560,21 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, // Recurse into target specific vector shuffles to find scalars. if (isTargetShuffle(Opcode)) { - unsigned NumElems = VT.getVectorNumElements(); + MVT ShufVT = V.getValueType().getSimpleVT(); + unsigned NumElems = ShufVT.getVectorNumElements(); SmallVector<int, 16> ShuffleMask; SDValue ImmN; bool IsUnary; - if (!getTargetShuffleMask(N, VT, ShuffleMask, IsUnary)) + if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary)) return SDValue(); int Elt = ShuffleMask[Index]; if (Elt < 0) - return DAG.getUNDEF(VT.getVectorElementType()); + return DAG.getUNDEF(ShufVT.getVectorElementType()); SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0) - : N->getOperand(1); + : N->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } @@ -4795,7 +4882,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); int EltNo = (Offset - StartOffset) >> 2; - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, @@ -4803,7 +4890,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, false, false, false, 0); SmallVector<int, 8> Mask; - for (int i = 0; i < NumElems; ++i) + for (unsigned i = 0; i != NumElems; ++i) Mask.push_back(EltNo); return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); @@ -4867,8 +4954,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, LDBase->getPointerInfo(), LDBase->isVolatile(), LDBase->isNonTemporal(), LDBase->isInvariant(), LDBase->getAlignment()); - } else if (NumElems == 4 && LastLoadedElt == 1 && - DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { + } + if (NumElems == 4 && LastLoadedElt == 1 && + DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; SDValue ResNode = @@ -4897,6 +4985,9 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for broadcast."); + SDValue Ld; bool ConstSplatVal; @@ -4931,8 +5022,17 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { return SDValue(); SDValue Sc = 
Op.getOperand(0); - if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR) - return SDValue(); + if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && + Sc.getOpcode() != ISD::BUILD_VECTOR) { + + if (!Subtarget->hasAVX2()) + return SDValue(); + + // Use the register form of the broadcast instruction available on AVX2. + if (VT.is256BitVector()) + Sc = Extract128BitVector(Sc, 0, DAG, dl); + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); + } Ld = Sc.getOperand(0); ConstSplatVal = (Ld.getOpcode() == ISD::Constant || @@ -4948,7 +5048,6 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { } bool Is256 = VT.getSizeInBits() == 256; - bool Is128 = VT.getSizeInBits() == 128; // Handle broadcasting a single constant scalar from the constant pool // into a vector. On Sandybridge it is still better to load a constant vector // from the constant pool and not to broadcast it from a scalar. @@ -4958,9 +5057,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { assert(!CVT.isVector() && "Must not broadcast a vector type"); unsigned ScalarSize = CVT.getSizeInBits(); - if ((Is256 && (ScalarSize == 32 || ScalarSize == 64)) || - (Is128 && (ScalarSize == 32))) { - + if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) { const Constant *C = 0; if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) C = CI->getConstantIntValue(); @@ -4972,40 +5069,32 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { SDValue CP = DAG.getConstantPool(C, getPointerTy()); unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); + MachinePointerInfo::getConstantPool(), + false, false, false, Alignment); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } } - // The scalar source must be a normal load. - if (!ISD::isNormalLoad(Ld.getNode())) - return SDValue(); - - // Reject loads that have uses of the chain result - if (Ld->hasAnyUseOfValue(1)) - return SDValue(); - + bool IsLoad = ISD::isNormalLoad(Ld.getNode()); unsigned ScalarSize = Ld.getValueType().getSizeInBits(); - // VBroadcast to YMM - if (Is256 && (ScalarSize == 32 || ScalarSize == 64)) + // Handle AVX2 in-register broadcasts. + if (!IsLoad && Subtarget->hasAVX2() && + (ScalarSize == 32 || (Is256 && ScalarSize == 64))) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); - // VBroadcast to XMM - if (Is128 && (ScalarSize == 32)) + // The scalar source must be a normal load.
+ if (!IsLoad) + return SDValue(); + + if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The integer check is needed for the 64-bit into 128-bit so it doesn't match - // double since there is vbroadcastsd xmm + // double since there is no vbroadcastsd xmm if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) { - // VBroadcast to YMM - if (Is256 && (ScalarSize == 8 || ScalarSize == 16)) - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); - - // VBroadcast to XMM - if (Is128 && (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) + if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } @@ -5103,8 +5192,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { Mask.push_back(Idx); for (unsigned i = 1; i != VecElts; ++i) Mask.push_back(i); - Item = DAG.getVectorShuffle(VecVT, dl, Item, - DAG.getUNDEF(Item.getValueType()), + Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT), &Mask[0]); } return DAG.getNode(ISD::BITCAST, dl, VT, Item); @@ -5137,8 +5225,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); if (VT.getSizeInBits() == 256) { SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); - Item = Insert128BitVector(ZeroVec, Item, DAG.getConstant(0, MVT::i32), - DAG, dl); + Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl); } else { assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); @@ -5172,7 +5259,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // Turn it into a shuffle of zero and zero-extended scalar to vector. Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG); SmallVector<int, 8> MaskVec; - for (unsigned i = 0; i < NumElems; i++) + for (unsigned i = 0; i != NumElems; ++i) MaskVec.push_back(i == Idx ? 0 : 1); return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); } @@ -5213,10 +5300,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { NumElems/2); // Recreate the wider vector with the lower and upper part. - SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lower, - DAG.getConstant(0, MVT::i32), DAG, dl); - return Insert128BitVector(Vec, Upper, DAG.getConstant(NumElems/2, MVT::i32), - DAG, dl); + return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl); } // Let legalizer expand 2-wide build_vectors. @@ -5383,10 +5467,7 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { SDValue V2 = Op.getOperand(1); unsigned NumElems = ResVT.getVectorNumElements(); - SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, ResVT), V1, - DAG.getConstant(0, MVT::i32), DAG, dl); - return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32), - DAG, dl); + return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); } SDValue @@ -5408,75 +5489,64 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { } // Try to lower a shuffle node into a simple blend instruction. 
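The rewritten LowerVECTOR_SHUFFLEtoBlend below reduces to one per-element test on the shuffle mask. A standalone sketch of that immediate computation, under the same mask convention (entries 0..N-1 select from the first input, N..2N-1 from the second, -1 is undef):

// Returns the blend immediate, or -1 when some element would change
// position, in which case the shuffle is not expressible as a blend.
int blendImmediate(const int *Mask, unsigned NumElems) {
  int Imm = 0;
  for (unsigned i = 0; i != NumElems; ++i) {
    if (Mask[i] == (int)i || Mask[i] < 0)
      Imm |= 1 << i;                      // element i of the first input
    else if (Mask[i] != (int)(i + NumElems))
      return -1;                          // not a per-position select
  }
  return Imm;
}

For example, the v4i32 mask {0, 5, 2, 7} yields the immediate 0b0101: elements 0 and 2 keep the first input, elements 1 and 3 take the second.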
-static SDValue LowerVECTOR_SHUFFLEtoBlend(SDValue Op, +static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); - EVT VT = Op.getValueType(); - EVT InVT = V1.getValueType(); - int MaskSize = VT.getVectorNumElements(); - int InSize = InVT.getVectorNumElements(); + MVT VT = SVOp->getValueType(0).getSimpleVT(); + unsigned NumElems = VT.getVectorNumElements(); if (!Subtarget->hasSSE41()) return SDValue(); - if (MaskSize != InSize) - return SDValue(); - - int ISDNo = 0; + unsigned ISDNo = 0; MVT OpTy; - switch (VT.getSimpleVT().SimpleTy) { + switch (VT.SimpleTy) { default: return SDValue(); case MVT::v8i16: - ISDNo = X86ISD::BLENDPW; - OpTy = MVT::v8i16; - break; + ISDNo = X86ISD::BLENDPW; + OpTy = MVT::v8i16; + break; case MVT::v4i32: case MVT::v4f32: - ISDNo = X86ISD::BLENDPS; - OpTy = MVT::v4f32; - break; + ISDNo = X86ISD::BLENDPS; + OpTy = MVT::v4f32; + break; case MVT::v2i64: case MVT::v2f64: - ISDNo = X86ISD::BLENDPD; - OpTy = MVT::v2f64; - break; + ISDNo = X86ISD::BLENDPD; + OpTy = MVT::v2f64; + break; case MVT::v8i32: case MVT::v8f32: - if (!Subtarget->hasAVX()) - return SDValue(); - ISDNo = X86ISD::BLENDPS; - OpTy = MVT::v8f32; - break; + if (!Subtarget->hasAVX()) + return SDValue(); + ISDNo = X86ISD::BLENDPS; + OpTy = MVT::v8f32; + break; case MVT::v4i64: case MVT::v4f64: - if (!Subtarget->hasAVX()) - return SDValue(); - ISDNo = X86ISD::BLENDPD; - OpTy = MVT::v4f64; - break; - case MVT::v16i16: - if (!Subtarget->hasAVX2()) - return SDValue(); - ISDNo = X86ISD::BLENDPW; - OpTy = MVT::v16i16; - break; + if (!Subtarget->hasAVX()) + return SDValue(); + ISDNo = X86ISD::BLENDPD; + OpTy = MVT::v4f64; + break; } assert(ISDNo && "Invalid Op Number"); unsigned MaskVals = 0; - for (int i = 0; i < MaskSize; ++i) { + for (unsigned i = 0; i != NumElems; ++i) { int EltIdx = SVOp->getMaskElt(i); - if (EltIdx == i || EltIdx == -1) + if (EltIdx == (int)i || EltIdx < 0) MaskVals |= (1<<i); - else if (EltIdx == (i + MaskSize)) + else if (EltIdx == (int)(i + NumElems)) continue; // Bit is set to zero; - else return SDValue(); + else + return SDValue(); } V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1); @@ -5630,13 +5700,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, bool TwoInputs = V1Used && V2Used; for (unsigned i = 0; i != 8; ++i) { int EltIdx = MaskVals[i] * 2; - if (TwoInputs && (EltIdx >= 16)) { - pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); - continue; - } - pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); + int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx; + int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 
0x80 : EltIdx+1; + pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8)); } V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, @@ -5650,13 +5717,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, pshufbMask.clear(); for (unsigned i = 0; i != 8; ++i) { int EltIdx = MaskVals[i] * 2; - if (EltIdx < 16) { - pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); - continue; - } - pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); + int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16; + int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15; + pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8)); } V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, @@ -5732,10 +5796,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, int EltIdx = MaskVals[i]; if (EltIdx < 0) continue; - SDValue ExtOp = (EltIdx < 8) - ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, - DAG.getIntPtrConstant(EltIdx)) - : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, + SDValue ExtOp = (EltIdx < 8) ? + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, + DAG.getIntPtrConstant(EltIdx)) : + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, DAG.getIntPtrConstant(EltIdx - 8)); NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, DAG.getIntPtrConstant(i)); @@ -5756,21 +5820,11 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, DebugLoc dl = SVOp->getDebugLoc(); ArrayRef<int> MaskVals = SVOp->getMask(); + bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; + // If we have SSSE3, case 1 is generated when all result bytes come from // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is // present, fall back to case 3. - // FIXME: kill V2Only once shuffles are canonizalized by getNode. - bool V1Only = true; - bool V2Only = true; - for (unsigned i = 0; i < 16; ++i) { - int EltIdx = MaskVals[i]; - if (EltIdx < 0) - continue; - if (EltIdx < 16) - V2Only = false; - else - V1Only = false; - } // If SSSE3, use 1 pshufb instruction per vector with elements in the result. if (TLI.getSubtarget()->hasSSSE3()) { @@ -5782,23 +5836,16 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, // Otherwise, we have elements from both input vectors, and must zero out // elements that come from V2 in the first mask, and V1 in the second mask // so that we can OR them together. - bool TwoInputs = !(V1Only || V2Only); for (unsigned i = 0; i != 16; ++i) { int EltIdx = MaskVals[i]; - if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { - pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); - continue; - } + if (EltIdx < 0 || EltIdx >= 16) + EltIdx = 0x80; pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); } - // If all the elements are from V2, assign it to V1 and return after - // building the first pshufb. 
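The 0x80 entries used in these masks exploit PSHUFB's zeroing rule: a selector byte with its high bit set produces 0, so each input's shuffle blanks out the bytes owned by the other input and the two results can simply be OR'd together. A byte-level sketch of the two-input scheme (plain C++, not the DAG code; mask entries 0..15 pick bytes of V1, 16..31 bytes of V2, -1 is undef):

#include <cstdint>

void pshufb16(const uint8_t *Src, const uint8_t *Sel, uint8_t *Dst) {
  for (int i = 0; i != 16; ++i)
    Dst[i] = (Sel[i] & 0x80) ? 0 : Src[Sel[i] & 0x0f]; // high bit zeroes
}

void twoInputShuffle(const int *Mask, const uint8_t *V1, const uint8_t *V2,
                     uint8_t *Out) {
  uint8_t SelA[16], SelB[16], A[16], B[16];
  for (int i = 0; i != 16; ++i) {
    int E = Mask[i];
    SelA[i] = (E < 0 || E >= 16) ? 0x80 : E;      // zero V2/undef bytes
    SelB[i] = (E < 16) ? 0x80 : E - 16;           // zero V1/undef bytes
  }
  pshufb16(V1, SelA, A);
  pshufb16(V2, SelB, B);
  for (int i = 0; i != 16; ++i)
    Out[i] = A[i] | B[i];                         // merge the halves
}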
- if (V2Only) - V1 = V2; V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, &pshufbMask[0], 16)); - if (!TwoInputs) + if (V2IsUndef) return V1; // Calculate the shuffle mask for the second input, shuffle it, and @@ -5806,11 +5853,8 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, pshufbMask.clear(); for (unsigned i = 0; i != 16; ++i) { int EltIdx = MaskVals[i]; - if (EltIdx < 16) { - pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); - continue; - } - pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); + EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16; + pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); } V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, DAG.getNode(ISD::BUILD_VECTOR, dl, @@ -5823,7 +5867,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, // the 16 different words that comprise the two doublequadword input vectors. V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); - SDValue NewV = V2Only ? V2 : V1; + SDValue NewV = V1; for (int i = 0; i != 8; ++i) { int Elt0 = MaskVals[i*2]; int Elt1 = MaskVals[i*2+1]; @@ -5833,9 +5877,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, continue; // This word of the result is already in the correct place, skip it. - if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) - continue; - if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) + if ((Elt0 == i*2) && (Elt1 == i*2+1)) continue; SDValue Elt0Src = Elt0 < 16 ? V1 : V2; @@ -5897,41 +5939,37 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, static SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, DebugLoc dl) { - EVT VT = SVOp->getValueType(0); - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); + MVT VT = SVOp->getValueType(0).getSimpleVT(); unsigned NumElems = VT.getVectorNumElements(); - unsigned NewWidth = (NumElems == 4) ? 
2 : 4; - EVT NewVT; - switch (VT.getSimpleVT().SimpleTy) { + MVT NewVT; + unsigned Scale; + switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected!"); - case MVT::v4f32: NewVT = MVT::v2f64; break; - case MVT::v4i32: NewVT = MVT::v2i64; break; - case MVT::v8i16: NewVT = MVT::v4i32; break; - case MVT::v16i8: NewVT = MVT::v4i32; break; + case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break; + case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break; + case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break; + case MVT::v16i8: NewVT = MVT::v4i32; Scale = 4; break; + case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break; + case MVT::v32i8: NewVT = MVT::v8i32; Scale = 4; break; } - int Scale = NumElems / NewWidth; SmallVector<int, 8> MaskVec; - for (unsigned i = 0; i < NumElems; i += Scale) { + for (unsigned i = 0; i != NumElems; i += Scale) { int StartIdx = -1; - for (int j = 0; j < Scale; ++j) { + for (unsigned j = 0; j != Scale; ++j) { int EltIdx = SVOp->getMaskElt(i+j); if (EltIdx < 0) continue; - if (StartIdx == -1) - StartIdx = EltIdx - (EltIdx % Scale); - if (EltIdx != StartIdx + j) + if (StartIdx < 0) + StartIdx = (EltIdx / Scale); + if (EltIdx != (int)(StartIdx*Scale + j)) return SDValue(); } - if (StartIdx == -1) - MaskVec.push_back(-1); - else - MaskVec.push_back(StartIdx / Scale); + MaskVec.push_back(StartIdx); } - V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); - V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); + SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0)); + SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1)); return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); } @@ -5974,6 +6012,11 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT, /// which could not be matched by any known target specific shuffle static SDValue LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { + + SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG); + if (NewOp.getNode()) + return NewOp; + EVT VT = SVOp->getValueType(0); unsigned NumElems = VT.getVectorNumElements(); @@ -5982,14 +6025,15 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { DebugLoc dl = SVOp->getDebugLoc(); MVT EltVT = VT.getVectorElementType().getSimpleVT(); EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); - SDValue Shufs[2]; + SDValue Output[2]; SmallVector<int, 16> Mask; for (unsigned l = 0; l < 2; ++l) { // Build a shuffle mask for the output, discovering on the fly which // input vectors to use as shuffle operands (recorded in InputUsed). // If building a suitable shuffle vector proves too hard, then bail - // out with useBuildVector set. + // out with UseBuildVector set. + bool UseBuildVector = false; int InputUsed[2] = { -1, -1 }; // Not yet discovered. unsigned LaneStart = l * NumLaneElems; for (unsigned i = 0; i != NumLaneElems; ++i) { @@ -6021,38 +6065,61 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { } if (OpNo >= array_lengthof(InputUsed)) { - // More than two input vectors used! Give up. - return SDValue(); + // More than two input vectors used! Give up on trying to create a + // shuffle vector. Insert all elements into a BUILD_VECTOR instead. + UseBuildVector = true; + break; } // Add the mask index for the new shuffle vector. Mask.push_back(Idx + OpNo * NumLaneElems); } - if (InputUsed[0] < 0) { + if (UseBuildVector) { + SmallVector<SDValue, 16> SVOps; + for (unsigned i = 0; i != NumLaneElems; ++i) { + // The mask element. This indexes into the input.
+ int Idx = SVOp->getMaskElt(i+LaneStart); + if (Idx < 0) { + SVOps.push_back(DAG.getUNDEF(EltVT)); + continue; + } + + // The input vector this mask element indexes into. + int Input = Idx / NumElems; + + // Turn the index into an offset from the start of the input vector. + Idx -= Input * NumElems; + + // Extract the vector element by hand. + SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, + SVOp->getOperand(Input), + DAG.getIntPtrConstant(Idx))); + } + + // Construct the output using a BUILD_VECTOR. + Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0], + SVOps.size()); + } else if (InputUsed[0] < 0) { // No input vectors were used! The result is undefined. - Shufs[l] = DAG.getUNDEF(NVT); + Output[l] = DAG.getUNDEF(NVT); } else { SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2), - DAG.getConstant((InputUsed[0] % 2) * NumLaneElems, MVT::i32), - DAG, dl); + (InputUsed[0] % 2) * NumLaneElems, + DAG, dl); // If only one input was used, use an undefined vector for the other. SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) : Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2), - DAG.getConstant((InputUsed[1] % 2) * NumLaneElems, MVT::i32), - DAG, dl); + (InputUsed[1] % 2) * NumLaneElems, DAG, dl); // At least one input vector was used. Create a new shuffle vector. - Shufs[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]); + Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]); } Mask.clear(); } // Concatenate the result back - SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Shufs[0], - DAG.getConstant(0, MVT::i32), DAG, dl); - return Insert128BitVector(V, Shufs[1],DAG.getConstant(NumLaneElems, MVT::i32), - DAG, dl); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]); } /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with @@ -6108,7 +6175,9 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { } return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); - } else if (NumLo == 3 || NumHi == 3) { + } + + if (NumLo == 3 || NumHi == 3) { // Otherwise, we must have three elements from one vector, call it X, and // one element from the other, call it Y. First, use a shufps to build an // intermediate vector with the one element from Y and the element from X @@ -6144,17 +6213,17 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { Mask1[2] = HiIndex & 1 ? 6 : 4; Mask1[3] = HiIndex & 1 ? 4 : 6; return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); - } else { - Mask1[0] = HiIndex & 1 ? 2 : 0; - Mask1[1] = HiIndex & 1 ? 0 : 2; - Mask1[2] = PermMask[2]; - Mask1[3] = PermMask[3]; - if (Mask1[2] >= 0) - Mask1[2] += 4; - if (Mask1[3] >= 0) - Mask1[3] += 4; - return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); } + + Mask1[0] = HiIndex & 1 ? 2 : 0; + Mask1[1] = HiIndex & 1 ? 0 : 2; + Mask1[2] = PermMask[2]; + Mask1[3] = PermMask[3]; + if (Mask1[2] >= 0) + Mask1[2] += 4; + if (Mask1[3] >= 0) + Mask1[3] += 4; + return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); } // Break it into (shuffle shuffle_hi, shuffle_lo). @@ -6303,7 +6372,7 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); if (NumElems == 4) - // If we don't care about the second element, procede to use movss. + // If we don't care about the second element, proceed to use movss. 
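Returning to LowerVECTOR_SHUFFLE_256 above: its per-lane loop works by discovering which 128-bit input halves each output lane reads, bailing out to the new BUILD_VECTOR path once a lane needs more than two of them. A standalone sketch of that discovery (mask entries index the concatenation of the two source vectors; halves 0..3 are V1.lo, V1.hi, V2.lo, V2.hi):

#include <vector>

// Fills Used[2] with the halves the lane reads (-1 = unused); returns
// false when a third distinct half appears and a shuffle cannot be built.
bool laneInputs(const std::vector<int> &Mask, unsigned Lane,
                unsigned LaneElems, int Used[2]) {
  Used[0] = Used[1] = -1;
  for (unsigned i = 0; i != LaneElems; ++i) {
    int Idx = Mask[Lane * LaneElems + i];
    if (Idx < 0)
      continue;                        // undef constrains nothing
    int Half = Idx / (int)LaneElems;   // which 128-bit half is read
    if (Used[0] < 0 || Used[0] == Half)
      Used[0] = Half;
    else if (Used[1] < 0 || Used[1] == Half)
      Used[1] = Half;
    else
      return false;                    // more than two inputs used
  }
  return true;
}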
if (SVOp->getMaskElt(1) != -1) return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); } @@ -6361,7 +6430,8 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { // If the shuffle can be profitably rewritten as a narrower shuffle, then // do it! - if (VT == MVT::v8i16 || VT == MVT::v16i8) { + if (VT == MVT::v8i16 || VT == MVT::v16i8 || + VT == MVT::v16i16 || VT == MVT::v32i8) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); if (NewOp.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); @@ -6565,11 +6635,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // new vector_shuffle with the corrected mask. SmallVector<int, 8> NewMask(M.begin(), M.end()); NormalizeMask(NewMask, NumElems); - if (isUNPCKLMask(NewMask, VT, HasAVX2, true)) { + if (isUNPCKLMask(NewMask, VT, HasAVX2, true)) return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); - } else if (isUNPCKHMask(NewMask, VT, HasAVX2, true)) { + if (isUNPCKHMask(NewMask, VT, HasAVX2, true)) return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); - } } if (Commuted) { @@ -6606,12 +6675,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); } - if (isPSHUFHWMask(M, VT)) + if (isPSHUFHWMask(M, VT, HasAVX2)) return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, getShufflePSHUFHWImmediate(SVOp), DAG); - if (isPSHUFLWMask(M, VT)) + if (isPSHUFLWMask(M, VT, HasAVX2)) return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, getShufflePSHUFLWImmediate(SVOp), DAG); @@ -6648,7 +6717,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, V2, getShuffleVPERM2X128Immediate(SVOp), DAG); - SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(Op, Subtarget, DAG); + SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG); if (BlendOp.getNode()) return BlendOp; @@ -6715,7 +6784,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); - } else if (VT.getSizeInBits() == 16) { + } + + if (VT.getSizeInBits() == 16) { unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); // If Idx is 0, it's cheaper to do a move instead of a pextrw. if (Idx == 0) @@ -6730,7 +6801,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); - } else if (VT == MVT::f32) { + } + + if (VT == MVT::f32) { // EXTRACTPS outputs to a GPR32 register which will require a movd to copy // the result back to FR32 register. It's only worth matching if the // result has a single use which is a store or a bitcast to i32. And in @@ -6750,7 +6823,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, Op.getOperand(0)), Op.getOperand(1)); return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); - } else if (VT == MVT::i32 || VT == MVT::i64) { + } + + if (VT == MVT::i32 || VT == MVT::i64) { // ExtractPS/pextrq works with constant index. if (isa<ConstantSDNode>(Op.getOperand(1))) return Op; @@ -6777,12 +6852,12 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); // Get the 128-bit vector.
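The 256-bit element access paths below now hand Extract128BitVector the plain element index and fix the index up afterwards; the arithmetic amounts to this (a sketch, with NumElems the 256-bit vector's element count):

struct HalfIndex { unsigned Half, Idx; };

// The element lives in the high half iff IdxVal >= NumElems/2; its index
// within that half is the remainder.
HalfIndex splitIndex(unsigned IdxVal, unsigned NumElems) {
  unsigned Half = IdxVal >= NumElems / 2 ? 1u : 0u;
  return { Half, IdxVal - Half * (NumElems / 2) };
}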
- bool Upper = IdxVal >= NumElems/2; - Vec = Extract128BitVector(Vec, - DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32), DAG, dl); + Vec = Extract128BitVector(Vec, IdxVal, DAG, dl); + if (IdxVal >= NumElems/2) + IdxVal -= NumElems/2; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, - Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : Idx); + DAG.getConstant(IdxVal, MVT::i32)); } assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length"); @@ -6812,7 +6887,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); - } else if (VT.getSizeInBits() == 32) { + } + + if (VT.getSizeInBits() == 32) { unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); if (Idx == 0) return Op; @@ -6824,7 +6901,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, DAG.getUNDEF(VVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0)); - } else if (VT.getSizeInBits() == 64) { + } + + if (VT.getSizeInBits() == 64) { // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught // to match extract_elt for f64. @@ -6877,7 +6956,9 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, if (N2.getValueType() != MVT::i32) N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); return DAG.getNode(Opc, dl, VT, N0, N1, N2); - } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { + } + + if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { // Bits [7:6] of the constant are the source select. This will always be // zero here. The DAG Combiner may combine an extract_elt index into these // bits. For example (insert (extract, 3), 2) could be matched by putting @@ -6890,8 +6971,9 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, // Create this as a scalar to vector.. N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); - } else if ((EltVT == MVT::i32 || EltVT == MVT::i64) && - isa<ConstantSDNode>(N2)) { + } + + if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) { // PINSR* works with constant index. return Op; } @@ -6917,16 +6999,15 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { // Get the desired 128-bit vector half. unsigned NumElems = VT.getVectorNumElements(); unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue(); - bool Upper = IdxVal >= NumElems/2; - SDValue Ins128Idx = DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32); - SDValue V = Extract128BitVector(N0, Ins128Idx, DAG, dl); + SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); // Insert the element into the desired half. - V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, - N1, Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : N2); + bool Upper = IdxVal >= NumElems/2; + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, + DAG.getConstant(Upper ? 
IdxVal-NumElems/2 : IdxVal, MVT::i32)); // Insert the changed part back to the 256-bit vector - return Insert128BitVector(N0, V, Ins128Idx, DAG, dl); + return Insert128BitVector(N0, V, IdxVal, DAG, dl); } if (Subtarget->hasSSE41()) @@ -6964,19 +7045,16 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); // Insert the 128-bit vector. - return Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, OpVT), Op, - DAG.getConstant(0, MVT::i32), - DAG, dl); + return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); } - if (Op.getValueType() == MVT::v1i64 && + if (OpVT == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); - assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 && - "Expected an SSE type!"); - return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), + assert(OpVT.getSizeInBits() == 128 && "Expected an SSE type!"); + return DAG.getNode(ISD::BITCAST, dl, OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); } @@ -6990,9 +7068,11 @@ X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { SDValue Vec = Op.getNode()->getOperand(0); SDValue Idx = Op.getNode()->getOperand(1); - if (Op.getNode()->getValueType(0).getSizeInBits() == 128 - && Vec.getNode()->getValueType(0).getSizeInBits() == 256) { - return Extract128BitVector(Vec, Idx, DAG, dl); + if (Op.getNode()->getValueType(0).getSizeInBits() == 128 && + Vec.getNode()->getValueType(0).getSizeInBits() == 256 && + isa<ConstantSDNode>(Idx)) { + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + return Extract128BitVector(Vec, IdxVal, DAG, dl); } } return SDValue(); @@ -7009,9 +7089,11 @@ X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { SDValue SubVec = Op.getNode()->getOperand(1); SDValue Idx = Op.getNode()->getOperand(2); - if (Op.getNode()->getValueType(0).getSizeInBits() == 256 - && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) { - return Insert128BitVector(Vec, SubVec, Idx, DAG, dl); + if (Op.getNode()->getValueType(0).getSizeInBits() == 256 && + SubVec.getNode()->getValueType(0).getSizeInBits() == 128 && + isa<ConstantSDNode>(Idx)) { + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); } } return SDValue(); @@ -7220,7 +7302,7 @@ X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { static SDValue GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, - unsigned char OperandFlags) { + unsigned char OperandFlags, bool LocalDynamic = false) { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); DebugLoc dl = GA->getDebugLoc(); @@ -7228,12 +7310,16 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, GA->getValueType(0), GA->getOffset(), OperandFlags); + + X86ISD::NodeType CallType = LocalDynamic ? 
X86ISD::TLSBASEADDR + : X86ISD::TLSADDR; + if (InFlag) { SDValue Ops[] = { Chain, TGA, *InFlag }; - Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); + Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 3); } else { SDValue Ops[] = { Chain, TGA }; - Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); + Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 2); } // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. @@ -7265,11 +7351,49 @@ LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, X86::RAX, X86II::MO_TLSGD); } -// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or -// "local exec" model. +static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, + SelectionDAG &DAG, + const EVT PtrVT, + bool is64Bit) { + DebugLoc dl = GA->getDebugLoc(); + + // Get the start address of the TLS block for this module. + X86MachineFunctionInfo* MFI = DAG.getMachineFunction() + .getInfo<X86MachineFunctionInfo>(); + MFI->incNumLocalDynamicTLSAccesses(); + + SDValue Base; + if (is64Bit) { + Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX, + X86II::MO_TLSLD, /*LocalDynamic=*/true); + } else { + SDValue InFlag; + SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, + DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), InFlag); + InFlag = Chain.getValue(1); + Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, + X86II::MO_TLSLDM, /*LocalDynamic=*/true); + } + + // Note: the CleanupLocalDynamicTLSPass will remove redundant computations + // of Base. + + // Build x@dtpoff. + unsigned char OperandFlags = X86II::MO_DTPOFF; + unsigned WrapperKind = X86ISD::Wrapper; + SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, + GA->getValueType(0), + GA->getOffset(), OperandFlags); + SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); + + // Add x@dtpoff with the base. + return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base); +} + +// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, - bool is64Bit) { + bool is64Bit, bool isPIC) { DebugLoc dl = GA->getDebugLoc(); // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). @@ -7287,25 +7411,36 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, unsigned WrapperKind = X86ISD::Wrapper; if (model == TLSModel::LocalExec) { OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; - } else if (is64Bit) { - assert(model == TLSModel::InitialExec); - OperandFlags = X86II::MO_GOTTPOFF; - WrapperKind = X86ISD::WrapperRIP; + } else if (model == TLSModel::InitialExec) { + if (is64Bit) { + OperandFlags = X86II::MO_GOTTPOFF; + WrapperKind = X86ISD::WrapperRIP; + } else { + OperandFlags = isPIC ? 
X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF; + } } else { - assert(model == TLSModel::InitialExec); - OperandFlags = X86II::MO_INDNTPOFF; + llvm_unreachable("Unexpected model"); } - // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial - // exec) + // emit "addl x@ntpoff,%eax" (local exec) + // or "addl x@indntpoff,%eax" (initial exec) + // or "addl x@gotntpoff(%ebx),%eax" (initial exec, 32-bit pic) SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); - if (model == TLSModel::InitialExec) + if (model == TLSModel::InitialExec) { + if (isPIC && !is64Bit) { + Offset = DAG.getNode(ISD::ADD, dl, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), + Offset); + } + Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(), false, false, false, + 0); + } // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. @@ -7319,29 +7454,26 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { const GlobalValue *GV = GA->getGlobal(); if (Subtarget->isTargetELF()) { - // TODO: implement the "local dynamic" model - // TODO: implement the "initial exec"model for pic executables - - // If GV is an alias then use the aliasee for determining - // thread-localness. - if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) - GV = GA->resolveAliasedGlobal(false); - TLSModel::Model model = getTargetMachine().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: - case TLSModel::LocalDynamic: // not implemented if (Subtarget->is64Bit()) return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); - + case TLSModel::LocalDynamic: + return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(), + Subtarget->is64Bit()); case TLSModel::InitialExec: case TLSModel::LocalExec: return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, - Subtarget->is64Bit()); + Subtarget->is64Bit(), + getTargetMachine().getRelocationModel() == Reloc::PIC_); } - } else if (Subtarget->isTargetDarwin()) { + llvm_unreachable("Unknown TLS model."); + } + + if (Subtarget->isTargetDarwin()) { // Darwin only has one model of TLS. Lower to that. unsigned char OpFlag = 0; unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? @@ -7384,7 +7516,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(), Chain.getValue(1)); - } else if (Subtarget->isTargetWindows()) { + } + + if (Subtarget->isTargetWindows()) { // Just use the implicit TLS architecture // Need to generate something similar to: // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage @@ -7430,7 +7564,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { false, false, false, 0); SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()), - getPointerTy()); + getPointerTy()); IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale); SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX); @@ -7694,12 +7828,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, // Handle final rounding.
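Summarizing the ELF TLS lowering above: the relocation flags now follow a small decision table over (model, 64-bit, PIC). A standalone sketch of that table (flag names as in the diff; the general and local dynamic models additionally issue a TLSADDR or TLSBASEADDR call, and local dynamic then adds an MO_DTPOFF offset to the returned base):

enum Model { GeneralDynamic, LocalDynamic, InitialExec, LocalExec };

const char *tlsOperandFlag(Model M, bool Is64Bit, bool IsPIC) {
  switch (M) {
  case GeneralDynamic: return "MO_TLSGD";
  case LocalDynamic:   return Is64Bit ? "MO_TLSLD" : "MO_TLSLDM";
  case InitialExec:    return Is64Bit ? "MO_GOTTPOFF"
                                      : IsPIC ? "MO_GOTNTPOFF"
                                              : "MO_INDNTPOFF";
  case LocalExec:      return Is64Bit ? "MO_TPOFF" : "MO_NTPOFF";
  }
  return "unknown";
}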
EVT DestVT = Op.getValueType(); - if (DestVT.bitsLT(MVT::f64)) { + if (DestVT.bitsLT(MVT::f64)) return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, DAG.getIntPtrConstant(0)); - } else if (DestVT.bitsGT(MVT::f64)) { + if (DestVT.bitsGT(MVT::f64)) return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); - } // Handle final rounding. return Sub; @@ -7720,10 +7853,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, EVT DstVT = Op.getValueType(); if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) return LowerUINT_TO_FP_i64(Op, DAG); - else if (SrcVT == MVT::i32 && X86ScalarSSEf64) + if (SrcVT == MVT::i32 && X86ScalarSSEf64) return LowerUINT_TO_FP_i32(Op, DAG); - else if (Subtarget->is64Bit() && - SrcVT == MVT::i64 && DstVT == MVT::f32) + if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) return SDValue(); // Make a 64-bit buffer, and use it to build an FILD. @@ -7900,9 +8032,9 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), FIST, StackSlot, MachinePointerInfo(), false, false, false, 0); - else - // The node is the result. - return FIST; + + // The node is the result. + return FIST; } SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, @@ -7917,9 +8049,9 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), FIST, StackSlot, MachinePointerInfo(), false, false, false, 0); - else - // The node is the result. - return FIST; + + // The node is the result. + return FIST; } SDValue X86TargetLowering::LowerFABS(SDValue Op, @@ -7969,12 +8101,12 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { MVT XORVT = VT.getSizeInBits() == 128 ? MVT::v2i64 : MVT::v4i64; return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(ISD::XOR, dl, XORVT, - DAG.getNode(ISD::BITCAST, dl, XORVT, - Op.getOperand(0)), - DAG.getNode(ISD::BITCAST, dl, XORVT, Mask))); - } else { - return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); + DAG.getNode(ISD::BITCAST, dl, XORVT, + Op.getOperand(0)), + DAG.getNode(ISD::BITCAST, dl, XORVT, Mask))); } + + return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); } SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { @@ -8173,7 +8305,13 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, // Otherwise use a regular EFLAGS-setting instruction. switch (Op.getNode()->getOpcode()) { default: llvm_unreachable("unexpected operator!"); - case ISD::SUB: Opcode = X86ISD::SUB; break; + case ISD::SUB: + // If the only use of SUB is EFLAGS, use CMP instead. + if (Op.hasOneUse()) + Opcode = X86ISD::CMP; + else + Opcode = X86ISD::SUB; + break; case ISD::OR: Opcode = X86ISD::OR; break; case ISD::XOR: Opcode = X86ISD::XOR; break; case ISD::AND: Opcode = X86ISD::AND; break; @@ -8199,6 +8337,14 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, Op.getValueType())); + if (Opcode == X86ISD::CMP) { + SDValue New = DAG.getNode(Opcode, dl, MVT::i32, Op.getOperand(0), + Op.getOperand(1)); + // We can't replace usage of SUB with CMP. + // The SUB node will be removed later because there is no use of it. 
+ return SDValue(New.getNode(), 0); + } + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SmallVector<SDValue, 4> Ops; for (unsigned i = 0; i != NumOperands; ++i) @@ -8221,6 +8367,30 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); } +/// Convert a comparison if required by the subtarget. +SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, + SelectionDAG &DAG) const { + // If the subtarget does not support the FUCOMI instruction, floating-point + // comparisons have to be converted. + if (Subtarget->hasCMov() || + Cmp.getOpcode() != X86ISD::CMP || + !Cmp.getOperand(0).getValueType().isFloatingPoint() || + !Cmp.getOperand(1).getValueType().isFloatingPoint()) + return Cmp; + + // The instruction selector will select an FUCOM instruction instead of + // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence + // build an SDNode sequence that transfers the result from FPSW into EFLAGS: + // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8)))) + DebugLoc dl = Cmp.getDebugLoc(); + SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); + SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); + SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, + DAG.getConstant(8, MVT::i8)); + SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); + return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); +} + /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node /// if it's possible. SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, @@ -8342,6 +8512,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return SDValue(); SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); + EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86CC, MVT::i8), EFLAGS); } @@ -8354,21 +8525,19 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::SETCC && "Unsupported value type for operation"); - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); DebugLoc dl = Op.getDebugLoc(); SDValue CC = Op.getOperand(2); - SDValue Idx0 = DAG.getConstant(0, MVT::i32); - SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); - SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); - SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); + SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); + SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); // Extract the RHS vectors SDValue RHS = Op.getOperand(1); - SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl); - SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl); + SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); + SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); // Issue the operation on the smaller types and concatenate the result back MVT EltVT = VT.getVectorElementType().getSimpleVT(); @@ -8438,7 +8607,8 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { EQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); - } else if (SetCCOpcode == ISD::SETONE) { + } + if (SetCCOpcode == ISD::SETONE) { SDValue ORD, NEQ; ORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 
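Why the FNSTSW/SAHF sequence built by ConvertCmpIfNecessary above is equivalent: FUCOM leaves its result in the x87 status-word condition bits C0, C2 and C3 (bits 8, 10 and 14), shifting the status word right by 8 moves them into AH, and SAHF copies AH into the low EFLAGS byte, so C0 lands in CF, C2 in PF and C3 in ZF, the same flags FUCOMI would have set directly. A bit-level sketch:

#include <cassert>
#include <cstdint>

// Models "fnstsw %ax; shr $8; sahf": the returned byte is what SAHF loads
// into the low EFLAGS byte.
uint8_t eflagsFromFPSW(uint16_t FPSW) {
  return (uint8_t)(FPSW >> 8);
}

int main() {
  assert(eflagsFromFPSW(1u << 8)  & 0x01); // C0 (less-than) -> CF, bit 0
  assert(eflagsFromFPSW(1u << 10) & 0x04); // C2 (unordered) -> PF, bit 2
  assert(eflagsFromFPSW(1u << 14) & 0x40); // C3 (equal)     -> ZF, bit 6
  return 0;
}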
@@ -8511,7 +8681,8 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { // isX86LogicalCmp - Return true if opcode is a X86 logical comparison. static bool isX86LogicalCmp(SDValue Op) { unsigned Opc = Op.getNode()->getOpcode(); - if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) + if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || + Opc == X86ISD::SAHF) return true; if (Op.getResNo() == 1 && (Opc == X86ISD::ADD || @@ -8557,6 +8728,46 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { Cond = NewCond; } + // Handle the following cases related to max and min: + // (a > b) ? (a-b) : 0 + // (a >= b) ? (a-b) : 0 + // (b < a) ? (a-b) : 0 + // (b <= a) ? (a-b) : 0 + // Comparison is removed to use EFLAGS from SUB. + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op2)) + if (Cond.getOpcode() == X86ISD::SETCC && + Cond.getOperand(1).getOpcode() == X86ISD::CMP && + (Op1.getOpcode() == ISD::SUB || Op1.getOpcode() == X86ISD::SUB) && + C->getAPIntValue() == 0) { + SDValue Cmp = Cond.getOperand(1); + unsigned CC = cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); + if ((DAG.isEqualTo(Op1.getOperand(0), Cmp.getOperand(0)) && + DAG.isEqualTo(Op1.getOperand(1), Cmp.getOperand(1)) && + (CC == X86::COND_G || CC == X86::COND_GE || + CC == X86::COND_A || CC == X86::COND_AE)) || + (DAG.isEqualTo(Op1.getOperand(0), Cmp.getOperand(1)) && + DAG.isEqualTo(Op1.getOperand(1), Cmp.getOperand(0)) && + (CC == X86::COND_L || CC == X86::COND_LE || + CC == X86::COND_B || CC == X86::COND_BE))) { + + if (Op1.getOpcode() == ISD::SUB) { + SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i32); + SDValue New = DAG.getNode(X86ISD::SUB, DL, VTs, + Op1.getOperand(0), Op1.getOperand(1)); + DAG.ReplaceAllUsesWith(Op1, New); + Op1 = New; + } + + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); + unsigned NewCC = (CC == X86::COND_G || CC == X86::COND_GE || + CC == X86::COND_L || + CC == X86::COND_LE) ? X86::COND_GE : X86::COND_AE; + SDValue Ops[] = { Op2, Op1, DAG.getConstant(NewCC, MVT::i8), + SDValue(Op1.getNode(), 1) }; + return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); + } + } + // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y @@ -8573,8 +8784,25 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Y = isAllOnes(Op2) ? Op1 : Op2; SDValue CmpOp0 = Cmp.getOperand(0); + // Apply further optimizations for special cases + // (select (x != 0), -1, 0) -> neg & sbb + // (select (x == 0), 0, -1) -> neg & sbb + if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y)) + if (YC->isNullValue() && + (isAllOnes(Op1) == (CondCode == X86::COND_NE))) { + SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); + SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, + DAG.getConstant(0, CmpOp0.getValueType()), + CmpOp0); + SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getConstant(X86::COND_B, MVT::i8), + SDValue(Neg.getNode(), 1)); + return Res; + } + Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); + Cmp = ConvertCmpIfNecessary(Cmp, DAG); SDValue Res = // Res = 0 or -1. DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), @@ -8681,6 +8909,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // a >= b ? -1 : 0 -> RES = setcc_carry // a >= b ? 
0 : -1 -> RES = ~setcc_carry if (Cond.getOpcode() == X86ISD::CMP) { + Cond = ConvertCmpIfNecessary(Cond, DAG); unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && @@ -8919,6 +9148,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); + Cmp = ConvertCmpIfNecessary(Cmp, DAG); CC = DAG.getConstant(X86::COND_NE, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); @@ -8948,6 +9178,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); + Cmp = ConvertCmpIfNecessary(Cmp, DAG); CC = DAG.getConstant(X86::COND_NE, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); @@ -8981,6 +9212,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { CC = DAG.getConstant(X86::COND_NE, MVT::i8); Cond = EmitTest(Cond, X86::COND_NE, DAG); } + Cond = ConvertCmpIfNecessary(Cond, DAG); return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cond); } @@ -9019,7 +9251,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, const Function *F = MF.getFunction(); for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; I++) + I != E; ++I) if (I->hasNestAttr()) report_fatal_error("Cannot use segmented stacks with functions that " "have nested arguments."); @@ -9202,12 +9434,15 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32"); if (isa<ConstantSDNode>(ShAmt)) { + // Constant may be a TargetConstant. Use a regular constant. + uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue(); switch (Opc) { default: llvm_unreachable("Unknown target vector shift node"); case X86ISD::VSHLI: case X86ISD::VSRLI: case X86ISD::VSRAI: - return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); + return DAG.getNode(Opc, dl, VT, SrcOp, + DAG.getConstant(ShiftAmt, MVT::i32)); } } @@ -9227,7 +9462,13 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, ShOps[2] = DAG.getUNDEF(MVT::i32); ShOps[3] = DAG.getUNDEF(MVT::i32); ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4); - ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); + + // The return type has to be a 128-bit type with the same element + // type as the input type. 
+ MVT EltVT = VT.getVectorElementType().getSimpleVT(); + EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); + + ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt); return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } @@ -9337,196 +9578,6 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const DAG.getConstant(X86CC, MVT::i8), Cond); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } - // XOP comparison intrinsics - case Intrinsic::x86_xop_vpcomltb: - case Intrinsic::x86_xop_vpcomltw: - case Intrinsic::x86_xop_vpcomltd: - case Intrinsic::x86_xop_vpcomltq: - case Intrinsic::x86_xop_vpcomltub: - case Intrinsic::x86_xop_vpcomltuw: - case Intrinsic::x86_xop_vpcomltud: - case Intrinsic::x86_xop_vpcomltuq: - case Intrinsic::x86_xop_vpcomleb: - case Intrinsic::x86_xop_vpcomlew: - case Intrinsic::x86_xop_vpcomled: - case Intrinsic::x86_xop_vpcomleq: - case Intrinsic::x86_xop_vpcomleub: - case Intrinsic::x86_xop_vpcomleuw: - case Intrinsic::x86_xop_vpcomleud: - case Intrinsic::x86_xop_vpcomleuq: - case Intrinsic::x86_xop_vpcomgtb: - case Intrinsic::x86_xop_vpcomgtw: - case Intrinsic::x86_xop_vpcomgtd: - case Intrinsic::x86_xop_vpcomgtq: - case Intrinsic::x86_xop_vpcomgtub: - case Intrinsic::x86_xop_vpcomgtuw: - case Intrinsic::x86_xop_vpcomgtud: - case Intrinsic::x86_xop_vpcomgtuq: - case Intrinsic::x86_xop_vpcomgeb: - case Intrinsic::x86_xop_vpcomgew: - case Intrinsic::x86_xop_vpcomged: - case Intrinsic::x86_xop_vpcomgeq: - case Intrinsic::x86_xop_vpcomgeub: - case Intrinsic::x86_xop_vpcomgeuw: - case Intrinsic::x86_xop_vpcomgeud: - case Intrinsic::x86_xop_vpcomgeuq: - case Intrinsic::x86_xop_vpcomeqb: - case Intrinsic::x86_xop_vpcomeqw: - case Intrinsic::x86_xop_vpcomeqd: - case Intrinsic::x86_xop_vpcomeqq: - case Intrinsic::x86_xop_vpcomequb: - case Intrinsic::x86_xop_vpcomequw: - case Intrinsic::x86_xop_vpcomequd: - case Intrinsic::x86_xop_vpcomequq: - case Intrinsic::x86_xop_vpcomneb: - case Intrinsic::x86_xop_vpcomnew: - case Intrinsic::x86_xop_vpcomned: - case Intrinsic::x86_xop_vpcomneq: - case Intrinsic::x86_xop_vpcomneub: - case Intrinsic::x86_xop_vpcomneuw: - case Intrinsic::x86_xop_vpcomneud: - case Intrinsic::x86_xop_vpcomneuq: - case Intrinsic::x86_xop_vpcomfalseb: - case Intrinsic::x86_xop_vpcomfalsew: - case Intrinsic::x86_xop_vpcomfalsed: - case Intrinsic::x86_xop_vpcomfalseq: - case Intrinsic::x86_xop_vpcomfalseub: - case Intrinsic::x86_xop_vpcomfalseuw: - case Intrinsic::x86_xop_vpcomfalseud: - case Intrinsic::x86_xop_vpcomfalseuq: - case Intrinsic::x86_xop_vpcomtrueb: - case Intrinsic::x86_xop_vpcomtruew: - case Intrinsic::x86_xop_vpcomtrued: - case Intrinsic::x86_xop_vpcomtrueq: - case Intrinsic::x86_xop_vpcomtrueub: - case Intrinsic::x86_xop_vpcomtrueuw: - case Intrinsic::x86_xop_vpcomtrueud: - case Intrinsic::x86_xop_vpcomtrueuq: { - unsigned CC = 0; - unsigned Opc = 0; - - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
- case Intrinsic::x86_xop_vpcomltb: - case Intrinsic::x86_xop_vpcomltw: - case Intrinsic::x86_xop_vpcomltd: - case Intrinsic::x86_xop_vpcomltq: - CC = 0; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomltub: - case Intrinsic::x86_xop_vpcomltuw: - case Intrinsic::x86_xop_vpcomltud: - case Intrinsic::x86_xop_vpcomltuq: - CC = 0; - Opc = X86ISD::VPCOMU; - break; - case Intrinsic::x86_xop_vpcomleb: - case Intrinsic::x86_xop_vpcomlew: - case Intrinsic::x86_xop_vpcomled: - case Intrinsic::x86_xop_vpcomleq: - CC = 1; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomleub: - case Intrinsic::x86_xop_vpcomleuw: - case Intrinsic::x86_xop_vpcomleud: - case Intrinsic::x86_xop_vpcomleuq: - CC = 1; - Opc = X86ISD::VPCOMU; - break; - case Intrinsic::x86_xop_vpcomgtb: - case Intrinsic::x86_xop_vpcomgtw: - case Intrinsic::x86_xop_vpcomgtd: - case Intrinsic::x86_xop_vpcomgtq: - CC = 2; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomgtub: - case Intrinsic::x86_xop_vpcomgtuw: - case Intrinsic::x86_xop_vpcomgtud: - case Intrinsic::x86_xop_vpcomgtuq: - CC = 2; - Opc = X86ISD::VPCOMU; - break; - case Intrinsic::x86_xop_vpcomgeb: - case Intrinsic::x86_xop_vpcomgew: - case Intrinsic::x86_xop_vpcomged: - case Intrinsic::x86_xop_vpcomgeq: - CC = 3; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomgeub: - case Intrinsic::x86_xop_vpcomgeuw: - case Intrinsic::x86_xop_vpcomgeud: - case Intrinsic::x86_xop_vpcomgeuq: - CC = 3; - Opc = X86ISD::VPCOMU; - break; - case Intrinsic::x86_xop_vpcomeqb: - case Intrinsic::x86_xop_vpcomeqw: - case Intrinsic::x86_xop_vpcomeqd: - case Intrinsic::x86_xop_vpcomeqq: - CC = 4; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomequb: - case Intrinsic::x86_xop_vpcomequw: - case Intrinsic::x86_xop_vpcomequd: - case Intrinsic::x86_xop_vpcomequq: - CC = 4; - Opc = X86ISD::VPCOMU; - break; - case Intrinsic::x86_xop_vpcomneb: - case Intrinsic::x86_xop_vpcomnew: - case Intrinsic::x86_xop_vpcomned: - case Intrinsic::x86_xop_vpcomneq: - CC = 5; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomneub: - case Intrinsic::x86_xop_vpcomneuw: - case Intrinsic::x86_xop_vpcomneud: - case Intrinsic::x86_xop_vpcomneuq: - CC = 5; - Opc = X86ISD::VPCOMU; - break; - case Intrinsic::x86_xop_vpcomfalseb: - case Intrinsic::x86_xop_vpcomfalsew: - case Intrinsic::x86_xop_vpcomfalsed: - case Intrinsic::x86_xop_vpcomfalseq: - CC = 6; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomfalseub: - case Intrinsic::x86_xop_vpcomfalseuw: - case Intrinsic::x86_xop_vpcomfalseud: - case Intrinsic::x86_xop_vpcomfalseuq: - CC = 6; - Opc = X86ISD::VPCOMU; - break; - case Intrinsic::x86_xop_vpcomtrueb: - case Intrinsic::x86_xop_vpcomtruew: - case Intrinsic::x86_xop_vpcomtrued: - case Intrinsic::x86_xop_vpcomtrueq: - CC = 7; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomtrueub: - case Intrinsic::x86_xop_vpcomtrueuw: - case Intrinsic::x86_xop_vpcomtrueud: - case Intrinsic::x86_xop_vpcomtrueuq: - CC = 7; - Opc = X86ISD::VPCOMU; - break; - } - - SDValue LHS = Op.getOperand(1); - SDValue RHS = Op.getOperand(2); - return DAG.getNode(Opc, dl, Op.getValueType(), LHS, RHS, - DAG.getConstant(CC, MVT::i8)); - } - // Arithmetic intrinsics. 
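The block deleted above spelled out one mapping sixty-four times: every XOP vpcom intrinsic lowers to VPCOM (signed) or VPCOMU (the ...u families) with a 3-bit condition immediate. A sketch of the encoding it enumerated (names illustrative, not LLVM's):

enum VPComCC {        // 3-bit XOP comparison immediate
  VPCOM_LT    = 0,
  VPCOM_LE    = 1,
  VPCOM_GT    = 2,
  VPCOM_GE    = 3,
  VPCOM_EQ    = 4,
  VPCOM_NE    = 5,
  VPCOM_FALSE = 6,    // always-false comparison
  VPCOM_TRUE  = 7     // always-true comparison
};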
case Intrinsic::x86_sse2_pmulu_dq: case Intrinsic::x86_avx2_pmulu_dq: @@ -9770,6 +9821,38 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const } } +SDValue +X86TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + switch (IntNo) { + default: return SDValue(); // Don't custom lower most intrinsics. + + // RDRAND intrinsics. + case Intrinsic::x86_rdrand_16: + case Intrinsic::x86_rdrand_32: + case Intrinsic::x86_rdrand_64: { + // Emit the node with the right value type. + SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); + SDValue Result = DAG.getNode(X86ISD::RDRAND, dl, VTs, Op.getOperand(0)); + + // If the value returned by RDRAND was valid (CF=1), return 1. Otherwise + // return the value from Rand, which is always 0, casted to i32. + SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), + DAG.getConstant(1, Op->getValueType(1)), + DAG.getConstant(X86::COND_B, MVT::i32), + SDValue(Result.getNode(), 1) }; + SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, + DAG.getVTList(Op->getValueType(1), MVT::Glue), + Ops, 4); + + // Return { result, isValid, chain }. + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, + SDValue(Result.getNode(), 2)); + } + } +} + SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); @@ -9817,7 +9900,6 @@ SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, } SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); SDValue Chain = Op.getOperand(0); SDValue Offset = Op.getOperand(1); SDValue Handler = Op.getOperand(2); @@ -9834,7 +9916,6 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), false, false, 0); Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); - MF.getRegInfo().addLiveOut(StoreAddrReg); return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, @@ -10153,20 +10234,18 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { assert(VT.getSizeInBits() == 256 && VT.isInteger() && "Unsupported value type for operation"); - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); DebugLoc dl = Op.getDebugLoc(); - SDValue Idx0 = DAG.getConstant(0, MVT::i32); - SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); - SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); - SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); + SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); + SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); // Extract the RHS vectors SDValue RHS = Op.getOperand(1); - SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl); - SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl); + SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); + SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); MVT EltVT = VT.getVectorElementType().getSimpleVT(); EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); @@ -10311,6 +10390,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; } + llvm_unreachable("Unknown shift opcode."); } if 
(Subtarget->hasAVX2() && VT == MVT::v32i8) { @@ -10354,6 +10434,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; } + llvm_unreachable("Unknown shift opcode."); } } } @@ -10428,9 +10509,8 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); // Extract the two vectors - SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl); - SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32), - DAG, dl); + SDValue V1 = Extract128BitVector(R, 0, DAG, dl); + SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl); // Recreate the shift amount vectors SDValue Amt1, Amt2; @@ -10449,9 +10529,8 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { &Amt2Csts[0], NumElems/2); } else { // Variable shift amount - Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl); - Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32), - DAG, dl); + Amt1 = Extract128BitVector(Amt, 0, DAG, dl); + Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl); } // Issue new vector shifts for the smaller types @@ -10561,20 +10640,18 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, return SDValue(); if (!Subtarget->hasAVX2()) { // needs to be split - int NumElems = VT.getVectorNumElements(); - SDValue Idx0 = DAG.getConstant(0, MVT::i32); - SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); + unsigned NumElems = VT.getVectorNumElements(); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); - SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); - SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); + SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); + SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); MVT EltVT = VT.getVectorElementType().getSimpleVT(); EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); EVT ExtraEltVT = ExtraVT.getVectorElementType(); - int ExtraNumElems = ExtraVT.getVectorNumElements(); + unsigned ExtraNumElems = ExtraVT.getVectorNumElements(); ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT, ExtraNumElems/2); SDValue Extra = DAG.getValueType(ExtraVT); @@ -10860,6 +10937,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::FRAME_TO_ARGS_OFFSET: @@ -11119,10 +11197,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; case X86ISD::FRCP: return "X86ISD::FRCP"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; + case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; + case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; @@ -11191,6 +11271,8 @@ const char 
*X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL"; + case X86ISD::SAHF: return "X86ISD::SAHF"; + case X86ISD::RDRAND: return "X86ISD::RDRAND"; } } @@ -11259,6 +11341,15 @@ bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { return true; } +bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { + return Imm == (int32_t)Imm; +} + +bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { + // Can also use sub to handle negated immediates. + return Imm == (int32_t)Imm; +} + bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { if (!VT1.isInteger() || !VT2.isInteger()) return false; @@ -11301,8 +11392,8 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, isMOVLMask(M, VT) || isSHUFPMask(M, VT, Subtarget->hasAVX()) || isPSHUFDMask(M, VT) || - isPSHUFHWMask(M, VT) || - isPSHUFLWMask(M, VT) || + isPSHUFHWMask(M, VT, Subtarget->hasAVX2()) || + isPSHUFLWMask(M, VT, Subtarget->hasAVX2()) || isPALIGNRMask(M, VT, Subtarget) || isUNPCKLMask(M, VT, Subtarget->hasAVX2()) || isUNPCKHMask(M, VT, Subtarget->hasAVX2()) || @@ -11461,7 +11552,7 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, // result in out1, out2 // fallthrough -->nextMBB - const TargetRegisterClass *RC = X86::GR32RegisterClass; + const TargetRegisterClass *RC = &X86::GR32RegClass; const unsigned LoadOpc = X86::MOV32rm; const unsigned NotOpc = X86::NOT32r; const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); @@ -11663,7 +11754,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] int valArgIndx = lastAddrIndx + 1; - unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); + unsigned t1 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass); MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); for (int i=0; i <= lastAddrIndx; ++i) (*MIB).addOperand(*argOpers[i]); @@ -11673,7 +11764,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, argOpers[valArgIndx]->isImm()) && "invalid operand"); - unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); + unsigned t2 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass); if (argOpers[valArgIndx]->isReg()) MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2); else @@ -11688,7 +11779,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, MIB.addReg(t2); // Generate movc - unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); + unsigned t3 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass); MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); MIB.addReg(t2); MIB.addReg(t1); @@ -12307,8 +12398,9 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) - .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI) + .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) + .addReg(X86::RDI, RegState::Implicit) .addReg(X86::RAX, RegState::ImplicitDefine); } else { BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) @@ -12518,7 +12610,7 @@ 
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // Load the old value of the high byte of the control word... unsigned OldCW = - F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); + F->getRegInfo().createVirtualRegister(&X86::GR16RegClass); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), CWFrameIdx); @@ -12606,25 +12698,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, X86::AND32ri, X86::MOV32rm, X86::LCMPXCHG32, X86::NOT32r, X86::EAX, - X86::GR32RegisterClass); + &X86::GR32RegClass); case X86::ATOMOR32: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, X86::OR32ri, X86::MOV32rm, X86::LCMPXCHG32, X86::NOT32r, X86::EAX, - X86::GR32RegisterClass); + &X86::GR32RegClass); case X86::ATOMXOR32: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, X86::XOR32ri, X86::MOV32rm, X86::LCMPXCHG32, X86::NOT32r, X86::EAX, - X86::GR32RegisterClass); + &X86::GR32RegClass); case X86::ATOMNAND32: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, X86::AND32ri, X86::MOV32rm, X86::LCMPXCHG32, X86::NOT32r, X86::EAX, - X86::GR32RegisterClass, true); + &X86::GR32RegClass, true); case X86::ATOMMIN32: return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); case X86::ATOMMAX32: @@ -12639,25 +12731,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, X86::AND16ri, X86::MOV16rm, X86::LCMPXCHG16, X86::NOT16r, X86::AX, - X86::GR16RegisterClass); + &X86::GR16RegClass); case X86::ATOMOR16: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, X86::OR16ri, X86::MOV16rm, X86::LCMPXCHG16, X86::NOT16r, X86::AX, - X86::GR16RegisterClass); + &X86::GR16RegClass); case X86::ATOMXOR16: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, X86::XOR16ri, X86::MOV16rm, X86::LCMPXCHG16, X86::NOT16r, X86::AX, - X86::GR16RegisterClass); + &X86::GR16RegClass); case X86::ATOMNAND16: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, X86::AND16ri, X86::MOV16rm, X86::LCMPXCHG16, X86::NOT16r, X86::AX, - X86::GR16RegisterClass, true); + &X86::GR16RegClass, true); case X86::ATOMMIN16: return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); case X86::ATOMMAX16: @@ -12672,25 +12764,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, X86::AND8ri, X86::MOV8rm, X86::LCMPXCHG8, X86::NOT8r, X86::AL, - X86::GR8RegisterClass); + &X86::GR8RegClass); case X86::ATOMOR8: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, X86::OR8ri, X86::MOV8rm, X86::LCMPXCHG8, X86::NOT8r, X86::AL, - X86::GR8RegisterClass); + &X86::GR8RegClass); case X86::ATOMXOR8: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, X86::XOR8ri, X86::MOV8rm, X86::LCMPXCHG8, X86::NOT8r, X86::AL, - X86::GR8RegisterClass); + &X86::GR8RegClass); case X86::ATOMNAND8: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, X86::AND8ri, X86::MOV8rm, X86::LCMPXCHG8, X86::NOT8r, X86::AL, - X86::GR8RegisterClass, true); + &X86::GR8RegClass, true); // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. // This group is for 64-bit host. 
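The ATOM* pseudo-instruction expansions being retargeted to &X86::GR*RegClass here all share one shape: the custom inserter builds a load, applies the bitwise operation (plus a NOT for the NAND forms), and retries through LCMPXCHG until the compare-exchange succeeds. A hedged C++ analogy of that loop, with std::atomic standing in for the MachineIR and atomicNand as our own name:

#include <atomic>
#include <cstdint>

// Analogy only, not the MachineIR itself: atomic NAND via a
// compare-exchange retry loop, mirroring load -> AND+NOT -> LCMPXCHG.
static uint32_t atomicNand(std::atomic<uint32_t> &mem, uint32_t val) {
  uint32_t old = mem.load();
  while (!mem.compare_exchange_weak(old, ~(old & val)))
    ; // on failure, 'old' is refreshed, like reloading EAX before retry
  return old;
}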
case X86::ATOMAND64: @@ -12698,25 +12790,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, X86::AND64ri32, X86::MOV64rm, X86::LCMPXCHG64, X86::NOT64r, X86::RAX, - X86::GR64RegisterClass); + &X86::GR64RegClass); case X86::ATOMOR64: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, X86::OR64ri32, X86::MOV64rm, X86::LCMPXCHG64, X86::NOT64r, X86::RAX, - X86::GR64RegisterClass); + &X86::GR64RegClass); case X86::ATOMXOR64: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, X86::XOR64ri32, X86::MOV64rm, X86::LCMPXCHG64, X86::NOT64r, X86::RAX, - X86::GR64RegisterClass); + &X86::GR64RegClass); case X86::ATOMNAND64: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, X86::AND64ri32, X86::MOV64rm, X86::LCMPXCHG64, X86::NOT64r, X86::RAX, - X86::GR64RegisterClass, true); + &X86::GR64RegClass, true); case X86::ATOMMIN64: return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); case X86::ATOMMAX64: @@ -12871,10 +12963,10 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N, /// inserting the result into the low part of a new 256-bit vector static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> - for (int i = 0, j = NumElems/2; i < NumElems/2; ++i, ++j) + for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j) if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || SVOp->getMaskElt(j) >= 0) return false; @@ -12887,10 +12979,10 @@ static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { /// inserting the result into the high part of a new 256-bit vector static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> - for (int i = NumElems/2, j = 0; i < NumElems; ++i, ++j) + for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j) if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || SVOp->getMaskElt(j) >= 0) return false; @@ -12907,7 +12999,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); if (V1.getOpcode() == ISD::CONCAT_VECTORS && V2.getOpcode() == ISD::CONCAT_VECTORS) { @@ -12932,30 +13024,31 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, // To match the shuffle mask, the first half of the mask should // be exactly the first vector, and all the rest a splat with the // first element of the second one. - for (int i = 0; i < NumElems/2; ++i) + for (unsigned i = 0; i != NumElems/2; ++i) if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) return SDValue(); // If V1 is coming from a vector load then just fold to a VZEXT_LOAD. 
    if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
-      SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
-      SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
-      SDValue ResNode =
-        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
-                                Ld->getMemoryVT(),
-                                Ld->getPointerInfo(),
-                                Ld->getAlignment(),
-                                false/*isVolatile*/, true/*ReadMem*/,
-                                false/*WriteMem*/);
-      return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
+      if (Ld->hasNUsesOfValue(1, 0)) {
+        SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
+        SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
+        SDValue ResNode =
+          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
+                                  Ld->getMemoryVT(),
+                                  Ld->getPointerInfo(),
+                                  Ld->getAlignment(),
+                                  false/*isVolatile*/, true/*ReadMem*/,
+                                  false/*WriteMem*/);
+        return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
+      }
    }

    // Emit a zeroed vector and insert the desired subvector on its
    // first half.
    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
-    SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0),
-                         DAG.getConstant(0, MVT::i32), DAG, dl);
+    SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
    return DCI.CombineTo(N, InsV);
  }

@@ -12965,18 +13058,15 @@

  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
  if (isShuffleHigh128VectorInsertLow(SVOp)) {
-    SDValue V = Extract128BitVector(V1, DAG.getConstant(NumElems/2, MVT::i32),
-                                    DAG, dl);
-    SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
-                                      V, DAG.getConstant(0, MVT::i32), DAG, dl);
+    SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
+    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
    return DCI.CombineTo(N, InsV);
  }

  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
  if (isShuffleLow128VectorInsertHigh(SVOp)) {
-    SDValue V = Extract128BitVector(V1, DAG.getConstant(0, MVT::i32), DAG, dl);
-    SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
-                                      V, DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+    SDValue V = Extract128BitVector(V1, 0, DAG, dl);
+    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
    return DCI.CombineTo(N, InsV);
  }

@@ -13015,7 +13105,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
 }

-/// PerformTruncateCombine - Converts truncate operation to
+/// DCI, PerformTruncateCombine - Converts truncate operation to
/// It is possible when we truncate 256-bit vector to 128-bit vector @@ -13024,7 +13114,8 @@ SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, if (!DCI.isBeforeLegalizeOps()) return SDValue(); - if (!Subtarget->hasAVX()) return SDValue(); + if (!Subtarget->hasAVX()) + return SDValue(); EVT VT = N->getValueType(0); SDValue Op = N->getOperand(0); @@ -13033,55 +13124,102 @@ SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) { + if (Subtarget->hasAVX2()) { + // AVX2: v4i64 -> v4i32 + + // VPERMD + static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; + + Op = DAG.getNode(ISD::BITCAST, dl, MVT::v8i32, Op); + Op = DAG.getVectorShuffle(MVT::v8i32, dl, Op, DAG.getUNDEF(MVT::v8i32), + ShufMask); + + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op, + DAG.getIntPtrConstant(0)); + } + + // AVX: v4i64 -> v4i32 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op, - DAG.getIntPtrConstant(2)); + DAG.getIntPtrConstant(2)); OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo); OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi); // PSHUFD - int ShufMask1[] = {0, 2, 0, 0}; + static const int ShufMask1[] = {0, 2, 0, 0}; - OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT), - ShufMask1); - OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT), - ShufMask1); + OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT), ShufMask1); + OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT), ShufMask1); // MOVLHPS - int ShufMask2[] = {0, 1, 4, 5}; + static const int ShufMask2[] = {0, 1, 4, 5}; return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2); } + if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) { + if (Subtarget->hasAVX2()) { + // AVX2: v8i32 -> v8i16 + + Op = DAG.getNode(ISD::BITCAST, dl, MVT::v32i8, Op); + + // PSHUFB + SmallVector<SDValue,32> pshufbMask; + for (unsigned i = 0; i < 2; ++i) { + pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8)); + for (unsigned j = 0; j < 8; ++j) + pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); + } + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v32i8, + &pshufbMask[0], 32); + Op = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, Op, BV); + + Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i64, Op); + + static const int ShufMask[] = {0, 2, -1, -1}; + Op = DAG.getVectorShuffle(MVT::v4i64, dl, Op, DAG.getUNDEF(MVT::v4i64), + &ShufMask[0]); + + Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op, + DAG.getIntPtrConstant(0)); + + return DAG.getNode(ISD::BITCAST, dl, VT, Op); + } + SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op, - DAG.getIntPtrConstant(4)); + DAG.getIntPtrConstant(4)); OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo); OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi); // PSHUFB - int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, - 
-1, -1, -1, -1, -1, -1, -1, -1}; + static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, + -1, -1, -1, -1, -1, -1, -1, -1}; - OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, - DAG.getUNDEF(MVT::v16i8), + OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, DAG.getUNDEF(MVT::v16i8), ShufMask1); - OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, - DAG.getUNDEF(MVT::v16i8), + OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, DAG.getUNDEF(MVT::v16i8), ShufMask1); OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo); OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi); // MOVLHPS - int ShufMask2[] = {0, 1, 4, 5}; + static const int ShufMask2[] = {0, 1, 4, 5}; SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2); return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res); @@ -13128,7 +13266,8 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, SmallVector<int, 16> ShuffleMask; bool UnaryShuffle; - if (!getTargetShuffleMask(InVec.getNode(), VT, ShuffleMask, UnaryShuffle)) + if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask, + UnaryShuffle)) return SDValue(); // Select the input vector, guarding against out of range extract vector. @@ -13277,8 +13416,6 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { - - DebugLoc DL = N->getDebugLoc(); SDValue Cond = N->getOperand(0); // Get the LHS/RHS of the select. @@ -13560,9 +13697,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // to simplify previous instructions. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && - !DCI.isBeforeLegalize() && - TLI.isOperationLegal(ISD::VSELECT, VT)) { + !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) { unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); + + // Don't optimize vector selects that map to mask-registers. + if (BitWidth == 1) + return SDValue(); + assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); @@ -14261,6 +14402,41 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Generate NEG and CMOV for integer abs. +static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + // Since X86 does not have CMOV for 8-bit integer, we don't convert + // 8-bit integer abs to NEG and CMOV. + if (VT.isInteger() && VT.getSizeInBits() == 8) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + + // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) + // and change it to SUB and CMOV. + if (VT.isInteger() && N->getOpcode() == ISD::XOR && + N0.getOpcode() == ISD::ADD && + N0.getOperand(1) == N1 && + N1.getOpcode() == ISD::SRA && + N1.getOperand(0) == N0.getOperand(0)) + if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1))) + if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) { + // Generate SUB & CMOV. 
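Behind the PerformTruncateCombine changes a few hunks up: with AVX2, a v4i64-to-v4i32 truncate becomes a single VPERMD that gathers the even 32-bit lanes {0, 2, 4, 6}, followed by taking the low 128 bits; without AVX2 it falls back to the PSHUFD/PSHUFB plus MOVLHPS sequences shown in the diff. A sketch with AVX2 intrinsics (assumes -mavx2; the function name is ours, not LLVM's):

#include <immintrin.h>

// What the AVX2 v4i64 -> v4i32 path amounts to: one VPERMD collecting
// the low dword of each qword (dwords 0, 2, 4, 6), then the low half.
static __m128i truncV4i64ToV4i32(__m256i v) {
  const __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0);
  __m256i perm = _mm256_permutevar8x32_epi32(v, idx);
  return _mm256_castsi256_si128(perm);
}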
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), + DAG.getConstant(0, VT), N0.getOperand(0)); + + SDValue Ops[] = { N0.getOperand(0), Neg, + DAG.getConstant(X86::COND_GE, MVT::i8), + SDValue(Neg.getNode(), 1) }; + return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), + Ops, array_lengthof(Ops)); + } + return SDValue(); +} + // PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -14268,6 +14444,16 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); + if (Subtarget->hasCMov()) { + SDValue RV = performIntegerAbsCombine(N, DAG); + if (RV.getNode()) + return RV; + } + + // Try forming BMI if it is available. + if (!Subtarget->hasBMI()) + return SDValue(); + EVT VT = N->getValueType(0); if (VT != MVT::i32 && VT != MVT::i64) @@ -14293,7 +14479,8 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { LoadSDNode *Ld = cast<LoadSDNode>(N); EVT RegVT = Ld->getValueType(0); EVT MemVT = Ld->getMemoryVT(); @@ -14315,63 +14502,94 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, unsigned RegSz = RegVT.getSizeInBits(); unsigned MemSz = MemVT.getSizeInBits(); assert(RegSz > MemSz && "Register size must be greater than the mem size"); - // All sizes must be a power of two - if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue(); - // Attempt to load the original value using a single load op. - // Find a scalar type which is equal to the loaded word size. + // All sizes must be a power of two. + if (!isPowerOf2_32(RegSz * MemSz * NumElems)) + return SDValue(); + + // Attempt to load the original value using scalar loads. + // Find the largest scalar type that divides the total loaded size. MVT SclrLoadTy = MVT::i8; for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { MVT Tp = (MVT::SimpleValueType)tp; - if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() == MemSz) { + if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { SclrLoadTy = Tp; - break; } } - // Proceed if a load word is found. - if (SclrLoadTy.getSizeInBits() != MemSz) return SDValue(); + // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. + if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && + (64 <= MemSz)) + SclrLoadTy = MVT::f64; + // Calculate the number of scalar loads that we need to perform + // in order to load our vector from memory. + unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); + + // Represent our vector as a sequence of elements which are the + // largest scalar that we can load. EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy, RegSz/SclrLoadTy.getSizeInBits()); + // Represent the data using the same element type that is stored in + // memory. In practice, we ''widen'' MemVT. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), RegSz/MemVT.getScalarType().getSizeInBits()); - // Can't shuffle using an illegal type. - if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); - // Perform a single load. 
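The performIntegerAbsCombine completed above matches the branch-free absolute-value idiom, y = x >> 31 (arithmetic shift), abs = (x + y) ^ y, and rewrites it as a subtract from zero plus a conditional move. A scalar sketch of both forms (hypothetical helper names; INT_MIN wraps to itself in practice, though signed overflow is formally UB in C++):

#include <cstdint>

// The pattern matched by the combine: XOR(ADD(X, Y), Y) with
// Y = SRA(X, 31), i.e. the sign mask.
static int32_t absViaXorAdd(int32_t x) {
  int32_t y = x >> 31;   // 0 for non-negative x, -1 for negative x
  return (x + y) ^ y;    // x >= 0 ? x : -x
}

// What the combine emits instead: SUB(0, x) setting flags, then CMOVGE.
static int32_t absViaNegCmov(int32_t x) {
  int32_t neg = 0 - x;       // the NEG; sets flags on real hardware
  return x >= 0 ? x : neg;   // the CMOV select
}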
- SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), - Ld->getBasePtr(), - Ld->getPointerInfo(), Ld->isVolatile(), - Ld->isNonTemporal(), Ld->isInvariant(), - Ld->getAlignment()); + assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && + "Invalid vector type"); - // Insert the word loaded into a vector. - SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, - LoadUnitVecVT, ScalarLoad); + // We can't shuffle using an illegal type. + if (!TLI.isTypeLegal(WideVecVT)) + return SDValue(); + + SmallVector<SDValue, 8> Chains; + SDValue Ptr = Ld->getBasePtr(); + SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8, + TLI.getPointerTy()); + SDValue Res = DAG.getUNDEF(LoadUnitVecVT); + + for (unsigned i = 0; i < NumLoads; ++i) { + // Perform a single load. + SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), + Ptr, Ld->getPointerInfo(), + Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), Ld->getAlignment()); + Chains.push_back(ScalarLoad.getValue(1)); + // Create the first element type using SCALAR_TO_VECTOR in order to avoid + // another round of DAGCombining. + if (i == 0) + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); + else + Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, + ScalarLoad, DAG.getIntPtrConstant(i)); + + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + } + + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], + Chains.size()); // Bitcast the loaded value to a vector of the original element type, in // the size of the target vector type. - SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, - ScalarInVector); + SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res); unsigned SizeRatio = RegSz/MemSz; // Redistribute the loaded elements into the different locations. SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i < NumElems; i++) ShuffleVec[i*SizeRatio] = i; + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i*SizeRatio] = i; SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, - DAG.getUNDEF(SlicedVec.getValueType()), - ShuffleVec.data()); + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); // Bitcast to the requested type. Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); // Replace the original load with the new sequence // and return the new chain. - DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Shuff); - return SDValue(ScalarLoad.getNode(), 1); + return DCI.CombineTo(N, Shuff, TF, true); } return SDValue(); @@ -14388,13 +14606,12 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If we are saving a concatenation of two XMM registers, perform two stores. - // This is better in Sandy Bridge cause one 256-bit mem op is done via two - // 128-bit ones. If in the future the cost becomes only one memory access the - // first version would be better. - if (VT.getSizeInBits() == 256 && - StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS && - StoredVal.getNumOperands() == 2) { - + // On Sandy Bridge, 256-bit memory operations are executed by two + // 128-bit ports. However, on Haswell it is better to issue a single 256-bit + // memory operation. 
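On the load-combine rewrite above: instead of requiring one legal scalar of exactly MemSz bits, it now issues NumLoads = MemSz / SclrLoadTy-bits chained scalar loads, using the widest legal type that divides the memory footprint (falling back to f64 where i64 is not legal on 32-bit targets). The chunking arithmetic, as a hedged C++ model (assumes memBytes is a multiple of eight):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Model of the chunking: cover memBytes with the widest "legal" scalar
// (uint64_t stands in for SclrLoadTy), one load per chunk, chained in
// order like the TokenFactor of the real combine.
static void loadInChunks(void *dst, const void *src, size_t memBytes) {
  const size_t chunk = sizeof(uint64_t);
  const size_t numLoads = memBytes / chunk;   // NumLoads in the combine
  for (size_t i = 0; i != numLoads; ++i)
    std::memcpy(static_cast<char *>(dst) + i * chunk,
                static_cast<const char *>(src) + i * chunk, chunk);
}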
+ if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2() && + StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS && + StoredVal.getNumOperands() == 2) { SDValue Value0 = StoredVal.getOperand(0); SDValue Value1 = StoredVal.getOperand(1); @@ -14439,14 +14656,16 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue()); SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio; + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; - // Can't shuffle using an illegal type - if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); + // Can't shuffle using an illegal type. + if (!TLI.isTypeLegal(WideVecVT)) + return SDValue(); SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, - DAG.getUNDEF(WideVec.getValueType()), - ShuffleVec.data()); + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); // At this point all of the data is stored at the bottom of the // register. We now need to save it to mem. @@ -14455,13 +14674,18 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { MVT Tp = (MVT::SimpleValueType)tp; - if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz) + if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) StoreType = Tp; } + // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. + if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && + (64 <= NumElems * ToSz)) + StoreType = MVT::f64; + // Bitcast the original vector into a vector of store-size units EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), - StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); + StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff); SmallVector<SDValue, 8> Chains; @@ -14470,7 +14694,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue Ptr = St->getBasePtr(); // Perform one or more big stores into memory. 
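The store combine above now keeps a single 256-bit store on AVX2 parts and splits only for plain AVX, where Sandy Bridge-class hardware executes a 256-bit store as two 128-bit operations anyway. Both shapes, sketched with intrinsics (assumes AVX/AVX2 headers and at least 32 writable bytes at p):

#include <immintrin.h>

// Pre-AVX2 shape: the CONCAT_VECTORS store is split into two halves.
static void store256Split(int *p, __m128i lo, __m128i hi) {
  _mm_storeu_si128(reinterpret_cast<__m128i *>(p), lo);
  _mm_storeu_si128(reinterpret_cast<__m128i *>(p + 4), hi);
}

// AVX2/Haswell shape: the combine bails out and one 256-bit store stays.
static void store256Whole(int *p, __m256i v) {
  _mm256_storeu_si256(reinterpret_cast<__m256i *>(p), v);
}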
- for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) { + for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StoreType, ShuffWide, DAG.getIntPtrConstant(i)); @@ -14819,18 +15043,9 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, if (!DCI.isBeforeLegalizeOps()) return SDValue(); - if (!Subtarget->hasAVX()) + if (!Subtarget->hasAVX()) return SDValue(); - // Optimize vectors in AVX mode - // Sign extend v8i16 to v8i32 and - // v4i32 to v4i64 - // - // Divide input vector into two parts - // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} - // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 - // concat the vectors to original VT - EVT VT = N->getValueType(0); SDValue Op = N->getOperand(0); EVT OpVT = Op.getValueType(); @@ -14839,23 +15054,37 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) || (VT == MVT::v8i32 && OpVT == MVT::v8i16)) { + if (Subtarget->hasAVX2()) + return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, Op); + + // Optimize vectors in AVX mode + // Sign extend v8i16 to v8i32 and + // v4i32 to v4i64 + // + // Divide input vector into two parts + // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} + // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 + // concat the vectors to original VT + unsigned NumElems = OpVT.getVectorNumElements(); SmallVector<int,8> ShufMask1(NumElems, -1); - for (unsigned i = 0; i < NumElems/2; i++) ShufMask1[i] = i; + for (unsigned i = 0; i != NumElems/2; ++i) + ShufMask1[i] = i; SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT), - ShufMask1.data()); + &ShufMask1[0]); SmallVector<int,8> ShufMask2(NumElems, -1); - for (unsigned i = 0; i < NumElems/2; i++) ShufMask2[i] = i + NumElems/2; + for (unsigned i = 0; i != NumElems/2; ++i) + ShufMask2[i] = i + NumElems/2; SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT), - ShufMask2.data()); + &ShufMask2[0]); - EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), VT.getVectorNumElements()/2); - OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); + OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); @@ -14864,6 +15093,7 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, } static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> // (and (i32 x86isd::setcc_carry), 1) @@ -14888,6 +15118,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, N00.getOperand(0), N00.getOperand(1)), DAG.getConstant(1, VT)); } + // Optimize vectors in AVX mode: // // v8i16 -> v8i32 @@ -14900,26 +15131,57 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. // Concat upper and lower parts. 
// - if (Subtarget->hasAVX()) { + if (!DCI.isBeforeLegalizeOps()) + return SDValue(); + + if (!Subtarget->hasAVX()) + return SDValue(); - if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) || + if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) || ((VT == MVT::v4i64) && (OpVT == MVT::v4i32))) { - SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl); - SDValue OpLo = getTargetShuffleNode(X86ISD::UNPCKL, dl, OpVT, N0, ZeroVec, DAG); - SDValue OpHi = getTargetShuffleNode(X86ISD::UNPCKH, dl, OpVT, N0, ZeroVec, DAG); + if (Subtarget->hasAVX2()) + return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, N0); - EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - VT.getVectorNumElements()/2); + SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl); + SDValue OpLo = getUnpackl(DAG, dl, OpVT, N0, ZeroVec); + SDValue OpHi = getUnpackh(DAG, dl, OpVT, N0, ZeroVec); - OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo); - OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi); + EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + VT.getVectorNumElements()/2); - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); - } + OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo); + OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } + return SDValue(); +} +// Optimize x == -y --> x+y == 0 +// x != -y --> x+y != 0 +static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) { + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0))) + if (C->getAPIntValue() == 0 && LHS.hasOneUse()) { + SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(), + LHS.getValueType(), RHS, LHS.getOperand(1)); + return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0), + addV, DAG.getConstant(0, addV.getValueType()), CC); + } + if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0))) + if (C->getAPIntValue() == 0 && RHS.hasOneUse()) { + SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(), + RHS.getValueType(), LHS, RHS.getOperand(1)); + return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0), + addV, DAG.getConstant(0, addV.getValueType()), CC); + } return SDValue(); } @@ -14941,9 +15203,36 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG) { + SDValue Op0 = N->getOperand(0); + EVT InVT = Op0->getValueType(0); + + // UINT_TO_FP(v4i8) -> SINT_TO_FP(ZEXT(v4i8 to v4i32)) + if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { + DebugLoc dl = N->getDebugLoc(); + MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; + SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); + // Notice that we use SINT_TO_FP because we know that the high bits + // are zero and SINT_TO_FP is better supported by the hardware. 
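The new PerformISDSETCCCombine above rewrites x == -y (and x != -y, where the negation appears as sub 0, y) into x + y == 0, so no register is needed to materialize the negation. In two's-complement arithmetic the two forms agree for every input; a scalar sketch (unsigned types keep the wraparound well defined):

#include <cstdint>

// Before the combine: a NEG to form -y, then a compare.
static bool eqNeg(uint32_t x, uint32_t y) { return x == (0u - y); }

// After the combine: one ADD and a test against zero; modular
// arithmetic makes this equivalent to the form above.
static bool eqNegCombined(uint32_t x, uint32_t y) { return x + y == 0u; }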
+ return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); + } + + return SDValue(); +} + static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86TargetLowering *XTLI) { SDValue Op0 = N->getOperand(0); + EVT InVT = Op0->getValueType(0); + + // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32)) + if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { + DebugLoc dl = N->getDebugLoc(); + MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; + SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); + return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); + } + // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have // a 32-bit target where SSE doesn't support i64->FP operations. if (Op0.getOpcode() == ISD::LOAD) { @@ -14962,6 +15251,20 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue PerformFP_TO_SINTCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + // v4i8 = FP_TO_SINT() -> v4i8 = TRUNCATE (V4i32 = FP_TO_SINT() + if (VT == MVT::v8i8 || VT == MVT::v4i8) { + DebugLoc dl = N->getDebugLoc(); + MVT DstVT = VT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; + SDValue I = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, N->getOperand(0)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, I); + } + + return SDValue(); +} + // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, X86TargetLowering::DAGCombinerInfo &DCI) { @@ -15096,9 +15399,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget); - case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget); + case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget); case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); + case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG); case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); + case ISD::FP_TO_SINT: return PerformFP_TO_SINTCombine(N, DAG); case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); case X86ISD::FXOR: @@ -15106,9 +15411,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FAND: return PerformFANDCombine(N, DAG); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); - case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, Subtarget); + case ISD::ANY_EXTEND: + case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI); + case ISD::SETCC: return PerformISDSETCCCombine(N, DAG); case X86ISD::SETCC: return PerformSETCCCombine(N, DAG); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::PALIGN: @@ -15653,55 +15960,55 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // in the normal allocation? case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 
if (Subtarget->is64Bit()) { - if (VT == MVT::i32 || VT == MVT::f32) - return std::make_pair(0U, X86::GR32RegisterClass); - else if (VT == MVT::i16) - return std::make_pair(0U, X86::GR16RegisterClass); - else if (VT == MVT::i8 || VT == MVT::i1) - return std::make_pair(0U, X86::GR8RegisterClass); - else if (VT == MVT::i64 || VT == MVT::f64) - return std::make_pair(0U, X86::GR64RegisterClass); - break; + if (VT == MVT::i32 || VT == MVT::f32) + return std::make_pair(0U, &X86::GR32RegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::GR16RegClass); + if (VT == MVT::i8 || VT == MVT::i1) + return std::make_pair(0U, &X86::GR8RegClass); + if (VT == MVT::i64 || VT == MVT::f64) + return std::make_pair(0U, &X86::GR64RegClass); + break; } // 32-bit fallthrough case 'Q': // Q_REGS if (VT == MVT::i32 || VT == MVT::f32) - return std::make_pair(0U, X86::GR32_ABCDRegisterClass); - else if (VT == MVT::i16) - return std::make_pair(0U, X86::GR16_ABCDRegisterClass); - else if (VT == MVT::i8 || VT == MVT::i1) - return std::make_pair(0U, X86::GR8_ABCD_LRegisterClass); - else if (VT == MVT::i64) - return std::make_pair(0U, X86::GR64_ABCDRegisterClass); + return std::make_pair(0U, &X86::GR32_ABCDRegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::GR16_ABCDRegClass); + if (VT == MVT::i8 || VT == MVT::i1) + return std::make_pair(0U, &X86::GR8_ABCD_LRegClass); + if (VT == MVT::i64) + return std::make_pair(0U, &X86::GR64_ABCDRegClass); break; case 'r': // GENERAL_REGS case 'l': // INDEX_REGS if (VT == MVT::i8 || VT == MVT::i1) - return std::make_pair(0U, X86::GR8RegisterClass); + return std::make_pair(0U, &X86::GR8RegClass); if (VT == MVT::i16) - return std::make_pair(0U, X86::GR16RegisterClass); + return std::make_pair(0U, &X86::GR16RegClass); if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit()) - return std::make_pair(0U, X86::GR32RegisterClass); - return std::make_pair(0U, X86::GR64RegisterClass); + return std::make_pair(0U, &X86::GR32RegClass); + return std::make_pair(0U, &X86::GR64RegClass); case 'R': // LEGACY_REGS if (VT == MVT::i8 || VT == MVT::i1) - return std::make_pair(0U, X86::GR8_NOREXRegisterClass); + return std::make_pair(0U, &X86::GR8_NOREXRegClass); if (VT == MVT::i16) - return std::make_pair(0U, X86::GR16_NOREXRegisterClass); + return std::make_pair(0U, &X86::GR16_NOREXRegClass); if (VT == MVT::i32 || !Subtarget->is64Bit()) - return std::make_pair(0U, X86::GR32_NOREXRegisterClass); - return std::make_pair(0U, X86::GR64_NOREXRegisterClass); + return std::make_pair(0U, &X86::GR32_NOREXRegClass); + return std::make_pair(0U, &X86::GR64_NOREXRegClass); case 'f': // FP Stack registers. // If SSE is enabled for this VT, use f80 to ensure the isel moves the // value to the correct fpstack register class. if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) - return std::make_pair(0U, X86::RFP32RegisterClass); + return std::make_pair(0U, &X86::RFP32RegClass); if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) - return std::make_pair(0U, X86::RFP64RegisterClass); - return std::make_pair(0U, X86::RFP80RegisterClass); + return std::make_pair(0U, &X86::RFP64RegClass); + return std::make_pair(0U, &X86::RFP80RegClass); case 'y': // MMX_REGS if MMX allowed. if (!Subtarget->hasMMX()) break; - return std::make_pair(0U, X86::VR64RegisterClass); + return std::make_pair(0U, &X86::VR64RegClass); case 'Y': // SSE_REGS if SSE2 allowed if (!Subtarget->hasSSE2()) break; // FALL THROUGH. 
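These constraint cases back GCC-style inline assembly: 'q' must resolve to a byte-addressable GPR even in 32-bit mode (hence the GR8/Q_REGS classes above), which matters for setcc-style instructions. A hedged usage example (GCC/Clang inline-asm syntax; the function name is ours):

#include <cstdint>

// "=q" forces a register with a byte subregister (a/b/c/d in 32-bit
// mode), so the setc below is always encodable.
static bool addOverflows(uint32_t a, uint32_t b) {
  bool cf;
  asm("addl %2, %1\n\t"
      "setc %0"
      : "=q"(cf), "+r"(a)
      : "r"(b)
      : "cc");
  return cf;
}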
@@ -15713,10 +16020,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // Scalar SSE types. case MVT::f32: case MVT::i32: - return std::make_pair(0U, X86::FR32RegisterClass); + return std::make_pair(0U, &X86::FR32RegClass); case MVT::f64: case MVT::i64: - return std::make_pair(0U, X86::FR64RegisterClass); + return std::make_pair(0U, &X86::FR64RegClass); // Vector types. case MVT::v16i8: case MVT::v8i16: @@ -15724,7 +16031,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: - return std::make_pair(0U, X86::VR128RegisterClass); + return std::make_pair(0U, &X86::VR128RegClass); // AVX types. case MVT::v32i8: case MVT::v16i16: @@ -15732,8 +16039,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: - return std::make_pair(0U, X86::VR256RegisterClass); - + return std::make_pair(0U, &X86::VR256RegClass); } break; } @@ -15756,28 +16062,28 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, Constraint[6] == '}') { Res.first = X86::ST0+Constraint[4]-'0'; - Res.second = X86::RFP80RegisterClass; + Res.second = &X86::RFP80RegClass; return Res; } // GCC allows "st(0)" to be called just plain "st". if (StringRef("{st}").equals_lower(Constraint)) { Res.first = X86::ST0; - Res.second = X86::RFP80RegisterClass; + Res.second = &X86::RFP80RegClass; return Res; } // flags -> EFLAGS if (StringRef("{flags}").equals_lower(Constraint)) { Res.first = X86::EFLAGS; - Res.second = X86::CCRRegisterClass; + Res.second = &X86::CCRRegClass; return Res; } // 'A' means EAX + EDX. if (Constraint == "A") { Res.first = X86::EAX; - Res.second = X86::GR32_ADRegisterClass; + Res.second = &X86::GR32_ADRegClass; return Res; } return Res; @@ -15793,7 +16099,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we // really want an 8-bit or 32-bit register, map to the appropriate register // class and return the appropriate register. - if (Res.second == X86::GR16RegisterClass) { + if (Res.second == &X86::GR16RegClass) { if (VT == MVT::i8) { unsigned DestReg = 0; switch (Res.first) { @@ -15805,7 +16111,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, } if (DestReg) { Res.first = DestReg; - Res.second = X86::GR8RegisterClass; + Res.second = &X86::GR8RegClass; } } else if (VT == MVT::i32) { unsigned DestReg = 0; @@ -15822,7 +16128,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, } if (DestReg) { Res.first = DestReg; - Res.second = X86::GR32RegisterClass; + Res.second = &X86::GR32RegClass; } } else if (VT == MVT::i64) { unsigned DestReg = 0; @@ -15839,22 +16145,25 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, } if (DestReg) { Res.first = DestReg; - Res.second = X86::GR64RegisterClass; + Res.second = &X86::GR64RegClass; } } - } else if (Res.second == X86::FR32RegisterClass || - Res.second == X86::FR64RegisterClass || - Res.second == X86::VR128RegisterClass) { + } else if (Res.second == &X86::FR32RegClass || + Res.second == &X86::FR64RegClass || + Res.second == &X86::VR128RegClass) { // Handle references to XMM physical registers that got mapped into the // wrong class. 
This can happen with constraints like {xmm0} where the // target independent register mapper will just pick the first match it can // find, ignoring the required type. - if (VT == MVT::f32) - Res.second = X86::FR32RegisterClass; - else if (VT == MVT::f64) - Res.second = X86::FR64RegisterClass; - else if (X86::VR128RegisterClass->hasType(VT)) - Res.second = X86::VR128RegisterClass; + + if (VT == MVT::f32 || VT == MVT::i32) + Res.second = &X86::FR32RegClass; + else if (VT == MVT::f64 || VT == MVT::i64) + Res.second = &X86::FR64RegClass; + else if (X86::VR128RegClass.hasType(VT)) + Res.second = &X86::VR128RegClass; + else if (X86::VR256RegClass.hasType(VT)) + Res.second = &X86::VR256RegClass; } return Res; diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 09116e8..78e4d75 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -207,6 +207,10 @@ namespace llvm { // TLSADDR - Thread Local Storage. TLSADDR, + // TLSBASEADDR - Thread Local Storage. A call to get the start address + // of the TLS block for the current module. + TLSBASEADDR, + // TLSCALL - Thread Local Storage. When calling to an OS provided // thunk at the address from an earlier relocation. TLSCALL, @@ -242,9 +246,6 @@ namespace llvm { // PCMP* - Vector integer comparisons. PCMPEQ, PCMPGT, - // VPCOM, VPCOMU - XOP Vector integer comparisons. - VPCOM, VPCOMU, - // ADD, SUB, SMUL, etc. - Arithmetic operations with FLAGS results. ADD, SUB, ADC, SBB, SMUL, INC, DEC, OR, XOR, AND, @@ -315,6 +316,15 @@ namespace llvm { SFENCE, LFENCE, + // FNSTSW16r - Store FP status word into i16 register. + FNSTSW16r, + + // SAHF - Store contents of %ah into %eflags. + SAHF, + + // RDRAND - Get a random integer and indicate whether it is valid in CF. + RDRAND, + // ATOMADD64_DAG, ATOMSUB64_DAG, ATOMOR64_DAG, ATOMAND64_DAG, // ATOMXOR64_DAG, ATOMNAND64_DAG, ATOMSWAP64_DAG - // Atomic 64-bit binary operations. @@ -558,6 +568,18 @@ namespace llvm { /// by AM is legal for this target, for a load/store of the specified type. virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const; + /// isLegalICmpImmediate - Return true if the specified immediate is legal + /// icmp immediate, that is the target has icmp instructions which can + /// compare a register against the immediate without having to materialize + /// the immediate into a register. + virtual bool isLegalICmpImmediate(int64_t Imm) const; + + /// isLegalAddImmediate - Return true if the specified immediate is legal + /// add immediate, that is the target has add instructions which can + /// add a register and the immediate without having to materialize + /// the immediate into a register. + virtual bool isLegalAddImmediate(int64_t Imm) const; + /// isTruncateFree - Return true if it's free to truncate a value of /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in /// register EAX to i16 by referencing its sub-register AX. 
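With the re-typing above, an operand constrained to an SSE register is now classed by its value type, including the new VR256 case for 256-bit values, instead of being left in a mismatched 128-bit class. For example (assumes AVX and GCC/Clang inline asm; a sketch, not LLVM's own test):

#include <immintrin.h>

// A 256-bit operand in an "x" constraint should map to VR256 (a ymm
// register) rather than VR128 once the fix above is in place.
static __m256 sqrtYmm(__m256 v) {
  __m256 r;
  asm("vsqrtps %1, %0" : "=x"(r) : "x"(v));
  return r;
}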
@@ -761,6 +783,7 @@ namespace llvm { SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const; @@ -797,12 +820,7 @@ namespace llvm { DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; virtual SDValue - LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, - bool isVarArg, bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; virtual SDValue @@ -822,9 +840,9 @@ namespace llvm { virtual bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, - bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const; + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const; void ReplaceATOMIC_BINARY_64(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG, unsigned NewOp) const; @@ -909,6 +927,9 @@ namespace llvm { /// equivalent, for use with the given x86 condition code. SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, SelectionDAG &DAG) const; + + /// Convert a comparison if required by the subtarget. + SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const; }; namespace X86 { diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index 0eee083..b6ba68f 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -1143,7 +1143,9 @@ let Uses = [EFLAGS] in { 0, 0>; } +let isCompare = 1 in { defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>; +} //===----------------------------------------------------------------------===// @@ -1154,7 +1156,7 @@ defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>; def X86testpat : PatFrag<(ops node:$lhs, node:$rhs), (X86cmp (and_su node:$lhs, node:$rhs), 0)>; -let Defs = [EFLAGS] in { +let isCompare = 1, Defs = [EFLAGS] in { let isCommutable = 1 in { def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , X86testpat, MRMSrcReg>; def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat, MRMSrcReg>; diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h index fa1d676..aaef4a4 100644 --- a/lib/Target/X86/X86InstrBuilder.h +++ b/lib/Target/X86/X86InstrBuilder.h @@ -55,11 +55,11 @@ struct X86AddressMode { : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(0), GVOpFlags(0) { Base.Reg = 0; } - - + + void getFullAddress(SmallVectorImpl<MachineOperand> &MO) { assert(Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8); - + if (BaseType == X86AddressMode::RegBase) MO.push_back(MachineOperand::CreateReg(Base.Reg, false, false, false, false, false, 0, false)); @@ -67,16 +67,16 @@ struct X86AddressMode { assert(BaseType == X86AddressMode::FrameIndexBase); MO.push_back(MachineOperand::CreateFI(Base.FrameIndex)); } - + MO.push_back(MachineOperand::CreateImm(Scale)); MO.push_back(MachineOperand::CreateReg(IndexReg, false, false, false, false, false, 0, false)); - + if (GV) 
MO.push_back(MachineOperand::CreateGA(GV, Disp, GVOpFlags)); else MO.push_back(MachineOperand::CreateImm(Disp)); - + MO.push_back(MachineOperand::CreateReg(0, false, false, false, false, false, 0, false)); } @@ -122,7 +122,7 @@ static inline const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM) { assert(AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8); - + if (AM.BaseType == X86AddressMode::RegBase) MIB.addReg(AM.Base.Reg); else { @@ -135,7 +135,7 @@ addFullAddress(const MachineInstrBuilder &MIB, MIB.addGlobalAddress(AM.GV, AM.Disp, AM.GVOpFlags); else MIB.addImm(AM.Disp); - + return MIB.addReg(0); } diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 6f9e849..99c2b8f 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -375,11 +375,16 @@ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], - Uses = [ESP] in + Uses = [ESP] in { def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), "# TLS_addr32", [(X86tlsaddr tls32addr:$sym)]>, Requires<[In32BitMode]>; +def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), + "# TLS_base_addr32", + [(X86tlsbaseaddr tls32baseaddr:$sym)]>, + Requires<[In32BitMode]>; +} // All calls clobber the non-callee saved registers. RSP is marked as // a use to prevent stack-pointer assignments that appear immediately @@ -389,11 +394,16 @@ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], - Uses = [RSP] in + Uses = [RSP] in { def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), "# TLS_addr64", [(X86tlsaddr tls64addr:$sym)]>, Requires<[In64BitMode]>; +def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), + "# TLS_base_addr64", + [(X86tlsbaseaddr tls64baseaddr:$sym)]>, + Requires<[In64BitMode]>; +} // Darwin TLS Support // For i386, the address of the thunk is passed on the stack, on return the @@ -1008,8 +1018,8 @@ def : Pat<(X86call (i64 texternalsym:$dst)), (CALL64pcrel32 texternalsym:$dst)>; // tailcall stuff -def : Pat<(X86tcret GR32_TC:$dst, imm:$off), - (TCRETURNri GR32_TC:$dst, imm:$off)>, +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>, Requires<[In32BitMode]>; // FIXME: This is disabled for 32-bit PIC mode because the global base @@ -1623,6 +1633,12 @@ def : Pat<(sub GR16:$src1, i16immSExt8:$src2), def : Pat<(sub GR32:$src1, i32immSExt8:$src2), (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>; +// sub 0, reg +def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r GR8 :$src)>; +def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>; +def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>; +def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>; + // mul reg, reg def : Pat<(mul GR16:$src1, GR16:$src2), (IMUL16rr GR16:$src1, GR16:$src2)>; diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td index bf11fde..b0c27c8 100644 --- a/lib/Target/X86/X86InstrControl.td +++ b/lib/Target/X86/X86InstrControl.td @@ -18,16 +18,16 @@ // Return instructions. 
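The X86InstrBuilder.h cleanup above touches X86AddressMode::getFullAddress(), which always pushes the full five-operand x86 memory reference: base, scale, index, displacement, and segment, denoting base + scale * index + disp. The arithmetic, as a small sketch (hypothetical function, segment omitted):

#include <cstdint>

// The effective address the five pushed MachineOperands describe.
static uint64_t effectiveAddress(uint64_t base, unsigned scale,
                                 uint64_t index, int32_t disp) {
  // Scale is restricted to 1, 2, 4, or 8, as the asserts above enforce.
  return base + uint64_t(scale) * index + uint64_t(int64_t(disp));
}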
let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1, FPForm = SpecialFP in { - def RET : I <0xC3, RawFrm, (outs), (ins variable_ops), + def RET : I <0xC3, RawFrm, (outs), (ins), "ret", [(X86retflag 0)], IIC_RET>; - def RETW : I <0xC3, RawFrm, (outs), (ins variable_ops), + def RETW : I <0xC3, RawFrm, (outs), (ins), "ret{w}", [], IIC_RET>, OpSize; - def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops), + def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt), "ret\t$amt", [(X86retflag timm:$amt)], IIC_RET_IMM>; - def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops), + def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt), "ret{w}\t$amt", [], IIC_RET_IMM>, OpSize; def LRETL : I <0xCB, RawFrm, (outs), (ins), @@ -148,12 +148,12 @@ let isCall = 1 in // registers are added manually. let Uses = [ESP] in { def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm, - (outs), (ins i32imm_pcrel:$dst,variable_ops), + (outs), (ins i32imm_pcrel:$dst), "call{l}\t$dst", [], IIC_CALL_RI>, Requires<[In32BitMode]>; - def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst, variable_ops), + def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst), "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>, Requires<[In32BitMode]>; - def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst, variable_ops), + def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst), "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))], IIC_CALL_MEM>, Requires<[In32BitMode]>; @@ -174,7 +174,7 @@ let isCall = 1 in // callw for 16 bit code for the assembler. let isAsmParserOnly = 1 in def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm, - (outs), (ins i16imm_pcrel:$dst, variable_ops), + (outs), (ins i16imm_pcrel:$dst), "callw\t$dst", []>, OpSize; } @@ -185,23 +185,23 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, isCodeGenOnly = 1 in let Uses = [ESP] in { def TCRETURNdi : PseudoI<(outs), - (ins i32imm_pcrel:$dst, i32imm:$offset, variable_ops), []>; + (ins i32imm_pcrel:$dst, i32imm:$offset), []>; def TCRETURNri : PseudoI<(outs), - (ins GR32_TC:$dst, i32imm:$offset, variable_ops), []>; + (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>; let mayLoad = 1 in def TCRETURNmi : PseudoI<(outs), - (ins i32mem_TC:$dst, i32imm:$offset, variable_ops), []>; + (ins i32mem_TC:$dst, i32imm:$offset), []>; // FIXME: The should be pseudo instructions that are lowered when going to // mcinst. def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs), - (ins i32imm_pcrel:$dst, variable_ops), + (ins i32imm_pcrel:$dst), "jmp\t$dst # TAILCALL", [], IIC_JMP_REL>; - def TAILJMPr : I<0xFF, MRM4r, (outs), (ins GR32_TC:$dst, variable_ops), + def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), "", [], IIC_JMP_REG>; // FIXME: Remove encoding when JIT is dead. let mayLoad = 1 in - def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst, variable_ops), + def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst), "jmp{l}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>; } @@ -218,14 +218,14 @@ let isCall = 1, Uses = [RSP] in { // that the offset between an arbitrary immediate and the call will fit in // the 32-bit pcrel field that we have. 
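// In other words, the direct form below encodes a signed 32-bit
// displacement ("call rel32", opcode E8), which reaches +/-2GiB from the
// end of the instruction; that assumption holds under the small and kernel
// code models. Code models that cannot guarantee it emit an indirect call
// instead, for example:
//   movabsq $callee, %rax
//   callq   *%rax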
def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm, - (outs), (ins i64i32imm_pcrel:$dst, variable_ops), + (outs), (ins i64i32imm_pcrel:$dst), "call{q}\t$dst", [], IIC_CALL_RI>, Requires<[In64BitMode]>; - def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops), + def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst), "call{q}\t{*}$dst", [(X86call GR64:$dst)], IIC_CALL_RI>, Requires<[In64BitMode]>; - def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst, variable_ops), + def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst), "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))], IIC_CALL_MEM>, Requires<[In64BitMode]>; @@ -240,7 +240,7 @@ let isCall = 1, isCodeGenOnly = 1 in let Defs = [RAX, R10, R11, RSP, EFLAGS], Uses = [RSP] in { def W64ALLOCA : Ii32PCRel<0xE8, RawFrm, - (outs), (ins i64i32imm_pcrel:$dst, variable_ops), + (outs), (ins i64i32imm_pcrel:$dst), "call{q}\t$dst", [], IIC_CALL_RI>, Requires<[IsWin64]>; } @@ -250,21 +250,21 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, let Uses = [RSP], usesCustomInserter = 1 in { def TCRETURNdi64 : PseudoI<(outs), - (ins i64i32imm_pcrel:$dst, i32imm:$offset, variable_ops), + (ins i64i32imm_pcrel:$dst, i32imm:$offset), []>; def TCRETURNri64 : PseudoI<(outs), - (ins ptr_rc_tailcall:$dst, i32imm:$offset, variable_ops), []>; + (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>; let mayLoad = 1 in def TCRETURNmi64 : PseudoI<(outs), - (ins i64mem_TC:$dst, i32imm:$offset, variable_ops), []>; + (ins i64mem_TC:$dst, i32imm:$offset), []>; def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), - (ins i64i32imm_pcrel:$dst, variable_ops), + (ins i64i32imm_pcrel:$dst), "jmp\t$dst # TAILCALL", [], IIC_JMP_REL>; - def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst, variable_ops), + def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), "jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>; let mayLoad = 1 in - def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst, variable_ops), + def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst), "jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>; } diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index d57937b..8802a2e 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -15,83 +15,161 @@ // FMA3 - Intel 3 operand Fused Multiply-Add instructions //===----------------------------------------------------------------------===// +let Constraints = "$src1 = $dst" in { multiclass fma3p_rm<bits<8> opc, string OpcodeStr> { +let neverHasSideEffects = 1 in { def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>; + let mayLoad = 1 in def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + (ins VR128:$src1, VR128:$src2, f128mem:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>; def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>; + let mayLoad = 1 in def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, f256mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, 
$dst|$dst, $src1, $src2}"), + (ins VR256:$src1, VR256:$src2, f256mem:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>; +} // neverHasSideEffects = 1 +} + +// Intrinsic for 132 pattern +multiclass fma3p_rm_int<bits<8> opc, string OpcodeStr, + PatFrag MemFrag128, PatFrag MemFrag256, + Intrinsic Int128, Intrinsic Int256> { + def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src3, VR128:$src2))]>; + def m_Int : FMA3<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, + (Int128 VR128:$src1, (MemFrag128 addr:$src3), VR128:$src2))]>; + def rY_Int : FMA3<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src3, VR256:$src2))]>; + def mY_Int : FMA3<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, f256mem:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR256:$dst, + (Int256 VR256:$src1, (MemFrag256 addr:$src3), VR256:$src2))]>; } +} // Constraints = "$src1 = $dst" multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, - string OpcodeStr, string PackTy> { - defm r132 : fma3p_rm<opc132, !strconcat(OpcodeStr, !strconcat("132", PackTy))>; - defm r213 : fma3p_rm<opc213, !strconcat(OpcodeStr, !strconcat("213", PackTy))>; - defm r231 : fma3p_rm<opc231, !strconcat(OpcodeStr, !strconcat("231", PackTy))>; + string OpcodeStr, string PackTy, + PatFrag MemFrag128, PatFrag MemFrag256, + Intrinsic Int128, Intrinsic Int256> { + defm r132 : fma3p_rm_int <opc132, !strconcat(OpcodeStr, + !strconcat("132", PackTy)), MemFrag128, MemFrag256, + Int128, Int256>; + defm r132 : fma3p_rm <opc132, !strconcat(OpcodeStr, !strconcat("132", PackTy))>; + defm r213 : fma3p_rm <opc213, !strconcat(OpcodeStr, !strconcat("213", PackTy))>; + defm r231 : fma3p_rm <opc231, !strconcat(OpcodeStr, !strconcat("231", PackTy))>; } // Fused Multiply-Add let ExeDomain = SSEPackedSingle in { - defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps">; - defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps">; - defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps">; - defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps">; + defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", memopv4f32, + memopv8f32, int_x86_fma_vfmadd_ps, int_x86_fma_vfmadd_ps_256>; + defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", memopv4f32, + memopv8f32, int_x86_fma_vfmsub_ps, int_x86_fma_vfmsub_ps_256>; + defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", + memopv4f32, memopv8f32, int_x86_fma_vfmaddsub_ps, + int_x86_fma_vfmaddsub_ps_256>; + defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps", + memopv4f32, memopv8f32, int_x86_fma_vfmsubadd_ps, + int_x86_fma_vfmaddsub_ps_256>; } let ExeDomain = SSEPackedDouble in { - defm VFMADDPD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd">, VEX_W; - defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd">, VEX_W; - defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd">, VEX_W; - defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd">, VEX_W; + defm VFMADDPD : fma3p_forms<0x98, 0xA8, 
0xB8, "vfmadd", "pd", memopv2f64, + memopv4f64, int_x86_fma_vfmadd_pd, int_x86_fma_vfmadd_pd_256>, VEX_W; + defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", memopv2f64, + memopv4f64, int_x86_fma_vfmsub_pd, int_x86_fma_vfmsub_pd_256>, VEX_W; + defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", memopv2f64, + memopv4f64, int_x86_fma_vfmaddsub_pd, int_x86_fma_vfmaddsub_pd_256>, VEX_W; + defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd", memopv2f64, + memopv4f64, int_x86_fma_vfmsubadd_pd, int_x86_fma_vfmsubadd_pd_256>, VEX_W; } // Fused Negative Multiply-Add let ExeDomain = SSEPackedSingle in { - defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps">; - defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps">; + defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", memopv4f32, + memopv8f32, int_x86_fma_vfnmadd_ps, int_x86_fma_vfnmadd_ps_256>; + defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", memopv4f32, + memopv8f32, int_x86_fma_vfnmsub_ps, int_x86_fma_vfnmsub_ps_256>; } let ExeDomain = SSEPackedDouble in { - defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd">, VEX_W; - defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd">, VEX_W; + defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", memopv2f64, + memopv4f64, int_x86_fma_vfnmadd_pd, int_x86_fma_vfnmadd_pd_256>, VEX_W; + defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", memopv2f64, + memopv4f64, int_x86_fma_vfnmsub_pd, int_x86_fma_vfnmsub_pd_256>, VEX_W; } -multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop> { - def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + +let Constraints = "$src1 = $dst" in { +multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop, + RegisterClass RC> { +let neverHasSideEffects = 1 in { + def r : FMA3<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>; - def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, x86memop:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + let mayLoad = 1 in + def m : FMA3<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>; +} // neverHasSideEffects = 1 } +multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, Operand memop, + ComplexPattern mem_cpat, Intrinsic IntId> { + def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, (IntId VR128:$src1, VR128:$src3, VR128:$src2))]>; + def m_Int : FMA3<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, memop:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, + (IntId VR128:$src1, mem_cpat:$src3, VR128:$src2))]>; +} +} // Constraints = "$src1 = $dst" + multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, - string OpcodeStr> { - defm SSr132 : fma3s_rm<opc132, !strconcat(OpcodeStr, "132ss"), f32mem>; - defm SSr213 : fma3s_rm<opc213, !strconcat(OpcodeStr, "213ss"), f32mem>; - defm SSr231 : fma3s_rm<opc231, !strconcat(OpcodeStr, "231ss"), f32mem>; - defm SDr132 : fma3s_rm<opc132, !strconcat(OpcodeStr, "132sd"), f64mem>, VEX_W; - defm 
SDr213 : fma3s_rm<opc213, !strconcat(OpcodeStr, "213sd"), f64mem>, VEX_W; - defm SDr231 : fma3s_rm<opc231, !strconcat(OpcodeStr, "231sd"), f64mem>, VEX_W; + string OpStr, Intrinsic IntF32, Intrinsic IntF64> { + defm SSr132 : fma3s_rm<opc132, !strconcat(OpStr, "132ss"), f32mem, FR32>; + defm SSr213 : fma3s_rm<opc213, !strconcat(OpStr, "213ss"), f32mem, FR32>; + defm SSr231 : fma3s_rm<opc231, !strconcat(OpStr, "231ss"), f32mem, FR32>; + defm SDr132 : fma3s_rm<opc132, !strconcat(OpStr, "132sd"), f64mem, FR64>, VEX_W; + defm SDr213 : fma3s_rm<opc213, !strconcat(OpStr, "213sd"), f64mem, FR64>, VEX_W; + defm SDr231 : fma3s_rm<opc231, !strconcat(OpStr, "231sd"), f64mem, FR64>, VEX_W; + defm SSr132 : fma3s_rm_int <opc132, !strconcat(OpStr, "132ss"), ssmem, + sse_load_f32, IntF32>; + defm SDr132 : fma3s_rm_int <opc132, !strconcat(OpStr, "132sd"), sdmem, + sse_load_f64, IntF64>; } -defm VFMADD : fma3s_forms<0x99, 0xA9, 0xB9, "vfmadd">, VEX_LIG; -defm VFMSUB : fma3s_forms<0x9B, 0xAB, 0xBB, "vfmsub">, VEX_LIG; +defm VFMADD : fma3s_forms<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss, + int_x86_fma_vfmadd_sd>, VEX_LIG; +defm VFMSUB : fma3s_forms<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss, + int_x86_fma_vfmsub_sd>, VEX_LIG; + +defm VFNMADD : fma3s_forms<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss, + int_x86_fma_vfnmadd_sd>, VEX_LIG; +defm VFNMSUB : fma3s_forms<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss, + int_x86_fma_vfnmsub_sd>, VEX_LIG; -defm VFNMADD : fma3s_forms<0x9D, 0xAD, 0xBD, "vfnmadd">, VEX_LIG; -defm VFNMSUB : fma3s_forms<0x9F, 0xAF, 0xBF, "vfnmsub">, VEX_LIG; //===----------------------------------------------------------------------===// // FMA4 - AMD 4 operand Fused Multiply-Add instructions @@ -178,43 +256,47 @@ let isCodeGenOnly = 1 in { } // isCodeGenOnly = 1 } +let Predicates = [HasFMA4] in { + defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", ssmem, sse_load_f32, - int_x86_fma4_vfmadd_ss>; + int_x86_fma_vfmadd_ss>; defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", sdmem, sse_load_f64, - int_x86_fma4_vfmadd_sd>; -defm VFMADDPS4 : fma4p<0x68, "vfmaddps", int_x86_fma4_vfmadd_ps, - int_x86_fma4_vfmadd_ps_256, memopv4f32, memopv8f32>; -defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", int_x86_fma4_vfmadd_pd, - int_x86_fma4_vfmadd_pd_256, memopv2f64, memopv4f64>; + int_x86_fma_vfmadd_sd>; +defm VFMADDPS4 : fma4p<0x68, "vfmaddps", int_x86_fma_vfmadd_ps, + int_x86_fma_vfmadd_ps_256, memopv4f32, memopv8f32>; +defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", int_x86_fma_vfmadd_pd, + int_x86_fma_vfmadd_pd_256, memopv2f64, memopv4f64>; defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", ssmem, sse_load_f32, - int_x86_fma4_vfmsub_ss>; + int_x86_fma_vfmsub_ss>; defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", sdmem, sse_load_f64, - int_x86_fma4_vfmsub_sd>; -defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", int_x86_fma4_vfmsub_ps, - int_x86_fma4_vfmsub_ps_256, memopv4f32, memopv8f32>; -defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", int_x86_fma4_vfmsub_pd, - int_x86_fma4_vfmsub_pd_256, memopv2f64, memopv4f64>; + int_x86_fma_vfmsub_sd>; +defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", int_x86_fma_vfmsub_ps, + int_x86_fma_vfmsub_ps_256, memopv4f32, memopv8f32>; +defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", int_x86_fma_vfmsub_pd, + int_x86_fma_vfmsub_pd_256, memopv2f64, memopv4f64>; defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", ssmem, sse_load_f32, - int_x86_fma4_vfnmadd_ss>; + int_x86_fma_vfnmadd_ss>; defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", sdmem, sse_load_f64, - int_x86_fma4_vfnmadd_sd>; -defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", 
int_x86_fma4_vfnmadd_ps, - int_x86_fma4_vfnmadd_ps_256, memopv4f32, memopv8f32>; -defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", int_x86_fma4_vfnmadd_pd, - int_x86_fma4_vfnmadd_pd_256, memopv2f64, memopv4f64>; + int_x86_fma_vfnmadd_sd>; +defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", int_x86_fma_vfnmadd_ps, + int_x86_fma_vfnmadd_ps_256, memopv4f32, memopv8f32>; +defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", int_x86_fma_vfnmadd_pd, + int_x86_fma_vfnmadd_pd_256, memopv2f64, memopv4f64>; defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", ssmem, sse_load_f32, - int_x86_fma4_vfnmsub_ss>; + int_x86_fma_vfnmsub_ss>; defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", sdmem, sse_load_f64, - int_x86_fma4_vfnmsub_sd>; -defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", int_x86_fma4_vfnmsub_ps, - int_x86_fma4_vfnmsub_ps_256, memopv4f32, memopv8f32>; -defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", int_x86_fma4_vfnmsub_pd, - int_x86_fma4_vfnmsub_pd_256, memopv2f64, memopv4f64>; -defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", int_x86_fma4_vfmaddsub_ps, - int_x86_fma4_vfmaddsub_ps_256, memopv4f32, memopv8f32>; -defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", int_x86_fma4_vfmaddsub_pd, - int_x86_fma4_vfmaddsub_pd_256, memopv2f64, memopv4f64>; -defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", int_x86_fma4_vfmsubadd_ps, - int_x86_fma4_vfmsubadd_ps_256, memopv4f32, memopv8f32>; -defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", int_x86_fma4_vfmsubadd_pd, - int_x86_fma4_vfmsubadd_pd_256, memopv2f64, memopv4f64>; + int_x86_fma_vfnmsub_sd>; +defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", int_x86_fma_vfnmsub_ps, + int_x86_fma_vfnmsub_ps_256, memopv4f32, memopv8f32>; +defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", int_x86_fma_vfnmsub_pd, + int_x86_fma_vfnmsub_pd_256, memopv2f64, memopv4f64>; +defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", int_x86_fma_vfmaddsub_ps, + int_x86_fma_vfmaddsub_ps_256, memopv4f32, memopv8f32>; +defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", int_x86_fma_vfmaddsub_pd, + int_x86_fma_vfmaddsub_pd_256, memopv2f64, memopv4f64>; +defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", int_x86_fma_vfmsubadd_ps, + int_x86_fma_vfmsubadd_ps_256, memopv4f32, memopv8f32>; +defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", int_x86_fma_vfmsubadd_pd, + int_x86_fma_vfmsubadd_pd_256, memopv2f64, memopv4f64>; +} // HasFMA4 + diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td index a13887e..568726e 100644 --- a/lib/Target/X86/X86InstrFPStack.td +++ b/lib/Target/X86/X86InstrFPStack.td @@ -27,6 +27,7 @@ def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>, SDTCisVT<2, OtherVT>]>; def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>]>; +def SDTX86Fnstsw : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i16>]>; def SDTX86FpToIMem : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>; def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; @@ -41,6 +42,7 @@ def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild, def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild, [SDNPHasChain, SDNPOutGlue, SDNPMayLoad, SDNPMemOperand]>; +def X86fp_stsw : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>; def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem, @@ -203,6 +205,7 @@ def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), } } +let Defs = [FPSW] in { defm ADD : FPBinary_rr<fadd>; defm SUB : FPBinary_rr<fsub>; defm MUL : FPBinary_rr<fmul>; @@ -213,6 +216,7 @@ defm SUBR: FPBinary<fsub 
,MRM5m, "subr">; defm MUL : FPBinary<fmul, MRM1m, "mul">; defm DIV : FPBinary<fdiv, MRM6m, "div">; defm DIVR: FPBinary<fdiv, MRM7m, "divr">; +} class FPST0rInst<bits<8> o, string asm> : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, D8; @@ -257,6 +261,7 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW, def _F : FPI<opcode, RawFrm, (outs), (ins), asmstring>, D9; } +let Defs = [FPSW] in { defm CHS : FPUnary<fneg, 0xE0, "fchs">; defm ABS : FPUnary<fabs, 0xE1, "fabs">; defm SQRT: FPUnary<fsqrt,0xFA, "fsqrt">; @@ -269,6 +274,7 @@ def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>; def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>; } def TST_F : FPI<0xE4, RawFrm, (outs), (ins), "ftst">, D9; +} // Defs = [FPSW] // Versions of FP instructions that take a single memory operand. Added for the // disassembler; remove as they are included with patterns elsewhere. @@ -316,6 +322,7 @@ multiclass FPCMov<PatLeaf cc> { Requires<[HasCMov]>; } +let Defs = [FPSW] in { let Uses = [EFLAGS], Constraints = "$src1 = $dst" in { defm CMOVB : FPCMov<X86_COND_B>; defm CMOVBE : FPCMov<X86_COND_BE>; @@ -416,24 +423,40 @@ def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>; } let mayLoad = 1 in { -def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src">; -def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src">; -def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src">; -def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src">; -def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src">; -def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src">; +def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src", + IIC_FLD>; +def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src", + IIC_FLD>; +def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src", + IIC_FLD80>; +def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src", + IIC_FILD>; +def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src", + IIC_FILD>; +def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src", + IIC_FILD>; } let mayStore = 1 in { -def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst">; -def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst">; -def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst">; -def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst">; -def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst">; -def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst">; -def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst">; -def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst">; -def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst">; -def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst">; +def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst", + IIC_FST>; +def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst", + IIC_FST>; +def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst", + IIC_FST>; +def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst", + IIC_FST>; +def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst", + IIC_FST80>; +def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), 
"fist{s}\t$dst", + IIC_FIST>; +def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst", + IIC_FIST>; +def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst", + IIC_FIST>; +def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst", + IIC_FIST>; +def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst", + IIC_FIST>; } // FISTTP requires SSE3 even though it's a FPStack op. @@ -459,17 +482,23 @@ def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, } // Predicates = [HasSSE3] let mayStore = 1 in { -def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst">; -def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst">; +def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst", + IIC_FST>; +def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst", + IIC_FST>; def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), - "fisttp{ll}\t$dst">; + "fisttp{ll}\t$dst", IIC_FST>; } // FP Stack manipulation instructions. -def LD_Frr : FPI<0xC0, AddRegFrm, (outs), (ins RST:$op), "fld\t$op">, D9; -def ST_Frr : FPI<0xD0, AddRegFrm, (outs), (ins RST:$op), "fst\t$op">, DD; -def ST_FPrr : FPI<0xD8, AddRegFrm, (outs), (ins RST:$op), "fstp\t$op">, DD; -def XCH_F : FPI<0xC8, AddRegFrm, (outs), (ins RST:$op), "fxch\t$op">, D9; +def LD_Frr : FPI<0xC0, AddRegFrm, (outs), (ins RST:$op), "fld\t$op", + IIC_FLD>, D9; +def ST_Frr : FPI<0xD0, AddRegFrm, (outs), (ins RST:$op), "fst\t$op", + IIC_FST>, DD; +def ST_FPrr : FPI<0xD8, AddRegFrm, (outs), (ins RST:$op), "fstp\t$op", + IIC_FST>, DD; +def XCH_F : FPI<0xC8, AddRegFrm, (outs), (ins RST:$op), "fxch\t$op", + IIC_FXCH>, D9; // Floating point constant loads. let isReMaterializable = 1 in { @@ -487,20 +516,21 @@ def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP, [(set RFP80:$dst, fpimm1)]>; } -def LD_F0 : FPI<0xEE, RawFrm, (outs), (ins), "fldz">, D9; -def LD_F1 : FPI<0xE8, RawFrm, (outs), (ins), "fld1">, D9; +def LD_F0 : FPI<0xEE, RawFrm, (outs), (ins), "fldz", IIC_FLDZ>, D9; +def LD_F1 : FPI<0xE8, RawFrm, (outs), (ins), "fld1", IIC_FIST>, D9; // Floating point compares. 
-let Defs = [EFLAGS] in { def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, - []>; // FPSW = cmp ST(0) with ST(i) + [(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>; def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, - []>; // FPSW = cmp ST(0) with ST(i) + [(set FPSW, (trunc (X86cmp RFP64:$lhs, RFP64:$rhs)))]>; def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, - []>; // FPSW = cmp ST(0) with ST(i) - + [(set FPSW, (trunc (X86cmp RFP80:$lhs, RFP80:$rhs)))]>; +} // Defs = [FPSW] + // CC = ST(0) cmp ST(i) +let Defs = [EFLAGS, FPSW] in { def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>; def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, @@ -509,85 +539,94 @@ def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>; } -let Defs = [EFLAGS], Uses = [ST0] in { +let Defs = [FPSW], Uses = [ST0] in { def UCOM_Fr : FPI<0xE0, AddRegFrm, // FPSW = cmp ST(0) with ST(i) (outs), (ins RST:$reg), - "fucom\t$reg">, DD; + "fucom\t$reg", IIC_FUCOM>, DD; def UCOM_FPr : FPI<0xE8, AddRegFrm, // FPSW = cmp ST(0) with ST(i), pop (outs), (ins RST:$reg), - "fucomp\t$reg">, DD; + "fucomp\t$reg", IIC_FUCOM>, DD; def UCOM_FPPr : FPI<0xE9, RawFrm, // cmp ST(0) with ST(1), pop, pop (outs), (ins), - "fucompp">, DA; + "fucompp", IIC_FUCOM>, DA; +} +let Defs = [EFLAGS, FPSW], Uses = [ST0] in { def UCOM_FIr : FPI<0xE8, AddRegFrm, // CC = cmp ST(0) with ST(i) (outs), (ins RST:$reg), - "fucomi\t$reg">, DB; + "fucomi\t$reg", IIC_FUCOMI>, DB; def UCOM_FIPr : FPI<0xE8, AddRegFrm, // CC = cmp ST(0) with ST(i), pop (outs), (ins RST:$reg), - "fucompi\t$reg">, DF; + "fucompi\t$reg", IIC_FUCOMI>, DF; } +let Defs = [EFLAGS, FPSW] in { def COM_FIr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg), - "fcomi\t$reg">, DB; + "fcomi\t$reg", IIC_FCOMI>, DB; def COM_FIPr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg), - "fcompi\t$reg">, DF; + "fcompi\t$reg", IIC_FCOMI>, DF; +} // Floating point flag ops. -let Defs = [AX] in -def FNSTSW8r : I<0xE0, RawFrm, // AX = fp flags - (outs), (ins), "fnstsw %ax", []>, DF; +let Defs = [AX], Uses = [FPSW] in +def FNSTSW16r : I<0xE0, RawFrm, // AX = fp flags + (outs), (ins), "fnstsw %ax", + [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>, DF; def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control world (outs), (ins i16mem:$dst), "fnstcw\t$dst", - [(X86fp_cwd_get16 addr:$dst)]>; + [(X86fp_cwd_get16 addr:$dst)], IIC_FNSTCW>; let mayLoad = 1 in def FLDCW16m : I<0xD9, MRM5m, // X87 control world = [mem16] - (outs), (ins i16mem:$dst), "fldcw\t$dst", []>; + (outs), (ins i16mem:$dst), "fldcw\t$dst", [], IIC_FLDCW>; // FPU control instructions -def FNINIT : I<0xE3, RawFrm, (outs), (ins), "fninit", []>, DB; +let Defs = [FPSW] in +def FNINIT : I<0xE3, RawFrm, (outs), (ins), "fninit", [], IIC_FNINIT>, DB; def FFREE : FPI<0xC0, AddRegFrm, (outs), (ins RST:$reg), - "ffree\t$reg">, DD; + "ffree\t$reg", IIC_FFREE>, DD; // Clear exceptions -def FNCLEX : I<0xE2, RawFrm, (outs), (ins), "fnclex", []>, DB; +let Defs = [FPSW] in +def FNCLEX : I<0xE2, RawFrm, (outs), (ins), "fnclex", [], IIC_FNCLEX>, DB; // Operandless floating-point instructions for the disassembler. 
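// (Two threads run through the x87 changes above and below. First, the
// compares now model their status-word write explicitly: the UCOM_Fp*
// pseudos set FPSW through an X86cmp pattern, and the X86fp_stsw node with
// its FNSTSW16r instruction copies FPSW into AX. That is presumably what
// lets subtargets lacking FCOMI select the classic sequence
//   fucom  %st(1)
//   fnstsw %ax
//   sahf            ; AH -> EFLAGS
// through ordinary patterns rather than custom lowering; compare
// ConvertCmpIfNecessary in X86ISelLowering.h earlier in this patch.
// Second, the sweep below attaches concrete InstrItinClass values such as
// IIC_WAIT and IIC_FNOP where IIC_DEFAULT was previously implied; the
// semantics are unchanged, the classes merely give subtarget itineraries
// (an in-order scheduling model, for instance) a hook for per-instruction
// latencies.)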
-def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", []>; - -def FNOP : I<0xD0, RawFrm, (outs), (ins), "fnop", []>, D9; -def FXAM : I<0xE5, RawFrm, (outs), (ins), "fxam", []>, D9; -def FLDL2T : I<0xE9, RawFrm, (outs), (ins), "fldl2t", []>, D9; -def FLDL2E : I<0xEA, RawFrm, (outs), (ins), "fldl2e", []>, D9; -def FLDPI : I<0xEB, RawFrm, (outs), (ins), "fldpi", []>, D9; -def FLDLG2 : I<0xEC, RawFrm, (outs), (ins), "fldlg2", []>, D9; -def FLDLN2 : I<0xED, RawFrm, (outs), (ins), "fldln2", []>, D9; -def F2XM1 : I<0xF0, RawFrm, (outs), (ins), "f2xm1", []>, D9; -def FYL2X : I<0xF1, RawFrm, (outs), (ins), "fyl2x", []>, D9; -def FPTAN : I<0xF2, RawFrm, (outs), (ins), "fptan", []>, D9; -def FPATAN : I<0xF3, RawFrm, (outs), (ins), "fpatan", []>, D9; -def FXTRACT : I<0xF4, RawFrm, (outs), (ins), "fxtract", []>, D9; -def FPREM1 : I<0xF5, RawFrm, (outs), (ins), "fprem1", []>, D9; -def FDECSTP : I<0xF6, RawFrm, (outs), (ins), "fdecstp", []>, D9; -def FINCSTP : I<0xF7, RawFrm, (outs), (ins), "fincstp", []>, D9; -def FPREM : I<0xF8, RawFrm, (outs), (ins), "fprem", []>, D9; -def FYL2XP1 : I<0xF9, RawFrm, (outs), (ins), "fyl2xp1", []>, D9; -def FSINCOS : I<0xFB, RawFrm, (outs), (ins), "fsincos", []>, D9; -def FRNDINT : I<0xFC, RawFrm, (outs), (ins), "frndint", []>, D9; -def FSCALE : I<0xFD, RawFrm, (outs), (ins), "fscale", []>, D9; -def FCOMPP : I<0xD9, RawFrm, (outs), (ins), "fcompp", []>, DE; +def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>; + +def FNOP : I<0xD0, RawFrm, (outs), (ins), "fnop", [], IIC_FNOP>, D9; +def FXAM : I<0xE5, RawFrm, (outs), (ins), "fxam", [], IIC_FXAM>, D9; +def FLDL2T : I<0xE9, RawFrm, (outs), (ins), "fldl2t", [], IIC_FLDL>, D9; +def FLDL2E : I<0xEA, RawFrm, (outs), (ins), "fldl2e", [], IIC_FLDL>, D9; +def FLDPI : I<0xEB, RawFrm, (outs), (ins), "fldpi", [], IIC_FLDL>, D9; +def FLDLG2 : I<0xEC, RawFrm, (outs), (ins), "fldlg2", [], IIC_FLDL>, D9; +def FLDLN2 : I<0xED, RawFrm, (outs), (ins), "fldln2", [], IIC_FLDL>, D9; +def F2XM1 : I<0xF0, RawFrm, (outs), (ins), "f2xm1", [], IIC_F2XM1>, D9; +def FYL2X : I<0xF1, RawFrm, (outs), (ins), "fyl2x", [], IIC_FYL2X>, D9; +def FPTAN : I<0xF2, RawFrm, (outs), (ins), "fptan", [], IIC_FPTAN>, D9; +def FPATAN : I<0xF3, RawFrm, (outs), (ins), "fpatan", [], IIC_FPATAN>, D9; +def FXTRACT : I<0xF4, RawFrm, (outs), (ins), "fxtract", [], IIC_FXTRACT>, D9; +def FPREM1 : I<0xF5, RawFrm, (outs), (ins), "fprem1", [], IIC_FPREM1>, D9; +def FDECSTP : I<0xF6, RawFrm, (outs), (ins), "fdecstp", [], IIC_FPSTP>, D9; +def FINCSTP : I<0xF7, RawFrm, (outs), (ins), "fincstp", [], IIC_FPSTP>, D9; +def FPREM : I<0xF8, RawFrm, (outs), (ins), "fprem", [], IIC_FPREM>, D9; +def FYL2XP1 : I<0xF9, RawFrm, (outs), (ins), "fyl2xp1", [], IIC_FYL2XP1>, D9; +def FSINCOS : I<0xFB, RawFrm, (outs), (ins), "fsincos", [], IIC_FSINCOS>, D9; +def FRNDINT : I<0xFC, RawFrm, (outs), (ins), "frndint", [], IIC_FRNDINT>, D9; +def FSCALE : I<0xFD, RawFrm, (outs), (ins), "fscale", [], IIC_FSCALE>, D9; +def FCOMPP : I<0xD9, RawFrm, (outs), (ins), "fcompp", [], IIC_FCOMPP>, DE; def FXSAVE : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins), - "fxsave\t$dst", []>, TB; + "fxsave\t$dst", [], IIC_FXSAVE>, TB; def FXSAVE64 : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins), - "fxsaveq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>; + "fxsaveq\t$dst", [], IIC_FXSAVE>, TB, REX_W, + Requires<[In64BitMode]>; def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), - "fxrstor\t$src", []>, TB; + "fxrstor\t$src", [], IIC_FXRSTOR>, TB; def FXRSTOR64 : I<0xAE, MRM1m, (outs), (ins 
opaque512mem:$src), - "fxrstorq\t$src", []>, TB, REX_W, Requires<[In64BitMode]>; + "fxrstorq\t$src", [], IIC_FXRSTOR>, TB, REX_W, + Requires<[In64BitMode]>; //===----------------------------------------------------------------------===// // Non-Instruction Patterns diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index b387090..a115ab4 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -255,8 +255,9 @@ class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, // FPStack Instruction Templates: // FPI - Floating Point Instruction template. -class FPI<bits<8> o, Format F, dag outs, dag ins, string asm> - : I<o, F, outs, ins, asm, []> {} +class FPI<bits<8> o, Format F, dag outs, dag ins, string asm, + InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, [], itin> {} // FpI_ - Floating Point Pseudo Instruction template. Not Predicated. class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern, @@ -365,6 +366,7 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm, // // SDI - SSE2 instructions with XD prefix. // SDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix. +// SSDI - SSE2 instructions with XS prefix. // SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix. // PDI - SSE2 instructions with TB and OpSize prefixes. // PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes. @@ -377,8 +379,11 @@ class SDI<bits<8> o, Format F, dag outs, dag ins, string asm, class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>; +class SSDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE2]>; class SSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>; class PDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> @@ -503,29 +508,29 @@ class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, - Requires<[HasSSE2, HasAES]>; + Requires<[HasAES]>; class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, - Requires<[HasSSE2, HasAES]>; + Requires<[HasAES]>; -// CLMUL Instruction Templates -class CLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, +// PCLMUL Instruction Templates +class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern, InstrItinClass itin = IIC_DEFAULT> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, - OpSize, Requires<[HasSSE2, HasCLMUL]>; + OpSize, Requires<[HasPCLMUL]>; -class AVXCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, +class AVXPCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern, InstrItinClass itin = IIC_DEFAULT> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, - OpSize, VEX_4V, Requires<[HasAVX, HasCLMUL]>; + OpSize, VEX_4V, Requires<[HasAVX, HasPCLMUL]>; // FMA3 Instruction Templates class FMA3<bits<8> 
o, Format F, dag outs, dag ins, string asm, list<dag>pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, asm, pattern, itin>, T8, - OpSize, VEX_4V, Requires<[HasFMA3]>; + OpSize, VEX_4V, Requires<[HasFMA]>; // FMA4 Instruction Templates class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm, diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 35801e4..ec030dd 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -71,9 +71,14 @@ def X86insrtps : SDNode<"X86ISD::INSERTPS", SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>; def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL", SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>; + +def X86vzmovly : SDNode<"X86ISD::VZEXT_MOVL", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisOpSmallerThanOp<1, 0> ]>>; + def X86vsmovl : SDNode<"X86ISD::VSEXT_MOVL", SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisInt<1>, SDTCisInt<0>]>>; - + def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>; @@ -102,13 +107,6 @@ def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; -def X86vpcom : SDNode<"X86ISD::VPCOM", - SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>>; -def X86vpcomu : SDNode<"X86ISD::VPCOMU", - SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>>; - def X86pmuludq : SDNode<"X86ISD::PMULUDQ", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1,2>]>>; @@ -304,7 +302,7 @@ def nontemporalstore : PatFrag<(ops node:$val, node:$ptr), }]>; def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), - (st node:$val, node:$ptr), [{ + (st node:$val, node:$ptr), [{ if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) return ST->isNonTemporal() && !ST->isTruncatingStore() && ST->getAddressingMode() == ISD::UNINDEXED && @@ -313,7 +311,7 @@ def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), }]>; def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), - (st node:$val, node:$ptr), [{ + (st node:$val, node:$ptr), [{ if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) return ST->isNonTemporal() && ST->getAlignment() < 16; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index b12c1db..69493bc 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/LLVMContext.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -54,38 +55,39 @@ ReMatPICStubLoad("remat-pic-stub-load", enum { // Select which memory operand is being unfolded. - // (stored in bits 0 - 7) + // (stored in bits 0 - 3) TB_INDEX_0 = 0, TB_INDEX_1 = 1, TB_INDEX_2 = 2, - TB_INDEX_MASK = 0xff, - - // Minimum alignment required for load/store. - // Used for RegOp->MemOp conversion. - // (stored in bits 8 - 15) - TB_ALIGN_SHIFT = 8, - TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT, - TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT, - TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT, - TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT, + TB_INDEX_3 = 3, + TB_INDEX_MASK = 0xf, // Do not insert the reverse map (MemOp -> RegOp) into the table. 
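// With the boolean flags repacked next to the index nibble and the
// alignment byte kept in bits 8-15, the whole descriptor now fits the
// uint16_t Flags field declared below. A minimal sketch of the decode,
// using only the masks defined in this enum:
//   unsigned Idx   = Flags & TB_INDEX_MASK;                     // bits 0-3
//   bool     NoRev = (Flags & TB_NO_REVERSE)  != 0;             // bit 4
//   bool     NoFwd = (Flags & TB_NO_FORWARD)  != 0;             // bit 5
//   bool     Load  = (Flags & TB_FOLDED_LOAD) != 0;             // bit 6
//   bool     Store = (Flags & TB_FOLDED_STORE) != 0;            // bit 7
//   unsigned Align = (Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT; // 0, 16, 32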
// This may be needed because there is a many -> one mapping. - TB_NO_REVERSE = 1 << 16, + TB_NO_REVERSE = 1 << 4, // Do not insert the forward map (RegOp -> MemOp) into the table. // This is needed for Native Client, which prohibits branch // instructions from using a memory operand. - TB_NO_FORWARD = 1 << 17, + TB_NO_FORWARD = 1 << 5, + + TB_FOLDED_LOAD = 1 << 6, + TB_FOLDED_STORE = 1 << 7, - TB_FOLDED_LOAD = 1 << 18, - TB_FOLDED_STORE = 1 << 19 + // Minimum alignment required for load/store. + // Used for RegOp->MemOp conversion. + // (stored in bits 8 - 15) + TB_ALIGN_SHIFT = 8, + TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT, + TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT, + TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT, + TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT }; struct X86OpTblEntry { uint16_t RegOp; uint16_t MemOp; - uint32_t Flags; + uint16_t Flags; }; X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) @@ -408,14 +410,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::IMUL64rri8, X86::IMUL64rmi8, 0 }, { X86::Int_COMISDrr, X86::Int_COMISDrm, 0 }, { X86::Int_COMISSrr, X86::Int_COMISSrm, 0 }, - { X86::Int_CVTDQ2PDrr, X86::Int_CVTDQ2PDrm, TB_ALIGN_16 }, - { X86::Int_CVTDQ2PSrr, X86::Int_CVTDQ2PSrm, TB_ALIGN_16 }, - { X86::Int_CVTPD2DQrr, X86::Int_CVTPD2DQrm, TB_ALIGN_16 }, - { X86::Int_CVTPD2PSrr, X86::Int_CVTPD2PSrm, TB_ALIGN_16 }, - { X86::Int_CVTPS2DQrr, X86::Int_CVTPS2DQrm, TB_ALIGN_16 }, - { X86::Int_CVTPS2PDrr, X86::Int_CVTPS2PDrm, 0 }, { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, 0 }, { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 }, + { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 }, + { X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 }, { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, 0 }, { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 }, { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 }, @@ -492,14 +490,20 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) // AVX 128-bit versions of foldable instructions { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, 0 }, { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, 0 }, - { X86::Int_VCVTDQ2PDrr, X86::Int_VCVTDQ2PDrm, TB_ALIGN_16 }, - { X86::Int_VCVTDQ2PSrr, X86::Int_VCVTDQ2PSrm, TB_ALIGN_16 }, - { X86::Int_VCVTPD2DQrr, X86::Int_VCVTPD2DQrm, TB_ALIGN_16 }, - { X86::Int_VCVTPD2PSrr, X86::Int_VCVTPD2PSrm, TB_ALIGN_16 }, - { X86::Int_VCVTPS2DQrr, X86::Int_VCVTPS2DQrm, TB_ALIGN_16 }, - { X86::Int_VCVTPS2PDrr, X86::Int_VCVTPS2PDrm, 0 }, { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, 0 }, { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, 0 }, + { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 }, + { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,0 }, + { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 }, + { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, 0 }, + { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 }, + { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,0 }, + { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 }, + { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, 0 }, + { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 }, + { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 }, + { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 }, + { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 }, { X86::FsVMOVAPDrr, X86::VMOVSDrm, TB_NO_REVERSE }, { X86::FsVMOVAPSrr, X86::VMOVSSrm, TB_NO_REVERSE }, { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 }, @@ -535,6 +539,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VSQRTPSr_Int, X86::VSQRTPSm_Int, TB_ALIGN_16 }, { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 }, { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 }, + { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE }, + // AVX 256-bit foldable instructions { X86::VMOVAPDYrr, 
X86::VMOVAPDYrm, TB_ALIGN_32 }, { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 }, @@ -543,6 +549,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 }, { X86::VPERMILPDYri, X86::VPERMILPDYmi, TB_ALIGN_32 }, { X86::VPERMILPSYri, X86::VPERMILPSYmi, TB_ALIGN_32 }, + // AVX2 foldable instructions { X86::VPABSBrr256, X86::VPABSBrm256, TB_ALIGN_32 }, { X86::VPABSDrr256, X86::VPABSDrm256, TB_ALIGN_32 }, @@ -558,6 +565,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VSQRTPDYr_Int, X86::VSQRTPDYm_Int, TB_ALIGN_32 }, { X86::VSQRTPSYr, X86::VSQRTPSYm, TB_ALIGN_32 }, { X86::VSQRTPSYr_Int, X86::VSQRTPSYm_Int, TB_ALIGN_32 }, + { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, + { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, }; for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) { @@ -808,17 +817,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 }, { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 }, { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 }, - { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 }, - { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm, 0 }, - { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 }, - { X86::Int_VCVTTSD2SIrr, X86::Int_VCVTTSD2SIrm, 0 }, - { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 }, - { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm, 0 }, - { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 }, - { X86::Int_VCVTTSS2SIrr, X86::Int_VCVTTSS2SIrm, 0 }, - { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 }, - { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 }, - { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQrm, TB_ALIGN_16 }, + { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, TB_ALIGN_16 }, { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, TB_ALIGN_16 }, { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 }, { X86::VSQRTSDr, X86::VSQRTSDm, 0 }, @@ -1122,6 +1121,158 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) // Index 2, folded load Flags | TB_INDEX_2 | TB_FOLDED_LOAD); } + + static const X86OpTblEntry OpTbl3[] = { + // FMA foldable instructions + { X86::VFMADDSSr231r, X86::VFMADDSSr231m, 0 }, + { X86::VFMADDSDr231r, X86::VFMADDSDr231m, 0 }, + { X86::VFMADDSSr132r, X86::VFMADDSSr132m, 0 }, + { X86::VFMADDSDr132r, X86::VFMADDSDr132m, 0 }, + { X86::VFMADDSSr213r, X86::VFMADDSSr213m, 0 }, + { X86::VFMADDSDr213r, X86::VFMADDSDr213m, 0 }, + { X86::VFMADDSSr132r_Int, X86::VFMADDSSr132m_Int, 0 }, + { X86::VFMADDSDr132r_Int, X86::VFMADDSDr132m_Int, 0 }, + + { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_16 }, + { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_16 }, + { X86::VFMADDPSr132r, X86::VFMADDPSr132m, TB_ALIGN_16 }, + { X86::VFMADDPDr132r, X86::VFMADDPDr132m, TB_ALIGN_16 }, + { X86::VFMADDPSr213r, X86::VFMADDPSr213m, TB_ALIGN_16 }, + { X86::VFMADDPDr213r, X86::VFMADDPDr213m, TB_ALIGN_16 }, + { X86::VFMADDPSr231rY, X86::VFMADDPSr231mY, TB_ALIGN_32 }, + { X86::VFMADDPDr231rY, X86::VFMADDPDr231mY, TB_ALIGN_32 }, + { X86::VFMADDPSr132rY, X86::VFMADDPSr132mY, TB_ALIGN_32 }, + { X86::VFMADDPDr132rY, X86::VFMADDPDr132mY, TB_ALIGN_32 }, + { X86::VFMADDPSr213rY, X86::VFMADDPSr213mY, TB_ALIGN_32 }, + { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_32 }, + { X86::VFMADDPSr132r_Int, X86::VFMADDPSr132m_Int, TB_ALIGN_16 }, + { X86::VFMADDPDr132r_Int, X86::VFMADDPDr132m_Int, TB_ALIGN_16 }, + { X86::VFMADDPSr132rY_Int, X86::VFMADDPSr132mY_Int, TB_ALIGN_32 }, + { X86::VFMADDPDr132rY_Int, X86::VFMADDPDr132mY_Int, TB_ALIGN_32 }, + + { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, 0 }, + { X86::VFNMADDSDr231r, 
X86::VFNMADDSDr231m, 0 }, + { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, 0 }, + { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, 0 }, + { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, 0 }, + { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, 0 }, + { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr132m_Int, 0 }, + { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr132m_Int, 0 }, + + { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_16 }, + { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_16 }, + { X86::VFNMADDPSr132r, X86::VFNMADDPSr132m, TB_ALIGN_16 }, + { X86::VFNMADDPDr132r, X86::VFNMADDPDr132m, TB_ALIGN_16 }, + { X86::VFNMADDPSr213r, X86::VFNMADDPSr213m, TB_ALIGN_16 }, + { X86::VFNMADDPDr213r, X86::VFNMADDPDr213m, TB_ALIGN_16 }, + { X86::VFNMADDPSr231rY, X86::VFNMADDPSr231mY, TB_ALIGN_32 }, + { X86::VFNMADDPDr231rY, X86::VFNMADDPDr231mY, TB_ALIGN_32 }, + { X86::VFNMADDPSr132rY, X86::VFNMADDPSr132mY, TB_ALIGN_32 }, + { X86::VFNMADDPDr132rY, X86::VFNMADDPDr132mY, TB_ALIGN_32 }, + { X86::VFNMADDPSr213rY, X86::VFNMADDPSr213mY, TB_ALIGN_32 }, + { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_32 }, + { X86::VFNMADDPSr132r_Int, X86::VFNMADDPSr132m_Int, TB_ALIGN_16 }, + { X86::VFNMADDPDr132r_Int, X86::VFNMADDPDr132m_Int, TB_ALIGN_16 }, + { X86::VFNMADDPSr132rY_Int, X86::VFNMADDPSr132mY_Int, TB_ALIGN_32 }, + { X86::VFNMADDPDr132rY_Int, X86::VFNMADDPDr132mY_Int, TB_ALIGN_32 }, + + { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, 0 }, + { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, 0 }, + { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, 0 }, + { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, 0 }, + { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, 0 }, + { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, 0 }, + { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr132m_Int, 0 }, + { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr132m_Int, 0 }, + + { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_16 }, + { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_16 }, + { X86::VFMSUBPSr132r, X86::VFMSUBPSr132m, TB_ALIGN_16 }, + { X86::VFMSUBPDr132r, X86::VFMSUBPDr132m, TB_ALIGN_16 }, + { X86::VFMSUBPSr213r, X86::VFMSUBPSr213m, TB_ALIGN_16 }, + { X86::VFMSUBPDr213r, X86::VFMSUBPDr213m, TB_ALIGN_16 }, + { X86::VFMSUBPSr231rY, X86::VFMSUBPSr231mY, TB_ALIGN_32 }, + { X86::VFMSUBPDr231rY, X86::VFMSUBPDr231mY, TB_ALIGN_32 }, + { X86::VFMSUBPSr132rY, X86::VFMSUBPSr132mY, TB_ALIGN_32 }, + { X86::VFMSUBPDr132rY, X86::VFMSUBPDr132mY, TB_ALIGN_32 }, + { X86::VFMSUBPSr213rY, X86::VFMSUBPSr213mY, TB_ALIGN_32 }, + { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_32 }, + { X86::VFMSUBPSr132r_Int, X86::VFMSUBPSr132m_Int, TB_ALIGN_16 }, + { X86::VFMSUBPDr132r_Int, X86::VFMSUBPDr132m_Int, TB_ALIGN_16 }, + { X86::VFMSUBPSr132rY_Int, X86::VFMSUBPSr132mY_Int, TB_ALIGN_32 }, + { X86::VFMSUBPDr132rY_Int, X86::VFMSUBPDr132mY_Int, TB_ALIGN_32 }, + + { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, 0 }, + { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, 0 }, + { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, 0 }, + { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, 0 }, + { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, 0 }, + { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, 0 }, + { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr132m_Int, 0 }, + { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr132m_Int, 0 }, + + { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_16 }, + { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_16 }, + { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr132m, TB_ALIGN_16 }, + { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr132m, TB_ALIGN_16 }, + { X86::VFNMSUBPSr213r, X86::VFNMSUBPSr213m, TB_ALIGN_16 }, + { X86::VFNMSUBPDr213r, 
X86::VFNMSUBPDr213m, TB_ALIGN_16 }, + { X86::VFNMSUBPSr231rY, X86::VFNMSUBPSr231mY, TB_ALIGN_32 }, + { X86::VFNMSUBPDr231rY, X86::VFNMSUBPDr231mY, TB_ALIGN_32 }, + { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr132mY, TB_ALIGN_32 }, + { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr132mY, TB_ALIGN_32 }, + { X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr213mY, TB_ALIGN_32 }, + { X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr213mY, TB_ALIGN_32 }, + { X86::VFNMSUBPSr132r_Int, X86::VFNMSUBPSr132m_Int, TB_ALIGN_16 }, + { X86::VFNMSUBPDr132r_Int, X86::VFNMSUBPDr132m_Int, TB_ALIGN_16 }, + { X86::VFNMSUBPSr132rY_Int, X86::VFNMSUBPSr132mY_Int, TB_ALIGN_32 }, + { X86::VFNMSUBPDr132rY_Int, X86::VFNMSUBPDr132mY_Int, TB_ALIGN_32 }, + + { X86::VFMADDSUBPSr231r, X86::VFMADDSUBPSr231m, TB_ALIGN_16 }, + { X86::VFMADDSUBPDr231r, X86::VFMADDSUBPDr231m, TB_ALIGN_16 }, + { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr132m, TB_ALIGN_16 }, + { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr132m, TB_ALIGN_16 }, + { X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr213m, TB_ALIGN_16 }, + { X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr213m, TB_ALIGN_16 }, + { X86::VFMADDSUBPSr231rY, X86::VFMADDSUBPSr231mY, TB_ALIGN_32 }, + { X86::VFMADDSUBPDr231rY, X86::VFMADDSUBPDr231mY, TB_ALIGN_32 }, + { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr132mY, TB_ALIGN_32 }, + { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr132mY, TB_ALIGN_32 }, + { X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr213mY, TB_ALIGN_32 }, + { X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr213mY, TB_ALIGN_32 }, + { X86::VFMADDSUBPSr132r_Int, X86::VFMADDSUBPSr132m_Int, TB_ALIGN_16 }, + { X86::VFMADDSUBPDr132r_Int, X86::VFMADDSUBPDr132m_Int, TB_ALIGN_16 }, + { X86::VFMADDSUBPSr132rY_Int, X86::VFMADDSUBPSr132mY_Int, TB_ALIGN_32 }, + { X86::VFMADDSUBPDr132rY_Int, X86::VFMADDSUBPDr132mY_Int, TB_ALIGN_32 }, + + { X86::VFMSUBADDPSr231r, X86::VFMSUBADDPSr231m, TB_ALIGN_16 }, + { X86::VFMSUBADDPDr231r, X86::VFMSUBADDPDr231m, TB_ALIGN_16 }, + { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr132m, TB_ALIGN_16 }, + { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr132m, TB_ALIGN_16 }, + { X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr213m, TB_ALIGN_16 }, + { X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr213m, TB_ALIGN_16 }, + { X86::VFMSUBADDPSr231rY, X86::VFMSUBADDPSr231mY, TB_ALIGN_32 }, + { X86::VFMSUBADDPDr231rY, X86::VFMSUBADDPDr231mY, TB_ALIGN_32 }, + { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr132mY, TB_ALIGN_32 }, + { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_32 }, + { X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_32 }, + { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_32 }, + { X86::VFMSUBADDPSr132r_Int, X86::VFMSUBADDPSr132m_Int, TB_ALIGN_16 }, + { X86::VFMSUBADDPDr132r_Int, X86::VFMSUBADDPDr132m_Int, TB_ALIGN_16 }, + { X86::VFMSUBADDPSr132rY_Int, X86::VFMSUBADDPSr132mY_Int, TB_ALIGN_32 }, + { X86::VFMSUBADDPDr132rY_Int, X86::VFMSUBADDPDr132mY_Int, TB_ALIGN_32 }, + }; + + for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) { + unsigned RegOp = OpTbl3[i].RegOp; + unsigned MemOp = OpTbl3[i].MemOp; + unsigned Flags = OpTbl3[i].Flags; + AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, + RegOp, MemOp, + // Index 3, folded load + Flags | TB_INDEX_3 | TB_FOLDED_LOAD); + } + } void @@ -1782,12 +1933,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r : (is64Bit ? X86::LEA64_32r : X86::LEA32r); + const TargetRegisterClass *RC = MIOpc == X86::INC64r ? 
+ (const TargetRegisterClass*)&X86::GR64_NOSPRegClass : + (const TargetRegisterClass*)&X86::GR32_NOSPRegClass; // LEA can't handle RSP. if (TargetRegisterInfo::isVirtualRegister(Src) && - !MF.getRegInfo().constrainRegClass(Src, - MIOpc == X86::INC64r ? X86::GR64_NOSPRegisterClass : - X86::GR32_NOSPRegisterClass)) + !MF.getRegInfo().constrainRegClass(Src, RC)) return 0; NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) @@ -1812,11 +1964,12 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r : (is64Bit ? X86::LEA64_32r : X86::LEA32r); + const TargetRegisterClass *RC = MIOpc == X86::DEC64r ? + (const TargetRegisterClass*)&X86::GR64_NOSPRegClass : + (const TargetRegisterClass*)&X86::GR32_NOSPRegClass; // LEA can't handle RSP. if (TargetRegisterInfo::isVirtualRegister(Src) && - !MF.getRegInfo().constrainRegClass(Src, - MIOpc == X86::DEC64r ? X86::GR64_NOSPRegisterClass : - X86::GR32_NOSPRegisterClass)) + !MF.getRegInfo().constrainRegClass(Src, RC)) return 0; NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) @@ -1844,10 +1997,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, const TargetRegisterClass *RC; if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) { Opc = X86::LEA64r; - RC = X86::GR64_NOSPRegisterClass; + RC = &X86::GR64_NOSPRegClass; } else { Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; - RC = X86::GR32_NOSPRegisterClass; + RC = &X86::GR32_NOSPRegClass; } @@ -1863,6 +2016,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, .addReg(Dest, RegState::Define | getDeadRegState(isDead)), Src, isKill, Src2, isKill2); + + // Preserve undefness of the operands. + bool isUndef = MI->getOperand(1).isUndef(); + bool isUndef2 = MI->getOperand(2).isUndef(); + NewMI->getOperand(1).setIsUndef(isUndef); + NewMI->getOperand(3).setIsUndef(isUndef2); + if (LV && isKill2) LV->replaceKillInstruction(Src2, MI, NewMI); break; @@ -2079,7 +2239,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { } } -static X86::CondCode GetCondFromBranchOpc(unsigned BrOpc) { +static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) { switch (BrOpc) { default: return X86::COND_INVALID; case X86::JE_4: return X86::COND_E; @@ -2101,6 +2261,84 @@ static X86::CondCode GetCondFromBranchOpc(unsigned BrOpc) { } } +/// getCondFromSETOpc - return condition code of a SET opcode. 
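/// A sketch of the intended use (SetMI, naming a SETcc user, is a
/// hypothetical variable here; X86::GetOppositeBranchCondition is the
/// existing helper in the X86 namespace):
///   X86::CondCode CC = getCondFromSETOpc(SetMI->getOpcode());
///   if (CC != X86::COND_INVALID)
///     CC = X86::GetOppositeBranchCondition(CC); // re-target the SETcc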
+static X86::CondCode getCondFromSETOpc(unsigned Opc) {
+  switch (Opc) {
+  default: return X86::COND_INVALID;
+  case X86::SETAr:  case X86::SETAm:  return X86::COND_A;
+  case X86::SETAEr: case X86::SETAEm: return X86::COND_AE;
+  case X86::SETBr:  case X86::SETBm:  return X86::COND_B;
+  case X86::SETBEr: case X86::SETBEm: return X86::COND_BE;
+  case X86::SETEr:  case X86::SETEm:  return X86::COND_E;
+  case X86::SETGr:  case X86::SETGm:  return X86::COND_G;
+  case X86::SETGEr: case X86::SETGEm: return X86::COND_GE;
+  case X86::SETLr:  case X86::SETLm:  return X86::COND_L;
+  case X86::SETLEr: case X86::SETLEm: return X86::COND_LE;
+  case X86::SETNEr: case X86::SETNEm: return X86::COND_NE;
+  case X86::SETNOr: case X86::SETNOm: return X86::COND_NO;
+  case X86::SETNPr: case X86::SETNPm: return X86::COND_NP;
+  case X86::SETNSr: case X86::SETNSm: return X86::COND_NS;
+  case X86::SETOr:  case X86::SETOm:  return X86::COND_O;
+  case X86::SETPr:  case X86::SETPm:  return X86::COND_P;
+  case X86::SETSr:  case X86::SETSm:  return X86::COND_S;
+  }
+}
+
+/// getCondFromCMovOpc - return condition code of a CMov opcode.
+static X86::CondCode getCondFromCMovOpc(unsigned Opc) {
+  switch (Opc) {
+  default: return X86::COND_INVALID;
+  case X86::CMOVA16rm:  case X86::CMOVA16rr:  case X86::CMOVA32rm:
+  case X86::CMOVA32rr:  case X86::CMOVA64rm:  case X86::CMOVA64rr:
+    return X86::COND_A;
+  case X86::CMOVAE16rm: case X86::CMOVAE16rr: case X86::CMOVAE32rm:
+  case X86::CMOVAE32rr: case X86::CMOVAE64rm: case X86::CMOVAE64rr:
+    return X86::COND_AE;
+  case X86::CMOVB16rm:  case X86::CMOVB16rr:  case X86::CMOVB32rm:
+  case X86::CMOVB32rr:  case X86::CMOVB64rm:  case X86::CMOVB64rr:
+    return X86::COND_B;
+  case X86::CMOVBE16rm: case X86::CMOVBE16rr: case X86::CMOVBE32rm:
+  case X86::CMOVBE32rr: case X86::CMOVBE64rm: case X86::CMOVBE64rr:
+    return X86::COND_BE;
+  case X86::CMOVE16rm:  case X86::CMOVE16rr:  case X86::CMOVE32rm:
+  case X86::CMOVE32rr:  case X86::CMOVE64rm:  case X86::CMOVE64rr:
+    return X86::COND_E;
+  case X86::CMOVG16rm:  case X86::CMOVG16rr:  case X86::CMOVG32rm:
+  case X86::CMOVG32rr:  case X86::CMOVG64rm:  case X86::CMOVG64rr:
+    return X86::COND_G;
+  case X86::CMOVGE16rm: case X86::CMOVGE16rr: case X86::CMOVGE32rm:
+  case X86::CMOVGE32rr: case X86::CMOVGE64rm: case X86::CMOVGE64rr:
+    return X86::COND_GE;
+  case X86::CMOVL16rm:  case X86::CMOVL16rr:  case X86::CMOVL32rm:
+  case X86::CMOVL32rr:  case X86::CMOVL64rm:  case X86::CMOVL64rr:
+    return X86::COND_L;
+  case X86::CMOVLE16rm: case X86::CMOVLE16rr: case X86::CMOVLE32rm:
+  case X86::CMOVLE32rr: case X86::CMOVLE64rm: case X86::CMOVLE64rr:
+    return X86::COND_LE;
+  case X86::CMOVNE16rm: case X86::CMOVNE16rr: case X86::CMOVNE32rm:
+  case X86::CMOVNE32rr: case X86::CMOVNE64rm: case X86::CMOVNE64rr:
+    return X86::COND_NE;
+  case X86::CMOVNO16rm: case X86::CMOVNO16rr: case X86::CMOVNO32rm:
+  case X86::CMOVNO32rr: case X86::CMOVNO64rm: case X86::CMOVNO64rr:
+    return X86::COND_NO;
+  case X86::CMOVNP16rm: case X86::CMOVNP16rr: case X86::CMOVNP32rm:
+  case X86::CMOVNP32rr: case X86::CMOVNP64rm: case X86::CMOVNP64rr:
+    return X86::COND_NP;
+  case X86::CMOVNS16rm: case X86::CMOVNS16rr: case X86::CMOVNS32rm:
+  case X86::CMOVNS32rr: case X86::CMOVNS64rm: case X86::CMOVNS64rr:
+    return X86::COND_NS;
+  case X86::CMOVO16rm:  case X86::CMOVO16rr:  case X86::CMOVO32rm:
+  case X86::CMOVO32rr:  case X86::CMOVO64rm:  case X86::CMOVO64rr:
+    return X86::COND_O;
+  case X86::CMOVP16rm:  case X86::CMOVP16rr:  case X86::CMOVP32rm:
+  case X86::CMOVP32rr:  case X86::CMOVP64rm:  case X86::CMOVP64rr:
+    return X86::COND_P;
+  case X86::CMOVS16rm:  case X86::CMOVS16rr:  case X86::CMOVS32rm:
+  case X86::CMOVS32rr:  case X86::CMOVS64rm:  case X86::CMOVS64rr:
+    return X86::COND_S;
+  }
+}
+
 unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
   switch (CC) {
   default: llvm_unreachable("Illegal condition code!");
@@ -2147,6 +2385,101 @@ X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
   }
 }
 
+/// getSwappedCondition - assume the flags are set by MI(a,b); return the
+/// condition code to use if the instruction is changed so that the flags
+/// are set by MI(b,a).
+static X86::CondCode getSwappedCondition(X86::CondCode CC) {
+  switch (CC) {
+  default: return X86::COND_INVALID;
+  case X86::COND_E:  return X86::COND_E;
+  case X86::COND_NE: return X86::COND_NE;
+  case X86::COND_L:  return X86::COND_G;
+  case X86::COND_LE: return X86::COND_GE;
+  case X86::COND_G:  return X86::COND_L;
+  case X86::COND_GE: return X86::COND_LE;
+  case X86::COND_B:  return X86::COND_A;
+  case X86::COND_BE: return X86::COND_AE;
+  case X86::COND_A:  return X86::COND_B;
+  case X86::COND_AE: return X86::COND_BE;
+  }
+}
+
+/// getSETFromCond - Return a set opcode for the given condition and
+/// whether it has a memory operand.
+static unsigned getSETFromCond(X86::CondCode CC,
+                               bool HasMemoryOperand) {
+  static const unsigned Opc[16][2] = {
+    { X86::SETAr,  X86::SETAm  },
+    { X86::SETAEr, X86::SETAEm },
+    { X86::SETBr,  X86::SETBm  },
+    { X86::SETBEr, X86::SETBEm },
+    { X86::SETEr,  X86::SETEm  },
+    { X86::SETGr,  X86::SETGm  },
+    { X86::SETGEr, X86::SETGEm },
+    { X86::SETLr,  X86::SETLm  },
+    { X86::SETLEr, X86::SETLEm },
+    { X86::SETNEr, X86::SETNEm },
+    { X86::SETNOr, X86::SETNOm },
+    { X86::SETNPr, X86::SETNPm },
+    { X86::SETNSr, X86::SETNSm },
+    { X86::SETOr,  X86::SETOm  },
+    { X86::SETPr,  X86::SETPm  },
+    { X86::SETSr,  X86::SETSm  }
+  };
+
+  assert(CC < 16 && "Can only handle standard cond codes");
+  return Opc[CC][HasMemoryOperand ? 1 : 0];
+}
+
+/// getCMovFromCond - Return a cmov opcode for the given condition,
+/// register size in bytes, and operand type.
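+/// For example, (COND_E, 4, false) selects CMOVE32rr, while (COND_E, 4, true)
+/// selects the load-folded form CMOVE32rm.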
+static unsigned getCMovFromCond(X86::CondCode CC, unsigned RegBytes, + bool HasMemoryOperand) { + static const unsigned Opc[32][3] = { + { X86::CMOVA16rr, X86::CMOVA32rr, X86::CMOVA64rr }, + { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr }, + { X86::CMOVB16rr, X86::CMOVB32rr, X86::CMOVB64rr }, + { X86::CMOVBE16rr, X86::CMOVBE32rr, X86::CMOVBE64rr }, + { X86::CMOVE16rr, X86::CMOVE32rr, X86::CMOVE64rr }, + { X86::CMOVG16rr, X86::CMOVG32rr, X86::CMOVG64rr }, + { X86::CMOVGE16rr, X86::CMOVGE32rr, X86::CMOVGE64rr }, + { X86::CMOVL16rr, X86::CMOVL32rr, X86::CMOVL64rr }, + { X86::CMOVLE16rr, X86::CMOVLE32rr, X86::CMOVLE64rr }, + { X86::CMOVNE16rr, X86::CMOVNE32rr, X86::CMOVNE64rr }, + { X86::CMOVNO16rr, X86::CMOVNO32rr, X86::CMOVNO64rr }, + { X86::CMOVNP16rr, X86::CMOVNP32rr, X86::CMOVNP64rr }, + { X86::CMOVNS16rr, X86::CMOVNS32rr, X86::CMOVNS64rr }, + { X86::CMOVO16rr, X86::CMOVO32rr, X86::CMOVO64rr }, + { X86::CMOVP16rr, X86::CMOVP32rr, X86::CMOVP64rr }, + { X86::CMOVS16rr, X86::CMOVS32rr, X86::CMOVS64rr }, + { X86::CMOVA16rm, X86::CMOVA32rm, X86::CMOVA64rm }, + { X86::CMOVAE16rm, X86::CMOVAE32rm, X86::CMOVAE64rm }, + { X86::CMOVB16rm, X86::CMOVB32rm, X86::CMOVB64rm }, + { X86::CMOVBE16rm, X86::CMOVBE32rm, X86::CMOVBE64rm }, + { X86::CMOVE16rm, X86::CMOVE32rm, X86::CMOVE64rm }, + { X86::CMOVG16rm, X86::CMOVG32rm, X86::CMOVG64rm }, + { X86::CMOVGE16rm, X86::CMOVGE32rm, X86::CMOVGE64rm }, + { X86::CMOVL16rm, X86::CMOVL32rm, X86::CMOVL64rm }, + { X86::CMOVLE16rm, X86::CMOVLE32rm, X86::CMOVLE64rm }, + { X86::CMOVNE16rm, X86::CMOVNE32rm, X86::CMOVNE64rm }, + { X86::CMOVNO16rm, X86::CMOVNO32rm, X86::CMOVNO64rm }, + { X86::CMOVNP16rm, X86::CMOVNP32rm, X86::CMOVNP64rm }, + { X86::CMOVNS16rm, X86::CMOVNS32rm, X86::CMOVNS64rm }, + { X86::CMOVO16rm, X86::CMOVO32rm, X86::CMOVO64rm }, + { X86::CMOVP16rm, X86::CMOVP32rm, X86::CMOVP64rm }, + { X86::CMOVS16rm, X86::CMOVS32rm, X86::CMOVS64rm } + }; + + assert(CC < 16 && "Can only handle standard cond codes"); + unsigned Idx = HasMemoryOperand ? 16+CC : CC; + switch(RegBytes) { + default: llvm_unreachable("Illegal register size!"); + case 2: return Opc[Idx][0]; + case 4: return Opc[Idx][1]; + case 8: return Opc[Idx][2]; + } +} + bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { if (!MI->isTerminator()) return false; @@ -2213,7 +2546,7 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, } // Handle conditional branches. - X86::CondCode BranchCode = GetCondFromBranchOpc(I->getOpcode()); + X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode()); if (BranchCode == X86::COND_INVALID) return true; // Can't handle indirect branch. @@ -2311,7 +2644,7 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { if (I->isDebugValue()) continue; if (I->getOpcode() != X86::JMP_4 && - GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID) + getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID) break; // Remove the branch. I->eraseFromParent(); @@ -2371,6 +2704,56 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, return Count; } +bool X86InstrInfo:: +canInsertSelect(const MachineBasicBlock &MBB, + const SmallVectorImpl<MachineOperand> &Cond, + unsigned TrueReg, unsigned FalseReg, + int &CondCycles, int &TrueCycles, int &FalseCycles) const { + // Not all subtargets have cmov instructions. + if (!TM.getSubtarget<X86Subtarget>().hasCMov()) + return false; + if (Cond.size() != 1) + return false; + // We cannot do the composite conditions, at least not in SSA form. 
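+  // (Condition codes past COND_S, e.g. COND_NE_OR_P from floating point
+  // compares, would need two cmov instructions to materialize.)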
+  if ((X86::CondCode)Cond[0].getImm() > X86::COND_S)
+    return false;
+
+  // Check register classes.
+  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *RC =
+    RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+  if (!RC)
+    return false;
+
+  // We have cmov instructions for 16-, 32-, and 64-bit general purpose
+  // registers.
+  if (X86::GR16RegClass.hasSubClassEq(RC) ||
+      X86::GR32RegClass.hasSubClassEq(RC) ||
+      X86::GR64RegClass.hasSubClassEq(RC)) {
+    // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
+    // Bridge. Probably Ivy Bridge as well.
+    CondCycles = 2;
+    TrueCycles = 2;
+    FalseCycles = 2;
+    return true;
+  }
+
+  // Can't do vectors.
+  return false;
+}
+
+void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I, DebugLoc DL,
+                                unsigned DstReg,
+                                const SmallVectorImpl<MachineOperand> &Cond,
+                                unsigned TrueReg, unsigned FalseReg) const {
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  assert(Cond.size() == 1 && "Invalid Cond array");
+  unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(),
+                                 MRI.getRegClass(DstReg)->getSize(),
+                                 false/*HasMemoryOperand*/);
+  BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg);
+}
+
 /// isHReg - Test if the given register is a physical h register.
 static bool isHReg(unsigned Reg) {
   return X86::GR8_ABCD_HRegClass.contains(Reg);
@@ -2637,6 +3020,305 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
   NewMIs.push_back(MIB);
 }
 
+bool X86InstrInfo::
+analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
+               int &CmpMask, int &CmpValue) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case X86::CMP64ri32:
+  case X86::CMP64ri8:
+  case X86::CMP32ri:
+  case X86::CMP32ri8:
+  case X86::CMP16ri:
+  case X86::CMP16ri8:
+  case X86::CMP8ri:
+    SrcReg = MI->getOperand(0).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = MI->getOperand(1).getImm();
+    return true;
+  case X86::CMP64rr:
+  case X86::CMP32rr:
+  case X86::CMP16rr:
+  case X86::CMP8rr:
+    SrcReg = MI->getOperand(0).getReg();
+    SrcReg2 = MI->getOperand(1).getReg();
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  case X86::TEST8rr:
+  case X86::TEST16rr:
+  case X86::TEST32rr:
+  case X86::TEST64rr:
+    SrcReg = MI->getOperand(0).getReg();
+    if (MI->getOperand(1).getReg() != SrcReg) return false;
+    // Compare against zero.
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  }
+  return false;
+}
+
+/// isRedundantFlagInstr - check whether the first instruction, whose only
+/// purpose is to update flags, can be made redundant.
+/// CMPrr can be made redundant by SUBrr if the operands are the same.
+/// This function can be extended later on.
+/// SrcReg, SrcReg2: register operands for FlagI.
+/// ImmValue: immediate for FlagI if it takes an immediate.
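+/// For example, a SUB that computes a-b sets EFLAGS exactly as CMP a, b
+/// would, so a CMP over the same operands is redundant; with the operands
+/// swapped it is still redundant provided the users' condition codes are
+/// swapped as well (see getSwappedCondition).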
+inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg, + unsigned SrcReg2, int ImmValue, + MachineInstr *OI) { + if (((FlagI->getOpcode() == X86::CMP64rr && + OI->getOpcode() == X86::SUB64rr) || + (FlagI->getOpcode() == X86::CMP32rr && + OI->getOpcode() == X86::SUB32rr)|| + (FlagI->getOpcode() == X86::CMP16rr && + OI->getOpcode() == X86::SUB16rr)|| + (FlagI->getOpcode() == X86::CMP8rr && + OI->getOpcode() == X86::SUB8rr)) && + ((OI->getOperand(1).getReg() == SrcReg && + OI->getOperand(2).getReg() == SrcReg2) || + (OI->getOperand(1).getReg() == SrcReg2 && + OI->getOperand(2).getReg() == SrcReg))) + return true; + + if (((FlagI->getOpcode() == X86::CMP64ri32 && + OI->getOpcode() == X86::SUB64ri32) || + (FlagI->getOpcode() == X86::CMP64ri8 && + OI->getOpcode() == X86::SUB64ri8) || + (FlagI->getOpcode() == X86::CMP32ri && + OI->getOpcode() == X86::SUB32ri) || + (FlagI->getOpcode() == X86::CMP32ri8 && + OI->getOpcode() == X86::SUB32ri8) || + (FlagI->getOpcode() == X86::CMP16ri && + OI->getOpcode() == X86::SUB16ri) || + (FlagI->getOpcode() == X86::CMP16ri8 && + OI->getOpcode() == X86::SUB16ri8) || + (FlagI->getOpcode() == X86::CMP8ri && + OI->getOpcode() == X86::SUB8ri)) && + OI->getOperand(1).getReg() == SrcReg && + OI->getOperand(2).getImm() == ImmValue) + return true; + return false; +} + +/// isDefConvertible - check whether the definition can be converted +/// to remove a comparison against zero. +inline static bool isDefConvertible(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: return false; + case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri: + case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8: + case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr: + case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm: + case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm: + case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri: + case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8: + case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr: + case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm: + case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm: + case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri: + case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8: + case X86::AND8ri: case X86::AND64rr: case X86::AND32rr: + case X86::AND16rr: case X86::AND8rr: case X86::AND64rm: + case X86::AND32rm: case X86::AND16rm: case X86::AND8rm: + case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri: + case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8: + case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr: + case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm: + case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm: + case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri: + case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8: + case X86::OR8ri: case X86::OR64rr: case X86::OR32rr: + case X86::OR16rr: case X86::OR8rr: case X86::OR64rm: + case X86::OR32rm: case X86::OR16rm: case X86::OR8rm: + return true; + } +} + +/// optimizeCompareInstr - Check if there exists an earlier instruction that +/// operates on the same source operands and sets flags in the same way as +/// Compare; remove Compare if possible. +bool X86InstrInfo:: +optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, + int CmpMask, int CmpValue, + const MachineRegisterInfo *MRI) const { + // Get the unique definition of SrcReg. 
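+  // (getUniqueVRegDef returns null when SrcReg has no or multiple
+  // definitions, in which case the flag-setting instruction cannot be
+  // identified and we give up.)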
+ MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); + if (!MI) return false; + + // CmpInstr is the first instruction of the BB. + MachineBasicBlock::iterator I = CmpInstr, Def = MI; + + // If we are comparing against zero, check whether we can use MI to update + // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize. + bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0); + if (IsCmpZero && (MI->getParent() != CmpInstr->getParent() || + !isDefConvertible(MI))) + return false; + + // We are searching for an earlier instruction that can make CmpInstr + // redundant and that instruction will be saved in Sub. + MachineInstr *Sub = NULL; + const TargetRegisterInfo *TRI = &getRegisterInfo(); + + // We iterate backward, starting from the instruction before CmpInstr and + // stop when reaching the definition of a source register or done with the BB. + // RI points to the instruction before CmpInstr. + // If the definition is in this basic block, RE points to the definition; + // otherwise, RE is the rend of the basic block. + MachineBasicBlock::reverse_iterator + RI = MachineBasicBlock::reverse_iterator(I), + RE = CmpInstr->getParent() == MI->getParent() ? + MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ : + CmpInstr->getParent()->rend(); + MachineInstr *Movr0Inst = 0; + for (; RI != RE; ++RI) { + MachineInstr *Instr = &*RI; + // Check whether CmpInstr can be made redundant by the current instruction. + if (!IsCmpZero && + isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) { + Sub = Instr; + break; + } + + if (Instr->modifiesRegister(X86::EFLAGS, TRI) || + Instr->readsRegister(X86::EFLAGS, TRI)) { + // This instruction modifies or uses EFLAGS. + + // MOV32r0 etc. are implemented with xor which clobbers condition code. + // They are safe to move up, if the definition to EFLAGS is dead and + // earlier instructions do not read or write EFLAGS. + if (!Movr0Inst && (Instr->getOpcode() == X86::MOV8r0 || + Instr->getOpcode() == X86::MOV16r0 || + Instr->getOpcode() == X86::MOV32r0 || + Instr->getOpcode() == X86::MOV64r0) && + Instr->registerDefIsDead(X86::EFLAGS, TRI)) { + Movr0Inst = Instr; + continue; + } + + // We can't remove CmpInstr. + return false; + } + } + + // Return false if no candidates exist. + if (!IsCmpZero && !Sub) + return false; + + bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 && + Sub->getOperand(2).getReg() == SrcReg); + + // Scan forward from the instruction after CmpInstr for uses of EFLAGS. + // It is safe to remove CmpInstr if EFLAGS is redefined or killed. + // If we are done with the basic block, we need to check whether EFLAGS is + // live-out. + bool IsSafe = false; + SmallVector<std::pair<MachineInstr*, unsigned /*NewOpc*/>, 4> OpsToUpdate; + MachineBasicBlock::iterator E = CmpInstr->getParent()->end(); + for (++I; I != E; ++I) { + const MachineInstr &Instr = *I; + if (Instr.modifiesRegister(X86::EFLAGS, TRI)) { + // It is safe to remove CmpInstr if EFLAGS is updated again. + IsSafe = true; + break; + } + if (!Instr.readsRegister(X86::EFLAGS, TRI)) + continue; + + // EFLAGS is used by this instruction. + X86::CondCode OldCC; + bool OpcIsSET = false; + if (IsCmpZero || IsSwapped) { + // We decode the condition code from opcode. 
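+      // (Jcc, SETcc and CMOVcc all encode their condition in the opcode, so
+      // try each decoder in turn.)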
+ if (Instr.isBranch()) + OldCC = getCondFromBranchOpc(Instr.getOpcode()); + else { + OldCC = getCondFromSETOpc(Instr.getOpcode()); + if (OldCC != X86::COND_INVALID) + OpcIsSET = true; + else + OldCC = getCondFromCMovOpc(Instr.getOpcode()); + } + if (OldCC == X86::COND_INVALID) return false; + } + if (IsCmpZero) { + switch (OldCC) { + default: break; + case X86::COND_A: case X86::COND_AE: + case X86::COND_B: case X86::COND_BE: + case X86::COND_G: case X86::COND_GE: + case X86::COND_L: case X86::COND_LE: + case X86::COND_O: case X86::COND_NO: + // CF and OF are used, we can't perform this optimization. + return false; + } + } else if (IsSwapped) { + // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs + // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc. + // We swap the condition code and synthesize the new opcode. + X86::CondCode NewCC = getSwappedCondition(OldCC); + if (NewCC == X86::COND_INVALID) return false; + + // Synthesize the new opcode. + bool HasMemoryOperand = Instr.hasOneMemOperand(); + unsigned NewOpc; + if (Instr.isBranch()) + NewOpc = GetCondBranchFromCond(NewCC); + else if(OpcIsSET) + NewOpc = getSETFromCond(NewCC, HasMemoryOperand); + else { + unsigned DstReg = Instr.getOperand(0).getReg(); + NewOpc = getCMovFromCond(NewCC, MRI->getRegClass(DstReg)->getSize(), + HasMemoryOperand); + } + + // Push the MachineInstr to OpsToUpdate. + // If it is safe to remove CmpInstr, the condition code of these + // instructions will be modified. + OpsToUpdate.push_back(std::make_pair(&*I, NewOpc)); + } + if (Instr.killsRegister(X86::EFLAGS, TRI)) { + IsSafe = true; + break; + } + } + + // If EFLAGS is not killed nor re-defined, we should check whether it is + // live-out. If it is live-out, do not optimize. + if ((IsCmpZero || IsSwapped) && !IsSafe) { + MachineBasicBlock *MBB = CmpInstr->getParent(); + for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), + SE = MBB->succ_end(); SI != SE; ++SI) + if ((*SI)->isLiveIn(X86::EFLAGS)) + return false; + } + + // The instruction to be updated is either Sub or MI. + Sub = IsCmpZero ? MI : Sub; + // Move Movr0Inst to the place right before Sub. + if (Movr0Inst) { + Sub->getParent()->remove(Movr0Inst); + Sub->getParent()->insert(MachineBasicBlock::iterator(Sub), Movr0Inst); + } + + // Make sure Sub instruction defines EFLAGS. + assert(Sub->getNumOperands() >= 2 && + Sub->getOperand(Sub->getNumOperands()-1).isReg() && + Sub->getOperand(Sub->getNumOperands()-1).getReg() == X86::EFLAGS && + "EFLAGS should be the last operand of SUB, ADD, OR, XOR, AND"); + Sub->getOperand(Sub->getNumOperands()-1).setIsDef(true); + CmpInstr->eraseFromParent(); + + // Modify the condition code of instructions in OpsToUpdate. + for (unsigned i = 0, e = OpsToUpdate.size(); i < e; i++) + OpsToUpdate[i].first->setDesc(get(OpsToUpdate[i].second)); + return true; +} + /// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr /// instruction with two undef reads of the register being defined. This is /// used for mapping: @@ -2809,7 +3491,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, return NULL; bool NarrowToMOV32rm = false; if (Size) { - unsigned RCSize = getRegClass(MI->getDesc(), i, &RI)->getSize(); + unsigned RCSize = getRegClass(MI->getDesc(), i, &RI, MF)->getSize(); if (Size < RCSize) { // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. 
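      // (For instance, a 4-byte stack object cannot be folded into an 8-byte
      // load outright; the NarrowToMOV32rm path instead folds it as a 32-bit
      // load, which on x86-64 implicitly zeroes the upper half.)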
@@ -3202,7 +3884,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
   UnfoldStore &= FoldedStore;
 
   const MCInstrDesc &MCID = get(Opc);
-  const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI);
+  const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
   if (!MI->hasOneMemOperand() &&
       RC == &X86::VR128RegClass &&
       !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
@@ -3297,7 +3979,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
 
   // Emit the store instruction.
   if (UnfoldStore) {
-    const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI);
+    const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF);
     std::pair<MachineInstr::mmo_iterator,
               MachineInstr::mmo_iterator> MMOs =
       MF.extractStoreMemRefs(MI->memoperands_begin(),
@@ -3323,7 +4005,8 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
   bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
   bool FoldedStore = I->second.second & TB_FOLDED_STORE;
   const MCInstrDesc &MCID = get(Opc);
-  const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI);
+  MachineFunction &MF = DAG.getMachineFunction();
+  const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
   unsigned NumDefs = MCID.NumDefs;
   std::vector<SDValue> AddrOps;
   std::vector<SDValue> BeforeOps;
@@ -3344,7 +4027,6 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
 
   // Emit the load instruction.
   SDNode *Load = 0;
-  MachineFunction &MF = DAG.getMachineFunction();
   if (FoldedLoad) {
     EVT VT = *RC->vt_begin();
     std::pair<MachineInstr::mmo_iterator,
@@ -3371,7 +4053,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
   std::vector<EVT> VTs;
   const TargetRegisterClass *DstRC = 0;
   if (MCID.getNumDefs() > 0) {
-    DstRC = getRegClass(MCID, 0, &RI);
+    DstRC = getRegClass(MCID, 0, &RI, MF);
     VTs.push_back(*DstRC->vt_begin());
   }
   for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
@@ -3625,7 +4307,7 @@ unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
   // Create the register. The code to initialize it is inserted
   // later, by the CGBR pass (below).
   MachineRegisterInfo &RegInfo = MF->getRegInfo();
-  GlobalBaseReg = RegInfo.createVirtualRegister(X86::GR32RegisterClass);
+  GlobalBaseReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
   X86FI->setGlobalBaseReg(GlobalBaseReg);
   return GlobalBaseReg;
 }
@@ -3835,7 +4517,7 @@ namespace {
       unsigned PC;
       if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT())
-        PC = RegInfo.createVirtualRegister(X86::GR32RegisterClass);
+        PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
       else
         PC = GlobalBaseReg;
 
@@ -3869,3 +4551,117 @@ namespace {
 char CGBR::ID = 0;
 FunctionPass*
 llvm::createGlobalBaseRegPass() { return new CGBR(); }
+
+namespace {
+  struct LDTLSCleanup : public MachineFunctionPass {
+    static char ID;
+    LDTLSCleanup() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF) {
+      X86MachineFunctionInfo* MFI = MF.getInfo<X86MachineFunctionInfo>();
+      if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
+        // No point folding accesses if there aren't at least two.
+        return false;
+      }
+
+      MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+      return VisitNode(DT->getRootNode(), 0);
+    }
+
+    // Visit the dominator subtree rooted at Node in pre-order.
+    // If TLSBaseAddrReg is non-null, then use that to replace any
+    // TLS_base_addr instructions. Otherwise, create the register
+    // when the first such instruction is seen, and then use it
+    // as we encounter more instructions.
+    bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
+      MachineBasicBlock *BB = Node->getBlock();
+      bool Changed = false;
+
+      // Traverse the current block.
+      for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+           ++I) {
+        switch (I->getOpcode()) {
+          case X86::TLS_base_addr32:
+          case X86::TLS_base_addr64:
+            if (TLSBaseAddrReg)
+              I = ReplaceTLSBaseAddrCall(I, TLSBaseAddrReg);
+            else
+              I = SetRegister(I, &TLSBaseAddrReg);
+            Changed = true;
+            break;
+          default:
+            break;
+        }
+      }
+
+      // Visit the children of this block in the dominator tree.
+      for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
+           I != E; ++I) {
+        Changed |= VisitNode(*I, TLSBaseAddrReg);
+      }
+
+      return Changed;
+    }
+
+    // Replace the TLS_base_addr instruction I with a copy from
+    // TLSBaseAddrReg, returning the new instruction.
+    MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I,
+                                         unsigned TLSBaseAddrReg) {
+      MachineFunction *MF = I->getParent()->getParent();
+      const X86TargetMachine *TM =
+          static_cast<const X86TargetMachine *>(&MF->getTarget());
+      const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
+      const X86InstrInfo *TII = TM->getInstrInfo();
+
+      // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
+      MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
+                                   TII->get(TargetOpcode::COPY),
+                                   is64Bit ? X86::RAX : X86::EAX)
+                                   .addReg(TLSBaseAddrReg);
+
+      // Erase the TLS_base_addr instruction.
+      I->eraseFromParent();
+
+      return Copy;
+    }
+
+    // Create a virtual register in *TLSBaseAddrReg, and populate it by
+    // inserting a copy instruction after I. Returns the new instruction.
+    MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
+      MachineFunction *MF = I->getParent()->getParent();
+      const X86TargetMachine *TM =
+          static_cast<const X86TargetMachine *>(&MF->getTarget());
+      const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
+      const X86InstrInfo *TII = TM->getInstrInfo();
+
+      // Create a virtual register for the TLS base address.
+      MachineRegisterInfo &RegInfo = MF->getRegInfo();
+      *TLSBaseAddrReg = RegInfo.createVirtualRegister(is64Bit
+                                                      ? &X86::GR64RegClass
+                                                      : &X86::GR32RegClass);
+
+      // Insert a copy from RAX/EAX to TLSBaseAddrReg.
+      MachineInstr *Next = I->getNextNode();
+      MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
+                                   TII->get(TargetOpcode::COPY),
+                                   *TLSBaseAddrReg)
+                                   .addReg(is64Bit ? X86::RAX : X86::EAX);
+
+      return Copy;
+    }
+
+    virtual const char *getPassName() const {
+      return "Local Dynamic TLS Access Clean-up";
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<MachineDominatorTree>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+char LDTLSCleanup::ID = 0;
+FunctionPass*
+llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index b23d756..ec9b2e6 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -128,8 +128,8 @@ class X86InstrInfo : public X86GenInstrInfo {
   X86TargetMachine &TM;
   const X86RegisterInfo RI;
 
-  /// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
-  /// RegOp2MemOpTable2 - Load / store folding opcode maps.
+  /// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
+  /// RegOp2MemOpTable2, RegOp2MemOpTable3 - Load / store folding opcode maps.
   ///
   typedef DenseMap<unsigned,
                    std::pair<unsigned, unsigned> > RegOp2MemOpTableType;
@@ -137,6 +138,7 @@ class X86InstrInfo : public X86GenInstrInfo {
   RegOp2MemOpTableType RegOp2MemOpTable0;
   RegOp2MemOpTableType RegOp2MemOpTable1;
   RegOp2MemOpTableType RegOp2MemOpTable2;
+  RegOp2MemOpTableType RegOp2MemOpTable3;
 
   /// MemOp2RegOpTable - Load / store unfolding opcode map.
   ///
@@ -144,9 +145,9 @@ class X86InstrInfo : public X86GenInstrInfo {
                    std::pair<unsigned, unsigned> > MemOp2RegOpTableType;
   MemOp2RegOpTableType MemOp2RegOpTable;
 
-  void AddTableEntry(RegOp2MemOpTableType &R2MTable,
-                     MemOp2RegOpTableType &M2RTable,
-                     unsigned RegOp, unsigned MemOp, unsigned Flags);
+  static void AddTableEntry(RegOp2MemOpTableType &R2MTable,
+                            MemOp2RegOpTableType &M2RTable,
+                            unsigned RegOp, unsigned MemOp, unsigned Flags);
 
 public:
   explicit X86InstrInfo(X86TargetMachine &tm);
@@ -218,6 +219,14 @@ public:
                                 MachineBasicBlock *FBB,
                                 const SmallVectorImpl<MachineOperand> &Cond,
                                 DebugLoc DL) const;
+  virtual bool canInsertSelect(const MachineBasicBlock&,
+                               const SmallVectorImpl<MachineOperand> &Cond,
+                               unsigned, unsigned, int&, int&, int&) const;
+  virtual void insertSelect(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI, DebugLoc DL,
+                            unsigned DstReg,
+                            const SmallVectorImpl<MachineOperand> &Cond,
+                            unsigned TrueReg, unsigned FalseReg) const;
   virtual void copyPhysReg(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI, DebugLoc DL,
                            unsigned DestReg, unsigned SrcReg,
@@ -363,6 +372,21 @@ public:
                             const MachineInstr *DefMI, unsigned DefIdx,
                             const MachineInstr *UseMI, unsigned UseIdx) const;
 
+  /// analyzeCompare - For a comparison instruction, return the source
+  /// registers in SrcReg and SrcReg2 if it has two register operands, and
+  /// the value it compares against in CmpValue. Return true if the
+  /// comparison instruction can be analyzed.
+  virtual bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+                              unsigned &SrcReg2,
+                              int &CmpMask, int &CmpValue) const;
+
+  /// optimizeCompareInstr - Check if there exists an earlier instruction that
+  /// operates on the same source operands and sets flags in the same way as
+  /// Compare; remove Compare if possible.
+ virtual bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, + unsigned SrcReg2, int CmpMask, int CmpValue, + const MachineRegisterInfo *MRI) const; + private: MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc, MachineFunction::iterator &MFI, diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 6a25312..d293156 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -63,6 +63,10 @@ def SDTX86SetCC_C : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; +def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>; + +def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>; + def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>, SDTCisVT<2, i8>]>; def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; @@ -95,6 +99,8 @@ def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def SDT_X86TLSBASEADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; @@ -131,6 +137,11 @@ def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond, def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>; def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>; +def X86sahf : SDNode<"X86ISD::SAHF", SDTX86sahf>; + +def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand, + [SDNPHasChain, SDNPSideEffect]>; + def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; @@ -199,6 +210,9 @@ def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>; def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET, [SDNPHasChain]>; @@ -278,6 +292,20 @@ def X86Mem256AsmOperand : AsmOperandClass { let Name = "Mem256"; let PredicateMethod = "isMem256"; } +// Gather mem operands +def X86MemVX32Operand : AsmOperandClass { + let Name = "MemVX32"; let PredicateMethod = "isMemVX32"; +} +def X86MemVY32Operand : AsmOperandClass { + let Name = "MemVY32"; let PredicateMethod = "isMemVY32"; +} +def X86MemVX64Operand : AsmOperandClass { + let Name = "MemVX64"; let PredicateMethod = "isMemVX64"; +} +def X86MemVY64Operand : AsmOperandClass { + let Name = "MemVY64"; let PredicateMethod = "isMemVY64"; +} + def X86AbsMemAsmOperand : AsmOperandClass { let Name = "AbsMem"; let SuperClasses = [X86MemAsmOperand]; @@ -316,6 +344,20 @@ def f128mem : X86MemOperand<"printf128mem"> { let ParserMatchClass = X86Mem128AsmOperand; } def f256mem : X86MemOperand<"printf256mem">{ let ParserMatchClass = X86Mem256AsmOperand; } + +// Gather mem operands +def vx32mem : X86MemOperand<"printi32mem">{ + let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm); + let ParserMatchClass = X86MemVX32Operand; } +def vy32mem : X86MemOperand<"printi32mem">{ + let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm); + let ParserMatchClass = X86MemVY32Operand; } +def vx64mem : X86MemOperand<"printi64mem">{ + let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm); + let ParserMatchClass = X86MemVX64Operand; } +def vy64mem : X86MemOperand<"printi64mem">{ + let MIOperandInfo = (ops ptr_rc, i8imm, VR256, 
i32imm, i8imm); + let ParserMatchClass = X86MemVY64Operand; } } // A version of i8mem for use on x86-64 that uses GR64_NOREX instead of @@ -328,7 +370,7 @@ def i8mem_NOREX : Operand<i64> { } // GPRs available for tailcall. -// It represents GR64_TC or GR64_TCW64. +// It represents GR32_TC, GR64_TC or GR64_TCW64. def ptr_rc_tailcall : PointerLikeRegClass<2>; // Special i32mem for addresses of load folding tail calls. These are not @@ -336,7 +378,8 @@ def ptr_rc_tailcall : PointerLikeRegClass<2>; // after callee-saved register are popped. def i32mem_TC : Operand<i32> { let PrintMethod = "printi32mem"; - let MIOperandInfo = (ops GR32_TC, i8imm, GR32_TC, i32imm, i8imm); + let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall, + i32imm, i8imm); let ParserMatchClass = X86Mem32AsmOperand; let OperandType = "OPERAND_MEMORY"; } @@ -487,6 +530,9 @@ def lea32addr : ComplexPattern<i32, 5, "SelectLEAAddr", def tls32addr : ComplexPattern<i32, 5, "SelectTLSADDRAddr", [tglobaltlsaddr], []>; +def tls32baseaddr : ComplexPattern<i32, 5, "SelectTLSADDRAddr", + [tglobaltlsaddr], []>; + def lea64addr : ComplexPattern<i64, 5, "SelectLEAAddr", [add, sub, mul, X86mul_imm, shl, or, frameindex, X86WrapperRIP], []>; @@ -494,6 +540,9 @@ def lea64addr : ComplexPattern<i64, 5, "SelectLEAAddr", def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", [tglobaltlsaddr], []>; +def tls64baseaddr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", + [tglobaltlsaddr], []>; + //===----------------------------------------------------------------------===// // X86 Instruction Predicate Definitions. def HasCMov : Predicate<"Subtarget->hasCMov()">; @@ -514,8 +563,8 @@ def HasAVX2 : Predicate<"Subtarget->hasAVX2()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; def HasAES : Predicate<"Subtarget->hasAES()">; -def HasCLMUL : Predicate<"Subtarget->hasCLMUL()">; -def HasFMA3 : Predicate<"Subtarget->hasFMA3()">; +def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">; +def HasFMA : Predicate<"Subtarget->hasFMA()">; def HasFMA4 : Predicate<"Subtarget->hasFMA4()">; def HasXOP : Predicate<"Subtarget->hasXOP()">; def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">; @@ -680,25 +729,27 @@ def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{ // Nop let neverHasSideEffects = 1 in { - def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", []>; + def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>; def NOOPW : I<0x1f, MRM0m, (outs), (ins i16mem:$zero), - "nop{w}\t$zero", []>, TB, OpSize; + "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize; def NOOPL : I<0x1f, MRM0m, (outs), (ins i32mem:$zero), - "nop{l}\t$zero", []>, TB; + "nop{l}\t$zero", [], IIC_NOP>, TB; } // Constructing a stack frame. def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl), - "enter\t$len, $lvl", []>; + "enter\t$len, $lvl", [], IIC_ENTER>; let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, neverHasSideEffects=1 in def LEAVE : I<0xC9, RawFrm, - (outs), (ins), "leave", []>, Requires<[In32BitMode]>; + (outs), (ins), "leave", [], IIC_LEAVE>, + Requires<[In32BitMode]>; let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, neverHasSideEffects = 1 in def LEAVE64 : I<0xC9, RawFrm, - (outs), (ins), "leave", []>, Requires<[In64BitMode]>; + (outs), (ins), "leave", [], IIC_LEAVE>, + Requires<[In64BitMode]>; //===----------------------------------------------------------------------===// // Miscellaneous Instructions. 
@@ -706,41 +757,49 @@ def LEAVE64 : I<0xC9, RawFrm, let Defs = [ESP], Uses = [ESP], neverHasSideEffects=1 in { let mayLoad = 1 in { -def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>, - OpSize; -def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>; -def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>, - OpSize; -def POP16rmm: I<0x8F, MRM0m, (outs i16mem:$dst), (ins), "pop{w}\t$dst", []>, - OpSize; -def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>; -def POP32rmm: I<0x8F, MRM0m, (outs i32mem:$dst), (ins), "pop{l}\t$dst", []>; - -def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", []>, OpSize; -def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", []>, +def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [], + IIC_POP_REG16>, OpSize; +def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [], + IIC_POP_REG>; +def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [], + IIC_POP_REG>, OpSize; +def POP16rmm: I<0x8F, MRM0m, (outs i16mem:$dst), (ins), "pop{w}\t$dst", [], + IIC_POP_MEM>, OpSize; +def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [], + IIC_POP_REG>; +def POP32rmm: I<0x8F, MRM0m, (outs i32mem:$dst), (ins), "pop{l}\t$dst", [], + IIC_POP_MEM>; + +def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, OpSize; +def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>, Requires<[In32BitMode]>; } let mayStore = 1 in { -def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>, - OpSize; -def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>; -def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>, - OpSize; -def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[]>, +def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[], + IIC_PUSH_REG>, OpSize; +def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[], + IIC_PUSH_REG>; +def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[], + IIC_PUSH_REG>, OpSize; +def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[], + IIC_PUSH_MEM>, OpSize; -def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>; -def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[]>; +def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[], + IIC_PUSH_REG>; +def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[], + IIC_PUSH_MEM>; def PUSHi8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm), - "push{l}\t$imm", []>; + "push{l}\t$imm", [], IIC_PUSH_IMM>; def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), - "push{w}\t$imm", []>, OpSize; + "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize; def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm), - "push{l}\t$imm", []>; + "push{l}\t$imm", [], IIC_PUSH_IMM>; -def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", []>, OpSize; -def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", []>, +def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>, + OpSize; +def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", [], IIC_PUSH_F>, Requires<[In32BitMode]>; } @@ -749,44 +808,48 @@ def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", []>, let Defs = [RSP], Uses = [RSP], neverHasSideEffects=1 in { let mayLoad = 1 in { def POP64r : I<0x58, AddRegFrm, - (outs 
GR64:$reg), (ins), "pop{q}\t$reg", []>; -def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", []>; -def POP64rmm: I<0x8F, MRM0m, (outs i64mem:$dst), (ins), "pop{q}\t$dst", []>; + (outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>; +def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [], + IIC_POP_REG>; +def POP64rmm: I<0x8F, MRM0m, (outs i64mem:$dst), (ins), "pop{q}\t$dst", [], + IIC_POP_MEM>; } let mayStore = 1 in { def PUSH64r : I<0x50, AddRegFrm, - (outs), (ins GR64:$reg), "push{q}\t$reg", []>; -def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", []>; -def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", []>; + (outs), (ins GR64:$reg), "push{q}\t$reg", [], IIC_PUSH_REG>; +def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [], + IIC_PUSH_REG>; +def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [], + IIC_PUSH_MEM>; } } let Defs = [RSP], Uses = [RSP], neverHasSideEffects = 1, mayStore = 1 in { def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm), - "push{q}\t$imm", []>; + "push{q}\t$imm", [], IIC_PUSH_IMM>; def PUSH64i16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), - "push{q}\t$imm", []>; + "push{q}\t$imm", [], IIC_PUSH_IMM>; def PUSH64i32 : Ii32<0x68, RawFrm, (outs), (ins i64i32imm:$imm), - "push{q}\t$imm", []>; + "push{q}\t$imm", [], IIC_PUSH_IMM>; } let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, neverHasSideEffects=1 in -def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", []>, +def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>, Requires<[In64BitMode]>; let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, neverHasSideEffects=1 in -def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", []>, +def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>, Requires<[In64BitMode]>; let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP], mayLoad=1, neverHasSideEffects=1 in { -def POPA32 : I<0x61, RawFrm, (outs), (ins), "popa{l}", []>, +def POPA32 : I<0x61, RawFrm, (outs), (ins), "popa{l}", [], IIC_POP_A>, Requires<[In32BitMode]>; } let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], mayStore=1, neverHasSideEffects=1 in { -def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pusha{l}", []>, +def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pusha{l}", [], IIC_PUSH_A>, Requires<[In32BitMode]>; } @@ -794,84 +857,92 @@ let Constraints = "$src = $dst" in { // GR32 = bswap GR32 def BSWAP32r : I<0xC8, AddRegFrm, (outs GR32:$dst), (ins GR32:$src), "bswap{l}\t$dst", - [(set GR32:$dst, (bswap GR32:$src))]>, TB; + [(set GR32:$dst, (bswap GR32:$src))], IIC_BSWAP>, TB; def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src), "bswap{q}\t$dst", - [(set GR64:$dst, (bswap GR64:$src))]>, TB; + [(set GR64:$dst, (bswap GR64:$src))], IIC_BSWAP>, TB; } // Constraints = "$src = $dst" // Bit scan instructions. 
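// BSF/BSR return the index of the least/most significant set bit; when the
// source is zero the destination is undefined and ZF is set, which is why
// the defs below are marked as writing EFLAGS.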
let Defs = [EFLAGS] in { def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "bsf{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))]>, TB, OpSize; + [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))], + IIC_BSF>, TB, OpSize; def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "bsf{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))]>, TB, - OpSize; + [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))], + IIC_BSF>, TB, OpSize; def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "bsf{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))]>, TB; + [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))], IIC_BSF>, TB; def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "bsf{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))]>, TB; + [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))], + IIC_BSF>, TB; def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "bsf{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))]>, TB; + [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))], + IIC_BSF>, TB; def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "bsf{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))]>, TB; + [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))], + IIC_BSF>, TB; def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "bsr{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))]>, TB, OpSize; + [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))], IIC_BSR>, + TB, OpSize; def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "bsr{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))]>, TB, + [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))], + IIC_BSR>, TB, OpSize; def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "bsr{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))]>, TB; + [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))], IIC_BSR>, TB; def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "bsr{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))]>, TB; + [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))], + IIC_BSR>, TB; def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "bsr{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))]>, TB; + [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], IIC_BSR>, TB; def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "bsr{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))]>, TB; + [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))], + IIC_BSR>, TB; } // Defs = [EFLAGS] // These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in { -def MOVSB : I<0xA4, RawFrm, (outs), (ins), "movsb", []>; -def MOVSW : I<0xA5, RawFrm, (outs), (ins), "movsw", []>, OpSize; -def MOVSD : I<0xA5, RawFrm, (outs), (ins), "movs{l|d}", []>; -def MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "movsq", []>; +def MOVSB : I<0xA4, RawFrm, (outs), (ins), "movsb", [], IIC_MOVS>; +def MOVSW : I<0xA5, RawFrm, (outs), (ins), "movsw", [], IIC_MOVS>, OpSize; +def MOVSD : I<0xA5, RawFrm, (outs), (ins), "movs{l|d}", [], IIC_MOVS>; +def MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "movsq", [], IIC_MOVS>; 
} // These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in -def STOSB : I<0xAA, RawFrm, (outs), (ins), "stosb", []>; +def STOSB : I<0xAA, RawFrm, (outs), (ins), "stosb", [], IIC_STOS>; let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in -def STOSW : I<0xAB, RawFrm, (outs), (ins), "stosw", []>, OpSize; +def STOSW : I<0xAB, RawFrm, (outs), (ins), "stosw", [], IIC_STOS>, OpSize; let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in -def STOSD : I<0xAB, RawFrm, (outs), (ins), "stos{l|d}", []>; +def STOSD : I<0xAB, RawFrm, (outs), (ins), "stos{l|d}", [], IIC_STOS>; let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI,EFLAGS] in -def STOSQ : RI<0xAB, RawFrm, (outs), (ins), "stosq", []>; +def STOSQ : RI<0xAB, RawFrm, (outs), (ins), "stosq", [], IIC_STOS>; -def SCAS8 : I<0xAE, RawFrm, (outs), (ins), "scasb", []>; -def SCAS16 : I<0xAF, RawFrm, (outs), (ins), "scasw", []>, OpSize; -def SCAS32 : I<0xAF, RawFrm, (outs), (ins), "scas{l|d}", []>; -def SCAS64 : RI<0xAF, RawFrm, (outs), (ins), "scasq", []>; +def SCAS8 : I<0xAE, RawFrm, (outs), (ins), "scasb", [], IIC_SCAS>; +def SCAS16 : I<0xAF, RawFrm, (outs), (ins), "scasw", [], IIC_SCAS>, OpSize; +def SCAS32 : I<0xAF, RawFrm, (outs), (ins), "scas{l|d}", [], IIC_SCAS>; +def SCAS64 : RI<0xAF, RawFrm, (outs), (ins), "scasq", [], IIC_SCAS>; -def CMPS8 : I<0xA6, RawFrm, (outs), (ins), "cmpsb", []>; -def CMPS16 : I<0xA7, RawFrm, (outs), (ins), "cmpsw", []>, OpSize; -def CMPS32 : I<0xA7, RawFrm, (outs), (ins), "cmps{l|d}", []>; -def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", []>; +def CMPS8 : I<0xA6, RawFrm, (outs), (ins), "cmpsb", [], IIC_CMPS>; +def CMPS16 : I<0xA7, RawFrm, (outs), (ins), "cmpsw", [], IIC_CMPS>, OpSize; +def CMPS32 : I<0xA7, RawFrm, (outs), (ins), "cmps{l|d}", [], IIC_CMPS>; +def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", [], IIC_CMPS>; //===----------------------------------------------------------------------===// @@ -880,64 +951,64 @@ def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", []>; let neverHasSideEffects = 1 in { def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src), - "mov{b}\t{$src, $dst|$dst, $src}", []>; + "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), - "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize; + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize; def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), - "mov{l}\t{$src, $dst|$dst, $src}", []>; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), - "mov{q}\t{$src, $dst|$dst, $src}", []>; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; } let isReMaterializable = 1, isAsCheapAsAMove = 1 in { def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src), "mov{b}\t{$src, $dst|$dst, $src}", - [(set GR8:$dst, imm:$src)]>; + [(set GR8:$dst, imm:$src)], IIC_MOV>; def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src), "mov{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, imm:$src)]>, OpSize; + [(set GR16:$dst, imm:$src)], IIC_MOV>, OpSize; def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src), "mov{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, imm:$src)]>; + [(set GR32:$dst, imm:$src)], IIC_MOV>; def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src), "movabs{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, imm:$src)]>; + [(set GR64:$dst, imm:$src)], IIC_MOV>; def MOV64ri32 : RIi32<0xC7, 
MRM0r, (outs GR64:$dst), (ins i64i32imm:$src), "mov{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, i64immSExt32:$src)]>; + [(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>; } def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src), "mov{b}\t{$src, $dst|$dst, $src}", - [(store (i8 imm:$src), addr:$dst)]>; + [(store (i8 imm:$src), addr:$dst)], IIC_MOV_MEM>; def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src), "mov{w}\t{$src, $dst|$dst, $src}", - [(store (i16 imm:$src), addr:$dst)]>, OpSize; + [(store (i16 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize; def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src), "mov{l}\t{$src, $dst|$dst, $src}", - [(store (i32 imm:$src), addr:$dst)]>; + [(store (i32 imm:$src), addr:$dst)], IIC_MOV_MEM>; def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), "mov{q}\t{$src, $dst|$dst, $src}", - [(store i64immSExt32:$src, addr:$dst)]>; + [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>; /// moffs8, moffs16 and moffs32 versions of moves. The immediate is a /// 32-bit offset from the PC. These are only valid in x86-32 mode. def MOV8o8a : Ii32 <0xA0, RawFrm, (outs), (ins offset8:$src), - "mov{b}\t{$src, %al|AL, $src}", []>, + "mov{b}\t{$src, %al|AL, $src}", [], IIC_MOV_MEM>, Requires<[In32BitMode]>; def MOV16o16a : Ii32 <0xA1, RawFrm, (outs), (ins offset16:$src), - "mov{w}\t{$src, %ax|AL, $src}", []>, OpSize, + "mov{w}\t{$src, %ax|AL, $src}", [], IIC_MOV_MEM>, OpSize, Requires<[In32BitMode]>; def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins offset32:$src), - "mov{l}\t{$src, %eax|EAX, $src}", []>, + "mov{l}\t{$src, %eax|EAX, $src}", [], IIC_MOV_MEM>, Requires<[In32BitMode]>; def MOV8ao8 : Ii32 <0xA2, RawFrm, (outs offset8:$dst), (ins), - "mov{b}\t{%al, $dst|$dst, AL}", []>, + "mov{b}\t{%al, $dst|$dst, AL}", [], IIC_MOV_MEM>, Requires<[In32BitMode]>; def MOV16ao16 : Ii32 <0xA3, RawFrm, (outs offset16:$dst), (ins), - "mov{w}\t{%ax, $dst|$dst, AL}", []>, OpSize, + "mov{w}\t{%ax, $dst|$dst, AL}", [], IIC_MOV_MEM>, OpSize, Requires<[In32BitMode]>; def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins), - "mov{l}\t{%eax, $dst|$dst, EAX}", []>, + "mov{l}\t{%eax, $dst|$dst, EAX}", [], IIC_MOV_MEM>, Requires<[In32BitMode]>; // FIXME: These definitions are utterly broken @@ -958,42 +1029,42 @@ def MOV64ao64 : RIi32<0xA3, RawFrm, (outs offset64:$dst), (ins), let isCodeGenOnly = 1 in { def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src), - "mov{b}\t{$src, $dst|$dst, $src}", []>; + "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), - "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize; + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize; def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "mov{l}\t{$src, $dst|$dst, $src}", []>; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), - "mov{q}\t{$src, $dst|$dst, $src}", []>; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; } let canFoldAsLoad = 1, isReMaterializable = 1 in { def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src), "mov{b}\t{$src, $dst|$dst, $src}", - [(set GR8:$dst, (loadi8 addr:$src))]>; + [(set GR8:$dst, (loadi8 addr:$src))], IIC_MOV_MEM>; def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "mov{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, (loadi16 addr:$src))]>, OpSize; + [(set GR16:$dst, (loadi16 addr:$src))], 
IIC_MOV_MEM>, OpSize; def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "mov{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (loadi32 addr:$src))]>; + [(set GR32:$dst, (loadi32 addr:$src))], IIC_MOV_MEM>; def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "mov{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (load addr:$src))]>; + [(set GR64:$dst, (load addr:$src))], IIC_MOV_MEM>; } def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src), "mov{b}\t{$src, $dst|$dst, $src}", - [(store GR8:$src, addr:$dst)]>; + [(store GR8:$src, addr:$dst)], IIC_MOV_MEM>; def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), "mov{w}\t{$src, $dst|$dst, $src}", - [(store GR16:$src, addr:$dst)]>, OpSize; + [(store GR16:$src, addr:$dst)], IIC_MOV_MEM>, OpSize; def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "mov{l}\t{$src, $dst|$dst, $src}", - [(store GR32:$src, addr:$dst)]>; + [(store GR32:$src, addr:$dst)], IIC_MOV_MEM>; def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "mov{q}\t{$src, $dst|$dst, $src}", - [(store GR64:$src, addr:$dst)]>; + [(store GR64:$src, addr:$dst)], IIC_MOV_MEM>; // Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so // that they can be used for copying and storing h registers, which can't be @@ -1002,24 +1073,28 @@ let isCodeGenOnly = 1 in { let neverHasSideEffects = 1 in def MOV8rr_NOREX : I<0x88, MRMDestReg, (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src), - "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>; + "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV>; let mayStore = 1 in def MOV8mr_NOREX : I<0x88, MRMDestMem, (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src), - "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>; + "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], + IIC_MOV_MEM>; let mayLoad = 1, neverHasSideEffects = 1, canFoldAsLoad = 1, isReMaterializable = 1 in def MOV8rm_NOREX : I<0x8A, MRMSrcMem, (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src), - "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>; + "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], + IIC_MOV_MEM>; } // Condition code ops, incl. set if equal/not equal/... 
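// SAHF stores AH into the low byte of EFLAGS (SF, ZF, AF, PF, CF); LAHF is
// the inverse. The new X86sahf node lets the pattern below expose that
// EFLAGS write to instruction selection.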
-let Defs = [EFLAGS], Uses = [AH], neverHasSideEffects = 1 in -def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", []>; // flags = AH +let Defs = [EFLAGS], Uses = [AH] in +def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", + [(set EFLAGS, (X86sahf AH))], IIC_AHF>; let Defs = [AH], Uses = [EFLAGS], neverHasSideEffects = 1 in -def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>; // AH = flags +def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [], + IIC_AHF>; // AH = flags //===----------------------------------------------------------------------===// @@ -1028,13 +1103,14 @@ def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>; // AH = flags let Defs = [EFLAGS] in { def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))]>, OpSize, TB; + [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))], IIC_BT_RR>, + OpSize, TB; def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), "bt{l}\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))]>, TB; + [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))], IIC_BT_RR>, TB; def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))]>, TB; + [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB; // Unlike with the register+register form, the memory+register form of the // bt instruction does not ignore the high bits of the index. From ISel's @@ -1045,31 +1121,33 @@ def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", // [(X86bt (loadi16 addr:$src1), GR16:$src2), // (implicit EFLAGS)] - [] + [], IIC_BT_MR >, OpSize, TB, Requires<[FastBTMem]>; def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), "bt{l}\t{$src2, $src1|$src1, $src2}", // [(X86bt (loadi32 addr:$src1), GR32:$src2), // (implicit EFLAGS)] - [] + [], IIC_BT_MR >, TB, Requires<[FastBTMem]>; def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", // [(X86bt (loadi64 addr:$src1), GR64:$src2), // (implicit EFLAGS)] - [] + [], IIC_BT_MR >, TB; def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))]>, - OpSize, TB; + [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))], + IIC_BT_RI>, OpSize, TB; def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2), "bt{l}\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))]>, TB; + [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))], + IIC_BT_RI>, TB; def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))]>, TB; + [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))], + IIC_BT_RI>, TB; // Note that these instructions don't need FastBTMem because that // only applies when the other operand is in a register. 
When it's @@ -1077,91 +1155,103 @@ def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2), def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt (loadi16 addr:$src1), i16immSExt8:$src2)) - ]>, OpSize, TB; + ], IIC_BT_MI>, OpSize, TB; def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2), "bt{l}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt (loadi32 addr:$src1), i32immSExt8:$src2)) - ]>, TB; + ], IIC_BT_MI>, TB; def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt (loadi64 addr:$src1), - i64immSExt8:$src2))]>, TB; + i64immSExt8:$src2))], IIC_BT_MI>, TB; def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), - "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, + OpSize, TB; def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), - "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), - "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), - "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, + OpSize, TB; def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), - "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), - "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2), - "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, + OpSize, TB; def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2), - "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2), - "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2), - "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, + OpSize, TB; def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2), - "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2), - "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), - "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, + OpSize, TB; def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), - "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), "btr{q}\t{$src2, 
$src1|$src1, $src2}", []>, TB; def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), - "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, + OpSize, TB; def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), - "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), - "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2), - "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, + OpSize, TB; def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2), - "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2), - "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2), - "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, + OpSize, TB; def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2), - "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2), - "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), - "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, + OpSize, TB; def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), - "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), - "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), - "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, + OpSize, TB; def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), - "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), - "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; def BTS16ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2), - "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, + OpSize, TB; def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2), - "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2), - "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, 
i16i8imm:$src2), - "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, + OpSize, TB; def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2), - "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2), - "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; } // Defs = [EFLAGS] @@ -1175,89 +1265,106 @@ def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2), let Constraints = "$val = $dst" in { def XCHG8rm : I<0x86, MRMSrcMem, (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr), "xchg{b}\t{$val, $ptr|$ptr, $val}", - [(set GR8:$dst, (atomic_swap_8 addr:$ptr, GR8:$val))]>; + [(set GR8:$dst, (atomic_swap_8 addr:$ptr, GR8:$val))], + IIC_XCHG_MEM>; def XCHG16rm : I<0x87, MRMSrcMem, (outs GR16:$dst),(ins GR16:$val, i16mem:$ptr), "xchg{w}\t{$val, $ptr|$ptr, $val}", - [(set GR16:$dst, (atomic_swap_16 addr:$ptr, GR16:$val))]>, + [(set GR16:$dst, (atomic_swap_16 addr:$ptr, GR16:$val))], + IIC_XCHG_MEM>, OpSize; def XCHG32rm : I<0x87, MRMSrcMem, (outs GR32:$dst),(ins GR32:$val, i32mem:$ptr), "xchg{l}\t{$val, $ptr|$ptr, $val}", - [(set GR32:$dst, (atomic_swap_32 addr:$ptr, GR32:$val))]>; + [(set GR32:$dst, (atomic_swap_32 addr:$ptr, GR32:$val))], + IIC_XCHG_MEM>; def XCHG64rm : RI<0x87, MRMSrcMem, (outs GR64:$dst),(ins GR64:$val,i64mem:$ptr), "xchg{q}\t{$val, $ptr|$ptr, $val}", - [(set GR64:$dst, (atomic_swap_64 addr:$ptr, GR64:$val))]>; + [(set GR64:$dst, (atomic_swap_64 addr:$ptr, GR64:$val))], + IIC_XCHG_MEM>; def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src), - "xchg{b}\t{$val, $src|$src, $val}", []>; + "xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>; def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src), - "xchg{w}\t{$val, $src|$src, $val}", []>, OpSize; + "xchg{w}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>, OpSize; def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst), (ins GR32:$val, GR32:$src), - "xchg{l}\t{$val, $src|$src, $val}", []>; + "xchg{l}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>; def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src), - "xchg{q}\t{$val, $src|$src, $val}", []>; + "xchg{q}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>; } def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src), - "xchg{w}\t{$src, %ax|AX, $src}", []>, OpSize; + "xchg{w}\t{$src, %ax|AX, $src}", [], IIC_XCHG_REG>, OpSize; def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src), - "xchg{l}\t{$src, %eax|EAX, $src}", []>, Requires<[In32BitMode]>; + "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>, + Requires<[In32BitMode]>; // Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP encoding. // xchg %eax, %eax needs to clear upper 32-bits of RAX so is not a NOP. 
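
// GR32_NOAX, mentioned in the comment above, is presumably carved out of
// GR32 in X86RegisterInfo.td with the register-class set-difference
// operator, along the lines of (hypothetical spelling, not copied from the
// real file):
//   def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)>;
// A self-contained miniature of the (sub ...) idea, with stub records in
// place of the real Target.td machinery:
def set_add;                 // stand-ins for the set-theory dag operators
def set_sub;
class SketchReg;
def SAX : SketchReg;         // toy registers, not the real X86 ones
def SCX : SketchReg;
def SDX : SketchReg;
class SketchRegClass<dag members> { dag MemberList = members; }
def SKETCH_GR32      : SketchRegClass<(set_add SAX, SCX, SDX)>;
def SKETCH_GR32_NOAX : SketchRegClass<(set_sub SKETCH_GR32, SAX)>;
// Constraining an operand to the NOAX class means the allocator can never
// hand it (E)AX, which is how the XCHG32ar64 def below dodges the 0x90 NOP
// encoding in 64-bit mode.
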
def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src), - "xchg{l}\t{$src, %eax|EAX, $src}", []>, Requires<[In64BitMode]>; + "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>, + Requires<[In64BitMode]>; def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src), - "xchg{q}\t{$src, %rax|RAX, $src}", []>; + "xchg{q}\t{$src, %rax|RAX, $src}", [], IIC_XCHG_REG>; def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src), - "xadd{b}\t{$src, $dst|$dst, $src}", []>, TB; + "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB; def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), - "xadd{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB, + OpSize; def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), - "xadd{l}\t{$src, $dst|$dst, $src}", []>, TB; + "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB; def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), - "xadd{q}\t{$src, $dst|$dst, $src}", []>, TB; + "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB; let mayLoad = 1, mayStore = 1 in { def XADD8rm : I<0xC0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src), - "xadd{b}\t{$src, $dst|$dst, $src}", []>, TB; + "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB; def XADD16rm : I<0xC1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), - "xadd{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB, + OpSize; def XADD32rm : I<0xC1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), - "xadd{l}\t{$src, $dst|$dst, $src}", []>, TB; + "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB; def XADD64rm : RI<0xC1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), - "xadd{q}\t{$src, $dst|$dst, $src}", []>, TB; + "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB; } def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src), - "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB; + "cmpxchg{b}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_REG8>, TB; def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), - "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "cmpxchg{w}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_REG>, TB, OpSize; def CMPXCHG32rr : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), - "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB; + "cmpxchg{l}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_REG>, TB; def CMPXCHG64rr : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), - "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB; + "cmpxchg{q}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_REG>, TB; let mayLoad = 1, mayStore = 1 in { def CMPXCHG8rm : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src), - "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB; + "cmpxchg{b}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_MEM8>, TB; def CMPXCHG16rm : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), - "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "cmpxchg{w}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_MEM>, TB, OpSize; def CMPXCHG32rm : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), - "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB; + "cmpxchg{l}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_MEM>, TB; def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), - "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB; + "cmpxchg{q}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_MEM>, TB; } let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] 
in
def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
-                  "cmpxchg8b\t$dst", []>, TB;
+                  "cmpxchg8b\t$dst", [], IIC_CMPXCHG_8B>, TB;

let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
-                    "cmpxchg16b\t$dst", []>, TB, Requires<[HasCmpxchg16b]>;
+                    "cmpxchg16b\t$dst", [], IIC_CMPXCHG_16B>,
+                    TB, Requires<[HasCmpxchg16b]>;

@@ -1281,69 +1388,75 @@ def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>;

// String manipulation instructions

-def LODSB : I<0xAC, RawFrm, (outs), (ins), "lodsb", []>;
-def LODSW : I<0xAD, RawFrm, (outs), (ins), "lodsw", []>, OpSize;
-def LODSD : I<0xAD, RawFrm, (outs), (ins), "lods{l|d}", []>;
-def LODSQ : RI<0xAD, RawFrm, (outs), (ins), "lodsq", []>;
+def LODSB : I<0xAC, RawFrm, (outs), (ins), "lodsb", [], IIC_LODS>;
+def LODSW : I<0xAD, RawFrm, (outs), (ins), "lodsw", [], IIC_LODS>, OpSize;
+def LODSD : I<0xAD, RawFrm, (outs), (ins), "lods{l|d}", [], IIC_LODS>;
+def LODSQ : RI<0xAD, RawFrm, (outs), (ins), "lodsq", [], IIC_LODS>;

-def OUTSB : I<0x6E, RawFrm, (outs), (ins), "outsb", []>;
-def OUTSW : I<0x6F, RawFrm, (outs), (ins), "outsw", []>, OpSize;
-def OUTSD : I<0x6F, RawFrm, (outs), (ins), "outs{l|d}", []>;
+def OUTSB : I<0x6E, RawFrm, (outs), (ins), "outsb", [], IIC_OUTS>;
+def OUTSW : I<0x6F, RawFrm, (outs), (ins), "outsw", [], IIC_OUTS>, OpSize;
+def OUTSD : I<0x6F, RawFrm, (outs), (ins), "outs{l|d}", [], IIC_OUTS>;

// Flag instructions
-def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", []>;
-def STC : I<0xF9, RawFrm, (outs), (ins), "stc", []>;
-def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", []>;
-def STI : I<0xFB, RawFrm, (outs), (ins), "sti", []>;
-def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", []>;
-def STD : I<0xFD, RawFrm, (outs), (ins), "std", []>;
-def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", []>;
+def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", [], IIC_CLC>;
+def STC : I<0xF9, RawFrm, (outs), (ins), "stc", [], IIC_STC>;
+def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", [], IIC_CLI>;
+def STI : I<0xFB, RawFrm, (outs), (ins), "sti", [], IIC_STI>;
+def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", [], IIC_CLD>;
+def STD : I<0xFD, RawFrm, (outs), (ins), "std", [], IIC_STD>;
+def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", [], IIC_CMC>;

-def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", []>, TB;
+def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB;

// Table lookup instructions
-def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", []>;
+def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>;

// ASCII Adjust After Addition
// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS
-def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", []>, Requires<[In32BitMode]>;
+def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>,
+          Requires<[In32BitMode]>;

// ASCII Adjust AX Before Division
// sets AL, AH and EFLAGS and uses AL and AH
def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src),
-                 "aad\t$src", []>, Requires<[In32BitMode]>;
+                 "aad\t$src", [], IIC_AAD>, Requires<[In32BitMode]>;

// ASCII Adjust AX After Multiply
// sets AL, AH and EFLAGS and uses AL
def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src),
-                 "aam\t$src", []>, Requires<[In32BitMode]>;
+                 "aam\t$src", [], IIC_AAM>, Requires<[In32BitMode]>;

// ASCII Adjust AL After Subtraction
// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS
-def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", []>, Requires<[In32BitMode]>;
+def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [],
IIC_AAS>, + Requires<[In32BitMode]>; // Decimal Adjust AL after Addition // sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS -def DAA : I<0x27, RawFrm, (outs), (ins), "daa", []>, Requires<[In32BitMode]>; +def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>, + Requires<[In32BitMode]>; // Decimal Adjust AL after Subtraction // sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS -def DAS : I<0x2F, RawFrm, (outs), (ins), "das", []>, Requires<[In32BitMode]>; +def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>, + Requires<[In32BitMode]>; // Check Array Index Against Bounds def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), - "bound\t{$src, $dst|$dst, $src}", []>, OpSize, + "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize, Requires<[In32BitMode]>; def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), - "bound\t{$src, $dst|$dst, $src}", []>, + "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, Requires<[In32BitMode]>; // Adjust RPL Field of Segment Selector def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$src), (ins GR16:$dst), - "arpl\t{$src, $dst|$dst, $src}", []>, Requires<[In32BitMode]>; + "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_REG>, + Requires<[In32BitMode]>; def ARPL16mr : I<0x63, MRMSrcMem, (outs GR16:$src), (ins i16mem:$dst), - "arpl\t{$src, $dst|$dst, $src}", []>, Requires<[In32BitMode]>; + "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>, + Requires<[In32BitMode]>; //===----------------------------------------------------------------------===// // MOVBE Instructions @@ -1351,22 +1464,28 @@ def ARPL16mr : I<0x63, MRMSrcMem, (outs GR16:$src), (ins i16mem:$dst), let Predicates = [HasMOVBE] in { def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "movbe{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, (bswap (loadi16 addr:$src)))]>, OpSize, T8; + [(set GR16:$dst, (bswap (loadi16 addr:$src)))], IIC_MOVBE>, + OpSize, T8; def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "movbe{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (bswap (loadi32 addr:$src)))]>, T8; + [(set GR32:$dst, (bswap (loadi32 addr:$src)))], IIC_MOVBE>, + T8; def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "movbe{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (bswap (loadi64 addr:$src)))]>, T8; + [(set GR64:$dst, (bswap (loadi64 addr:$src)))], IIC_MOVBE>, + T8; def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), "movbe{w}\t{$src, $dst|$dst, $src}", - [(store (bswap GR16:$src), addr:$dst)]>, OpSize, T8; + [(store (bswap GR16:$src), addr:$dst)], IIC_MOVBE>, + OpSize, T8; def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "movbe{l}\t{$src, $dst|$dst, $src}", - [(store (bswap GR32:$src), addr:$dst)]>, T8; + [(store (bswap GR32:$src), addr:$dst)], IIC_MOVBE>, + T8; def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "movbe{q}\t{$src, $dst|$dst, $src}", - [(store (bswap GR64:$src), addr:$dst)]>, T8; + [(store (bswap GR64:$src), addr:$dst)], IIC_MOVBE>, + T8; } //===----------------------------------------------------------------------===// @@ -1374,11 +1493,14 @@ let Predicates = [HasMOVBE] in { // let Predicates = [HasRDRAND], Defs = [EFLAGS] in { def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins), - "rdrand{w}\t$dst", []>, OpSize, TB; + "rdrand{w}\t$dst", + [(set GR16:$dst, EFLAGS, (X86rdrand))]>, OpSize, TB; def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins), - "rdrand{l}\t$dst", []>, TB; + 
"rdrand{l}\t$dst", + [(set GR32:$dst, EFLAGS, (X86rdrand))]>, TB; def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins), - "rdrand{q}\t$dst", []>, TB; + "rdrand{q}\t$dst", + [(set GR64:$dst, EFLAGS, (X86rdrand))]>, TB; } //===----------------------------------------------------------------------===// @@ -1774,9 +1896,9 @@ def : InstAlias<"fdivp %st(0), $op", (DIVR_FPrST0 RST:$op)>; def : InstAlias<"fdivrp %st(0), $op", (DIV_FPrST0 RST:$op)>; // We accept "fnstsw %eax" even though it only writes %ax. -def : InstAlias<"fnstsw %eax", (FNSTSW8r)>; -def : InstAlias<"fnstsw %al" , (FNSTSW8r)>; -def : InstAlias<"fnstsw" , (FNSTSW8r)>; +def : InstAlias<"fnstsw %eax", (FNSTSW16r)>; +def : InstAlias<"fnstsw %al" , (FNSTSW16r)>; +def : InstAlias<"fnstsw" , (FNSTSW16r)>; // lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but // this is compatible with what GAS does. diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index 63f96b6..e4edd36 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -20,71 +20,130 @@ // MMX Multiclasses //===----------------------------------------------------------------------===// +def MMX_INTALU_ITINS : OpndItins< + IIC_MMX_ALU_RR, IIC_MMX_ALU_RM +>; + +def MMX_INTALUQ_ITINS : OpndItins< + IIC_MMX_ALUQ_RR, IIC_MMX_ALUQ_RM +>; + +def MMX_PHADDSUBW : OpndItins< + IIC_MMX_PHADDSUBW_RR, IIC_MMX_PHADDSUBW_RM +>; + +def MMX_PHADDSUBD : OpndItins< + IIC_MMX_PHADDSUBD_RR, IIC_MMX_PHADDSUBD_RM +>; + +def MMX_PMUL_ITINS : OpndItins< + IIC_MMX_PMUL, IIC_MMX_PMUL +>; + +def MMX_PSADBW_ITINS : OpndItins< + IIC_MMX_PSADBW, IIC_MMX_PSADBW +>; + +def MMX_MISC_FUNC_ITINS : OpndItins< + IIC_MMX_MISC_FUNC_MEM, IIC_MMX_MISC_FUNC_REG +>; + +def MMX_SHIFT_ITINS : ShiftOpndItins< + IIC_MMX_SHIFT_RR, IIC_MMX_SHIFT_RM, IIC_MMX_SHIFT_RI +>; + +def MMX_UNPCK_H_ITINS : OpndItins< + IIC_MMX_UNPCK_H_RR, IIC_MMX_UNPCK_H_RM +>; + +def MMX_UNPCK_L_ITINS : OpndItins< + IIC_MMX_UNPCK_L, IIC_MMX_UNPCK_L +>; + +def MMX_PCK_ITINS : OpndItins< + IIC_MMX_PCK_RR, IIC_MMX_PCK_RM +>; + +def MMX_PSHUF_ITINS : OpndItins< + IIC_MMX_PSHUF, IIC_MMX_PSHUF +>; + +def MMX_CVT_PD_ITINS : OpndItins< + IIC_MMX_CVT_PD_RR, IIC_MMX_CVT_PD_RM +>; + +def MMX_CVT_PS_ITINS : OpndItins< + IIC_MMX_CVT_PS_RR, IIC_MMX_CVT_PS_RM +>; + let Constraints = "$src1 = $dst" in { // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic. // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp. 
multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, - bit Commutable = 0> { + OpndItins itins, bit Commutable = 0> { def irr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]> { + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr> { let isCommutable = Commutable; } def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR64:$dst, (IntId VR64:$src1, - (bitconvert (load_mmx addr:$src2))))]>; + (bitconvert (load_mmx addr:$src2))))], + itins.rm>; } multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm, string OpcodeStr, Intrinsic IntId, - Intrinsic IntId2> { + Intrinsic IntId2, ShiftOpndItins itins> { def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>; + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>; def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR64:$dst, (IntId VR64:$src1, - (bitconvert (load_mmx addr:$src2))))]>; + (bitconvert (load_mmx addr:$src2))))], + itins.rm>; def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst), (ins VR64:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))]>; + [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))], itins.ri>; } } /// Unary MMX instructions requiring SSSE3. multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr, - Intrinsic IntId64> { + Intrinsic IntId64, OpndItins itins> { def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR64:$dst, (IntId64 VR64:$src))]>; + [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>; def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR64:$dst, - (IntId64 (bitconvert (memopmmx addr:$src))))]>; + (IntId64 (bitconvert (memopmmx addr:$src))))], + itins.rm>; } /// Binary MMX instructions requiring SSSE3. 
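
// A defm of one of the itinerary-parameterized multiclasses above expands
// into one def per operand form, each grabbing the matching field from the
// bundle. A runnable miniature of that expansion (stub classes; the real
// multiclasses thread the itinerary into the MMXI/SS38I instruction bases,
// and the SSSE3 binary multiclass below follows the same pattern):
class InstrItinClass;
def IIC_MINI_RR : InstrItinClass;
def IIC_MINI_RM : InstrItinClass;
class MiniItins<InstrItinClass a, InstrItinClass b> {
  InstrItinClass rr = a;
  InstrItinClass rm = b;
}
def MINI_ALU_ITINS : MiniItins<IIC_MINI_RR, IIC_MINI_RM>;

class MiniInst<string asm, InstrItinClass itin> {
  string AsmString = asm;
  InstrItinClass Itinerary = itin;
}
multiclass mini_binop<string OpcodeStr, MiniItins itins> {
  def irr : MiniInst<!strconcat(OpcodeStr, " $dst, $src"),   itins.rr>;
  def irm : MiniInst<!strconcat(OpcodeStr, " $dst, [$src]"), itins.rm>;
}
// Expands to MINI_PADDBirr (Itinerary = IIC_MINI_RR) and MINI_PADDBirm
// (Itinerary = IIC_MINI_RM), mirroring how MMX_PADDB picks up its timings.
defm MINI_PADDB : mini_binop<"paddb", MINI_ALU_ITINS>;
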
let ImmT = NoImm, Constraints = "$src1 = $dst" in { multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr, - Intrinsic IntId64> { + Intrinsic IntId64, OpndItins itins> { let isCommutable = 0 in def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]>; + [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>; def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR64:$dst, (IntId64 VR64:$src1, - (bitconvert (memopmmx addr:$src2))))]>; + (bitconvert (memopmmx addr:$src2))))], itins.rm>; } } @@ -103,13 +162,13 @@ multiclass ssse3_palign_mm<string asm, Intrinsic IntId> { multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, - string asm, Domain d> { + string asm, OpndItins itins, Domain d> { def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, [(set DstRC:$dst, (Int SrcRC:$src))], - IIC_DEFAULT, d>; + itins.rr, d>; def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, [(set DstRC:$dst, (Int (ld_frag addr:$src)))], - IIC_DEFAULT, d>; + itins.rm, d>; } multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC, @@ -139,22 +198,24 @@ def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR64:$dst, - (x86mmx (scalar_to_vector GR32:$src)))]>; + (x86mmx (scalar_to_vector GR32:$src)))], + IIC_MMX_MOV_MM_RM>; let canFoldAsLoad = 1 in def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", - [(set VR64:$dst, - (x86mmx (scalar_to_vector (loadi32 addr:$src))))]>; + [(set VR64:$dst, + (x86mmx (scalar_to_vector (loadi32 addr:$src))))], + IIC_MMX_MOV_MM_RM>; let mayStore = 1 in def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src), - "movd\t{$src, $dst|$dst, $src}", []>; + "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>; def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs), (ins GR32:$dst, VR64:$src), - "movd\t{$src, $dst|$dst, $src}", []>; + "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_REG_MM>; let neverHasSideEffects = 1 in def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), "movd\t{$src, $dst|$dst, $src}", - []>; + [], IIC_MMX_MOV_MM_RM>; // These are 64 bit moves, but since the OS X assembler doesn't // recognize a register-register movq, we write them as @@ -163,197 +224,276 @@ def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR64:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR64:$dst, - (bitconvert VR64:$src))]>; + (bitconvert VR64:$src))], IIC_MMX_MOV_REG_MM>; def MMX_MOVD64rrv164 : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR64:$dst, - (bitconvert GR64:$src))]>; + (bitconvert GR64:$src))], IIC_MMX_MOV_MM_RM>; let neverHasSideEffects = 1 in def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), - "movq\t{$src, $dst|$dst, $src}", []>; + "movq\t{$src, $dst|$dst, $src}", [], + IIC_MMX_MOVQ_RR>; let canFoldAsLoad = 1 in def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}", - [(set VR64:$dst, (load_mmx addr:$src))]>; + [(set VR64:$dst, (load_mmx addr:$src))], + 
IIC_MMX_MOVQ_RM>; def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), "movq\t{$src, $dst|$dst, $src}", - [(store (x86mmx VR64:$src), addr:$dst)]>; + [(store (x86mmx VR64:$src), addr:$dst)], + IIC_MMX_MOVQ_RM>; def MMX_MOVDQ2Qrr : SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}", [(set VR64:$dst, (x86mmx (bitconvert (i64 (vector_extract (v2i64 VR128:$src), - (iPTR 0))))))]>; + (iPTR 0))))))], + IIC_MMX_MOVQ_RR>; def MMX_MOVQ2DQrr : SSDIi8<0xD6, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (scalar_to_vector - (i64 (bitconvert (x86mmx VR64:$src))))))]>; + (i64 (bitconvert (x86mmx VR64:$src))))))], + IIC_MMX_MOVQ_RR>; let neverHasSideEffects = 1 in def MMX_MOVQ2FR64rr: SSDIi8<0xD6, MRMSrcReg, (outs FR64:$dst), - (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", []>; + (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", [], + IIC_MMX_MOVQ_RR>; def MMX_MOVFR642Qrr: SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), - (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", []>; + (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", [], + IIC_MMX_MOVQ_RR>; def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), "movntq\t{$src, $dst|$dst, $src}", - [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)]>; + [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)], + IIC_MMX_MOVQ_RM>; let AddedComplexity = 15 in // movd to MMX register zero-extends def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR64:$dst, - (x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))))]>; + (x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))))], + IIC_MMX_MOV_MM_RM>; let AddedComplexity = 20 in def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR64:$dst, (x86mmx (X86vzmovl (x86mmx - (scalar_to_vector (loadi32 addr:$src))))))]>; + (scalar_to_vector (loadi32 addr:$src))))))], + IIC_MMX_MOV_MM_RM>; // Arithmetic Instructions -defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b>; -defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w>; -defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d>; +defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b, + MMX_INTALU_ITINS>; +defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w, + MMX_INTALU_ITINS>; +defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d, + MMX_INTALU_ITINS>; // -- Addition -defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b, 1>; -defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w, 1>; -defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d, 1>; -defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q, 1>; -defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, 1>; -defm MMX_PADDSW : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w, 1>; - -defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b, 1>; -defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w, 1>; - -defm MMX_PHADDW : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w>; -defm MMX_PHADD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d>; -defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw>; +defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", 
int_x86_mmx_padd_b,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q,
+                                   MMX_INTALUQ_ITINS, 1>;
+defm MMX_PADDSB  : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b,
+                                     MMX_INTALU_ITINS, 1>;
+defm MMX_PADDSW  : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w,
+                                     MMX_INTALU_ITINS, 1>;
+
+defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b,
+                                     MMX_INTALU_ITINS, 1>;
+defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w,
+                                     MMX_INTALU_ITINS, 1>;
+
+defm MMX_PHADDW  : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w,
+                                        MMX_PHADDSUBW>;
+defm MMX_PHADD   : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d,
+                                        MMX_PHADDSUBD>;
+defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw,
+                                        MMX_PHADDSUBW>;

// -- Subtraction
-defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b>;
-defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w>;
-defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d>;
-defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q>;
-
-defm MMX_PSUBSB  : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b>;
-defm MMX_PSUBSW  : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w>;
-
-defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b>;
-defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w>;
-
-defm MMX_PHSUBW  : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w>;
-defm MMX_PHSUBD  : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d>;
-defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw>;
+defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b,
+                                   MMX_INTALU_ITINS>;
+defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w,
+                                   MMX_INTALU_ITINS>;
+defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d,
+                                   MMX_INTALU_ITINS>;
+defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q,
+                                   MMX_INTALUQ_ITINS>;
+
+defm MMX_PSUBSB  : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b,
+                                     MMX_INTALU_ITINS>;
+defm MMX_PSUBSW  : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w,
+                                     MMX_INTALU_ITINS>;
+
+defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b,
+                                     MMX_INTALU_ITINS>;
+defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w,
+                                     MMX_INTALU_ITINS>;
+
+defm MMX_PHSUBW  : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w,
+                                        MMX_PHADDSUBW>;
+defm MMX_PHSUBD  : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d,
+                                        MMX_PHADDSUBD>;
+defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw,
+                                        MMX_PHADDSUBW>;

// -- Multiplication
-defm MMX_PMULLW  : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w, 1>;
-
-defm MMX_PMULHW  : MMXI_binop_rm_int<0xE5, "pmulhw",  int_x86_mmx_pmulh_w,  1>;
-defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, 1>;
-defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, 1>;
+defm MMX_PMULLW  : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w,
+                                     MMX_PMUL_ITINS, 1>;
+
+defm MMX_PMULHW  : MMXI_binop_rm_int<0xE5, "pmulhw",  int_x86_mmx_pmulh_w,
+                                     MMX_PMUL_ITINS, 1>;
+defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw",
int_x86_mmx_pmulhu_w, + MMX_PMUL_ITINS, 1>; +defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, + MMX_PMUL_ITINS, 1>; let isCommutable = 1 in defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw", - int_x86_ssse3_pmul_hr_sw>; + int_x86_ssse3_pmul_hr_sw, MMX_PMUL_ITINS>; // -- Miscellanea -defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, 1>; +defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, + MMX_PMUL_ITINS, 1>; defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw", - int_x86_ssse3_pmadd_ub_sw>; -defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, 1>; -defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, 1>; - -defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b, 1>; -defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w, 1>; - -defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b, 1>; -defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, 1>; - -defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, 1>; - -defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b>; -defm MMX_PSIGNW : SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w>; -defm MMX_PSIGND : SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d>; + int_x86_ssse3_pmadd_ub_sw, MMX_PMUL_ITINS>; +defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, + MMX_MISC_FUNC_ITINS, 1>; +defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, + MMX_MISC_FUNC_ITINS, 1>; + +defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b, + MMX_MISC_FUNC_ITINS, 1>; +defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w, + MMX_MISC_FUNC_ITINS, 1>; + +defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b, + MMX_MISC_FUNC_ITINS, 1>; +defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, + MMX_MISC_FUNC_ITINS, 1>; + +defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, + MMX_PSADBW_ITINS, 1>; + +defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b, + MMX_MISC_FUNC_ITINS>; +defm MMX_PSIGNW : SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w, + MMX_MISC_FUNC_ITINS>; +defm MMX_PSIGND : SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d, + MMX_MISC_FUNC_ITINS>; let Constraints = "$src1 = $dst" in defm MMX_PALIGN : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b>; // Logical Instructions -defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand, 1>; -defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por, 1>; -defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor, 1>; -defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn>; +defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand, + MMX_INTALU_ITINS, 1>; +defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por, + MMX_INTALU_ITINS, 1>; +defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor, + MMX_INTALU_ITINS, 1>; +defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn, + MMX_INTALU_ITINS>; // Shift Instructions defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", - int_x86_mmx_psrl_w, int_x86_mmx_psrli_w>; + int_x86_mmx_psrl_w, int_x86_mmx_psrli_w, + MMX_SHIFT_ITINS>; defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld", - int_x86_mmx_psrl_d, int_x86_mmx_psrli_d>; + int_x86_mmx_psrl_d, int_x86_mmx_psrli_d, 
+ MMX_SHIFT_ITINS>; defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq", - int_x86_mmx_psrl_q, int_x86_mmx_psrli_q>; + int_x86_mmx_psrl_q, int_x86_mmx_psrli_q, + MMX_SHIFT_ITINS>; defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw", - int_x86_mmx_psll_w, int_x86_mmx_pslli_w>; + int_x86_mmx_psll_w, int_x86_mmx_pslli_w, + MMX_SHIFT_ITINS>; defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld", - int_x86_mmx_psll_d, int_x86_mmx_pslli_d>; + int_x86_mmx_psll_d, int_x86_mmx_pslli_d, + MMX_SHIFT_ITINS>; defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq", - int_x86_mmx_psll_q, int_x86_mmx_pslli_q>; + int_x86_mmx_psll_q, int_x86_mmx_pslli_q, + MMX_SHIFT_ITINS>; defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw", - int_x86_mmx_psra_w, int_x86_mmx_psrai_w>; + int_x86_mmx_psra_w, int_x86_mmx_psrai_w, + MMX_SHIFT_ITINS>; defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", - int_x86_mmx_psra_d, int_x86_mmx_psrai_d>; + int_x86_mmx_psra_d, int_x86_mmx_psrai_d, + MMX_SHIFT_ITINS>; // Comparison Instructions -defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b>; -defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w>; -defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d>; - -defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b>; -defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w>; -defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d>; +defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b, + MMX_INTALU_ITINS>; +defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w, + MMX_INTALU_ITINS>; +defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d, + MMX_INTALU_ITINS>; + +defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b, + MMX_INTALU_ITINS>; +defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w, + MMX_INTALU_ITINS>; +defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d, + MMX_INTALU_ITINS>; // -- Unpack Instructions defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw", - int_x86_mmx_punpckhbw>; + int_x86_mmx_punpckhbw, + MMX_UNPCK_H_ITINS>; defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd", - int_x86_mmx_punpckhwd>; + int_x86_mmx_punpckhwd, + MMX_UNPCK_H_ITINS>; defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq", - int_x86_mmx_punpckhdq>; + int_x86_mmx_punpckhdq, + MMX_UNPCK_H_ITINS>; defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw", - int_x86_mmx_punpcklbw>; + int_x86_mmx_punpcklbw, + MMX_UNPCK_L_ITINS>; defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd", - int_x86_mmx_punpcklwd>; + int_x86_mmx_punpcklwd, + MMX_UNPCK_L_ITINS>; defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq", - int_x86_mmx_punpckldq>; + int_x86_mmx_punpckldq, + MMX_UNPCK_L_ITINS>; // -- Pack Instructions -defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb>; -defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw>; -defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb>; +defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb, + MMX_PCK_ITINS>; +defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw, + MMX_PCK_ITINS>; +defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb, + MMX_PCK_ITINS>; // -- Shuffle Instructions -defm MMX_PSHUFB : 
SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b>; +defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b, + MMX_PSHUF_ITINS>; def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, i8imm:$src2), "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR64:$dst, - (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))]>; + (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))], + IIC_MMX_PSHUF>; def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src1, i8imm:$src2), "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR64:$dst, (int_x86_sse_pshuf_w (load_mmx addr:$src1), - imm:$src2))]>; - + imm:$src2))], + IIC_MMX_PSHUF>; @@ -361,24 +501,24 @@ def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem, // -- Conversion Instructions defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi, f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}", - SSEPackedSingle>, TB; + MMX_CVT_PS_ITINS, SSEPackedSingle>, TB; defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi, f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}", - SSEPackedDouble>, TB, OpSize; + MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize; defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi, f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}", - SSEPackedSingle>, TB; + MMX_CVT_PS_ITINS, SSEPackedSingle>, TB; defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi, f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}", - SSEPackedDouble>, TB, OpSize; + MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize; defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd, i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}", - SSEPackedDouble>, TB, OpSize; + MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize; let Constraints = "$src1 = $dst" in { defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128, int_x86_sse_cvtpi2ps, i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}", - SSEPackedSingle>, TB; + SSEPackedSingle>, TB; } // Extract / Insert @@ -386,14 +526,16 @@ def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg, (outs GR32:$dst), (ins VR64:$src1, i32i8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32:$dst, (int_x86_mmx_pextr_w VR64:$src1, - (iPTR imm:$src2)))]>; + (iPTR imm:$src2)))], + IIC_MMX_PEXTR>; let Constraints = "$src1 = $dst" in { def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, GR32:$src2, i32i8imm:$src3), "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, - GR32:$src2, (iPTR imm:$src3)))]>; + GR32:$src2, (iPTR imm:$src3)))], + IIC_MMX_PINSRW>; def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem, (outs VR64:$dst), @@ -401,7 +543,8 @@ let Constraints = "$src1 = $dst" in { "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, (i32 (anyext (loadi16 addr:$src2))), - (iPTR imm:$src3)))]>; + (iPTR imm:$src3)))], + IIC_MMX_PINSRW>; } // Mask creation @@ -439,11 +582,13 @@ def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))), let Uses = [EDI] in def MMX_MASKMOVQ : MMXI<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), "maskmovq\t{$mask, $src|$src, $mask}", - [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)]>; + [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)], + IIC_MMX_MASKMOV>; let Uses = [RDI] in def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), "maskmovq\t{$mask, $src|$src, $mask}", - [(int_x86_mmx_maskmovq VR64:$src, 
VR64:$mask, RDI)]>; + [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)], + IIC_MMX_MASKMOV>; // 64-bit bit convert. def : Pat<(x86mmx (bitconvert (i64 GR64:$src))), diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 65e3c1e..c2d169a 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -1418,10 +1418,10 @@ multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, string asm, Domain d, OpndItins itins> { - def rr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, + def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, [(set DstRC:$dst, (OpNode SrcRC:$src))], itins.rr, d>; - def rm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))], itins.rm, d>; } @@ -1622,7 +1622,7 @@ defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, "cvttsd2si{q}", SSE_CVT_SD2SI>, XD, REX_W; -let Pattern = []<dag> in { +let Pattern = []<dag>, neverHasSideEffects = 1 in { defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load, "cvtss2si{l}\t{$src, $dst|$dst, $src}", SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG; @@ -1630,14 +1630,16 @@ defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load, "cvtss2si\t{$src, $dst|$dst, $src}", SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG; defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load, - "cvtdq2ps\t{$src, $dst|$dst, $src}", - SSEPackedSingle, SSE_CVT_PS>, TB, VEX; + "vcvtdq2ps\t{$src, $dst|$dst, $src}", + SSEPackedSingle, SSE_CVT_PS>, TB, VEX, + Requires<[HasAVX]>; defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, undef, i256mem, load, - "cvtdq2ps\t{$src, $dst|$dst, $src}", - SSEPackedSingle, SSE_CVT_PS>, TB, VEX; + "vcvtdq2ps\t{$src, $dst|$dst, $src}", + SSEPackedSingle, SSE_CVT_PS>, TB, VEX, + Requires<[HasAVX]>; } -let Pattern = []<dag> in { +let Pattern = []<dag>, neverHasSideEffects = 1 in { defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load /*dummy*/, "cvtss2si{l}\t{$src, $dst|$dst, $src}", SSE_CVT_SS2SI_32>, XS; @@ -1646,8 +1648,8 @@ defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load /*dummy*/, SSE_CVT_SS2SI_64>, XS, REX_W; defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load /*dummy*/, "cvtdq2ps\t{$src, $dst|$dst, $src}", - SSEPackedSingle, SSE_CVT_PS>, - TB; /* PD SSE3 form is avaiable */ + SSEPackedSingle, SSE_CVT_PS>, TB, + Requires<[HasSSE2]>; } let Predicates = [HasAVX] in { @@ -1788,57 +1790,6 @@ def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, Requires<[HasSSE2]>; } -// Convert doubleword to packed single/double fp -// SSE2 instructions without OpSize prefix -def Int_VCVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "vcvtdq2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))], - IIC_SSE_CVT_PS_RR>, - TB, VEX, Requires<[HasAVX]>; -def Int_VCVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "vcvtdq2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtdq2ps - (bitconvert (memopv2i64 addr:$src))))], - IIC_SSE_CVT_PS_RM>, - TB, VEX, Requires<[HasAVX]>; -def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtdq2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))], - 
IIC_SSE_CVT_PS_RR>, - TB, Requires<[HasSSE2]>; -def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "cvtdq2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtdq2ps - (bitconvert (memopv2i64 addr:$src))))], - IIC_SSE_CVT_PS_RM>, - TB, Requires<[HasSSE2]>; - -// FIXME: why the non-intrinsic version is described as SSE3? -// SSE2 instructions with XS prefix -def Int_VCVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "vcvtdq2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))], - IIC_SSE_CVT_PD_RR>, - XS, VEX, Requires<[HasAVX]>; -def Int_VCVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), - "vcvtdq2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtdq2pd - (bitconvert (memopv2i64 addr:$src))))], - IIC_SSE_CVT_PD_RM>, - XS, VEX, Requires<[HasAVX]>; -def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtdq2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))], - IIC_SSE_CVT_PD_RR>, - XS, Requires<[HasSSE2]>; -def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), - "cvtdq2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtdq2pd - (bitconvert (memopv2i64 addr:$src))))], - IIC_SSE_CVT_PD_RM>, - XS, Requires<[HasSSE2]>; - - // Convert packed single/double fp to doubleword def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [], @@ -1859,51 +1810,63 @@ def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PS_RM>; -def Int_VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtps2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))], - IIC_SSE_CVT_PS_RR>, - VEX; -def Int_VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), - (ins f128mem:$src), - "cvtps2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtps2dq - (memop addr:$src)))], - IIC_SSE_CVT_PS_RM>, VEX; -def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtps2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))], - IIC_SSE_CVT_PS_RR>; -def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvtps2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtps2dq - (memop addr:$src)))], - IIC_SSE_CVT_PS_RM>; - -// SSE2 packed instructions with XD prefix -def Int_VCVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "vcvtpd2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))], - IIC_SSE_CVT_PD_RR>, - XD, VEX, Requires<[HasAVX]>; -def Int_VCVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "vcvtpd2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2dq - (memop addr:$src)))], - IIC_SSE_CVT_PD_RM>, - XD, VEX, Requires<[HasAVX]>; -def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtpd2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))], - IIC_SSE_CVT_PD_RR>, - XD, Requires<[HasSSE2]>; -def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvtpd2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2dq - (memop addr:$src)))], - IIC_SSE_CVT_PD_RM>, - XD, Requires<[HasSSE2]>; +let Predicates = [HasAVX] in { + def : 
Pat<(int_x86_sse2_cvtps2dq VR128:$src), + (VCVTPS2DQrr VR128:$src)>; + def : Pat<(int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)), + (VCVTPS2DQrm addr:$src)>; +} + +let Predicates = [HasSSE2] in { + def : Pat<(int_x86_sse2_cvtps2dq VR128:$src), + (CVTPS2DQrr VR128:$src)>; + def : Pat<(int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)), + (CVTPS2DQrm addr:$src)>; +} + +// Convert Packed Double FP to Packed DW Integers +let Predicates = [HasAVX] in { +// The assembler can recognize rr 256-bit instructions by seeing a ymm +// register, but the same isn't true when using memory operands instead. +// Provide other assembly rr and rm forms to address this explicitly. +def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtpd2dq\t{$src, $dst|$dst, $src}", []>, VEX; +// XMM only +def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", + (VCVTPD2DQrr VR128:$dst, VR128:$src)>; +def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "vcvtpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX; + +// YMM only +def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), + "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L; +def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}", + (VCVTPD2DQYrr VR128:$dst, VR256:$src)>; +} + +def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtpd2dq\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RM>; +def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtpd2dq\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RR>; + +let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse2_cvtpd2dq VR128:$src), + (VCVTPD2DQrr VR128:$src)>; + def : Pat<(int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)), + (VCVTPD2DQXrm addr:$src)>; +} + +let Predicates = [HasSSE2] in { + def : Pat<(int_x86_sse2_cvtpd2dq VR128:$src), + (CVTPD2DQrr VR128:$src)>; + def : Pat<(int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)), + (CVTPD2DQrm addr:$src)>; +} // Convert with truncation packed single/double fp to doubleword // SSE2 packed instructions with XS prefix @@ -1915,7 +1878,7 @@ def VCVTTPS2DQrr : VSSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), def VCVTTPS2DQrm : VSSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttps2dq - (memop addr:$src)))], + (memopv4f32 addr:$src)))], IIC_SSE_CVT_PS_RM>, VEX; def VCVTTPS2DQYrr : VSSI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", @@ -1936,14 +1899,19 @@ def CVTTPS2DQrr : SSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), def CVTTPS2DQrm : SSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_sse2_cvttps2dq (memop addr:$src)))], + (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))], IIC_SSE_CVT_PS_RM>; let Predicates = [HasAVX] in { def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), - (Int_VCVTDQ2PSrr VR128:$src)>; + (VCVTDQ2PSrr VR128:$src)>; def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), - (Int_VCVTDQ2PSrm addr:$src)>; + (VCVTDQ2PSrm addr:$src)>; + + def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), + (VCVTDQ2PSrr VR128:$src)>; + def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))), + (VCVTDQ2PSrm addr:$src)>; def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), (VCVTTPS2DQrr VR128:$src)>; @@ 
-1963,9 +1931,14 @@ let Predicates = [HasAVX] in { let Predicates = [HasSSE2] in { def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), - (Int_CVTDQ2PSrr VR128:$src)>; + (CVTDQ2PSrr VR128:$src)>; def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), - (Int_CVTDQ2PSrm addr:$src)>; + (CVTDQ2PSrm addr:$src)>; + + def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), + (CVTDQ2PSrr VR128:$src)>; + def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))), + (CVTDQ2PSrm addr:$src)>; def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), (CVTTPS2DQrr VR128:$src)>; @@ -1978,12 +1951,7 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))], IIC_SSE_CVT_PD_RR>, VEX; -let isCodeGenOnly = 1 in -def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvttpd2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvttpd2dq - (memop addr:$src)))], - IIC_SSE_CVT_PD_RM>, VEX; + def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))], @@ -1991,31 +1959,38 @@ def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq - (memop addr:$src)))], + (memopv2f64 addr:$src)))], IIC_SSE_CVT_PD_RM>; // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. // Provide other assembly rr and rm forms to address this explicitly. -def VCVTTPD2DQXrYr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), - "cvttpd2dq\t{$src, $dst|$dst, $src}", [], - IIC_SSE_CVT_PD_RR>, VEX; // XMM only -def VCVTTPD2DQXrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvttpd2dqx\t{$src, $dst|$dst, $src}", [], - IIC_SSE_CVT_PD_RR>, VEX; +def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", + (VCVTTPD2DQrr VR128:$dst, VR128:$src)>; def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvttpd2dqx\t{$src, $dst|$dst, $src}", [], + "cvttpd2dqx\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttpd2dq + (memopv2f64 addr:$src)))], IIC_SSE_CVT_PD_RM>, VEX; // YMM only def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), - "cvttpd2dqy\t{$src, $dst|$dst, $src}", [], + "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PD_RR>, VEX; def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), - "cvttpd2dqy\t{$src, $dst|$dst, $src}", [], + "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PD_RM>, VEX, VEX_L; +def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}", + (VCVTTPD2DQYrr VR128:$dst, VR256:$src)>; + +let Predicates = [HasAVX] in { + def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), + (VCVTTPD2DQYrr VR256:$src)>; + def : Pat<(v4i32 (fp_to_sint (memopv4f64 addr:$src))), + (VCVTTPD2DQYrm addr:$src)>; +} // Predicates = [HasAVX] // Convert packed single to packed double let Predicates = [HasAVX] in { @@ -2033,35 +2008,71 @@ def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PD_RM>, TB, VEX; } + +let Predicates = [HasSSE2] in { def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PD_RR>, 
TB; def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PD_RM>, TB; +} -def Int_VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "vcvtps2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))], - IIC_SSE_CVT_PD_RR>, - TB, VEX, Requires<[HasAVX]>; -def Int_VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), - "vcvtps2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtps2pd - (load addr:$src)))], - IIC_SSE_CVT_PD_RM>, - TB, VEX, Requires<[HasAVX]>; -def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtps2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))], - IIC_SSE_CVT_PD_RR>, - TB, Requires<[HasSSE2]>; -def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), - "cvtps2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtps2pd - (load addr:$src)))], - IIC_SSE_CVT_PD_RM>, - TB, Requires<[HasSSE2]>; +let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse2_cvtps2pd VR128:$src), + (VCVTPS2PDrr VR128:$src)>; +} + +let Predicates = [HasSSE2] in { + def : Pat<(int_x86_sse2_cvtps2pd VR128:$src), + (CVTPS2PDrr VR128:$src)>; +} + +// Convert Packed DW Integers to Packed Double FP +let Predicates = [HasAVX] in { +def VCVTDQ2PDrm : SSDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTDQ2PDrr : SSDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTDQ2PDYrm : SSDI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTDQ2PDYrr : SSDI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; +} + +def CVTDQ2PDrm : SSDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "cvtdq2pd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RM>; +def CVTDQ2PDrr : SSDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtdq2pd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RR>; + +// 128 bit register conversion intrinsics +let Predicates = [HasAVX] in +def : Pat<(int_x86_sse2_cvtdq2pd VR128:$src), + (VCVTDQ2PDrr VR128:$src)>; + +let Predicates = [HasSSE2] in +def : Pat<(int_x86_sse2_cvtdq2pd VR128:$src), + (CVTDQ2PDrr VR128:$src)>; + +// AVX 256-bit register conversion intrinsics +let Predicates = [HasAVX] in { + def : Pat<(int_x86_avx_cvtdq2_pd_256 VR128:$src), + (VCVTDQ2PDYrr VR128:$src)>; + def : Pat<(int_x86_avx_cvtdq2_pd_256 (bitconvert (memopv2i64 addr:$src))), + (VCVTDQ2PDYrm addr:$src)>; + + def : Pat<(int_x86_avx_cvt_pd2dq_256 VR256:$src), + (VCVTPD2DQYrr VR256:$src)>; + def : Pat<(int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)), + (VCVTPD2DQYrm addr:$src)>; + + def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))), + (VCVTDQ2PDYrr VR128:$src)>; + def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), + (VCVTDQ2PDYrm addr:$src)>; +} // Predicates = [HasAVX] // Convert packed double to packed single // The assembler can recognize rr 256-bit instructions by seeing a ymm @@ -2070,25 +2081,24 @@ def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PD_RR>, VEX; -def VCVTPD2PSXrYr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), -
"cvtpd2ps\t{$src, $dst|$dst, $src}", [], - IIC_SSE_CVT_PD_RR>, VEX; // XMM only -def VCVTPD2PSXrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtpd2psx\t{$src, $dst|$dst, $src}", [], - IIC_SSE_CVT_PD_RR>, VEX; +def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", + (VCVTPD2PSrr VR128:$dst, VR128:$src)>; def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2psx\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PD_RM>, VEX; // YMM only def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), - "cvtpd2psy\t{$src, $dst|$dst, $src}", [], + "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PD_RR>, VEX; def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), - "cvtpd2psy\t{$src, $dst|$dst, $src}", [], + "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PD_RM>, VEX, VEX_L; +def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}", + (VCVTPD2PSYrr VR128:$dst, VR256:$src)>; + def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PD_RR>; @@ -2097,64 +2107,60 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), IIC_SSE_CVT_PD_RM>; -def Int_VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtpd2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))], - IIC_SSE_CVT_PD_RR>; -def Int_VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), - (ins f128mem:$src), - "cvtpd2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2ps - (memop addr:$src)))], - IIC_SSE_CVT_PD_RM>; -def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtpd2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))], - IIC_SSE_CVT_PD_RR>; -def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvtpd2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2ps - (memop addr:$src)))], - IIC_SSE_CVT_PD_RM>; +let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse2_cvtpd2ps VR128:$src), + (VCVTPD2PSrr VR128:$src)>; + def : Pat<(int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)), + (VCVTPD2PSXrm addr:$src)>; +} + +let Predicates = [HasSSE2] in { + def : Pat<(int_x86_sse2_cvtpd2ps VR128:$src), + (CVTPD2PSrr VR128:$src)>; + def : Pat<(int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)), + (CVTPD2PSrm addr:$src)>; +} // AVX 256-bit register conversion intrinsics // FIXME: Migrate SSE conversion intrinsics matching to use patterns as below // whenever possible to avoid declaring two versions of each one. 
-def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src), - (VCVTDQ2PSYrr VR256:$src)>; -def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (memopv4i64 addr:$src))), - (VCVTDQ2PSYrm addr:$src)>; - -def : Pat<(int_x86_avx_cvt_pd2_ps_256 VR256:$src), - (VCVTPD2PSYrr VR256:$src)>; -def : Pat<(int_x86_avx_cvt_pd2_ps_256 (memopv4f64 addr:$src)), - (VCVTPD2PSYrm addr:$src)>; - -def : Pat<(int_x86_avx_cvt_ps2dq_256 VR256:$src), - (VCVTPS2DQYrr VR256:$src)>; -def : Pat<(int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)), - (VCVTPS2DQYrm addr:$src)>; - -def : Pat<(int_x86_avx_cvt_ps2_pd_256 VR128:$src), - (VCVTPS2PDYrr VR128:$src)>; -def : Pat<(int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)), - (VCVTPS2PDYrm addr:$src)>; - -def : Pat<(int_x86_avx_cvtt_pd2dq_256 VR256:$src), - (VCVTTPD2DQYrr VR256:$src)>; -def : Pat<(int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)), - (VCVTTPD2DQYrm addr:$src)>; - -// Match fround and fextend for 128/256-bit conversions -def : Pat<(v4f32 (fround (v4f64 VR256:$src))), - (VCVTPD2PSYrr VR256:$src)>; -def : Pat<(v4f32 (fround (loadv4f64 addr:$src))), - (VCVTPD2PSYrm addr:$src)>; - -def : Pat<(v4f64 (fextend (v4f32 VR128:$src))), - (VCVTPS2PDYrr VR128:$src)>; -def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))), - (VCVTPS2PDYrm addr:$src)>; +let Predicates = [HasAVX] in { + def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src), + (VCVTDQ2PSYrr VR256:$src)>; + def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (memopv4i64 addr:$src))), + (VCVTDQ2PSYrm addr:$src)>; + + def : Pat<(int_x86_avx_cvt_pd2_ps_256 VR256:$src), + (VCVTPD2PSYrr VR256:$src)>; + def : Pat<(int_x86_avx_cvt_pd2_ps_256 (memopv4f64 addr:$src)), + (VCVTPD2PSYrm addr:$src)>; + + def : Pat<(int_x86_avx_cvt_ps2dq_256 VR256:$src), + (VCVTPS2DQYrr VR256:$src)>; + def : Pat<(int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)), + (VCVTPS2DQYrm addr:$src)>; + + def : Pat<(int_x86_avx_cvt_ps2_pd_256 VR128:$src), + (VCVTPS2PDYrr VR128:$src)>; + def : Pat<(int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)), + (VCVTPS2PDYrm addr:$src)>; + + def : Pat<(int_x86_avx_cvtt_pd2dq_256 VR256:$src), + (VCVTTPD2DQYrr VR256:$src)>; + def : Pat<(int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)), + (VCVTTPD2DQYrm addr:$src)>; + + // Match fround and fextend for 128/256-bit conversions + def : Pat<(v4f32 (fround (v4f64 VR256:$src))), + (VCVTPD2PSYrr VR256:$src)>; + def : Pat<(v4f32 (fround (loadv4f64 addr:$src))), + (VCVTPD2PSYrm addr:$src)>; + + def : Pat<(v4f64 (fextend (v4f32 VR128:$src))), + (VCVTPS2PDYrr VR128:$src)>; + def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))), + (VCVTPS2PDYrm addr:$src)>; +} //===----------------------------------------------------------------------===// // SSE 1 & 2 - Compare Instructions @@ -3336,13 +3342,6 @@ let AddedComplexity = 400 in { // Prefer non-temporal versions IIC_SSE_MOVNT>, VEX; } -def : Pat<(int_x86_avx_movnt_dq_256 addr:$dst, VR256:$src), - (VMOVNTDQYmr addr:$dst, VR256:$src)>; -def : Pat<(int_x86_avx_movnt_pd_256 addr:$dst, VR256:$src), - (VMOVNTPDYmr addr:$dst, VR256:$src)>; -def : Pat<(int_x86_avx_movnt_ps_256 addr:$dst, VR256:$src), - (VMOVNTPSYmr addr:$dst, VR256:$src)>; - let AddedComplexity = 400 in { // Prefer non-temporal versions def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntps\t{$src, $dst|$dst, $src}", @@ -4610,7 +4609,7 @@ def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), // Bitcast FR64 <-> GR64 // let Predicates = [HasAVX] in -def VMOV64toSDrm : S3SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), +def 
VMOV64toSDrm : SSDI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>, VEX; @@ -4623,7 +4622,7 @@ def VMOVSDto64mr : VRPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), [(store (i64 (bitconvert FR64:$src)), addr:$dst)], IIC_SSE_MOVDQ>, VEX; -def MOV64toSDrm : S3SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), +def MOV64toSDrm : SSDI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))], IIC_SSE_MOVDQ>; @@ -4897,80 +4896,6 @@ def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, XS; //===---------------------------------------------------------------------===// -// SSE3 - Conversion Instructions -//===---------------------------------------------------------------------===// - -// Convert Packed Double FP to Packed DW Integers -let Predicates = [HasAVX] in { -// The assembler can recognize rr 256-bit instructions by seeing a ymm -// register, but the same isn't true when using memory operands instead. -// Provide other assembly rr and rm forms to address this explicitly. -def VCVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "vcvtpd2dq\t{$src, $dst|$dst, $src}", []>, VEX; -def VCVTPD2DQXrYr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), - "vcvtpd2dq\t{$src, $dst|$dst, $src}", []>, VEX; - -// XMM only -def VCVTPD2DQXrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "vcvtpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX; -def VCVTPD2DQXrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "vcvtpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX; - -// YMM only -def VCVTPD2DQYrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), - "vcvtpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX; -def VCVTPD2DQYrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), - "vcvtpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L; -} - -def CVTPD2DQrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvtpd2dq\t{$src, $dst|$dst, $src}", [], - IIC_SSE_CVT_PD_RM>; -def CVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtpd2dq\t{$src, $dst|$dst, $src}", [], - IIC_SSE_CVT_PD_RR>; - -def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), - (VCVTTPD2DQYrr VR256:$src)>; -def : Pat<(v4i32 (fp_to_sint (memopv4f64 addr:$src))), - (VCVTTPD2DQYrm addr:$src)>; - -// Convert Packed DW Integers to Packed Double FP -let Predicates = [HasAVX] in { -def VCVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; -def VCVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; -def VCVTDQ2PDYrm : S3SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), - "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; -def VCVTDQ2PDYrr : S3SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), - "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; -} - -def CVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvtdq2pd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_CVT_PD_RR>; -def CVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtdq2pd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_CVT_PD_RM>; - -// AVX 256-bit register conversion intrinsics -def : Pat<(int_x86_avx_cvtdq2_pd_256 VR128:$src), - (VCVTDQ2PDYrr 
VR128:$src)>; -def : Pat<(int_x86_avx_cvtdq2_pd_256 (bitconvert (memopv2i64 addr:$src))), - (VCVTDQ2PDYrm addr:$src)>; - -def : Pat<(int_x86_avx_cvt_pd2dq_256 VR256:$src), - (VCVTPD2DQYrr VR256:$src)>; -def : Pat<(int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)), - (VCVTPD2DQYrm addr:$src)>; - -def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))), - (VCVTDQ2PDYrr VR128:$src)>; -def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), - (VCVTDQ2PDYrm addr:$src)>; - -//===---------------------------------------------------------------------===// // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP //===---------------------------------------------------------------------===// multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, @@ -5730,14 +5655,26 @@ let Predicates = [HasSSE41] in { (PMOVZXDQrm addr:$src)>; } +let Predicates = [HasAVX2] in { + let AddedComplexity = 15 in { + def : Pat<(v4i64 (X86vzmovly (v4i32 VR128:$src))), + (VPMOVZXDQYrr VR128:$src)>; + def : Pat<(v8i32 (X86vzmovly (v8i16 VR128:$src))), + (VPMOVZXWDYrr VR128:$src)>; + } + + def : Pat<(v4i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>; + def : Pat<(v8i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>; +} + let Predicates = [HasAVX] in { -def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>; -def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>; + def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>; + def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>; } let Predicates = [HasSSE41] in { -def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>; -def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>; + def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>; + def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>; } @@ -6608,15 +6545,15 @@ let Predicates = [HasAVX] in { let isCommutable = 0 in { let ExeDomain = SSEPackedSingle in { defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, - VR128, memopv4f32, i128mem, 0>, VEX_4V; + VR128, memopv4f32, f128mem, 0>, VEX_4V; defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", - int_x86_avx_blend_ps_256, VR256, memopv8f32, i256mem, 0>, VEX_4V; + int_x86_avx_blend_ps_256, VR256, memopv8f32, f256mem, 0>, VEX_4V; } let ExeDomain = SSEPackedDouble in { defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, - VR128, memopv2f64, i128mem, 0>, VEX_4V; + VR128, memopv2f64, f128mem, 0>, VEX_4V; defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", - int_x86_avx_blend_pd_256, VR256, memopv4f64, i256mem, 0>, VEX_4V; + int_x86_avx_blend_pd_256, VR256, memopv4f64, f256mem, 0>, VEX_4V; } defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw, VR128, memopv2i64, i128mem, 0>, VEX_4V; @@ -6625,10 +6562,10 @@ let Predicates = [HasAVX] in { } let ExeDomain = SSEPackedSingle in defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, - VR128, memopv4f32, i128mem, 0>, VEX_4V; + VR128, memopv4f32, f128mem, 0>, VEX_4V; let ExeDomain = SSEPackedDouble in defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, - VR128, memopv2f64, i128mem, 0>, VEX_4V; + VR128, memopv2f64, f128mem, 0>, VEX_4V; let ExeDomain = SSEPackedSingle in defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, VR256, memopv8f32, i256mem, 0>, VEX_4V; @@ -6647,10 +6584,10 @@ let Constraints = "$src1 = 
$dst" in { let isCommutable = 0 in { let ExeDomain = SSEPackedSingle in defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps, - VR128, memopv4f32, i128mem>; + VR128, memopv4f32, f128mem>; let ExeDomain = SSEPackedDouble in defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd, - VR128, memopv2f64, i128mem>; + VR128, memopv2f64, f128mem>; defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw, VR128, memopv2i64, i128mem>; defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, @@ -6658,10 +6595,10 @@ let Constraints = "$src1 = $dst" in { } let ExeDomain = SSEPackedSingle in defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, - VR128, memopv4f32, i128mem>; + VR128, memopv4f32, f128mem>; let ExeDomain = SSEPackedDouble in defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd, - VR128, memopv2f64, i128mem>; + VR128, memopv2f64, f128mem>; } /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators @@ -6687,15 +6624,15 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX] in { let ExeDomain = SSEPackedDouble in { -defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, i128mem, +defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem, memopv2f64, int_x86_sse41_blendvpd>; -defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, i256mem, +defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem, memopv4f64, int_x86_avx_blendv_pd_256>; } // ExeDomain = SSEPackedDouble let ExeDomain = SSEPackedSingle in { -defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, i128mem, +defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem, memopv4f32, int_x86_sse41_blendvps>; -defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, i256mem, +defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem, memopv8f32, int_x86_avx_blendv_ps_256>; } // ExeDomain = SSEPackedSingle defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, @@ -6766,7 +6703,7 @@ let Predicates = [HasAVX2] in { /// SS41I_ternary_int - SSE 4.1 ternary operator let Uses = [XMM0], Constraints = "$src1 = $dst" in { multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag, - Intrinsic IntId> { + X86MemOperand x86memop, Intrinsic IntId> { def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, @@ -6775,7 +6712,7 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { OpSize; def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), + (ins VR128:$src1, x86memop:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR128:$dst, @@ -6785,14 +6722,28 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { } let ExeDomain = SSEPackedDouble in -defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, +defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem, int_x86_sse41_blendvpd>; let ExeDomain = SSEPackedSingle in -defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, +defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem, int_x86_sse41_blendvps>; -defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, +defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem, int_x86_sse41_pblendvb>; +// Aliases with the implicit xmm0 argument +def : InstAlias<"blendvpd\t{%xmm0, $src2, 
$dst|$dst, $src2, %xmm0}", + (BLENDVPDrr0 VR128:$dst, VR128:$src2)>; +def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", + (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>; +def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", + (BLENDVPSrr0 VR128:$dst, VR128:$src2)>; +def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", + (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>; +def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", + (PBLENDVBrr0 VR128:$dst, VR128:$src2)>; +def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", + (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>; + let Predicates = [HasSSE41] in { def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1), (v16i8 VR128:$src2))), @@ -7204,52 +7155,50 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), OpSize; //===----------------------------------------------------------------------===// -// CLMUL Instructions +// PCLMUL Instructions //===----------------------------------------------------------------------===// -// Carry-less Multiplication instructions -let neverHasSideEffects = 1 in { // AVX carry-less Multiplication instructions -def VPCLMULQDQrr : AVXCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), +def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3), "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>; + [(set VR128:$dst, + (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>; -let mayLoad = 1 in -def VPCLMULQDQrm : AVXCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), +def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, i8imm:$src3), "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>; + [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, + (memopv2i64 addr:$src2), imm:$src3))]>; +// Carry-less Multiplication instructions let Constraints = "$src1 = $dst" in { -def PCLMULQDQrr : CLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), +def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3), "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", - []>; + [(set VR128:$dst, + (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>; -let mayLoad = 1 in -def PCLMULQDQrm : CLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), +def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, i8imm:$src3), "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", - []>; + [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, + (memopv2i64 addr:$src2), imm:$src3))]>; } // Constraints = "$src1 = $dst" -} // neverHasSideEffects = 1 multiclass pclmul_alias<string asm, int immop> { - def : InstAlias<!strconcat("pclmul", asm, - "dq {$src, $dst|$dst, $src}"), + def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"), (PCLMULQDQrr VR128:$dst, VR128:$src, immop)>; - def : InstAlias<!strconcat("pclmul", asm, - "dq {$src, $dst|$dst, $src}"), + def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"), (PCLMULQDQrm VR128:$dst, i128mem:$src, immop)>; - def : InstAlias<!strconcat("vpclmul", asm, + def : InstAlias<!strconcat("vpclmul", asm, "dq {$src2, $src1, $dst|$dst, $src1, $src2}"), (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop)>; - def : InstAlias<!strconcat("vpclmul", asm, + def : InstAlias<!strconcat("vpclmul", asm, "dq {$src2, $src1, $dst|$dst, $src1, $src2}"), (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, 
immop)>; } @@ -7259,6 +7208,45 @@ defm : pclmul_alias<"lqhq", 0x10>; defm : pclmul_alias<"lqlq", 0x00>; //===----------------------------------------------------------------------===// +// SSE4A Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [HasSSE4A] in { + +let Constraints = "$src = $dst" in { +def EXTRQI : Ii8<0x78, MRM0r, (outs VR128:$dst), + (ins VR128:$src, i8imm:$len, i8imm:$idx), + "extrq\t{$idx, $len, $src|$src, $len, $idx}", + [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len, + imm:$idx))]>, TB, OpSize; +def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src, VR128:$mask), + "extrq\t{$mask, $src|$src, $mask}", + [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src, + VR128:$mask))]>, TB, OpSize; + +def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src, VR128:$src2, i8imm:$len, i8imm:$idx), + "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", + [(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src, + VR128:$src2, imm:$len, imm:$idx))]>, XD; +def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src, VR128:$mask), + "insertq\t{$mask, $src|$src, $mask}", + [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src, + VR128:$mask))]>, XD; +} + +def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src), + "movntss\t{$src, $dst|$dst, $src}", + [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS; + +def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movntsd\t{$src, $dst|$dst, $src}", + [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD; +} + +//===----------------------------------------------------------------------===// // AVX Instructions //===----------------------------------------------------------------------===// @@ -7286,7 +7274,7 @@ let ExeDomain = SSEPackedSingle in { int_x86_avx_vbroadcast_ss_256>; } let ExeDomain = SSEPackedDouble in -def VBROADCASTSDrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem, +def VBROADCASTSDYrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem, int_x86_avx_vbroadcast_sd_256>; def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem, int_x86_avx_vbroadcastf128_pd_256>; @@ -7298,8 +7286,8 @@ let ExeDomain = SSEPackedSingle in { int_x86_avx2_vbroadcast_ss_ps_256>; } let ExeDomain = SSEPackedDouble in -def VBROADCASTSDrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256, - int_x86_avx2_vbroadcast_sd_pd_256>; +def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256, + int_x86_avx2_vbroadcast_sd_pd_256>; let Predicates = [HasAVX2] in def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem, @@ -7595,7 +7583,6 @@ let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, // Half precision conversion instructions //===----------------------------------------------------------------------===// multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { -let Predicates = [HasAVX, HasF16C] in { def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), "vcvtph2ps\t{$src, $dst|$dst, $src}", [(set RC:$dst, (Int VR128:$src))]>, @@ -7604,27 +7591,26 @@ let Predicates = [HasAVX, HasF16C] in { def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX; } -} multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { -let Predicates = [HasAVX, HasF16C] in { def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), (ins RC:$src1, i32i8imm:$src2), 
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>, TA, OpSize, VEX; - let neverHasSideEffects = 1, mayLoad = 1 in - def mr : Ii8<0x1D, MRMDestMem, (outs x86memop:$dst), - (ins RC:$src1, i32i8imm:$src2), + let neverHasSideEffects = 1, mayStore = 1 in + def mr : Ii8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, RC:$src1, i32i8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, TA, OpSize, VEX; } -} -defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>; -defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>; -defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>; -defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>; +let Predicates = [HasAVX, HasF16C] in { + defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>; + defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>; + defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>; + defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>; +} //===----------------------------------------------------------------------===// // AVX2 Instructions @@ -7711,6 +7697,55 @@ let Predicates = [HasAVX2] in { (VPBROADCASTQrm addr:$src)>; def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), (VPBROADCASTQYrm addr:$src)>; + + def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))), + (VPBROADCASTBrr VR128:$src)>; + def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))), + (VPBROADCASTBYrr VR128:$src)>; + def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))), + (VPBROADCASTWrr VR128:$src)>; + def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))), + (VPBROADCASTWYrr VR128:$src)>; + def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))), + (VPBROADCASTDrr VR128:$src)>; + def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))), + (VPBROADCASTDYrr VR128:$src)>; + def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))), + (VPBROADCASTQrr VR128:$src)>; + def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))), + (VPBROADCASTQYrr VR128:$src)>; + def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))), + (VBROADCASTSSrr VR128:$src)>; + def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))), + (VBROADCASTSSYrr VR128:$src)>; + def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))), + (VPBROADCASTQrr VR128:$src)>; + def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))), + (VBROADCASTSDYrr VR128:$src)>; + + // Provide fallback in case the load node that is used in the patterns above + // is used by additional users, which prevents the pattern selection. 
+ let AddedComplexity = 20 in { + def : Pat<(v4f32 (X86VBroadcast FR32:$src)), + (VBROADCASTSSrr + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss))>; + def : Pat<(v8f32 (X86VBroadcast FR32:$src)), + (VBROADCASTSSYrr + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss))>; + def : Pat<(v4f64 (X86VBroadcast FR64:$src)), + (VBROADCASTSDYrr + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd))>; + + def : Pat<(v4i32 (X86VBroadcast GR32:$src)), + (VBROADCASTSSrr + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), GR32:$src, sub_ss))>; + def : Pat<(v8i32 (X86VBroadcast GR32:$src)), + (VBROADCASTSSYrr + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), GR32:$src, sub_ss))>; + def : Pat<(v4i64 (X86VBroadcast GR64:$src)), + (VBROADCASTSDYrr + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), GR64:$src, sub_sd))>; + } } // AVX1 broadcast patterns @@ -7718,16 +7753,62 @@ let Predicates = [HasAVX] in { def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), (VBROADCASTSSYrm addr:$src)>; def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), - (VBROADCASTSDrm addr:$src)>; + (VBROADCASTSDYrm addr:$src)>; def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))), (VBROADCASTSSYrm addr:$src)>; def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))), - (VBROADCASTSDrm addr:$src)>; - + (VBROADCASTSDYrm addr:$src)>; def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))), (VBROADCASTSSrm addr:$src)>; def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), (VBROADCASTSSrm addr:$src)>; + + // Provide fallback in case the load node that is used in the patterns above + // is used by additional users, which prevents the pattern selection. + let AddedComplexity = 20 in { + // 128bit broadcasts: + def : Pat<(v4f32 (X86VBroadcast FR32:$src)), + (VPSHUFDri + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss), 0)>; + def : Pat<(v8f32 (X86VBroadcast FR32:$src)), + (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), + (VPSHUFDri + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss), 0), + sub_xmm), + (VPSHUFDri + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss), + 0), 1)>; + def : Pat<(v4f64 (X86VBroadcast FR64:$src)), + (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), + (VPSHUFDri + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd), + 0x44), + sub_xmm), + (VPSHUFDri + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd), + 0x44), 1)>; + + def : Pat<(v4i32 (X86VBroadcast GR32:$src)), + (VPSHUFDri + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), GR32:$src, sub_ss), 0)>; + def : Pat<(v8i32 (X86VBroadcast GR32:$src)), + (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + (VPSHUFDri + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), GR32:$src, sub_ss), 0), + sub_xmm), + (VPSHUFDri + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), GR32:$src, sub_ss), + 0), 1)>; + def : Pat<(v4i64 (X86VBroadcast GR64:$src)), + (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), + (VPSHUFDri + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), GR64:$src, sub_sd), + 0x44), + sub_xmm), + (VPSHUFDri + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), GR64:$src, sub_sd), + 0x44), 1)>; + } } //===----------------------------------------------------------------------===// @@ -7820,8 +7901,8 @@ let neverHasSideEffects = 1 in { def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR128:$src2, i8imm:$src3), "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, - VEX_4V; + []>, VEX_4V; +let mayLoad = 1 in def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i128mem:$src2, i8imm:$src3), 
"vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", @@ -7954,3 +8035,30 @@ defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W; defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>; defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W; defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>; + +//===----------------------------------------------------------------------===// +// VGATHER - GATHER Operations +multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256, + X86MemOperand memop128, X86MemOperand memop256> { + def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb), + (ins VR128:$src1, memop128:$src2, VR128:$mask), + !strconcat(OpcodeStr, + "\t{$mask, $src2, $dst|$dst, $src2, $mask}"), + []>, VEX_4VOp3; + def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb), + (ins RC256:$src1, memop256:$src2, RC256:$mask), + !strconcat(OpcodeStr, + "\t{$mask, $src2, $dst|$dst, $src2, $mask}"), + []>, VEX_4VOp3, VEX_L; +} + +let Constraints = "$src1 = $dst, $mask = $mask_wb" in { + defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W; + defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W; + defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>; + defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>; + defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx64mem, vx64mem>, VEX_W; + defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx64mem, vy64mem>, VEX_W; + defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx32mem, vy32mem>; + defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx32mem, vy32mem>; +} diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index bddba6c..ea716bf 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -14,7 +14,8 @@ //===----------------------------------------------------------------------===// let Defs = [RAX, RDX] in - def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)]>, TB; + def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)], IIC_RDTSC>, + TB; let Defs = [RAX, RCX, RDX] in def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", []>, TB; @@ -26,14 +27,17 @@ let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in { def UD2B : I<0xB9, RawFrm, (outs), (ins), "ud2b", []>, TB; } -def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", []>; -def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", []>, TB; +def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", [], IIC_HLT>; +def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", [], IIC_RSM>, TB; // Interrupt and SysCall Instructions. let Uses = [EFLAGS] in def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>; def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", - [(int_x86_int (i8 3))]>; + [(int_x86_int (i8 3))], IIC_INT3>; + +def : Pat<(debugtrap), + (INT3)>; // The long form of "int $3" turns into int3 as a size optimization. // FIXME: This doesn't work because InstAlias can't match immediate constants. 
@@ -41,23 +45,25 @@ def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap", - [(int_x86_int imm:$trap)]>; + [(int_x86_int imm:$trap)], IIC_INT>; -def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", []>, TB; -def SYSRET : I<0x07, RawFrm, (outs), (ins), "sysret{l}", []>, TB; -def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysret{q}", []>, TB, +def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", [], IIC_SYSCALL>, TB; +def SYSRET : I<0x07, RawFrm, (outs), (ins), "sysret{l}", [], IIC_SYSCALL>, TB; +def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysret{q}", [], IIC_SYSCALL>, TB, Requires<[In64BitMode]>; -def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", []>, TB; - -def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", []>, TB; +def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", [], + IIC_SYS_ENTER_EXIT>, TB; + +def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", [], + IIC_SYS_ENTER_EXIT>, TB; def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", []>, TB, Requires<[In64BitMode]>; -def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", []>, OpSize; -def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", []>; -def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", []>, +def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>, OpSize; +def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", [], IIC_IRET>; +def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>, Requires<[In64BitMode]>; @@ -66,73 +72,73 @@ def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", []>, // let Defs = [AL], Uses = [DX] in def IN8rr : I<0xEC, RawFrm, (outs), (ins), - "in{b}\t{%dx, %al|AL, DX}", []>; + "in{b}\t{%dx, %al|AL, DX}", [], IIC_IN_RR>; let Defs = [AX], Uses = [DX] in def IN16rr : I<0xED, RawFrm, (outs), (ins), - "in{w}\t{%dx, %ax|AX, DX}", []>, OpSize; + "in{w}\t{%dx, %ax|AX, DX}", [], IIC_IN_RR>, OpSize; let Defs = [EAX], Uses = [DX] in def IN32rr : I<0xED, RawFrm, (outs), (ins), - "in{l}\t{%dx, %eax|EAX, DX}", []>; + "in{l}\t{%dx, %eax|EAX, DX}", [], IIC_IN_RR>; let Defs = [AL] in def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port), - "in{b}\t{$port, %al|AL, $port}", []>; + "in{b}\t{$port, %al|AL, $port}", [], IIC_IN_RI>; let Defs = [AX] in def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port), - "in{w}\t{$port, %ax|AX, $port}", []>, OpSize; + "in{w}\t{$port, %ax|AX, $port}", [], IIC_IN_RI>, OpSize; let Defs = [EAX] in def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port), - "in{l}\t{$port, %eax|EAX, $port}", []>; + "in{l}\t{$port, %eax|EAX, $port}", [], IIC_IN_RI>; let Uses = [DX, AL] in def OUT8rr : I<0xEE, RawFrm, (outs), (ins), - "out{b}\t{%al, %dx|DX, AL}", []>; + "out{b}\t{%al, %dx|DX, AL}", [], IIC_OUT_RR>; let Uses = [DX, AX] in def OUT16rr : I<0xEF, RawFrm, (outs), (ins), - "out{w}\t{%ax, %dx|DX, AX}", []>, OpSize; + "out{w}\t{%ax, %dx|DX, AX}", [], IIC_OUT_RR>, OpSize; let Uses = [DX, EAX] in def OUT32rr : I<0xEF, RawFrm, (outs), (ins), - "out{l}\t{%eax, %dx|DX, EAX}", []>; + "out{l}\t{%eax, %dx|DX, EAX}", [], IIC_OUT_RR>; let Uses = [AL] in def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port), - "out{b}\t{%al, $port|$port, AL}", []>; + "out{b}\t{%al, $port|$port, AL}", [], IIC_OUT_IR>; let Uses = [AX] in def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port), - "out{w}\t{%ax, $port|$port, AX}", []>, OpSize; + "out{w}\t{%ax, $port|$port, AX}", [], IIC_OUT_IR>, OpSize; let Uses = [EAX] in def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port), - 
"out{l}\t{%eax, $port|$port, EAX}", []>; + "out{l}\t{%eax, $port|$port, EAX}", [], IIC_OUT_IR>; -def IN8 : I<0x6C, RawFrm, (outs), (ins), "ins{b}", []>; -def IN16 : I<0x6D, RawFrm, (outs), (ins), "ins{w}", []>, OpSize; -def IN32 : I<0x6D, RawFrm, (outs), (ins), "ins{l}", []>; +def IN8 : I<0x6C, RawFrm, (outs), (ins), "ins{b}", [], IIC_INS>; +def IN16 : I<0x6D, RawFrm, (outs), (ins), "ins{w}", [], IIC_INS>, OpSize; +def IN32 : I<0x6D, RawFrm, (outs), (ins), "ins{l}", [], IIC_INS>; //===----------------------------------------------------------------------===// // Moves to and from debug registers def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src), - "mov{l}\t{$src, $dst|$dst, $src}", []>, TB; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB; def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src), - "mov{q}\t{$src, $dst|$dst, $src}", []>, TB; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB; def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR32:$src), - "mov{l}\t{$src, $dst|$dst, $src}", []>, TB; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB; def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src), - "mov{q}\t{$src, $dst|$dst, $src}", []>, TB; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB; //===----------------------------------------------------------------------===// // Moves to and from control registers def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG:$src), - "mov{l}\t{$src, $dst|$dst, $src}", []>, TB; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB; def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins CONTROL_REG:$src), - "mov{q}\t{$src, $dst|$dst, $src}", []>, TB; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB; def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src), - "mov{l}\t{$src, $dst|$dst, $src}", []>, TB; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB; def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src), - "mov{q}\t{$src, $dst|$dst, $src}", []>, TB; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB; //===----------------------------------------------------------------------===// // Segment override instruction prefixes @@ -150,254 +156,265 @@ def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>; // def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src), - "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize; + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize; def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src), - "mov{l}\t{$src, $dst|$dst, $src}", []>; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>; def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src), - "mov{q}\t{$src, $dst|$dst, $src}", []>; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>; def MOV16ms : I<0x8C, MRMDestMem, (outs i16mem:$dst), (ins SEGMENT_REG:$src), - "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize; + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize; def MOV32ms : I<0x8C, MRMDestMem, (outs i32mem:$dst), (ins SEGMENT_REG:$src), - "mov{l}\t{$src, $dst|$dst, $src}", []>; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>; def MOV64ms : RI<0x8C, MRMDestMem, (outs i64mem:$dst), (ins SEGMENT_REG:$src), - "mov{q}\t{$src, $dst|$dst, $src}", []>; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>; def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins 
GR16:$src), - "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize; + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize; def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src), - "mov{l}\t{$src, $dst|$dst, $src}", []>; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>; def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src), - "mov{q}\t{$src, $dst|$dst, $src}", []>; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>; def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src), - "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize; + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize; def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src), - "mov{l}\t{$src, $dst|$dst, $src}", []>; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>; def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src), - "mov{q}\t{$src, $dst|$dst, $src}", []>; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>; //===----------------------------------------------------------------------===// // Segmentation support instructions. -def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", []>, TB; +def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB; def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), - "lar{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB, OpSize; def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), - "lar{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB, OpSize; // i16mem operand in LAR32rm and GR32 operand in LAR32rr is not a typo. def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), - "lar{l}\t{$src, $dst|$dst, $src}", []>, TB; + "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB; def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "lar{l}\t{$src, $dst|$dst, $src}", []>, TB; + "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB; // i16mem operand in LAR64rm and GR32 operand in LAR64rr is not a typo. 
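// Concretely (illustrative AT&T-syntax assembly, not part of this patch):
//
//   lar (%rdi), %eax    # memory source is always a 16-bit selector -> i16mem
//   lar %ecx, %rax      # 64-bit form still reads only a selector -> GR32 src
//
// lar consumes a 16-bit segment selector regardless of the destination
// width, which is what the operand comments above are pointing out.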
def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), - "lar{q}\t{$src, $dst|$dst, $src}", []>, TB; + "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB; def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src), - "lar{q}\t{$src, $dst|$dst, $src}", []>, TB; + "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB; def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), - "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB, OpSize; def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), - "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB, OpSize; def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), - "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB; + "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB; def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB; + "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB; def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), - "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB; + "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB; def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), - "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB; + "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB; -def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr", []>, TB; +def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr", + [], IIC_INVLPG>, TB; def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins), - "str{w}\t$dst", []>, TB, OpSize; + "str{w}\t$dst", [], IIC_STR>, TB, OpSize; def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins), - "str{l}\t$dst", []>, TB; + "str{l}\t$dst", [], IIC_STR>, TB; def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins), - "str{q}\t$dst", []>, TB; + "str{q}\t$dst", [], IIC_STR>, TB; def STRm : I<0x00, MRM1m, (outs i16mem:$dst), (ins), - "str{w}\t$dst", []>, TB; + "str{w}\t$dst", [], IIC_STR>, TB; def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src), - "ltr{w}\t$src", []>, TB; + "ltr{w}\t$src", [], IIC_LTR>, TB; def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src), - "ltr{w}\t$src", []>, TB; + "ltr{w}\t$src", [], IIC_LTR>, TB; def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins), - "push{w}\t{%cs|CS}", []>, Requires<[In32BitMode]>, OpSize; + "push{w}\t{%cs|CS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + OpSize; def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins), - "push{l}\t{%cs|CS}", []>, Requires<[In32BitMode]>; + "push{l}\t{%cs|CS}", [], IIC_PUSH_CS>, Requires<[In32BitMode]>; def PUSHSS16 : I<0x16, RawFrm, (outs), (ins), - "push{w}\t{%ss|SS}", []>, Requires<[In32BitMode]>, OpSize; + "push{w}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + OpSize; def PUSHSS32 : I<0x16, RawFrm, (outs), (ins), - "push{l}\t{%ss|SS}", []>, Requires<[In32BitMode]>; + "push{l}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins), - "push{w}\t{%ds|DS}", []>, Requires<[In32BitMode]>, OpSize; + "push{w}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + OpSize; def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins), - "push{l}\t{%ds|DS}", []>, Requires<[In32BitMode]>; + "push{l}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; def PUSHES16 : I<0x06, RawFrm, (outs), (ins), - "push{w}\t{%es|ES}", []>, Requires<[In32BitMode]>, OpSize; + "push{w}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + OpSize; def PUSHES32 : 
I<0x06, RawFrm, (outs), (ins), - "push{l}\t{%es|ES}", []>, Requires<[In32BitMode]>; + "push{l}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins), - "push{w}\t{%fs|FS}", []>, OpSize, TB; + "push{w}\t{%fs|FS}", [], IIC_PUSH_SR>, OpSize, TB; def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins), - "push{l}\t{%fs|FS}", []>, TB, Requires<[In32BitMode]>; + "push{l}\t{%fs|FS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>; def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins), - "push{w}\t{%gs|GS}", []>, OpSize, TB; + "push{w}\t{%gs|GS}", [], IIC_PUSH_SR>, OpSize, TB; def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins), - "push{l}\t{%gs|GS}", []>, TB, Requires<[In32BitMode]>; + "push{l}\t{%gs|GS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>; def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins), - "push{q}\t{%fs|FS}", []>, TB; + "push{q}\t{%fs|FS}", [], IIC_PUSH_SR>, TB; def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins), - "push{q}\t{%gs|GS}", []>, TB; + "push{q}\t{%gs|GS}", [], IIC_PUSH_SR>, TB; // No "pop cs" instruction. def POPSS16 : I<0x17, RawFrm, (outs), (ins), - "pop{w}\t{%ss|SS}", []>, OpSize, Requires<[In32BitMode]>; + "pop{w}\t{%ss|SS}", [], IIC_POP_SR_SS>, + OpSize, Requires<[In32BitMode]>; def POPSS32 : I<0x17, RawFrm, (outs), (ins), - "pop{l}\t{%ss|SS}", []> , Requires<[In32BitMode]>; + "pop{l}\t{%ss|SS}", [], IIC_POP_SR_SS>, + Requires<[In32BitMode]>; def POPDS16 : I<0x1F, RawFrm, (outs), (ins), - "pop{w}\t{%ds|DS}", []>, OpSize, Requires<[In32BitMode]>; + "pop{w}\t{%ds|DS}", [], IIC_POP_SR>, + OpSize, Requires<[In32BitMode]>; def POPDS32 : I<0x1F, RawFrm, (outs), (ins), - "pop{l}\t{%ds|DS}", []> , Requires<[In32BitMode]>; + "pop{l}\t{%ds|DS}", [], IIC_POP_SR>, + Requires<[In32BitMode]>; def POPES16 : I<0x07, RawFrm, (outs), (ins), - "pop{w}\t{%es|ES}", []>, OpSize, Requires<[In32BitMode]>; + "pop{w}\t{%es|ES}", [], IIC_POP_SR>, + OpSize, Requires<[In32BitMode]>; def POPES32 : I<0x07, RawFrm, (outs), (ins), - "pop{l}\t{%es|ES}", []> , Requires<[In32BitMode]>; + "pop{l}\t{%es|ES}", [], IIC_POP_SR>, + Requires<[In32BitMode]>; def POPFS16 : I<0xa1, RawFrm, (outs), (ins), - "pop{w}\t{%fs|FS}", []>, OpSize, TB; + "pop{w}\t{%fs|FS}", [], IIC_POP_SR>, OpSize, TB; def POPFS32 : I<0xa1, RawFrm, (outs), (ins), - "pop{l}\t{%fs|FS}", []>, TB , Requires<[In32BitMode]>; + "pop{l}\t{%fs|FS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>; def POPFS64 : I<0xa1, RawFrm, (outs), (ins), - "pop{q}\t{%fs|FS}", []>, TB; + "pop{q}\t{%fs|FS}", [], IIC_POP_SR>, TB; def POPGS16 : I<0xa9, RawFrm, (outs), (ins), - "pop{w}\t{%gs|GS}", []>, OpSize, TB; + "pop{w}\t{%gs|GS}", [], IIC_POP_SR>, OpSize, TB; def POPGS32 : I<0xa9, RawFrm, (outs), (ins), - "pop{l}\t{%gs|GS}", []>, TB , Requires<[In32BitMode]>; + "pop{l}\t{%gs|GS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>; def POPGS64 : I<0xa9, RawFrm, (outs), (ins), - "pop{q}\t{%gs|GS}", []>, TB; + "pop{q}\t{%gs|GS}", [], IIC_POP_SR>, TB; def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), - "lds{w}\t{$src, $dst|$dst, $src}", []>, OpSize; + "lds{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize; def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), - "lds{l}\t{$src, $dst|$dst, $src}", []>; + "lds{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>; def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), - "lss{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "lss{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize; def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), - 
"lss{l}\t{$src, $dst|$dst, $src}", []>, TB; + "lss{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), - "lss{q}\t{$src, $dst|$dst, $src}", []>, TB; + "lss{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), - "les{w}\t{$src, $dst|$dst, $src}", []>, OpSize; + "les{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize; def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), - "les{l}\t{$src, $dst|$dst, $src}", []>; + "les{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>; def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), - "lfs{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "lfs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize; def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), - "lfs{l}\t{$src, $dst|$dst, $src}", []>, TB; + "lfs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), - "lfs{q}\t{$src, $dst|$dst, $src}", []>, TB; + "lfs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), - "lgs{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "lgs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize; def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), - "lgs{l}\t{$src, $dst|$dst, $src}", []>, TB; + "lgs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), - "lgs{q}\t{$src, $dst|$dst, $src}", []>, TB; + "lgs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg), - "verr\t$seg", []>, TB; + "verr\t$seg", [], IIC_VERR>, TB; def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg), - "verr\t$seg", []>, TB; + "verr\t$seg", [], IIC_VERR>, TB; def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg), - "verw\t$seg", []>, TB; + "verw\t$seg", [], IIC_VERW_MEM>, TB; def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg), - "verw\t$seg", []>, TB; + "verw\t$seg", [], IIC_VERW_REG>, TB; //===----------------------------------------------------------------------===// // Descriptor-table support instructions def SGDT16m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins), - "sgdtw\t$dst", []>, TB, OpSize, Requires<[In32BitMode]>; + "sgdtw\t$dst", [], IIC_SGDT>, TB, OpSize, Requires<[In32BitMode]>; def SGDTm : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins), - "sgdt\t$dst", []>, TB; + "sgdt\t$dst", [], IIC_SGDT>, TB; def SIDT16m : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins), - "sidtw\t$dst", []>, TB, OpSize, Requires<[In32BitMode]>; + "sidtw\t$dst", [], IIC_SIDT>, TB, OpSize, Requires<[In32BitMode]>; def SIDTm : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins), "sidt\t$dst", []>, TB; def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins), - "sldt{w}\t$dst", []>, TB, OpSize; + "sldt{w}\t$dst", [], IIC_SLDT>, TB, OpSize; def SLDT16m : I<0x00, MRM0m, (outs i16mem:$dst), (ins), - "sldt{w}\t$dst", []>, TB; + "sldt{w}\t$dst", [], IIC_SLDT>, TB; def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins), - "sldt{l}\t$dst", []>, TB; + "sldt{l}\t$dst", [], IIC_SLDT>, TB; // LLDT is not interpreted specially in 64-bit mode because there is no sign // extension. 
def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins), - "sldt{q}\t$dst", []>, TB; + "sldt{q}\t$dst", [], IIC_SLDT>, TB; def SLDT64m : RI<0x00, MRM0m, (outs i16mem:$dst), (ins), - "sldt{q}\t$dst", []>, TB; + "sldt{q}\t$dst", [], IIC_SLDT>, TB; def LGDT16m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src), - "lgdtw\t$src", []>, TB, OpSize, Requires<[In32BitMode]>; + "lgdtw\t$src", [], IIC_LGDT>, TB, OpSize, Requires<[In32BitMode]>; def LGDTm : I<0x01, MRM2m, (outs), (ins opaque48mem:$src), - "lgdt\t$src", []>, TB; + "lgdt\t$src", [], IIC_LGDT>, TB; def LIDT16m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src), - "lidtw\t$src", []>, TB, OpSize, Requires<[In32BitMode]>; + "lidtw\t$src", [], IIC_LIDT>, TB, OpSize, Requires<[In32BitMode]>; def LIDTm : I<0x01, MRM3m, (outs), (ins opaque48mem:$src), - "lidt\t$src", []>, TB; + "lidt\t$src", [], IIC_LIDT>, TB; def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src), - "lldt{w}\t$src", []>, TB; + "lldt{w}\t$src", [], IIC_LLDT_REG>, TB; def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src), - "lldt{w}\t$src", []>, TB; + "lldt{w}\t$src", [], IIC_LLDT_MEM>, TB; //===----------------------------------------------------------------------===// // Specialized register support -def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", []>, TB; -def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", []>, TB; -def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", []>, TB; +def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB; +def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB; +def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [], IIC_RDPMC>, TB; def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins), - "smsw{w}\t$dst", []>, OpSize, TB; + "smsw{w}\t$dst", [], IIC_SMSW>, OpSize, TB; def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins), - "smsw{l}\t$dst", []>, TB; + "smsw{l}\t$dst", [], IIC_SMSW>, TB; // no m form encodable; use SMSW16m def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins), - "smsw{q}\t$dst", []>, TB; + "smsw{q}\t$dst", [], IIC_SMSW>, TB; // For memory operands, there is only a 16-bit form def SMSW16m : I<0x01, MRM4m, (outs i16mem:$dst), (ins), - "smsw{w}\t$dst", []>, TB; + "smsw{w}\t$dst", [], IIC_SMSW>, TB; def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src), - "lmsw{w}\t$src", []>, TB; + "lmsw{w}\t$src", [], IIC_LMSW_REG>, TB; def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src), - "lmsw{w}\t$src", []>, TB; + "lmsw{w}\t$src", [], IIC_LMSW_MEM>, TB; -def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", []>, TB; +def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB; //===----------------------------------------------------------------------===// // Cache instructions -def INVD : I<0x08, RawFrm, (outs), (ins), "invd", []>, TB; -def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", []>, TB; +def INVD : I<0x08, RawFrm, (outs), (ins), "invd", [], IIC_INVD>, TB; +def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB; //===----------------------------------------------------------------------===// // XSAVE instructions diff --git a/lib/Target/X86/X86InstrVMX.td b/lib/Target/X86/X86InstrVMX.td index 6a8f0c8..6d3548f 100644 --- a/lib/Target/X86/X86InstrVMX.td +++ b/lib/Target/X86/X86InstrVMX.td @@ -17,17 +17,17 @@ // 66 0F 38 80 def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), - "invept {$src2, $src1|$src1, $src2}", []>, OpSize, T8, + "invept\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8, Requires<[In32BitMode]>; def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins
GR64:$src1, i128mem:$src2), - "invept {$src2, $src1|$src1, $src2}", []>, OpSize, T8, + "invept\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8, Requires<[In64BitMode]>; // 66 0F 38 81 def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), - "invvpid {$src2, $src1|$src1, $src2}", []>, OpSize, T8, + "invvpid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8, Requires<[In32BitMode]>; def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), - "invvpid {$src2, $src1|$src1, $src2}", []>, OpSize, T8, + "invvpid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8, Requires<[In64BitMode]>; // 0F 01 C1 def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB; diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td index 65bbcb5..8ec2c68 100644 --- a/lib/Target/X86/X86InstrXOP.td +++ b/lib/Target/X86/X86InstrXOP.td @@ -15,7 +15,7 @@ multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (Int VR128:$src))]>, VEX; - def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX; } @@ -36,27 +36,19 @@ let isAsmParserOnly = 1 in { defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, memopv2i64>; defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, memopv2i64>; defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, memopv2i64>; - defm VFRCZPS : xop2op<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>; - defm VFRCZPD : xop2op<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>; } // Scalar load 2 addr operand instructions -let Constraints = "$src1 = $dst" in { multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int, Operand memop, ComplexPattern mem_cpat> { - def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, - VR128:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, VEX; - def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, - memop:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (Int VR128:$src1, - (bitconvert mem_cpat:$src2)))]>, VEX; + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int VR128:$src))]>, VEX; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, VEX; } -} // Constraints = "$src1 = $dst" - let isAsmParserOnly = 1 in { defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss, ssmem, sse_load_f32>; @@ -64,12 +56,26 @@ let isAsmParserOnly = 1 in { sdmem, sse_load_f64>; } +multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int, + PatFrag memop> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int VR128:$src))]>, VEX; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX; +} + +let isAsmParserOnly = 1 in { + defm VFRCZPS : xop2op128<0x80, "vfrczps", 
int_x86_xop_vfrcz_ps, memopv4f32>; + defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>; +} multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (Int VR256:$src))]>, VEX, VEX_L; + [(set VR256:$dst, (Int VR256:$src))]>, VEX; def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX; @@ -88,13 +94,13 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> { !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, VEX_4VOp3; def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2), + (ins VR128:$src1, i128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))]>, VEX_4V, VEX_W; def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst), - (ins f128mem:$src1, VR128:$src2), + (ins i128mem:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (Int (bitconvert (memopv2i64 addr:$src1)), VR128:$src2))]>, @@ -116,25 +122,23 @@ let isAsmParserOnly = 1 in { defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>; } -multiclass xop3opimm<bits<8> opc, string OpcodeStr> { - let neverHasSideEffects = 1 in { - def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, i8imm:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, VEX; - let mayLoad = 1 in - def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins f128mem:$src1, i8imm:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, VEX; - } +multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> { + def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, i8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (Int VR128:$src1, imm:$src2))]>, VEX; + def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, i8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (Int (bitconvert (memopv2i64 addr:$src1)), imm:$src2))]>, VEX; } let isAsmParserOnly = 1 in { - defm VPROTW : xop3opimm<0xC1, "vprotw">; - defm VPROTQ : xop3opimm<0xC3, "vprotq">; - defm VPROTD : xop3opimm<0xC2, "vprotd">; - defm VPROTB : xop3opimm<0xC0, "vprotb">; + defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>; + defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>; + defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>; + defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>; } // Instruction where second source can be memory, but third must be register @@ -146,7 +150,7 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> { [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_4V, VEX_I8IMM; def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2, VR128:$src3), + (ins VR128:$src1, i128mem:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, @@ -170,32 +174,31 @@ let isAsmParserOnly = 1 in { } // Instruction where second source can be memory, third must be imm8 
-multiclass xop4opimm<bits<8> opc, string OpcodeStr, SDNode OpNode, - ValueType VT> { +multiclass xop4opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> { def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128:$dst, - (VT (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>, VEX_4V; + [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, imm:$src3))]>, + VEX_4V; def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2, i8imm:$src3), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (VT (OpNode VR128:$src1, (bitconvert (memopv2i64 addr:$src2)), - imm:$src3)))]>, VEX_4V; + (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)), + imm:$src3))]>, VEX_4V; } let isAsmParserOnly = 1 in { - defm VPCOMB : xop4opimm<0xCC, "vpcomb", X86vpcom, v16i8>; - defm VPCOMW : xop4opimm<0xCD, "vpcomw", X86vpcom, v8i16>; - defm VPCOMD : xop4opimm<0xCE, "vpcomd", X86vpcom, v4i32>; - defm VPCOMQ : xop4opimm<0xCF, "vpcomq", X86vpcom, v2i64>; - defm VPCOMUB : xop4opimm<0xEC, "vpcomub", X86vpcomu, v16i8>; - defm VPCOMUW : xop4opimm<0xED, "vpcomuw", X86vpcomu, v8i16>; - defm VPCOMUD : xop4opimm<0xEE, "vpcomud", X86vpcomu, v4i32>; - defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", X86vpcomu, v2i64>; + defm VPCOMB : xop4opimm<0xCC, "vpcomb", int_x86_xop_vpcomb>; + defm VPCOMW : xop4opimm<0xCD, "vpcomw", int_x86_xop_vpcomw>; + defm VPCOMD : xop4opimm<0xCE, "vpcomd", int_x86_xop_vpcomd>; + defm VPCOMQ : xop4opimm<0xCF, "vpcomq", int_x86_xop_vpcomq>; + defm VPCOMUB : xop4opimm<0xEC, "vpcomub", int_x86_xop_vpcomub>; + defm VPCOMUW : xop4opimm<0xED, "vpcomuw", int_x86_xop_vpcomuw>; + defm VPCOMUD : xop4opimm<0xEE, "vpcomud", int_x86_xop_vpcomud>; + defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", int_x86_xop_vpcomuq>; } // Instruction where either second or third source can be memory @@ -207,7 +210,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> { [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_4V, VEX_I8IMM; def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, f128mem:$src3), + (ins VR128:$src1, VR128:$src2, i128mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, @@ -215,7 +218,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> { (bitconvert (memopv2i64 addr:$src3))))]>, VEX_4V, VEX_I8IMM, VEX_W, MemOp4; def mr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2, VR128:$src3), + (ins VR128:$src1, i128mem:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, @@ -237,7 +240,7 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> { [(set VR256:$dst, (Int VR256:$src1, VR256:$src2, VR256:$src3))]>, VEX_4V, VEX_I8IMM; def rmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, f256mem:$src3), + (ins VR256:$src1, VR256:$src2, i256mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index b578e8d..df7507c 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -156,10 +156,14 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const 
MachineOperand &MO, break; case X86II::MO_SECREL: RefKind = MCSymbolRefExpr::VK_SECREL; break; case X86II::MO_TLSGD: RefKind = MCSymbolRefExpr::VK_TLSGD; break; + case X86II::MO_TLSLD: RefKind = MCSymbolRefExpr::VK_TLSLD; break; + case X86II::MO_TLSLDM: RefKind = MCSymbolRefExpr::VK_TLSLDM; break; case X86II::MO_GOTTPOFF: RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break; case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break; case X86II::MO_TPOFF: RefKind = MCSymbolRefExpr::VK_TPOFF; break; + case X86II::MO_DTPOFF: RefKind = MCSymbolRefExpr::VK_DTPOFF; break; case X86II::MO_NTPOFF: RefKind = MCSymbolRefExpr::VK_NTPOFF; break; + case X86II::MO_GOTNTPOFF: RefKind = MCSymbolRefExpr::VK_GOTNTPOFF; break; case X86II::MO_GOTPCREL: RefKind = MCSymbolRefExpr::VK_GOTPCREL; break; case X86II::MO_GOT: RefKind = MCSymbolRefExpr::VK_GOT; break; case X86II::MO_GOTOFF: RefKind = MCSymbolRefExpr::VK_GOTOFF; break; @@ -550,17 +554,38 @@ ReSimplify: static void LowerTlsAddr(MCStreamer &OutStreamer, X86MCInstLower &MCInstLowering, const MachineInstr &MI) { - bool is64Bits = MI.getOpcode() == X86::TLS_addr64; + + bool is64Bits = MI.getOpcode() == X86::TLS_addr64 || + MI.getOpcode() == X86::TLS_base_addr64; + + bool needsPadding = MI.getOpcode() == X86::TLS_addr64; + MCContext &context = OutStreamer.getContext(); - if (is64Bits) { + if (needsPadding) { MCInst prefix; prefix.setOpcode(X86::DATA16_PREFIX); OutStreamer.EmitInstruction(prefix); } + + MCSymbolRefExpr::VariantKind SRVK; + switch (MI.getOpcode()) { + case X86::TLS_addr32: + case X86::TLS_addr64: + SRVK = MCSymbolRefExpr::VK_TLSGD; + break; + case X86::TLS_base_addr32: + SRVK = MCSymbolRefExpr::VK_TLSLDM; + break; + case X86::TLS_base_addr64: + SRVK = MCSymbolRefExpr::VK_TLSLD; + break; + default: + llvm_unreachable("unexpected opcode"); + } + MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3)); - const MCSymbolRefExpr *symRef = - MCSymbolRefExpr::Create(sym, MCSymbolRefExpr::VK_TLSGD, context); + const MCSymbolRefExpr *symRef = MCSymbolRefExpr::Create(sym, SRVK, context); MCInst LEA; if (is64Bits) { @@ -571,6 +596,14 @@ static void LowerTlsAddr(MCStreamer &OutStreamer, LEA.addOperand(MCOperand::CreateReg(0)); // index LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp LEA.addOperand(MCOperand::CreateReg(0)); // seg + } else if (SRVK == MCSymbolRefExpr::VK_TLSLDM) { + LEA.setOpcode(X86::LEA32r); + LEA.addOperand(MCOperand::CreateReg(X86::EAX)); // dest + LEA.addOperand(MCOperand::CreateReg(X86::EBX)); // base + LEA.addOperand(MCOperand::CreateImm(1)); // scale + LEA.addOperand(MCOperand::CreateReg(0)); // index + LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp + LEA.addOperand(MCOperand::CreateReg(0)); // seg } else { LEA.setOpcode(X86::LEA32r); LEA.addOperand(MCOperand::CreateReg(X86::EAX)); // dest @@ -582,7 +615,7 @@ static void LowerTlsAddr(MCStreamer &OutStreamer, } OutStreamer.EmitInstruction(LEA); - if (is64Bits) { + if (needsPadding) { MCInst prefix; prefix.setOpcode(X86::DATA16_PREFIX); OutStreamer.EmitInstruction(prefix); @@ -609,8 +642,6 @@ static void LowerTlsAddr(MCStreamer &OutStreamer, } void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { - OutStreamer.EmitCodeRegion(); - X86MCInstLower MCInstLowering(Mang, *MF, *this); switch (MI->getOpcode()) { case TargetOpcode::DBG_VALUE: @@ -646,6 +677,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::TLS_addr32: case X86::TLS_addr64: + case X86::TLS_base_addr32: + case X86::TLS_base_addr64: return 
LowerTlsAddr(OutStreamer, MCInstLowering, *MI); case X86::MOVPC32r: { @@ -715,4 +748,3 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { MCInstLowering.Lower(MI, TmpInst); OutStreamer.EmitInstruction(TmpInst); } - diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h index c747109..f83a525 100644 --- a/lib/Target/X86/X86MachineFunctionInfo.h +++ b/lib/Target/X86/X86MachineFunctionInfo.h @@ -66,6 +66,8 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// ArgumentStackSize - The number of bytes on stack consumed by the arguments /// being passed on the stack. unsigned ArgumentStackSize; + /// NumLocalDynamics - Number of local-dynamic TLS accesses. + unsigned NumLocalDynamics; public: X86MachineFunctionInfo() : ForceFramePointer(false), @@ -79,7 +81,8 @@ public: RegSaveFrameIndex(0), VarArgsGPOffset(0), VarArgsFPOffset(0), - ArgumentStackSize(0) {} + ArgumentStackSize(0), + NumLocalDynamics(0) {} explicit X86MachineFunctionInfo(MachineFunction &MF) : ForceFramePointer(false), @@ -93,8 +96,9 @@ public: RegSaveFrameIndex(0), VarArgsGPOffset(0), VarArgsFPOffset(0), - ArgumentStackSize(0) {} - + ArgumentStackSize(0), + NumLocalDynamics(0) {} + bool getForceFramePointer() const { return ForceFramePointer;} void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } @@ -130,6 +134,10 @@ public: unsigned getArgumentStackSize() const { return ArgumentStackSize; } void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; } + + unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; } + void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; } + }; } // End llvm namespace diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index b56025f..acf53f8 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -50,6 +50,10 @@ ForceStackAlign("force-align-stack", " needed for the function."), cl::init(false), cl::Hidden); +cl::opt<bool> +EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), + cl::desc("Enable use of a base pointer for complex stack frames")); + X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii) : X86GenRegisterInfo(tm.getSubtarget<X86Subtarget>().is64Bit() @@ -68,10 +72,12 @@ X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm, SlotSize = 8; StackPtr = X86::RSP; FramePtr = X86::RBP; + BasePtr = X86::RBX; } else { SlotSize = 4; StackPtr = X86::ESP; FramePtr = X86::EBP; + BasePtr = X86::EBX; } } @@ -90,6 +96,12 @@ int X86RegisterInfo::getCompactUnwindRegNum(unsigned RegNum, bool isEH) const { return -1; } +bool +X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { + // Only enable liveness tracking when post-RA scheduling is enabled, since + // that is the only pass that needs it. + return TM.getSubtargetImpl()->postRAScheduler(); +} + int X86RegisterInfo::getSEHRegNum(unsigned i) const { int reg = X86_MC::getX86RegNum(i);
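For context on the TLS_base_addr32/64 pseudo-instructions and the VK_TLSLD/VK_TLSLDM/VK_DTPOFF variant kinds wired up above: under the local-dynamic TLS model, several thread-local reads in one function can share a single __tls_get_addr call that fetches the module's TLS base, after which each variable is addressed at a fixed @DTPOFF offset. A hedged source-level sketch (variable and function names are invented; exact codegen depends on target and relocation model):

    // Compile with something like: clang++ -fPIC -ftls-model=local-dynamic -S
    __thread int counter;
    __thread int limit;

    int remaining() {
      // One TLS_base_addr-style call can resolve the module TLS base here;
      // 'counter' and 'limit' are then loaded at counter@DTPOFF and
      // limit@DTPOFF relative to that base.
      return limit - counter;
    }

The NumLocalDynamics counter added to X86MachineFunctionInfo below presumably gives later passes a cheap signal for whether materializing a shared TLS base is worth it in a given function.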
@@ -146,7 +158,7 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{ // The GR8_NOREX class is always used in a way that won't be constrained to a // sub-class, so sub-classes like GR8_ABCD_L are allowed to expand to the // full GR8 class. - if (RC == X86::GR8_NOREXRegisterClass) + if (RC == &X86::GR8_NOREXRegClass) return RC; const TargetRegisterClass *Super = RC; @@ -175,7 +187,8 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{ } const TargetRegisterClass * -X86RegisterInfo::getPointerRegClass(unsigned Kind) const { +X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) + const { switch (Kind) { default: llvm_unreachable("Unexpected Kind in getPointerRegClass!"); case 0: // Normal GPRs. @@ -238,7 +251,7 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { } if (ghcCall) - return CSR_Ghc_SaveList; + return CSR_NoRegs_SaveList; if (Is64Bit) { if (IsWin64) return CSR_Win64_SaveList; @@ -254,7 +267,7 @@ const uint32_t* X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { if (CC == CallingConv::GHC) - return CSR_Ghc_RegMask; + return CSR_NoRegs_RegMask; if (!Is64Bit) return CSR_32_RegMask; if (IsWin64) @@ -268,21 +281,33 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Set the stack-pointer register and its aliases as reserved. Reserved.set(X86::RSP); - Reserved.set(X86::ESP); - Reserved.set(X86::SP); - Reserved.set(X86::SPL); + for (MCSubRegIterator I(X86::RSP, this); I.isValid(); ++I) + Reserved.set(*I); // Set the instruction pointer register and its aliases as reserved. Reserved.set(X86::RIP); - Reserved.set(X86::EIP); - Reserved.set(X86::IP); + for (MCSubRegIterator I(X86::RIP, this); I.isValid(); ++I) + Reserved.set(*I); // Set the frame-pointer register and its aliases as reserved if needed. if (TFI->hasFP(MF)) { Reserved.set(X86::RBP); - Reserved.set(X86::EBP); - Reserved.set(X86::BP); - Reserved.set(X86::BPL); + for (MCSubRegIterator I(X86::RBP, this); I.isValid(); ++I) + Reserved.set(*I); + } + + // Set the base-pointer register and its aliases as reserved if needed. + if (hasBasePointer(MF)) { + CallingConv::ID CC = MF.getFunction()->getCallingConv(); + const uint32_t* RegMask = getCallPreservedMask(CC); + if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister())) + report_fatal_error( + "Stack realignment in presence of dynamic allocas is not supported with " + "this calling convention."); + + Reserved.set(getBaseRegister()); + for (MCSubRegIterator I(getBaseRegister(), this); I.isValid(); ++I) + Reserved.set(*I); } // Mark the segment registers as reserved. @@ -293,6 +318,16 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(X86::FS); Reserved.set(X86::GS); + // Mark the floating point stack registers as reserved. + Reserved.set(X86::ST0); + Reserved.set(X86::ST1); + Reserved.set(X86::ST2); + Reserved.set(X86::ST3); + Reserved.set(X86::ST4); + Reserved.set(X86::ST5); + Reserved.set(X86::ST6); + Reserved.set(X86::ST7); + // Reserve the registers that only exist in 64-bit mode. if (!Is64Bit) { // These 8-bit registers are part of the x86-64 extension even though their @@ -308,14 +343,13 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { X86::R8, X86::R9, X86::R10, X86::R11, X86::R12, X86::R13, X86::R14, X86::R15 }; - for (const uint16_t *AI = getOverlaps(GPR64[n]); unsigned Reg = *AI; ++AI) - Reserved.set(Reg); + for (MCRegAliasIterator AI(GPR64[n], this, true); AI.isValid(); ++AI) + Reserved.set(*AI);
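The getReservedRegs() rewrite above swaps the hand-maintained alias lists (ESP/SP/SPL and friends) for the new MC register iterators. A minimal sketch of the pattern outside any pass (the helper name is invented; MCRegAliasIterator with IncludeSelf=true visits the register itself plus everything overlapping it):

    #include "llvm/ADT/BitVector.h"
    #include "llvm/MC/MCRegisterInfo.h"

    // Reserve Reg and every register that aliases it, so reserving RSP
    // also reserves ESP, SP and SPL without spelling them out.
    static void reserveWithAliases(unsigned Reg, const llvm::MCRegisterInfo *MCRI,
                                   llvm::BitVector &Reserved) {
      for (llvm::MCRegAliasIterator AI(Reg, MCRI, /*IncludeSelf=*/true);
           AI.isValid(); ++AI)
        Reserved.set(*AI);
    }

MCSubRegIterator is the narrower variant used for RSP/RIP/RBP above: it walks only sub-registers, which is enough there because those registers are set explicitly first and have no super-registers.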
// XMM8, XMM9, ... assert(X86::XMM15 == X86::XMM8+7); - for (const uint16_t *AI = getOverlaps(X86::XMM8 + n); unsigned Reg = *AI; - ++AI) - Reserved.set(Reg); + for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI) + Reserved.set(*AI); } } @@ -326,10 +360,36 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Stack Frame Processing methods //===----------------------------------------------------------------------===// +bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + if (!EnableBasePointer) + return false; + + // When we need stack realignment and there are dynamic allocas, we can't + // reference off of the stack pointer, so we reserve a base pointer. + if (needsStackRealignment(MF) && MFI->hasVarSizedObjects()) + return true; + + return false; +} + bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); - return (MF.getTarget().Options.RealignStack && - !MFI->hasVarSizedObjects()); + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + if (!MF.getTarget().Options.RealignStack) + return false; + + // Stack realignment requires a frame pointer. If we already started + // register allocation with frame pointer elimination, it is too late now. + if (!MRI->canReserveReg(FramePtr)) + return false; + + // If a base pointer is necessary, check that it isn't too late to reserve + // it. + if (MFI->hasVarSizedObjects()) + return MRI->canReserveReg(BasePtr); + return true; } bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { @@ -339,13 +399,6 @@ bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || F->hasFnAttr(Attribute::StackAlignment)); - // FIXME: Currently we don't support stack realignment for functions with - // variable-sized allocas. - // FIXME: It's more complicated than this... - if (0 && requiresRealignment && MFI->hasVarSizedObjects()) - report_fatal_error( - "Stack realignment in presence of dynamic allocas is not supported"); - // If we've requested that we force align the stack do so now. if (ForceStackAlign) return canRealignStack(MF); @@ -485,7 +538,9 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned Opc = MI.getOpcode(); bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm; - if (needsStackRealignment(MF)) + if (hasBasePointer(MF)) + BasePtr = (FrameIndex < 0 ? FramePtr : getBaseRegister()); + else if (needsStackRealignment(MF)) BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr); else if (AfterFPPop) BasePtr = StackPtr; diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index bee0393..1bc32cb 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -50,6 +50,11 @@ private: /// unsigned FramePtr; + /// BasePtr - X86 physical register used as a base pointer in complex stack + /// frames, i.e., when we need a third base register, not just SP and FP, + /// due to variable-sized stack objects. + unsigned BasePtr; + public: X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii); @@ -65,7 +70,8 @@ public: int getCompactUnwindRegNum(unsigned RegNum, bool isEH) const;
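To make hasBasePointer() concrete, here is a hedged sketch of the kind of user code that now takes the base-pointer path (names are invented; assumes a glibc-style alloca.h). The over-aligned local forces the prologue to realign RSP, and the alloca then moves RSP at run time; the realigned locals cannot be addressed off RBP (which keeps the unaligned incoming frame), so RBX/EBX is reserved as a third anchor:

    #include <alloca.h>
    #include <cstring>

    void consume(void *fixed, void *dynamic);

    void realigned_with_alloca(unsigned n) {
      alignas(32) char big[64];   // alignment above the default stack alignment
      void *dyn = alloca(n);      // dynamically sized object: RSP varies at run time
      std::memset(big, 0, sizeof big);
      consume(big, dyn);
    }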
/// Code Generation virtual methods... - /// + /// + virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const; /// getMatchingSuperRegClass - Return a subclass of the specified register /// class A so that each register in it has a sub-register of the @@ -82,7 +88,8 @@ public: /// getPointerRegClass - Returns a TargetRegisterClass used for pointer /// values. - const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const; + const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const; /// getCrossCopyRegClass - Returns a legal register class to copy a register /// in the specified class to or from. Returns NULL if it is possible to copy @@ -104,6 +111,8 @@ public: /// register scavenger to determine what registers are free. BitVector getReservedRegs(const MachineFunction &MF) const; + bool hasBasePointer(const MachineFunction &MF) const; + bool canRealignStack(const MachineFunction &MF) const; bool needsStackRealignment(const MachineFunction &MF) const; @@ -121,6 +130,7 @@ public: // Debug information queries. unsigned getFrameRegister(const MachineFunction &MF) const; unsigned getStackRegister() const { return StackPtr; } + unsigned getBaseRegister() const { return BasePtr; } - // FIXME: Move to FrameInfok + // FIXME: Move to FrameInfo unsigned getSlotSize() const { return SlotSize; } diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index 5263a49..ae2d4d0 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -223,6 +223,9 @@ let Namespace = "X86" in { def ST6 : STRegister<"st(6)", [FP1]>, DwarfRegNum<[39, 18, 17]>; def ST7 : STRegister<"st(7)", [FP0]>, DwarfRegNum<[40, 19, 18]>; + // Floating-point status word + def FPSW : Register<"fpsw">; + // Status flags register def EFLAGS : Register<"flags">; @@ -296,26 +299,18 @@ def GR8 : RegisterClass<"X86", [i8], 8, def GR16 : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, SI, DI, BX, BP, SP, - R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)> { - let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi)]; -} + R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)>; def GR32 : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP, - R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)> { - let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)]; -} + R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)>; // GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since // RIP isn't really a register and it can't be used anywhere except in an // address, but it doesn't cause trouble. def GR64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, - RBX, R14, R15, R12, R13, RBP, RSP, RIP)> { - let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), - (GR16 sub_16bit), - (GR32 sub_32bit)]; -} + RBX, R14, R15, R12, R13, RBP, RSP, RIP)>; // Segment registers for use by MOV instructions (and others) that have a // segment register as one operand. Always contain a 16-bit segment @@ -336,30 +331,12 @@ def CONTROL_REG : RegisterClass<"X86", [i64], 64, (sequence "CR%u", 0, 15)>; // operations.
def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, BL)>; def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, (add AH, CH, DH, BH)>; -def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)> { - let SubRegClasses = [(GR8_ABCD_L sub_8bit), (GR8_ABCD_H sub_8bit_hi)]; -} -def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)> { - let SubRegClasses = [(GR8_ABCD_L sub_8bit), - (GR8_ABCD_H sub_8bit_hi), - (GR16_ABCD sub_16bit)]; -} -def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)> { - let SubRegClasses = [(GR8_ABCD_L sub_8bit), - (GR8_ABCD_H sub_8bit_hi), - (GR16_ABCD sub_16bit), - (GR32_ABCD sub_32bit)]; -} -def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)> { - let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)]; -} +def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)>; +def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)>; +def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)>; +def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>; def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, - R8, R9, R11, RIP)> { - let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), - (GR16 sub_16bit), - (GR32_TC sub_32bit)]; -} - + R8, R9, R11, RIP)>; def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, R8, R9, R11)>; @@ -373,64 +350,36 @@ def GR8_NOREX : RegisterClass<"X86", [i8], 8, } // GR16_NOREX - GR16 registers which do not require a REX prefix. def GR16_NOREX : RegisterClass<"X86", [i16], 16, - (add AX, CX, DX, SI, DI, BX, BP, SP)> { - let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi)]; -} + (add AX, CX, DX, SI, DI, BX, BP, SP)>; // GR32_NOREX - GR32 registers which do not require a REX prefix. def GR32_NOREX : RegisterClass<"X86", [i32], 32, - (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)> { - let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), - (GR16_NOREX sub_16bit)]; -} + (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)>; // GR64_NOREX - GR64 registers which do not require a REX prefix. def GR64_NOREX : RegisterClass<"X86", [i64], 64, - (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)> { - let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), - (GR16_NOREX sub_16bit), - (GR32_NOREX sub_32bit)]; -} + (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)>; // GR32_NOAX - GR32 registers except EAX. Used by AddRegFrm of XCHG32 in 64-bit // mode to prevent encoding using the 0x90 NOP encoding. xchg %eax, %eax needs // to clear upper 32-bits of RAX so is not a NOP. -def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)> { - let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)]; -} +def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)>; // GR32_NOSP - GR32 registers except ESP. -def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)> { - let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)]; -} +def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)>; // GR64_NOSP - GR64 registers except RSP (and RIP). -def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)> { - let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), - (GR16 sub_16bit), - (GR32_NOSP sub_32bit)]; -} +def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)>; // GR32_NOREX_NOSP - GR32 registers which do not require a REX prefix except // ESP. 
def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32, - (and GR32_NOREX, GR32_NOSP)> { - let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), - (GR16_NOREX sub_16bit)]; -} + (and GR32_NOREX, GR32_NOSP)>; // GR64_NOREX_NOSP - GR64_NOREX registers except RSP. def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64, - (and GR64_NOREX, GR64_NOSP)> { - let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), - (GR16_NOREX sub_16bit), - (GR32_NOREX_NOSP sub_32bit)]; -} + (and GR64_NOREX, GR64_NOSP)>; // A class to support the 'A' assembler constraint: EAX then EDX. -def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)> { - let SubRegClasses = [(GR8_ABCD_L sub_8bit), - (GR8_ABCD_H sub_8bit_hi), - (GR16_ABCD sub_16bit)]; -} +def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)>; // Scalar SSE2 floating point registers. def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>; @@ -458,17 +407,16 @@ def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> { // Generic vector registers: VR64 and VR128. def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>; def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - 128, (add FR32)> { - let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd)]; -} - + 128, (add FR32)>; def VR256 : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - 256, (sequence "YMM%u", 0, 15)> { - let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd), (VR128 sub_xmm)]; -} + 256, (sequence "YMM%u", 0, 15)>; // Status flags registers. def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> { let CopyCost = -1; // Don't allow copying of status registers. let isAllocatable = 0; } +def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> { + let CopyCost = -1; // Don't allow copying of status registers. 
+ let isAllocatable = 0; +} diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index 17f4efd..c14407f 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// Instruction Itinerary classes used for X86 +// Instruction Itinerary classes used for X86 def IIC_DEFAULT : InstrItinClass; def IIC_ALU_MEM : InstrItinClass; def IIC_ALU_NONMEM : InstrItinClass; @@ -253,6 +253,42 @@ def IIC_SSE_CVT_SS2SI64_RR : InstrItinClass; def IIC_SSE_CVT_SD2SI_RM : InstrItinClass; def IIC_SSE_CVT_SD2SI_RR : InstrItinClass; +// MMX +def IIC_MMX_MOV_MM_RM : InstrItinClass; +def IIC_MMX_MOV_REG_MM : InstrItinClass; +def IIC_MMX_MOVQ_RM : InstrItinClass; +def IIC_MMX_MOVQ_RR : InstrItinClass; + +def IIC_MMX_ALU_RM : InstrItinClass; +def IIC_MMX_ALU_RR : InstrItinClass; +def IIC_MMX_ALUQ_RM : InstrItinClass; +def IIC_MMX_ALUQ_RR : InstrItinClass; +def IIC_MMX_PHADDSUBW_RM : InstrItinClass; +def IIC_MMX_PHADDSUBW_RR : InstrItinClass; +def IIC_MMX_PHADDSUBD_RM : InstrItinClass; +def IIC_MMX_PHADDSUBD_RR : InstrItinClass; +def IIC_MMX_PMUL : InstrItinClass; +def IIC_MMX_MISC_FUNC_MEM : InstrItinClass; +def IIC_MMX_MISC_FUNC_REG : InstrItinClass; +def IIC_MMX_PSADBW : InstrItinClass; +def IIC_MMX_SHIFT_RI : InstrItinClass; +def IIC_MMX_SHIFT_RM : InstrItinClass; +def IIC_MMX_SHIFT_RR : InstrItinClass; +def IIC_MMX_UNPCK_H_RM : InstrItinClass; +def IIC_MMX_UNPCK_H_RR : InstrItinClass; +def IIC_MMX_UNPCK_L : InstrItinClass; +def IIC_MMX_PCK_RM : InstrItinClass; +def IIC_MMX_PCK_RR : InstrItinClass; +def IIC_MMX_PSHUF : InstrItinClass; +def IIC_MMX_PEXTR : InstrItinClass; +def IIC_MMX_PINSRW : InstrItinClass; +def IIC_MMX_MASKMOV : InstrItinClass; + +def IIC_MMX_CVT_PD_RR : InstrItinClass; +def IIC_MMX_CVT_PD_RM : InstrItinClass; +def IIC_MMX_CVT_PS_RR : InstrItinClass; +def IIC_MMX_CVT_PS_RM : InstrItinClass; + def IIC_CMPX_LOCK : InstrItinClass; def IIC_CMPX_LOCK_8 : InstrItinClass; def IIC_CMPX_LOCK_8B : InstrItinClass; @@ -261,13 +297,185 @@ def IIC_CMPX_LOCK_16B : InstrItinClass; def IIC_XADD_LOCK_MEM : InstrItinClass; def IIC_XADD_LOCK_MEM8 : InstrItinClass; +def IIC_FILD : InstrItinClass; +def IIC_FLD : InstrItinClass; +def IIC_FLD80 : InstrItinClass; +def IIC_FST : InstrItinClass; +def IIC_FST80 : InstrItinClass; +def IIC_FIST : InstrItinClass; +def IIC_FLDZ : InstrItinClass; +def IIC_FUCOM : InstrItinClass; +def IIC_FUCOMI : InstrItinClass; +def IIC_FCOMI : InstrItinClass; +def IIC_FNSTSW : InstrItinClass; +def IIC_FNSTCW : InstrItinClass; +def IIC_FLDCW : InstrItinClass; +def IIC_FNINIT : InstrItinClass; +def IIC_FFREE : InstrItinClass; +def IIC_FNCLEX : InstrItinClass; +def IIC_WAIT : InstrItinClass; +def IIC_FXAM : InstrItinClass; +def IIC_FNOP : InstrItinClass; +def IIC_FLDL : InstrItinClass; +def IIC_F2XM1 : InstrItinClass; +def IIC_FYL2X : InstrItinClass; +def IIC_FPTAN : InstrItinClass; +def IIC_FPATAN : InstrItinClass; +def IIC_FXTRACT : InstrItinClass; +def IIC_FPREM1 : InstrItinClass; +def IIC_FPSTP : InstrItinClass; +def IIC_FPREM : InstrItinClass; +def IIC_FYL2XP1 : InstrItinClass; +def IIC_FSINCOS : InstrItinClass; +def IIC_FRNDINT : InstrItinClass; +def IIC_FSCALE : InstrItinClass; +def IIC_FCOMPP : InstrItinClass; +def IIC_FXSAVE : InstrItinClass; +def IIC_FXRSTOR : InstrItinClass; + +def IIC_FXCH : InstrItinClass; + +// System instructions +def IIC_CPUID : InstrItinClass; 
+def IIC_INT : InstrItinClass; +def IIC_INT3 : InstrItinClass; +def IIC_INVD : InstrItinClass; +def IIC_INVLPG : InstrItinClass; +def IIC_IRET : InstrItinClass; +def IIC_HLT : InstrItinClass; +def IIC_LXS : InstrItinClass; +def IIC_LTR : InstrItinClass; +def IIC_RDTSC : InstrItinClass; +def IIC_RSM : InstrItinClass; +def IIC_SIDT : InstrItinClass; +def IIC_SGDT : InstrItinClass; +def IIC_SLDT : InstrItinClass; +def IIC_STR : InstrItinClass; +def IIC_SWAPGS : InstrItinClass; +def IIC_SYSCALL : InstrItinClass; +def IIC_SYS_ENTER_EXIT : InstrItinClass; +def IIC_IN_RR : InstrItinClass; +def IIC_IN_RI : InstrItinClass; +def IIC_OUT_RR : InstrItinClass; +def IIC_OUT_IR : InstrItinClass; +def IIC_INS : InstrItinClass; +def IIC_MOV_REG_DR : InstrItinClass; +def IIC_MOV_DR_REG : InstrItinClass; +def IIC_MOV_REG_CR : InstrItinClass; +def IIC_MOV_CR_REG : InstrItinClass; +def IIC_MOV_REG_SR : InstrItinClass; +def IIC_MOV_MEM_SR : InstrItinClass; +def IIC_MOV_SR_REG : InstrItinClass; +def IIC_MOV_SR_MEM : InstrItinClass; +def IIC_LAR_RM : InstrItinClass; +def IIC_LAR_RR : InstrItinClass; +def IIC_LSL_RM : InstrItinClass; +def IIC_LSL_RR : InstrItinClass; +def IIC_LGDT : InstrItinClass; +def IIC_LIDT : InstrItinClass; +def IIC_LLDT_REG : InstrItinClass; +def IIC_LLDT_MEM : InstrItinClass; +def IIC_PUSH_CS : InstrItinClass; +def IIC_PUSH_SR : InstrItinClass; +def IIC_POP_SR : InstrItinClass; +def IIC_POP_SR_SS : InstrItinClass; +def IIC_VERR : InstrItinClass; +def IIC_VERW_REG : InstrItinClass; +def IIC_VERW_MEM : InstrItinClass; +def IIC_WRMSR : InstrItinClass; +def IIC_RDMSR : InstrItinClass; +def IIC_RDPMC : InstrItinClass; +def IIC_SMSW : InstrItinClass; +def IIC_LMSW_REG : InstrItinClass; +def IIC_LMSW_MEM : InstrItinClass; +def IIC_ENTER : InstrItinClass; +def IIC_LEAVE : InstrItinClass; +def IIC_POP_MEM : InstrItinClass; +def IIC_POP_REG16 : InstrItinClass; +def IIC_POP_REG : InstrItinClass; +def IIC_POP_F : InstrItinClass; +def IIC_POP_FD : InstrItinClass; +def IIC_POP_A : InstrItinClass; +def IIC_PUSH_IMM : InstrItinClass; +def IIC_PUSH_MEM : InstrItinClass; +def IIC_PUSH_REG : InstrItinClass; +def IIC_PUSH_F : InstrItinClass; +def IIC_PUSH_A : InstrItinClass; +def IIC_BSWAP : InstrItinClass; +def IIC_BSF : InstrItinClass; +def IIC_BSR : InstrItinClass; +def IIC_MOVS : InstrItinClass; +def IIC_STOS : InstrItinClass; +def IIC_SCAS : InstrItinClass; +def IIC_CMPS : InstrItinClass; +def IIC_MOV : InstrItinClass; +def IIC_MOV_MEM : InstrItinClass; +def IIC_AHF : InstrItinClass; +def IIC_BT_MI : InstrItinClass; +def IIC_BT_MR : InstrItinClass; +def IIC_BT_RI : InstrItinClass; +def IIC_BT_RR : InstrItinClass; +def IIC_BTX_MI : InstrItinClass; +def IIC_BTX_MR : InstrItinClass; +def IIC_BTX_RI : InstrItinClass; +def IIC_BTX_RR : InstrItinClass; +def IIC_XCHG_REG : InstrItinClass; +def IIC_XCHG_MEM : InstrItinClass; +def IIC_XADD_REG : InstrItinClass; +def IIC_XADD_MEM : InstrItinClass; +def IIC_CMPXCHG_MEM : InstrItinClass; +def IIC_CMPXCHG_REG : InstrItinClass; +def IIC_CMPXCHG_MEM8 : InstrItinClass; +def IIC_CMPXCHG_REG8 : InstrItinClass; +def IIC_CMPXCHG_8B : InstrItinClass; +def IIC_CMPXCHG_16B : InstrItinClass; +def IIC_LODS : InstrItinClass; +def IIC_OUTS : InstrItinClass; +def IIC_CLC : InstrItinClass; +def IIC_CLD : InstrItinClass; +def IIC_CLI : InstrItinClass; +def IIC_CMC : InstrItinClass; +def IIC_CLTS : InstrItinClass; +def IIC_STC : InstrItinClass; +def IIC_STI : InstrItinClass; +def IIC_STD : InstrItinClass; +def IIC_XLAT : InstrItinClass; +def IIC_AAA : InstrItinClass; +def IIC_AAD : 
InstrItinClass; +def IIC_AAM : InstrItinClass; +def IIC_AAS : InstrItinClass; +def IIC_DAA : InstrItinClass; +def IIC_DAS : InstrItinClass; +def IIC_BOUND : InstrItinClass; +def IIC_ARPL_REG : InstrItinClass; +def IIC_ARPL_MEM : InstrItinClass; +def IIC_MOVBE : InstrItinClass; + +def IIC_NOP : InstrItinClass; //===----------------------------------------------------------------------===// // Processor instruction itineraries. -def GenericItineraries : ProcessorItineraries<[], [], []>; +// IssueWidth is analogous to the number of decode units. Core and its +// descendants, including Nehalem and SandyBridge, have 4 decoders. +// Resources beyond the decoder operate on micro-ops and are buffered +// so adjacent micro-ops don't directly compete. +// +// MinLatency=0 indicates that RAW dependencies can be decoded in the +// same cycle. +// +// HighLatency=10 is optimistic. X86InstrInfo::isHighLatencyDef +// indicates high latency opcodes. Alternatively, InstrItinData +// entries may be included here to define specific operand +// latencies. Since these latencies are not used for pipeline hazards, +// they do not need to be exact. +// +// The GenericModel contains no instruction itineraries. +def GenericModel : SchedMachineModel { + let IssueWidth = 4; + let MinLatency = 0; + let LoadLatency = 4; + let HighLatency = 10; +} include "X86ScheduleAtom.td" - - -
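The comment block above is effectively the contract for the four numbers in GenericModel. As a hedged sketch of how a consumer might act on them (this assumes the MCSchedModel generated from SchedMachineModel exposes public fields with these names, as in this era of LLVM; the function itself is invented):

    #include "llvm/MC/MCSchedule.h"

    // Crude latency estimate in the spirit of the comment above: loads pay
    // LoadLatency, opcodes flagged by X86InstrInfo::isHighLatencyDef pay
    // HighLatency, and everything else pays MinLatency (0 means a dependent
    // instruction may be decoded in the same cycle).
    static unsigned guessLatency(const llvm::MCSchedModel &SM,
                                 bool IsLoad, bool IsHighLatencyDef) {
      if (IsHighLatencyDef)
        return SM.HighLatency;
      if (IsLoad)
        return SM.LoadLatency;
      return SM.MinLatency;
    }

Since these values feed heuristics rather than pipeline hazard checks, rough numbers are acceptable, which is exactly the point the comment makes.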
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index 77d4e56..8710261 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -106,7 +106,7 @@ def AtomItineraries : ProcessorItineraries< InstrItinData<IIC_CMOV64_RM, [InstrStage<1, [Port0]>] >, InstrItinData<IIC_CMOV64_RR, [InstrStage<1, [Port0, Port1]>] >, // set - InstrItinData<IIC_SET_M, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_SET_M, [InstrStage<2, [Port0, Port1]>] >, InstrItinData<IIC_SET_R, [InstrStage<1, [Port0, Port1]>] >, // jcc InstrItinData<IIC_Jcc, [InstrStage<1, [Port1]>] >, @@ -294,12 +294,237 @@ def AtomItineraries : ProcessorItineraries< InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<8, [Port0, Port1]>] >, InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<9, [Port0, Port1]>] >, + // MMX MOVs + InstrItinData<IIC_MMX_MOV_MM_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_MOV_REG_MM, [InstrStage<3, [Port0]>] >, + InstrItinData<IIC_MMX_MOVQ_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >, + // other MMX + InstrItinData<IIC_MMX_ALU_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_ALU_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_ALUQ_RM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_ALUQ_RR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PHADDSUBW_RM, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PHADDSUBW_RR, [InstrStage<5, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PMUL, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_MMX_MISC_FUNC_MEM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_MISC_FUNC_REG, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PSADBW, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_SHIFT_RI, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_SHIFT_RM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_SHIFT_RR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_UNPCK_H_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_UNPCK_H_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_UNPCK_L, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_PCK_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_PCK_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PSHUF, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_PEXTR, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PINSRW, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [Port0]>] >, + // conversions + // from/to PD + InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >, + // from/to PI + InstrItinData<IIC_MMX_CVT_PS_RR, [InstrStage<5, [Port1]>] >, + InstrItinData<IIC_MMX_CVT_PS_RM, [InstrStage<5, [Port0], 0>, + InstrStage<5, [Port1]>]>, + InstrItinData<IIC_CMPX_LOCK, [InstrStage<14, [Port0, Port1]>] >, InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<6, [Port0, Port1]>] >, InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<18, [Port0, Port1]>] >, InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<22, [Port0, Port1]>] >, InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<2, [Port0, Port1]>] >, - InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<3, [Port0, Port1]>] > + InstrItinData<IIC_XADD_LOCK_MEM8, [InstrStage<3, [Port0, Port1]>] >, + + InstrItinData<IIC_FILD, [InstrStage<5, [Port0], 0>, InstrStage<5, [Port1]>] >, + InstrItinData<IIC_FLD, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_FLD80, [InstrStage<4, [Port0, Port1]>] >, + + InstrItinData<IIC_FST, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_FST80, [InstrStage<5, [Port0, Port1]>] >, + InstrItinData<IIC_FIST, [InstrStage<6, [Port0, Port1]>] >, + + InstrItinData<IIC_FLDZ, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_FUCOM, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_FUCOMI, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_FCOMI, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_FNSTSW, [InstrStage<10, [Port0, Port1]>] >, + InstrItinData<IIC_FNSTCW, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_FLDCW, [InstrStage<5, [Port0, Port1]>] >, + InstrItinData<IIC_FNINIT, [InstrStage<63, [Port0, Port1]>] >, + InstrItinData<IIC_FFREE, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_FNCLEX, [InstrStage<25, [Port0, Port1]>] >, + InstrItinData<IIC_WAIT, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_FXAM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_FNOP, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_FLDL, [InstrStage<10, [Port0, Port1]>] >, + InstrItinData<IIC_F2XM1, [InstrStage<99, [Port0, Port1]>] >, + InstrItinData<IIC_FYL2X, [InstrStage<146, [Port0, Port1]>] >, + InstrItinData<IIC_FPTAN, [InstrStage<168, [Port0, Port1]>] >, + InstrItinData<IIC_FPATAN, [InstrStage<183, [Port0, Port1]>] >, + InstrItinData<IIC_FXTRACT, [InstrStage<25, [Port0, Port1]>] >, + InstrItinData<IIC_FPREM1, [InstrStage<71, [Port0, Port1]>] >, + InstrItinData<IIC_FPSTP, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_FPREM, [InstrStage<55, [Port0, Port1]>] >, + InstrItinData<IIC_FYL2XP1, [InstrStage<147, [Port0, Port1]>] >, + InstrItinData<IIC_FSINCOS, [InstrStage<174, [Port0, Port1]>] >, + InstrItinData<IIC_FRNDINT, [InstrStage<46, [Port0, Port1]>] >, + InstrItinData<IIC_FSCALE, [InstrStage<77, [Port0, Port1]>] >, + InstrItinData<IIC_FCOMPP, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_FXSAVE, [InstrStage<140, [Port0, Port1]>] >, + InstrItinData<IIC_FXRSTOR,
[InstrStage<141, [Port0, Port1]>] >, + InstrItinData<IIC_FXCH, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >, + + // System instructions + InstrItinData<IIC_CPUID, [InstrStage<121, [Port0, Port1]>] >, + InstrItinData<IIC_INT, [InstrStage<127, [Port0, Port1]>] >, + InstrItinData<IIC_INT3, [InstrStage<130, [Port0, Port1]>] >, + InstrItinData<IIC_INVD, [InstrStage<1003, [Port0, Port1]>] >, + InstrItinData<IIC_INVLPG, [InstrStage<71, [Port0, Port1]>] >, + InstrItinData<IIC_IRET, [InstrStage<109, [Port0, Port1]>] >, + InstrItinData<IIC_HLT, [InstrStage<121, [Port0, Port1]>] >, + InstrItinData<IIC_LXS, [InstrStage<10, [Port0, Port1]>] >, + InstrItinData<IIC_LTR, [InstrStage<83, [Port0, Port1]>] >, + InstrItinData<IIC_RDTSC, [InstrStage<30, [Port0, Port1]>] >, + InstrItinData<IIC_RSM, [InstrStage<741, [Port0, Port1]>] >, + InstrItinData<IIC_SIDT, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SGDT, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SLDT, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_STR, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_SWAPGS, [InstrStage<22, [Port0, Port1]>] >, + InstrItinData<IIC_SYSCALL, [InstrStage<96, [Port0, Port1]>] >, + InstrItinData<IIC_SYS_ENTER_EXIT, [InstrStage<88, [Port0, Port1]>] >, + + InstrItinData<IIC_IN_RR, [InstrStage<94, [Port0, Port1]>] >, + InstrItinData<IIC_IN_RI, [InstrStage<92, [Port0, Port1]>] >, + InstrItinData<IIC_OUT_RR, [InstrStage<68, [Port0, Port1]>] >, + InstrItinData<IIC_OUT_IR, [InstrStage<72, [Port0, Port1]>] >, + InstrItinData<IIC_INS, [InstrStage<59, [Port0, Port1]>] >, + + InstrItinData<IIC_MOV_REG_DR, [InstrStage<88, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_DR_REG, [InstrStage<123, [Port0, Port1]>] >, + // worst case for mov REG_CRx + InstrItinData<IIC_MOV_REG_CR, [InstrStage<12, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_CR_REG, [InstrStage<136, [Port0, Port1]>] >, + + InstrItinData<IIC_MOV_REG_SR, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MOV_MEM_SR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_SR_REG, [InstrStage<21, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_SR_MEM, [InstrStage<26, [Port0, Port1]>] >, + // LAR + InstrItinData<IIC_LAR_RM, [InstrStage<50, [Port0, Port1]>] >, + InstrItinData<IIC_LAR_RR, [InstrStage<54, [Port0, Port1]>] >, + // LSL + InstrItinData<IIC_LSL_RM, [InstrStage<46, [Port0, Port1]>] >, + InstrItinData<IIC_LSL_RR, [InstrStage<49, [Port0, Port1]>] >, + + InstrItinData<IIC_LGDT, [InstrStage<44, [Port0, Port1]>] >, + InstrItinData<IIC_LIDT, [InstrStage<44, [Port0, Port1]>] >, + InstrItinData<IIC_LLDT_REG, [InstrStage<60, [Port0, Port1]>] >, + InstrItinData<IIC_LLDT_MEM, [InstrStage<64, [Port0, Port1]>] >, + // push control register, segment registers + InstrItinData<IIC_PUSH_CS, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_PUSH_SR, [InstrStage<2, [Port0, Port1]>] >, + // pop control register, segment registers + InstrItinData<IIC_POP_SR, [InstrStage<29, [Port0, Port1]>] >, + InstrItinData<IIC_POP_SR_SS, [InstrStage<48, [Port0, Port1]>] >, + // VERR, VERW + InstrItinData<IIC_VERR, [InstrStage<41, [Port0, Port1]>] >, + InstrItinData<IIC_VERW_REG, [InstrStage<51, [Port0, Port1]>] >, + InstrItinData<IIC_VERW_MEM, [InstrStage<50, [Port0, Port1]>] >, + // WRMSR, RDMSR + InstrItinData<IIC_WRMSR, [InstrStage<202, [Port0, Port1]>] >, + InstrItinData<IIC_RDMSR, [InstrStage<78, [Port0, Port1]>] >, + InstrItinData<IIC_RDPMC, [InstrStage<46, [Port0, Port1]>] >, + // SMSW, LMSW + InstrItinData<IIC_SMSW, [InstrStage<9, [Port0, Port1]>] >, + 
InstrItinData<IIC_LMSW_REG, [InstrStage<69, [Port0, Port1]>] >, + InstrItinData<IIC_LMSW_MEM, [InstrStage<67, [Port0, Port1]>] >, + + InstrItinData<IIC_ENTER, [InstrStage<32, [Port0, Port1]>] >, + InstrItinData<IIC_LEAVE, [InstrStage<2, [Port0, Port1]>] >, + + InstrItinData<IIC_POP_MEM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_POP_REG16, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_POP_REG, [InstrStage<1, [Port0], 0>, + InstrStage<1, [Port1]>] >, + InstrItinData<IIC_POP_F, [InstrStage<32, [Port0, Port1]>] >, + InstrItinData<IIC_POP_FD, [InstrStage<26, [Port0, Port1]>] >, + InstrItinData<IIC_POP_A, [InstrStage<9, [Port0, Port1]>] >, + + InstrItinData<IIC_PUSH_IMM, [InstrStage<1, [Port0], 0>, + InstrStage<1, [Port1]>] >, + InstrItinData<IIC_PUSH_MEM, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_PUSH_REG, [InstrStage<1, [Port0], 0>, + InstrStage<1, [Port1]>] >, + InstrItinData<IIC_PUSH_F, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_PUSH_A, [InstrStage<8, [Port0, Port1]>] >, + + InstrItinData<IIC_BSWAP, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_BSF, [InstrStage<16, [Port0, Port1]>] >, + InstrItinData<IIC_BSR, [InstrStage<16, [Port0, Port1]>] >, + InstrItinData<IIC_MOVS, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_STOS, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SCAS, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_CMPS, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MOV, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_MEM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_AHF, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_BT_MI, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_BT_MR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_BT_RI, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_BT_RR, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_BTX_MI, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_BTX_MR, [InstrStage<11, [Port0, Port1]>] >, + InstrItinData<IIC_BTX_RI, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_BTX_RR, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_XCHG_REG, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_XCHG_MEM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_XADD_REG, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_XADD_MEM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_MEM, [InstrStage<14, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_REG, [InstrStage<15, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_MEM8, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_REG8, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_8B, [InstrStage<18, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_16B, [InstrStage<22, [Port0, Port1]>] >, + InstrItinData<IIC_LODS, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_OUTS, [InstrStage<74, [Port0, Port1]>] >, + InstrItinData<IIC_CLC, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_CLD, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_CLI, [InstrStage<14, [Port0, Port1]>] >, + InstrItinData<IIC_CMC, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_CLTS, [InstrStage<33, [Port0, Port1]>] >, + InstrItinData<IIC_STC, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_STI, [InstrStage<17, [Port0, Port1]>] >, + InstrItinData<IIC_STD, [InstrStage<21, [Port0, Port1]>] >, + InstrItinData<IIC_XLAT, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_AAA, [InstrStage<13, [Port0, Port1]>] >, + InstrItinData<IIC_AAD, [InstrStage<7, [Port0, 
Port1]>] >, + InstrItinData<IIC_AAM, [InstrStage<21, [Port0, Port1]>] >, + InstrItinData<IIC_AAS, [InstrStage<13, [Port0, Port1]>] >, + InstrItinData<IIC_DAA, [InstrStage<18, [Port0, Port1]>] >, + InstrItinData<IIC_DAS, [InstrStage<20, [Port0, Port1]>] >, + InstrItinData<IIC_BOUND, [InstrStage<11, [Port0, Port1]>] >, + InstrItinData<IIC_ARPL_REG, [InstrStage<24, [Port0, Port1]>] >, + InstrItinData<IIC_ARPL_MEM, [InstrStage<23, [Port0, Port1]>] >, + InstrItinData<IIC_MOVBE, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_NOP, [InstrStage<1, [Port0, Port1]>] > ]>; +// Atom machine model. +def AtomModel : SchedMachineModel { + let IssueWidth = 2; // Allows 2 instructions per scheduling group. + let MinLatency = 1; // InstrStage cycles overrides MinLatency. + // OperandCycles may be used for expected latency. + let LoadLatency = 3; // Expected cycles, may be overridden by OperandCycles. + let HighLatency = 30; // Expected, may be overridden by OperandCycles. + + let Itineraries = AtomItineraries; +} diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 9a04e35..7c6788f 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -62,13 +62,15 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, Args.push_back(Entry); Entry.Node = Size; Args.push_back(Entry); - std::pair<SDValue,SDValue> CallResult = - TLI.LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()), + TargetLowering:: + CallLoweringInfo CLI(Chain, Type::getVoidTy(*DAG.getContext()), false, false, false, false, 0, CallingConv::C, /*isTailCall=*/false, /*doesNotRet=*/false, /*isReturnValueUsed=*/false, DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl); + std::pair<SDValue,SDValue> CallResult = + TLI.LowerCallTo(CLI); return CallResult.second; } diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index ed1a409..e6e9c56 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -196,33 +196,32 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { if ((ECX >> 9) & 1) { X86SSELevel = SSSE3; ToggleFeature(X86::FeatureSSSE3);} if ((ECX >> 19) & 1) { X86SSELevel = SSE41; ToggleFeature(X86::FeatureSSE41);} if ((ECX >> 20) & 1) { X86SSELevel = SSE42; ToggleFeature(X86::FeatureSSE42);} - // FIXME: AVX codegen support is not ready.
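The AutoDetectSubtargetFeatures hunk continuing below replaces several vendor-gated checks with plain CPUID bit tests, including the newly enabled AVX check on bit 28 of leaf 1's ECX. A standalone sketch of the same bit tests, assuming GCC or Clang on x86 (<cpuid.h> and __get_cpuid are compiler-specific):

// Reads CPUID leaf 1 and prints the ECX feature bits tested in this hunk.
#include <cpuid.h>
#include <cstdio>

int main() {
  unsigned EAX, EBX, ECX, EDX;
  if (!__get_cpuid(1, &EAX, &EBX, &ECX, &EDX))
    return 1;  // leaf 1 unsupported
  // CPUID.1:ECX feature bits, matching the shifts used in the hunk:
  std::printf("PCLMULQDQ: %u\n", (ECX >> 1) & 1);
  std::printf("FMA:       %u\n", (ECX >> 12) & 1);
  std::printf("POPCNT:    %u\n", (ECX >> 23) & 1);
  std::printf("AES:       %u\n", (ECX >> 25) & 1);
  std::printf("AVX:       %u\n", (ECX >> 28) & 1);
  std::printf("F16C:      %u\n", (ECX >> 29) & 1);
}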
- //if ((ECX >> 28) & 1) { X86SSELevel = AVX; ToggleFeature(X86::FeatureAVX); } + if ((ECX >> 28) & 1) { X86SSELevel = AVX; ToggleFeature(X86::FeatureAVX); } bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0; bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0; - if (IsIntel && ((ECX >> 1) & 0x1)) { - HasCLMUL = true; - ToggleFeature(X86::FeatureCLMUL); + if ((ECX >> 1) & 0x1) { + HasPCLMUL = true; + ToggleFeature(X86::FeaturePCLMUL); } - if (IsIntel && ((ECX >> 12) & 0x1)) { - HasFMA3 = true; - ToggleFeature(X86::FeatureFMA3); + if ((ECX >> 12) & 0x1) { + HasFMA = true; + ToggleFeature(X86::FeatureFMA); } if (IsIntel && ((ECX >> 22) & 0x1)) { HasMOVBE = true; ToggleFeature(X86::FeatureMOVBE); } - if (IsIntel && ((ECX >> 23) & 0x1)) { + if ((ECX >> 23) & 0x1) { HasPOPCNT = true; ToggleFeature(X86::FeaturePOPCNT); } - if (IsIntel && ((ECX >> 25) & 0x1)) { + if ((ECX >> 25) & 0x1) { HasAES = true; ToggleFeature(X86::FeatureAES); } - if (IsIntel && ((ECX >> 29) & 0x1)) { + if ((ECX >> 29) & 0x1) { HasF16C = true; ToggleFeature(X86::FeatureF16C); } @@ -254,8 +253,12 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { } // Set processor type. Currently only Atom is detected. - if (Family == 6 && Model == 28) { + if (Family == 6 && + (Model == 28 || Model == 38 || Model == 39 + || Model == 53 || Model == 54)) { X86ProcFamily = IntelAtom; + + UseLeaForSP = true; ToggleFeature(X86::FeatureLeaForSP); } @@ -289,9 +292,9 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { } } - if (IsIntel && MaxLevel >= 7) { + if (MaxLevel >= 7) { if (!X86_MC::GetCpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX)) { - if (EBX & 0x1) { + if (IsIntel && (EBX & 0x1)) { HasFSGSBase = true; ToggleFeature(X86::FeatureFSGSBase); } @@ -299,12 +302,11 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { HasBMI = true; ToggleFeature(X86::FeatureBMI); } - // FIXME: AVX2 codegen support is not ready. - //if ((EBX >> 5) & 0x1) { - // X86SSELevel = AVX2; - // ToggleFeature(X86::FeatureAVX2); - //} - if ((EBX >> 8) & 0x1) { + if (IsIntel && ((EBX >> 5) & 0x1)) { + X86SSELevel = AVX2; + ToggleFeature(X86::FeatureAVX2); + } + if (IsIntel && ((EBX >> 8) & 0x1)) { HasBMI2 = true; ToggleFeature(X86::FeatureBMI2); } @@ -325,8 +327,8 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, , HasPOPCNT(false) , HasSSE4A(false) , HasAES(false) - , HasCLMUL(false) - , HasFMA3(false) + , HasPCLMUL(false) + , HasFMA(false) , HasFMA4(false) , HasXOP(false) , HasMOVBE(false) @@ -424,9 +426,7 @@ bool X86Subtarget::enablePostRAScheduler( CodeGenOpt::Level OptLevel, TargetSubtargetInfo::AntiDepBreakMode& Mode, RegClassVector& CriticalPathRCs) const { - //TODO: change back to ANTIDEP_CRITICAL when the - // X86 subtarget properly sets up post RA liveness. 
- Mode = TargetSubtargetInfo::ANTIDEP_NONE; + Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL; CriticalPathRCs.clear(); return PostRAScheduler && OptLevel >= CodeGenOpt::Default; } diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 7fd832b..1af585f 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -85,11 +85,11 @@ protected: /// HasAES - Target has AES instructions bool HasAES; - /// HasCLMUL - Target has carry-less multiplication - bool HasCLMUL; + /// HasPCLMUL - Target has carry-less multiplication + bool HasPCLMUL; - /// HasFMA3 - Target has 3-operand fused multiply-add - bool HasFMA3; + /// HasFMA - Target has 3-operand fused multiply-add + bool HasFMA; /// HasFMA4 - Target has 4-operand fused multiply-add bool HasFMA4; @@ -203,8 +203,8 @@ public: bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; } bool hasPOPCNT() const { return HasPOPCNT; } bool hasAES() const { return HasAES; } - bool hasCLMUL() const { return HasCLMUL; } - bool hasFMA3() const { return HasFMA3; } + bool hasPCLMUL() const { return HasPCLMUL; } + bool hasFMA() const { return HasFMA; } bool hasFMA4() const { return HasFMA4; } bool hasXOP() const { return HasXOP; } bool hasMOVBE() const { return HasMOVBE; } @@ -307,6 +307,8 @@ public: TargetSubtargetInfo::AntiDepBreakMode& Mode, RegClassVector& CriticalPathRCs) const; + bool postRAScheduler() const { return PostRAScheduler; } + /// getInstrItins = Return the instruction itineraries based on the /// subtarget selection. const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index f4b7a62..b7ba568 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -140,39 +140,48 @@ public: } // namespace TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) { - return new X86PassConfig(this, PM); + X86PassConfig *PC = new X86PassConfig(this, PM); + + if (Subtarget.hasCMov()) + PC->enablePass(&EarlyIfConverterID); + + return PC; } bool X86PassConfig::addInstSelector() { // Install an instruction selector. - PM.add(createX86ISelDag(getX86TargetMachine(), getOptLevel())); + addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel())); + + // For ELF, cleanup any local-dynamic TLS accesses. + if (getX86Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None) + addPass(createCleanupLocalDynamicTLSPass()); // For 32-bit, prepend instructions to set the "global base reg" for PIC. if (!getX86Subtarget().is64Bit()) - PM.add(createGlobalBaseRegPass()); + addPass(createGlobalBaseRegPass()); return false; } bool X86PassConfig::addPreRegAlloc() { - PM.add(createX86MaxStackAlignmentHeuristicPass()); + addPass(createX86MaxStackAlignmentHeuristicPass()); return false; // -print-machineinstr shouldn't print after this. } bool X86PassConfig::addPostRegAlloc() { - PM.add(createX86FloatingPointStackifierPass()); + addPass(createX86FloatingPointStackifierPass()); return true; // -print-machineinstr should print after this. 
} bool X86PassConfig::addPreEmitPass() { bool ShouldPrint = false; if (getOptLevel() != CodeGenOpt::None && getX86Subtarget().hasSSE2()) { - PM.add(createExecutionDependencyFixPass(&X86::VR128RegClass)); + addPass(createExecutionDependencyFixPass(&X86::VR128RegClass)); ShouldPrint = true; } if (getX86Subtarget().hasAVX() && UseVZeroUpper) { - PM.add(createX86IssueVZeroUpperPass()); + addPass(createX86IssueVZeroUpperPass()); ShouldPrint = true; } diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index 718f35e..92aee0d 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -9,16 +9,19 @@ #include "X86TargetObjectFile.h" #include "X86TargetMachine.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/Target/Mangler.h" #include "llvm/Support/Dwarf.h" +#include "llvm/Support/ELF.h" using namespace llvm; using namespace dwarf; -const MCExpr *X8664_MachoTargetObjectFile:: +const MCExpr *X86_64MachoTargetObjectFile:: getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, MachineModuleInfo *MMI, unsigned Encoding, MCStreamer &Streamer) const { @@ -37,8 +40,14 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, getExprForDwarfGlobalReference(GV, Mang, MMI, Encoding, Streamer); } -MCSymbol *X8664_MachoTargetObjectFile:: +MCSymbol *X86_64MachoTargetObjectFile:: getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang, MachineModuleInfo *MMI) const { return Mang->getSymbol(GV); } + +void +X86LinuxTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index a02a368..2d320c5 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -16,9 +16,9 @@ namespace llvm { - /// X8664_MachoTargetObjectFile - This TLOF implementation is used for Darwin + /// X86_64MachoTargetObjectFile - This TLOF implementation is used for Darwin /// x86-64. - class X8664_MachoTargetObjectFile : public TargetLoweringObjectFileMachO { + class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO { public: virtual const MCExpr * getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, @@ -32,6 +32,12 @@ namespace llvm { MachineModuleInfo *MMI) const; }; + /// X86LinuxTargetObjectFile - This implementation is used for linux x86 + /// and x86-64. + class X86LinuxTargetObjectFile : public TargetLoweringObjectFileELF { + virtual void Initialize(MCContext &Ctx, const TargetMachine &TM); + }; + } // end namespace llvm #endif diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp index 2fd78a7..e4f567f 100644 --- a/lib/Target/X86/X86VZeroUpper.cpp +++ b/lib/Target/X86/X86VZeroUpper.cpp @@ -145,7 +145,7 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { // to insert any VZEROUPPER instructions. This is constant-time, so it is // cheap in the common case of no ymm use. 
bool YMMUsed = false; - const TargetRegisterClass *RC = X86::VR256RegisterClass; + const TargetRegisterClass *RC = &X86::VR256RegClass; for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e; i++) { if (MRI.isPhysRegUsed(*i)) { @@ -205,7 +205,7 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF, } - // The entry MBB for the function may set the inital state to dirty if + // The entry MBB for the function may set the initial state to dirty if // the function receives any YMM incoming arguments if (MBB == MF.begin()) { EntryState = ST_CLEAN; diff --git a/lib/Target/XCore/CMakeLists.txt b/lib/Target/XCore/CMakeLists.txt index 0d59572..ca94f03 100644 --- a/lib/Target/XCore/CMakeLists.txt +++ b/lib/Target/XCore/CMakeLists.txt @@ -22,5 +22,7 @@ add_llvm_target(XCoreCodeGen XCoreSelectionDAGInfo.cpp ) +add_dependencies(LLVMXCoreCodeGen intrinsics_gen) + add_subdirectory(TargetInfo) add_subdirectory(MCTargetDesc) diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp index 8906b24..c76866f 100644 --- a/lib/Target/XCore/XCoreAsmPrinter.cpp +++ b/lib/Target/XCore/XCoreAsmPrinter.cpp @@ -18,9 +18,9 @@ #include "XCoreSubtarget.h" #include "XCoreTargetMachine.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" #include "llvm/Module.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -260,7 +260,17 @@ void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum, bool XCoreAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant,const char *ExtraCode, raw_ostream &O) { - printOperand(MI, OpNo, O); + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) + if (ExtraCode[1] != 0) return true; // Unknown modifier. 
+ + switch (ExtraCode[0]) { + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); + } + +printOperand(MI, OpNo, O); return false; } diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp index 50fda58..3dbc3b9 100644 --- a/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/lib/Target/XCore/XCoreFrameLowering.cpp @@ -78,8 +78,7 @@ static void storeToStack(MachineBasicBlock &MBB, //===----------------------------------------------------------------------===// XCoreFrameLowering::XCoreFrameLowering(const XCoreSubtarget &sti) - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 4, 0), - STI(sti) { + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 4, 0) { // Do nothing } @@ -341,7 +340,7 @@ XCoreFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, MachineFrameInfo *MFI = MF.getFrameInfo(); const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); bool LRUsed = MF.getRegInfo().isPhysRegUsed(XCore::LR); - const TargetRegisterClass *RC = XCore::GRRegsRegisterClass; + const TargetRegisterClass *RC = &XCore::GRRegsRegClass; XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>(); if (LRUsed) { MF.getRegInfo().setPhysRegUnused(XCore::LR); diff --git a/lib/Target/XCore/XCoreFrameLowering.h b/lib/Target/XCore/XCoreFrameLowering.h index 4c51aa5..afa2773 100644 --- a/lib/Target/XCore/XCoreFrameLowering.h +++ b/lib/Target/XCore/XCoreFrameLowering.h @@ -22,7 +22,6 @@ namespace llvm { class XCoreSubtarget; class XCoreFrameLowering: public TargetFrameLowering { - const XCoreSubtarget &STI; public: XCoreFrameLowering(const XCoreSubtarget &STI); diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp index fdf2b78..8643ffc 100644 --- a/lib/Target/XCore/XCoreISelLowering.cpp +++ b/lib/Target/XCore/XCoreISelLowering.cpp @@ -66,7 +66,7 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM) Subtarget(*XTM.getSubtargetImpl()) { // Set up the register classes. 
- addRegisterClass(MVT::i32, XCore::GRRegsRegisterClass); + addRegisterClass(MVT::i32, &XCore::GRRegsRegClass); // Compute derived properties from the register classes computeRegisterProperties(); @@ -485,12 +485,12 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const { Entry.Node = BasePtr; Args.push_back(Entry); - std::pair<SDValue, SDValue> CallResult = - LowerCallTo(Chain, IntPtrTy, false, false, + TargetLowering::CallLoweringInfo CLI(Chain, IntPtrTy, false, false, false, false, 0, CallingConv::C, /*isTailCall=*/false, /*doesNotRet=*/false, /*isReturnValueUsed=*/true, DAG.getExternalSymbol("__misaligned_load", getPointerTy()), Args, DAG, DL); + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); SDValue Ops[] = { CallResult.first, CallResult.second }; @@ -547,12 +547,13 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const Entry.Node = Value; Args.push_back(Entry); - std::pair<SDValue, SDValue> CallResult = - LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()), false, false, + TargetLowering::CallLoweringInfo CLI(Chain, + Type::getVoidTy(*DAG.getContext()), false, false, false, false, 0, CallingConv::C, /*isTailCall=*/false, /*doesNotRet=*/false, /*isReturnValueUsed=*/true, DAG.getExternalSymbol("__misaligned_store", getPointerTy()), Args, DAG, dl); + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); return CallResult.second; } @@ -873,14 +874,19 @@ LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { /// XCore call implementation SDValue -XCoreTargetLowering::LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, +XCoreTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool isVarArg = CLI.IsVarArg; + // XCore target does not yet support tail call optimization. isTailCall = false; @@ -913,7 +919,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); // The ABI dictates there should be one stack slot available to the callee // on function entry (for saving lr). @@ -1036,7 +1042,7 @@ XCoreTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_XCore); @@ -1096,7 +1102,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, // Assign locations to all of the incoming arguments. 
SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, CC_XCore); @@ -1121,8 +1127,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, llvm_unreachable(0); } case MVT::i32: - unsigned VReg = RegInfo.createVirtualRegister( - XCore::GRRegsRegisterClass); + unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass); RegInfo.addLiveIn(VA.getLocReg(), VReg); InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); } @@ -1172,8 +1177,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, offset -= StackSlotSize; SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); // Move argument from phys reg -> virt reg - unsigned VReg = RegInfo.createVirtualRegister( - XCore::GRRegsRegisterClass); + unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass); RegInfo.addLiveIn(ArgRegs[i], VReg); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); // Move argument from virt reg -> stack @@ -1201,7 +1205,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, bool XCoreTargetLowering:: CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, - bool isVarArg, + bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { SmallVector<CCValAssign, 16> RVLocs; @@ -1222,7 +1226,7 @@ XCoreTargetLowering::LowerReturn(SDValue Chain, // CCState - Info about the registers and stack slot. CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); // Analyze return values. CCInfo.AnalyzeReturn(Outs, RetCC_XCore); @@ -1606,12 +1610,12 @@ XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM, std::pair<unsigned, const TargetRegisterClass*> XCoreTargetLowering:: getRegForInlineAsmConstraint(const std::string &Constraint, - EVT VT) const { + EVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default : break; case 'r': - return std::make_pair(0U, XCore::GRRegsRegisterClass); + return std::make_pair(0U, &XCore::GRRegsRegClass); } } // Use the default implementation in TargetLowering to convert the register diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h index 0b63ecd..2874f00 100644 --- a/lib/Target/XCore/XCoreISelLowering.h +++ b/lib/Target/XCore/XCoreISelLowering.h @@ -151,7 +151,7 @@ namespace llvm { // Inline asm support std::pair<unsigned, const TargetRegisterClass*> getRegForInlineAsmConstraint(const std::string &Constraint, - EVT VT) const; + EVT VT) const; // Expand specifics SDValue TryExpandADDWithMul(SDNode *Op, SelectionDAG &DAG) const; @@ -174,12 +174,7 @@ namespace llvm { SmallVectorImpl<SDValue> &InVals) const; virtual SDValue - LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, - bool isVarArg, bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; virtual SDValue @@ -191,7 +186,7 @@ namespace llvm { virtual bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, - bool isVarArg, + bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &ArgsFlags, LLVMContext &Context) const; }; diff --git a/lib/Target/XCore/XCoreInstrInfo.td 
b/lib/Target/XCore/XCoreInstrInfo.td index b25a08d..ae646a2 100644 --- a/lib/Target/XCore/XCoreInstrInfo.td +++ b/lib/Target/XCore/XCoreInstrInfo.td @@ -741,14 +741,12 @@ let isCall=1, // All calls clobber the link register and the non-callee-saved registers: Defs = [R0, R1, R2, R3, R11, LR], Uses = [SP] in { def BL_u10 : _FU10< - (outs), - (ins calltarget:$target, variable_ops), + (outs), (ins calltarget:$target), "bl $target", [(XCoreBranchLink immU10:$target)]>; def BL_lu10 : _FLU10< - (outs), - (ins calltarget:$target, variable_ops), + (outs), (ins calltarget:$target), "bl $target", [(XCoreBranchLink immU20:$target)]>; } @@ -796,7 +794,7 @@ def MKMSK_rus : _FRUS<(outs GRRegs:$dst), (ins i32imm:$size), def MKMSK_2r : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$size), "mkmsk $dst, $size", - [(set GRRegs:$dst, (add (shl 1, GRRegs:$size), 0xffffffff))]>; + [(set GRRegs:$dst, (add (shl 1, GRRegs:$size), -1))]>; def GETR_rus : _FRUS<(outs GRRegs:$dst), (ins i32imm:$type), "getr $dst, $type", @@ -950,10 +948,10 @@ def ENDIN_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src), // dgetreg def MSYNC_1r : _F1R<(outs), (ins GRRegs:$i), "msync res[$i]", - [(int_xcore_msync GRRegs:$i)]>; + [(int_xcore_msync GRRegs:$i)]>; def MJOIN_1r : _F1R<(outs), (ins GRRegs:$i), "mjoin res[$i]", - [(int_xcore_mjoin GRRegs:$i)]>; + [(int_xcore_mjoin GRRegs:$i)]>; let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in def BAU_1r : _F1R<(outs), (ins GRRegs:$addr), @@ -988,7 +986,7 @@ def ECALLF_1r : _F1R<(outs), (ins GRRegs:$src), let isCall=1, // All calls clobber the link register and the non-callee-saved registers: Defs = [R0, R1, R2, R3, R11, LR], Uses = [SP] in { -def BLA_1r : _F1R<(outs), (ins GRRegs:$addr, variable_ops), +def BLA_1r : _F1R<(outs), (ins GRRegs:$addr), "bla $addr", [(XCoreBranchLink GRRegs:$addr)]>; } @@ -1038,7 +1036,7 @@ def GETET_0R : _F0R<(outs), (ins), def SSYNC_0r : _F0R<(outs), (ins), "ssync", - [(int_xcore_ssync)]>; + [(int_xcore_ssync)]>; let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1, hasSideEffects = 1 in diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp index f3b4b4c..cdd0a08 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.cpp +++ b/lib/Target/XCore/XCoreRegisterInfo.cpp @@ -92,6 +92,11 @@ XCoreRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const { } bool +XCoreRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { + return requiresRegisterScavenging(MF); +} + +bool XCoreRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { return false; } @@ -205,8 +210,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned Reg = MI.getOperand(0).getReg(); bool isKill = MI.getOpcode() == XCore::STWFI && MI.getOperand(0).isKill(); - assert(XCore::GRRegsRegisterClass->contains(Reg) && - "Unexpected register operand"); + assert(XCore::GRRegsRegClass.contains(Reg) && "Unexpected register operand"); MachineBasicBlock &MBB = *MI.getParent(); @@ -217,7 +221,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (!RS) report_fatal_error("eliminateFrameIndex Frame size too big: " + Twine(Offset)); - unsigned ScratchReg = RS->scavengeRegister(XCore::GRRegsRegisterClass, II, + unsigned ScratchReg = RS->scavengeRegister(&XCore::GRRegsRegClass, II, SPAdj); loadConstant(MBB, II, ScratchReg, Offset, dl); switch (MI.getOpcode()) { diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h index 
7391cfd..c4dcb6b 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.h +++ b/lib/Target/XCore/XCoreRegisterInfo.h @@ -50,6 +50,8 @@ public: bool requiresRegisterScavenging(const MachineFunction &MF) const; + bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const; + bool useFPForScavengingIndex(const MachineFunction &MF) const; void eliminateCallFramePseudoInstr(MachineFunction &MF, diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp index f65297e..11ec86b 100644 --- a/lib/Target/XCore/XCoreTargetMachine.cpp +++ b/lib/Target/XCore/XCoreTargetMachine.cpp @@ -55,7 +55,7 @@ TargetPassConfig *XCoreTargetMachine::createPassConfig(PassManagerBase &PM) { } bool XCorePassConfig::addInstSelector() { - PM.add(createXCoreISelDag(getXCoreTargetMachine(), getOptLevel())); + addPass(createXCoreISelDag(getXCoreTargetMachine(), getOptLevel())); return false; } diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp index e160f63..b94dd69 100644 --- a/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -245,10 +245,7 @@ static bool IsPrefix(const ArgPromotion::IndicesVector &Prefix, const ArgPromotion::IndicesVector &Longer) { if (Prefix.size() > Longer.size()) return false; - for (unsigned i = 0, e = Prefix.size(); i != e; ++i) - if (Prefix[i] != Longer[i]) - return false; - return true; + return std::equal(Prefix.begin(), Prefix.end(), Longer.begin()); } @@ -616,8 +613,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, // Recompute the parameter attributes list based on the new arguments for // the function. - NF->setAttributes(AttrListPtr::get(AttributesVec.begin(), - AttributesVec.end())); + NF->setAttributes(AttrListPtr::get(AttributesVec)); AttributesVec.clear(); F->getParent()->getFunctionList().insert(F, NF); @@ -734,13 +730,11 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), Args, "", Call); cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv()); - cast<InvokeInst>(New)->setAttributes(AttrListPtr::get(AttributesVec.begin(), - AttributesVec.end())); + cast<InvokeInst>(New)->setAttributes(AttrListPtr::get(AttributesVec)); } else { New = CallInst::Create(NF, Args, "", Call); cast<CallInst>(New)->setCallingConv(CS.getCallingConv()); - cast<CallInst>(New)->setAttributes(AttrListPtr::get(AttributesVec.begin(), - AttributesVec.end())); + cast<CallInst>(New)->setAttributes(AttrListPtr::get(AttributesVec)); if (cast<CallInst>(Call)->isTailCall()) cast<CallInst>(New)->setTailCall(); } diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt index 58b3551..3f6b1de 100644 --- a/lib/Transforms/IPO/CMakeLists.txt +++ b/lib/Transforms/IPO/CMakeLists.txt @@ -20,3 +20,5 @@ add_llvm_library(LLVMipo StripDeadPrototypes.cpp StripSymbols.cpp ) + +add_dependencies(LLVMipo intrinsics_gen) diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index 95aef27..fd23a93 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -238,7 +238,7 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { AttributesVec.push_back(PAL.getSlot(i)); if (Attributes FnAttrs = PAL.getFnAttributes()) AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs)); - PAL = AttrListPtr::get(AttributesVec.begin(), AttributesVec.end()); + PAL = AttrListPtr::get(AttributesVec); } Instruction *New; @@ -753,8 
+753,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs)); // Reconstruct the AttributesList based on the vector we constructed. - AttrListPtr NewPAL = AttrListPtr::get(AttributesVec.begin(), - AttributesVec.end()); + AttrListPtr NewPAL = AttrListPtr::get(AttributesVec); // Create the new function type based on the recomputed parameters. FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg()); @@ -816,8 +815,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs)); // Reconstruct the AttributesList based on the vector we constructed. - AttrListPtr NewCallPAL = AttrListPtr::get(AttributesVec.begin(), - AttributesVec.end()); + AttrListPtr NewCallPAL = AttrListPtr::get(AttributesVec); Instruction *New; if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp index d9911bf..4c7f0ed 100644 --- a/lib/Transforms/IPO/ExtractGV.cpp +++ b/lib/Transforms/IPO/ExtractGV.cpp @@ -53,12 +53,12 @@ namespace { I != E; ++I) { if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration()) { I->setInitializer(0); - } else { - if (I->hasAvailableExternallyLinkage()) - continue; - if (I->getName() == "llvm.global_ctors") - continue; - } + } else { + if (I->hasAvailableExternallyLinkage()) + continue; + if (I->getName() == "llvm.global_ctors") + continue; + } if (I->hasLocalLinkage()) I->setVisibility(GlobalValue::HiddenVisibility); @@ -69,10 +69,10 @@ namespace { for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration()) { I->deleteBody(); - } else { - if (I->hasAvailableExternallyLinkage()) - continue; - } + } else { + if (I->hasAvailableExternallyLinkage()) + continue; + } if (I->hasLocalLinkage()) I->setVisibility(GlobalValue::HiddenVisibility); diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp index 2b427aa..18c1c7b 100644 --- a/lib/Transforms/IPO/GlobalDCE.cpp +++ b/lib/Transforms/IPO/GlobalDCE.cpp @@ -65,7 +65,7 @@ bool GlobalDCE::runOnModule(Module &M) { for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { Changed |= RemoveUnusedGlobalValue(*I); // Functions with external linkage are needed if they have a body - if (!I->hasLocalLinkage() && !I->hasLinkOnceLinkage() && + if (!I->isDiscardableIfUnused() && !I->isDeclaration() && !I->hasAvailableExternallyLinkage()) GlobalIsNeeded(I); } @@ -75,7 +75,7 @@ bool GlobalDCE::runOnModule(Module &M) { Changed |= RemoveUnusedGlobalValue(*I); // Externally visible & appending globals are needed, if they have an // initializer. - if (!I->hasLocalLinkage() && !I->hasLinkOnceLinkage() && + if (!I->isDiscardableIfUnused() && !I->isDeclaration() && !I->hasAvailableExternallyLinkage()) GlobalIsNeeded(I); } @@ -84,7 +84,7 @@ bool GlobalDCE::runOnModule(Module &M) { I != E; ++I) { Changed |= RemoveUnusedGlobalValue(*I); // Externally visible aliases are needed. 
- if (!I->hasLocalLinkage() && !I->hasLinkOnceLinkage()) + if (!I->isDiscardableIfUnused()) GlobalIsNeeded(I); } diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 1522aa4..60ce958 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -254,6 +254,8 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS, GS.StoredType = GlobalStatus::isStored; } } + } else if (isa<BitCastInst>(I)) { + if (AnalyzeGlobal(I, GS, PHIUsers)) return true; } else if (isa<GetElementPtrInst>(I)) { if (AnalyzeGlobal(I, GS, PHIUsers)) return true; } else if (isa<SelectInst>(I)) { @@ -294,6 +296,165 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS, return false; } +/// isLeakCheckerRoot - Is this global variable possibly used by a leak checker +/// as a root? If so, we might not really want to eliminate the stores to it. +static bool isLeakCheckerRoot(GlobalVariable *GV) { + // A global variable is a root if it is a pointer, or could plausibly contain + // a pointer. There are two challenges; one is that we could have a struct + // that has an inner member which is a pointer. We recurse through the type to + // detect these (up to a point). The other is that we may actually be a union + // of a pointer and another type, and so our LLVM type is an integer which + // gets converted into a pointer, or our type is an [i8 x #] with a pointer + // potentially contained here. + + if (GV->hasPrivateLinkage()) + return false; + + SmallVector<Type *, 4> Types; + Types.push_back(cast<PointerType>(GV->getType())->getElementType()); + + unsigned Limit = 20; + do { + Type *Ty = Types.pop_back_val(); + switch (Ty->getTypeID()) { + default: break; + case Type::PointerTyID: return true; + case Type::ArrayTyID: + case Type::VectorTyID: { + SequentialType *STy = cast<SequentialType>(Ty); + Types.push_back(STy->getElementType()); + break; + } + case Type::StructTyID: { + StructType *STy = cast<StructType>(Ty); + if (STy->isOpaque()) return true; + for (StructType::element_iterator I = STy->element_begin(), + E = STy->element_end(); I != E; ++I) { + Type *InnerTy = *I; + if (isa<PointerType>(InnerTy)) return true; + if (isa<CompositeType>(InnerTy)) + Types.push_back(InnerTy); + } + break; + } + } + if (--Limit == 0) return true; + } while (!Types.empty()); + return false; +} + +/// Given a value that is stored to a global but never read, determine whether +/// it's safe to remove the store and the chain of computation that feeds the +/// store. +static bool IsSafeComputationToRemove(Value *V) { + do { + if (isa<Constant>(V)) + return true; + if (!V->hasOneUse()) + return false; + if (isa<LoadInst>(V) || isa<Argument>(V) || isa<GlobalValue>(V)) + return false; + if (isAllocationFn(V)) + return true; + + Instruction *I = cast<Instruction>(V); + if (I->mayHaveSideEffects()) + return false; + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) { + if (!GEP->hasAllConstantIndices()) + return false; + } else if (I->getNumOperands() != 1) { + return false; + } + + V = I->getOperand(0); + } while (1); +} + +/// CleanupPointerRootUsers - This GV is a pointer root. Loop over all users +/// of the global and clean up any that obviously don't assign the global a +/// value that isn't dynamically allocated. +/// +static bool CleanupPointerRootUsers(GlobalVariable *GV) { + // A brief explanation of leak checkers. The goal is to find bugs where + // pointers are forgotten, causing an accumulating growth in memory + // usage over time.
The common strategy for leak checkers is to whitelist the + // memory pointed to by globals at exit. This is popular because it also + // solves another problem where the main thread of a C++ program may shut down + // before other threads that are still expecting to use those globals. To + // handle that case, we expect the program may create a singleton and never + // destroy it. + + bool Changed = false; + + // If Dead[n].first is the only use of a malloc result, we can delete its + // chain of computation and the store to the global in Dead[n].second. + SmallVector<std::pair<Instruction *, Instruction *>, 32> Dead; + + // Constants can't be pointers to dynamically allocated memory. + for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end(); + UI != E;) { + User *U = *UI++; + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { + Value *V = SI->getValueOperand(); + if (isa<Constant>(V)) { + Changed = true; + SI->eraseFromParent(); + } else if (Instruction *I = dyn_cast<Instruction>(V)) { + if (I->hasOneUse()) + Dead.push_back(std::make_pair(I, SI)); + } + } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(U)) { + if (isa<Constant>(MSI->getValue())) { + Changed = true; + MSI->eraseFromParent(); + } else if (Instruction *I = dyn_cast<Instruction>(MSI->getValue())) { + if (I->hasOneUse()) + Dead.push_back(std::make_pair(I, MSI)); + } + } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U)) { + GlobalVariable *MemSrc = dyn_cast<GlobalVariable>(MTI->getSource()); + if (MemSrc && MemSrc->isConstant()) { + Changed = true; + MTI->eraseFromParent(); + } else if (Instruction *I = dyn_cast<Instruction>(MemSrc)) { + if (I->hasOneUse()) + Dead.push_back(std::make_pair(I, MTI)); + } + } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) { + if (CE->use_empty()) { + CE->destroyConstant(); + Changed = true; + } + } else if (Constant *C = dyn_cast<Constant>(U)) { + if (SafeToDestroyConstant(C)) { + C->destroyConstant(); + // This could have invalidated UI, start over from scratch. + Dead.clear(); + CleanupPointerRootUsers(GV); + return true; + } + } + } + + for (int i = 0, e = Dead.size(); i != e; ++i) { + if (IsSafeComputationToRemove(Dead[i].first)) { + Dead[i].second->eraseFromParent(); + Instruction *I = Dead[i].first; + do { + Instruction *J = dyn_cast<Instruction>(I->getOperand(0)); + if (!J) + break; + I->eraseFromParent(); + I = J; + } while (!isAllocationFn(I)); + I->eraseFromParent(); + } + } + + return Changed; +} + /// CleanupConstantGlobalUsers - We just marked GV constant. Loop over all /// users of the global, cleaning up the obvious ones. This is largely just a /// quick scan over the use list to clean up the easy and obvious cruft. 
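A toy restatement of the single-use chain walk that IsSafeComputationToRemove performs above, in plain C++ rather than LLVM's Value and Instruction classes; the Node fields are illustrative stand-ins:

// Walk backwards from a stored value through single-use, side-effect-free
// nodes; the chain is removable if it bottoms out at a constant or an
// allocation (the role isAllocationFn plays above).
#include <cstdio>

struct Node {
  Node *Operand;        // the single operand, as in V = I->getOperand(0)
  unsigned NumUses;
  bool IsConstant;
  bool IsAllocation;
  bool HasSideEffects;
};

static bool isSafeComputationToRemove(const Node *V) {
  for (;;) {
    if (V->IsConstant) return true;
    if (V->NumUses != 1) return false;
    if (V->IsAllocation) return true;
    if (V->HasSideEffects || !V->Operand) return false;
    V = V->Operand;     // step to the single node feeding this one
  }
}

int main() {
  // malloc -> cast -> (stored to a never-read global): safe to remove.
  Node Malloc = {nullptr, 1, false, true, false};
  Node Cast = {&Malloc, 1, false, false, false};
  std::printf("removable: %d\n", isSafeComputationToRemove(&Cast));
}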
This @@ -517,7 +678,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) { GlobalVariable *NGV = new GlobalVariable(STy->getElementType(i), false, GlobalVariable::InternalLinkage, In, GV->getName()+"."+Twine(i), - GV->isThreadLocal(), + GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); Globals.insert(GV, NGV); NewGlobals.push_back(NGV); @@ -550,7 +711,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) { GlobalVariable *NGV = new GlobalVariable(STy->getElementType(), false, GlobalVariable::InternalLinkage, In, GV->getName()+"."+Twine(i), - GV->isThreadLocal(), + GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); Globals.insert(GV, NGV); NewGlobals.push_back(NGV); @@ -810,13 +971,18 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, // If we nuked all of the loads, then none of the stores are needed either, // nor is the global. if (AllNonStoreUsesGone) { - DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n"); - CleanupConstantGlobalUsers(GV, 0, TD, TLI); + if (isLeakCheckerRoot(GV)) { + Changed |= CleanupPointerRootUsers(GV); + } else { + Changed = true; + CleanupConstantGlobalUsers(GV, 0, TD, TLI); + } if (GV->use_empty()) { + DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n"); + Changed = true; GV->eraseFromParent(); ++NumDeleted; } - Changed = true; } return Changed; } @@ -866,7 +1032,7 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, UndefValue::get(GlobalType), GV->getName()+".body", GV, - GV->isThreadLocal()); + GV->getThreadLocalMode()); // If there are bitcast users of the malloc (which is typical, usually we have // a malloc + bitcast) then replace them with uses of the new global. Update @@ -899,7 +1065,7 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, new GlobalVariable(Type::getInt1Ty(GV->getContext()), false, GlobalValue::InternalLinkage, ConstantInt::getFalse(GV->getContext()), - GV->getName()+".init", GV->isThreadLocal()); + GV->getName()+".init", GV->getThreadLocalMode()); bool InitBoolUsed = false; // Loop over all uses of GV, processing them in turn. @@ -1321,7 +1487,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, PFieldTy, false, GlobalValue::InternalLinkage, Constant::getNullValue(PFieldTy), GV->getName() + ".f" + Twine(FieldNo), GV, - GV->isThreadLocal()); + GV->getThreadLocalMode()); FieldGlobals.push_back(NGV); unsigned TypeSize = TD->getTypeAllocSize(FieldTy); @@ -1567,8 +1733,10 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, Instruction *Cast = new BitCastInst(Malloc, CI->getType(), "tmp", CI); CI->replaceAllUsesWith(Cast); CI->eraseFromParent(); - CI = dyn_cast<BitCastInst>(Malloc) ? - extractMallocCallFromBitCast(Malloc) : cast<CallInst>(Malloc); + if (BitCastInst *BCI = dyn_cast<BitCastInst>(Malloc)) + CI = cast<CallInst>(BCI->getOperand(0)); + else + CI = cast<CallInst>(Malloc); } GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, TD, true), TD); @@ -1645,7 +1813,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { GlobalValue::InternalLinkage, ConstantInt::getFalse(GV->getContext()), GV->getName()+".b", - GV->isThreadLocal()); + GV->getThreadLocalMode()); GV->getParent()->getGlobalList().insert(GV, NewGV); Constant *InitVal = GV->getInitializer(); @@ -1716,7 +1884,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { /// possible. If we make a change, return true. 
bool GlobalOpt::ProcessGlobal(GlobalVariable *GV, Module::global_iterator &GVI) { - if (!GV->hasLocalLinkage()) + if (!GV->isDiscardableIfUnused()) return false; // Do more involved optimizations if the global is internal. @@ -1729,6 +1897,9 @@ bool GlobalOpt::ProcessGlobal(GlobalVariable *GV, return true; } + if (!GV->hasLocalLinkage()) + return false; + SmallPtrSet<const PHINode*, 16> PHIUsers; GlobalStatus GS; @@ -1787,10 +1958,15 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, if (!GS.isLoaded) { DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV); - // Delete any stores we can find to the global. We may not be able to - // make it completely dead though. - bool Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer(), - TD, TLI); + bool Changed; + if (isLeakCheckerRoot(GV)) { + // Delete any constant stores to the global. + Changed = CleanupPointerRootUsers(GV); + } else { + // Delete any stores we can find to the global. We may not be able to + // make it completely dead though. + Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer(), TD, TLI); + } // If the global is dead now, delete it. if (GV->use_empty()) { @@ -1838,7 +2014,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, if (GV->use_empty()) { DEBUG(dbgs() << " *** Substituting initializer allowed us to " - << "simplify all users and delete global!\n"); + << "simplify all users and delete global!\n"); GV->eraseFromParent(); ++NumDeleted; } else { @@ -1870,6 +2046,8 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, /// function, changing them to FastCC. static void ChangeCalleesToFastCall(Function *F) { for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){ + if (isa<BlockAddress>(*UI)) + continue; CallSite User(cast<Instruction>(*UI)); User.setCallingConv(CallingConv::Fast); } @@ -1890,6 +2068,8 @@ static AttrListPtr StripNest(const AttrListPtr &Attrs) { static void RemoveNestAttribute(Function *F) { F->setAttributes(StripNest(F->getAttributes())); for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){ + if (isa<BlockAddress>(*UI)) + continue; CallSite User(cast<Instruction>(*UI)); User.setAttributes(StripNest(User.getAttributes())); } @@ -2045,7 +2225,7 @@ static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL, // Create the new global and insert it next to the existing list. 
GlobalVariable *NGV = new GlobalVariable(CA->getType(), GCL->isConstant(), GCL->getLinkage(), CA, "", - GCL->isThreadLocal()); + GCL->getThreadLocalMode()); GCL->getParent()->getGlobalList().insert(GCL, NGV); NGV->takeName(GCL); @@ -2701,7 +2881,7 @@ static bool EvaluateStaticConstructor(Function *F, const TargetData *TD, << " stores.\n"); for (DenseMap<Constant*, Constant*>::const_iterator I = Eval.getMutatedMemory().begin(), E = Eval.getMutatedMemory().end(); - I != E; ++I) + I != E; ++I) CommitValueTo(I->second, I->first); for (SmallPtrSet<GlobalVariable*, 8>::const_iterator I = Eval.getInvariants().begin(), E = Eval.getInvariants().end(); diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index dc9cbfb..712888a 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -36,7 +36,7 @@ STATISTIC(NumCallsDeleted, "Number of call sites deleted, not inlined"); STATISTIC(NumDeleted, "Number of functions deleted because all callers found"); STATISTIC(NumMergedAllocas, "Number of allocas merged together"); -// This weirdly named statistic tracks the number of times that, when attemting +// This weirdly named statistic tracks the number of times that, when attempting // to inline a function A into B, we analyze the callers of B in order to see // if those would be more profitable and blocked inline steps. STATISTIC(NumCallerCallersAnalyzed, "Number of caller-callers analyzed"); @@ -201,19 +201,22 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, } unsigned Inliner::getInlineThreshold(CallSite CS) const { - int thres = InlineThreshold; + int thres = InlineThreshold; // -inline-threshold or else selected by + // overall opt level - // Listen to optsize when -inline-limit is not given. + // If -inline-threshold is not given, listen to the optsize attribute when it + // would decrease the threshold. Function *Caller = CS.getCaller(); - if (Caller && !Caller->isDeclaration() && - Caller->hasFnAttr(Attribute::OptimizeForSize) && - InlineLimit.getNumOccurrences() == 0) + bool OptSize = Caller && !Caller->isDeclaration() && + Caller->hasFnAttr(Attribute::OptimizeForSize); + if (!(InlineLimit.getNumOccurrences() > 0) && OptSize && OptSizeThreshold < thres) thres = OptSizeThreshold; - // Listen to inlinehint when it would increase the threshold. + // Listen to the inlinehint attribute when it would increase the threshold. 
Function *Callee = CS.getCalledFunction(); - if (HintThreshold > thres && Callee && !Callee->isDeclaration() && - Callee->hasFnAttr(Attribute::InlineHint)) + bool InlineHint = Callee && !Callee->isDeclaration() && + Callee->hasFnAttr(Attribute::InlineHint); + if (InlineHint && HintThreshold > thres) thres = HintThreshold; return thres; diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp index 4f96afe..97d7cdc 100644 --- a/lib/Transforms/IPO/LoopExtractor.cpp +++ b/lib/Transforms/IPO/LoopExtractor.cpp @@ -24,7 +24,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/FunctionUtils.h" +#include "llvm/Transforms/Utils/CodeExtractor.h" #include "llvm/ADT/Statistic.h" #include <fstream> #include <set> @@ -132,7 +132,8 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { if (ShouldExtractLoop) { if (NumLoops == 0) return Changed; --NumLoops; - if (ExtractLoop(DT, L) != 0) { + CodeExtractor Extractor(DT, *L); + if (Extractor.extractCodeRegion() != 0) { Changed = true; // After extraction, the loop is replaced by a function call, so // we shouldn't try to run any more loop passes on it. @@ -296,7 +297,7 @@ bool BlockExtractorPass::runOnModule(Module &M) { if (const InvokeInst *II = dyn_cast<InvokeInst>(BlocksToExtract[i]->getTerminator())) BlocksToExtractVec.push_back(II->getUnwindDest()); - ExtractBasicBlock(BlocksToExtractVec); + CodeExtractor(BlocksToExtractVec).extractCodeRegion(); } return !BlocksToExtract.empty(); diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp index 0b01c38..9f70f66 100644 --- a/lib/Transforms/IPO/MergeFunctions.cpp +++ b/lib/Transforms/IPO/MergeFunctions.cpp @@ -45,22 +45,22 @@ #define DEBUG_TYPE "mergefunc" #include "llvm/Transforms/IPO.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/FoldingSet.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/Constants.h" +#include "llvm/IRBuilder.h" #include "llvm/InlineAsm.h" #include "llvm/Instructions.h" #include "llvm/LLVMContext.h" #include "llvm/Module.h" #include "llvm/Operator.h" #include "llvm/Pass.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetData.h" @@ -389,7 +389,7 @@ bool FunctionComparator::enumerate(const Value *V1, const Value *V2) { if (!C2) return false; // TODO: constant expressions with GEP or references to F1 or F2. if (C1->isNullValue() && C2->isNullValue() && - isEquivalentType(C1->getType(), C2->getType())) + isEquivalentType(C1->getType(), C2->getType())) return true; // Try bitcasting C2 to C1's type. If the bitcast is legal and returns C1 // then they must have equal bit patterns. 
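The getInlineThreshold hunk above encodes an asymmetric policy: the caller's optsize attribute may only lower the threshold, and only when -inline-threshold was not given explicitly, while the callee's inlinehint may only raise it. A condensed standalone restatement (the function name and the 225/75/325 values are illustrative, not the Inliner's actual API):

#include <cstdio>

// optsize can only decrease the threshold, and only without an explicit
// -inline-threshold; inlinehint can only increase it.
static int selectThreshold(int CmdLineThres, bool CmdLineGiven,
                           bool CallerOptSize, bool CalleeInlineHint,
                           int OptSizeThres, int HintThres) {
  int Thres = CmdLineThres;
  if (!CmdLineGiven && CallerOptSize && OptSizeThres < Thres)
    Thres = OptSizeThres;
  if (CalleeInlineHint && HintThres > Thres)
    Thres = HintThres;
  return Thres;
}

int main() {
  // Optsize caller, no explicit flag: the threshold drops from 225 to 75.
  std::printf("%d\n", selectThreshold(225, false, true, false, 75, 325));
}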
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index d9d1d10..9c9910b 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -19,7 +19,7 @@ #include "llvm/Pass.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/FunctionUtils.h" +#include "llvm/Transforms/Utils/CodeExtractor.h" #include "llvm/ADT/Statistic.h" #include "llvm/Support/CFG.h" using namespace llvm; @@ -122,7 +122,8 @@ Function* PartialInliner::unswitchFunction(Function* F) { DT.runOnFunction(*duplicateFunction); // Extract the body of the if. - Function* extractedFunction = ExtractCodeRegion(DT, toExtract); + Function* extractedFunction + = CodeExtractor(toExtract, &DT).extractCodeRegion(); InlineFunctionInfo IFI; diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp index b5caa9a..d8e8cf7 100644 --- a/lib/Transforms/IPO/StripSymbols.cpp +++ b/lib/Transforms/IPO/StripSymbols.cpp @@ -22,11 +22,11 @@ #include "llvm/Transforms/IPO.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" #include "llvm/Instructions.h" #include "llvm/Module.h" #include "llvm/Pass.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/ValueSymbolTable.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/DenseMap.h" diff --git a/lib/Transforms/InstCombine/CMakeLists.txt b/lib/Transforms/InstCombine/CMakeLists.txt index d070ccc..72cfe2c 100644 --- a/lib/Transforms/InstCombine/CMakeLists.txt +++ b/lib/Transforms/InstCombine/CMakeLists.txt @@ -13,3 +13,5 @@ add_llvm_library(LLVMInstCombine InstCombineSimplifyDemanded.cpp InstCombineVectorOps.cpp ) + +add_dependencies(LLVMInstCombine intrinsics_gen) diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h index 199df51..0d5ef90 100644 --- a/lib/Transforms/InstCombine/InstCombine.h +++ b/lib/Transforms/InstCombine/InstCombine.h @@ -11,11 +11,11 @@ #define INSTCOMBINE_INSTCOMBINE_H #include "InstCombineWorklist.h" +#include "llvm/IRBuilder.h" #include "llvm/IntrinsicInst.h" #include "llvm/Operator.h" #include "llvm/Pass.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/InstVisitor.h" #include "llvm/Support/TargetFolder.h" @@ -187,7 +187,7 @@ public: Instruction *visitPHINode(PHINode &PN); Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP); Instruction *visitAllocaInst(AllocaInst &AI); - Instruction *visitMalloc(Instruction &FI); + Instruction *visitAllocSite(Instruction &FI); Instruction *visitFree(CallInst &FI); Instruction *visitLoadInst(LoadInst &LI); Instruction *visitStoreInst(StoreInst &SI); diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 05e702f..99b62f8 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -170,10 +170,11 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { // -A + B --> B - A // -A + -B --> -(A + B) if (Value *LHSV = dyn_castNegVal(LHS)) { - if (Value *RHSV = dyn_castNegVal(RHS)) { - Value *NewAdd = Builder->CreateAdd(LHSV, RHSV, "sum"); - return BinaryOperator::CreateNeg(NewAdd); - } + if (!isa<Constant>(RHS)) + if (Value *RHSV = dyn_castNegVal(RHS)) { + Value *NewAdd = Builder->CreateAdd(LHSV, RHSV, "sum"); + return BinaryOperator::CreateNeg(NewAdd); + } return BinaryOperator::CreateSub(RHS, LHSV); } @@ 
-329,6 +330,20 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { } } + // Check for (x & y) + (x ^ y) + { + Value *A = 0, *B = 0; + if (match(RHS, m_Xor(m_Value(A), m_Value(B))) && + (match(LHS, m_And(m_Specific(A), m_Specific(B))) || + match(LHS, m_And(m_Specific(B), m_Specific(A))))) + return BinaryOperator::CreateOr(A, B); + + if (match(LHS, m_Xor(m_Value(A), m_Value(B))) && + (match(RHS, m_And(m_Specific(A), m_Specific(B))) || + match(RHS, m_And(m_Specific(B), m_Specific(A))))) + return BinaryOperator::CreateOr(A, B); + } + return Changed ? &I : 0; } @@ -406,66 +421,6 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { } -/// EmitGEPOffset - Given a getelementptr instruction/constantexpr, emit the -/// code necessary to compute the offset from the base pointer (without adding -/// in the base pointer). Return the result as a signed integer of intptr size. -Value *InstCombiner::EmitGEPOffset(User *GEP) { - TargetData &TD = *getTargetData(); - gep_type_iterator GTI = gep_type_begin(GEP); - Type *IntPtrTy = TD.getIntPtrType(GEP->getContext()); - Value *Result = Constant::getNullValue(IntPtrTy); - - // If the GEP is inbounds, we know that none of the addressing operations will - // overflow in an unsigned sense. - bool isInBounds = cast<GEPOperator>(GEP)->isInBounds(); - - // Build a mask for high order bits. - unsigned IntPtrWidth = TD.getPointerSizeInBits(); - uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth); - - for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end(); i != e; - ++i, ++GTI) { - Value *Op = *i; - uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType()) & PtrSizeMask; - if (ConstantInt *OpC = dyn_cast<ConstantInt>(Op)) { - if (OpC->isZero()) continue; - - // Handle a struct index, which adds its field offset to the pointer. - if (StructType *STy = dyn_cast<StructType>(*GTI)) { - Size = TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); - - if (Size) - Result = Builder->CreateAdd(Result, ConstantInt::get(IntPtrTy, Size), - GEP->getName()+".offs"); - continue; - } - - Constant *Scale = ConstantInt::get(IntPtrTy, Size); - Constant *OC = - ConstantExpr::getIntegerCast(OpC, IntPtrTy, true /*SExt*/); - Scale = ConstantExpr::getMul(OC, Scale, isInBounds/*NUW*/); - // Emit an add instruction. - Result = Builder->CreateAdd(Result, Scale, GEP->getName()+".offs"); - continue; - } - // Convert to correct type. - if (Op->getType() != IntPtrTy) - Op = Builder->CreateIntCast(Op, IntPtrTy, true, Op->getName()+".c"); - if (Size != 1) { - // We'll let instcombine(mul) convert this to a shl if possible. - Op = Builder->CreateMul(Op, ConstantInt::get(IntPtrTy, Size), - GEP->getName()+".idx", isInBounds /*NUW*/); - } - - // Emit an add instruction. - Result = Builder->CreateAdd(Op, Result, GEP->getName()+".offs"); - } - return Result; -} - - - - /// Optimize pointer differences into the same array into a size. Consider: /// &A[10] - &A[0]: we should compile this to "10". LHS/RHS are the pointer /// operands to the ptrtoint instructions for the LHS/RHS of the subtract. @@ -589,11 +544,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { if (Instruction *R = FoldOpIntoSelect(I, SI)) return R; - // C - zext(bool) -> bool ? 
C - 1 : C - if (ZExtInst *ZI = dyn_cast<ZExtInst>(Op1)) - if (ZI->getSrcTy()->isIntegerTy(1)) - return SelectInst::Create(ZI->getOperand(0), SubOne(C), C); - // C-(X+C2) --> (C-C2)-X ConstantInt *C2; if (match(Op1, m_Add(m_Value(X), m_ConstantInt(C2)))) diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 0dbe11d..7d0af0d 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -986,19 +986,23 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { bool Op1Ordered; unsigned Op0Pred = getFCmpCode(Op0CC, Op0Ordered); unsigned Op1Pred = getFCmpCode(Op1CC, Op1Ordered); + // uno && ord -> false + if (Op0Pred == 0 && Op1Pred == 0 && Op0Ordered != Op1Ordered) + return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); if (Op1Pred == 0) { std::swap(LHS, RHS); std::swap(Op0Pred, Op1Pred); std::swap(Op0Ordered, Op1Ordered); } if (Op0Pred == 0) { - // uno && ueq -> uno && (uno || eq) -> ueq + // uno && ueq -> uno && (uno || eq) -> uno // ord && olt -> ord && (ord && lt) -> olt - if (Op0Ordered == Op1Ordered) + if (!Op0Ordered && (Op0Ordered == Op1Ordered)) + return LHS; + if (Op0Ordered && (Op0Ordered == Op1Ordered)) return RHS; // uno && oeq -> uno && (ord && eq) -> false - // uno && ord -> false if (!Op0Ordered) return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); // ord && ueq -> ord && (uno || eq) -> oeq @@ -1932,10 +1936,15 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { // A | ( A ^ B) -> A | B // A | (~A ^ B) -> A | ~B + // (A & B) | (A ^ B) if (match(Op1, m_Xor(m_Value(A), m_Value(B)))) { if (Op0 == A || Op0 == B) return BinaryOperator::CreateOr(A, B); + if (match(Op0, m_And(m_Specific(A), m_Specific(B))) || + match(Op0, m_And(m_Specific(B), m_Specific(A)))) + return BinaryOperator::CreateOr(A, B); + if (Op1->hasOneUse() && match(A, m_Not(m_Specific(Op0)))) { Value *Not = Builder->CreateNot(B, B->getName()+".not"); return BinaryOperator::CreateOr(Not, Op0); @@ -2212,7 +2221,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { if (Op0I && Op1I && Op0I->isShift() && Op0I->getOpcode() == Op1I->getOpcode() && Op0I->getOperand(1) == Op1I->getOperand(1) && - (Op1I->hasOneUse() || Op1I->hasOneUse())) { + (Op0I->hasOneUse() || Op1I->hasOneUse())) { Value *NewOp = Builder->CreateXor(Op0I->getOperand(0), Op1I->getOperand(0), Op0I->getName()); diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 77e4727..c1d9d01 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -172,8 +172,6 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (isFreeCall(&CI)) return visitFree(CI); - if (isMalloc(&CI)) - return visitMalloc(CI); // If the caller function is nounwind, mark the call as nounwind, even if the // callee isn't. @@ -246,78 +244,10 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { switch (II->getIntrinsicID()) { default: break; case Intrinsic::objectsize: { - // We need target data for just about everything so depend on it. - if (!TD) break; - - Type *ReturnTy = CI.getType(); - uint64_t DontKnow = II->getArgOperand(1) == Builder->getTrue() ? 0 : -1ULL; - - // Get to the real allocated thing and offset as fast as possible. 
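Stepping back to the (x & y) + (x ^ y) fold added to visitAdd above, and its (A & B) | (A ^ B) twin in visitOr: x & y keeps the bits common to both operands and x ^ y keeps the bits where they differ, so the two terms are bit-disjoint, the addition cannot carry, and both expressions equal x | y. A standalone check (illustration, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t x = 0xDEADBEEFu, y = 0x12345678u;
  assert(((x & y) & (x ^ y)) == 0);         // the two terms share no bits
  assert(((x & y) + (x ^ y)) == (x | y));   // the visitAdd fold
  assert(((x & y) | (x ^ y)) == (x | y));   // the visitOr fold
  return 0;
}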
- Value *Op1 = II->getArgOperand(0)->stripPointerCasts(); - - uint64_t Offset = 0; - uint64_t Size = -1ULL; - - // Try to look through constant GEPs. - if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op1)) { - if (!GEP->hasAllConstantIndices()) break; - - // Get the current byte offset into the thing. Use the original - // operand in case we're looking through a bitcast. - SmallVector<Value*, 8> Ops(GEP->idx_begin(), GEP->idx_end()); - if (!GEP->getPointerOperandType()->isPointerTy()) - return 0; - Offset = TD->getIndexedOffset(GEP->getPointerOperandType(), Ops); - - Op1 = GEP->getPointerOperand()->stripPointerCasts(); - - // Make sure we're not a constant offset from an external - // global. - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Op1)) - if (!GV->hasDefinitiveInitializer()) break; - } - - // If we've stripped down to a single global variable that we - // can know the size of then just return that. - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Op1)) { - if (GV->hasDefinitiveInitializer()) { - Constant *C = GV->getInitializer(); - Size = TD->getTypeAllocSize(C->getType()); - } else { - // Can't determine size of the GV. - Constant *RetVal = ConstantInt::get(ReturnTy, DontKnow); - return ReplaceInstUsesWith(CI, RetVal); - } - } else if (AllocaInst *AI = dyn_cast<AllocaInst>(Op1)) { - // Get alloca size. - if (AI->getAllocatedType()->isSized()) { - Size = TD->getTypeAllocSize(AI->getAllocatedType()); - if (AI->isArrayAllocation()) { - const ConstantInt *C = dyn_cast<ConstantInt>(AI->getArraySize()); - if (!C) break; - Size *= C->getZExtValue(); - } - } - } else if (CallInst *MI = extractMallocCall(Op1)) { - // Get allocation size. - Type* MallocType = getMallocAllocatedType(MI); - if (MallocType && MallocType->isSized()) - if (Value *NElems = getMallocArraySize(MI, TD, true)) - if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems)) - Size = NElements->getZExtValue() * TD->getTypeAllocSize(MallocType); - } - - // Do not return "I don't know" here. Later optimization passes could - // make it possible to evaluate objectsize to a constant. - if (Size == -1ULL) - break; - - if (Size < Offset) { - // Out of bound reference? Negative index normalized to large - // index? Just return "I don't know". - return ReplaceInstUsesWith(CI, ConstantInt::get(ReturnTy, DontKnow)); - } - return ReplaceInstUsesWith(CI, ConstantInt::get(ReturnTy, Size-Offset)); + uint64_t Size; + if (getObjectSize(II->getArgOperand(0), Size, TD)) + return ReplaceInstUsesWith(CI, ConstantInt::get(CI.getType(), Size)); + return 0; } case Intrinsic::bswap: // bswap(bswap(x)) -> x @@ -694,6 +624,57 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::arm_neon_vmulls: + case Intrinsic::arm_neon_vmullu: { + Value *Arg0 = II->getArgOperand(0); + Value *Arg1 = II->getArgOperand(1); + + // Handle mul by zero first: + if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) { + return ReplaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType())); + } + + // Check for constant LHS & RHS - in this case we just simplify. 
+ bool Zext = (II->getIntrinsicID() == Intrinsic::arm_neon_vmullu); + VectorType *NewVT = cast<VectorType>(II->getType()); + unsigned NewWidth = NewVT->getElementType()->getIntegerBitWidth(); + if (ConstantDataVector *CV0 = dyn_cast<ConstantDataVector>(Arg0)) { + if (ConstantDataVector *CV1 = dyn_cast<ConstantDataVector>(Arg1)) { + VectorType* VT = cast<VectorType>(CV0->getType()); + SmallVector<Constant*, 4> NewElems; + for (unsigned i = 0; i < VT->getNumElements(); ++i) { + APInt CV0E = + (cast<ConstantInt>(CV0->getAggregateElement(i)))->getValue(); + CV0E = Zext ? CV0E.zext(NewWidth) : CV0E.sext(NewWidth); + APInt CV1E = + (cast<ConstantInt>(CV1->getAggregateElement(i)))->getValue(); + CV1E = Zext ? CV1E.zext(NewWidth) : CV1E.sext(NewWidth); + NewElems.push_back( + ConstantInt::get(NewVT->getElementType(), CV0E * CV1E)); + } + return ReplaceInstUsesWith(CI, ConstantVector::get(NewElems)); + } + + // Couldn't simplify - canonicalize constant to the RHS. + std::swap(Arg0, Arg1); + } + + // Handle mul by one: + if (ConstantDataVector *CV1 = dyn_cast<ConstantDataVector>(Arg1)) { + if (ConstantInt *Splat = + dyn_cast_or_null<ConstantInt>(CV1->getSplatValue())) { + if (Splat->isOne()) { + if (Zext) + return CastInst::CreateZExtOrBitCast(Arg0, II->getType()); + // else + return CastInst::CreateSExtOrBitCast(Arg0, II->getType()); + } + } + } + + break; + } + case Intrinsic::stackrestore: { // If the save is right next to the restore, remove the restore. This can // happen when variable allocas are DCE'd. @@ -711,7 +692,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { TerminatorInst *TI = II->getParent()->getTerminator(); bool CannotRemove = false; for (++BI; &*BI != TI; ++BI) { - if (isa<AllocaInst>(BI) || isMalloc(BI)) { + if (isa<AllocaInst>(BI)) { CannotRemove = true; break; } @@ -898,6 +879,9 @@ static IntrinsicInst *FindInitTrampoline(Value *Callee) { // visitCallSite - Improvements for call and invoke instructions. // Instruction *InstCombiner::visitCallSite(CallSite CS) { + if (isAllocLikeFn(CS.getInstruction())) + return visitAllocSite(*CS.getInstruction()); + bool Changed = false; // If the callee is a pointer to a function, attempt to move any casts to the @@ -933,24 +917,24 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { } if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { - // This instruction is not reachable, just remove it. We insert a store to - // undef so that we know that this code is not reachable, despite the fact - // that we can't modify the CFG here. - new StoreInst(ConstantInt::getTrue(Callee->getContext()), - UndefValue::get(Type::getInt1PtrTy(Callee->getContext())), - CS.getInstruction()); - // If CS does not return void then replaceAllUsesWith undef. // This allows ValueHandlers and custom metadata to adjust itself. if (!CS.getInstruction()->getType()->isVoidTy()) ReplaceInstUsesWith(*CS.getInstruction(), UndefValue::get(CS.getInstruction()->getType())); - if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) { - // Don't break the CFG, insert a dummy cond branch. - BranchInst::Create(II->getNormalDest(), II->getUnwindDest(), - ConstantInt::getTrue(Callee->getContext()), II); + if (isa<InvokeInst>(CS.getInstruction())) { + // Can't remove an invoke because we cannot change the CFG. + return 0; } + + // This instruction is not reachable, just remove it. We insert a store to + // undef so that we know that this code is not reachable, despite the fact + // that we can't modify the CFG here.
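Back on the arm_neon_vmulls/vmullu folding added above: vmull is a widening multiply, so each lane is sign- or zero-extended to twice the width before multiplying and the product cannot wrap in the narrow type; that is what the per-lane zext/sext of the APInt values models, and why a multiply by a splat of one reduces to a plain widening cast. The per-lane arithmetic in scalar form (illustration, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // vmull.u16 per lane: widen both operands to 32 bits, then multiply.
  uint16_t a = 0xFFFFu, b = 2u;
  uint32_t lane = uint32_t(a) * uint32_t(b);
  assert(lane == 0x1FFFEu);                          // no 16-bit wraparound
  assert(uint32_t(a) * uint32_t(1u) == uint32_t(a)); // mul by 1 is just zext
  return 0;
}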
+ new StoreInst(ConstantInt::getTrue(Callee->getContext()), + UndefValue::get(Type::getInt1PtrTy(Callee->getContext())), + CS.getInstruction()); + return EraseInstFromFunction(*CS.getInstruction()); } @@ -1194,8 +1178,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { if (NewRetTy->isVoidTy()) Caller->setName(""); // Void type should not have a name. - const AttrListPtr &NewCallerPAL = AttrListPtr::get(attrVec.begin(), - attrVec.end()); + const AttrListPtr &NewCallerPAL = AttrListPtr::get(attrVec); Instruction *NC; if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { @@ -1367,8 +1350,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, NestF->getType() == PointerType::getUnqual(NewFTy) ? NestF : ConstantExpr::getBitCast(NestF, PointerType::getUnqual(NewFTy)); - const AttrListPtr &NewPAL = AttrListPtr::get(NewAttrs.begin(), - NewAttrs.end()); + const AttrListPtr &NewPAL = AttrListPtr::get(NewAttrs); Instruction *NewCaller; if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 39279f4..555b442 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -34,7 +34,7 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, if (BinaryOperator *I = dyn_cast<BinaryOperator>(Val)) { // Cannot look past anything that might overflow. OverflowingBinaryOperator *OBI = dyn_cast<OverflowingBinaryOperator>(Val); - if (OBI && !OBI->hasNoUnsignedWrap()) { + if (OBI && !OBI->hasNoUnsignedWrap() && !OBI->hasNoSignedWrap()) { Scale = 1; Offset = 0; return Val; @@ -648,10 +648,8 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { if (!I) return false; // If the input is a truncate from the destination type, we can trivially - // eliminate it, even if it has multiple uses. - // FIXME: This is currently disabled until codegen can handle this without - // pessimizing code, PR5997. - if (0 && isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty) + // eliminate it. + if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty) return true; // We can't extend or shrink something that has multiple uses: doing so would @@ -992,11 +990,8 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) { Instruction *I = dyn_cast<Instruction>(V); if (!I) return false; - // If this is a truncate from the dest type, we can trivially eliminate it, - // even if it has multiple uses. - // FIXME: This is currently disabled until codegen can handle this without - // pessimizing code, PR5997. - if (0 && isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty) + // If this is a truncate from the dest type, we can trivially eliminate it. + if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty) return true; // We can't extend or shrink something that has multiple uses: doing so would @@ -1341,10 +1336,9 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) { // non-type-safe code. if (TD && GEP->hasOneUse() && isa<BitCastInst>(GEP->getOperand(0)) && GEP->hasAllConstantIndices()) { - // We are guaranteed to get a constant from EmitGEPOffset. - ConstantInt *OffsetV = cast<ConstantInt>(EmitGEPOffset(GEP)); - int64_t Offset = OffsetV->getSExtValue(); - + SmallVector<Value*, 8> Ops(GEP->idx_begin(), GEP->idx_end()); + int64_t Offset = TD->getIndexedOffset(GEP->getPointerOperandType(), Ops); + // Get the base pointer input of the bitcast, and the type it points to. 
Value *OrigBase = cast<BitCastInst>(GEP->getOperand(0))->getOperand(0); Type *GEPIdxTy = diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index ab2987f..7076d88 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1035,7 +1035,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, if ((KnownZero|KnownOne).countLeadingOnes() >= SrcBits-DstBits) { // Pull in the high bits from known-ones set. APInt NewRHS = RHS->getValue().zext(SrcBits); - NewRHS |= KnownOne; + NewRHS |= KnownOne & APInt::getHighBitsSet(SrcBits, SrcBits-DstBits); return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0), ConstantInt::get(ICI.getContext(), NewRHS)); } @@ -2580,10 +2580,25 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { } } + // Transform (zext A) == (B & (1<<X)-1) --> A == (trunc B) + // and (B & (1<<X)-1) == (zext A) --> A == (trunc B) + ConstantInt *Cst1; + if ((Op0->hasOneUse() && + match(Op0, m_ZExt(m_Value(A))) && + match(Op1, m_And(m_Value(B), m_ConstantInt(Cst1)))) || + (Op1->hasOneUse() && + match(Op0, m_And(m_Value(B), m_ConstantInt(Cst1))) && + match(Op1, m_ZExt(m_Value(A))))) { + APInt Pow2 = Cst1->getValue() + 1; + if (Pow2.isPowerOf2() && isa<IntegerType>(A->getType()) && + Pow2.logBase2() == cast<IntegerType>(A->getType())->getBitWidth()) + return new ICmpInst(I.getPredicate(), A, + Builder->CreateTrunc(B, A->getType())); + } + // Transform "icmp eq (trunc (lshr(X, cst1)), cst" to // "icmp (and X, mask), cst" uint64_t ShAmt = 0; - ConstantInt *Cst1; if (Op0->hasOneUse() && match(Op0, m_Trunc(m_OneUse(m_LShr(m_Value(A), m_ConstantInt(ShAmt))))) && diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index b2f2e24..c485844 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -22,72 +22,6 @@ using namespace llvm; STATISTIC(NumDeadStore, "Number of dead stores eliminated"); -// Try to kill dead allocas by walking through its uses until we see some use -// that could escape. This is a conservative analysis which tries to handle -// GEPs, bitcasts, stores, and no-op intrinsics. These tend to be the things -// left after inlining and SROA finish chewing on an alloca. -static Instruction *removeDeadAlloca(InstCombiner &IC, AllocaInst &AI) { - SmallVector<Instruction *, 4> Worklist, DeadStores; - Worklist.push_back(&AI); - do { - Instruction *PI = Worklist.pop_back_val(); - for (Value::use_iterator UI = PI->use_begin(), UE = PI->use_end(); - UI != UE; ++UI) { - Instruction *I = cast<Instruction>(*UI); - switch (I->getOpcode()) { - default: - // Give up the moment we see something we can't handle. - return 0; - - case Instruction::GetElementPtr: - case Instruction::BitCast: - Worklist.push_back(I); - continue; - - case Instruction::Call: - // We can handle a limited subset of calls to no-op intrinsics. - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { - switch (II->getIntrinsicID()) { - case Intrinsic::dbg_declare: - case Intrinsic::dbg_value: - case Intrinsic::invariant_start: - case Intrinsic::invariant_end: - case Intrinsic::lifetime_start: - case Intrinsic::lifetime_end: - continue; - default: - return 0; - } - } - // Reject everything else. - return 0; - - case Instruction::Store: { - // Stores into the alloca are only live if the alloca is live. 
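On the (zext A) == (B & (1<<X)-1) --> A == (trunc B) transform added to InstCombineCompares above: the rewrite is sound precisely when the mask covers exactly A's width, since the AND then reduces B to that width and the wide compare collapses to the narrow one. A standalone check with i8/i32 stand-ins (illustration, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // (zext a) == (b & 0xFF) is equivalent to a == (trunc b) because the
  // mask width matches a's width; this holds for every a and b.
  uint8_t a = 0x5Au;
  const uint32_t bs[] = {0x1234565Au, 0xFFFFFF00u, 0x0000005Au};
  for (uint32_t b : bs)
    assert((uint32_t(a) == (b & 0xFFu)) == (a == uint8_t(b)));
  return 0;
}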
- StoreInst *SI = cast<StoreInst>(I); - // We can eliminate atomic stores, but not volatile. - if (SI->isVolatile()) - return 0; - // The store is only trivially safe if the poniter is the destination - // as opposed to the value. We're conservative here and don't check for - // the case where we store the address of a dead alloca into a dead - // alloca. - if (SI->getPointerOperand() != PI) - return 0; - DeadStores.push_back(I); - continue; - } - } - } - } while (!Worklist.empty()); - - // The alloca is dead. Kill off all the stores to it, and then replace it - // with undef. - while (!DeadStores.empty()) - IC.EraseInstFromFunction(*DeadStores.pop_back_val()); - return IC.ReplaceInstUsesWith(AI, UndefValue::get(AI.getType())); -} - Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // Ensure that the alloca array size argument has type intptr_t, so that // any casting is exposed early. @@ -106,7 +40,6 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) { Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue()); - assert(isa<AllocaInst>(AI) && "Unknown type of allocation inst!"); AllocaInst *New = Builder->CreateAlloca(NewTy, 0, AI.getName()); New->setAlignment(AI.getAlignment()); @@ -135,22 +68,54 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { } } - if (TD && isa<AllocaInst>(AI) && AI.getAllocatedType()->isSized()) { - // If alloca'ing a zero byte object, replace the alloca with a null pointer. - // Note that we only do this for alloca's, because malloc should allocate - // and return a unique pointer, even for a zero byte allocation. - if (TD->getTypeAllocSize(AI.getAllocatedType()) == 0) - return ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType())); - + if (TD && AI.getAllocatedType()->isSized()) { // If the alignment is 0 (unspecified), assign it the preferred alignment. if (AI.getAlignment() == 0) AI.setAlignment(TD->getPrefTypeAlignment(AI.getAllocatedType())); + + // Move all alloca's of zero byte objects to the entry block and merge them + // together. Note that we only do this for alloca's, because malloc should + // allocate and return a unique pointer, even for a zero byte allocation. + if (TD->getTypeAllocSize(AI.getAllocatedType()) == 0) { + // For a zero sized alloca there is no point in doing an array allocation. + // This is helpful if the array size is a complicated expression not used + // elsewhere. + if (AI.isArrayAllocation()) { + AI.setOperand(0, ConstantInt::get(AI.getArraySize()->getType(), 1)); + return &AI; + } + + // Get the first instruction in the entry block. + BasicBlock &EntryBlock = AI.getParent()->getParent()->getEntryBlock(); + Instruction *FirstInst = EntryBlock.getFirstNonPHIOrDbg(); + if (FirstInst != &AI) { + // If the entry block doesn't start with a zero-size alloca then move + // this one to the start of the entry block. There is no problem with + // dominance as the array size was forced to a constant earlier already. + AllocaInst *EntryAI = dyn_cast<AllocaInst>(FirstInst); + if (!EntryAI || !EntryAI->getAllocatedType()->isSized() || + TD->getTypeAllocSize(EntryAI->getAllocatedType()) != 0) { + AI.moveBefore(FirstInst); + return &AI; + } + + // Replace this zero-sized alloca with the one at the start of the entry + // block after ensuring that the address will be aligned enough for both + // types. 
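On the alignment computation that follows: preferred alignments are powers of two, so the larger of the two is a multiple of both, and raising the surviving entry-block alloca to the std::max of the two preferred alignments satisfies both users of the merged allocation. Checked in isolation (illustration, not part of the patch):

#include <algorithm>
#include <cassert>

int main() {
  // For power-of-two alignments (the only kind in play here),
  // max(a, b) is a multiple of both, so one alloca can serve both types.
  unsigned a = 8, b = 16;
  unsigned m = std::max(a, b);
  assert(m % a == 0 && m % b == 0);
  return 0;
}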
+ unsigned MaxAlign = + std::max(TD->getPrefTypeAlignment(EntryAI->getAllocatedType()), + TD->getPrefTypeAlignment(AI.getAllocatedType())); + EntryAI->setAlignment(MaxAlign); + if (AI.getType() != EntryAI->getType()) + return new BitCastInst(EntryAI, AI.getType()); + return ReplaceInstUsesWith(AI, EntryAI); + } + } } - // Try to aggressively remove allocas which are only used for GEPs, lifetime - // markers, and stores. This happens when SROA iteratively promotes stores - // out of the alloca, and we need to cleanup after it. - return removeDeadAlloca(*this, AI); + // At last, use the generic allocation site handler to aggressively remove + // unused allocas. + return visitAllocSite(AI); } diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 5168e2a..35a0bbb 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -464,9 +464,12 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { // X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2) { const APInt *CI; Value *N; - if (match(Op1, m_Shl(m_Power2(CI), m_Value(N)))) { + if (match(Op1, m_Shl(m_Power2(CI), m_Value(N))) || + match(Op1, m_ZExt(m_Shl(m_Power2(CI), m_Value(N))))) { if (*CI != 1) N = Builder->CreateAdd(N, ConstantInt::get(I.getType(),CI->logBase2())); + if (ZExtInst *Z = dyn_cast<ZExtInst>(Op1)) + N = Builder->CreateZExt(N, Z->getDestTy()); if (I.isExact()) return BinaryOperator::CreateExactLShr(Op0, N); return BinaryOperator::CreateLShr(Op0, N); diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index e727b2c..eb9945b 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -129,6 +129,12 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI, if (TI->isCast()) { if (TI->getOperand(0)->getType() != FI->getOperand(0)->getType()) return 0; + // The select condition may be a vector. We may only change the operand + // type if the vector width remains the same (and matches the condition). + Type *CondTy = SI.getCondition()->getType(); + if (CondTy->isVectorTy() && CondTy->getVectorNumElements() != + FI->getOperand(0)->getType()->getVectorNumElements()) + return 0; } else { return 0; // unknown unary op. } @@ -498,7 +504,7 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, // NOTE: if we wanted to, this is where to detect integer MIN/MAX - if (isa<Constant>(CmpRHS)) { + if (CmpRHS != CmpLHS && isa<Constant>(CmpRHS)) { if (CmpLHS == TrueVal && Pred == ICmpInst::ICMP_EQ) { // Transform (X == C) ? X : Y -> (X == C) ? C : Y SI.setOperand(1, CmpRHS); diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index b31049e..4bb2403 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -151,7 +151,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift, // We can always turn lshr(c1)+shl(c2) -> lshr(c3)+and(c4), but it isn't // profitable unless we know the and'd out bits are already zero. 
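On the visitUDiv change above: X udiv (C1 << N) with C1 == 1 << C2 is X >> (N + C2), and the new m_ZExt pattern lets the same rewrite fire when the shifted power of two was zero-extended, with the adjusted shift amount zero-extended to match. The scalar identity (illustration, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t x = 0xFFFFFFFFu, n = 3;
  assert(x / (1u << n) == (x >> n));        // C1 == 1, plain case
  assert(x / (4u << n) == (x >> (n + 2)));  // C1 == 4, so add log2(C1) == 2
  return 0;
}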
- if (CI->getZExtValue() > NumBits) { + if (CI->getValue().ult(TypeWidth) && CI->getZExtValue() > NumBits) { unsigned LowBits = CI->getZExtValue() - NumBits; if (MaskedValueIsZero(I->getOperand(0), APInt::getLowBitsSet(TypeWidth, NumBits) << LowBits)) @@ -529,6 +529,19 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, ShiftOp = 0; if (ShiftOp && isa<ConstantInt>(ShiftOp->getOperand(1))) { + + // This is a constant shift of a constant shift. Be careful about hiding + // shl instructions behind bit masks. They are used to represent multiplies + // by a constant, and it is important that simple arithmetic expressions + // are still recognizable by scalar evolution. + // + // The transforms applied to shl are very similar to the transforms applied + // to mul by constant. We can be more aggressive about optimizing right + // shifts. + // + // Combinations of right and left shifts will still be optimized in + // DAGCombine where scalar evolution no longer applies. + ConstantInt *ShiftAmt1C = cast<ConstantInt>(ShiftOp->getOperand(1)); uint32_t ShiftAmt1 = ShiftAmt1C->getLimitedValue(TypeBits); uint32_t ShiftAmt2 = Op1->getLimitedValue(TypeBits); @@ -554,13 +567,6 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, } if (ShiftAmt1 == ShiftAmt2) { - // If we have ((X >>? C) << C), turn this into X & (-1 << C). - if (I.getOpcode() == Instruction::Shl && - ShiftOp->getOpcode() != Instruction::Shl) { - APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt1)); - return BinaryOperator::CreateAnd(X, - ConstantInt::get(I.getContext(),Mask)); - } // If we have ((X << C) >>u C), turn this into X & (-1 >>u C). if (I.getOpcode() == Instruction::LShr && ShiftOp->getOpcode() == Instruction::Shl) { @@ -570,28 +576,23 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, } } else if (ShiftAmt1 < ShiftAmt2) { uint32_t ShiftDiff = ShiftAmt2-ShiftAmt1; - - // (X >>? C1) << C2 --> X << (C2-C1) & (-1 << C2) + + // (X >>?,exact C1) << C2 --> X << (C2-C1) + // The inexact version is deferred to DAGCombine so we don't hide shl + // behind a bit mask. if (I.getOpcode() == Instruction::Shl && - ShiftOp->getOpcode() != Instruction::Shl) { + ShiftOp->getOpcode() != Instruction::Shl && + ShiftOp->isExact()) { assert(ShiftOp->getOpcode() == Instruction::LShr || ShiftOp->getOpcode() == Instruction::AShr); ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff); - if (ShiftOp->isExact()) { - // (X >>?,exact C1) << C2 --> X << (C2-C1) - BinaryOperator *NewShl = BinaryOperator::Create(Instruction::Shl, - X, ShiftDiffCst); - NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap()); - NewShl->setHasNoSignedWrap(I.hasNoSignedWrap()); - return NewShl; - } - Value *Shift = Builder->CreateShl(X, ShiftDiffCst); - - APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt2)); - return BinaryOperator::CreateAnd(Shift, - ConstantInt::get(I.getContext(),Mask)); + BinaryOperator *NewShl = BinaryOperator::Create(Instruction::Shl, + X, ShiftDiffCst); + NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap()); + NewShl->setHasNoSignedWrap(I.hasNoSignedWrap()); + return NewShl; } - + // (X << C1) >>u C2 --> X >>u (C2-C1) & (-1 >> C2) if (I.getOpcode() == Instruction::LShr && ShiftOp->getOpcode() == Instruction::Shl) { @@ -627,24 +628,19 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, assert(ShiftAmt2 < ShiftAmt1); uint32_t ShiftDiff = ShiftAmt1-ShiftAmt2; - // (X >>? C1) << C2 --> X >>? 
(C1-C2) & (-1 << C2) + // (X >>?exact C1) << C2 --> X >>?exact (C1-C2) + // The inexact version is deferred to DAGCombine so we don't hide shl + // behind a bit mask. if (I.getOpcode() == Instruction::Shl && - ShiftOp->getOpcode() != Instruction::Shl) { + ShiftOp->getOpcode() != Instruction::Shl && + ShiftOp->isExact()) { ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff); - if (ShiftOp->isExact()) { - // (X >>?exact C1) << C2 --> X >>?exact (C1-C2) - BinaryOperator *NewShr = BinaryOperator::Create(ShiftOp->getOpcode(), - X, ShiftDiffCst); - NewShr->setIsExact(true); - return NewShr; - } - Value *Shift = Builder->CreateBinOp(ShiftOp->getOpcode(), - X, ShiftDiffCst); - APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt2)); - return BinaryOperator::CreateAnd(Shift, - ConstantInt::get(I.getContext(),Mask)); + BinaryOperator *NewShr = BinaryOperator::Create(ShiftOp->getOpcode(), + X, ShiftDiffCst); + NewShr->setIsExact(true); + return NewShr; } - + // (X << C1) >>u C2 --> X << (C1-C2) & (-1 >> C2) if (I.getOpcode() == Instruction::LShr && ShiftOp->getOpcode() == Instruction::Shl) { diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 066b2ec..68ecd51 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -87,30 +87,34 @@ void InstCombiner::getAnalysisUsage(AnalysisUsage &AU) const { } +Value *InstCombiner::EmitGEPOffset(User *GEP) { + return llvm::EmitGEPOffset(Builder, *getTargetData(), GEP); +} + /// ShouldChangeType - Return true if it is desirable to convert a computation /// from 'From' to 'To'. We don't want to convert from a legal to an illegal /// type for example, or from a smaller to a larger illegal type. bool InstCombiner::ShouldChangeType(Type *From, Type *To) const { assert(From->isIntegerTy() && To->isIntegerTy()); - + // If we don't have TD, we don't know if the source/dest are legal. if (!TD) return false; - + unsigned FromWidth = From->getPrimitiveSizeInBits(); unsigned ToWidth = To->getPrimitiveSizeInBits(); bool FromLegal = TD->isLegalInteger(FromWidth); bool ToLegal = TD->isLegalInteger(ToWidth); - + // If this is a legal integer from type, and the result would be an illegal // type, don't do the transformation. if (FromLegal && !ToLegal) return false; - + // Otherwise, if both are illegal, do not increase the size of the result. We // do allow things like i160 -> i64, but not i64 -> i160. if (!FromLegal && !ToLegal && ToWidth > FromWidth) return false; - + return true; } @@ -127,7 +131,7 @@ static bool MaintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) { // We reason about Add and Sub Only. Instruction::BinaryOps Opcode = I.getOpcode(); - if (Opcode != Instruction::Add && + if (Opcode != Instruction::Add && Opcode != Instruction::Sub) { return false; } @@ -203,7 +207,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { // Conservatively clear the optional flags, since they may not be // preserved by the reassociation. if (MaintainNoSignedWrap(I, B, C) && - (!Op0 || (isa<BinaryOperator>(Op0) && Op0->hasNoSignedWrap()))) { + (!Op0 || (isa<BinaryOperator>(Op0) && Op0->hasNoSignedWrap()))) { // Note: this is only valid because SimplifyBinOp doesn't look at // the operands to Op0. 
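Returning to the FoldShiftByConstant hunks above: the dropped ((X >>u C) << C) --> X & (-1 << C) style rewrites are still arithmetically correct; they are deferred to DAGCombine only so the shl stays visible to ScalarEvolution, which recognizes shifts as multiplies but not the masked form. The underlying identity (illustration, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // Clearing the low C bits with a shift pair equals masking by -1 << C.
  uint32_t x = 0xDEADBEEFu;
  const unsigned C = 4;
  assert(((x >> C) << C) == (x & (0xFFFFFFFFu << C)));
  return 0;
}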
I.clearSubclassOptionalData(); @@ -211,7 +215,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { } else { I.clearSubclassOptionalData(); } - + Changed = true; ++NumReassoc; continue; @@ -540,7 +544,7 @@ static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO, Value *Op0 = SO, *Op1 = ConstOperand; if (!ConstIsRHS) std::swap(Op0, Op1); - + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&I)) return IC->Builder->CreateBinOp(BO->getOpcode(), Op0, Op1, SO->getName()+".op"); @@ -579,7 +583,7 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { if (SrcTy && SrcTy->getNumElements() != DestTy->getNumElements()) return 0; } - + Value *SelectTrueVal = FoldOperationIntoSelectOperand(Op, TV, this); Value *SelectFalseVal = FoldOperationIntoSelectOperand(Op, FV, this); @@ -599,7 +603,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { unsigned NumPHIValues = PN->getNumIncomingValues(); if (NumPHIValues == 0) return 0; - + // We normally only transform phis with a single use. However, if a PHI has // multiple uses and they are all the same operation, we can fold *all* of the // uses into the PHI. @@ -613,7 +617,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { } // Otherwise, we can replace *all* users with the new PHI we form. } - + // Check to see if all of the operands of the PHI are simple constants // (constantint/constantfp/undef). If there is one non-constant value, // remember the BB it is in. If there is more than one or if *it* is a PHI, @@ -627,7 +631,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { if (isa<PHINode>(InVal)) return 0; // Itself a phi. if (NonConstBB) return 0; // More than one non-const value. - + NonConstBB = PN->getIncomingBlock(i); // If the InVal is an invoke at the end of the pred block, then we can't @@ -635,14 +639,14 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { if (InvokeInst *II = dyn_cast<InvokeInst>(InVal)) if (II->getParent() == NonConstBB) return 0; - + // If the incoming non-constant value is in I's block, we will remove one // instruction, but insert another equivalent one, leading to infinite // instcombine. if (NonConstBB == I.getParent()) return 0; } - + // If there is exactly one non-constant value, we can insert a copy of the // operation in that block. However, if this is a critical edge, we would be // inserting the computation one some other paths (e.g. inside a loop). Only @@ -656,12 +660,12 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { PHINode *NewPN = PHINode::Create(I.getType(), PN->getNumIncomingValues()); InsertNewInstBefore(NewPN, *PN); NewPN->takeName(PN); - + // If we are going to have to insert a new computation, do so right before the // predecessors terminator. if (NonConstBB) Builder->SetInsertPoint(NonConstBB->getTerminator()); - + // Next, add all of the operands to the PHI. 
if (SelectInst *SI = dyn_cast<SelectInst>(&I)) { // We only currently try to fold the condition of a select when it is a phi, @@ -706,20 +710,20 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { PN->getIncomingValue(i), C, "phitmp"); NewPN->addIncoming(InV, PN->getIncomingBlock(i)); } - } else { + } else { CastInst *CI = cast<CastInst>(&I); Type *RetTy = CI->getType(); for (unsigned i = 0; i != NumPHIValues; ++i) { Value *InV; if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i))) InV = ConstantExpr::getCast(CI->getOpcode(), InC, RetTy); - else + else InV = Builder->CreateCast(CI->getOpcode(), PN->getIncomingValue(i), I.getType(), "phitmp"); NewPN->addIncoming(InV, PN->getIncomingBlock(i)); } } - + for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end(); UI != E; ) { Instruction *User = cast<Instruction>(*UI++); @@ -734,11 +738,11 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { /// or not there is a sequence of GEP indices into the type that will land us at /// the specified offset. If so, fill them into NewIndices and return the /// resultant element type, otherwise return null. -Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset, +Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset, SmallVectorImpl<Value*> &NewIndices) { if (!TD) return 0; if (!Ty->isSized()) return 0; - + // Start with the index over the outer type. Note that the type size // might be zero (even if the offset isn't zero) if the indexed type // is something like [0 x {int, int}] @@ -747,7 +751,7 @@ Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset, if (int64_t TySize = TD->getTypeAllocSize(Ty)) { FirstIdx = Offset/TySize; Offset -= FirstIdx*TySize; - + // Handle hosts where % returns negative instead of values [0..TySize). if (Offset < 0) { --FirstIdx; @@ -756,24 +760,24 @@ Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset, } assert((uint64_t)Offset < (uint64_t)TySize && "Out of range offset"); } - + NewIndices.push_back(ConstantInt::get(IntPtrTy, FirstIdx)); - + // Index into the types. If we fail, set OrigBase to null. while (Offset) { // Indexing into tail padding between struct/array elements. if (uint64_t(Offset*8) >= TD->getTypeSizeInBits(Ty)) return 0; - + if (StructType *STy = dyn_cast<StructType>(Ty)) { const StructLayout *SL = TD->getStructLayout(STy); assert(Offset < (int64_t)SL->getSizeInBytes() && "Offset must stay within the indexed type"); - + unsigned Elt = SL->getElementContainingOffset(Offset); NewIndices.push_back(ConstantInt::get(Type::getInt32Ty(Ty->getContext()), Elt)); - + Offset -= SL->getElementOffset(Elt); Ty = STy->getElementType(Elt); } else if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) { @@ -787,7 +791,7 @@ Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset, return 0; } } - + return Ty; } @@ -948,7 +952,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { Res->setIsInBounds(GEP.isInBounds()); return Res; } - + if (ArrayType *XATy = dyn_cast<ArrayType>(StrippedPtrTy->getElementType())){ // GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ... ? 
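FindElementAtOffset above turns a byte offset into GEP indices: divide by the allocation size of the element to get the array index, then let the struct layout pick the field containing the remainder. The same arithmetic at the source level, with a hypothetical Pair type (illustration, not part of the patch):

#include <cassert>
#include <cstddef>

struct Pair { int a; int b; };  // hypothetical example type

int main() {
  // Byte offset 4 into a Pair array: array element 0, then field b
  // (assumes sizeof(int) == 4 and no padding, as on common targets).
  size_t Offset = 4;
  size_t Elt    = Offset / sizeof(Pair);  // FirstIdx in the real code
  size_t Within = Offset % sizeof(Pair);  // remaining offset in the element
  assert(Elt == 0 && Within == offsetof(Pair, b));
  return 0;
}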
@@ -981,16 +985,16 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // V and GEP are both pointer types --> BitCast return new BitCastInst(NewGEP, GEP.getType()); } - + // Transform things like: // getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp // (where tmp = 8*tmp2) into: // getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast - + if (TD && SrcElTy->isArrayTy() && ResElTy->isIntegerTy(8)) { uint64_t ArrayEltSize = TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType()); - + // Check to see if "tmp" is a scale by a multiple of ArrayEltSize. We // allow either a mul, shift, or constant here. Value *NewIdx = 0; @@ -1015,7 +1019,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { NewIdx = Inst->getOperand(0); } } - + // If the index will be to exactly the right offset with the scale taken // out, perform the transformation. Note, we don't know whether Scale is // signed or not. We'll use unsigned version of division/modulo @@ -1054,10 +1058,9 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { !isa<BitCastInst>(BCI->getOperand(0)) && GEP.hasAllConstantIndices() && StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace()) { - // Determine how much the GEP moves the pointer. We are guaranteed to get - // a constant back from EmitGEPOffset. - ConstantInt *OffsetV = cast<ConstantInt>(EmitGEPOffset(&GEP)); - int64_t Offset = OffsetV->getSExtValue(); + // Determine how much the GEP moves the pointer. + SmallVector<Value*, 8> Ops(GEP.idx_begin(), GEP.idx_end()); + int64_t Offset = TD->getIndexedOffset(GEP.getPointerOperandType(), Ops); // If this GEP instruction doesn't move the pointer, just replace the GEP // with a bitcast of the real input to the dest type. @@ -1065,7 +1068,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // If the bitcast is of an allocation, and the allocation will be // converted to match the type of the cast, don't touch this. if (isa<AllocaInst>(BCI->getOperand(0)) || - isMalloc(BCI->getOperand(0))) { + isAllocationFn(BCI->getOperand(0))) { // See if the bitcast simplifies, if so, don't nuke this GEP yet. if (Instruction *I = visitBitCast(*BCI)) { if (I != BCI) { @@ -1078,7 +1081,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { } return new BitCastInst(BCI->getOperand(0), GEP.getType()); } - + // Otherwise, if the offset is non-zero, we need to find out if there is a // field at Offset in 'A's type. If so, we can pull the cast through the // GEP. @@ -1089,68 +1092,103 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { Value *NGEP = GEP.isInBounds() ? 
Builder->CreateInBoundsGEP(BCI->getOperand(0), NewIndices) : Builder->CreateGEP(BCI->getOperand(0), NewIndices); - + if (NGEP->getType() == GEP.getType()) return ReplaceInstUsesWith(GEP, NGEP); NGEP->takeName(&GEP); return new BitCastInst(NGEP, GEP.getType()); } } - } - + } + return 0; } -static bool IsOnlyNullComparedAndFreed(Value *V, SmallVectorImpl<WeakVH> &Users, - int Depth = 0) { - if (Depth == 8) - return false; +static bool +isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users) { + SmallVector<Instruction*, 4> Worklist; + Worklist.push_back(AI); - for (Value::use_iterator UI = V->use_begin(), UE = V->use_end(); - UI != UE; ++UI) { - User *U = *UI; - if (isFreeCall(U)) { - Users.push_back(U); - continue; - } - if (ICmpInst *ICI = dyn_cast<ICmpInst>(U)) { - if (ICI->isEquality() && isa<ConstantPointerNull>(ICI->getOperand(1))) { - Users.push_back(ICI); + do { + Instruction *PI = Worklist.pop_back_val(); + for (Value::use_iterator UI = PI->use_begin(), UE = PI->use_end(); UI != UE; + ++UI) { + Instruction *I = cast<Instruction>(*UI); + switch (I->getOpcode()) { + default: + // Give up the moment we see something we can't handle. + return false; + + case Instruction::BitCast: + case Instruction::GetElementPtr: + Users.push_back(I); + Worklist.push_back(I); continue; - } - } - if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) { - if (IsOnlyNullComparedAndFreed(BCI, Users, Depth+1)) { - Users.push_back(BCI); + + case Instruction::ICmp: { + ICmpInst *ICI = cast<ICmpInst>(I); + // We can fold eq/ne comparisons with null to false/true, respectively. + if (!ICI->isEquality() || !isa<ConstantPointerNull>(ICI->getOperand(1))) + return false; + Users.push_back(I); continue; } - } - if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) { - if (IsOnlyNullComparedAndFreed(GEPI, Users, Depth+1)) { - Users.push_back(GEPI); + + case Instruction::Call: + // Ignore no-op and store intrinsics. + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + default: + return false; + + case Intrinsic::memmove: + case Intrinsic::memcpy: + case Intrinsic::memset: { + MemIntrinsic *MI = cast<MemIntrinsic>(II); + if (MI->isVolatile() || MI->getRawDest() != PI) + return false; + } + // fall through + case Intrinsic::dbg_declare: + case Intrinsic::dbg_value: + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::objectsize: + Users.push_back(I); + continue; + } + } + + if (isFreeCall(I)) { + Users.push_back(I); + continue; + } + return false; + + case Instruction::Store: { + StoreInst *SI = cast<StoreInst>(I); + if (SI->isVolatile() || SI->getPointerOperand() != PI) + return false; + Users.push_back(I); continue; } - } - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) { - if (II->getIntrinsicID() == Intrinsic::lifetime_start || - II->getIntrinsicID() == Intrinsic::lifetime_end) { - Users.push_back(II); - continue; } + llvm_unreachable("missing a return?"); } - return false; - } + } while (!Worklist.empty()); return true; } -Instruction *InstCombiner::visitMalloc(Instruction &MI) { +Instruction *InstCombiner::visitAllocSite(Instruction &MI) { // If we have a malloc call which is only used in any amount of comparisons // to null and free calls, delete the calls and replace the comparisons with // true or false as appropriate. 
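The isAllocSiteRemovable walk above generalizes the old IsOnlyNullComparedAndFreed check: besides null compares and frees it now tolerates bitcasts, GEPs, non-volatile stores into the allocation, and a whitelist of no-op or memory intrinsics whose destination is the allocation, giving up on anything else. At the source level, a function shaped like the following (illustration, not part of the patch) now folds away completely:

#include <cstdlib>

// Nothing about the allocation escapes or is observable, so InstCombine
// can erase the whole pattern: the null test folds to a constant, the
// store into the dead memory is dropped, and malloc/free disappear.
static void no_observable_use(void) {
  int *p = static_cast<int *>(std::malloc(sizeof(int)));
  if (p != nullptr) {
    *p = 42;  // store whose pointer operand is the allocation itself
    std::free(p);
  }
}

int main() { no_observable_use(); return 0; }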
SmallVector<WeakVH, 64> Users; - if (IsOnlyNullComparedAndFreed(&MI, Users)) { + if (isAllocSiteRemovable(&MI, Users)) { for (unsigned i = 0, e = Users.size(); i != e; ++i) { Instruction *I = cast_or_null<Instruction>(&*Users[i]); if (!I) continue; @@ -1161,9 +1199,23 @@ Instruction *InstCombiner::visitMalloc(Instruction &MI) { C->isFalseWhenEqual())); } else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) { ReplaceInstUsesWith(*I, UndefValue::get(I->getType())); + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + if (II->getIntrinsicID() == Intrinsic::objectsize) { + ConstantInt *CI = cast<ConstantInt>(II->getArgOperand(1)); + uint64_t DontKnow = CI->isZero() ? -1ULL : 0; + ReplaceInstUsesWith(*I, ConstantInt::get(I->getType(), DontKnow)); + } } EraseInstFromFunction(*I); } + + if (InvokeInst *II = dyn_cast<InvokeInst>(&MI)) { + // Replace invoke with a NOP intrinsic to maintain the original CFG + Module *M = II->getParent()->getParent()->getParent(); + Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing); + InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(), + ArrayRef<Value *>(), "", II->getParent()); + } return EraseInstFromFunction(MI); } return 0; @@ -1181,7 +1233,7 @@ Instruction *InstCombiner::visitFree(CallInst &FI) { UndefValue::get(Type::getInt1PtrTy(FI.getContext()))); return EraseInstFromFunction(FI); } - + // If we have 'free null' delete the instruction. This can happen in stl code // when lots of inlining happens. if (isa<ConstantPointerNull>(Op)) @@ -1207,14 +1259,14 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { // Cannonicalize fcmp_one -> fcmp_oeq FCmpInst::Predicate FPred; Value *Y; - if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)), + if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)), TrueDest, FalseDest)) && BI.getCondition()->hasOneUse()) if (FPred == FCmpInst::FCMP_ONE || FPred == FCmpInst::FCMP_OLE || FPred == FCmpInst::FCMP_OGE) { FCmpInst *Cond = cast<FCmpInst>(BI.getCondition()); Cond->setPredicate(FCmpInst::getInversePredicate(FPred)); - + // Swap Destinations and condition. BI.swapSuccessors(); Worklist.Add(Cond); @@ -1280,7 +1332,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { } return 0; // Can't handle other constants } - + if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) { // We're extracting from an insertvalue instruction, compare the indices const unsigned *exti, *exte, *insi, *inse; @@ -1329,7 +1381,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { // %E = extractvalue { i32, { i32 } } %I, 1, 0 // with // %E extractvalue { i32 } { i32 42 }, 0 - return ExtractValueInst::Create(IV->getInsertedValueOperand(), + return ExtractValueInst::Create(IV->getInsertedValueOperand(), makeArrayRef(exti, exte)); } if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Agg)) { @@ -1349,7 +1401,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { EraseInstFromFunction(*II); return BinaryOperator::CreateAdd(LHS, RHS); } - + // If the normal result of the add is dead, and the RHS is a constant, // we can transform this into a range comparison. // overflow = uadd a, -4 --> overflow = icmp ugt a, 3 @@ -1798,7 +1850,7 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { /// many instructions are dead or constant). Additionally, if we find a branch /// whose condition is a known constant, we only visit the reachable successors. 
/// -static bool AddReachableCodeToWorklist(BasicBlock *BB, +static bool AddReachableCodeToWorklist(BasicBlock *BB, SmallPtrSet<BasicBlock*, 64> &Visited, InstCombiner &IC, const TargetData *TD, @@ -1812,13 +1864,13 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, do { BB = Worklist.pop_back_val(); - + // We have now visited this block! If we've already been here, ignore it. if (!Visited.insert(BB)) continue; for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) { Instruction *Inst = BBI++; - + // DCE instruction if trivially dead. if (isInstructionTriviallyDead(Inst)) { ++NumDeadInst; @@ -1826,7 +1878,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, Inst->eraseFromParent(); continue; } - + // ConstantProp instruction if trivially constant. if (!Inst->use_empty() && isa<Constant>(Inst->getOperand(0))) if (Constant *C = ConstantFoldInstruction(Inst, TD, TLI)) { @@ -1837,7 +1889,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, Inst->eraseFromParent(); continue; } - + if (TD) { // See if we can constant fold its operands. for (User::op_iterator i = Inst->op_begin(), e = Inst->op_end(); @@ -1881,17 +1933,17 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, Worklist.push_back(ReachableBB); continue; } - + // Otherwise it is the default destination. Worklist.push_back(SI->getDefaultDest()); continue; } } - + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) Worklist.push_back(TI->getSuccessor(i)); } while (!Worklist.empty()); - + // Once we've found all of the instructions to add to instcombine's worklist, // add them in reverse order. This way instcombine will visit from the top // of the function down. This jives well with the way that it adds all uses @@ -1899,13 +1951,13 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, // some N^2 behavior in pathological cases. IC.Worklist.AddInitialGroup(&InstrsForInstCombineWorklist[0], InstrsForInstCombineWorklist.size()); - + return MadeIRChange; } bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { MadeIRChange = false; - + DEBUG(errs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " << F.getName() << "\n"); @@ -1976,13 +2028,13 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { BasicBlock *BB = I->getParent(); Instruction *UserInst = cast<Instruction>(I->use_back()); BasicBlock *UserParent; - + // Get the block the use occurs in. if (PHINode *PN = dyn_cast<PHINode>(UserInst)) UserParent = PN->getIncomingBlock(I->use_begin().getUse()); else UserParent = UserInst->getParent(); - + if (UserParent != BB) { bool UserIsSuccessor = false; // See if the user is one of our successors. @@ -2004,7 +2056,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { // Now that we have an instruction, try combining it to simplify it. Builder->SetInsertPoint(I->getParent(), I); Builder->SetCurrentDebugLocation(I->getDebugLoc()); - + #ifndef NDEBUG std::string OrigI; #endif @@ -2069,14 +2121,14 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { bool InstCombiner::runOnFunction(Function &F) { TD = getAnalysisIfAvailable<TargetData>(); TLI = &getAnalysis<TargetLibraryInfo>(); - + /// Builder - This is an IRBuilder that automatically inserts new /// instructions into the worklist when they are created. 
- IRBuilder<true, TargetFolder, InstCombineIRInserter> + IRBuilder<true, TargetFolder, InstCombineIRInserter> TheBuilder(F.getContext(), TargetFolder(TD), InstCombineIRInserter(Worklist)); Builder = &TheBuilder; - + bool EverMadeChange = false; // Lower dbg.declare intrinsics otherwise their value may be clobbered @@ -2087,7 +2139,7 @@ bool InstCombiner::runOnFunction(Function &F) { unsigned Iteration = 0; while (DoOneIteration(F, Iteration++)) EverMadeChange = true; - + Builder = 0; return EverMadeChange; } diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 7fb33f7..3368026 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -16,6 +16,13 @@ #define DEBUG_TYPE "asan" #include "FunctionBlackList.h" +#include "llvm/Function.h" +#include "llvm/IRBuilder.h" +#include "llvm/InlineAsm.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/LLVMContext.h" +#include "llvm/Module.h" +#include "llvm/Type.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallSet.h" @@ -23,14 +30,9 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" -#include "llvm/Function.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/LLVMContext.h" -#include "llvm/Module.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/system_error.h" #include "llvm/Target/TargetData.h" @@ -38,7 +40,6 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ModuleUtils.h" -#include "llvm/Type.h" #include <string> #include <algorithm> @@ -72,6 +73,9 @@ static const int kAsanStackMidRedzoneMagic = 0xf2; static const int kAsanStackRightRedzoneMagic = 0xf3; static const int kAsanStackPartialRedzoneMagic = 0xf4; +// Access sizes are powers of two: 1, 2, 4, 8, 16. +static const size_t kNumberOfAccessSizes = 5; + // Command-line flags. // This flag may need to be replaced with -f[no-]asan-reads. @@ -79,6 +83,20 @@ static cl::opt<bool> ClInstrumentReads("asan-instrument-reads", cl::desc("instrument read instructions"), cl::Hidden, cl::init(true)); static cl::opt<bool> ClInstrumentWrites("asan-instrument-writes", cl::desc("instrument write instructions"), cl::Hidden, cl::init(true)); +static cl::opt<bool> ClInstrumentAtomics("asan-instrument-atomics", + cl::desc("instrument atomic instructions (rmw, cmpxchg)"), + cl::Hidden, cl::init(true)); +static cl::opt<bool> ClMergeCallbacks("asan-merge-callbacks", + cl::desc("merge __asan_report_ callbacks to create fewer BBs"), + cl::Hidden, cl::init(false)); +// This flag limits the number of instructions to be instrumented +// in any given BB. Normally, this should be set to unlimited (INT_MAX), +// but due to http://llvm.org/bugs/show_bug.cgi?id=12652 we temporarily +// set it to 10000. +static cl::opt<int> ClMaxInsnsToInstrumentPerBB("asan-max-ins-per-bb", + cl::init(10000), + cl::desc("maximal number of instructions to instrument in any given BB"), + cl::Hidden); // This flag may need to be replaced with -f[no]asan-stack.
static cl::opt<bool> ClStack("asan-stack", cl::desc("Handle stack memory"), cl::Hidden, cl::init(true)); @@ -127,18 +145,42 @@ static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug man inst"), namespace { +/// When the crash callbacks are merged, they receive some amount of arguments +/// that are merged in a PHI node. This struct represents arguments from one +/// call site. +struct CrashArg { + Value *Arg1; + Value *Arg2; +}; + +/// An object of this type is created while instrumenting every function. +struct AsanFunctionContext { + AsanFunctionContext(Function &Function) : F(Function), CrashBlock() { } + + Function &F; + // These are initially zero. If we require at least one call to + // __asan_report_{read,write}{1,2,4,8,16}, an appropriate BB is created. + BasicBlock *CrashBlock[2][kNumberOfAccessSizes]; + typedef SmallVector<CrashArg, 8> CrashArgsVec; + CrashArgsVec CrashArgs[2][kNumberOfAccessSizes]; +}; + /// AddressSanitizer: instrument the code in module to find memory bugs. struct AddressSanitizer : public ModulePass { AddressSanitizer(); virtual const char *getPassName() const; - void instrumentMop(Instruction *I); - void instrumentAddress(Instruction *OrigIns, IRBuilder<> &IRB, + void instrumentMop(AsanFunctionContext &AFC, Instruction *I); + void instrumentAddress(AsanFunctionContext &AFC, + Instruction *OrigIns, IRBuilder<> &IRB, Value *Addr, uint32_t TypeSize, bool IsWrite); - Instruction *generateCrashCode(IRBuilder<> &IRB, Value *Addr, - bool IsWrite, uint32_t TypeSize); - bool instrumentMemIntrinsic(MemIntrinsic *MI); - void instrumentMemIntrinsicParam(Instruction *OrigIns, Value *Addr, - Value *Size, + Value *createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, + Value *ShadowValue, uint32_t TypeSize); + Instruction *generateCrashCode(BasicBlock *BB, Value *Addr, Value *PC, + bool IsWrite, size_t AccessSizeIndex); + bool instrumentMemIntrinsic(AsanFunctionContext &AFC, MemIntrinsic *MI); + void instrumentMemIntrinsicParam(AsanFunctionContext &AFC, + Instruction *OrigIns, Value *Addr, + Value *Size, Instruction *InsertBefore, bool IsWrite); Value *memToShadow(Value *Shadow, IRBuilder<> &IRB); bool handleFunction(Module &M, Function &F); @@ -146,7 +188,6 @@ struct AddressSanitizer : public ModulePass { bool poisonStackInFunction(Module &M, Function &F); virtual bool runOnModule(Module &M); bool insertGlobalRedzones(Module &M); - BranchInst *splitBlockAndInsertIfThen(Instruction *SplitBefore, Value *Cmp); static char ID; // Pass identification, replacement for typeid private: @@ -165,11 +206,11 @@ struct AddressSanitizer : public ModulePass { return getAlignedSize(SizeInBytes); } + Function *checkInterfaceFunction(Constant *FuncOrBitcast); void PoisonStack(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB, Value *ShadowBase, bool DoPoison); bool LooksLikeCodeInBug11395(Instruction *I); - Module *CurrentModule; LLVMContext *C; TargetData *TD; uint64_t MappingOffset; @@ -182,7 +223,11 @@ struct AddressSanitizer : public ModulePass { Function *AsanInitFunction; Instruction *CtorInsertBefore; OwningPtr<FunctionBlackList> BL; + // This array is indexed by AccessIsWrite and log2(AccessSize). 
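That two-dimensional indexing (AccessIsWrite picks the row, log2 of the access size in bytes picks the column) is what TypeSizeToSizeIndex, defined just below, computes via CountTrailingZeros_32. The mapping in plain C++, with the LLVM helper replaced by a loop (illustration, not part of the patch):

#include <cassert>
#include <cstdint>

// Mirrors TypeSizeToSizeIndex: bits -> bytes -> log2(bytes).
static unsigned sizeIndex(uint32_t TypeSizeInBits) {
  unsigned Idx = 0;
  for (uint32_t Bytes = TypeSizeInBits / 8; Bytes > 1; Bytes >>= 1)
    ++Idx;
  return Idx;
}

int main() {
  // 1-, 2-, 4-, 8- and 16-byte accesses map to indices 0..4, matching
  // kNumberOfAccessSizes == 5.
  assert(sizeIndex(8) == 0 && sizeIndex(16) == 1 && sizeIndex(32) == 2);
  assert(sizeIndex(64) == 3 && sizeIndex(128) == 4);
  return 0;
}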
+ Function *AsanErrorCallback[2][kNumberOfAccessSizes]; + InlineAsm *EmptyAsm; }; + } // namespace char AddressSanitizer::ID = 0; @@ -198,6 +243,12 @@ const char *AddressSanitizer::getPassName() const { return "AddressSanitizer"; } +static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { + size_t Res = CountTrailingZeros_32(TypeSize / 8); + assert(Res < kNumberOfAccessSizes); + return Res; +} + // Create a constant for Str so that we can pass it to the run-time lib. static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str) { Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str); @@ -208,29 +259,32 @@ static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str) { // Split the basic block and insert an if-then code. // Before: // Head -// SplitBefore +// Cmp // Tail // After: // Head // if (Cmp) -// NewBasicBlock -// SplitBefore +// ThenBlock // Tail // -// Returns the NewBasicBlock's terminator. -BranchInst *AddressSanitizer::splitBlockAndInsertIfThen( - Instruction *SplitBefore, Value *Cmp) { +// If ThenBlock is zero, a new block is created and its terminator is returned. +// Otherwize 0 is returned. +static BranchInst *splitBlockAndInsertIfThen(Value *Cmp, + BasicBlock *ThenBlock = 0) { + Instruction *SplitBefore = cast<Instruction>(Cmp)->getNextNode(); BasicBlock *Head = SplitBefore->getParent(); BasicBlock *Tail = Head->splitBasicBlock(SplitBefore); TerminatorInst *HeadOldTerm = Head->getTerminator(); - BasicBlock *NewBasicBlock = - BasicBlock::Create(*C, "", Head->getParent()); - BranchInst *HeadNewTerm = BranchInst::Create(/*ifTrue*/NewBasicBlock, - /*ifFalse*/Tail, - Cmp); + BranchInst *CheckTerm = 0; + if (!ThenBlock) { + LLVMContext &C = Head->getParent()->getParent()->getContext(); + ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); + CheckTerm = BranchInst::Create(Tail, ThenBlock); + } + BranchInst *HeadNewTerm = + BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/Tail, Cmp); ReplaceInstWithInst(HeadOldTerm, HeadNewTerm); - BranchInst *CheckTerm = BranchInst::Create(Tail, NewBasicBlock); return CheckTerm; } @@ -244,12 +298,13 @@ Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) { MappingOffset)); } -void AddressSanitizer::instrumentMemIntrinsicParam(Instruction *OrigIns, +void AddressSanitizer::instrumentMemIntrinsicParam( + AsanFunctionContext &AFC, Instruction *OrigIns, Value *Addr, Value *Size, Instruction *InsertBefore, bool IsWrite) { // Check the first byte. { IRBuilder<> IRB(InsertBefore); - instrumentAddress(OrigIns, IRB, Addr, 8, IsWrite); + instrumentAddress(AFC, OrigIns, IRB, Addr, 8, IsWrite); } // Check the last byte. { @@ -259,15 +314,16 @@ void AddressSanitizer::instrumentMemIntrinsicParam(Instruction *OrigIns, SizeMinusOne = IRB.CreateIntCast(SizeMinusOne, IntptrTy, false); Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); Value *AddrPlusSizeMinisOne = IRB.CreateAdd(AddrLong, SizeMinusOne); - instrumentAddress(OrigIns, IRB, AddrPlusSizeMinisOne, 8, IsWrite); + instrumentAddress(AFC, OrigIns, IRB, AddrPlusSizeMinisOne, 8, IsWrite); } } // Instrument memset/memmove/memcpy -bool AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) { +bool AddressSanitizer::instrumentMemIntrinsic(AsanFunctionContext &AFC, + MemIntrinsic *MI) { Value *Dst = MI->getDest(); MemTransferInst *MemTran = dyn_cast<MemTransferInst>(MI); - Value *Src = MemTran ? MemTran->getSource() : NULL; + Value *Src = MemTran ? 
MemTran->getSource() : 0; Value *Length = MI->getLength(); Constant *ConstLength = dyn_cast<Constant>(Length); @@ -279,26 +335,46 @@ bool AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) { IRBuilder<> IRB(InsertBefore); Value *Cmp = IRB.CreateICmpNE(Length, - Constant::getNullValue(Length->getType())); - InsertBefore = splitBlockAndInsertIfThen(InsertBefore, Cmp); + Constant::getNullValue(Length->getType())); + InsertBefore = splitBlockAndInsertIfThen(Cmp); } - instrumentMemIntrinsicParam(MI, Dst, Length, InsertBefore, true); + instrumentMemIntrinsicParam(AFC, MI, Dst, Length, InsertBefore, true); if (Src) - instrumentMemIntrinsicParam(MI, Src, Length, InsertBefore, false); + instrumentMemIntrinsicParam(AFC, MI, Src, Length, InsertBefore, false); return true; } -static Value *getLDSTOperand(Instruction *I) { +// If I is an interesting memory access, return the PointerOperand +// and set IsWrite. Otherwise return NULL. +static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite) { if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + if (!ClInstrumentReads) return NULL; + *IsWrite = false; return LI->getPointerOperand(); } - return cast<StoreInst>(*I).getPointerOperand(); + if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + if (!ClInstrumentWrites) return NULL; + *IsWrite = true; + return SI->getPointerOperand(); + } + if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) { + if (!ClInstrumentAtomics) return NULL; + *IsWrite = true; + return RMW->getPointerOperand(); + } + if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) { + if (!ClInstrumentAtomics) return NULL; + *IsWrite = true; + return XCHG->getPointerOperand(); + } + return NULL; } -void AddressSanitizer::instrumentMop(Instruction *I) { - int IsWrite = isa<StoreInst>(*I); - Value *Addr = getLDSTOperand(I); +void AddressSanitizer::instrumentMop(AsanFunctionContext &AFC, Instruction *I) { + bool IsWrite; + Value *Addr = isInterestingMemoryAccess(I, &IsWrite); + assert(Addr); if (ClOpt && ClOptGlobals && isa<GlobalVariable>(Addr)) { // We are accessing a global scalar variable. Nothing to catch here. return; @@ -316,22 +392,57 @@ void AddressSanitizer::instrumentMop(Instruction *I) { } IRBuilder<> IRB(I); - instrumentAddress(I, IRB, Addr, TypeSize, IsWrite); + instrumentAddress(AFC, I, IRB, Addr, TypeSize, IsWrite); +} + +// Validate the result of Module::getOrInsertFunction called for an interface +// function of AddressSanitizer. If the instrumented module defines a function +// with the same name, their prototypes must match, otherwise +// getOrInsertFunction returns a bitcast. +Function *AddressSanitizer::checkInterfaceFunction(Constant *FuncOrBitcast) { + if (isa<Function>(FuncOrBitcast)) return cast<Function>(FuncOrBitcast); + FuncOrBitcast->dump(); + report_fatal_error("trying to redefine an AddressSanitizer " + "interface function"); } Instruction *AddressSanitizer::generateCrashCode( - IRBuilder<> &IRB, Value *Addr, bool IsWrite, uint32_t TypeSize) { - // IsWrite and TypeSize are encoded in the function name. - std::string FunctionName = std::string(kAsanReportErrorTemplate) + - (IsWrite ? 
"store" : "load") + itostr(TypeSize / 8); - Value *ReportWarningFunc = CurrentModule->getOrInsertFunction( - FunctionName, IRB.getVoidTy(), IntptrTy, NULL); - CallInst *Call = IRB.CreateCall(ReportWarningFunc, Addr); - Call->setDoesNotReturn(); + BasicBlock *BB, Value *Addr, Value *PC, + bool IsWrite, size_t AccessSizeIndex) { + IRBuilder<> IRB(BB->getFirstNonPHI()); + CallInst *Call; + if (PC) + Call = IRB.CreateCall2(AsanErrorCallback[IsWrite][AccessSizeIndex], + Addr, PC); + else + Call = IRB.CreateCall(AsanErrorCallback[IsWrite][AccessSizeIndex], Addr); + // We don't do Call->setDoesNotReturn() because the BB already has + // UnreachableInst at the end. + // This EmptyAsm is required to avoid callback merge. + IRB.CreateCall(EmptyAsm); return Call; } -void AddressSanitizer::instrumentAddress(Instruction *OrigIns, +Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, + Value *ShadowValue, + uint32_t TypeSize) { + size_t Granularity = 1 << MappingScale; + // Addr & (Granularity - 1) + Value *LastAccessedByte = IRB.CreateAnd( + AddrLong, ConstantInt::get(IntptrTy, Granularity - 1)); + // (Addr & (Granularity - 1)) + size - 1 + if (TypeSize / 8 > 1) + LastAccessedByte = IRB.CreateAdd( + LastAccessedByte, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)); + // (uint8_t) ((Addr & (Granularity-1)) + size - 1) + LastAccessedByte = IRB.CreateIntCast( + LastAccessedByte, IRB.getInt8Ty(), false); + // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue + return IRB.CreateICmpSGE(LastAccessedByte, ShadowValue); +} + +void AddressSanitizer::instrumentAddress(AsanFunctionContext &AFC, + Instruction *OrigIns, IRBuilder<> &IRB, Value *Addr, uint32_t TypeSize, bool IsWrite) { Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); @@ -346,31 +457,47 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal); - Instruction *CheckTerm = splitBlockAndInsertIfThen( - cast<Instruction>(Cmp)->getNextNode(), Cmp); - IRBuilder<> IRB2(CheckTerm); + BasicBlock *CrashBlock = 0; + if (ClMergeCallbacks) { + size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize); + BasicBlock **Cached = &AFC.CrashBlock[IsWrite][AccessSizeIndex]; + if (!*Cached) { + std::string BBName("crash_bb-"); + BBName += (IsWrite ? "w-" : "r-") + itostr(1 << AccessSizeIndex); + BasicBlock *BB = BasicBlock::Create(*C, BBName, &AFC.F); + new UnreachableInst(*C, BB); + *Cached = BB; + } + CrashBlock = *Cached; + // We need to pass the PC as the second parameter to __asan_report_*. + // There are few problems: + // - Some architectures (e.g. x86_32) don't have a cheap way to get the PC. + // - LLVM doesn't have the appropriate intrinsic. + // For now, put a random number into the PC, just to allow experiments. 
+ Value *PC = ConstantInt::get(IntptrTy, rand()); + CrashArg Arg = {AddrLong, PC}; + AFC.CrashArgs[IsWrite][AccessSizeIndex].push_back(Arg); + } else { + CrashBlock = BasicBlock::Create(*C, "crash_bb", &AFC.F); + new UnreachableInst(*C, CrashBlock); + size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize); + Instruction *Crash = + generateCrashCode(CrashBlock, AddrLong, 0, IsWrite, AccessSizeIndex); + Crash->setDebugLoc(OrigIns->getDebugLoc()); + } size_t Granularity = 1 << MappingScale; if (TypeSize < 8 * Granularity) { - // Addr & (Granularity - 1) - Value *Lower3Bits = IRB2.CreateAnd( - AddrLong, ConstantInt::get(IntptrTy, Granularity - 1)); - // (Addr & (Granularity - 1)) + size - 1 - Value *LastAccessedByte = IRB2.CreateAdd( - Lower3Bits, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)); - // (uint8_t) ((Addr & (Granularity-1)) + size - 1) - LastAccessedByte = IRB2.CreateIntCast( - LastAccessedByte, IRB.getInt8Ty(), false); - // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue - Value *Cmp2 = IRB2.CreateICmpSGE(LastAccessedByte, ShadowValue); - - CheckTerm = splitBlockAndInsertIfThen(CheckTerm, Cmp2); - } - - IRBuilder<> IRB1(CheckTerm); - Instruction *Crash = generateCrashCode(IRB1, AddrLong, IsWrite, TypeSize); - Crash->setDebugLoc(OrigIns->getDebugLoc()); - ReplaceInstWithInst(CheckTerm, new UnreachableInst(*C)); + BranchInst *CheckTerm = splitBlockAndInsertIfThen(Cmp); + assert(CheckTerm->isUnconditional()); + BasicBlock *NextBB = CheckTerm->getSuccessor(0); + IRB.SetInsertPoint(CheckTerm); + Value *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeSize); + BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2); + ReplaceInstWithInst(CheckTerm, NewTerm); + } else { + splitBlockAndInsertIfThen(Cmp, CrashBlock); + } } // This function replaces all global variables with new variables that have @@ -475,7 +602,7 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) { // Create a new global variable with enough space for a redzone. 
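insertGlobalRedzones, continued below, swaps each eligible global for a copy padded with a right redzone and has a generated module constructor hand a descriptor array to the runtime, with a matching destructor to unregister it. A rough single-global picture, with stand-in runtime calls and an illustrative redzone size, since the descriptor layout and the exact size math live outside this excerpt:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Stand-in for __asan_register_globals so the sketch runs without the runtime.
static void register_globals(uintptr_t Descriptors, uintptr_t N) {
  std::printf("registering %llu global(s)\n",
              static_cast<unsigned long long>(N));
}

constexpr std::size_t kRZ = 32;    // illustrative; the pass uses RedzoneSize
struct alignas(kRZ) PaddedGlobal { // NewGlobal gets setAlignment(RedzoneSize)
  int g;                           // the original global's data
  char rz[kRZ - sizeof(int)];      // right redzone, poisoned by the runtime
};
static PaddedGlobal g_padded;
static uintptr_t descriptors[] = { // shape only; the real descriptors differ
    reinterpret_cast<uintptr_t>(&g_padded)};

__attribute__((constructor)) static void module_ctor() { // the ASan module ctor
  register_globals(reinterpret_cast<uintptr_t>(descriptors), 1);
}

int main() { return 0; }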
GlobalVariable *NewGlobal = new GlobalVariable( M, NewTy, G->isConstant(), G->getLinkage(), - NewInitializer, "", G, G->isThreadLocal()); + NewInitializer, "", G, G->getThreadLocalMode()); NewGlobal->copyAttributesFrom(G); NewGlobal->setAlignment(RedzoneSize); @@ -503,7 +630,7 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) { M, ArrayOfGlobalStructTy, false, GlobalVariable::PrivateLinkage, ConstantArray::get(ArrayOfGlobalStructTy, Initializers), ""); - Function *AsanRegisterGlobals = cast<Function>(M.getOrInsertFunction( + Function *AsanRegisterGlobals = checkInterfaceFunction(M.getOrInsertFunction( kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); AsanRegisterGlobals->setLinkage(Function::ExternalLinkage); @@ -518,8 +645,10 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) { GlobalValue::InternalLinkage, kAsanModuleDtorName, &M); BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction); IRBuilder<> IRB_Dtor(ReturnInst::Create(*C, AsanDtorBB)); - Function *AsanUnregisterGlobals = cast<Function>(M.getOrInsertFunction( - kAsanUnregisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); + Function *AsanUnregisterGlobals = + checkInterfaceFunction(M.getOrInsertFunction( + kAsanUnregisterGlobalsName, + IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); AsanUnregisterGlobals->setLinkage(Function::ExternalLinkage); IRB_Dtor.CreateCall2(AsanUnregisterGlobals, @@ -539,7 +668,6 @@ bool AddressSanitizer::runOnModule(Module &M) { return false; BL.reset(new FunctionBlackList(ClBlackListFile)); - CurrentModule = &M; C = &(M.getContext()); LongSize = TD->getPointerSizeInBits(); IntptrTy = Type::getIntNTy(*C, LongSize); @@ -553,11 +681,33 @@ bool AddressSanitizer::runOnModule(Module &M) { // call __asan_init in the module ctor. IRBuilder<> IRB(CtorInsertBefore); - AsanInitFunction = cast<Function>( + AsanInitFunction = checkInterfaceFunction( M.getOrInsertFunction(kAsanInitName, IRB.getVoidTy(), NULL)); AsanInitFunction->setLinkage(Function::ExternalLinkage); IRB.CreateCall(AsanInitFunction); + // Create __asan_report* callbacks. + for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) { + for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes; + AccessSizeIndex++) { + // IsWrite and TypeSize are encoded in the function name. + std::string FunctionName = std::string(kAsanReportErrorTemplate) + + (AccessIsWrite ? "store" : "load") + itostr(1 << AccessSizeIndex); + // If we are merging crash callbacks, they have two parameters. + if (ClMergeCallbacks) + AsanErrorCallback[AccessIsWrite][AccessSizeIndex] = cast<Function>( + M.getOrInsertFunction(FunctionName, IRB.getVoidTy(), IntptrTy, + IntptrTy, NULL)); + else + AsanErrorCallback[AccessIsWrite][AccessSizeIndex] = cast<Function>( + M.getOrInsertFunction(FunctionName, IRB.getVoidTy(), IntptrTy, NULL)); + } + } + // We insert an empty inline asm after __asan_report* to avoid callback merge. + EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), + StringRef(""), StringRef(""), + /*hasSideEffects=*/true); + llvm::Triple targetTriple(M.getTargetTriple()); bool isAndroid = targetTriple.getEnvironment() == llvm::Triple::ANDROIDEABI; @@ -645,17 +795,17 @@ bool AddressSanitizer::handleFunction(Module &M, Function &F) { SmallSet<Value*, 16> TempsToInstrument; SmallVector<Instruction*, 16> ToInstrument; SmallVector<Instruction*, 8> NoReturnCalls; + bool IsWrite; // Fill the set of memory operations to instrument. 
for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { TempsToInstrument.clear(); + int NumInsnsPerBB = 0; for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ++BI) { if (LooksLikeCodeInBug11395(BI)) return false; - if ((isa<LoadInst>(BI) && ClInstrumentReads) || - (isa<StoreInst>(BI) && ClInstrumentWrites)) { - Value *Addr = getLDSTOperand(BI); + if (Value *Addr = isInterestingMemoryAccess(BI, &IsWrite)) { if (ClOpt && ClOptSameTemp) { if (!TempsToInstrument.insert(Addr)) continue; // We've seen this temp in the current BB. @@ -673,23 +823,55 @@ bool AddressSanitizer::handleFunction(Module &M, Function &F) { continue; } ToInstrument.push_back(BI); + NumInsnsPerBB++; + if (NumInsnsPerBB >= ClMaxInsnsToInstrumentPerBB) + break; } } + AsanFunctionContext AFC(F); + // Instrument. int NumInstrumented = 0; for (size_t i = 0, n = ToInstrument.size(); i != n; i++) { Instruction *Inst = ToInstrument[i]; if (ClDebugMin < 0 || ClDebugMax < 0 || (NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) { - if (isa<StoreInst>(Inst) || isa<LoadInst>(Inst)) - instrumentMop(Inst); + if (isInterestingMemoryAccess(Inst, &IsWrite)) + instrumentMop(AFC, Inst); else - instrumentMemIntrinsic(cast<MemIntrinsic>(Inst)); + instrumentMemIntrinsic(AFC, cast<MemIntrinsic>(Inst)); } NumInstrumented++; } + // Create PHI nodes and crash callbacks if we are merging crash callbacks. + if (NumInstrumented) { + for (size_t IsWrite = 0; IsWrite <= 1; IsWrite++) { + for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes; + AccessSizeIndex++) { + BasicBlock *BB = AFC.CrashBlock[IsWrite][AccessSizeIndex]; + if (!BB) continue; + assert(ClMergeCallbacks); + AsanFunctionContext::CrashArgsVec &Args = + AFC.CrashArgs[IsWrite][AccessSizeIndex]; + IRBuilder<> IRB(BB->getFirstNonPHI()); + size_t n = Args.size(); + PHINode *PN1 = IRB.CreatePHI(IntptrTy, n); + PHINode *PN2 = IRB.CreatePHI(IntptrTy, n); + // We need to match crash parameters and the predecessors. + for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); + PI != PE; ++PI) { + n--; + PN1->addIncoming(Args[n].Arg1, *PI); + PN2->addIncoming(Args[n].Arg2, *PI); + } + assert(n == 0); + generateCrashCode(BB, PN1, PN2, IsWrite, AccessSizeIndex); + } + } + } + DEBUG(dbgs() << F); bool ChangedStack = poisonStackInFunction(M, F); diff --git a/lib/Transforms/Instrumentation/BoundsChecking.cpp b/lib/Transforms/Instrumentation/BoundsChecking.cpp new file mode 100644 index 0000000..09e0f14 --- /dev/null +++ b/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -0,0 +1,209 @@ +//===- BoundsChecking.cpp - Instrumentation for run-time bounds checking --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass that instruments the code to perform run-time +// bounds checking on loads, stores, and other memory intrinsics. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "bounds-checking" +#include "llvm/IRBuilder.h" +#include "llvm/Intrinsics.h" +#include "llvm/Pass.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/Support/TargetFolder.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Instrumentation.h" +using namespace llvm; + +static cl::opt<bool> SingleTrapBB("bounds-checking-single-trap", + cl::desc("Use one trap block per function")); + +STATISTIC(ChecksAdded, "Bounds checks added"); +STATISTIC(ChecksSkipped, "Bounds checks skipped"); +STATISTIC(ChecksUnable, "Bounds checks unable to add"); + +typedef IRBuilder<true, TargetFolder> BuilderTy; + +namespace { + struct BoundsChecking : public FunctionPass { + static char ID; + + BoundsChecking(unsigned _Penalty = 5) : FunctionPass(ID), Penalty(_Penalty){ + initializeBoundsCheckingPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetData>(); + } + + private: + const TargetData *TD; + ObjectSizeOffsetEvaluator *ObjSizeEval; + BuilderTy *Builder; + Instruction *Inst; + BasicBlock *TrapBB; + unsigned Penalty; + + BasicBlock *getTrapBB(); + void emitBranchToTrap(Value *Cmp = 0); + bool computeAllocSize(Value *Ptr, APInt &Offset, Value* &OffsetValue, + APInt &Size, Value* &SizeValue); + bool instrument(Value *Ptr, Value *Val); + }; +} + +char BoundsChecking::ID = 0; +INITIALIZE_PASS(BoundsChecking, "bounds-checking", "Run-time bounds checking", + false, false) + + +/// getTrapBB - create a basic block that traps. All overflowing conditions +/// branch to this block. There's only one trap block per function. +BasicBlock *BoundsChecking::getTrapBB() { + if (TrapBB && SingleTrapBB) + return TrapBB; + + Function *Fn = Inst->getParent()->getParent(); + BasicBlock::iterator PrevInsertPoint = Builder->GetInsertPoint(); + TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn); + Builder->SetInsertPoint(TrapBB); + + llvm::Value *F = Intrinsic::getDeclaration(Fn->getParent(), Intrinsic::trap); + CallInst *TrapCall = Builder->CreateCall(F); + TrapCall->setDoesNotReturn(); + TrapCall->setDoesNotThrow(); + TrapCall->setDebugLoc(Inst->getDebugLoc()); + Builder->CreateUnreachable(); + + Builder->SetInsertPoint(PrevInsertPoint); + return TrapBB; +} + + +/// emitBranchToTrap - emit a branch instruction to a trap block. +/// If Cmp is non-null, perform a jump only if its value evaluates to true. +void BoundsChecking::emitBranchToTrap(Value *Cmp) { + // check if the comparison is always false + ConstantInt *C = dyn_cast_or_null<ConstantInt>(Cmp); + if (C) { + ++ChecksSkipped; + if (!C->getZExtValue()) + return; + else + Cmp = 0; // unconditional branch + } + + Instruction *Inst = Builder->GetInsertPoint(); + BasicBlock *OldBB = Inst->getParent(); + BasicBlock *Cont = OldBB->splitBasicBlock(Inst); + OldBB->getTerminator()->eraseFromParent(); + + if (Cmp) + BranchInst::Create(getTrapBB(), Cont, Cmp, OldBB); + else + BranchInst::Create(getTrapBB(), OldBB); +} + + +/// instrument - adds run-time bounds checks to memory accessing instructions. +/// Ptr is the pointer that will be read/written, and InstVal is either the +/// result from the load or the value being stored. 
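instrument(), whose doc comment continues just below, guards an access with the three unsigned comparisons its body spells out: Offset >= 0, Size >= Offset, and Size - Offset >= NeededSize, with the first check skipped when the size is a known non-negative constant. The same predicate in plain C++:

#include <cstdint>

// True if the guarded access must branch to the trap block. Size and Offset
// come from ObjectSizeOffsetEvaluator; NeededSize is the type's store size.
bool mustTrap(int64_t Size, int64_t Offset, uint64_t NeededSize,
              bool SizeKnownNonNegative) {
  uint64_t ObjSize = uint64_t(Size) - uint64_t(Offset); // Size - Offset
  bool Bad = uint64_t(Size) < uint64_t(Offset) ||       // Cmp2, unsigned
             ObjSize < NeededSize;                      // Cmp3, unsigned
  if (!SizeKnownNonNegative)
    Bad |= Offset < 0;                                  // Cmp1, signed
  return Bad;
}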
It is used to determine the +/// size of memory block that is touched. +/// Returns true if any change was made to the IR, false otherwise. +bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) { + uint64_t NeededSize = TD->getTypeStoreSize(InstVal->getType()); + DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize) + << " bytes\n"); + + SizeOffsetEvalType SizeOffset = ObjSizeEval->compute(Ptr); + + if (!ObjSizeEval->bothKnown(SizeOffset)) { + ++ChecksUnable; + return false; + } + + Value *Size = SizeOffset.first; + Value *Offset = SizeOffset.second; + ConstantInt *SizeCI = dyn_cast<ConstantInt>(Size); + + IntegerType *IntTy = TD->getIntPtrType(Inst->getContext()); + Value *NeededSizeVal = ConstantInt::get(IntTy, NeededSize); + + // three checks are required to ensure safety: + // . Offset >= 0 (since the offset is given from the base ptr) + // . Size >= Offset (unsigned) + // . Size - Offset >= NeededSize (unsigned) + // + // optimization: if Size >= 0 (signed), skip 1st check + // FIXME: add NSW/NUW here? -- we dont care if the subtraction overflows + Value *ObjSize = Builder->CreateSub(Size, Offset); + Value *Cmp2 = Builder->CreateICmpULT(Size, Offset); + Value *Cmp3 = Builder->CreateICmpULT(ObjSize, NeededSizeVal); + Value *Or = Builder->CreateOr(Cmp2, Cmp3); + if (!SizeCI || SizeCI->getValue().slt(0)) { + Value *Cmp1 = Builder->CreateICmpSLT(Offset, ConstantInt::get(IntTy, 0)); + Or = Builder->CreateOr(Cmp1, Or); + } + emitBranchToTrap(Or); + + ++ChecksAdded; + return true; +} + +bool BoundsChecking::runOnFunction(Function &F) { + TD = &getAnalysis<TargetData>(); + + TrapBB = 0; + BuilderTy TheBuilder(F.getContext(), TargetFolder(TD)); + Builder = &TheBuilder; + ObjectSizeOffsetEvaluator TheObjSizeEval(TD, F.getContext()); + ObjSizeEval = &TheObjSizeEval; + + // check HANDLE_MEMORY_INST in include/llvm/Instruction.def for memory + // touching instructions + std::vector<Instruction*> WorkList; + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction *I = &*i; + if (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<AtomicCmpXchgInst>(I) || + isa<AtomicRMWInst>(I)) + WorkList.push_back(I); + } + + bool MadeChange = false; + for (std::vector<Instruction*>::iterator i = WorkList.begin(), + e = WorkList.end(); i != e; ++i) { + Inst = *i; + + Builder->SetInsertPoint(Inst); + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + MadeChange |= instrument(LI->getPointerOperand(), LI); + } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + MadeChange |= instrument(SI->getPointerOperand(), SI->getValueOperand()); + } else if (AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(Inst)) { + MadeChange |= instrument(AI->getPointerOperand(),AI->getCompareOperand()); + } else if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) { + MadeChange |= instrument(AI->getPointerOperand(), AI->getValOperand()); + } else { + llvm_unreachable("unknown Instruction type"); + } + } + return MadeChange; +} + +FunctionPass *llvm::createBoundsCheckingPass(unsigned Penalty) { + return new BoundsChecking(Penalty); +} diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt index e4c8cf1..00de882 100644 --- a/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/lib/Transforms/Instrumentation/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_library(LLVMInstrumentation AddressSanitizer.cpp + BoundsChecking.cpp EdgeProfiling.cpp FunctionBlackList.cpp GCOVProfiling.cpp @@ -9,3 +10,5 @@ add_llvm_library(LLVMInstrumentation 
ProfilingUtils.cpp ThreadSanitizer.cpp ) + +add_dependencies(LLVMInstrumentation intrinsics_gen) diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 96e5d5b..264a6a6 100644 --- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -18,22 +18,23 @@ #include "ProfilingUtils.h" #include "llvm/Transforms/Instrumentation.h" -#include "llvm/Analysis/DebugInfo.h" +#include "llvm/DebugInfo.h" +#include "llvm/IRBuilder.h" +#include "llvm/Instructions.h" #include "llvm/Module.h" #include "llvm/Pass.h" -#include "llvm/Instructions.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/DebugLoc.h" -#include "llvm/Support/InstIterator.h" -#include "llvm/Support/IRBuilder.h" -#include "llvm/Support/PathV2.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/UniqueVector.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/DebugLoc.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/Support/PathV2.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" #include <string> #include <utility> using namespace llvm; @@ -57,7 +58,6 @@ namespace { virtual const char *getPassName() const { return "GCOV Profiler"; } - private: bool runOnModule(Module &M); @@ -90,6 +90,7 @@ namespace { // list. void insertCounterWriteout(SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> &); + void insertIndirectCounterIncrement(); std::string mangleName(DICompileUnit CU, std::string NewStem); @@ -421,6 +422,7 @@ bool GCOVProfiler::emitProfileArcs() { if (!CU_Nodes) return false; bool Result = false; + bool InsertIndCounterIncrCode = false; for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { DICompileUnit CU(CU_Nodes->getOperand(i)); DIArray SPs = CU.getSubprograms(); @@ -446,7 +448,7 @@ bool GCOVProfiler::emitProfileArcs() { new GlobalVariable(*M, CounterTy, false, GlobalValue::InternalLinkage, Constant::getNullValue(CounterTy), - "__llvm_gcov_ctr", 0, false, 0); + "__llvm_gcov_ctr"); CountersBySP.push_back(std::make_pair(Counters, (MDNode*)SP)); UniqueVector<BasicBlock *> ComplexEdgePreds; @@ -507,15 +509,21 @@ bool GCOVProfiler::emitProfileArcs() { Value *CounterPtrArray = Builder.CreateConstInBoundsGEP2_64(EdgeTable, 0, i * ComplexEdgePreds.size()); + + // Build code to increment the counter. 
+ InsertIndCounterIncrCode = true; Builder.CreateCall2(getIncrementIndirectCounterFunc(), EdgeState, CounterPtrArray); - // clear the predecessor number - Builder.CreateStore(ConstantInt::get(Int32Ty, 0xffffffff), EdgeState); } } } + insertCounterWriteout(CountersBySP); } + + if (InsertIndCounterIncrCode) + insertIndirectCounterIncrement(); + return Result; } @@ -574,13 +582,14 @@ Constant *GCOVProfiler::getStartFileFunc() { } Constant *GCOVProfiler::getIncrementIndirectCounterFunc() { + Type *Int32Ty = Type::getInt32Ty(*Ctx); + Type *Int64Ty = Type::getInt64Ty(*Ctx); Type *Args[] = { - Type::getInt32PtrTy(*Ctx), // uint32_t *predecessor - Type::getInt64PtrTy(*Ctx)->getPointerTo(), // uint64_t **state_table_row + Int32Ty->getPointerTo(), // uint32_t *predecessor + Int64Ty->getPointerTo()->getPointerTo() // uint64_t **counters }; - FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), - Args, false); - return M->getOrInsertFunction("llvm_gcda_increment_indirect_counter", FTy); + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false); + return M->getOrInsertFunction("__llvm_gcov_indirect_counter_increment", FTy); } Constant *GCOVProfiler::getEmitFunctionFunc() { @@ -588,8 +597,7 @@ Constant *GCOVProfiler::getEmitFunctionFunc() { Type::getInt32Ty(*Ctx), // uint32_t ident Type::getInt8PtrTy(*Ctx), // const char *function_name }; - FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), - Args, false); + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false); return M->getOrInsertFunction("llvm_gcda_emit_function", FTy); } @@ -665,5 +673,75 @@ void GCOVProfiler::insertCounterWriteout( } Builder.CreateRetVoid(); - InsertProfilingShutdownCall(WriteoutF, M); + // Create a small bit of code that registers the "__llvm_gcov_writeout" + // function to be executed at exit. + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + Function *F = Function::Create(FTy, GlobalValue::InternalLinkage, + "__llvm_gcov_init", M); + F->setUnnamedAddr(true); + F->setLinkage(GlobalValue::InternalLinkage); + F->addFnAttr(Attribute::NoInline); + + BB = BasicBlock::Create(*Ctx, "entry", F); + Builder.SetInsertPoint(BB); + + FTy = FunctionType::get(Type::getInt32Ty(*Ctx), + PointerType::get(FTy, 0), false); + Constant *AtExitFn = M->getOrInsertFunction("atexit", FTy); + Builder.CreateCall(AtExitFn, WriteoutF); + Builder.CreateRetVoid(); + + appendToGlobalCtors(*M, F, 0); +} + +void GCOVProfiler::insertIndirectCounterIncrement() { + Function *Fn = + cast<Function>(GCOVProfiler::getIncrementIndirectCounterFunc()); + Fn->setUnnamedAddr(true); + Fn->setLinkage(GlobalValue::InternalLinkage); + Fn->addFnAttr(Attribute::NoInline); + + Type *Int32Ty = Type::getInt32Ty(*Ctx); + Type *Int64Ty = Type::getInt64Ty(*Ctx); + Constant *NegOne = ConstantInt::get(Int32Ty, 0xffffffff); + + // Create basic blocks for function. 
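The IR built below for __llvm_gcov_indirect_counter_increment is already described piecewise by its C comments; gathered into one function, the logic is simply:

#include <cstdint>

// C rendition of the generated __llvm_gcov_indirect_counter_increment.
void indirectCounterIncrement(uint32_t *predecessor, uint64_t **counters) {
  uint32_t pred = *predecessor;
  if (pred == 0xffffffff)
    return;                    // no predecessor recorded for this edge
  uint64_t *counter = counters[pred];
  if (!counter)
    return;                    // this (pred, succ) pair has no counter cell
  ++*counter;
}

The same hunk also replaces InsertProfilingShutdownCall with a synthesized __llvm_gcov_init constructor that registers __llvm_gcov_writeout through atexit.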
+ BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", Fn); + IRBuilder<> Builder(BB); + + BasicBlock *PredNotNegOne = BasicBlock::Create(*Ctx, "", Fn); + BasicBlock *CounterEnd = BasicBlock::Create(*Ctx, "", Fn); + BasicBlock *Exit = BasicBlock::Create(*Ctx, "exit", Fn); + + // uint32_t pred = *predecessor; + // if (pred == 0xffffffff) return; + Argument *Arg = Fn->arg_begin(); + Arg->setName("predecessor"); + Value *Pred = Builder.CreateLoad(Arg, "pred"); + Value *Cond = Builder.CreateICmpEQ(Pred, NegOne); + BranchInst::Create(Exit, PredNotNegOne, Cond, BB); + + Builder.SetInsertPoint(PredNotNegOne); + + // uint64_t *counter = counters[pred]; + // if (!counter) return; + Value *ZExtPred = Builder.CreateZExt(Pred, Int64Ty); + Arg = llvm::next(Fn->arg_begin()); + Arg->setName("counters"); + Value *GEP = Builder.CreateGEP(Arg, ZExtPred); + Value *Counter = Builder.CreateLoad(GEP, "counter"); + Cond = Builder.CreateICmpEQ(Counter, + Constant::getNullValue(Int64Ty->getPointerTo())); + Builder.CreateCondBr(Cond, Exit, CounterEnd); + + // ++*counter; + Builder.SetInsertPoint(CounterEnd); + Value *Add = Builder.CreateAdd(Builder.CreateLoad(Counter), + ConstantInt::get(Int64Ty, 1)); + Builder.CreateStore(Add, Counter); + Builder.CreateBr(Exit); + + // Fill in the exit block. + Builder.SetInsertPoint(Exit); + Builder.CreateRetVoid(); } diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp index c7266e2..1e0b4a3 100644 --- a/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -20,11 +20,12 @@ using namespace llvm; /// initializeInstrumentation - Initialize all passes in the TransformUtils /// library. void llvm::initializeInstrumentation(PassRegistry &Registry) { + initializeAddressSanitizerPass(Registry); + initializeBoundsCheckingPass(Registry); initializeEdgeProfilerPass(Registry); + initializeGCOVProfilerPass(Registry); initializeOptimalEdgeProfilerPass(Registry); initializePathProfilerPass(Registry); - initializeGCOVProfilerPass(Registry); - initializeAddressSanitizerPass(Registry); initializeThreadSanitizerPass(Registry); } diff --git a/lib/Transforms/Instrumentation/PathProfiling.cpp b/lib/Transforms/Instrumentation/PathProfiling.cpp index b214796..cc27146 100644 --- a/lib/Transforms/Instrumentation/PathProfiling.cpp +++ b/lib/Transforms/Instrumentation/PathProfiling.cpp @@ -55,11 +55,11 @@ #include "llvm/LLVMContext.h" #include "llvm/Module.h" #include "llvm/Pass.h" +#include "llvm/TypeBuilder.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/TypeBuilder.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Instrumentation.h" diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 8bb337e..dc0fa71 100644 --- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -22,73 +22,73 @@ #define DEBUG_TYPE "tsan" #include "FunctionBlackList.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Intrinsics.h" #include "llvm/Function.h" +#include "llvm/IRBuilder.h" +#include "llvm/Intrinsics.h" #include "llvm/LLVMContext.h" #include "llvm/Metadata.h" #include "llvm/Module.h" +#include 
"llvm/Type.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetData.h" #include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ModuleUtils.h" -#include "llvm/Type.h" using namespace llvm; static cl::opt<std::string> ClBlackListFile("tsan-blacklist", cl::desc("Blacklist file"), cl::Hidden); -static cl::opt<bool> ClPrintStats("tsan-print-stats", - cl::desc("Print ThreadSanitizer instrumentation stats"), cl::Hidden); +STATISTIC(NumInstrumentedReads, "Number of instrumented reads"); +STATISTIC(NumInstrumentedWrites, "Number of instrumented writes"); +STATISTIC(NumOmittedReadsBeforeWrite, + "Number of reads ignored due to following writes"); +STATISTIC(NumAccessesWithBadSize, "Number of accesses with bad size"); +STATISTIC(NumInstrumentedVtableWrites, "Number of vtable ptr writes"); +STATISTIC(NumOmittedReadsFromConstantGlobals, + "Number of reads from constant globals"); +STATISTIC(NumOmittedReadsFromVtable, "Number of vtable reads"); namespace { -// Stats counters for ThreadSanitizer instrumentation. -struct ThreadSanitizerStats { - size_t NumInstrumentedReads; - size_t NumInstrumentedWrites; - size_t NumOmittedReadsBeforeWrite; - size_t NumAccessesWithBadSize; - size_t NumInstrumentedVtableWrites; - size_t NumOmittedReadsFromConstantGlobals; - size_t NumOmittedReadsFromVtable; -}; - /// ThreadSanitizer: instrument the code in module to find races. struct ThreadSanitizer : public FunctionPass { ThreadSanitizer(); + const char *getPassName() const; bool runOnFunction(Function &F); bool doInitialization(Module &M); - bool doFinalization(Module &M); - bool instrumentLoadOrStore(Instruction *I); static char ID; // Pass identification, replacement for typeid. private: - void choseInstructionsToInstrument(SmallVectorImpl<Instruction*> &Local, - SmallVectorImpl<Instruction*> &All); + bool instrumentLoadOrStore(Instruction *I); + bool instrumentAtomic(Instruction *I); + void chooseInstructionsToInstrument(SmallVectorImpl<Instruction*> &Local, + SmallVectorImpl<Instruction*> &All); bool addrPointsToConstantData(Value *Addr); + int getMemoryAccessFuncIndex(Value *Addr); TargetData *TD; OwningPtr<FunctionBlackList> BL; + IntegerType *OrdTy; // Callbacks to run-time library are computed in doInitialization. - Value *TsanFuncEntry; - Value *TsanFuncExit; + Function *TsanFuncEntry; + Function *TsanFuncExit; // Accesses sizes are powers of two: 1, 2, 4, 8, 16. static const size_t kNumberOfAccessSizes = 5; - Value *TsanRead[kNumberOfAccessSizes]; - Value *TsanWrite[kNumberOfAccessSizes]; - Value *TsanVptrUpdate; - - // Stats are modified w/o synchronization. 
- ThreadSanitizerStats stats; + Function *TsanRead[kNumberOfAccessSizes]; + Function *TsanWrite[kNumberOfAccessSizes]; + Function *TsanAtomicLoad[kNumberOfAccessSizes]; + Function *TsanAtomicStore[kNumberOfAccessSizes]; + Function *TsanVptrUpdate; }; } // namespace @@ -97,6 +97,10 @@ INITIALIZE_PASS(ThreadSanitizer, "tsan", "ThreadSanitizer: detects data races.", false, false) +const char *ThreadSanitizer::getPassName() const { + return "ThreadSanitizer"; +} + ThreadSanitizer::ThreadSanitizer() : FunctionPass(ID), TD(NULL) { @@ -106,12 +110,18 @@ FunctionPass *llvm::createThreadSanitizerPass() { return new ThreadSanitizer(); } +static Function *checkInterfaceFunction(Constant *FuncOrBitcast) { + if (Function *F = dyn_cast<Function>(FuncOrBitcast)) + return F; + FuncOrBitcast->dump(); + report_fatal_error("ThreadSanitizer interface function redefined"); +} + bool ThreadSanitizer::doInitialization(Module &M) { TD = getAnalysisIfAvailable<TargetData>(); if (!TD) return false; BL.reset(new FunctionBlackList(ClBlackListFile)); - memset(&stats, 0, sizeof(stats)); // Always insert a call to __tsan_init into the module's CTORs. IRBuilder<> IRB(M.getContext()); @@ -120,38 +130,38 @@ bool ThreadSanitizer::doInitialization(Module &M) { appendToGlobalCtors(M, cast<Function>(TsanInit), 0); // Initialize the callbacks. - TsanFuncEntry = M.getOrInsertFunction("__tsan_func_entry", IRB.getVoidTy(), - IRB.getInt8PtrTy(), NULL); - TsanFuncExit = M.getOrInsertFunction("__tsan_func_exit", IRB.getVoidTy(), - NULL); + TsanFuncEntry = checkInterfaceFunction(M.getOrInsertFunction( + "__tsan_func_entry", IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL)); + TsanFuncExit = checkInterfaceFunction(M.getOrInsertFunction( + "__tsan_func_exit", IRB.getVoidTy(), NULL)); + OrdTy = IRB.getInt32Ty(); for (size_t i = 0; i < kNumberOfAccessSizes; ++i) { - SmallString<32> ReadName("__tsan_read"); - ReadName += itostr(1 << i); - TsanRead[i] = M.getOrInsertFunction(ReadName, IRB.getVoidTy(), - IRB.getInt8PtrTy(), NULL); - SmallString<32> WriteName("__tsan_write"); - WriteName += itostr(1 << i); - TsanWrite[i] = M.getOrInsertFunction(WriteName, IRB.getVoidTy(), - IRB.getInt8PtrTy(), NULL); - } - TsanVptrUpdate = M.getOrInsertFunction("__tsan_vptr_update", IRB.getVoidTy(), - IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), - NULL); - return true; -} + const size_t ByteSize = 1 << i; + const size_t BitSize = ByteSize * 8; + SmallString<32> ReadName("__tsan_read" + itostr(ByteSize)); + TsanRead[i] = checkInterfaceFunction(M.getOrInsertFunction( + ReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL)); -bool ThreadSanitizer::doFinalization(Module &M) { - if (ClPrintStats) { - errs() << "ThreadSanitizerStats " << M.getModuleIdentifier() - << ": wr " << stats.NumInstrumentedWrites - << "; rd " << stats.NumInstrumentedReads - << "; vt " << stats.NumInstrumentedVtableWrites - << "; bs " << stats.NumAccessesWithBadSize - << "; rbw " << stats.NumOmittedReadsBeforeWrite - << "; rcg " << stats.NumOmittedReadsFromConstantGlobals - << "; rvt " << stats.NumOmittedReadsFromVtable - << "\n"; + SmallString<32> WriteName("__tsan_write" + itostr(ByteSize)); + TsanWrite[i] = checkInterfaceFunction(M.getOrInsertFunction( + WriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL)); + + Type *Ty = Type::getIntNTy(M.getContext(), BitSize); + Type *PtrTy = Ty->getPointerTo(); + SmallString<32> AtomicLoadName("__tsan_atomic" + itostr(BitSize) + + "_load"); + TsanAtomicLoad[i] = checkInterfaceFunction(M.getOrInsertFunction( + AtomicLoadName, Ty, PtrTy, OrdTy, NULL)); + + 
SmallString<32> AtomicStoreName("__tsan_atomic" + itostr(BitSize) + + "_store"); + TsanAtomicStore[i] = checkInterfaceFunction(M.getOrInsertFunction( + AtomicStoreName, IRB.getVoidTy(), PtrTy, Ty, OrdTy, + NULL)); } + TsanVptrUpdate = checkInterfaceFunction(M.getOrInsertFunction( + "__tsan_vptr_update", IRB.getVoidTy(), IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), NULL)); return true; } @@ -173,13 +183,13 @@ bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) { if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) { if (GV->isConstant()) { // Reads from constant globals can not race with any writes. - stats.NumOmittedReadsFromConstantGlobals++; + NumOmittedReadsFromConstantGlobals++; return true; } } else if(LoadInst *L = dyn_cast<LoadInst>(Addr)) { if (isVtableAccess(L)) { // Reads from a vtable pointer can not race with any writes. - stats.NumOmittedReadsFromVtable++; + NumOmittedReadsFromVtable++; return true; } } @@ -197,7 +207,7 @@ bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) { // // 'Local' is a vector of insns within the same BB (no calls between). // 'All' is a vector of insns that will be instrumented. -void ThreadSanitizer::choseInstructionsToInstrument( +void ThreadSanitizer::chooseInstructionsToInstrument( SmallVectorImpl<Instruction*> &Local, SmallVectorImpl<Instruction*> &All) { SmallSet<Value*, 8> WriteTargets; @@ -212,7 +222,7 @@ void ThreadSanitizer::choseInstructionsToInstrument( Value *Addr = Load->getPointerOperand(); if (WriteTargets.count(Addr)) { // We will write to this temp, so no reason to analyze the read. - stats.NumOmittedReadsBeforeWrite++; + NumOmittedReadsBeforeWrite++; continue; } if (addrPointsToConstantData(Addr)) { @@ -225,12 +235,27 @@ void ThreadSanitizer::choseInstructionsToInstrument( Local.clear(); } +static bool isAtomic(Instruction *I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return LI->isAtomic() && LI->getSynchScope() == CrossThread; + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return SI->isAtomic() && SI->getSynchScope() == CrossThread; + if (isa<AtomicRMWInst>(I)) + return true; + if (isa<AtomicCmpXchgInst>(I)) + return true; + if (FenceInst *FI = dyn_cast<FenceInst>(I)) + return FI->getSynchScope() == CrossThread; + return false; +} + bool ThreadSanitizer::runOnFunction(Function &F) { if (!TD) return false; if (BL->isIn(F)) return false; SmallVector<Instruction*, 8> RetVec; SmallVector<Instruction*, 8> AllLoadsAndStores; SmallVector<Instruction*, 8> LocalLoadsAndStores; + SmallVector<Instruction*, 8> AtomicAccesses; bool Res = false; bool HasCalls = false; @@ -240,16 +265,18 @@ bool ThreadSanitizer::runOnFunction(Function &F) { BasicBlock &BB = *FI; for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE; ++BI) { - if (isa<LoadInst>(BI) || isa<StoreInst>(BI)) + if (isAtomic(BI)) + AtomicAccesses.push_back(BI); + else if (isa<LoadInst>(BI) || isa<StoreInst>(BI)) LocalLoadsAndStores.push_back(BI); else if (isa<ReturnInst>(BI)) RetVec.push_back(BI); else if (isa<CallInst>(BI) || isa<InvokeInst>(BI)) { HasCalls = true; - choseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); + chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); } } - choseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); + chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); } // We have collected all loads and stores. 
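chooseInstructionsToInstrument (renamed above from choseInstructionsToInstrument) walks each call-free run of a block backwards, keeping every write but dropping a read whose address is written later in the same run, since any race on that location will already be caught at the write; reads from constant globals and vtable loads are dropped as well. The core of that filter, with simplified types:

#include <set>
#include <vector>

struct Access { const void *Addr; bool IsWrite; };

// Mirrors the per-run filtering: scan in reverse, remembering write targets.
std::vector<Access> chooseToInstrument(const std::vector<Access> &Local) {
  std::set<const void *> WriteTargets;
  std::vector<Access> All;
  for (auto It = Local.rbegin(); It != Local.rend(); ++It) {
    if (It->IsWrite)
      WriteTargets.insert(It->Addr);
    else if (WriteTargets.count(It->Addr))
      continue; // counted as NumOmittedReadsBeforeWrite
    All.push_back(*It);
  }
  return All;
}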
@@ -261,6 +288,11 @@ bool ThreadSanitizer::runOnFunction(Function &F) { Res |= instrumentLoadOrStore(AllLoadsAndStores[i]); } + // Instrument atomic memory accesses. + for (size_t i = 0, n = AtomicAccesses.size(); i < n; ++i) { + Res |= instrumentAtomic(AtomicAccesses[i]); + } + // Instrument function entry/exit points if there were instrumented accesses. if (Res || HasCalls) { IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI()); @@ -283,29 +315,98 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I) { Value *Addr = IsWrite ? cast<StoreInst>(I)->getPointerOperand() : cast<LoadInst>(I)->getPointerOperand(); - Type *OrigPtrTy = Addr->getType(); - Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType(); - assert(OrigTy->isSized()); - uint32_t TypeSize = TD->getTypeStoreSizeInBits(OrigTy); - if (TypeSize != 8 && TypeSize != 16 && - TypeSize != 32 && TypeSize != 64 && TypeSize != 128) { - stats.NumAccessesWithBadSize++; - // Ignore all unusual sizes. + int Idx = getMemoryAccessFuncIndex(Addr); + if (Idx < 0) return false; - } if (IsWrite && isVtableAccess(I)) { + DEBUG(dbgs() << " VPTR : " << *I << "\n"); Value *StoredValue = cast<StoreInst>(I)->getValueOperand(); + // StoredValue does not necessary have a pointer type. + if (isa<IntegerType>(StoredValue->getType())) + StoredValue = IRB.CreateIntToPtr(StoredValue, IRB.getInt8PtrTy()); + // Call TsanVptrUpdate. IRB.CreateCall2(TsanVptrUpdate, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), IRB.CreatePointerCast(StoredValue, IRB.getInt8PtrTy())); - stats.NumInstrumentedVtableWrites++; + NumInstrumentedVtableWrites++; return true; } - size_t Idx = CountTrailingZeros_32(TypeSize / 8); - assert(Idx < kNumberOfAccessSizes); Value *OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx]; IRB.CreateCall(OnAccessFunc, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy())); - if (IsWrite) stats.NumInstrumentedWrites++; - else stats.NumInstrumentedReads++; + if (IsWrite) NumInstrumentedWrites++; + else NumInstrumentedReads++; + return true; +} + +static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) { + uint32_t v = 0; + switch (ord) { + case NotAtomic: assert(false); + case Unordered: // Fall-through. + case Monotonic: v = 1 << 0; break; + // case Consume: v = 1 << 1; break; // Not specified yet. 
+ case Acquire: v = 1 << 2; break; + case Release: v = 1 << 3; break; + case AcquireRelease: v = 1 << 4; break; + case SequentiallyConsistent: v = 1 << 5; break; + } + return IRB->getInt32(v); +} + +bool ThreadSanitizer::instrumentAtomic(Instruction *I) { + IRBuilder<> IRB(I); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + Value *Addr = LI->getPointerOperand(); + int Idx = getMemoryAccessFuncIndex(Addr); + if (Idx < 0) + return false; + const size_t ByteSize = 1 << Idx; + const size_t BitSize = ByteSize * 8; + Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); + Type *PtrTy = Ty->getPointerTo(); + Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), + createOrdering(&IRB, LI->getOrdering())}; + CallInst *C = CallInst::Create(TsanAtomicLoad[Idx], + ArrayRef<Value*>(Args)); + ReplaceInstWithInst(I, C); + + } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + Value *Addr = SI->getPointerOperand(); + int Idx = getMemoryAccessFuncIndex(Addr); + if (Idx < 0) + return false; + const size_t ByteSize = 1 << Idx; + const size_t BitSize = ByteSize * 8; + Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); + Type *PtrTy = Ty->getPointerTo(); + Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), + IRB.CreateIntCast(SI->getValueOperand(), Ty, false), + createOrdering(&IRB, SI->getOrdering())}; + CallInst *C = CallInst::Create(TsanAtomicStore[Idx], + ArrayRef<Value*>(Args)); + ReplaceInstWithInst(I, C); + } else if (isa<AtomicRMWInst>(I)) { + // FIXME: Not yet supported. + } else if (isa<AtomicCmpXchgInst>(I)) { + // FIXME: Not yet supported. + } else if (isa<FenceInst>(I)) { + // FIXME: Not yet supported. + } return true; } + +int ThreadSanitizer::getMemoryAccessFuncIndex(Value *Addr) { + Type *OrigPtrTy = Addr->getType(); + Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType(); + assert(OrigTy->isSized()); + uint32_t TypeSize = TD->getTypeStoreSizeInBits(OrigTy); + if (TypeSize != 8 && TypeSize != 16 && + TypeSize != 32 && TypeSize != 64 && TypeSize != 128) { + NumAccessesWithBadSize++; + // Ignore all unusual sizes. + return -1; + } + size_t Idx = CountTrailingZeros_32(TypeSize / 8); + assert(Idx < kNumberOfAccessSizes); + return Idx; +} diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp index ba214d1..b344952 100644 --- a/lib/Transforms/Scalar/ADCE.cpp +++ b/lib/Transforms/Scalar/ADCE.cpp @@ -9,7 +9,7 @@ // // This file implements the Aggressive Dead Code Elimination pass. This pass // optimistically assumes that all instructions are dead until proven otherwise, -// allowing it to eliminate dead computations that other DCE passes do not +// allowing it to eliminate dead computations that other DCE passes do not // catch, particularly involving loop computations. // //===----------------------------------------------------------------------===// @@ -36,13 +36,13 @@ namespace { ADCE() : FunctionPass(ID) { initializeADCEPass(*PassRegistry::getPassRegistry()); } - + virtual bool runOnFunction(Function& F); - + virtual void getAnalysisUsage(AnalysisUsage& AU) const { AU.setPreservesCFG(); } - + }; } @@ -52,7 +52,7 @@ INITIALIZE_PASS(ADCE, "adce", "Aggressive Dead Code Elimination", false, false) bool ADCE::runOnFunction(Function& F) { SmallPtrSet<Instruction*, 128> alive; SmallVector<Instruction*, 128> worklist; - + // Collect the set of "root" instructions that are known live. 
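The ADCE hunk beginning above is whitespace-only, but its comments state the algorithm compactly: seed a worklist with the root instructions collected in the loop below (terminators and instructions with side effects), propagate liveness backwards through operands, and delete everything never reached. The mark phase over a generic def-use graph, as a sketch:

#include <unordered_set>
#include <vector>

struct Node {
  bool IsRoot = false;          // terminator, side effects, etc.
  std::vector<Node *> Operands; // values this node uses
};

// Optimistic DCE: every node is presumed dead until reached from a root.
std::unordered_set<Node *> markLive(const std::vector<Node *> &Nodes) {
  std::unordered_set<Node *> Alive;
  std::vector<Node *> Worklist;
  for (Node *N : Nodes)
    if (N->IsRoot && Alive.insert(N).second)
      Worklist.push_back(N);
  while (!Worklist.empty()) {
    Node *Curr = Worklist.back();
    Worklist.pop_back();
    for (Node *Op : Curr->Operands)
      if (Alive.insert(Op).second) // operand newly proven live
        Worklist.push_back(Op);
  }
  return Alive; // the complement of Alive is safe to delete
}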
for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) if (isa<TerminatorInst>(I.getInstructionIterator()) || @@ -62,7 +62,7 @@ bool ADCE::runOnFunction(Function& F) { alive.insert(I.getInstructionIterator()); worklist.push_back(I.getInstructionIterator()); } - + // Propagate liveness backwards to operands. while (!worklist.empty()) { Instruction* curr = worklist.pop_back_val(); @@ -72,7 +72,7 @@ bool ADCE::runOnFunction(Function& F) { if (alive.insert(Inst)) worklist.push_back(Inst); } - + // The inverse of the live set is the dead set. These are those instructions // which have no side effects and do not influence the control flow or return // value of the function, and may therefore be deleted safely. @@ -82,7 +82,7 @@ bool ADCE::runOnFunction(Function& F) { worklist.push_back(I.getInstructionIterator()); I->dropAllReferences(); } - + for (SmallVector<Instruction*, 1024>::iterator I = worklist.begin(), E = worklist.end(); I != E; ++I) { ++NumRemoved; diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index d660c72..a01e066 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -32,3 +32,5 @@ add_llvm_library(LLVMScalarOpts Sink.cpp TailRecursionElimination.cpp ) + +add_dependencies(LLVMScalarOpts intrinsics_gen) diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index 9a5423f..277c4d5 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -18,32 +18,32 @@ #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" +#include "llvm/IRBuilder.h" #include "llvm/InlineAsm.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/Pass.h" -#include "llvm/Analysis/Dominators.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/ProfileInfo.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Transforms/Utils/AddrModeMatcher.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/ProfileInfo.h" #include "llvm/Assembly/Writer.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/PatternMatch.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/ValueHandle.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Transforms/Utils/AddrModeMatcher.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/BuildLibCalls.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace llvm::PatternMatch; @@ -60,6 +60,7 @@ STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads"); STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized"); STATISTIC(NumRetsDup, "Number of return instructions duplicated"); STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); 
+STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); static cl::opt<bool> DisableBranchOpts( "disable-cgp-branch-opts", cl::Hidden, cl::init(false), @@ -70,6 +71,10 @@ static cl::opt<bool> DisableDeleteDeadBlocks( "disable-cgp-delete-dead-blocks", cl::Hidden, cl::init(false), cl::desc("Disable deleting dead blocks in CodeGenPrepare")); +static cl::opt<bool> DisableSelectToBranch( + "disable-cgp-select2branch", cl::Hidden, cl::init(false), + cl::desc("Disable select to branch conversion.")); + namespace { class CodeGenPrepare : public FunctionPass { /// TLI - Keep a pointer of a TargetLowering to consult for determining @@ -78,7 +83,7 @@ namespace { const TargetLibraryInfo *TLInfo; DominatorTree *DT; ProfileInfo *PFI; - + /// CurInstIterator - As we scan instructions optimizing them, this is the /// next instruction to optimize. Xforms that can invalidate this should /// update it. @@ -93,6 +98,9 @@ namespace { /// be updated. bool ModifiedDT; + /// OptSize - True if optimizing for size. + bool OptSize; + public: static char ID; // Pass identification, replacement for typeid explicit CodeGenPrepare(const TargetLowering *tli = 0) @@ -118,6 +126,7 @@ namespace { bool OptimizeCallInst(CallInst *CI); bool MoveExtToFormExtLoad(Instruction *I); bool OptimizeExtUses(Instruction *I); + bool OptimizeSelectInst(SelectInst *SI); bool DupRetToEnableTailCallOpts(ReturnInst *RI); bool PlaceDbgValues(Function &F); }; @@ -141,13 +150,14 @@ bool CodeGenPrepare::runOnFunction(Function &F) { TLInfo = &getAnalysis<TargetLibraryInfo>(); DT = getAnalysisIfAvailable<DominatorTree>(); PFI = getAnalysisIfAvailable<ProfileInfo>(); + OptSize = F.hasFnAttr(Attribute::OptimizeForSize); // First pass, eliminate blocks that contain only PHI nodes and an // unconditional branch. EverMadeChange |= EliminateMostlyEmptyBlocks(F); // llvm.dbg.value is far away from the value then iSel may not be able - // handle it properly. iSel will drop llvm.dbg.value if it can not + // handle it properly. iSel will drop llvm.dbg.value if it can not // find a node corresponding to the value. EverMadeChange |= PlaceDbgValues(F); @@ -326,7 +336,7 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) { if (isEntry && BB != &BB->getParent()->getEntryBlock()) BB->moveBefore(&BB->getParent()->getEntryBlock()); - + DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n"); return; } @@ -537,7 +547,7 @@ protected: bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { BasicBlock *BB = CI->getParent(); - + // Lower inline assembly if we can. // If we found an inline asm expession, and if the target knows how to // lower it to normal LLVM code, do so now. @@ -554,19 +564,19 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { if (OptimizeInlineAsmInst(CI)) return true; } - + // Lower all uses of llvm.objectsize.* IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); if (II && II->getIntrinsicID() == Intrinsic::objectsize) { bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1); Type *ReturnTy = CI->getType(); - Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL); - + Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL); + // Substituting this can cause recursive simplifications, which can // invalidate our iterator. Use a WeakVH to hold onto it in case this // happens. WeakVH IterHandle(CurInstIterator); - + replaceAndRecursivelySimplify(CI, RetVal, TLI ? TLI->getTargetData() : 0, TLInfo, ModifiedDT ? 
0 : DT); @@ -594,7 +604,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { // We'll need TargetData from here on out. const TargetData *TD = TLI ? TLI->getTargetData() : 0; if (!TD) return false; - + // Lower all default uses of _chk calls. This is very similar // to what InstCombineCalls does, but here we are only lowering calls // that have the default "don't know" as the objectsize. Anything else @@ -750,13 +760,13 @@ static bool IsNonLocalValue(Value *V, BasicBlock *BB) { bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *AccessTy) { Value *Repl = Addr; - - // Try to collapse single-value PHI nodes. This is necessary to undo + + // Try to collapse single-value PHI nodes. This is necessary to undo // unprofitable PRE transformations. SmallVector<Value*, 8> worklist; SmallPtrSet<Value*, 16> Visited; worklist.push_back(Addr); - + // Use a worklist to iteratively look through PHI nodes, and ensure that // the addressing mode obtained from the non-PHI roots of the graph // are equivalent. @@ -768,20 +778,20 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, while (!worklist.empty()) { Value *V = worklist.back(); worklist.pop_back(); - + // Break use-def graph loops. if (!Visited.insert(V)) { Consensus = 0; break; } - + // For a PHI node, push all of its incoming values. if (PHINode *P = dyn_cast<PHINode>(V)) { for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) worklist.push_back(P->getIncomingValue(i)); continue; } - + // For non-PHIs, determine the addressing mode being computed. SmallVector<Instruction*, 16> NewAddrModeInsts; ExtAddrMode NewAddrMode = @@ -816,15 +826,15 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, } continue; } - + Consensus = 0; break; } - + // If the addressing mode couldn't be determined, or if multiple different // ones were determined, bail out now. if (!Consensus) return false; - + // Check to see if any of the instructions supersumed by this addr mode are // non-local to I's BB. bool AnyNonLocal = false; @@ -933,7 +943,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // Use a WeakVH to hold onto it in case this happens. WeakVH IterHandle(CurInstIterator); BasicBlock *BB = CurInstIterator->getParent(); - + RecursivelyDeleteTriviallyDeadInstructions(Repl); if (IterHandle != CurInstIterator) { @@ -945,7 +955,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // This address is now available for reassignment, so erase the table // entry; we don't want to match some completely different instruction. SunkAddrs[Addr] = 0; - } + } } ++NumMemoryInsts; return true; @@ -957,12 +967,12 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) { bool MadeChange = false; - TargetLowering::AsmOperandInfoVector + TargetLowering::AsmOperandInfoVector TargetConstraints = TLI->ParseConstraints(CS); unsigned ArgNo = 0; for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; - + // Compute the constraint code and ConstraintType to use. TLI->ComputeConstraintToUse(OpInfo, SDValue()); @@ -1091,6 +1101,79 @@ bool CodeGenPrepare::OptimizeExtUses(Instruction *I) { return MadeChange; } +/// isFormingBranchFromSelectProfitable - Returns true if a SelectInst should be +/// turned into an explicit branch. 
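OptimizeSelectInst, defined next, performs that rewrite when the target reports selects as expensive and the heuristic above fires (a one-use compare against a one-use loaded value, so a cmov would stall on the load). The source-level effect, illustratively:

// Before: codegen would emit a cmov for  v = cond ? a : b;
// After: StartBlock ends in a real branch; the true value reaches
// select.end directly, the false value flows through select.mid, and a
// PHI node merges the two where the select used to be.
int selectAsBranch(bool cond, int a, int b) {
  if (cond)
    return a; // StartBlock -> select.end with the true value
  return b;   // StartBlock -> select.mid -> select.end with the false value
}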
+static bool isFormingBranchFromSelectProfitable(SelectInst *SI) {
+  // FIXME: This should use the same heuristics as IfConversion to determine
+  // whether a select is better represented as a branch. This requires that
+  // branch probability metadata is preserved for the select, which is not the
+  // case currently.
+
+  CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());
+
+  // If the branch is predicted right, an out of order CPU can avoid blocking on
+  // the compare. Emit cmovs on compares with a memory operand as branches to
+  // avoid stalls on the load from memory. If the compare has more than one use
+  // there's probably another cmov or setcc around so it's not worth emitting a
+  // branch.
+  if (!Cmp)
+    return false;
+
+  Value *CmpOp0 = Cmp->getOperand(0);
+  Value *CmpOp1 = Cmp->getOperand(1);
+
+  // We check that the memory operand has one use to avoid uses of the loaded
+  // value directly after the compare, making branches unprofitable.
+  return Cmp->hasOneUse() &&
+         ((isa<LoadInst>(CmpOp0) && CmpOp0->hasOneUse()) ||
+          (isa<LoadInst>(CmpOp1) && CmpOp1->hasOneUse()));
+}
+
+
+bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) {
+  // If we have a SelectInst that will likely profit from branch prediction,
+  // turn it into a branch.
+  if (DisableSelectToBranch || OptSize || !TLI ||
+      !TLI->isPredictableSelectExpensive())
+    return false;
+
+  if (!SI->getCondition()->getType()->isIntegerTy(1) ||
+      !isFormingBranchFromSelectProfitable(SI))
+    return false;
+
+  ModifiedDT = true;
+
+  // First, we split the block containing the select into 2 blocks.
+  BasicBlock *StartBlock = SI->getParent();
+  BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(SI));
+  BasicBlock *NextBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
+
+  // Create a new block serving as the landing pad for the branch.
+  BasicBlock *SmallBlock = BasicBlock::Create(SI->getContext(), "select.mid",
+                                              NextBlock->getParent(), NextBlock);
+
+  // Move the unconditional branch from the block with the select in it into our
+  // landing pad block.
+  StartBlock->getTerminator()->eraseFromParent();
+  BranchInst::Create(NextBlock, SmallBlock);
+
+  // Insert the real conditional branch based on the original condition.
+  BranchInst::Create(NextBlock, SmallBlock, SI->getCondition(), SI);
+
+  // The select itself is replaced with a PHI Node.
+  PHINode *PN = PHINode::Create(SI->getType(), 2, "", NextBlock->begin());
+  PN->takeName(SI);
+  PN->addIncoming(SI->getTrueValue(), StartBlock);
+  PN->addIncoming(SI->getFalseValue(), SmallBlock);
+  SI->replaceAllUsesWith(PN);
+  SI->eraseFromParent();
+
+  // Instruct OptimizeBlock to skip to the next block.
+  CurInstIterator = StartBlock->end();
+  ++NumSelectsExpanded;
+  return true;
+}
+
 bool CodeGenPrepare::OptimizeInst(Instruction *I) {
   if (PHINode *P = dyn_cast<PHINode>(I)) {
     // It is possible for very late stage optimizations (such as SimplifyCFG)
@@ -1104,7 +1187,7 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
     }
     return false;
   }
-
+
   if (CastInst *CI = dyn_cast<CastInst>(I)) {
     // If the source of the cast is a constant, then this should have
     // already been constant folded.
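
// [Illustration] A rough source-level analogue of what OptimizeSelectInst
// produces (the pass of course rewrites IR blocks, not C++; block names in
// the comments correspond to the names used above):

// Before: a select, typically lowered to a conditional move.
int selectForm(bool Cond, int TrueVal, int FalseVal) {
  return Cond ? TrueVal : FalseVal;
}

// After: an explicit branch. The else-path corresponds to the new
// "select.mid" block, and the return point to "select.end", where the
// PHI node merges the two incoming values.
int branchForm(bool Cond, int TrueVal, int FalseVal) {
  int Result;
  if (Cond)                // conditional branch on the original condition
    Result = TrueVal;      // value flowing in from the start block
  else
    Result = FalseVal;     // value flowing in from "select.mid"
  return Result;           // "select.end": the PHI node
}
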
The only reason NOT to constant fold @@ -1124,23 +1207,23 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) { } return false; } - + if (CmpInst *CI = dyn_cast<CmpInst>(I)) return OptimizeCmpExpression(CI); - + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { if (TLI) return OptimizeMemoryInst(I, I->getOperand(0), LI->getType()); return false; } - + if (StoreInst *SI = dyn_cast<StoreInst>(I)) { if (TLI) return OptimizeMemoryInst(I, SI->getOperand(1), SI->getOperand(0)->getType()); return false; } - + if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) { if (GEPI->hasAllZeroIndices()) { /// The GEP operand must be a pointer, so must its result -> BitCast @@ -1154,13 +1237,16 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) { } return false; } - + if (CallInst *CI = dyn_cast<CallInst>(I)) return OptimizeCallInst(CI); if (ReturnInst *RI = dyn_cast<ReturnInst>(I)) return DupRetToEnableTailCallOpts(RI); + if (SelectInst *SI = dyn_cast<SelectInst>(I)) + return OptimizeSelectInst(SI); + return false; } @@ -1179,7 +1265,7 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) { } // llvm.dbg.value is far away from the value then iSel may not be able -// handle it properly. iSel will drop llvm.dbg.value if it can not +// handle it properly. iSel will drop llvm.dbg.value if it can not // find a node corresponding to the value. bool CodeGenPrepare::PlaceDbgValues(Function &F) { bool MadeChange = false; diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index c8c5360..5eff0e5 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -32,7 +32,7 @@ #include "llvm/Target/TargetData.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Support/Debug.h" -#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" using namespace llvm; @@ -71,7 +71,7 @@ namespace { bool HandleFree(CallInst *F); bool handleEndBlock(BasicBlock &BB); void RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, - SmallPtrSet<Value*, 16> &DeadStackObjects); + SmallSetVector<Value*, 16> &DeadStackObjects); virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); @@ -106,7 +106,7 @@ FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); } /// static void DeleteDeadInstruction(Instruction *I, MemoryDependenceAnalysis &MD, - SmallPtrSet<Value*, 16> *ValueSet = 0) { + SmallSetVector<Value*, 16> *ValueSet = 0) { SmallVector<Instruction*, 32> NowDeadInsts; NowDeadInsts.push_back(I); @@ -136,7 +136,7 @@ static void DeleteDeadInstruction(Instruction *I, DeadInst->eraseFromParent(); - if (ValueSet) ValueSet->erase(DeadInst); + if (ValueSet) ValueSet->remove(DeadInst); } while (!NowDeadInsts.empty()); } @@ -248,7 +248,7 @@ static bool isShortenable(Instruction *I) { // Don't shorten stores for now if (isa<StoreInst>(I)) return false; - + IntrinsicInst *II = cast<IntrinsicInst>(I); switch (II->getIntrinsicID()) { default: return false; @@ -275,33 +275,9 @@ static Value *getStoredPointerOperand(Instruction *I) { } static uint64_t getPointerSize(const Value *V, AliasAnalysis &AA) { - const TargetData *TD = AA.getTargetData(); - - if (const CallInst *CI = extractMallocCall(V)) { - if (const ConstantInt *C = dyn_cast<ConstantInt>(CI->getArgOperand(0))) - return C->getZExtValue(); - } - - if (TD == 0) - return AliasAnalysis::UnknownSize; - - if (const AllocaInst *A = dyn_cast<AllocaInst>(V)) { - // 
Get size information for the alloca - if (const ConstantInt *C = dyn_cast<ConstantInt>(A->getArraySize())) - return C->getZExtValue() * TD->getTypeAllocSize(A->getAllocatedType()); - } - - if (const Argument *A = dyn_cast<Argument>(V)) { - if (A->hasByValAttr()) - if (PointerType *PT = dyn_cast<PointerType>(A->getType())) - return TD->getTypeAllocSize(PT->getElementType()); - } - - if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) { - if (!GV->mayBeOverridden()) - return TD->getTypeAllocSize(GV->getType()->getElementType()); - } - + uint64_t Size; + if (getObjectSize(V, Size, AA.getTargetData())) + return Size; return AliasAnalysis::UnknownSize; } @@ -316,7 +292,7 @@ namespace { /// isOverwrite - Return 'OverwriteComplete' if a store to the 'Later' location /// completely overwrites a store to the 'Earlier' location. -/// 'OverwriteEnd' if the end of the 'Earlier' location is completely +/// 'OverwriteEnd' if the end of the 'Earlier' location is completely /// overwritten by 'Later', or 'OverwriteUnknown' if nothing can be determined static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, const AliasAnalysis::Location &Earlier, @@ -339,7 +315,7 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, if (AA.getTargetData() == 0 && Later.Ptr->getType() == Earlier.Ptr->getType()) return OverwriteComplete; - + return OverwriteUnknown; } @@ -405,7 +381,7 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, Later.Size > Earlier.Size && uint64_t(EarlierOff - LaterOff) + Earlier.Size <= Later.Size) return OverwriteComplete; - + // The other interesting case is if the later store overwrites the end of // the earlier store // @@ -544,11 +520,11 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { // If we find a write that is a) removable (i.e., non-volatile), b) is // completely obliterated by the store to 'Loc', and c) which we know that // 'Inst' doesn't load from, then we can remove it. - if (isRemovable(DepWrite) && + if (isRemovable(DepWrite) && !isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) { - int64_t InstWriteOffset, DepWriteOffset; - OverwriteResult OR = isOverwrite(Loc, DepLoc, *AA, - DepWriteOffset, InstWriteOffset); + int64_t InstWriteOffset, DepWriteOffset; + OverwriteResult OR = isOverwrite(Loc, DepLoc, *AA, + DepWriteOffset, InstWriteOffset); if (OR == OverwriteComplete) { DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite << "\n KILLER: " << *Inst << '\n'); @@ -557,7 +533,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { DeleteDeadInstruction(DepWrite, *MD); ++NumFastStores; MadeChange = true; - + // DeleteDeadInstruction can delete the current instruction in loop // cases, reset BBI. 
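
// [Illustration] The OverwriteComplete answer in isOverwrite above reduces to
// a byte-interval containment test. A standalone sketch with a hypothetical
// helper name (the real code also handles equal pointers and unknown sizes
// separately):

#include <cassert>
#include <cstdint>

// True when the later store's byte range [LaterOff, LaterOff+LaterSize)
// covers the earlier store's entire range, making the earlier store dead.
static bool laterCompletelyOverwrites(int64_t EarlierOff, uint64_t EarlierSize,
                                      int64_t LaterOff, uint64_t LaterSize) {
  return LaterOff <= EarlierOff &&
         uint64_t(EarlierOff - LaterOff) + EarlierSize <= LaterSize;
}

int main() {
  assert(laterCompletelyOverwrites(4, 4, 0, 8));  // earlier word inside later i64
  assert(!laterCompletelyOverwrites(4, 8, 0, 8)); // earlier runs past the end
  return 0;
}
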
BBI = Inst; @@ -575,16 +551,16 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { unsigned DepWriteAlign = DepIntrinsic->getAlignment(); if (llvm::isPowerOf2_64(InstWriteOffset) || ((DepWriteAlign != 0) && InstWriteOffset % DepWriteAlign == 0)) { - + DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW END: " - << *DepWrite << "\n KILLER (offset " - << InstWriteOffset << ", " + << *DepWrite << "\n KILLER (offset " + << InstWriteOffset << ", " << DepLoc.Size << ")" << *Inst << '\n'); - + Value* DepWriteLength = DepIntrinsic->getLength(); Value* TrimmedLength = ConstantInt::get(DepWriteLength->getType(), - InstWriteOffset - + InstWriteOffset - DepWriteOffset); DepIntrinsic->setLength(TrimmedLength); MadeChange = true; @@ -694,19 +670,18 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // Keep track of all of the stack objects that are dead at the end of the // function. - SmallPtrSet<Value*, 16> DeadStackObjects; + SmallSetVector<Value*, 16> DeadStackObjects; // Find all of the alloca'd pointers in the entry block. BasicBlock *Entry = BB.getParent()->begin(); for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I) { - if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) - DeadStackObjects.insert(AI); + if (isa<AllocaInst>(I)) + DeadStackObjects.insert(I); // Okay, so these are dead heap objects, but if the pointer never escapes // then it's leaked by this function anyways. - if (CallInst *CI = extractMallocCall(I)) - if (!PointerMayBeCaptured(CI, true, true)) - DeadStackObjects.insert(CI); + else if (isAllocLikeFn(I) && !PointerMayBeCaptured(I, true, true)) + DeadStackObjects.insert(I); } // Treat byval arguments the same, stores to them are dead at the end of the @@ -723,14 +698,30 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // If we find a store, check to see if it points into a dead stack value. if (hasMemoryWrite(BBI) && isRemovable(BBI)) { // See through pointer-to-pointer bitcasts - Value *Pointer = GetUnderlyingObject(getStoredPointerOperand(BBI)); + SmallVector<Value *, 4> Pointers; + GetUnderlyingObjects(getStoredPointerOperand(BBI), Pointers); // Stores to stack values are valid candidates for removal. - if (DeadStackObjects.count(Pointer)) { + bool AllDead = true; + for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(), + E = Pointers.end(); I != E; ++I) + if (!DeadStackObjects.count(*I)) { + AllDead = false; + break; + } + + if (AllDead) { Instruction *Dead = BBI++; DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: " - << *Dead << "\n Object: " << *Pointer << '\n'); + << *Dead << "\n Objects: "; + for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(), + E = Pointers.end(); I != E; ++I) { + dbgs() << **I; + if (llvm::next(I) != E) + dbgs() << ", "; + } + dbgs() << '\n'); // DCE instructions only used to calculate that store. DeleteDeadInstruction(Dead, *MD, &DeadStackObjects); @@ -749,13 +740,8 @@ bool DSE::handleEndBlock(BasicBlock &BB) { continue; } - if (AllocaInst *A = dyn_cast<AllocaInst>(BBI)) { - DeadStackObjects.erase(A); - continue; - } - - if (CallInst *CI = extractMallocCall(BBI)) { - DeadStackObjects.erase(CI); + if (isa<AllocaInst>(BBI) || isAllocLikeFn(BBI)) { + DeadStackObjects.remove(BBI); continue; } @@ -768,7 +754,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // If the call might load from any of our allocas, then any store above // the call is live. 
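
// [Illustration] The "OW END" case above keeps only the prefix of the earlier
// memset/memcpy that the later store does not cover; the trimmed length is
// simply the distance between the two write offsets. A toy sketch
// (hypothetical helper name):

#include <cstdint>

// Earlier intrinsic writes from DepWriteOffset; a later store overwrites
// everything from InstWriteOffset onward, so only the prefix survives.
static uint64_t trimmedLength(int64_t DepWriteOffset, int64_t InstWriteOffset) {
  return uint64_t(InstWriteOffset - DepWriteOffset);
}
// e.g. a memset starting at offset 0 whose bytes from offset 24 onward are
// overwritten is shortened to 24 bytes: trimmedLength(0, 24) == 24.
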
SmallVector<Value*, 8> LiveAllocas; - for (SmallPtrSet<Value*, 16>::iterator I = DeadStackObjects.begin(), + for (SmallSetVector<Value*, 16>::iterator I = DeadStackObjects.begin(), E = DeadStackObjects.end(); I != E; ++I) { // See if the call site touches it. AliasAnalysis::ModRefResult A = @@ -780,7 +766,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { for (SmallVector<Value*, 8>::iterator I = LiveAllocas.begin(), E = LiveAllocas.end(); I != E; ++I) - DeadStackObjects.erase(*I); + DeadStackObjects.remove(*I); // If all of the allocas were clobbered by the call then we're not going // to find anything else to process. @@ -827,7 +813,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { /// of the stack objects in the DeadStackObjects set. If so, they become live /// because the location is being loaded. void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, - SmallPtrSet<Value*, 16> &DeadStackObjects) { + SmallSetVector<Value*, 16> &DeadStackObjects) { const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr); // A constant can't be in the dead pointer set. @@ -837,12 +823,12 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, // If the kill pointer can be easily reduced to an alloca, don't bother doing // extraneous AA queries. if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) { - DeadStackObjects.erase(const_cast<Value*>(UnderlyingPointer)); + DeadStackObjects.remove(const_cast<Value*>(UnderlyingPointer)); return; } SmallVector<Value*, 16> NowLive; - for (SmallPtrSet<Value*, 16>::iterator I = DeadStackObjects.begin(), + for (SmallSetVector<Value*, 16>::iterator I = DeadStackObjects.begin(), E = DeadStackObjects.end(); I != E; ++I) { // See if the loaded location could alias the stack location. AliasAnalysis::Location StackLoc(*I, getPointerSize(*I, *AA)); @@ -852,5 +838,5 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, for (SmallVector<Value*, 16>::iterator I = NowLive.begin(), E = NowLive.end(); I != E; ++I) - DeadStackObjects.erase(*I); + DeadStackObjects.remove(*I); } diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index f3c92d6..9759549 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -39,7 +39,7 @@ static unsigned getHash(const void *V) { } //===----------------------------------------------------------------------===// -// SimpleValue +// SimpleValue //===----------------------------------------------------------------------===// namespace { @@ -47,16 +47,16 @@ namespace { /// scoped hash table. struct SimpleValue { Instruction *Inst; - + SimpleValue(Instruction *I) : Inst(I) { assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); } - + bool isSentinel() const { return Inst == DenseMapInfo<Instruction*>::getEmptyKey() || Inst == DenseMapInfo<Instruction*>::getTombstoneKey(); } - + static bool canHandle(Instruction *Inst) { // This can only handle non-void readnone functions. if (CallInst *CI = dyn_cast<CallInst>(Inst)) @@ -90,7 +90,7 @@ template<> struct DenseMapInfo<SimpleValue> { unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { Instruction *Inst = Val.Inst; - + // Hash in all of the operands as pointers. 
unsigned Res = 0; for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) @@ -126,13 +126,13 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { if (LHS.isSentinel() || RHS.isSentinel()) return LHSI == RHSI; - + if (LHSI->getOpcode() != RHSI->getOpcode()) return false; return LHSI->isIdenticalTo(RHSI); } //===----------------------------------------------------------------------===// -// CallValue +// CallValue //===----------------------------------------------------------------------===// namespace { @@ -140,21 +140,21 @@ namespace { /// the scoped hash table. struct CallValue { Instruction *Inst; - + CallValue(Instruction *I) : Inst(I) { assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); } - + bool isSentinel() const { return Inst == DenseMapInfo<Instruction*>::getEmptyKey() || Inst == DenseMapInfo<Instruction*>::getTombstoneKey(); } - + static bool canHandle(Instruction *Inst) { // Don't value number anything that returns void. if (Inst->getType()->isVoidTy()) return false; - + CallInst *CI = dyn_cast<CallInst>(Inst); if (CI == 0 || !CI->onlyReadsMemory()) return false; @@ -168,7 +168,7 @@ namespace llvm { template<> struct isPodLike<CallValue> { static const bool value = true; }; - + template<> struct DenseMapInfo<CallValue> { static inline CallValue getEmptyKey() { return DenseMapInfo<Instruction*>::getEmptyKey(); @@ -189,7 +189,7 @@ unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) { "Cannot value number calls with metadata operands"); Res ^= getHash(Inst->getOperand(i)) << (i & 0xF); } - + // Mix in the opcode. return (Res << 1) ^ Inst->getOpcode(); } @@ -203,11 +203,11 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) { //===----------------------------------------------------------------------===// -// EarlyCSE pass. +// EarlyCSE pass. //===----------------------------------------------------------------------===// namespace { - + /// EarlyCSE - This pass does a simple depth-first walk over the dominator /// tree, eliminating trivially redundant instructions and using instsimplify /// to canonicalize things as it goes. It is intended to be fast and catch @@ -223,14 +223,14 @@ public: ScopedHashTableVal<SimpleValue, Value*> > AllocatorTy; typedef ScopedHashTable<SimpleValue, Value*, DenseMapInfo<SimpleValue>, AllocatorTy> ScopedHTType; - + /// AvailableValues - This scoped hash table contains the current values of /// all of our simple scalar expressions. As we walk down the domtree, we /// look to see if instructions are in this: if so, we replace them with what /// we find, otherwise we insert them so that dominated values can succeed in /// their lookup. ScopedHTType *AvailableValues; - + /// AvailableLoads - This scoped hash table contains the current values /// of loads. This allows us to get efficient access to dominating loads when /// we have a fully redundant load. In addition to the most recent load, we @@ -243,15 +243,15 @@ public: typedef ScopedHashTable<Value*, std::pair<Value*, unsigned>, DenseMapInfo<Value*>, LoadMapAllocator> LoadHTType; LoadHTType *AvailableLoads; - + /// AvailableCalls - This scoped hash table contains the current values /// of read-only call values. It uses the same generation count as loads. typedef ScopedHashTable<CallValue, std::pair<Value*, unsigned> > CallHTType; CallHTType *AvailableCalls; - + /// CurrentGeneration - This is the current generation of the memory value. 
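
// [Illustration] The getHashValue specializations above follow one mixing
// scheme: hash each operand's identity at a staggered shift, then fold the
// opcode in last. A self-contained approximation (hashPtr is a stand-in for
// the DenseMapInfo pointer hash the real code delegates to):

#include <cstddef>
#include <cstdint>

static unsigned hashPtr(const void *V) {          // stand-in pointer hash
  uintptr_t P = reinterpret_cast<uintptr_t>(V);
  return unsigned(P ^ (P >> 9));
}

static unsigned hashOperation(unsigned Opcode, const void *const *Ops,
                              size_t NumOps) {
  unsigned Res = 0;
  for (size_t i = 0; i != NumOps; ++i)
    Res ^= hashPtr(Ops[i]) << (i & 0xF);          // stagger by operand index
  return (Res << 1) ^ Opcode;                     // mix in the opcode
}
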
  unsigned CurrentGeneration;
-
+
  static char ID;
  explicit EarlyCSE() : FunctionPass(ID) {
    initializeEarlyCSEPass(*PassRegistry::getPassRegistry());
  }
@@ -326,7 +326,7 @@ private:
  };

  bool processNode(DomTreeNode *Node);
-
+
  // This transformation requires dominator and postdominator info
  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
    AU.addRequired<DominatorTree>();
@@ -350,7 +350,7 @@ INITIALIZE_PASS_END(EarlyCSE, "early-cse", "Early CSE", false, false)

bool EarlyCSE::processNode(DomTreeNode *Node) {
  BasicBlock *BB = Node->getBlock();
-
+
  // If this block has a single predecessor, then the predecessor is the parent
  // of the domtree node and all of the live out memory values are still current
  // in this block. If this block has multiple predecessors, then they could
  // have invalidated the live-out memory values of our parent value. For now,
  // just be conservative and invalidate memory if this block has multiple
  // predecessors.
  if (BB->getSinglePredecessor() == 0)
    ++CurrentGeneration;
-
+
  /// LastStore - Keep track of the last non-volatile store that we saw... for
  /// as long as there is no instruction that reads memory. If we see a store
  /// to the same location, we delete the dead store. This zaps trivial dead
  /// stores which can occur in bitfield code among other things.
  StoreInst *LastStore = 0;
-
+
  bool Changed = false;

  // See if any instructions in the block can be eliminated. If so, do it. If
  // not, add them to AvailableValues.
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
    Instruction *Inst = I++;
-
+
    // Dead instructions should just be removed.
    if (isInstructionTriviallyDead(Inst)) {
      DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
@@ -381,7 +381,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
      ++NumSimplify;
      continue;
    }
-
+
    // If the instruction can be simplified (e.g. X+0 = X) then replace it with
    // its simpler value.
    if (Value *V = SimplifyInstruction(Inst, TD, TLI, DT)) {
@@ -392,7 +392,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
      ++NumSimplify;
      continue;
    }
-
+
    // If this is a simple instruction that we can value number, process it.
    if (SimpleValue::canHandle(Inst)) {
      // See if the instruction has an available value. If so, use it.
@@ -404,12 +404,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
        ++NumCSE;
        continue;
      }
-
+
      // Otherwise, just remember that this value is available.
      AvailableValues->insert(Inst, Inst);
      continue;
    }
-
+
    // If this is a non-volatile load, process it.
    if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
      // Ignore volatile loads.
      if (!LI->isSimple()) {
        LastStore = 0;
        continue;
      }
-
+
      // If we have an available version of this load, and if it is the right
      // generation, replace this instruction.
      std::pair<Value*, unsigned> InVal =
@@ -431,18 +431,18 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
        ++NumCSELoad;
        continue;
      }
-
+
      // Otherwise, remember that we have this instruction.
      AvailableLoads->insert(Inst->getOperand(0),
                             std::pair<Value*, unsigned>(Inst, CurrentGeneration));
      LastStore = 0;
      continue;
    }
-
+
    // If this instruction may read from memory, forget LastStore.
    if (Inst->mayReadFromMemory())
      LastStore = 0;
-
+
    // If this is a read-only call, process it.
    if (CallValue::canHandle(Inst)) {
      // If we have an available version of this call, and if it is the right
@@ -457,19 +457,19 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
        ++NumCSECall;
        continue;
      }
-
+
      // Otherwise, remember that we have this instruction.
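
// [Illustration] The CurrentGeneration counter makes invalidation O(1):
// available loads and calls are tagged with the generation in which they were
// seen, and any may-write instruction bumps the counter so stale entries
// simply fail the comparison. A toy model of just that aspect (ignoring the
// scoped hash table and dominator-tree walk):

#include <map>
#include <string>

struct LoadTable {
  unsigned CurrentGeneration;
  // location -> (known value, generation it was recorded in)
  std::map<std::string, std::pair<int, unsigned> > Available;

  LoadTable() : CurrentGeneration(0) {}

  void mayWriteMemory() { ++CurrentGeneration; }   // invalidates everything

  void recordLoad(const std::string &Loc, int V) {
    Available[Loc] = std::make_pair(V, CurrentGeneration);
  }

  bool reuse(const std::string &Loc, int &V) {     // CSE hit only if same gen
    std::map<std::string, std::pair<int, unsigned> >::iterator I =
        Available.find(Loc);
    if (I == Available.end() || I->second.second != CurrentGeneration)
      return false;
    V = I->second.first;
    return true;
  }
};
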
AvailableCalls->insert(Inst, std::pair<Value*, unsigned>(Inst, CurrentGeneration)); continue; } - + // Okay, this isn't something we can CSE at all. Check to see if it is // something that could modify memory. If so, our available memory values // cannot be used so bump the generation count. if (Inst->mayWriteToMemory()) { ++CurrentGeneration; - + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { // We do a trivial form of DSE if there are two stores to the same // location with no intervening loads. Delete the earlier store. @@ -483,7 +483,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { LastStore = 0; continue; } - + // Okay, we just invalidated anything we knew about loaded values. Try // to salvage *something* by remembering that the stored value is a live // version of the pointer. It is safe to forward from volatile stores @@ -491,7 +491,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // the store. AvailableLoads->insert(SI->getPointerOperand(), std::pair<Value*, unsigned>(SI->getValueOperand(), CurrentGeneration)); - + // Remember that this was the last store we saw for DSE. if (SI->isSimple()) LastStore = SI; @@ -509,7 +509,7 @@ bool EarlyCSE::runOnFunction(Function &F) { TD = getAnalysisIfAvailable<TargetData>(); TLI = &getAnalysis<TargetLibraryInfo>(); DT = &getAnalysis<DominatorTree>(); - + // Tables that the pass uses when walking the domtree. ScopedHTType AVTable; AvailableValues = &AVTable; @@ -517,7 +517,7 @@ bool EarlyCSE::runOnFunction(Function &F) { AvailableLoads = &LoadTable; CallHTType CallTable; AvailableCalls = &CallTable; - + CurrentGeneration = 0; bool Changed = false; diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index fb733ad..140864d 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -18,8 +18,15 @@ #define DEBUG_TYPE "gvn" #include "llvm/Transforms/Scalar.h" #include "llvm/GlobalVariable.h" +#include "llvm/IRBuilder.h" #include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" +#include "llvm/Metadata.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/Dominators.h" @@ -30,20 +37,14 @@ #include "llvm/Analysis/PHITransAddr.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Assembly/Writer.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/Hashing.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Statistic.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/PatternMatch.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; using namespace PatternMatch; @@ -59,6 +60,11 @@ static cl::opt<bool> EnablePRE("enable-pre", cl::init(true), cl::Hidden); static cl::opt<bool> EnableLoadPRE("enable-load-pre", cl::init(true)); +// Maximum allowed recursion depth. 
+static cl::opt<uint32_t>
+MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore,
+                cl::desc("Max recurse depth (default = 1000)"));
+
 //===----------------------------------------------------------------------===//
 //                         ValueTable Class
 //===----------------------------------------------------------------------===//
@@ -167,7 +173,7 @@ Expression ValueTable::create_expression(Instruction *I) {
     if (e.varargs[0] > e.varargs[1])
       std::swap(e.varargs[0], e.varargs[1]);
   }
-
+
   if (CmpInst *C = dyn_cast<CmpInst>(I)) {
     // Sort the operand value numbers so x<y and y>x get the same value number.
     CmpInst::Predicate Predicate = C->getPredicate();
@@ -181,7 +187,7 @@ Expression ValueTable::create_expression(Instruction *I) {
          II != IE; ++II)
       e.varargs.push_back(*II);
   }
-
+
   return e;
 }
@@ -385,7 +391,7 @@ uint32_t ValueTable::lookup_or_add(Value *V) {
     valueNumbering[V] = nextValueNumber;
     return nextValueNumber++;
   }
-
+
   Instruction* I = cast<Instruction>(V);
   Expression exp;
   switch (I->getOpcode()) {
@@ -501,7 +507,7 @@ namespace {
     const TargetLibraryInfo *TLI;

     ValueTable VN;
-
+
     /// LeaderTable - A mapping from value numbers to lists of Value*'s that
     /// have that value number. Use findLeader to query it.
     struct LeaderTableEntry {
@@ -511,7 +517,7 @@ namespace {
     };
     DenseMap<uint32_t, LeaderTableEntry> LeaderTable;
     BumpPtrAllocator TableAllocator;
-
+
     SmallVector<Instruction*, 8> InstrsToErase;
   public:
     static char ID; // Pass identification, replacement for typeid
@@ -521,14 +527,14 @@ namespace {
     }

     bool runOnFunction(Function &F);
-
+
     /// markInstructionForDeletion - This removes the specified instruction from
     /// our various maps and marks it for deletion.
     void markInstructionForDeletion(Instruction *I) {
       VN.erase(I);
       InstrsToErase.push_back(I);
     }
-
+
     const TargetData *getTargetData() const { return TD; }
     DominatorTree &getDominatorTree() const { return *DT; }
     AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); }
@@ -543,25 +549,25 @@ namespace {
         Curr.BB = BB;
         return;
       }
-
+
       LeaderTableEntry *Node = TableAllocator.Allocate<LeaderTableEntry>();
       Node->Val = V;
       Node->BB = BB;
       Node->Next = Curr.Next;
       Curr.Next = Node;
     }
-
+
     /// removeFromLeaderTable - Scan the list of values corresponding to a given
-    /// value number, and remove the given value if encountered.
-    void removeFromLeaderTable(uint32_t N, Value *V, BasicBlock *BB) {
+    /// value number, and remove the given instruction if encountered.
+    void removeFromLeaderTable(uint32_t N, Instruction *I, BasicBlock *BB) {
       LeaderTableEntry* Prev = 0;
       LeaderTableEntry* Curr = &LeaderTable[N];

-      while (Curr->Val != V || Curr->BB != BB) {
+      while (Curr->Val != I || Curr->BB != BB) {
         Prev = Curr;
         Curr = Curr->Next;
       }
-
+
       if (Prev) {
         Prev->Next = Curr->Next;
       } else {
@@ -591,7 +597,7 @@ namespace {
       AU.addPreserved<DominatorTree>();
       AU.addPreserved<AliasAnalysis>();
     }
-
+
     // Helper functions
     // FIXME: eliminate or document these better
@@ -647,7 +653,11 @@ void GVN::dump(DenseMap<uint32_t, Value*>& d) {
 ///   3) we are speculating for this block and have used that to speculate for
 ///      other blocks.
 static bool IsValueFullyAvailableInBlock(BasicBlock *BB,
-                            DenseMap<BasicBlock*, char> &FullyAvailableBlocks) {
+                            DenseMap<BasicBlock*, char> &FullyAvailableBlocks,
+                            uint32_t RecurseDepth) {
+  if (RecurseDepth > MaxRecurseDepth)
+    return false;
+
   // Optimistically assume that the block is fully available and check to see
   // if we already know about this block in one lookup.
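
// [Illustration] The new RecurseDepth parameter caps the speculative walk
// over predecessors. A condensed standalone model of the optimistic search
// with that cap (toy Block type; the real code tracks four per-block states
// and repairs wrong speculation in a separate SpeculationFailure pass):

#include <map>
#include <vector>

struct Block {
  std::vector<Block*> Preds;
  bool DefinesValue;
};

static bool fullyAvailable(Block *BB, std::map<Block*, char> &State,
                           unsigned Depth, unsigned MaxDepth) {
  if (Depth > MaxDepth)
    return false;                                  // give up conservatively
  std::pair<std::map<Block*, char>::iterator, bool> R =
      State.insert(std::make_pair(BB, char(0)));   // 0 = assumed available
  if (!R.second)
    return R.first->second == 0;                   // already decided/in progress
  if (BB->DefinesValue)
    return true;
  if (BB->Preds.empty()) {                         // reached entry: unavailable
    R.first->second = 1;
    return false;
  }
  for (size_t i = 0; i != BB->Preds.size(); ++i)
    if (!fullyAvailable(BB->Preds[i], State, Depth + 1, MaxDepth)) {
      R.first->second = 1;                         // undo optimistic assumption
      return false;
    }
  return true;
}
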
std::pair<DenseMap<BasicBlock*, char>::iterator, char> IV = @@ -673,7 +683,7 @@ static bool IsValueFullyAvailableInBlock(BasicBlock *BB, // If the value isn't fully available in one of our predecessors, then it // isn't fully available in this block either. Undo our previous // optimistic assumption and bail out. - if (!IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks)) + if (!IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks,RecurseDepth+1)) goto SpeculationFailure; return true; @@ -725,15 +735,15 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, StoredVal->getType()->isStructTy() || StoredVal->getType()->isArrayTy()) return false; - + // The store has to be at least as big as the load. if (TD.getTypeSizeInBits(StoredVal->getType()) < TD.getTypeSizeInBits(LoadTy)) return false; - + return true; } - + /// CoerceAvailableValueToLoadType - If we saw a store of a value to memory, and /// then a load from a must-aliased pointer of a different type, try to coerce @@ -741,80 +751,80 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, /// InsertPt is the place to insert new instructions. /// /// If we can't do it, return null. -static Value *CoerceAvailableValueToLoadType(Value *StoredVal, +static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy, Instruction *InsertPt, const TargetData &TD) { if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, TD)) return 0; - + // If this is already the right type, just return it. Type *StoredValTy = StoredVal->getType(); - + uint64_t StoreSize = TD.getTypeSizeInBits(StoredValTy); uint64_t LoadSize = TD.getTypeSizeInBits(LoadedTy); - + // If the store and reload are the same size, we can always reuse it. if (StoreSize == LoadSize) { // Pointer to Pointer -> use bitcast. if (StoredValTy->isPointerTy() && LoadedTy->isPointerTy()) return new BitCastInst(StoredVal, LoadedTy, "", InsertPt); - + // Convert source pointers to integers, which can be bitcast. if (StoredValTy->isPointerTy()) { StoredValTy = TD.getIntPtrType(StoredValTy->getContext()); StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt); } - + Type *TypeToCastTo = LoadedTy; if (TypeToCastTo->isPointerTy()) TypeToCastTo = TD.getIntPtrType(StoredValTy->getContext()); - + if (StoredValTy != TypeToCastTo) StoredVal = new BitCastInst(StoredVal, TypeToCastTo, "", InsertPt); - + // Cast to pointer if the load needs a pointer type. if (LoadedTy->isPointerTy()) StoredVal = new IntToPtrInst(StoredVal, LoadedTy, "", InsertPt); - + return StoredVal; } - + // If the loaded value is smaller than the available value, then we can // extract out a piece from it. If the available value is too small, then we // can't do anything. assert(StoreSize >= LoadSize && "CanCoerceMustAliasedValueToLoad fail"); - + // Convert source pointers to integers, which can be manipulated. if (StoredValTy->isPointerTy()) { StoredValTy = TD.getIntPtrType(StoredValTy->getContext()); StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt); } - + // Convert vectors and fp to integer, which can be manipulated. if (!StoredValTy->isIntegerTy()) { StoredValTy = IntegerType::get(StoredValTy->getContext(), StoreSize); StoredVal = new BitCastInst(StoredVal, StoredValTy, "", InsertPt); } - + // If this is a big-endian system, we need to shift the value down to the low // bits so that a truncate will work. 
  if (TD.isBigEndian()) {
    Constant *Val = ConstantInt::get(StoredVal->getType(), StoreSize-LoadSize);
    StoredVal = BinaryOperator::CreateLShr(StoredVal, Val, "tmp", InsertPt);
  }
-
+
  // Truncate the integer to the right size now.
  Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadSize);
  StoredVal = new TruncInst(StoredVal, NewIntTy, "trunc", InsertPt);
-
+
  if (LoadedTy == NewIntTy)
    return StoredVal;
-
+
  // If the result is a pointer, inttoptr.
  if (LoadedTy->isPointerTy())
    return new IntToPtrInst(StoredVal, LoadedTy, "inttoptr", InsertPt);
-
+
  // Otherwise, bitcast.
  return new BitCastInst(StoredVal, LoadedTy, "bitcast", InsertPt);
}
@@ -835,13 +845,13 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
  // to transform them. We need to be able to bitcast to integer.
  if (LoadTy->isStructTy() || LoadTy->isArrayTy())
    return -1;
-
+
  int64_t StoreOffset = 0, LoadOffset = 0;
  Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr, StoreOffset,TD);
  Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, TD);
  if (StoreBase != LoadBase)
    return -1;
-
+
  // If the load and store are to the exact same address, they should have been
  // a must alias. AA must have gotten confused.
  // FIXME: Study to see if/when this happens. One case is forwarding a memset
  // to a load of the base of the memset.
@@ -856,18 +866,18 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
      abort();
    }
  }
#endif
-
+
  // If the load and store don't overlap at all, the store doesn't provide
  // anything to the load. In this case, they really don't alias at all, AA
  // must have gotten confused.
  uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy);
-
+
  if ((WriteSizeInBits & 7) | (LoadSize & 7))
    return -1;

  uint64_t StoreSize = WriteSizeInBits >> 3;  // Convert to bytes.
  LoadSize >>= 3;
-
-
+
+
  bool isAAFailure = false;
  if (StoreOffset < LoadOffset)
    isAAFailure = StoreOffset+int64_t(StoreSize) <= LoadOffset;
@@ -885,7 +895,7 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
#endif
    return -1;
  }
-
+
  // If the Load isn't completely contained within the stored bits, we don't
  // have all the bits to feed it. We could do something crazy in the future
  // (issue a smaller load then merge the bits in) but this seems unlikely to be
  // valuable.
  if (StoreOffset > LoadOffset ||
      StoreOffset+StoreSize < LoadOffset+LoadSize)
    return -1;
-
+
  // Okay, we can do this transformation. Return the number of bytes into the
  // store that the load is.
  return LoadOffset-StoreOffset;
-}
+}

/// AnalyzeLoadFromClobberingStore - This function is called when we have a
/// memdep query of a load that ends up being a clobbering store.
@@ -923,23 +933,23 @@ static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
  // Cannot handle reading from store of first-class aggregate yet.
  if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
    return -1;
-
+
  Value *DepPtr = DepLI->getPointerOperand();
  uint64_t DepSize = TD.getTypeSizeInBits(DepLI->getType());
  int R = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, TD);
  if (R != -1) return R;
-
+
  // If we have a load/load clobber and DepLI can be widened to cover this load,
  // then we should widen it!
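
// [Illustration] Stripped of the IR plumbing, the decision at the end of
// AnalyzeLoadFromClobberingWrite reduces to: the load must lie wholly inside
// the store, and the result is its byte offset within the stored bytes.
// A standalone sketch (hypothetical helper name):

#include <cstdint>

// Returns the load's byte offset inside the store, or -1 when the store does
// not fully provide the loaded bytes.
static int coveredLoadOffset(int64_t StoreOff, uint64_t StoreSize,
                             int64_t LoadOff, uint64_t LoadSize) {
  if (StoreOff > LoadOff ||
      StoreOff + int64_t(StoreSize) < LoadOff + int64_t(LoadSize))
    return -1;
  return int(LoadOff - StoreOff);
}
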
int64_t LoadOffs = 0; const Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, TD); unsigned LoadSize = TD.getTypeStoreSize(LoadTy); - + unsigned Size = MemoryDependenceAnalysis:: getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI, TD); if (Size == 0) return -1; - + return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, TD); } @@ -958,29 +968,29 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, if (MI->getIntrinsicID() == Intrinsic::memset) return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(), MemSizeInBits, TD); - + // If we have a memcpy/memmove, the only case we can handle is if this is a // copy from constant memory. In that case, we can read directly from the // constant memory. MemTransferInst *MTI = cast<MemTransferInst>(MI); - + Constant *Src = dyn_cast<Constant>(MTI->getSource()); if (Src == 0) return -1; - + GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, &TD)); if (GV == 0 || !GV->isConstant()) return -1; - + // See if the access is within the bounds of the transfer. int Offset = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(), MemSizeInBits, TD); if (Offset == -1) return Offset; - + // Otherwise, see if we can constant fold a load from the constant with the // offset applied as appropriate. Src = ConstantExpr::getBitCast(Src, llvm::Type::getInt8PtrTy(Src->getContext())); - Constant *OffsetCst = + Constant *OffsetCst = ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy)); @@ -988,7 +998,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, return Offset; return -1; } - + /// GetStoreValueForLoad - This function is called when we have a /// memdep query of a load that ends up being a clobbering store. This means @@ -999,32 +1009,32 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy, Instruction *InsertPt, const TargetData &TD){ LLVMContext &Ctx = SrcVal->getType()->getContext(); - + uint64_t StoreSize = (TD.getTypeSizeInBits(SrcVal->getType()) + 7) / 8; uint64_t LoadSize = (TD.getTypeSizeInBits(LoadTy) + 7) / 8; - + IRBuilder<> Builder(InsertPt->getParent(), InsertPt); - + // Compute which bits of the stored value are being used by the load. Convert // to an integer type to start with. if (SrcVal->getType()->isPointerTy()) SrcVal = Builder.CreatePtrToInt(SrcVal, TD.getIntPtrType(Ctx)); if (!SrcVal->getType()->isIntegerTy()) SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8)); - + // Shift the bits to the least significant depending on endianness. unsigned ShiftAmt; if (TD.isLittleEndian()) ShiftAmt = Offset*8; else ShiftAmt = (StoreSize-LoadSize-Offset)*8; - + if (ShiftAmt) SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt); - + if (LoadSize != StoreSize) SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8)); - + return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, TD); } @@ -1051,14 +1061,14 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, NewLoadSize = NextPowerOf2(NewLoadSize); Value *PtrVal = SrcVal->getPointerOperand(); - + // Insert the new load after the old load. This ensures that subsequent // memdep queries will find the new load. We can't easily remove the old // load completely because it is already in the value numbering table. 
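
// [Illustration] GetStoreValueForLoad's shift/truncate sequence has a compact
// arithmetic core: pick the shift from the endianness, shift right, truncate.
// The same computation on host integers (the pass emits lshr/trunc IR
// instead):

#include <cassert>
#include <cstdint>

// Extract LoadSize bytes at byte offset Offset from a StoreSize-byte store.
static uint64_t extractBytes(uint64_t StoredVal, unsigned StoreSize,
                             unsigned LoadSize, unsigned Offset,
                             bool LittleEndian) {
  unsigned ShiftAmt = LittleEndian
                          ? Offset * 8                           // LE: low bytes first
                          : (StoreSize - LoadSize - Offset) * 8; // BE: from the top
  uint64_t V = StoredVal >> ShiftAmt;                            // lshr
  if (LoadSize < 8)
    V &= (uint64_t(1) << (LoadSize * 8)) - 1;                    // trunc
  return V;
}

int main() {
  // Forward the byte at offset 1 of a little-endian i32 store of 0xAABBCCDD.
  assert(extractBytes(0xAABBCCDDu, 4, 1, 1, true) == 0xCC);
  return 0;
}
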
IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal)); - Type *DestPTy = + Type *DestPTy = IntegerType::get(LoadTy->getContext(), NewLoadSize*8); - DestPTy = PointerType::get(DestPTy, + DestPTy = PointerType::get(DestPTy, cast<PointerType>(PtrVal->getType())->getAddressSpace()); Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc()); PtrVal = Builder.CreateBitCast(PtrVal, DestPTy); @@ -1068,7 +1078,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n"); DEBUG(dbgs() << "TO: " << *NewLoad << "\n"); - + // Replace uses of the original load with the wider load. On a big endian // system, we need to shift down to get the relevant bits. Value *RV = NewLoad; @@ -1077,7 +1087,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, NewLoadSize*8-SrcVal->getType()->getPrimitiveSizeInBits()); RV = Builder.CreateTrunc(RV, SrcVal->getType()); SrcVal->replaceAllUsesWith(RV); - + // We would like to use gvn.markInstructionForDeletion here, but we can't // because the load is already memoized into the leader map table that GVN // tracks. It is potentially possible to remove the load from the table, @@ -1086,7 +1096,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, gvn.getMemDep().removeInstruction(SrcVal); SrcVal = NewLoad; } - + return GetStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, TD); } @@ -1100,7 +1110,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy)/8; IRBuilder<> Builder(InsertPt->getParent(), InsertPt); - + // We know that this method is only called when the mem transfer fully // provides the bits for the load. if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) { @@ -1109,9 +1119,9 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, Value *Val = MSI->getValue(); if (LoadSize != 1) Val = Builder.CreateZExt(Val, IntegerType::get(Ctx, LoadSize*8)); - + Value *OneElt = Val; - + // Splat the value out to the right number of bits. for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize; ) { // If we can double the number of bytes set, do it. @@ -1121,16 +1131,16 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, NumBytesSet <<= 1; continue; } - + // Otherwise insert one byte at a time. Value *ShVal = Builder.CreateShl(Val, 1*8); Val = Builder.CreateOr(OneElt, ShVal); ++NumBytesSet; } - + return CoerceAvailableValueToLoadType(Val, LoadTy, InsertPt, TD); } - + // Otherwise, this is a memcpy/memmove from a constant global. MemTransferInst *MTI = cast<MemTransferInst>(SrcInst); Constant *Src = cast<Constant>(MTI->getSource()); @@ -1139,7 +1149,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, // offset applied as appropriate. Src = ConstantExpr::getBitCast(Src, llvm::Type::getInt8PtrTy(Src->getContext())); - Constant *OffsetCst = + Constant *OffsetCst = ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy)); @@ -1156,13 +1166,13 @@ struct AvailableValueInBlock { LoadVal, // A value produced by a load. MemIntrin // A memory intrinsic which is loaded from. }; - + /// V - The value that is live out of the block. PointerIntPair<Value *, 2, ValType> Val; - + /// Offset - The byte offset in Val that is interesting for the load query. 
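
// [Illustration] The splat loop in GetMemInstValueForLoad doubles the
// materialized bytes while that still fits, then appends single bytes. The
// same computation on host integers (illustrative; the pass builds shl/or IR):

#include <cassert>
#include <cstdint>

static uint64_t splatByte(uint8_t Byte, unsigned LoadSize) { // LoadSize <= 8
  uint64_t Val = Byte;
  uint64_t OneElt = Byte;
  for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize;) {
    if (NumBytesSet * 2 <= LoadSize) {      // double the bytes already set
      Val |= Val << (NumBytesSet * 8);
      NumBytesSet <<= 1;
      continue;
    }
    Val = OneElt | (Val << 8);              // otherwise add one byte at a time
    ++NumBytesSet;
  }
  return Val;
}

int main() {
  assert(splatByte(0xAB, 3) == 0xABABABu);  // a 3-byte load from memset(0xAB)
  return 0;
}
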
unsigned Offset; - + static AvailableValueInBlock get(BasicBlock *BB, Value *V, unsigned Offset = 0) { AvailableValueInBlock Res; @@ -1182,7 +1192,7 @@ struct AvailableValueInBlock { Res.Offset = Offset; return Res; } - + static AvailableValueInBlock getLoad(BasicBlock *BB, LoadInst *LI, unsigned Offset = 0) { AvailableValueInBlock Res; @@ -1201,17 +1211,17 @@ struct AvailableValueInBlock { assert(isSimpleValue() && "Wrong accessor"); return Val.getPointer(); } - + LoadInst *getCoercedLoadValue() const { assert(isCoercedLoadValue() && "Wrong accessor"); return cast<LoadInst>(Val.getPointer()); } - + MemIntrinsic *getMemIntrinValue() const { assert(isMemIntrinValue() && "Wrong accessor"); return cast<MemIntrinsic>(Val.getPointer()); } - + /// MaterializeAdjustedValue - Emit code into this block to adjust the value /// defined here to the specified type. This handles various coercion cases. Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const { @@ -1223,7 +1233,7 @@ struct AvailableValueInBlock { assert(TD && "Need target data to handle type mismatch case"); Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), *TD); - + DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " << *getSimpleValue() << '\n' << *Res << '\n' << "\n\n\n"); @@ -1235,7 +1245,7 @@ struct AvailableValueInBlock { } else { Res = GetLoadValueForLoad(Load, Offset, LoadTy, BB->getTerminator(), gvn); - + DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " " << *getCoercedLoadValue() << '\n' << *Res << '\n' << "\n\n\n"); @@ -1258,12 +1268,12 @@ struct AvailableValueInBlock { /// ConstructSSAForLoadSet - Given a set of loads specified by ValuesPerBlock, /// construct SSA form, allowing us to eliminate LI. This returns the value /// that should be used at LI's definition site. -static Value *ConstructSSAForLoadSet(LoadInst *LI, +static Value *ConstructSSAForLoadSet(LoadInst *LI, SmallVectorImpl<AvailableValueInBlock> &ValuesPerBlock, GVN &gvn) { // Check for the fully redundant, dominating load case. In this case, we can // just use the dominating value directly. - if (ValuesPerBlock.size() == 1 && + if (ValuesPerBlock.size() == 1 && gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB, LI->getParent())) return ValuesPerBlock[0].MaterializeAdjustedValue(LI->getType(), gvn); @@ -1272,29 +1282,29 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, SmallVector<PHINode*, 8> NewPHIs; SSAUpdater SSAUpdate(&NewPHIs); SSAUpdate.Initialize(LI->getType(), LI->getName()); - + Type *LoadTy = LI->getType(); - + for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) { const AvailableValueInBlock &AV = ValuesPerBlock[i]; BasicBlock *BB = AV.BB; - + if (SSAUpdate.HasValueForBlock(BB)) continue; SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LoadTy, gvn)); } - + // Perform PHI construction. Value *V = SSAUpdate.GetValueInMiddleOfBlock(LI->getParent()); - + // If new PHI nodes were created, notify alias analysis. if (V->getType()->isPointerTy()) { AliasAnalysis *AA = gvn.getAliasAnalysis(); - + for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) AA->copyValue(LI, NewPHIs[i]); - + // Now that we've copied information to the new PHIs, scan through // them again and inform alias analysis that we've added potentially // escaping uses to any values that are operands to these PHIs. @@ -1366,7 +1376,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { // the pointer operand of the load if PHI translation occurs. Make sure // to consider the right address. 
Value *Address = Deps[i].getAddress(); - + // If the dependence is to a store that writes to a superset of the bits // read by the load, we can extract the bits we need for the load from the // stored value. @@ -1382,7 +1392,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { } } } - + // Check to see if we have something like this: // load i32* P // load i8* (P+1) @@ -1394,7 +1404,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { int Offset = AnalyzeLoadFromClobberingLoad(LI->getType(), LI->getPointerOperand(), DepLI, *TD); - + if (Offset != -1) { ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB,DepLI, Offset)); @@ -1413,10 +1423,10 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { ValuesPerBlock.push_back(AvailableValueInBlock::getMI(DepBB, DepMI, Offset)); continue; - } + } } } - + UnavailableBlocks.push_back(DepBB); continue; } @@ -1426,14 +1436,14 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { Instruction *DepInst = DepInfo.getInst(); // Loading the allocation -> undef. - if (isa<AllocaInst>(DepInst) || isMalloc(DepInst) || + if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst) || // Loading immediately after lifetime begin -> undef. isLifetimeStart(DepInst)) { ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, UndefValue::get(LI->getType()))); continue; } - + if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) { // Reject loads and stores that are to the same address but are of // different types if we have to. @@ -1451,7 +1461,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { S->getValueOperand())); continue; } - + if (LoadInst *LD = dyn_cast<LoadInst>(DepInst)) { // If the types mismatch and we can't handle it, reject reuse of the load. if (LD->getType() != LI->getType()) { @@ -1460,12 +1470,12 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { if (TD == 0 || !CanCoerceMustAliasedValueToLoad(LD, LI->getType(),*TD)){ UnavailableBlocks.push_back(DepBB); continue; - } + } } ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB, LD)); continue; } - + UnavailableBlocks.push_back(DepBB); continue; } @@ -1479,7 +1489,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { // its value. Insert PHIs and remove the fully redundant value now. if (UnavailableBlocks.empty()) { DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n'); - + // Perform PHI construction. Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this); LI->replaceAllUsesWith(V); @@ -1522,10 +1532,10 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return false; if (Blockers.count(TmpBB)) return false; - + // If any of these blocks has more than one successor (i.e. if the edge we - // just traversed was critical), then there are other paths through this - // block along which the load may not be anticipated. Hoisting the load + // just traversed was critical), then there are other paths through this + // block along which the load may not be anticipated. Hoisting the load // above this block would be adding the load to execution paths along // which it was not previously executed. 
if (TmpBB->getTerminator()->getNumSuccessors() != 1) @@ -1570,7 +1580,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); PI != E; ++PI) { BasicBlock *Pred = *PI; - if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks)) { + if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) { continue; } PredLoads[Pred] = 0; @@ -1603,7 +1613,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { unsigned NumUnavailablePreds = PredLoads.size(); assert(NumUnavailablePreds != 0 && "Fully available value should be eliminated above!"); - + // If this load is unavailable in multiple predecessors, reject it. // FIXME: If we could restructure the CFG, we could make a common pred with // all the preds that don't have an available LI and insert a new load into @@ -1680,10 +1690,10 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { DEBUG(if (!NewInsts.empty()) dbgs() << "INSERTED " << NewInsts.size() << " INSTS: " << *NewInsts.back() << '\n'); - + // Assign value numbers to the new instructions. for (unsigned i = 0, e = NewInsts.size(); i != e; ++i) { - // FIXME: We really _ought_ to insert these value numbers into their + // FIXME: We really _ought_ to insert these value numbers into their // parent's availability map. However, in doing so, we risk getting into // ordering issues. If a block hasn't been processed yet, we would be // marking a value as AVAIL-IN, which isn't what we intend. @@ -1725,6 +1735,53 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return true; } +static void patchReplacementInstruction(Value *Repl, Instruction *I) { + // Patch the replacement so that it is not more restrictive than the value + // being replaced. + BinaryOperator *Op = dyn_cast<BinaryOperator>(I); + BinaryOperator *ReplOp = dyn_cast<BinaryOperator>(Repl); + if (Op && ReplOp && isa<OverflowingBinaryOperator>(Op) && + isa<OverflowingBinaryOperator>(ReplOp)) { + if (ReplOp->hasNoSignedWrap() && !Op->hasNoSignedWrap()) + ReplOp->setHasNoSignedWrap(false); + if (ReplOp->hasNoUnsignedWrap() && !Op->hasNoUnsignedWrap()) + ReplOp->setHasNoUnsignedWrap(false); + } + if (Instruction *ReplInst = dyn_cast<Instruction>(Repl)) { + SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata; + ReplInst->getAllMetadataOtherThanDebugLoc(Metadata); + for (int i = 0, n = Metadata.size(); i < n; ++i) { + unsigned Kind = Metadata[i].first; + MDNode *IMD = I->getMetadata(Kind); + MDNode *ReplMD = Metadata[i].second; + switch(Kind) { + default: + ReplInst->setMetadata(Kind, NULL); // Remove unknown metadata + break; + case LLVMContext::MD_dbg: + llvm_unreachable("getAllMetadataOtherThanDebugLoc returned a MD_dbg"); + case LLVMContext::MD_tbaa: + ReplInst->setMetadata(Kind, MDNode::getMostGenericTBAA(IMD, ReplMD)); + break; + case LLVMContext::MD_range: + ReplInst->setMetadata(Kind, MDNode::getMostGenericRange(IMD, ReplMD)); + break; + case LLVMContext::MD_prof: + llvm_unreachable("MD_prof in a non terminator instruction"); + break; + case LLVMContext::MD_fpmath: + ReplInst->setMetadata(Kind, MDNode::getMostGenericFPMath(IMD, ReplMD)); + break; + } + } + } +} + +static void patchAndReplaceAllUsesWith(Value *Repl, Instruction *I) { + patchReplacementInstruction(Repl, I); + I->replaceAllUsesWith(Repl); +} + /// processLoad - Attempt to eliminate a load, first by eliminating it /// locally, and then attempting non-local elimination if that fails. 
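
// [Illustration] patchReplacementInstruction enforces a simple rule: the
// surviving instruction may only keep a guarantee (nsw/nuw, or mergeable
// metadata) if both instructions had it. Reduced to the wrap flags alone:

struct WrapFlags {
  bool NoSignedWrap;
  bool NoUnsignedWrap;
};

// The replacement must not be more restrictive than the instruction it
// replaces, so the merged guarantees are the conjunction of the two.
static WrapFlags mergeForReplacement(WrapFlags Repl, WrapFlags Orig) {
  WrapFlags Merged;
  Merged.NoSignedWrap = Repl.NoSignedWrap && Orig.NoSignedWrap;
  Merged.NoUnsignedWrap = Repl.NoUnsignedWrap && Orig.NoUnsignedWrap;
  return Merged;
}
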
bool GVN::processLoad(LoadInst *L) { @@ -1738,7 +1795,7 @@ bool GVN::processLoad(LoadInst *L) { markInstructionForDeletion(L); return true; } - + // ... to a pointer that has been loaded from before... MemDepResult Dep = MD->getDependency(L); @@ -1764,7 +1821,7 @@ bool GVN::processLoad(LoadInst *L) { AvailVal = GetStoreValueForLoad(DepSI->getValueOperand(), Offset, L->getType(), L, *TD); } - + // Check to see if we have something like this: // load i32* P // load i8* (P+1) @@ -1774,14 +1831,14 @@ bool GVN::processLoad(LoadInst *L) { // we have the first instruction in the entry block. if (DepLI == L) return false; - + int Offset = AnalyzeLoadFromClobberingLoad(L->getType(), L->getPointerOperand(), DepLI, *TD); if (Offset != -1) AvailVal = GetLoadValueForLoad(DepLI, Offset, L->getType(), L, *this); } - + // If the clobbering value is a memset/memcpy/memmove, see if we can forward // a value on from it. if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(Dep.getInst())) { @@ -1791,11 +1848,11 @@ bool GVN::processLoad(LoadInst *L) { if (Offset != -1) AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, *TD); } - + if (AvailVal) { DEBUG(dbgs() << "GVN COERCED INST:\n" << *Dep.getInst() << '\n' << *AvailVal << '\n' << *L << "\n\n\n"); - + // Replace the load! L->replaceAllUsesWith(AvailVal); if (AvailVal->getType()->isPointerTy()) @@ -1805,7 +1862,7 @@ bool GVN::processLoad(LoadInst *L) { return true; } } - + // If the value isn't available, don't do anything! if (Dep.isClobber()) { DEBUG( @@ -1835,7 +1892,7 @@ bool GVN::processLoad(LoadInst *L) { Instruction *DepInst = Dep.getInst(); if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) { Value *StoredVal = DepSI->getValueOperand(); - + // The store and load are to a must-aliased pointer, but they may not // actually have the same type. See if we know how to reuse the stored // value (depending on its type). @@ -1845,11 +1902,11 @@ bool GVN::processLoad(LoadInst *L) { L, *TD); if (StoredVal == 0) return false; - + DEBUG(dbgs() << "GVN COERCED STORE:\n" << *DepSI << '\n' << *StoredVal << '\n' << *L << "\n\n\n"); } - else + else return false; } @@ -1864,7 +1921,7 @@ bool GVN::processLoad(LoadInst *L) { if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) { Value *AvailableVal = DepLI; - + // The loads are of a must-aliased pointer, but they may not actually have // the same type. See if we know how to reuse the previously loaded value // (depending on its type). @@ -1874,16 +1931,16 @@ bool GVN::processLoad(LoadInst *L) { L, *TD); if (AvailableVal == 0) return false; - + DEBUG(dbgs() << "GVN COERCED LOAD:\n" << *DepLI << "\n" << *AvailableVal << "\n" << *L << "\n\n\n"); } - else + else return false; } - + // Remove it! - L->replaceAllUsesWith(AvailableVal); + patchAndReplaceAllUsesWith(AvailableVal, L); if (DepLI->getType()->isPointerTy()) MD->invalidateCachedPointerInfo(DepLI); markInstructionForDeletion(L); @@ -1894,13 +1951,13 @@ bool GVN::processLoad(LoadInst *L) { // If this load really doesn't depend on anything, then we must be loading an // undef value. This can happen when loading for a fresh allocation with no // intervening stores, for example. - if (isa<AllocaInst>(DepInst) || isMalloc(DepInst)) { + if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst)) { L->replaceAllUsesWith(UndefValue::get(L->getType())); markInstructionForDeletion(L); ++NumGVNLoad; return true; } - + // If this load occurs either right after a lifetime begin, // then the loaded value is undefined. 
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(DepInst)) { @@ -1915,28 +1972,28 @@ bool GVN::processLoad(LoadInst *L) { return false; } -// findLeader - In order to find a leader for a given value number at a +// findLeader - In order to find a leader for a given value number at a // specific basic block, we first obtain the list of all Values for that number, -// and then scan the list to find one whose block dominates the block in +// and then scan the list to find one whose block dominates the block in // question. This is fast because dominator tree queries consist of only // a few comparisons of DFS numbers. Value *GVN::findLeader(BasicBlock *BB, uint32_t num) { LeaderTableEntry Vals = LeaderTable[num]; if (!Vals.Val) return 0; - + Value *Val = 0; if (DT->dominates(Vals.BB, BB)) { Val = Vals.Val; if (isa<Constant>(Val)) return Val; } - + LeaderTableEntry* Next = Vals.Next; while (Next) { if (DT->dominates(Next->BB, BB)) { if (isa<Constant>(Next->Val)) return Next->Val; if (!Val) Val = Next->Val; } - + Next = Next->Next; } @@ -2012,9 +2069,15 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, BasicBlock *Root) { DT->properlyDominates(cast<Instruction>(RHS)->getParent(), Root)) && "Instruction doesn't dominate scope!"); - // If value numbering later deduces that an instruction in the scope is equal - // to 'LHS' then ensure it will be turned into 'RHS'. - addToLeaderTable(LVN, RHS, Root); + // If value numbering later sees that an instruction in the scope is equal + // to 'LHS' then ensure it will be turned into 'RHS'. In order to preserve + // the invariant that instructions only occur in the leader table for their + // own value number (this is used by removeFromLeaderTable), do not do this + // if RHS is an instruction (if an instruction in the scope is morphed into + // LHS then it will be turned into RHS by the next GVN iteration anyway, so + // using the leader table is about compiling faster, not optimizing better). + if (!isa<Instruction>(RHS)) + addToLeaderTable(LVN, RHS, Root); // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As // LHS always has at least one use that is not dominated by Root, this will @@ -2180,7 +2243,7 @@ bool GVN::processInstruction(Instruction *I) { // Instructions with void type don't return a value, so there's // no point in trying to find redundancies in them. if (I->getType()->isVoidTy()) return false; - + uint32_t NextNum = VN.getNextUnusedValueNumber(); unsigned Num = VN.lookup_or_add(I); @@ -2198,7 +2261,7 @@ bool GVN::processInstruction(Instruction *I) { addToLeaderTable(Num, I, I->getParent()); return false; } - + // Perform fast-path value-number based elimination of values inherited from // dominators. Value *repl = findLeader(I->getParent(), Num); @@ -2207,9 +2270,9 @@ bool GVN::processInstruction(Instruction *I) { addToLeaderTable(Num, I, I->getParent()); return false; } - + // Remove it! - I->replaceAllUsesWith(repl); + patchAndReplaceAllUsesWith(repl, I); if (MD && repl->getType()->isPointerTy()) MD->invalidateCachedPointerInfo(repl); markInstructionForDeletion(I); @@ -2234,7 +2297,7 @@ bool GVN::runOnFunction(Function& F) { // optimization opportunities. for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) { BasicBlock *BB = FI++; - + bool removedBlock = MergeBlockIntoPredecessor(BB, this); if (removedBlock) ++NumGVNBlocks; @@ -2391,7 +2454,7 @@ bool GVN::performPRE(Function &F) { // we would need to insert instructions in more than one pred. 
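For readers unfamiliar with the leader table, here is a minimal standalone sketch of the scan findLeader performs, using invented stand-in types rather than LLVM's: among the entries whose block dominates the query block, a constant always wins, otherwise the first dominating definition does.

struct Entry { int Block; bool IsConstant; int Value; Entry *Next; };

// Placeholder dominance test; the real pass asks the dominator tree, which
// answers with a few comparisons of DFS numbers.
static bool dominates(int A, int B) { return A <= B; }

static const Entry *findLeader(const Entry *Vals, int Block) {
  const Entry *Found = nullptr;
  for (const Entry *E = Vals; E; E = E->Next) {
    if (!dominates(E->Block, Block))
      continue;
    if (E->IsConstant)
      return E;        // a constant is always the preferred leader
    if (!Found)
      Found = E;       // otherwise keep the first dominating definition
  }
  return Found;
}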
if (NumWithout != 1 || NumWith == 0) continue; - + // Don't do PRE across indirect branch. if (isa<IndirectBrInst>(PREPred->getTerminator())) continue; @@ -2467,7 +2530,7 @@ bool GVN::performPRE(Function &F) { unsigned jj = PHINode::getOperandNumForIncomingValue(ii); VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj)); } - + if (MD) MD->invalidateCachedPointerInfo(Phi); } @@ -2504,7 +2567,7 @@ bool GVN::splitCriticalEdges() { /// iterateOnFunction - Executes one iteration of GVN bool GVN::iterateOnFunction(Function &F) { cleanupGlobalSets(); - + // Top-down walk of the dominator tree bool Changed = false; #if 0 @@ -2539,7 +2602,7 @@ void GVN::verifyRemoved(const Instruction *Inst) const { I = LeaderTable.begin(), E = LeaderTable.end(); I != E; ++I) { const LeaderTableEntry *Node = &I->second; assert(Node->Val != Inst && "Inst still in value numbering scope!"); - + while (Node->Next) { Node = Node->Next; assert(Node->Val != Inst && "Inst still in value numbering scope!"); diff --git a/lib/Transforms/Scalar/GlobalMerge.cpp b/lib/Transforms/Scalar/GlobalMerge.cpp index c2bd6e6..b36a3cb 100644 --- a/lib/Transforms/Scalar/GlobalMerge.cpp +++ b/lib/Transforms/Scalar/GlobalMerge.cpp @@ -12,7 +12,7 @@ // global). Such a transformation can significantly reduce the register pressure // when many globals are involved. // -// For example, consider the code which touches several global variables at +// For example, consider the code which touches several global variables at // once: // // static int foo[N], bar[N], baz[N]; @@ -208,8 +208,8 @@ bool GlobalMerge::doInitialization(Module &M) { if (BSSGlobals.size() > 1) Changed |= doMerge(BSSGlobals, M, false); - // FIXME: This currently breaks the EH processing due to way how the - // typeinfo detection works. We might want to detect the TIs and ignore + // FIXME: This currently breaks the EH processing due to way how the + // typeinfo detection works. We might want to detect the TIs and ignore // them in the future. // if (ConstGlobals.size() > 1) // Changed |= doMerge(ConstGlobals, M, true); diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index a9ba657..37f8bdf 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -1215,21 +1215,26 @@ static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { return 0; } -/// needsLFTR - LinearFunctionTestReplace policy. Return true unless we can show -/// that the current exit test is already sufficiently canonical. -static bool needsLFTR(Loop *L, DominatorTree *DT) { +/// Return the compare guarding the loop latch, or NULL for unrecognized tests. +static ICmpInst *getLoopTest(Loop *L) { assert(L->getExitingBlock() && "expected loop exit"); BasicBlock *LatchBlock = L->getLoopLatch(); // Don't bother with LFTR if the loop is not properly simplified. if (!LatchBlock) - return false; + return 0; BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator()); assert(BI && "expected exit branch"); + return dyn_cast<ICmpInst>(BI->getCondition()); +} + +/// needsLFTR - LinearFunctionTestReplace policy. Return true unless we can show +/// that the current exit test is already sufficiently canonical. +static bool needsLFTR(Loop *L, DominatorTree *DT) { // Do LFTR to simplify the exit condition to an ICMP. 
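A hypothetical source-level picture of what LFTR is after, not code from this patch: replace a derived exit test with a canonical compare against the trip count. The equivalence below assumes N < UINT_MAX so neither test wraps.

unsigned sumBefore(const unsigned *A, unsigned N) {
  unsigned S = 0;
  for (unsigned I = 0; I + 1 <= N; ++I)  // exit test on a derived expression
    S += A[I];
  return S;
}

unsigned sumAfter(const unsigned *A, unsigned N) {
  unsigned S = 0;
  for (unsigned I = 0; I != N; ++I)      // canonical ICMP_NE exit test
    S += A[I];
  return S;
}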
-  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
+  ICmpInst *Cond = getLoopTest(L);
   if (!Cond)
     return true;
@@ -1259,6 +1264,48 @@ static bool needsLFTR(Loop *L, DominatorTree *DT) {
   return Phi != getLoopPhiForCounter(IncV, L, DT);
 }

+/// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils
+/// down to checking that all operands are constant and listing instructions
+/// that may hide undef.
+static bool hasConcreteDefImpl(Value *V, SmallPtrSet<Value*, 8> &Visited,
+                               unsigned Depth) {
+  if (isa<Constant>(V))
+    return !isa<UndefValue>(V);
+
+  if (Depth >= 6)
+    return false;
+
+  // Conservatively handle non-constant non-instructions. For example, Arguments
+  // may be undef.
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return false;
+
+  // Load and return values may be undef.
+  if (I->mayReadFromMemory() || isa<CallInst>(I) || isa<InvokeInst>(I))
+    return false;
+
+  // Optimistically handle other instructions.
+  for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) {
+    if (!Visited.insert(*OI))
+      continue;
+    if (!hasConcreteDefImpl(*OI, Visited, Depth+1))
+      return false;
+  }
+  return true;
+}
+
+/// Return true if the given value is concrete. We must prove that undef can
+/// never reach it.
+///
+/// TODO: If we decide that this is a good approach to checking for undef, we
+/// may factor it into a common location.
+static bool hasConcreteDef(Value *V) {
+  SmallPtrSet<Value*, 8> Visited;
+  Visited.insert(V);
+  return hasConcreteDefImpl(V, Visited, 0);
+}
+
 /// AlmostDeadIV - Return true if this IV has any uses other than the (soon to
 /// be rewritten) loop exit test.
 static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {
@@ -1283,6 +1330,8 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {
 /// valid count without scaling the address stride, so it remains a pointer
 /// expression as far as SCEV is concerned.
 ///
+/// Currently only valid for LFTR. See the comments on hasConcreteDef below.
+///
 /// FIXME: Accept -1 stride and set IVLimit = IVInit - BECount
 ///
 /// FIXME: Accept non-unit stride as long as SCEV can reduce BECount * Stride.
@@ -1331,6 +1380,19 @@ FindLoopCounter(Loop *L, const SCEV *BECount,
     if (getLoopPhiForCounter(IncV, L, DT) != Phi)
       continue;

+    // Avoid reusing a potentially undef value to compute other values that may
+    // have originally had a concrete definition.
+    if (!hasConcreteDef(Phi)) {
+      // We explicitly allow unknown phis as long as they are already used by
+      // the loop test. In this case we assume that performing LFTR could not
+      // increase the number of undef users.
+      if (ICmpInst *Cond = getLoopTest(L)) {
+        if (Phi != getLoopPhiForCounter(Cond->getOperand(0), L, DT)
+            && Phi != getLoopPhiForCounter(Cond->getOperand(1), L, DT)) {
+          continue;
+        }
+      }
+    }
     const SCEV *Init = AR->getStart();

     if (BestPhi && !AlmostDeadIV(BestPhi, LatchBlock, Cond)) {
@@ -1347,7 +1409,7 @@ FindLoopCounter(Loop *L, const SCEV *BECount,
       // If two IVs both count from zero or both count from nonzero then the
       // narrower is likely a dead phi that has been widened. Use the wider phi
       // to allow the other to be eliminated.
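A self-contained analog of the hasConcreteDef walk above, over an invented Node type instead of llvm::Value, for readers who want to experiment with the rule: a value is provably undef-free when every operand path bottoms out in non-undef constants, giving up at depth six and at anything that may read memory.

#include <unordered_set>
#include <vector>

struct Node {
  bool IsConstant, IsUndef, MayReadMemory;
  std::vector<Node *> Ops;
};

static bool provablyConcrete(Node *V, std::unordered_set<Node *> &Visited,
                             unsigned Depth) {
  if (V->IsConstant)
    return !V->IsUndef;
  if (Depth >= 6 || V->MayReadMemory)  // bound the walk; loads may hide undef
    return false;
  if (V->Ops.empty())
    return false;                      // unknown leaf, e.g. a function argument
  for (Node *Op : V->Ops) {
    if (!Visited.insert(Op).second)
      continue;                        // operand already visited, skip it
    if (!provablyConcrete(Op, Visited, Depth + 1))
      return false;
  }
  return true;
}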
- if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType())) + else if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType())) continue; } BestPhi = Phi; diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 429b61b..dd42c59 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -670,6 +670,8 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Terminator)) { Condition = SI->getCondition(); } else if (IndirectBrInst *IB = dyn_cast<IndirectBrInst>(Terminator)) { + // Can't thread indirect branch with no successors. + if (IB->getNumSuccessors() == 0) return false; Condition = IB->getAddress()->stripPointerCasts(); Preference = WantBlockAddress; } else { @@ -859,7 +861,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // If all of the loads and stores that feed the value have the same TBAA tag, // then we can propagate it onto any newly inserted loads. - MDNode *TBAATag = LI->getMetadata(LLVMContext::MD_tbaa); + MDNode *TBAATag = LI->getMetadata(LLVMContext::MD_tbaa); SmallPtrSet<BasicBlock*, 8> PredsScanned; typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy; @@ -885,7 +887,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { OneUnavailablePred = PredBB; continue; } - + // If tbaa tags disagree or are not present, forget about them. if (TBAATag != ThisTBAATag) TBAATag = 0; @@ -949,7 +951,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { NewVal->setDebugLoc(LI->getDebugLoc()); if (TBAATag) NewVal->setMetadata(LLVMContext::MD_tbaa, TBAATag); - + AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal)); } diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 8795cd8..582948e 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -618,6 +618,11 @@ bool LICM::isGuaranteedToExecute(Instruction &Inst) { if (!DT->dominates(Inst.getParent(), ExitBlocks[i])) return false; + // As a degenerate case, if the loop is statically infinite then we haven't + // proven anything since there are no exit blocks. + if (ExitBlocks.empty()) + return false; + return true; } diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp index f7f3298..3771f5a 100644 --- a/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/lib/Transforms/Scalar/LoopDeletion.cpp @@ -32,10 +32,10 @@ namespace { LoopDeletion() : LoopPass(ID) { initializeLoopDeletionPass(*PassRegistry::getPassRegistry()); } - + // Possibly eliminate loop L if it is dead. bool runOnLoop(Loop* L, LPPassManager& LPM); - + bool IsLoopDead(Loop* L, SmallVector<BasicBlock*, 4>& exitingBlocks, SmallVector<BasicBlock*, 4>& exitBlocks, bool &Changed, BasicBlock *Preheader); @@ -46,7 +46,7 @@ namespace { AU.addRequired<ScalarEvolution>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - + AU.addPreserved<ScalarEvolution>(); AU.addPreserved<DominatorTree>(); AU.addPreserved<LoopInfo>(); @@ -55,7 +55,7 @@ namespace { } }; } - + char LoopDeletion::ID = 0; INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion", "Delete dead loops", false, false) @@ -79,7 +79,7 @@ bool LoopDeletion::IsLoopDead(Loop* L, SmallVector<BasicBlock*, 4>& exitBlocks, bool &Changed, BasicBlock *Preheader) { BasicBlock* exitBlock = exitBlocks[0]; - + // Make sure that all PHI entries coming from the loop are loop invariant. 
// Because the code is in LCSSA form, any values used outside of the loop // must pass through a PHI in the exit block, meaning that this check is @@ -97,14 +97,14 @@ bool LoopDeletion::IsLoopDead(Loop* L, if (incoming != P->getIncomingValueForBlock(exitingBlocks[i])) return false; } - + if (Instruction* I = dyn_cast<Instruction>(incoming)) if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator())) return false; ++BI; } - + // Make sure that no instructions in the block have potential side-effects. // This includes instructions that could write to memory, and loads that are // marked volatile. This could be made more aggressive by using aliasing @@ -117,23 +117,23 @@ bool LoopDeletion::IsLoopDead(Loop* L, return false; } } - + return true; } /// runOnLoop - Remove dead loops, by which we mean loops that do not impact the -/// observable behavior of the program other than finite running time. Note +/// observable behavior of the program other than finite running time. Note /// we do ensure that this never remove a loop that might be infinite, as doing /// so could change the halting/non-halting nature of a program. /// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA /// in order to make various safety checks work. bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { - // We can only remove the loop if there is a preheader that we can + // We can only remove the loop if there is a preheader that we can // branch from after removing it. BasicBlock* preheader = L->getLoopPreheader(); if (!preheader) return false; - + // If LoopSimplify form is not available, stay out of trouble. if (!L->hasDedicatedExits()) return false; @@ -142,36 +142,36 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { // they would already have been removed in earlier executions of this pass. if (L->begin() != L->end()) return false; - + SmallVector<BasicBlock*, 4> exitingBlocks; L->getExitingBlocks(exitingBlocks); - + SmallVector<BasicBlock*, 4> exitBlocks; L->getUniqueExitBlocks(exitBlocks); - + // We require that the loop only have a single exit block. Otherwise, we'd // be in the situation of needing to be able to solve statically which exit // block will be branched to, or trying to preserve the branching logic in // a loop invariant manner. if (exitBlocks.size() != 1) return false; - + // Finally, we have to check that the loop really is dead. bool Changed = false; if (!IsLoopDead(L, exitingBlocks, exitBlocks, Changed, preheader)) return Changed; - + // Don't remove loops for which we can't solve the trip count. // They could be infinite, in which case we'd be changing program behavior. ScalarEvolution& SE = getAnalysis<ScalarEvolution>(); const SCEV *S = SE.getMaxBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(S)) return Changed; - + // Now that we know the removal is safe, remove the loop by changing the - // branch from the preheader to go to the single exit block. + // branch from the preheader to go to the single exit block. BasicBlock* exitBlock = exitBlocks[0]; - + // Because we're deleting a large chunk of code at once, the sequence in which // we remove things is very important to avoid invalidation issues. Don't // mess with this unless you have good reason and know what you're doing. @@ -197,7 +197,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { P->removeIncomingValue(exitingBlocks[i]); ++BI; } - + // Update the dominator tree and remove the instructions and blocks that will // be deleted from the reference counting scheme. 
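Two hypothetical source-level loops, not from this patch, that mark out the boundary these checks draw: the first is deletable, the second is not.

void deletable(int N) {
  int T = 0;
  for (int I = 0; I < N; ++I)   // single exit, trip count computable by SCEV,
    T += I;                     // and T is dead afterwards: the loop can go
}

void notDeletable(volatile int *Flag) {
  while (*Flag) {               // the volatile load is a side effect and the
  }                             // trip count is unknowable: deleting this could
}                               // change the halting behavior of the program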
DominatorTree& DT = getAnalysis<DominatorTree>(); @@ -211,7 +211,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { DE = ChildNodes.end(); DI != DE; ++DI) { DT.changeImmediateDominator(*DI, DT[preheader]); } - + ChildNodes.clear(); DT.eraseNode(*LI); @@ -219,7 +219,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { // delete it freely later. (*LI)->dropAllReferences(); } - + // Erase the instructions and the blocks without having to worry // about ordering because we already dropped the references. // NOTE: This iteration is safe because erasing the block does not remove its @@ -236,13 +236,13 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { for (SmallPtrSet<BasicBlock*,8>::iterator I = blocks.begin(), E = blocks.end(); I != E; ++I) loopInfo.removeBlock(*I); - + // The last step is to inform the loop pass manager that we've // eliminated this loop. LPM.deleteLoopFromQueue(L); Changed = true; - + ++NumDeleted; - + return Changed; } diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index ad15cbb..ac1082c 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -43,20 +43,20 @@ #define DEBUG_TYPE "loop-idiom" #include "llvm/Transforms/Scalar.h" +#include "llvm/IRBuilder.h" #include "llvm/IntrinsicInst.h" #include "llvm/Module.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/IRBuilder.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/Statistic.h" using namespace llvm; STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); @@ -173,7 +173,7 @@ static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE) { bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { CurLoop = L; - // Disable loop idiom recognition if the function's name is a common idiom. + // Disable loop idiom recognition if the function's name is a common idiom. StringRef Name = L->getHeader()->getParent()->getName(); if (Name == "memset" || Name == "memcpy") return false; diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp index f0f05e6..982400c 100644 --- a/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -48,7 +48,7 @@ namespace { } }; } - + char LoopInstSimplify::ID = 0; INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify", "Simplify instructions in loops", false, false) diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index 59aace9..7eeb152 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -418,12 +418,13 @@ bool LoopRotate::rotateLoop(Loop *L) { } // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and - // thus is not a preheader anymore. Split the edge to form a real preheader. + // thus is not a preheader anymore. + // Split the edge to form a real preheader. 
BasicBlock *NewPH = SplitCriticalEdge(OrigPreheader, NewHeader, this); NewPH->setName(NewHeader->getName() + ".lr.ph"); - // Preserve canonical loop form, which means that 'Exit' should have only one - // predecessor. + // Preserve canonical loop form, which means that 'Exit' should have only + // one predecessor. BasicBlock *ExitSplit = SplitCriticalEdge(L->getLoopLatch(), Exit, this); ExitSplit->moveBefore(Exit); } else { diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index fe4700b..b14a713 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -1308,8 +1308,8 @@ static bool isLegalUse(const TargetLowering::AddrMode &AM, return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0; case LSRUse::Special: - // Only handle -1 scales, or no scale. - return AM.Scale == 0 || AM.Scale == -1; + // Special case Basic to handle -1 scales. + return !AM.BaseGV && (AM.Scale == 0 || AM.Scale == -1) && AM.BaseOffs == 0; } llvm_unreachable("Invalid LSRUse Kind!"); @@ -1439,7 +1439,41 @@ struct IVInc { // IVChain - The list of IV increments in program order. // We typically add the head of a chain without finding subsequent links. -typedef SmallVector<IVInc,1> IVChain; +struct IVChain { + SmallVector<IVInc,1> Incs; + const SCEV *ExprBase; + + IVChain() : ExprBase(0) {} + + IVChain(const IVInc &Head, const SCEV *Base) + : Incs(1, Head), ExprBase(Base) {} + + typedef SmallVectorImpl<IVInc>::const_iterator const_iterator; + + // begin - return the first increment in the chain. + const_iterator begin() const { + assert(!Incs.empty()); + return llvm::next(Incs.begin()); + } + const_iterator end() const { + return Incs.end(); + } + + // hasIncs - Returns true if this chain contains any increments. + bool hasIncs() const { return Incs.size() >= 2; } + + // add - Add an IVInc to the end of this chain. + void add(const IVInc &X) { Incs.push_back(X); } + + // tailUserInst - Returns the last UserInst in the chain. + Instruction *tailUserInst() const { return Incs.back().UserInst; } + + // isProfitableIncrement - Returns true if IncExpr can be profitably added to + // this chain. + bool isProfitableIncrement(const SCEV *OperExpr, + const SCEV *IncExpr, + ScalarEvolution&); +}; /// ChainUsers - Helper for CollectChains to track multiple IV increment uses. /// Distinguish between FarUsers that definitely cross IV increments and @@ -2160,7 +2194,7 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF, return &LU; // This is the formula where all the registers and symbols matched; // there aren't going to be any others. Since we declined it, we - // can skip the rest of the formulae and procede to the next LSRUse. + // can skip the rest of the formulae and proceed to the next LSRUse. break; } } @@ -2319,41 +2353,23 @@ static const SCEV *getExprBase(const SCEV *S) { /// increment will be an offset relative to the same base. We allow such offsets /// to potentially be used as chain increment as long as it's not obviously /// expensive to expand using real instructions. -static const SCEV * -getProfitableChainIncrement(Value *NextIV, Value *PrevIV, - const IVChain &Chain, Loop *L, - ScalarEvolution &SE, const TargetLowering *TLI) { - // Prune the solution space aggressively by checking that both IV operands - // are expressions that operate on the same unscaled SCEVUnknown. This - // "base" will be canceled by the subsequent getMinusSCEV call. Checking first - // avoids creating extra SCEV expressions. 
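A rough source-level picture of the IV chains this code is being reorganized around: all three addresses below are reachable from one another by the same loop-invariant increment, so LSR can keep one pointer live and bump it rather than materialize three addressing expressions per iteration. Illustrative only:

void scale(float *A, int N) {
  for (int I = 0; I + 2 < N; I += 3) {
    A[I]     *= 2.0f;   // chain head: the base address
    A[I + 1] *= 2.0f;   // head plus a constant 4-byte increment
    A[I + 2] *= 2.0f;   // previous link plus the same increment
  }
}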
-  const SCEV *OperExpr = SE.getSCEV(NextIV);
-  const SCEV *PrevExpr = SE.getSCEV(PrevIV);
-  if (getExprBase(OperExpr) != getExprBase(PrevExpr) && !StressIVChain)
-    return 0;
-
-  const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
-  if (!SE.isLoopInvariant(IncExpr, L))
-    return 0;
-
-  // We are not able to expand an increment unless it is loop invariant,
-  // however, the following checks are purely for profitability.
+bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
+                                    const SCEV *IncExpr,
+                                    ScalarEvolution &SE) {
+  // Aggressively form chains when -stress-ivchain.
   if (StressIVChain)
-    return IncExpr;
+    return true;

   // Do not replace a constant offset from IV head with a nonconstant IV
   // increment.
   if (!isa<SCEVConstant>(IncExpr)) {
-    const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Chain[0].IVOperand));
+    const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
     if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
       return 0;
   }

   SmallPtrSet<const SCEV*, 8> Processed;
-  if (isHighCostExpansion(IncExpr, Processed, SE))
-    return 0;
-
-  return IncExpr;
+  return !isHighCostExpansion(IncExpr, Processed, SE);
 }

 /// Return true if the number of registers needed for the chain is estimated to
@@ -2372,18 +2388,18 @@ isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users,
   if (StressIVChain)
     return true;

-  if (Chain.size() <= 2)
+  if (!Chain.hasIncs())
     return false;

   if (!Users.empty()) {
-    DEBUG(dbgs() << "Chain: " << *Chain[0].UserInst << " users:\n";
+    DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
           for (SmallPtrSet<Instruction*, 4>::const_iterator I = Users.begin(),
                  E = Users.end(); I != E; ++I) {
             dbgs() << " " << **I << "\n";
           });
     return false;
   }
-  assert(!Chain.empty() && "empty IV chains are not allowed");
+  assert(!Chain.Incs.empty() && "empty IV chains are not allowed");

   // The chain itself may require a register, so initialize cost to 1.
   int cost = 1;
@@ -2391,15 +2407,15 @@ isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users,
   // A complete chain likely eliminates the need for keeping the original IV in
   // a register. LSR does not currently know how to form a complete chain unless
   // the header phi already exists.
-  if (isa<PHINode>(Chain.back().UserInst)
-      && SE.getSCEV(Chain.back().UserInst) == Chain[0].IncExpr) {
+  if (isa<PHINode>(Chain.tailUserInst())
+      && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
     --cost;
   }
   const SCEV *LastIncExpr = 0;
   unsigned NumConstIncrements = 0;
   unsigned NumVarIncrements = 0;
   unsigned NumReusedIncrements = 0;
-  for (IVChain::const_iterator I = llvm::next(Chain.begin()), E = Chain.end();
+  for (IVChain::const_iterator I = Chain.begin(), E = Chain.end();
        I != E; ++I) {

     if (I->IncExpr->isZero())
@@ -2435,7 +2451,8 @@ isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users,
   // the stride.
   cost -= NumReusedIncrements;

-  DEBUG(dbgs() << "Chain: " << *Chain[0].UserInst << " Cost: " << cost << "\n");
+  DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
+               << "\n");

   return cost < 0;
 }
@@ -2446,25 +2463,39 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
                                    SmallVectorImpl<ChainUsers> &ChainUsersVec) {
   // When IVs are used as types of varying widths, they are generally converted
   // to a wider type with some uses remaining narrow under a (free) trunc.
- Value *NextIV = getWideOperand(IVOper); + Value *const NextIV = getWideOperand(IVOper); + const SCEV *const OperExpr = SE.getSCEV(NextIV); + const SCEV *const OperExprBase = getExprBase(OperExpr); // Visit all existing chains. Check if its IVOper can be computed as a // profitable loop invariant increment from the last link in the Chain. unsigned ChainIdx = 0, NChains = IVChainVec.size(); const SCEV *LastIncExpr = 0; for (; ChainIdx < NChains; ++ChainIdx) { - Value *PrevIV = getWideOperand(IVChainVec[ChainIdx].back().IVOperand); + IVChain &Chain = IVChainVec[ChainIdx]; + + // Prune the solution space aggressively by checking that both IV operands + // are expressions that operate on the same unscaled SCEVUnknown. This + // "base" will be canceled by the subsequent getMinusSCEV call. Checking + // first avoids creating extra SCEV expressions. + if (!StressIVChain && Chain.ExprBase != OperExprBase) + continue; + + Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand); if (!isCompatibleIVType(PrevIV, NextIV)) continue; // A phi node terminates a chain. - if (isa<PHINode>(UserInst) - && isa<PHINode>(IVChainVec[ChainIdx].back().UserInst)) + if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst())) + continue; + + // The increment must be loop-invariant so it can be kept in a register. + const SCEV *PrevExpr = SE.getSCEV(PrevIV); + const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr); + if (!SE.isLoopInvariant(IncExpr, L)) continue; - if (const SCEV *IncExpr = - getProfitableChainIncrement(NextIV, PrevIV, IVChainVec[ChainIdx], - L, SE, TLI)) { + if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) { LastIncExpr = IncExpr; break; } @@ -2478,24 +2509,24 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, DEBUG(dbgs() << "IV Chain Limit\n"); return; } - LastIncExpr = SE.getSCEV(NextIV); + LastIncExpr = OperExpr; // IVUsers may have skipped over sign/zero extensions. We don't currently // attempt to form chains involving extensions unless they can be hoisted // into this loop's AddRec. if (!isa<SCEVAddRecExpr>(LastIncExpr)) return; ++NChains; - IVChainVec.resize(NChains); + IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr), + OperExprBase)); ChainUsersVec.resize(NChains); - DEBUG(dbgs() << "IV Head: (" << *UserInst << ") IV=" << *LastIncExpr - << "\n"); + DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst + << ") IV=" << *LastIncExpr << "\n"); + } else { + DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst + << ") IV+" << *LastIncExpr << "\n"); + // Add this IV user to the end of the chain. + IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr)); } - else - DEBUG(dbgs() << "IV Inc: (" << *UserInst << ") IV+" << *LastIncExpr - << "\n"); - - // Add this IV user to the end of the chain. - IVChainVec[ChainIdx].push_back(IVInc(UserInst, IVOper, LastIncExpr)); SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers; // This chain's NearUsers become FarUsers. @@ -2551,6 +2582,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, /// loop latch. This will discover chains on side paths, but requires /// maintaining multiple copies of the Chains state. 
void LSRInstance::CollectChains() { + DEBUG(dbgs() << "Collecting IV Chains.\n"); SmallVector<ChainUsers, 8> ChainUsersVec; SmallVector<BasicBlock *,8> LatchPath; @@ -2622,10 +2654,10 @@ void LSRInstance::CollectChains() { } void LSRInstance::FinalizeChain(IVChain &Chain) { - assert(!Chain.empty() && "empty IV chains are not allowed"); - DEBUG(dbgs() << "Final Chain: " << *Chain[0].UserInst << "\n"); + assert(!Chain.Incs.empty() && "empty IV chains are not allowed"); + DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n"); - for (IVChain::const_iterator I = llvm::next(Chain.begin()), E = Chain.end(); + for (IVChain::const_iterator I = Chain.begin(), E = Chain.end(); I != E; ++I) { DEBUG(dbgs() << " Inc: " << *I->UserInst << "\n"); User::op_iterator UseI = @@ -2659,7 +2691,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, SmallVectorImpl<WeakVH> &DeadInsts) { // Find the new IVOperand for the head of the chain. It may have been replaced // by LSR. - const IVInc &Head = Chain[0]; + const IVInc &Head = Chain.Incs[0]; User::op_iterator IVOpEnd = Head.UserInst->op_end(); User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(), IVOpEnd, L, SE); @@ -2691,7 +2723,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, Type *IVTy = IVSrc->getType(); Type *IntTy = SE.getEffectiveSCEVType(IVTy); const SCEV *LeftOverExpr = 0; - for (IVChain::const_iterator IncI = llvm::next(Chain.begin()), + for (IVChain::const_iterator IncI = Chain.begin(), IncE = Chain.end(); IncI != IncE; ++IncI) { Instruction *InsertPt = IncI->UserInst; @@ -2736,7 +2768,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, } // If LSR created a new, wider phi, we may also replace its postinc. We only // do this if we also found a wide value for the head of the chain. - if (isa<PHINode>(Chain.back().UserInst)) { + if (isa<PHINode>(Chain.tailUserInst())) { for (BasicBlock::iterator I = L->getHeader()->begin(); PHINode *Phi = dyn_cast<PHINode>(I); ++I) { if (!isCompatibleIVType(Phi, IVSrc)) @@ -2804,7 +2836,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { // x == y --> x - y == 0 const SCEV *N = SE.getSCEV(NV); - if (SE.isLoopInvariant(N, L)) { + if (SE.isLoopInvariant(N, L) && isSafeToExpand(N)) { // S is normalized, so normalize N before folding it into S // to keep the result normalized. N = TransformForPostIncUse(Normalize, N, CI, 0, @@ -2974,42 +3006,64 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { /// CollectSubexprs - Split S into subexpressions which can be pulled out into /// separate registers. If C is non-null, multiply each subexpression by C. -static void CollectSubexprs(const SCEV *S, const SCEVConstant *C, - SmallVectorImpl<const SCEV *> &Ops, - const Loop *L, - ScalarEvolution &SE) { +/// +/// Return remainder expression after factoring the subexpressions captured by +/// Ops. If Ops is complete, return NULL. +static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, + SmallVectorImpl<const SCEV *> &Ops, + const Loop *L, + ScalarEvolution &SE, + unsigned Depth = 0) { + // Arbitrarily cap recursion to protect compile time. + if (Depth >= 3) + return S; + if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { // Break out add operands. 
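A toy analog of the new CollectSubexprs contract, over an invented Expr type rather than SCEV: push the subexpressions that can stand alone into Ops, and hand back whatever could not be split off, returning nullptr when the input was fully decomposed.

#include <vector>

struct Expr { bool IsAdd; std::vector<Expr *> Ops; };

static Expr *collectSubexprs(Expr *S, std::vector<Expr *> &Out,
                             unsigned Depth = 0) {
  if (Depth >= 3)
    return S;               // cap recursion to protect compile time
  if (!S->IsAdd)
    return S;               // not an add: hand it back as the remainder
  for (Expr *Op : S->Ops)
    if (Expr *Rem = collectSubexprs(Op, Out, Depth + 1))
      Out.push_back(Rem);   // each remainder becomes its own register
  return nullptr;           // the add itself was fully consumed
}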
for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); - I != E; ++I) - CollectSubexprs(*I, C, Ops, L, SE); - return; + I != E; ++I) { + const SCEV *Remainder = CollectSubexprs(*I, C, Ops, L, SE, Depth+1); + if (Remainder) + Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); + } + return NULL; } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { // Split a non-zero base out of an addrec. - if (!AR->getStart()->isZero()) { - CollectSubexprs(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0), - AR->getStepRecurrence(SE), - AR->getLoop(), - //FIXME: AR->getNoWrapFlags(SCEV::FlagNW) - SCEV::FlagAnyWrap), - C, Ops, L, SE); - CollectSubexprs(AR->getStart(), C, Ops, L, SE); - return; + if (AR->getStart()->isZero()) + return S; + + const SCEV *Remainder = CollectSubexprs(AR->getStart(), + C, Ops, L, SE, Depth+1); + // Split the non-zero AddRec unless it is part of a nested recurrence that + // does not pertain to this loop. + if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) { + Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); + Remainder = NULL; + } + if (Remainder != AR->getStart()) { + if (!Remainder) + Remainder = SE.getConstant(AR->getType(), 0); + return SE.getAddRecExpr(Remainder, + AR->getStepRecurrence(SE), + AR->getLoop(), + //FIXME: AR->getNoWrapFlags(SCEV::FlagNW) + SCEV::FlagAnyWrap); } } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) { // Break (C * (a + b + c)) into C*a + C*b + C*c. - if (Mul->getNumOperands() == 2) - if (const SCEVConstant *Op0 = - dyn_cast<SCEVConstant>(Mul->getOperand(0))) { - CollectSubexprs(Mul->getOperand(1), - C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0, - Ops, L, SE); - return; - } + if (Mul->getNumOperands() != 2) + return S; + if (const SCEVConstant *Op0 = + dyn_cast<SCEVConstant>(Mul->getOperand(0))) { + C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0; + const SCEV *Remainder = + CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1); + if (Remainder) + Ops.push_back(SE.getMulExpr(C, Remainder)); + return NULL; + } } - - // Otherwise use the value itself, optionally with a scale applied. - Ops.push_back(C ? SE.getMulExpr(C, S) : S); + return S; } /// GenerateReassociations - Split out subexpressions from adds and the bases of @@ -3024,7 +3078,9 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, const SCEV *BaseReg = Base.BaseRegs[i]; SmallVector<const SCEV *, 8> AddOps; - CollectSubexprs(BaseReg, 0, AddOps, L, SE); + const SCEV *Remainder = CollectSubexprs(BaseReg, 0, AddOps, L, SE); + if (Remainder) + AddOps.push_back(Remainder); if (AddOps.size() == 1) continue; @@ -4108,7 +4164,7 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, // Attempt to find an insert position in the middle of the block, // instead of at the end, so that it can be used for other expansions. if (IDom == Inst->getParent() && - (!BetterPos || DT.dominates(BetterPos, Inst))) + (!BetterPos || !DT.dominates(Inst, BetterPos))) BetterPos = llvm::next(BasicBlock::iterator(Inst)); } if (!AllDominate) @@ -4236,13 +4292,6 @@ Value *LSRInstance::Expand(const LSRFixup &LF, Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP))); } - // Flush the operand list to suppress SCEVExpander hoisting. - if (!Ops.empty()) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); - Ops.clear(); - Ops.push_back(SE.getUnknown(FullV)); - } - // Expand the ScaledReg portion. 
Value *ICmpScaledV = 0; if (F.AM.Scale != 0) { @@ -4264,23 +4313,34 @@ Value *LSRInstance::Expand(const LSRFixup &LF, } else { // Otherwise just expand the scaled register and an explicit scale, // which is expected to be matched as part of the address. + + // Flush the operand list to suppress SCEVExpander hoisting address modes. + if (!Ops.empty() && LU.Kind == LSRUse::Address) { + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Ops.clear(); + Ops.push_back(SE.getUnknown(FullV)); + } ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP)); ScaledS = SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.AM.Scale)); Ops.push_back(ScaledS); - - // Flush the operand list to suppress SCEVExpander hoisting. - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); - Ops.clear(); - Ops.push_back(SE.getUnknown(FullV)); } } // Expand the GV portion. if (F.AM.BaseGV) { + // Flush the operand list to suppress SCEVExpander hoisting. + if (!Ops.empty()) { + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Ops.clear(); + Ops.push_back(SE.getUnknown(FullV)); + } Ops.push_back(SE.getUnknown(F.AM.BaseGV)); + } - // Flush the operand list to suppress SCEVExpander hoisting. + // Flush the operand list to suppress SCEVExpander hoisting of both folded and + // unfolded offsets. LSR assumes they both live next to their uses. + if (!Ops.empty()) { Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); @@ -4485,7 +4545,7 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, // Mark phi nodes that terminate chains so the expander tries to reuse them. for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(), ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) { - if (PHINode *PN = dyn_cast<PHINode>(ChainI->back().UserInst)) + if (PHINode *PN = dyn_cast<PHINode>(ChainI->tailUserInst())) Rewriter.setChainedPhi(PN); } diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 00ecc74..58f7739 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -409,15 +409,6 @@ bool LoopUnswitch::processCurrentLoop() { if (!currentLoop->isSafeToClone()) return false; - // Loops with invokes, whose unwind edge escapes the loop, cannot be - // unswitched because splitting their edges are non-trivial and don't preserve - // loop simplify information. - for (Loop::block_iterator I = currentLoop->block_begin(), - E = currentLoop->block_end(); I != E; ++I) - if (const InvokeInst *II = dyn_cast<InvokeInst>((*I)->getTerminator())) - if (!currentLoop->contains(II->getUnwindDest())) - return false; - // Without dedicated exits, splitting the exit edge may fail. if (!currentLoop->hasDedicatedExits()) return false; @@ -633,11 +624,10 @@ bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, /// LoopCond == Val to simplify the loop. If we decide that this is profitable, /// unswitch the loop, reprocess the pieces, then return true. bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) { - Function *F = loopHeader->getParent(); - Constant *CondVal = 0; BasicBlock *ExitBlock = 0; + if (IsTrivialUnswitchCondition(LoopCond, &CondVal, &ExitBlock)) { // If the condition is trivial, always unswitch. There is no code growth // for this case. 
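A hypothetical source-level shape of a trivially unswitchable condition, illustrative rather than taken from this patch: the test is loop invariant and one arm leaves the loop at once, so hoisting it duplicates no loop body.

int sum(const int *A, int N, bool Abort) {
  int S = 0;
  for (int I = 0; I < N; ++I) {
    if (Abort)      // invariant: depends on neither I nor loop memory
      return -1;    // exits the loop immediately, so unswitching grows no code
    S += A[I];
  }
  return S;
}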
@@ -697,8 +687,8 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, // If either edge is critical, split it. This helps preserve LoopSimplify // form for enclosing loops. - SplitCriticalEdge(BI, 0, this); - SplitCriticalEdge(BI, 1, this); + SplitCriticalEdge(BI, 0, this, false, false, true); + SplitCriticalEdge(BI, 1, this, false, false, true); } /// UnswitchTrivialCondition - Given a loop that has a trivial unswitchable @@ -1224,8 +1214,8 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { // See if instruction simplification can hack this up. This is common for // things like "select false, X, Y" after unswitching made the condition be - // 'false'. - if (Value *V = SimplifyInstruction(I, 0, 0, DT)) + // 'false'. TODO: update the domtree properly so we can pass it here. + if (Value *V = SimplifyInstruction(I)) if (LI->replacementPreservesLCSSAForm(I, V)) { ReplaceUsesOfWith(I, V, Worklist, L, LPM); continue; diff --git a/lib/Transforms/Scalar/LowerAtomic.cpp b/lib/Transforms/Scalar/LowerAtomic.cpp index 689bbe9..7419a65 100644 --- a/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/lib/Transforms/Scalar/LowerAtomic.cpp @@ -15,9 +15,9 @@ #define DEBUG_TYPE "loweratomic" #include "llvm/Transforms/Scalar.h" #include "llvm/Function.h" +#include "llvm/IRBuilder.h" #include "llvm/IntrinsicInst.h" #include "llvm/Pass.h" -#include "llvm/Support/IRBuilder.h" using namespace llvm; static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { @@ -25,12 +25,12 @@ static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { Value *Ptr = CXI->getPointerOperand(); Value *Cmp = CXI->getCompareOperand(); Value *Val = CXI->getNewValOperand(); - + LoadInst *Orig = Builder.CreateLoad(Ptr); Value *Equal = Builder.CreateICmpEQ(Orig, Cmp); Value *Res = Builder.CreateSelect(Equal, Val, Orig); Builder.CreateStore(Res, Ptr); - + CXI->replaceAllUsesWith(Orig); CXI->eraseFromParent(); return true; diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index a87cce3..2a5ee33 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -15,21 +15,21 @@ #define DEBUG_TYPE "memcpyopt" #include "llvm/Transforms/Scalar.h" #include "llvm/GlobalVariable.h" -#include "llvm/IntrinsicInst.h" +#include "llvm/IRBuilder.h" #include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GetElementPtrTypeIterator.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include <list> using namespace llvm; @@ -44,7 +44,7 @@ static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx, gep_type_iterator GTI = gep_type_begin(GEP); for (unsigned i = 1; i != Idx; ++i, ++GTI) /*skip along*/; - + // Compute the offset implied by the rest of the indices. 
int64_t Offset = 0; for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) { @@ -58,7 +58,7 @@ static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx, Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); continue; } - + // Otherwise, we have a sequential type like an array or vector. Multiply // the index by the ElementSize. uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType()); @@ -77,7 +77,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, Ptr2 = Ptr2->stripPointerCasts(); GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1); GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2); - + bool VariableIdxFound = false; // If one pointer is a GEP and the other isn't, then see if the GEP is a @@ -91,7 +91,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, TD); return !VariableIdxFound; } - + // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical // base. After that base, they may have some number of common (and // potentially variable) indices. After that they handle some constant @@ -99,7 +99,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, // handle no other case. if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0)) return false; - + // Skip any common indices and track the GEP types. unsigned Idx = 1; for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx) @@ -109,7 +109,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, TD); int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, TD); if (VariableIdxFound) return false; - + Offset = Offset2-Offset1; return true; } @@ -128,19 +128,19 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, namespace { struct MemsetRange { // Start/End - A semi range that describes the span that this range covers. - // The range is closed at the start and open at the end: [Start, End). + // The range is closed at the start and open at the end: [Start, End). int64_t Start, End; /// StartPtr - The getelementptr instruction that points to the start of the /// range. Value *StartPtr; - + /// Alignment - The known alignment of the first store. unsigned Alignment; - + /// TheStores - The actual stores that make up this range. SmallVector<Instruction*, 16> TheStores; - + bool isProfitableToUseMemset(const TargetData &TD) const; }; @@ -152,17 +152,17 @@ bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const { // If there is nothing to merge, don't do anything. if (TheStores.size() < 2) return false; - + // If any of the stores are a memset, then it is always good to extend the // memset. for (unsigned i = 0, e = TheStores.size(); i != e; ++i) if (!isa<StoreInst>(TheStores[i])) return true; - + // Assume that the code generator is capable of merging pairs of stores // together if it wants to. if (TheStores.size() == 2) return false; - + // If we have fewer than 8 stores, it can still be worthwhile to do this. // For example, merging 4 i8 stores into an i32 store is useful almost always. // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the @@ -175,15 +175,15 @@ bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const { // actually reducing the number of stores used. 
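To see the heuristic that follows with concrete numbers, assume TD.getPointerSize() is 4. Sixteen adjacent i8 stores give Bytes = 16, NumPointerStores = 4 and NumByteStores = 0; 16 > 4 holds, so the range is worth turning into a memset. Four adjacent i32 stores also give Bytes = 16 and NumPointerStores = 4, but TheStores.size() is 4 and 4 > 4 fails, so that run is left alone, since the code generator can already merge register-width stores on its own.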
   unsigned Bytes = unsigned(End-Start);
   unsigned NumPointerStores = Bytes/TD.getPointerSize();
-  
+
   // Assume the remaining bytes if any are done a byte at a time.
   unsigned NumByteStores = Bytes - NumPointerStores*TD.getPointerSize();
-  
+
   // If we will reduce the # stores (according to this heuristic), do the
   // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
   // etc.
   return TheStores.size() > NumPointerStores+NumByteStores;
-} 
+}

 namespace {
@@ -195,12 +195,12 @@ class MemsetRanges {
   const TargetData &TD;
 public:
   MemsetRanges(const TargetData &td) : TD(td) {}
-  
+
   typedef std::list<MemsetRange>::const_iterator const_iterator;
   const_iterator begin() const { return Ranges.begin(); }
   const_iterator end() const { return Ranges.end(); }
   bool empty() const { return Ranges.empty(); }
-  
+
   void addInst(int64_t OffsetFromFirst, Instruction *Inst) {
     if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
       addStore(OffsetFromFirst, SI);
@@ -210,21 +210,21 @@ public:

   void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
     int64_t StoreSize = TD.getTypeStoreSize(SI->getOperand(0)->getType());
-    
+
     addRange(OffsetFromFirst, StoreSize, SI->getPointerOperand(),
              SI->getAlignment(), SI);
   }
-  
+
   void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
     int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
     addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getAlignment(), MSI);
   }
-  
+
   void addRange(int64_t Start, int64_t Size, Value *Ptr,
                 unsigned Alignment, Instruction *Inst);

 };
-  
+
 } // end anon namespace
@@ -240,10 +240,10 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
                             unsigned Alignment, Instruction *Inst) {
   int64_t End = Start+Size;
   range_iterator I = Ranges.begin(), E = Ranges.end();
-  
+
   while (I != E && Start > I->End)
     ++I;
-  
+
   // We now know that I == E, in which case we didn't find anything to merge
   // with, or that Start <= I->End. If End < I->Start or I == E, then we need
   // to insert a new range. Handle this now.
@@ -256,18 +256,18 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
     R.TheStores.push_back(Inst);
     return;
   }
-  
+
   // This store overlaps with I, add it.
   I->TheStores.push_back(Inst);
-  
+
   // At this point, we may have an interval that completely contains our store.
   // If so, just add it to the interval and return.
   if (I->Start <= Start && I->End >= End)
     return;
-  
+
   // Now we know that Start <= I->End and End >= I->Start so the range overlaps
   // but is not entirely contained within the range.
-  
+
   // See if the range extends the start of the range. In this case, it couldn't
   // possibly cause it to join the prior range, because otherwise we would have
   // stopped on *it*.
@@ -276,7 +276,7 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
     I->StartPtr = Ptr;
     I->Alignment = Alignment;
   }
-  
+
   // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
   // is in or right at the end of I), and that End >= I->Start. Extend I out to
   // End.
@@ -325,7 +325,7 @@ namespace {
       AU.addPreserved<AliasAnalysis>();
       AU.addPreserved<MemoryDependenceAnalysis>();
     }
-  
+
     // Helper functions
     bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
     bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
@@ -341,7 +341,7 @@ namespace {
     bool iterateOnFunction(Function &F);
   };
-  
+
   char MemCpyOpt::ID = 0;
 }
@@ -361,16 +361,16 @@ INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
 /// some other patterns to fold away. In particular, this looks for stores to
 /// neighboring locations of memory.
If it sees enough consecutive ones, it /// attempts to merge them together into a memcpy/memset. -Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, +Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, Value *StartPtr, Value *ByteVal) { if (TD == 0) return 0; - + // Okay, so we now have a single store that can be splatable. Scan to find // all subsequent stores of the same value to offset from the same pointer. // Join these together into ranges, so we can decide whether contiguous blocks // are stored. MemsetRanges Ranges(*TD); - + BasicBlock::iterator BI = StartInst; for (++BI; !isa<TerminatorInst>(BI); ++BI) { if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) { @@ -381,43 +381,43 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, break; continue; } - + if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) { // If this is a store, see if we can merge it in. if (!NextStore->isSimple()) break; - + // Check to see if this stored value is of the same byte-splattable value. if (ByteVal != isBytewiseValue(NextStore->getOperand(0))) break; - + // Check to see if this store is to a constant offset from the start ptr. int64_t Offset; if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset, *TD)) break; - + Ranges.addStore(Offset, NextStore); } else { MemSetInst *MSI = cast<MemSetInst>(BI); - + if (MSI->isVolatile() || ByteVal != MSI->getValue() || !isa<ConstantInt>(MSI->getLength())) break; - + // Check to see if this store is to a constant offset from the start ptr. int64_t Offset; if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, *TD)) break; - + Ranges.addMemSet(Offset, MSI); } } - + // If we have no ranges, then we just had a single store with nothing that // could be merged in. This is a very common case of course. if (Ranges.empty()) return 0; - + // If we had at least one store that could be merged in, add the starting // store as well. We try to avoid this unless there is at least something // interesting as a small compile-time optimization. @@ -434,28 +434,28 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end(); I != E; ++I) { const MemsetRange &Range = *I; - + if (Range.TheStores.size() == 1) continue; - + // If it is profitable to lower this range to memset, do so now. if (!Range.isProfitableToUseMemset(*TD)) continue; - + // Otherwise, we do want to transform this! Create a new memset. // Get the starting pointer of the block. 
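Source-level picture of the rewrite being performed here, purely illustrative: a run of adjacent splat stores collapses into one memset call.

void clearHeader(unsigned char *P) {
  P[0] = 0; P[1] = 0; P[2] = 0; P[3] = 0;
  P[4] = 0; P[5] = 0; P[6] = 0; P[7] = 0;  // -> memset(P, 0, 8)
}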
StartPtr = Range.StartPtr; - + // Determine alignment unsigned Alignment = Range.Alignment; if (Alignment == 0) { - Type *EltType = + Type *EltType = cast<PointerType>(StartPtr->getType())->getElementType(); Alignment = TD->getABITypeAlignment(EltType); } - - AMemSet = + + AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment); - + DEBUG(dbgs() << "Replace stores:\n"; for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i) dbgs() << *Range.TheStores[i] << '\n'; @@ -473,14 +473,14 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, } ++NumMemSetInfer; } - + return AMemSet; } bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (!SI->isSimple()) return false; - + if (TD == 0) return false; // Detect cases where we're performing call slot forwarding, but @@ -510,7 +510,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (C) { bool changed = performCallSlotOptzn(LI, - SI->getPointerOperand()->stripPointerCasts(), + SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), TD->getTypeStoreSize(SI->getOperand(0)->getType()), C); if (changed) { @@ -524,10 +524,10 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { } } } - + // There are two cases that are interesting for this code to handle: memcpy // and memset. Right now we only handle memset. - + // Ensure that the value being stored is something that can be memset'able a // byte at a time like "0" or "-1" or any width, as well as things like // 0xA0A0A0A0 and 0.0. @@ -537,7 +537,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { BBI = I; // Don't invalidate iterator. return true; } - + return false; } @@ -662,7 +662,11 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, // the use analysis, we also need to know that it does not sneakily // access dest. We rely on AA to figure this out for us. AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); - if (AA.getModRefInfo(C, cpyDest, srcSize) != AliasAnalysis::NoModRef) + AliasAnalysis::ModRefResult MR = AA.getModRefInfo(C, cpyDest, srcSize); + // If necessary, perform additional analysis. + if (MR != AliasAnalysis::NoModRef) + MR = AA.callCapturesBefore(C, cpyDest, srcSize, &DT); + if (MR != AliasAnalysis::NoModRef) return false; // All the checks have passed, so do the transformation. @@ -676,7 +680,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, if (CS.getArgument(i)->getType() == cpyDest->getType()) CS.setArgument(i, cpyDest); else - CS.setArgument(i, CastInst::CreatePointerCast(cpyDest, + CS.setArgument(i, CastInst::CreatePointerCast(cpyDest, CS.getArgument(i)->getType(), cpyDest->getName(), C)); } @@ -697,14 +701,14 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, /// processMemCpyMemCpyDependence - We've found that the (upward scanning) /// memory dependence of memcpy 'M' is the memcpy 'MDep'. Try to simplify M to /// copy from MDep's input if we can. MSize is the size of M's copy. -/// +/// bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, uint64_t MSize) { // We can only transforms memcpy's where the dest of one is the source of the // other. if (M->getSource() != MDep->getDest() || MDep->isVolatile()) return false; - + // If dep instruction is reading from our current input, then it is a noop // transfer and substituting the input won't change this instruction. Just // ignore the input and let someone else zap MDep. 
This handles cases like: @@ -712,14 +716,14 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, // memcpy(b <- a) if (M->getSource() == MDep->getSource()) return false; - + // Second, the length of the memcpy's must be the same, or the preceding one // must be larger than the following one. ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength()); ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength()); if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue()) return false; - + AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); // Verify that the copied-from memory doesn't change in between the two @@ -739,23 +743,23 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, false, M, M->getParent()); if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) return false; - + // If the dest of the second might alias the source of the first, then the // source and dest might overlap. We still want to eliminate the intermediate // value, but we have to generate a memmove instead of memcpy. bool UseMemMove = false; if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(MDep))) UseMemMove = true; - + // If all checks passed, then we can transform M. - + // Make sure to use the lesser of the alignment of the source and the dest // since we're changing where we're reading from, but don't want to increase // the alignment past what can be read from or written to. // TODO: Is this worth it if we're creating a less aligned memcpy? For // example we could be moving from movaps -> movq on x86. unsigned Align = std::min(MDep->getAlignment(), M->getAlignment()); - + IRBuilder<> Builder(M); if (UseMemMove) Builder.CreateMemMove(M->getRawDest(), MDep->getRawSource(), M->getLength(), @@ -835,13 +839,13 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { if (!TLI->has(LibFunc::memmove)) return false; - + // See if the pointers alias. if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(M))) return false; - + DEBUG(dbgs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n"); - + // If not, then we know we can transform this. Module *Mod = M->getParent()->getParent()->getParent(); Type *ArgTys[3] = { M->getRawDest()->getType(), @@ -857,7 +861,7 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { ++NumMoveToCpy; return true; } - + /// processByValArgument - This is called on every byval argument in call sites. bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { if (TD == 0) return false; @@ -880,7 +884,7 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { if (MDep == 0 || MDep->isVolatile() || ByValArg->stripPointerCasts() != MDep->getDest()) return false; - + // The length of the memcpy must be larger or equal to the size of the byval. ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength()); if (C1 == 0 || C1->getValue().getZExtValue() < ByValSize) @@ -890,13 +894,13 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { // then it is some target specific value that we can't know. unsigned ByValAlign = CS.getParamAlignment(ArgNo+1); if (ByValAlign == 0) return false; - + // If it is greater than the memcpy, then we check to see if we can force the // source of the memcpy to the alignment we need. If we fail, we bail out. 
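A small illustration of the memcpy-to-memcpy forwarding handled by processMemCpyMemCpyDependence (function and variable names here are hypothetical): when the second copy reads exactly the bytes the first one wrote, it can read from the original source instead, and the intermediate buffer becomes dead.

#include <cstddef>
#include <cstring>

void forwardCopies(char *a, char *b, char *c, std::size_t n) {
  std::memcpy(b, a, n);   // MDep: b <- a
  std::memcpy(c, b, n);   // M: c <- b, rewritable to c <- a as long as 'a'
                          // is not modified between the two copies
}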
if (MDep->getAlignment() < ByValAlign && getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, TD) < ByValAlign) return false; - + // Verify that the copied-from memory doesn't change in between the memcpy and // the byval call. // memcpy(a <- b) @@ -911,16 +915,16 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { false, CS.getInstruction(), MDep->getParent()); if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) return false; - + Value *TmpCast = MDep->getSource(); if (MDep->getSource()->getType() != ByValArg->getType()) TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(), "tmpcast", CS.getInstruction()); - + DEBUG(dbgs() << "MemCpyOpt: Forwarding memcpy to byval:\n" << " " << *MDep << "\n" << " " << *CS.getInstruction() << "\n"); - + // Otherwise we're good! Update the byval argument. CS.setArgument(ArgNo, TmpCast); ++NumMemCpyInstr; @@ -936,9 +940,9 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { // Avoid invalidating the iterator. Instruction *I = BI++; - + bool RepeatInstruction = false; - + if (StoreInst *SI = dyn_cast<StoreInst>(I)) MadeChange |= processStore(SI, BI); else if (MemSetInst *M = dyn_cast<MemSetInst>(I)) @@ -960,7 +964,7 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { } } } - + return MadeChange; } @@ -972,19 +976,19 @@ bool MemCpyOpt::runOnFunction(Function &F) { MD = &getAnalysis<MemoryDependenceAnalysis>(); TD = getAnalysisIfAvailable<TargetData>(); TLI = &getAnalysis<TargetLibraryInfo>(); - + // If we don't have at least memset and memcpy, there is little point of doing // anything here. These are required by a freestanding implementation, so if // even they are disabled, there is no point in trying hard. if (!TLI->has(LibFunc::memset) || !TLI->has(LibFunc::memcpy)) return false; - + while (1) { if (!iterateOnFunction(F)) break; MadeChange = true; } - + MD = 0; return MadeChange; } diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp index 29234da..3222f20 100644 --- a/lib/Transforms/Scalar/ObjCARC.cpp +++ b/lib/Transforms/Scalar/ObjCARC.cpp @@ -20,7 +20,7 @@ // This file also defines a simple ARC-aware AliasAnalysis. // // WARNING: This file knows about certain library functions. It recognizes them -// by name, and hardwires knowedge of their semantics. +// by name, and hardwires knowledge of their semantics. // // WARNING: This file knows about how certain Objective-C library functions are // used. Naive LLVM IR transformations which would otherwise be @@ -29,18 +29,8 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "objc-arc" -#include "llvm/Function.h" -#include "llvm/Intrinsics.h" -#include "llvm/GlobalVariable.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Module.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Support/CallSite.h" #include "llvm/Support/CommandLine.h" -#include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" using namespace llvm; // A handy option to enable/disable all optimizations in this file. @@ -141,6 +131,13 @@ namespace { // ARC Utilities. 
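Before the ObjCARC changes continue, a sketch of the byval forwarding performed by processByValArgument above (illustrative names; at the IR level the pass-by-value argument is an explicit byval copy): if the argument's backing memory was just filled by a memcpy from some source, and that source is unchanged up to the call, the callee can be handed the source memory directly.

#include <cstring>

struct Payload { int data[16]; };
void callee(Payload p);                    // pass-by-value: a byval copy in IR

void caller(const Payload *src) {
  Payload tmp;
  std::memcpy(&tmp, src, sizeof(Payload)); // MDep
  callee(tmp);                             // candidate: build the byval copy
                                           // straight from *src
}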
//===----------------------------------------------------------------------===// +#include "llvm/Intrinsics.h" +#include "llvm/Module.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Support/CallSite.h" +#include "llvm/ADT/StringSwitch.h" + namespace { /// InstructionClass - A simple classification for instructions. enum InstructionClass { @@ -299,22 +296,23 @@ static InstructionClass GetInstructionClass(const Value *V) { // None of the intrinsic functions do objc_release. For intrinsics, the // only question is whether or not they may be users. switch (F->getIntrinsicID()) { - case 0: break; - case Intrinsic::bswap: case Intrinsic::ctpop: - case Intrinsic::ctlz: case Intrinsic::cttz: case Intrinsic::returnaddress: case Intrinsic::frameaddress: case Intrinsic::stacksave: case Intrinsic::stackrestore: case Intrinsic::vastart: case Intrinsic::vacopy: case Intrinsic::vaend: + case Intrinsic::objectsize: case Intrinsic::prefetch: + case Intrinsic::stackprotector: + case Intrinsic::eh_return_i32: case Intrinsic::eh_return_i64: + case Intrinsic::eh_typeid_for: case Intrinsic::eh_dwarf_cfa: + case Intrinsic::eh_sjlj_lsda: case Intrinsic::eh_sjlj_functioncontext: + case Intrinsic::init_trampoline: case Intrinsic::adjust_trampoline: + case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: + case Intrinsic::invariant_start: case Intrinsic::invariant_end: // Don't let dbg info affect our results. case Intrinsic::dbg_declare: case Intrinsic::dbg_value: // Short cut: Some intrinsics obviously don't use ObjC pointers. return IC_None; default: - for (Function::const_arg_iterator AI = F->arg_begin(), - AE = F->arg_end(); AI != AE; ++AI) - if (IsPotentialUse(AI)) - return IC_User; - return IC_None; + break; } } return GetCallSiteClass(CI); @@ -382,14 +380,14 @@ static InstructionClass GetBasicInstructionClass(const Value *V) { return isa<InvokeInst>(V) ? IC_CallOrUser : IC_User; } -/// IsRetain - Test if the the given class is objc_retain or +/// IsRetain - Test if the given class is objc_retain or /// equivalent. static bool IsRetain(InstructionClass Class) { return Class == IC_Retain || Class == IC_RetainRV; } -/// IsAutorelease - Test if the the given class is objc_autorelease or +/// IsAutorelease - Test if the given class is objc_autorelease or /// equivalent. static bool IsAutorelease(InstructionClass Class) { return Class == IC_Autorelease || @@ -444,7 +442,7 @@ static bool IsNoThrow(InstructionClass Class) { Class == IC_AutoreleasepoolPop; } -/// EraseInstruction - Erase the given instruction. ObjC calls return their +/// EraseInstruction - Erase the given instruction. Many ObjC calls return their /// argument verbatim, so if it's such a call and the return value has users, /// replace them with the argument value. static void EraseInstruction(Instruction *CI) { @@ -565,9 +563,8 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) { return Arg; } - // If we found an identifiable object but it has multiple uses, but they - // are trivial uses, we can still consider this to be a single-use - // value. + // If we found an identifiable object but it has multiple uses, but they are + // trivial uses, we can still consider this to be a single-use value. if (IsObjCIdentifiedObject(Arg)) { for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end(); UI != UE; ++UI) { @@ -692,7 +689,7 @@ namespace { /// specified pass info. 
virtual void *getAdjustedAnalysisPointer(const void *PI) { if (PI == &AliasAnalysis::ID) - return (AliasAnalysis*)this; + return static_cast<AliasAnalysis *>(this); return this; } @@ -815,7 +812,7 @@ ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) { case IC_FusedRetainAutorelease: case IC_FusedRetainAutoreleaseRV: // These functions don't access any memory visible to the compiler. - // Note that this doesn't include objc_retainBlock, becuase it updates + // Note that this doesn't include objc_retainBlock, because it updates // pointers when it copies block data. return NoModRef; default: @@ -915,6 +912,7 @@ bool ObjCARCExpand::runOnFunction(Function &F) { //===----------------------------------------------------------------------===// #include "llvm/Constants.h" +#include "llvm/ADT/STLExtras.h" namespace { /// ObjCARCAPElim - Autorelease pool elimination. @@ -922,8 +920,8 @@ namespace { virtual void getAnalysisUsage(AnalysisUsage &AU) const; virtual bool runOnModule(Module &M); - bool MayAutorelease(CallSite CS, unsigned Depth = 0); - bool OptimizeBB(BasicBlock *BB); + static bool MayAutorelease(ImmutableCallSite CS, unsigned Depth = 0); + static bool OptimizeBB(BasicBlock *BB); public: static char ID; @@ -949,15 +947,16 @@ void ObjCARCAPElim::getAnalysisUsage(AnalysisUsage &AU) const { /// MayAutorelease - Interprocedurally determine if calls made by the /// given call site can possibly produce autoreleases. -bool ObjCARCAPElim::MayAutorelease(CallSite CS, unsigned Depth) { - if (Function *Callee = CS.getCalledFunction()) { +bool ObjCARCAPElim::MayAutorelease(ImmutableCallSite CS, unsigned Depth) { + if (const Function *Callee = CS.getCalledFunction()) { if (Callee->isDeclaration() || Callee->mayBeOverridden()) return true; - for (Function::iterator I = Callee->begin(), E = Callee->end(); + for (Function::const_iterator I = Callee->begin(), E = Callee->end(); I != E; ++I) { - BasicBlock *BB = I; - for (BasicBlock::iterator J = BB->begin(), F = BB->end(); J != F; ++J) - if (CallSite JCS = CallSite(J)) + const BasicBlock *BB = I; + for (BasicBlock::const_iterator J = BB->begin(), F = BB->end(); + J != F; ++J) + if (ImmutableCallSite JCS = ImmutableCallSite(J)) // This recursion depth limit is arbitrary. It's just great // enough to cover known interesting testcases. if (Depth < 3 && @@ -992,7 +991,7 @@ bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) { Push = 0; break; case IC_CallOrUser: - if (MayAutorelease(CallSite(Inst))) + if (MayAutorelease(ImmutableCallSite(Inst))) Push = 0; break; default: @@ -1093,14 +1092,10 @@ bool ObjCARCAPElim::runOnModule(Module &M) { // TODO: Delete release+retain pairs (rare). -#include "llvm/GlobalAlias.h" -#include "llvm/Constants.h" #include "llvm/LLVMContext.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/CFG.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/DenseSet.h" STATISTIC(NumNoops, "Number of no-op objc calls eliminated"); STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated"); @@ -1148,22 +1143,13 @@ bool ProvenanceAnalysis::relatedSelect(const SelectInst *A, const Value *B) { // If the values are Selects with the same condition, we can do a more precise // check: just check for relations between the values on corresponding arms. 
   if (const SelectInst *SB = dyn_cast<SelectInst>(B))
-    if (A->getCondition() == SB->getCondition()) {
-      if (related(A->getTrueValue(), SB->getTrueValue()))
-        return true;
-      if (related(A->getFalseValue(), SB->getFalseValue()))
-        return true;
-      return false;
-    }
+    if (A->getCondition() == SB->getCondition())
+      return related(A->getTrueValue(), SB->getTrueValue()) ||
+             related(A->getFalseValue(), SB->getFalseValue());
 
   // Check both arms of the Select node individually.
-  if (related(A->getTrueValue(), B))
-    return true;
-  if (related(A->getFalseValue(), B))
-    return true;
-
-  // The arms both checked out.
-  return false;
+  return related(A->getTrueValue(), B) ||
+         related(A->getFalseValue(), B);
 }
 
 bool ProvenanceAnalysis::relatedPHI(const PHINode *A, const Value *B) {
@@ -1361,12 +1347,6 @@ namespace {
     /// with the "tail" keyword.
     bool IsTailCallRelease;
 
-    /// Partial - True of we've seen an opportunity for partial RR elimination,
-    /// such as pushing calls into a CFG triangle or into one side of a
-    /// CFG diamond.
-    /// TODO: Consider moving this to PtrState.
-    bool Partial;
-
     /// ReleaseMetadata - If the Calls are objc_release calls and they all have
     /// a clang.imprecise_release tag, this is the metadata tag.
     MDNode *ReleaseMetadata;
@@ -1381,7 +1361,7 @@ namespace {
 
     RRInfo() :
       KnownSafe(false), IsRetainBlock(false),
-      IsTailCallRelease(false), Partial(false),
+      IsTailCallRelease(false),
       ReleaseMetadata(0) {}
 
     void clear();
@@ -1392,7 +1372,6 @@ void RRInfo::clear() {
   KnownSafe = false;
   IsRetainBlock = false;
   IsTailCallRelease = false;
-  Partial = false;
   ReleaseMetadata = 0;
   Calls.clear();
   ReverseInsertPts.clear();
@@ -1402,36 +1381,39 @@ namespace {
   /// PtrState - This class summarizes several per-pointer runtime properties
   /// which are propagated through the flow graph.
   class PtrState {
-    /// RefCount - The known minimum number of reference count increments.
-    unsigned RefCount;
-
     /// NestCount - The known minimum level of retain+release nesting.
     unsigned NestCount;
 
+    /// KnownPositiveRefCount - True if the reference count is known to
+    /// be incremented.
+    bool KnownPositiveRefCount;
+
+    /// Partial - True if we've seen an opportunity for partial RR elimination,
+    /// such as pushing calls into a CFG triangle or into one side of a
+    /// CFG diamond.
+    bool Partial;
+
     /// Seq - The current position in the sequence.
-    Sequence Seq;
+    Sequence Seq : 8;
 
   public:
     /// RRI - Unidirectional information about the current sequence.
     /// TODO: Encapsulate this better.
RRInfo RRI; - PtrState() : RefCount(0), NestCount(0), Seq(S_None) {} - - void SetAtLeastOneRefCount() { - if (RefCount == 0) RefCount = 1; - } + PtrState() : NestCount(0), KnownPositiveRefCount(false), Partial(false), + Seq(S_None) {} - void IncrementRefCount() { - if (RefCount != UINT_MAX) ++RefCount; + void SetKnownPositiveRefCount() { + KnownPositiveRefCount = true; } - void DecrementRefCount() { - if (RefCount != 0) --RefCount; + void ClearRefCount() { + KnownPositiveRefCount = false; } bool IsKnownIncremented() const { - return RefCount > 0; + return KnownPositiveRefCount; } void IncrementNestCount() { @@ -1455,7 +1437,12 @@ namespace { } void ClearSequenceProgress() { - Seq = S_None; + ResetSequenceProgress(S_None); + } + + void ResetSequenceProgress(Sequence NewSeq) { + Seq = NewSeq; + Partial = false; RRI.clear(); } @@ -1466,7 +1453,7 @@ namespace { void PtrState::Merge(const PtrState &Other, bool TopDown) { Seq = MergeSeqs(Seq, Other.Seq, TopDown); - RefCount = std::min(RefCount, Other.RefCount); + KnownPositiveRefCount = KnownPositiveRefCount && Other.KnownPositiveRefCount; NestCount = std::min(NestCount, Other.NestCount); // We can't merge a plain objc_retain with an objc_retainBlock. @@ -1475,31 +1462,31 @@ PtrState::Merge(const PtrState &Other, bool TopDown) { // If we're not in a sequence (anymore), drop all associated state. if (Seq == S_None) { + Partial = false; RRI.clear(); - } else if (RRI.Partial || Other.RRI.Partial) { + } else if (Partial || Other.Partial) { // If we're doing a merge on a path that's previously seen a partial // merge, conservatively drop the sequence, to avoid doing partial // RR elimination. If the branch predicates for the two merge differ, // mixing them is unsafe. - Seq = S_None; - RRI.clear(); + ClearSequenceProgress(); } else { // Conservatively merge the ReleaseMetadata information. if (RRI.ReleaseMetadata != Other.RRI.ReleaseMetadata) RRI.ReleaseMetadata = 0; RRI.KnownSafe = RRI.KnownSafe && Other.RRI.KnownSafe; - RRI.IsTailCallRelease = RRI.IsTailCallRelease && Other.RRI.IsTailCallRelease; + RRI.IsTailCallRelease = RRI.IsTailCallRelease && + Other.RRI.IsTailCallRelease; RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end()); // Merge the insert point sets. If there are any differences, // that makes this a partial merge. - RRI.Partial = RRI.ReverseInsertPts.size() != - Other.RRI.ReverseInsertPts.size(); + Partial = RRI.ReverseInsertPts.size() != Other.RRI.ReverseInsertPts.size(); for (SmallPtrSet<Instruction *, 2>::const_iterator I = Other.RRI.ReverseInsertPts.begin(), E = Other.RRI.ReverseInsertPts.end(); I != E; ++I) - RRI.Partial |= RRI.ReverseInsertPts.insert(*I); + Partial |= RRI.ReverseInsertPts.insert(*I); } } @@ -1525,6 +1512,11 @@ namespace { /// known about a pointer at the top of each block. MapTy PerPtrBottomUp; + /// Preds, Succs - Effective successors and predecessors of the current + /// block (this ignores ignorable edges and ignored backedges). + SmallVector<BasicBlock *, 2> Preds; + SmallVector<BasicBlock *, 2> Succs; + public: BBState() : TopDownPathCount(0), BottomUpPathCount(0) {} @@ -1582,14 +1574,22 @@ namespace { /// entry to an exit which pass through this block. This is only valid /// after both the top-down and bottom-up traversals are complete. 
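A reduced model of the merge semantics after this change (only the two fields shown above are modeled; this is an assumption-laden sketch, not the pass's real class): the old counted RefCount, merged with min(), becomes a known-positive bit merged with logical AND, which is the same lattice meet restricted to the two points "zero" and "at least one".

struct PtrStateModel {
  bool KnownPositiveRefCount;   // replaces the old 'unsigned RefCount'
  unsigned NestCount;

  void merge(const PtrStateModel &Other) {
    // min(a, b) > 0 iff a > 0 && b > 0, so the bit form loses nothing
    // that the optimizer actually consumed.
    KnownPositiveRefCount = KnownPositiveRefCount &&
                            Other.KnownPositiveRefCount;
    NestCount = NestCount < Other.NestCount ? NestCount : Other.NestCount;
  }
};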
unsigned GetAllPathCount() const { + assert(TopDownPathCount != 0); + assert(BottomUpPathCount != 0); return TopDownPathCount * BottomUpPathCount; } - /// IsVisitedTopDown - Test whether the block for this BBState has been - /// visited by the top-down portion of the algorithm. - bool isVisitedTopDown() const { - return TopDownPathCount != 0; - } + // Specialized CFG utilities. + typedef SmallVectorImpl<BasicBlock *>::const_iterator edge_iterator; + edge_iterator pred_begin() { return Preds.begin(); } + edge_iterator pred_end() { return Preds.end(); } + edge_iterator succ_begin() { return Succs.begin(); } + edge_iterator succ_end() { return Succs.end(); } + + void addSucc(BasicBlock *Succ) { Succs.push_back(Succ); } + void addPred(BasicBlock *Pred) { Preds.push_back(Pred); } + + bool isExit() const { return Succs.empty(); } }; } @@ -1787,12 +1787,9 @@ Constant *ObjCARCOpt::getRetainRVCallee(Module *M) { if (!RetainRVCallee) { LLVMContext &C = M->getContext(); Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - std::vector<Type *> Params; - Params.push_back(I8X); - FunctionType *FTy = - FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); RetainRVCallee = M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy, Attributes); @@ -1804,12 +1801,9 @@ Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) { if (!AutoreleaseRVCallee) { LLVMContext &C = M->getContext(); Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - std::vector<Type *> Params; - Params.push_back(I8X); - FunctionType *FTy = - FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); AutoreleaseRVCallee = M->getOrInsertFunction("objc_autoreleaseReturnValue", FTy, Attributes); @@ -1820,10 +1814,8 @@ Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) { Constant *ObjCARCOpt::getReleaseCallee(Module *M) { if (!ReleaseCallee) { LLVMContext &C = M->getContext(); - std::vector<Type *> Params; - Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C))); - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); ReleaseCallee = M->getOrInsertFunction( "objc_release", @@ -1836,10 +1828,8 @@ Constant *ObjCARCOpt::getReleaseCallee(Module *M) { Constant *ObjCARCOpt::getRetainCallee(Module *M) { if (!RetainCallee) { LLVMContext &C = M->getContext(); - std::vector<Type *> Params; - Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C))); - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); RetainCallee = M->getOrInsertFunction( "objc_retain", @@ -1852,16 +1842,14 @@ Constant *ObjCARCOpt::getRetainCallee(Module *M) { Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) { if (!RetainBlockCallee) { LLVMContext &C = M->getContext(); - std::vector<Type *> Params; - 
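A worked example for the GetAllPathCount product above, on an assumed diamond CFG entry -> {A, B} -> exit:

// TopDownPathCount counts entry-to-block paths:
//   entry = 1, A = 1, B = 1, exit = 1 + 1 = 2.
// BottomUpPathCount counts block-to-exit paths:
//   exit = 1, A = 1, B = 1, entry = 2.
// GetAllPathCount() for A is therefore 1 * 1 = 1 (one complete path runs
// through A), while for entry or exit it is 1 * 2 = 2, the total number of
// paths. The new asserts hold because both traversals visit every block
// before the product is taken.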
Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C))); - AttrListPtr Attributes; + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; // objc_retainBlock is not nounwind because it calls user copy constructors // which could theoretically throw. RetainBlockCallee = M->getOrInsertFunction( "objc_retainBlock", FunctionType::get(Params[0], Params, /*isVarArg=*/false), - Attributes); + AttrListPtr()); } return RetainBlockCallee; } @@ -1869,10 +1857,8 @@ Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) { Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) { if (!AutoreleaseCallee) { LLVMContext &C = M->getContext(); - std::vector<Type *> Params; - Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C))); - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); AutoreleaseCallee = M->getOrInsertFunction( "objc_autorelease", @@ -2157,13 +2143,13 @@ static bool isNoopInstruction(const Instruction *I) { /// objc_retainAutoreleasedReturnValue if the operand is a return value. void ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) { - CallSite CS(GetObjCArg(Retain)); - Instruction *Call = CS.getInstruction(); + ImmutableCallSite CS(GetObjCArg(Retain)); + const Instruction *Call = CS.getInstruction(); if (!Call) return; if (Call->getParent() != Retain->getParent()) return; // Check that the call is next to the retain. - BasicBlock::iterator I = Call; + BasicBlock::const_iterator I = Call; ++I; while (isNoopInstruction(I)) ++I; if (&*I != Retain) @@ -2176,25 +2162,24 @@ ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) { } /// OptimizeRetainRVCall - Turn objc_retainAutoreleasedReturnValue into -/// objc_retain if the operand is not a return value. Or, if it can be -/// paired with an objc_autoreleaseReturnValue, delete the pair and -/// return true. +/// objc_retain if the operand is not a return value. Or, if it can be paired +/// with an objc_autoreleaseReturnValue, delete the pair and return true. bool ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { // Check for the argument being from an immediately preceding call or invoke. - Value *Arg = GetObjCArg(RetainRV); - CallSite CS(Arg); - if (Instruction *Call = CS.getInstruction()) { + const Value *Arg = GetObjCArg(RetainRV); + ImmutableCallSite CS(Arg); + if (const Instruction *Call = CS.getInstruction()) { if (Call->getParent() == RetainRV->getParent()) { - BasicBlock::iterator I = Call; + BasicBlock::const_iterator I = Call; ++I; while (isNoopInstruction(I)) ++I; if (&*I == RetainRV) return false; - } else if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { + } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) { BasicBlock *RetainRVParent = RetainRV->getParent(); if (II->getNormalDest() == RetainRVParent) { - BasicBlock::iterator I = RetainRVParent->begin(); + BasicBlock::const_iterator I = RetainRVParent->begin(); while (isNoopInstruction(I)) ++I; if (&*I == RetainRV) return false; @@ -2422,7 +2407,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { // These can always be moved up. break; case IC_Release: - // These can't be moved across things that care about the retain count. + // These can't be moved across things that care about the retain + // count. 
FindDependencies(NeedsPositiveRetainCount, Arg, Inst->getParent(), Inst, DependingInstructions, Visited, PA); @@ -2504,13 +2490,14 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, for (; SI != SE; ++SI) { Sequence SuccSSeq = S_None; bool SuccSRRIKnownSafe = false; - // If VisitBottomUp has visited this successor, take what we know about it. - DenseMap<const BasicBlock *, BBState>::iterator BBI = BBStates.find(*SI); - if (BBI != BBStates.end()) { - const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); - SuccSSeq = SuccS.GetSeq(); - SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; - } + // If VisitBottomUp has pointer information for this successor, take + // what we know about it. + DenseMap<const BasicBlock *, BBState>::iterator BBI = + BBStates.find(*SI); + assert(BBI != BBStates.end()); + const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); + SuccSSeq = SuccS.GetSeq(); + SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; switch (SuccSSeq) { case S_None: case S_CanRelease: { @@ -2557,13 +2544,14 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, for (; SI != SE; ++SI) { Sequence SuccSSeq = S_None; bool SuccSRRIKnownSafe = false; - // If VisitBottomUp has visited this successor, take what we know about it. - DenseMap<const BasicBlock *, BBState>::iterator BBI = BBStates.find(*SI); - if (BBI != BBStates.end()) { - const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); - SuccSSeq = SuccS.GetSeq(); - SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; - } + // If VisitBottomUp has pointer information for this successor, take + // what we know about it. + DenseMap<const BasicBlock *, BBState>::iterator BBI = + BBStates.find(*SI); + assert(BBI != BBStates.end()); + const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); + SuccSSeq = SuccS.GetSeq(); + SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; switch (SuccSSeq) { case S_None: { if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) { @@ -2621,16 +2609,13 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease) NestingDetected = true; - S.RRI.clear(); - MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); - S.SetSeq(ReleaseMetadata ? S_MovableRelease : S_Release); + S.ResetSequenceProgress(ReleaseMetadata ? S_MovableRelease : S_Release); S.RRI.ReleaseMetadata = ReleaseMetadata; S.RRI.KnownSafe = S.IsKnownNested() || S.IsKnownIncremented(); S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); S.RRI.Calls.insert(Inst); - S.IncrementRefCount(); S.IncrementNestCount(); break; } @@ -2645,8 +2630,7 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, Arg = GetObjCArg(Inst); PtrState &S = MyStates.getPtrBottomUpState(Arg); - S.DecrementRefCount(); - S.SetAtLeastOneRefCount(); + S.SetKnownPositiveRefCount(); S.DecrementNestCount(); switch (S.GetSeq()) { @@ -2696,7 +2680,7 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, // Check for possible releases. if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - S.DecrementRefCount(); + S.ClearRefCount(); switch (Seq) { case S_Use: S.SetSeq(S_CanRelease); @@ -2763,37 +2747,20 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, // Merge the states from each successor to compute the initial state // for the current block. 
-  const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
-  succ_const_iterator SI(TI), SE(TI, false);
-  if (SI == SE)
-    MyStates.SetAsExit();
-  else {
-    // If the terminator is an invoke marked with the
-    // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be
-    // ignored, for ARC purposes.
-    if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind))
-      --SE;
-
-    do {
-      const BasicBlock *Succ = *SI++;
-      if (Succ == BB)
-        continue;
-      DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
-      // If we haven't seen this node yet, then we've found a CFG cycle.
-      // Be optimistic here; it's CheckForCFGHazards' job detect trouble.
-      if (I == BBStates.end())
-        continue;
-      MyStates.InitFromSucc(I->second);
-      while (SI != SE) {
-        Succ = *SI++;
-        if (Succ != BB) {
-          I = BBStates.find(Succ);
-          if (I != BBStates.end())
-            MyStates.MergeSucc(I->second);
-        }
-      }
-      break;
-    } while (SI != SE);
+  for (BBState::edge_iterator SI(MyStates.succ_begin()),
+       SE(MyStates.succ_end()); SI != SE; ++SI) {
+    const BasicBlock *Succ = *SI;
+    DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
+    assert(I != BBStates.end());
+    MyStates.InitFromSucc(I->second);
+    ++SI;
+    for (; SI != SE; ++SI) {
+      Succ = *SI;
+      I = BBStates.find(Succ);
+      assert(I != BBStates.end());
+      MyStates.MergeSucc(I->second);
+    }
+    break;
   }
 
   // Visit all the instructions, bottom-up.
@@ -2807,15 +2774,14 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
     NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates);
   }
 
-  // If there's a predecessor with an invoke, visit the invoke as
-  // if it were part of this block, since we can't insert code after
-  // an invoke in its own block, and we don't want to split critical
-  // edges.
-  for (pred_iterator PI(BB), PE(BB, false); PI != PE; ++PI) {
+  // If there's a predecessor with an invoke, visit the invoke as if it were
+  // part of this block, since we can't insert code after an invoke in its own
+  // block, and we don't want to split critical edges.
+  for (BBState::edge_iterator PI(MyStates.pred_begin()),
+       PE(MyStates.pred_end()); PI != PE; ++PI) {
     BasicBlock *Pred = *PI;
-    TerminatorInst *PredTI = cast<TerminatorInst>(&Pred->back());
-    if (isa<InvokeInst>(PredTI))
-      NestingDetected |= VisitInstructionBottomUp(PredTI, BB, Retains, MyStates);
+    if (InvokeInst *II = dyn_cast<InvokeInst>(&Pred->back()))
+      NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates);
   }
 
   return NestingDetected;
@@ -2855,25 +2821,23 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
       if (S.GetSeq() == S_Retain)
         NestingDetected = true;
 
-      S.SetSeq(S_Retain);
-      S.RRI.clear();
+      S.ResetSequenceProgress(S_Retain);
       S.RRI.IsRetainBlock = Class == IC_RetainBlock;
-      // Don't check S.IsKnownIncremented() here because it's not
-      // sufficient.
+      // Don't check S.IsKnownIncremented() here because it's not sufficient.
       S.RRI.KnownSafe = S.IsKnownNested();
       S.RRI.Calls.insert(Inst);
     }
 
-    S.SetAtLeastOneRefCount();
-    S.IncrementRefCount();
     S.IncrementNestCount();
-    return NestingDetected;
+
+    // A retain can be a potential use; proceed to the generic checking
+    // code below.
+    break;
  }
  case IC_Release: {
    Arg = GetObjCArg(Inst);
    PtrState &S = MyStates.getPtrTopDownState(Arg);
 
-    S.DecrementRefCount();
    S.DecrementNestCount();
 
    switch (S.GetSeq()) {
@@ -2920,7 +2884,7 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
 
    // Check for possible releases.
if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - S.DecrementRefCount(); + S.ClearRefCount(); switch (Seq) { case S_Retain: S.SetSeq(S_CanRelease); @@ -2971,41 +2935,21 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, // Merge the states from each predecessor to compute the initial state // for the current block. - const_pred_iterator PI(BB), PE(BB, false); - if (PI == PE) - MyStates.SetAsEntry(); - else - do { - unsigned OperandNo = PI.getOperandNo(); - const Use &Us = PI.getUse(); - ++PI; - - // Skip invoke unwind edges on invoke instructions marked with - // clang.arc.no_objc_arc_exceptions. - if (const InvokeInst *II = dyn_cast<InvokeInst>(Us.getUser())) - if (OperandNo == II->getNumArgOperands() + 2 && - II->getMetadata(NoObjCARCExceptionsMDKind)) - continue; - - const BasicBlock *Pred = cast<TerminatorInst>(Us.getUser())->getParent(); - if (Pred == BB) - continue; - DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred); - // If we haven't seen this node yet, then we've found a CFG cycle. - // Be optimistic here; it's CheckForCFGHazards' job detect trouble. - if (I == BBStates.end() || !I->second.isVisitedTopDown()) - continue; - MyStates.InitFromPred(I->second); - while (PI != PE) { - Pred = *PI++; - if (Pred != BB) { - I = BBStates.find(Pred); - if (I != BBStates.end() && I->second.isVisitedTopDown()) - MyStates.MergePred(I->second); - } - } - break; - } while (PI != PE); + for (BBState::edge_iterator PI(MyStates.pred_begin()), + PE(MyStates.pred_end()); PI != PE; ++PI) { + const BasicBlock *Pred = *PI; + DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred); + assert(I != BBStates.end()); + MyStates.InitFromPred(I->second); + ++PI; + for (; PI != PE; ++PI) { + Pred = *PI; + I = BBStates.find(Pred); + assert(I != BBStates.end()); + MyStates.MergePred(I->second); + } + break; + } // Visit all the instructions, top-down. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { @@ -3020,73 +2964,82 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, static void ComputePostOrders(Function &F, SmallVectorImpl<BasicBlock *> &PostOrder, - SmallVectorImpl<BasicBlock *> &ReverseCFGPostOrder) { - /// Backedges - Backedges detected in the DFS. These edges will be - /// ignored in the reverse-CFG DFS, so that loops with multiple exits will be - /// traversed in the desired order. - DenseSet<std::pair<BasicBlock *, BasicBlock *> > Backedges; - + SmallVectorImpl<BasicBlock *> &ReverseCFGPostOrder, + unsigned NoObjCARCExceptionsMDKind, + DenseMap<const BasicBlock *, BBState> &BBStates) { /// Visited - The visited set, for doing DFS walks. SmallPtrSet<BasicBlock *, 16> Visited; // Do DFS, computing the PostOrder. SmallPtrSet<BasicBlock *, 16> OnStack; SmallVector<std::pair<BasicBlock *, succ_iterator>, 16> SuccStack; + + // Functions always have exactly one entry block, and we don't have + // any other block that we treat like an entry block. 
   BasicBlock *EntryBB = &F.getEntryBlock();
-  SuccStack.push_back(std::make_pair(EntryBB, succ_begin(EntryBB)));
+  BBState &MyStates = BBStates[EntryBB];
+  MyStates.SetAsEntry();
+  TerminatorInst *EntryTI = cast<TerminatorInst>(&EntryBB->back());
+  SuccStack.push_back(std::make_pair(EntryBB, succ_iterator(EntryTI)));
   Visited.insert(EntryBB);
   OnStack.insert(EntryBB);
   do {
   dfs_next_succ:
-    TerminatorInst *TI = cast<TerminatorInst>(&SuccStack.back().first->back());
-    succ_iterator End = succ_iterator(TI, true);
-    while (SuccStack.back().second != End) {
-      BasicBlock *BB = *SuccStack.back().second++;
-      if (Visited.insert(BB)) {
-        SuccStack.push_back(std::make_pair(BB, succ_begin(BB)));
-        OnStack.insert(BB);
+    BasicBlock *CurrBB = SuccStack.back().first;
+    TerminatorInst *TI = cast<TerminatorInst>(&CurrBB->back());
+    succ_iterator SE(TI, false);
+
+    // If the terminator is an invoke marked with the
+    // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be
+    // ignored, for ARC purposes.
+    if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind))
+      --SE;
+
+    while (SuccStack.back().second != SE) {
+      BasicBlock *SuccBB = *SuccStack.back().second++;
+      if (Visited.insert(SuccBB)) {
+        TerminatorInst *TI = cast<TerminatorInst>(&SuccBB->back());
+        SuccStack.push_back(std::make_pair(SuccBB, succ_iterator(TI)));
+        BBStates[CurrBB].addSucc(SuccBB);
+        BBState &SuccStates = BBStates[SuccBB];
+        SuccStates.addPred(CurrBB);
+        OnStack.insert(SuccBB);
         goto dfs_next_succ;
       }
-      if (OnStack.count(BB))
-        Backedges.insert(std::make_pair(SuccStack.back().first, BB));
+
+      if (!OnStack.count(SuccBB)) {
+        BBStates[CurrBB].addSucc(SuccBB);
+        BBStates[SuccBB].addPred(CurrBB);
+      }
     }
-    OnStack.erase(SuccStack.back().first);
-    PostOrder.push_back(SuccStack.pop_back_val().first);
+    OnStack.erase(CurrBB);
+    PostOrder.push_back(CurrBB);
+    SuccStack.pop_back();
   } while (!SuccStack.empty());
   Visited.clear();
 
-  // Compute the exits, which are the starting points for reverse-CFG DFS.
-  // This includes blocks where all the successors are backedges that
-  // we're skipping.
-  SmallVector<BasicBlock *, 4> Exits;
+  // Do reverse-CFG DFS, computing the reverse-CFG PostOrder.
+  // Functions may have many exits, and there are also blocks which we treat
+  // as exits due to ignored edges.
+  SmallVector<std::pair<BasicBlock *, BBState::edge_iterator>, 16> PredStack;
   for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
-    BasicBlock *BB = I;
-    TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
-    for (succ_iterator SI(TI), SE(TI, true); SI != SE; ++SI)
-      if (!Backedges.count(std::make_pair(BB, *SI)))
-        goto HasNonBackedgeSucc;
-    Exits.push_back(BB);
-  HasNonBackedgeSucc:;
-  }
+    BasicBlock *ExitBB = I;
+    BBState &MyStates = BBStates[ExitBB];
+    if (!MyStates.isExit())
+      continue;
 
-  // Do reverse-CFG DFS, computing the reverse-CFG PostOrder.
- SmallVector<std::pair<BasicBlock *, pred_iterator>, 16> PredStack; - for (SmallVectorImpl<BasicBlock *>::iterator I = Exits.begin(), E = Exits.end(); - I != E; ++I) { - BasicBlock *ExitBB = *I; - PredStack.push_back(std::make_pair(ExitBB, pred_begin(ExitBB))); + MyStates.SetAsExit(); + + PredStack.push_back(std::make_pair(ExitBB, MyStates.pred_begin())); Visited.insert(ExitBB); while (!PredStack.empty()) { reverse_dfs_next_succ: - pred_iterator End = pred_end(PredStack.back().first); - while (PredStack.back().second != End) { + BBState::edge_iterator PE = BBStates[PredStack.back().first].pred_end(); + while (PredStack.back().second != PE) { BasicBlock *BB = *PredStack.back().second++; - // Skip backedges detected in the forward-CFG DFS. - if (Backedges.count(std::make_pair(BB, PredStack.back().first))) - continue; if (Visited.insert(BB)) { - PredStack.push_back(std::make_pair(BB, pred_begin(BB))); + PredStack.push_back(std::make_pair(BB, BBStates[BB].pred_begin())); goto reverse_dfs_next_succ; } } @@ -3109,7 +3062,9 @@ ObjCARCOpt::Visit(Function &F, // function exit point, and we want to ignore selected cycle edges. SmallVector<BasicBlock *, 16> PostOrder; SmallVector<BasicBlock *, 16> ReverseCFGPostOrder; - ComputePostOrders(F, PostOrder, ReverseCFGPostOrder); + ComputePostOrders(F, PostOrder, ReverseCFGPostOrder, + NoObjCARCExceptionsMDKind, + BBStates); // Use reverse-postorder on the reverse CFG for bottom-up. bool BottomUpNestingDetected = false; @@ -3218,7 +3173,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> // not being managed by ObjC reference counting, so we can delete pairs // regardless of what possible decrements or uses lie between them. bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg); - + // A constant pointer can't be pointing to an object on the heap. It may // be reference-counted, but it won't be deleted. if (const LoadInst *LI = dyn_cast<LoadInst>(Arg)) @@ -3379,6 +3334,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> // Ok, everything checks out and we're all set. Let's move some code! Changed = true; + assert(OldCount != 0 && "Unreachable code?"); AnyPairsCompletelyEliminated = NewCount == 0; NumRRs += OldCount - NewCount; MoveCalls(Arg, RetainsToMove, ReleasesToMove, @@ -3519,7 +3475,7 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) { for (Value::use_iterator UI = Alloca->use_begin(), UE = Alloca->use_end(); UI != UE; ++UI) { - Instruction *UserInst = cast<Instruction>(*UI); + const Instruction *UserInst = cast<Instruction>(*UI); switch (GetBasicInstructionClass(UserInst)) { case IC_InitWeak: case IC_StoreWeak: @@ -3533,8 +3489,18 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { for (Value::use_iterator UI = Alloca->use_begin(), UE = Alloca->use_end(); UI != UE; ) { CallInst *UserInst = cast<CallInst>(*UI++); - if (!UserInst->use_empty()) - UserInst->replaceAllUsesWith(UserInst->getArgOperand(0)); + switch (GetBasicInstructionClass(UserInst)) { + case IC_InitWeak: + case IC_StoreWeak: + // These functions return their second argument. + UserInst->replaceAllUsesWith(UserInst->getArgOperand(1)); + break; + case IC_DestroyWeak: + // No return value. 
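A self-contained sketch of the iterative post-order DFS that ComputePostOrders implements above, on a toy adjacency-list graph; the types are simplified stand-ins for the LLVM ones, and the backedge/ignored-edge handling is omitted.

#include <cstddef>
#include <utility>
#include <vector>

std::vector<int> postOrder(const std::vector<std::vector<int> > &succs,
                           int entry) {
  std::vector<int> order;
  std::vector<bool> visited(succs.size(), false);
  // Each stack entry pairs a node with the index of its next unexplored
  // successor, mirroring the (BasicBlock *, succ_iterator) SuccStack.
  std::vector<std::pair<int, std::size_t> > stack;
  stack.push_back(std::make_pair(entry, 0));
  visited[entry] = true;
  while (!stack.empty()) {
    int node = stack.back().first;
    if (stack.back().second < succs[node].size()) {
      int next = succs[node][stack.back().second++];
      if (!visited[next]) {
        visited[next] = true;
        stack.push_back(std::make_pair(next, 0));
      }
    } else {
      order.push_back(node);   // all successors finished: emit in post-order
      stack.pop_back();
    }
  }
  return order;
}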
+ break; + default: + llvm_unreachable("alloca really is used!"); + } UserInst->eraseFromParent(); } Alloca->eraseFromParent(); @@ -3602,8 +3568,7 @@ void ObjCARCOpt::OptimizeReturns(Function &F) { dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); if (!Autorelease) goto next_block; - InstructionClass AutoreleaseClass = - GetBasicInstructionClass(Autorelease); + InstructionClass AutoreleaseClass = GetBasicInstructionClass(Autorelease); if (!IsAutorelease(AutoreleaseClass)) goto next_block; if (GetObjCArg(Autorelease) != Arg) @@ -3694,7 +3659,7 @@ bool ObjCARCOpt::doInitialization(Module &M) { // Intuitively, objc_retain and others are nocapture, however in practice // they are not, because they return their argument value. And objc_release - // calls finalizers. + // calls finalizers which can have arbitrary side effects. // These are initialized lazily. RetainRVCallee = 0; @@ -3746,8 +3711,8 @@ bool ObjCARCOpt::runOnFunction(Function &F) { while (OptimizeSequences(F)) {} // Optimizations if objc_autorelease is used. - if (UsedInThisFunction & - ((1 << IC_Autorelease) | (1 << IC_AutoreleaseRV))) + if (UsedInThisFunction & ((1 << IC_Autorelease) | + (1 << IC_AutoreleaseRV))) OptimizeReturns(F); return Changed; @@ -3795,7 +3760,7 @@ namespace { /// StoreStrongCalls - The set of inserted objc_storeStrong calls. If /// at the end of walking the function we have found no alloca /// instructions, these calls can be marked "tail". - DenseSet<CallInst *> StoreStrongCalls; + SmallPtrSet<CallInst *, 8> StoreStrongCalls; Constant *getStoreStrongCallee(Module *M); Constant *getRetainAutoreleaseCallee(Module *M); @@ -3846,13 +3811,11 @@ Constant *ObjCARCContract::getStoreStrongCallee(Module *M) { LLVMContext &C = M->getContext(); Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); Type *I8XX = PointerType::getUnqual(I8X); - std::vector<Type *> Params; - Params.push_back(I8XX); - Params.push_back(I8X); + Type *Params[] = { I8XX, I8X }; - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); - Attributes.addAttr(1, Attribute::NoCapture); + AttrListPtr Attributes = AttrListPtr() + .addAttr(~0u, Attribute::NoUnwind) + .addAttr(1, Attribute::NoCapture); StoreStrongCallee = M->getOrInsertFunction( @@ -3867,12 +3830,9 @@ Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) { if (!RetainAutoreleaseCallee) { LLVMContext &C = M->getContext(); Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - std::vector<Type *> Params; - Params.push_back(I8X); - FunctionType *FTy = - FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); RetainAutoreleaseCallee = M->getOrInsertFunction("objc_retainAutorelease", FTy, Attributes); } @@ -3883,12 +3843,9 @@ Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) { if (!RetainAutoreleaseRVCallee) { LLVMContext &C = M->getContext(); Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - std::vector<Type *> Params; - Params.push_back(I8X); - FunctionType *FTy = - FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); 
RetainAutoreleaseRVCallee = M->getOrInsertFunction("objc_retainAutoreleaseReturnValue", FTy, Attributes); @@ -3896,8 +3853,7 @@ Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) { return RetainAutoreleaseRVCallee; } -/// ContractAutorelease - Merge an autorelease with a retain into a fused -/// call. +/// ContractAutorelease - Merge an autorelease with a retain into a fused call. bool ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, InstructionClass Class, @@ -3958,18 +3914,41 @@ void ObjCARCContract::ContractRelease(Instruction *Release, BasicBlock *BB = Release->getParent(); if (Load->getParent() != BB) return; - // Walk down to find the store. + // Walk down to find the store and the release, which may be in either order. BasicBlock::iterator I = Load, End = BB->end(); ++I; AliasAnalysis::Location Loc = AA->getLocation(Load); - while (I != End && - (&*I == Release || - IsRetain(GetBasicInstructionClass(I)) || - !(AA->getModRefInfo(I, Loc) & AliasAnalysis::Mod))) - ++I; - StoreInst *Store = dyn_cast<StoreInst>(I); - if (!Store || !Store->isSimple()) return; - if (Store->getPointerOperand() != Loc.Ptr) return; + StoreInst *Store = 0; + bool SawRelease = false; + for (; !Store || !SawRelease; ++I) { + if (I == End) + return; + + Instruction *Inst = I; + if (Inst == Release) { + SawRelease = true; + continue; + } + + InstructionClass Class = GetBasicInstructionClass(Inst); + + // Unrelated retains are harmless. + if (IsRetain(Class)) + continue; + + if (Store) { + // The store is the point where we're going to put the objc_storeStrong, + // so make sure there are no uses after it. + if (CanUse(Inst, Load, PA, Class)) + return; + } else if (AA->getModRefInfo(Inst, Loc) & AliasAnalysis::Mod) { + // We are moving the load down to the store, so check for anything + // else which writes to the memory between the load and the store. + Store = dyn_cast<StoreInst>(Inst); + if (!Store || !Store->isSimple()) return; + if (Store->getPointerOperand() != Loc.Ptr) return; + } + } Value *New = StripPointerCastsAndObjCCalls(Store->getValueOperand()); @@ -4057,7 +4036,8 @@ bool ObjCARCContract::runOnFunction(Function &F) { // It seems that functions which "return twice" are also unsafe for the // "tail" argument, because they are setjmp, which could need to // return to an earlier stack state. - bool TailOkForStoreStrongs = !F.isVarArg() && !F.callsFunctionThatReturnsTwice(); + bool TailOkForStoreStrongs = !F.isVarArg() && + !F.callsFunctionThatReturnsTwice(); // For ObjC library calls which return their argument, replace uses of the // argument with uses of the call return value, if it dominates the use. This @@ -4087,8 +4067,22 @@ bool ObjCARCContract::runOnFunction(Function &F) { if (!RetainRVMarker) break; BasicBlock::iterator BBI = Inst; - --BBI; - while (isNoopInstruction(BBI)) --BBI; + BasicBlock *InstParent = Inst->getParent(); + + // Step up to see if the call immediately precedes the RetainRV call. + // If it's an invoke, we have to cross a block boundary. And we have + // to carefully dodge no-op instructions. 
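A hedged sketch of the load/store/release pattern that ContractRelease above matches; the runtime entry points are real ObjC runtime functions, but the void* signatures are simplified stand-ins for the id-typed originals, and the two forms are equivalent only modulo the retain of the incoming value, which objc_storeStrong performs itself.

extern "C" void objc_release(void *);
extern "C" void objc_storeStrong(void **, void *);

void beforeContraction(void **slot, void *newValue) {
  void *old = *slot;     // the load
  *slot = newValue;      // the store (newValue already retained upstream)
  objc_release(old);     // the release being contracted
}

void afterContraction(void **slot, void *newValue) {
  objc_storeStrong(slot, newValue);  // the fused form the pass emits
}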
+ do { + if (&*BBI == InstParent->begin()) { + BasicBlock *Pred = InstParent->getSinglePredecessor(); + if (!Pred) + goto decline_rv_optimization; + BBI = Pred->getTerminator(); + break; + } + --BBI; + } while (isNoopInstruction(BBI)); + if (&*BBI == GetObjCArg(Inst)) { Changed = true; InlineAsm *IA = @@ -4098,6 +4092,7 @@ bool ObjCARCContract::runOnFunction(Function &F) { /*Constraints=*/"", /*hasSideEffects=*/true); CallInst::Create(IA, "", Inst); } + decline_rv_optimization: break; } case IC_InitWeak: { @@ -4147,25 +4142,21 @@ bool ObjCARCContract::runOnFunction(Function &F) { // trivially dominate itself, which would lead us to rewriting its // argument in terms of its return value, which would lead to // infinite loops in GetObjCArg. - if (DT->isReachableFromEntry(U) && - DT->dominates(Inst, U)) { + if (DT->isReachableFromEntry(U) && DT->dominates(Inst, U)) { Changed = true; Instruction *Replacement = Inst; Type *UseTy = U.get()->getType(); if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) { // For PHI nodes, insert the bitcast in the predecessor block. - unsigned ValNo = - PHINode::getIncomingValueNumForOperand(OperandNo); - BasicBlock *BB = - PHI->getIncomingBlock(ValNo); + unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo); + BasicBlock *BB = PHI->getIncomingBlock(ValNo); if (Replacement->getType() != UseTy) Replacement = new BitCastInst(Replacement, UseTy, "", &BB->back()); // While we're here, rewrite all edges for this PHI, rather // than just one use at a time, to minimize the number of // bitcasts we emit. - for (unsigned i = 0, e = PHI->getNumIncomingValues(); - i != e; ++i) + for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) if (PHI->getIncomingBlock(i) == BB) { // Keep the UI iterator valid. if (&PHI->getOperandUse( @@ -4183,8 +4174,7 @@ bool ObjCARCContract::runOnFunction(Function &F) { } } - // If Arg is a no-op casted pointer, strip one level of casts and - // iterate. + // If Arg is a no-op casted pointer, strip one level of casts and iterate. if (const BitCastInst *BI = dyn_cast<BitCastInst>(Arg)) Arg = BI->getOperand(0); else if (isa<GEPOperator>(Arg) && @@ -4201,7 +4191,7 @@ bool ObjCARCContract::runOnFunction(Function &F) { // If this function has no escaping allocas or suspicious vararg usage, // objc_storeStrong calls can be marked with the "tail" keyword. 
if (TailOkForStoreStrongs) - for (DenseSet<CallInst *>::iterator I = StoreStrongCalls.begin(), + for (SmallPtrSet<CallInst *, 8>::iterator I = StoreStrongCalls.begin(), E = StoreStrongCalls.end(); I != E; ++I) (*I)->setTailCall(); StoreStrongCalls.clear(); diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index cb408a1..ffcf97c 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -26,21 +26,23 @@ #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" +#include "llvm/IRBuilder.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/Pass.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Assembly/Writer.h" #include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/DenseMap.h" #include <algorithm> using namespace llvm; -STATISTIC(NumLinear , "Number of insts linearized"); STATISTIC(NumChanged, "Number of insts reassociated"); STATISTIC(NumAnnihil, "Number of expr tree annihilated"); STATISTIC(NumFactor , "Number of multiplies factored"); @@ -70,13 +72,51 @@ static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) { } } #endif - + +namespace { + /// \brief Utility class representing a base and exponent pair which form one + /// factor of some product. + struct Factor { + Value *Base; + unsigned Power; + + Factor(Value *Base, unsigned Power) : Base(Base), Power(Power) {} + + /// \brief Sort factors by their Base. + struct BaseSorter { + bool operator()(const Factor &LHS, const Factor &RHS) { + return LHS.Base < RHS.Base; + } + }; + + /// \brief Compare factors for equal bases. + struct BaseEqual { + bool operator()(const Factor &LHS, const Factor &RHS) { + return LHS.Base == RHS.Base; + } + }; + + /// \brief Sort factors in descending order by their power. + struct PowerDescendingSorter { + bool operator()(const Factor &LHS, const Factor &RHS) { + return LHS.Power > RHS.Power; + } + }; + + /// \brief Compare factors for equal powers. 
+ struct PowerEqual { + bool operator()(const Factor &LHS, const Factor &RHS) { + return LHS.Power == RHS.Power; + } + }; + }; +} + namespace { class Reassociate : public FunctionPass { DenseMap<BasicBlock*, unsigned> RankMap; DenseMap<AssertingVH<Value>, unsigned> ValueRankMap; - SmallVector<WeakVH, 8> RedoInsts; - SmallVector<WeakVH, 8> DeadInsts; + SetVector<AssertingVH<Instruction> > RedoInsts; bool MadeChange; public: static char ID; // Pass identification, replacement for typeid @@ -92,18 +132,19 @@ namespace { private: void BuildRankMap(Function &F); unsigned getRank(Value *V); - Value *ReassociateExpression(BinaryOperator *I); - void RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops, - unsigned Idx = 0); + void ReassociateExpression(BinaryOperator *I); + void RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); Value *OptimizeExpression(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); Value *OptimizeAdd(Instruction *I, SmallVectorImpl<ValueEntry> &Ops); - void LinearizeExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); - void LinearizeExpr(BinaryOperator *I); + bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops, + SmallVectorImpl<Factor> &Factors); + Value *buildMinimalMultiplyDAG(IRBuilder<> &Builder, + SmallVectorImpl<Factor> &Factors); + Value *OptimizeMul(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); Value *RemoveFactorFromExpression(Value *V, Value *Factor); - void ReassociateInst(BasicBlock::iterator &BBI); - - void RemoveDeadBinaryOp(Value *V); + void EraseInst(Instruction *I); + void OptimizeInst(Instruction *I); }; } @@ -114,28 +155,24 @@ INITIALIZE_PASS(Reassociate, "reassociate", // Public interface to the Reassociate pass FunctionPass *llvm::createReassociatePass() { return new Reassociate(); } -void Reassociate::RemoveDeadBinaryOp(Value *V) { - Instruction *Op = dyn_cast<Instruction>(V); - if (!Op || !isa<BinaryOperator>(Op)) - return; - - Value *LHS = Op->getOperand(0), *RHS = Op->getOperand(1); - - ValueRankMap.erase(Op); - DeadInsts.push_back(Op); - RemoveDeadBinaryOp(LHS); - RemoveDeadBinaryOp(RHS); +/// isReassociableOp - Return true if V is an instruction of the specified +/// opcode and if it only has one use. +static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { + if (V->hasOneUse() && isa<Instruction>(V) && + cast<Instruction>(V)->getOpcode() == Opcode) + return cast<BinaryOperator>(V); + return 0; } - static bool isUnmovableInstruction(Instruction *I) { if (I->getOpcode() == Instruction::PHI || + I->getOpcode() == Instruction::LandingPad || I->getOpcode() == Instruction::Alloca || I->getOpcode() == Instruction::Load || I->getOpcode() == Instruction::Invoke || (I->getOpcode() == Instruction::Call && !isa<DbgInfoIntrinsic>(I)) || - I->getOpcode() == Instruction::UDiv || + I->getOpcode() == Instruction::UDiv || I->getOpcode() == Instruction::SDiv || I->getOpcode() == Instruction::FDiv || I->getOpcode() == Instruction::URem || @@ -198,211 +235,570 @@ unsigned Reassociate::getRank(Value *V) { return ValueRankMap[I] = Rank; } -/// isReassociableOp - Return true if V is an instruction of the specified -/// opcode and if it only has one use. -static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { - if ((V->hasOneUse() || V->use_empty()) && isa<Instruction>(V) && - cast<Instruction>(V)->getOpcode() == Opcode) - return cast<BinaryOperator>(V); - return 0; -} - /// LowerNegateToMultiply - Replace 0-X with X*-1. 
/// -static Instruction *LowerNegateToMultiply(Instruction *Neg, - DenseMap<AssertingVH<Value>, unsigned> &ValueRankMap) { +static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) { Constant *Cst = Constant::getAllOnesValue(Neg->getType()); - Instruction *Res = BinaryOperator::CreateMul(Neg->getOperand(1), Cst, "",Neg); - ValueRankMap.erase(Neg); + BinaryOperator *Res = + BinaryOperator::CreateMul(Neg->getOperand(1), Cst, "",Neg); + Neg->setOperand(1, Constant::getNullValue(Neg->getType())); // Drop use of op. Res->takeName(Neg); Neg->replaceAllUsesWith(Res); Res->setDebugLoc(Neg->getDebugLoc()); - Neg->eraseFromParent(); return Res; } -// Given an expression of the form '(A+B)+(D+C)', turn it into '(((A+B)+C)+D)'. -// Note that if D is also part of the expression tree that we recurse to -// linearize it as well. Besides that case, this does not recurse into A,B, or -// C. -void Reassociate::LinearizeExpr(BinaryOperator *I) { - BinaryOperator *LHS = cast<BinaryOperator>(I->getOperand(0)); - BinaryOperator *RHS = cast<BinaryOperator>(I->getOperand(1)); - assert(isReassociableOp(LHS, I->getOpcode()) && - isReassociableOp(RHS, I->getOpcode()) && - "Not an expression that needs linearization?"); - - DEBUG(dbgs() << "Linear" << *LHS << '\n' << *RHS << '\n' << *I << '\n'); - - // Move the RHS instruction to live immediately before I, avoiding breaking - // dominator properties. - RHS->moveBefore(I); - - // Move operands around to do the linearization. - I->setOperand(1, RHS->getOperand(0)); - RHS->setOperand(0, LHS); - I->setOperand(0, RHS); - - // Conservatively clear all the optional flags, which may not hold - // after the reassociation. - I->clearSubclassOptionalData(); - LHS->clearSubclassOptionalData(); - RHS->clearSubclassOptionalData(); - - ++NumLinear; - MadeChange = true; - DEBUG(dbgs() << "Linearized: " << *I << '\n'); - - // If D is part of this expression tree, tail recurse. - if (isReassociableOp(I->getOperand(1), I->getOpcode())) - LinearizeExpr(I); +/// CarmichaelShift - Returns k such that lambda(2^Bitwidth) = 2^k, where lambda +/// is the Carmichael function. This means that x^(2^k) === 1 mod 2^Bitwidth for +/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic. +/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every +/// even x in Bitwidth-bit arithmetic. +static unsigned CarmichaelShift(unsigned Bitwidth) { + if (Bitwidth < 3) + return Bitwidth - 1; + return Bitwidth - 2; +} + +/// IncorporateWeight - Add the extra weight 'RHS' to the existing weight 'LHS', +/// reducing the combined weight using any special properties of the operation. +/// The existing weight LHS represents the computation X op X op ... op X where +/// X occurs LHS times. The combined weight represents X op X op ... op X with +/// X occurring LHS + RHS times. If op is "Xor" for example then the combined +/// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even; +/// the routine returns 1 in LHS in the first case, and 0 in LHS in the second. +static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) { + // If we were working with infinite precision arithmetic then the combined + // weight would be LHS + RHS. 
But we are using finite precision arithmetic, + // and the APInt sum LHS + RHS may not be correct if it wraps (it is correct + // for nilpotent operations and addition, but not for idempotent operations + // and multiplication), so it is important to correctly reduce the combined + // weight back into range if wrapping would be wrong. + + // If RHS is zero then the weight didn't change. + if (RHS.isMinValue()) + return; + // If LHS is zero then the combined weight is RHS. + if (LHS.isMinValue()) { + LHS = RHS; + return; + } + // From this point on we know that neither LHS nor RHS is zero. + + if (Instruction::isIdempotent(Opcode)) { + // Idempotent means X op X === X, so any non-zero weight is equivalent to a + // weight of 1. Keeping weights at zero or one also means that wrapping is + // not a problem. + assert(LHS == 1 && RHS == 1 && "Weights not reduced!"); + return; // Return a weight of 1. + } + if (Instruction::isNilpotent(Opcode)) { + // Nilpotent means X op X === 0, so reduce weights modulo 2. + assert(LHS == 1 && RHS == 1 && "Weights not reduced!"); + LHS = 0; // 1 + 1 === 0 modulo 2. + return; + } + if (Opcode == Instruction::Add) { + // TODO: Reduce the weight by exploiting nsw/nuw? + LHS += RHS; + return; + } + + assert(Opcode == Instruction::Mul && "Unknown associative operation!"); + unsigned Bitwidth = LHS.getBitWidth(); + // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth + // can be replaced with W-CM. That's because x^W=x^(W-CM) for every Bitwidth + // bit number x, since either x is odd in which case x^CM = 1, or x is even in + // which case both x^W and x^(W - CM) are zero. By subtracting off multiples + // of CM like this weights can always be reduced to the range [0, CM+Bitwidth) + // which by a happy accident means that they can always be represented using + // Bitwidth bits. + // TODO: Reduce the weight by exploiting nsw/nuw? (Could do much better than + // the Carmichael number). + if (Bitwidth > 3) { + /// CM - The value of Carmichael's lambda function. + APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth)); + // Any weight W >= Threshold can be replaced with W - CM. + APInt Threshold = CM + Bitwidth; + assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!"); + // For Bitwidth 4 or more the following sum does not overflow. + LHS += RHS; + while (LHS.uge(Threshold)) + LHS -= CM; + } else { + // To avoid problems with overflow do everything the same as above but using + // a larger type. + unsigned CM = 1U << CarmichaelShift(Bitwidth); + unsigned Threshold = CM + Bitwidth; + assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold && + "Weights not reduced!"); + unsigned Total = LHS.getZExtValue() + RHS.getZExtValue(); + while (Total >= Threshold) + Total -= CM; + LHS = Total; + } } +/// EvaluateRepeatedConstant - Compute C op C op ... op C where the constant C +/// is repeated Weight times. +static Constant *EvaluateRepeatedConstant(unsigned Opcode, Constant *C, + APInt Weight) { + // For addition the result can be efficiently computed as the product of the + // constant and the weight. + if (Opcode == Instruction::Add) + return ConstantExpr::getMul(C, ConstantInt::get(C->getContext(), Weight)); + + // The weight might be huge, so compute by repeated squaring to ensure that + // compile time is proportional to the logarithm of the weight. + Constant *Result = 0; + Constant *Power = C; // Successively C, C op C, (C op C) op (C op C) etc. + // Visit the bits in Weight. 
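// (Editor's illustrative aside, not part of the patch.) The loop below is
// standard exponentiation by squaring: Power runs through C, C^2, C^4, ...
// and is folded into Result whenever the corresponding bit of Weight is set.
// For Weight = 5 = 0b101 it forms Result = C * C^4 = C^5, so the number of
// operations is logarithmic in Weight rather than linear.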
+ while (Weight != 0) { + // If the current bit in Weight is non-zero do Result = Result op Power. + if (Weight[0]) + Result = Result ? ConstantExpr::get(Opcode, Result, Power) : Power; + // Move on to the next bit if any more are non-zero. + Weight = Weight.lshr(1); + if (Weight.isMinValue()) + break; + // Square the power. + Power = ConstantExpr::get(Opcode, Power, Power); + } + + assert(Result && "Only positive weights supported!"); + return Result; +} -/// LinearizeExprTree - Given an associative binary expression tree, traverse -/// all of the uses putting it into canonical form. This forces a left-linear -/// form of the expression (((a+b)+c)+d), and collects information about the -/// rank of the non-tree operands. +typedef std::pair<Value*, APInt> RepeatedValue; + +/// LinearizeExprTree - Given an associative binary expression, return the leaf +/// nodes in Ops along with their weights (how many times the leaf occurs). The +/// original expression is the same as +/// (Ops[0].first op Ops[0].first op ... Ops[0].first) <- Ops[0].second times +/// op +/// (Ops[1].first op Ops[1].first op ... Ops[1].first) <- Ops[1].second times +/// op +/// ... +/// op +/// (Ops[N].first op Ops[N].first op ... Ops[N].first) <- Ops[N].second times +/// +/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct, and +/// they are all non-constant except possibly for the last one, which if it is +/// constant will have weight one (Ops[N].second === 1). +/// +/// This routine may modify the function, in which case it returns 'true'. The +/// changes it makes may well be destructive, changing the value computed by 'I' +/// to something completely different. Thus if the routine returns 'true' then +/// you MUST either replace I with a new expression computed from the Ops array, +/// or use RewriteExprTree to put the values back in. +/// +/// A leaf node is either not a binary operation of the same kind as the root +/// node 'I' (i.e. is not a binary operator at all, or is, but with a different +/// opcode), or is the same kind of binary operator but has a use which either +/// does not belong to the expression, or does belong to the expression but is +/// a leaf node. Every leaf node has at least one use that is a non-leaf node +/// of the expression, while for non-leaf nodes (except for the root 'I') every +/// use is a non-leaf node of the expression. +/// +/// For example: +/// expression graph node names +/// +/// + | I +/// / \ | +/// + + | A, B +/// / \ / \ | +/// * + * | C, D, E +/// / \ / \ / \ | +/// + * | F, G /// -/// NOTE: These intentionally destroys the expression tree operands (turning -/// them into undef values) to reduce #uses of the values. This means that the -/// caller MUST use something like RewriteExprTree to put the values back in. +/// The leaf nodes are C, E, F and G. The Ops array will contain (maybe not in +/// that order) (C, 1), (E, 1), (F, 2), (G, 2). /// -void Reassociate::LinearizeExprTree(BinaryOperator *I, - SmallVectorImpl<ValueEntry> &Ops) { - Value *LHS = I->getOperand(0), *RHS = I->getOperand(1); +/// The expression is maximal: if some instruction is a binary operator of the +/// same kind as 'I', and all of its uses are non-leaf nodes of the expression, +/// then the instruction also belongs to the expression, is not a leaf node of +/// it, and its operands also belong to the expression (but may be leaf nodes). 
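/// (Editor's illustrative aside.) As a concrete instance of the weights: for
/// I = X + X with X = A + B, where X has no uses other than the two in I, the
/// routine returns (A, 2) and (B, 2), because each of A and B is reachable
/// from I along exactly two paths.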
+/// +/// NOTE: This routine will set operands of non-leaf non-root nodes to undef in +/// order to ensure that every non-root node in the expression has *exactly one* +/// use by a non-leaf node of the expression. This destruction means that the +/// caller MUST either replace 'I' with a new expression or use something like +/// RewriteExprTree to put the values back in if the routine indicates that it +/// made a change by returning 'true'. +/// +/// In the above example either the right operand of A or the left operand of B +/// will be replaced by undef. If it is B's operand then this gives: +/// +/// + | I +/// / \ | +/// + + | A, B - operand of B replaced with undef +/// / \ \ | +/// * + * | C, D, E +/// / \ / \ / \ | +/// + * | F, G +/// +/// Note that such undef operands can only be reached by passing through 'I'. +/// For example, if you visit operands recursively starting from a leaf node +/// then you will never see such an undef operand unless you get back to 'I', +/// which requires passing through a phi node. +/// +/// Note that this routine may also mutate binary operators of the wrong type +/// that have all uses inside the expression (i.e. only used by non-leaf nodes +/// of the expression) if it can turn them into binary operators of the right +/// type and thus make the expression bigger. + +static bool LinearizeExprTree(BinaryOperator *I, + SmallVectorImpl<RepeatedValue> &Ops) { + DEBUG(dbgs() << "LINEARIZE: " << *I << '\n'); + unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits(); unsigned Opcode = I->getOpcode(); + assert(Instruction::isAssociative(Opcode) && + Instruction::isCommutative(Opcode) && + "Expected an associative and commutative operation!"); + // If we see an absorbing element then the entire expression must be equal to + // it. For example, if this is a multiplication expression and zero occurs as + // an operand somewhere in it then the result of the expression must be zero. + Constant *Absorber = ConstantExpr::getBinOpAbsorber(Opcode, I->getType()); + + // Visit all operands of the expression, keeping track of their weight (the + // number of paths from the expression root to the operand, or if you like + // the number of times that operand occurs in the linearized expression). + // For example, if I = X + A, where X = A + B, then I, X and B have weight 1 + // while A has weight two. + + // Worklist of non-leaf nodes (their operands are in the expression too) along + // with their weights, representing a certain number of paths to the operator. + // If an operator occurs in the worklist multiple times then we found multiple + // ways to get to it. + SmallVector<std::pair<BinaryOperator*, APInt>, 8> Worklist; // (Op, Weight) + Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1))); + bool MadeChange = false; + + // Leaves of the expression are values that either aren't the right kind of + // operation (eg: a constant, or a multiply in an add tree), or are, but have + // some uses that are not inside the expression. For example, in I = X + X, + // X = A + B, the value X has two uses (by I) that are in the expression. If + // X has any other uses, for example in a return instruction, then we consider + // X to be a leaf, and won't analyze it further. When we first visit a value, + // if it has more than one use then at first we conservatively consider it to + // be a leaf. Later, as the expression is explored, we may discover some more + // uses of the value from inside the expression. 
If all uses turn out to be + // from within the expression (and the value is a binary operator of the right + // kind) then the value is no longer considered to be a leaf, and its operands + // are explored. + + // Leaves - Keeps track of the set of putative leaves as well as the number of + // paths to each leaf seen so far. + typedef DenseMap<Value*, APInt> LeafMap; + LeafMap Leaves; // Leaf -> Total weight so far. + SmallVector<Value*, 8> LeafOrder; // Ensure deterministic leaf output order. + +#ifndef NDEBUG + SmallPtrSet<Value*, 8> Visited; // For sanity checking the iteration scheme. +#endif + while (!Worklist.empty()) { + std::pair<BinaryOperator*, APInt> P = Worklist.pop_back_val(); + I = P.first; // We examine the operands of this binary operator. + + for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) { // Visit operands. + Value *Op = I->getOperand(OpIdx); + APInt Weight = P.second; // Number of paths to this operand. + DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n"); + assert(!Op->use_empty() && "No uses, so how did we get to it?!"); + + // If the expression contains an absorbing element then there is no need + // to analyze it further: it must evaluate to the absorbing element. + if (Op == Absorber && !Weight.isMinValue()) { + Ops.push_back(std::make_pair(Absorber, APInt(Bitwidth, 1))); + return MadeChange; + } - // First step, linearize the expression if it is in ((A+B)+(C+D)) form. - BinaryOperator *LHSBO = isReassociableOp(LHS, Opcode); - BinaryOperator *RHSBO = isReassociableOp(RHS, Opcode); + // If this is a binary operation of the right kind with only one use then + // add its operands to the expression. + if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) { + assert(Visited.insert(Op) && "Not first visit!"); + DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n"); + Worklist.push_back(std::make_pair(BO, Weight)); + continue; + } - // If this is a multiply expression tree and it contains internal negations, - // transform them into multiplies by -1 so they can be reassociated. - if (I->getOpcode() == Instruction::Mul) { - if (!LHSBO && LHS->hasOneUse() && BinaryOperator::isNeg(LHS)) { - LHS = LowerNegateToMultiply(cast<Instruction>(LHS), ValueRankMap); - LHSBO = isReassociableOp(LHS, Opcode); - } - if (!RHSBO && RHS->hasOneUse() && BinaryOperator::isNeg(RHS)) { - RHS = LowerNegateToMultiply(cast<Instruction>(RHS), ValueRankMap); - RHSBO = isReassociableOp(RHS, Opcode); + // Appears to be a leaf. Is the operand already in the set of leaves? + LeafMap::iterator It = Leaves.find(Op); + if (It == Leaves.end()) { + // Not in the leaf map. Must be the first time we saw this operand. + assert(Visited.insert(Op) && "Not first visit!"); + if (!Op->hasOneUse()) { + // This value has uses not accounted for by the expression, so it is + // not safe to modify. Mark it as being a leaf. + DEBUG(dbgs() << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n"); + LeafOrder.push_back(Op); + Leaves[Op] = Weight; + continue; + } + // No uses outside the expression, try morphing it. + } else if (It != Leaves.end()) { + // Already in the leaf map. + assert(Visited.count(Op) && "In leaf map but not visited!"); + + // Update the number of paths to the leaf. + IncorporateWeight(It->second, Weight, Opcode); + + // The leaf already has one use from inside the expression. As we want + // exactly one such use, drop this new use of the leaf. 
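// (Editor's note.) Overwriting the operand with undef just below is safe
// only because of the contract documented above LinearizeExprTree: once the
// routine reports a change, the caller must rewrite 'I', e.g. via
// RewriteExprTree, which re-populates these operands.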
+ assert(!Op->hasOneUse() && "Only one use, but we got here twice!"); + I->setOperand(OpIdx, UndefValue::get(I->getType())); + MadeChange = true; + + // If the leaf is a binary operation of the right kind and we now see + // that its multiple original uses were in fact all by nodes belonging + // to the expression, then no longer consider it to be a leaf and add + // its operands to the expression. + if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) { + DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n"); + Worklist.push_back(std::make_pair(BO, It->second)); + Leaves.erase(It); + continue; + } + + // If we still have uses that are not accounted for by the expression + // then it is not safe to modify the value. + if (!Op->hasOneUse()) + continue; + + // No uses outside the expression, try morphing it. + Weight = It->second; + Leaves.erase(It); // Since the value may be morphed below. + } + + // At this point we have a value which, first of all, is not a binary + // expression of the right kind, and secondly, is only used inside the + // expression. This means that it can safely be modified. See if we + // can usefully morph it into an expression of the right kind. + assert((!isa<Instruction>(Op) || + cast<Instruction>(Op)->getOpcode() != Opcode) && + "Should have been handled above!"); + assert(Op->hasOneUse() && "Has uses outside the expression tree!"); + + // If this is a multiply expression, turn any internal negations into + // multiplies by -1 so they can be reassociated. + BinaryOperator *BO = dyn_cast<BinaryOperator>(Op); + if (Opcode == Instruction::Mul && BO && BinaryOperator::isNeg(BO)) { + DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO "); + BO = LowerNegateToMultiply(BO); + DEBUG(dbgs() << *BO << '\n'); + Worklist.push_back(std::make_pair(BO, Weight)); + MadeChange = true; + continue; + } + + // Failed to morph into an expression of the right type. This really is + // a leaf. + DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n"); + assert(!isReassociableOp(Op, Opcode) && "Value was morphed?"); + LeafOrder.push_back(Op); + Leaves[Op] = Weight; } } - if (!LHSBO) { - if (!RHSBO) { - // Neither the LHS or RHS as part of the tree, thus this is a leaf. As - // such, just remember these operands and their rank. - Ops.push_back(ValueEntry(getRank(LHS), LHS)); - Ops.push_back(ValueEntry(getRank(RHS), RHS)); - - // Clear the leaves out. - I->setOperand(0, UndefValue::get(I->getType())); - I->setOperand(1, UndefValue::get(I->getType())); - return; + // The leaves, repeated according to their weights, represent the linearized + // form of the expression. + Constant *Cst = 0; // Accumulate constants here. + for (unsigned i = 0, e = LeafOrder.size(); i != e; ++i) { + Value *V = LeafOrder[i]; + LeafMap::iterator It = Leaves.find(V); + if (It == Leaves.end()) + // Node initially thought to be a leaf wasn't. + continue; + assert(!isReassociableOp(V, Opcode) && "Shouldn't be a leaf!"); + APInt Weight = It->second; + if (Weight.isMinValue()) + // Leaf already output or weight reduction eliminated it. + continue; + // Ensure the leaf is only output once. + It->second = 0; + // Glob all constants together into Cst. + if (Constant *C = dyn_cast<Constant>(V)) { + C = EvaluateRepeatedConstant(Opcode, C, Weight); + Cst = Cst ?
ConstantExpr::get(Opcode, Cst, C) : C; + continue; } - - // Turn X+(Y+Z) -> (Y+Z)+X - std::swap(LHSBO, RHSBO); - std::swap(LHS, RHS); - bool Success = !I->swapOperands(); - assert(Success && "swapOperands failed"); - (void)Success; - MadeChange = true; - } else if (RHSBO) { - // Turn (A+B)+(C+D) -> (((A+B)+C)+D). This guarantees the RHS is not - // part of the expression tree. - LinearizeExpr(I); - LHS = LHSBO = cast<BinaryOperator>(I->getOperand(0)); - RHS = I->getOperand(1); - RHSBO = 0; + // Add non-constant + Ops.push_back(std::make_pair(V, Weight)); } - // Okay, now we know that the LHS is a nested expression and that the RHS is - // not. Perform reassociation. - assert(!isReassociableOp(RHS, Opcode) && "LinearizeExpr failed!"); - - // Move LHS right before I to make sure that the tree expression dominates all - // values. - LHSBO->moveBefore(I); + // Add any constants back into Ops, all globbed together and reduced to having + // weight 1 for the convenience of users. + Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType()); + if (Cst && Cst != Identity) { + // If combining multiple constants resulted in the absorber then the entire + // expression must evaluate to the absorber. + if (Cst == Absorber) + Ops.clear(); + Ops.push_back(std::make_pair(Cst, APInt(Bitwidth, 1))); + } - // Linearize the expression tree on the LHS. - LinearizeExprTree(LHSBO, Ops); + // For nilpotent operations or addition there may be no operands, for example + // because the expression was "X xor X" or consisted of 2^Bitwidth additions: + // in both cases the weight reduces to 0 causing the value to be skipped. + if (Ops.empty()) { + assert(Identity && "Associative operation without identity!"); + Ops.push_back(std::make_pair(Identity, APInt(Bitwidth, 1))); + } - // Remember the RHS operand and its rank. - Ops.push_back(ValueEntry(getRank(RHS), RHS)); - - // Clear the RHS leaf out. - I->setOperand(1, UndefValue::get(I->getType())); + return MadeChange; } // RewriteExprTree - Now that the operands for this expression tree are -// linearized and optimized, emit them in-order. This function is written to be -// tail recursive. +// linearized and optimized, emit them in-order. void Reassociate::RewriteExprTree(BinaryOperator *I, - SmallVectorImpl<ValueEntry> &Ops, - unsigned i) { - if (i+2 == Ops.size()) { - if (I->getOperand(0) != Ops[i].Op || - I->getOperand(1) != Ops[i+1].Op) { - Value *OldLHS = I->getOperand(0); - DEBUG(dbgs() << "RA: " << *I << '\n'); - I->setOperand(0, Ops[i].Op); - I->setOperand(1, Ops[i+1].Op); - - // Clear all the optional flags, which may not hold after the - // reassociation if the expression involved more than just this operation. - if (Ops.size() != 2) - I->clearSubclassOptionalData(); - - DEBUG(dbgs() << "TO: " << *I << '\n'); + SmallVectorImpl<ValueEntry> &Ops) { + assert(Ops.size() > 1 && "Single values should be used directly!"); + + // Since our optimizations should never increase the number of operations, the new + // expression can usually be written by reusing the existing binary operators + // from the original expression tree, without creating any new instructions, + // though the rewritten expression may have a completely different topology. + // We take care to not change anything if the new expression will be the same + // as the original. If more than trivial changes (like commuting operands) + // were made then we are obliged to clear out any optional subclass data like + // nsw flags.
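// (Editor's illustrative aside, not part of the patch.) For example, with an
// original tree I = (A*B)*(C*D) and Ops = [D, C, B, A] (highest rank first),
// the loop below re-emits the left-linear form I = ((B*A)*C)*D by overwriting
// the operands of I and of the two inner multiplies, so no new instructions
// are needed in the common case.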
+ + /// NodesToRewrite - Nodes from the original expression available for writing + /// the new expression into. + SmallVector<BinaryOperator*, 8> NodesToRewrite; + unsigned Opcode = I->getOpcode(); + BinaryOperator *Op = I; + + // ExpressionChanged - Non-null if the rewritten expression differs from the + // original in some non-trivial way, requiring the clearing of optional flags. + // Flags are cleared from the operator in ExpressionChanged up to I inclusive. + BinaryOperator *ExpressionChanged = 0; + for (unsigned i = 0; ; ++i) { + // The last operation (which comes earliest in the IR) is special as both + // operands will come from Ops, rather than just one with the other being + // a subexpression. + if (i+2 == Ops.size()) { + Value *NewLHS = Ops[i].Op; + Value *NewRHS = Ops[i+1].Op; + Value *OldLHS = Op->getOperand(0); + Value *OldRHS = Op->getOperand(1); + + if (NewLHS == OldLHS && NewRHS == OldRHS) + // Nothing changed, leave it alone. + break; + + if (NewLHS == OldRHS && NewRHS == OldLHS) { + // The order of the operands was reversed. Swap them. + DEBUG(dbgs() << "RA: " << *Op << '\n'); + Op->swapOperands(); + DEBUG(dbgs() << "TO: " << *Op << '\n'); + MadeChange = true; + ++NumChanged; + break; + } + + // The new operation differs non-trivially from the original. Overwrite + // the old operands with the new ones. + DEBUG(dbgs() << "RA: " << *Op << '\n'); + if (NewLHS != OldLHS) { + if (BinaryOperator *BO = isReassociableOp(OldLHS, Opcode)) + NodesToRewrite.push_back(BO); + Op->setOperand(0, NewLHS); + } + if (NewRHS != OldRHS) { + if (BinaryOperator *BO = isReassociableOp(OldRHS, Opcode)) + NodesToRewrite.push_back(BO); + Op->setOperand(1, NewRHS); + } + DEBUG(dbgs() << "TO: " << *Op << '\n'); + + ExpressionChanged = Op; + MadeChange = true; + ++NumChanged; + + break; + } + + // Not the last operation. The left-hand side will be a sub-expression + // while the right-hand side will be the current element of Ops. + Value *NewRHS = Ops[i].Op; + if (NewRHS != Op->getOperand(1)) { + DEBUG(dbgs() << "RA: " << *Op << '\n'); + if (NewRHS == Op->getOperand(0)) { + // The new right-hand side was already present as the left operand. If + // we are lucky then swapping the operands will sort out both of them. + Op->swapOperands(); + } else { + // Overwrite with the new right-hand side. + if (BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode)) + NodesToRewrite.push_back(BO); + Op->setOperand(1, NewRHS); + ExpressionChanged = Op; + } + DEBUG(dbgs() << "TO: " << *Op << '\n'); MadeChange = true; ++NumChanged; - - // If we reassociated a tree to fewer operands (e.g. (1+a+2) -> (a+3) - // delete the extra, now dead, nodes. - RemoveDeadBinaryOp(OldLHS); } - return; - } - assert(i+2 < Ops.size() && "Ops index out of range!"); - if (I->getOperand(1) != Ops[i].Op) { - DEBUG(dbgs() << "RA: " << *I << '\n'); - I->setOperand(1, Ops[i].Op); + // Now deal with the left-hand side. If this is already an operation node + // from the original expression then just rewrite the rest of the expression + // into it. + if (BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode)) { + Op = BO; + continue; + } - // Conservatively clear all the optional flags, which may not hold - // after the reassociation. - I->clearSubclassOptionalData(); + // Otherwise, grab a spare node from the original expression and use that as + // the left-hand side. If there are no nodes left then the optimizers made + // an expression with more nodes than the original! 
This usually means that + // they did something stupid but it might mean that the problem was just too + // hard (finding the minimal number of multiplications needed to realize a + // multiplication expression is NP-complete). Whatever the reason, smart or + // stupid, create a new node if there are none left. + BinaryOperator *NewOp; + if (NodesToRewrite.empty()) { + Constant *Undef = UndefValue::get(I->getType()); + NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode), + Undef, Undef, "", I); + } else { + NewOp = NodesToRewrite.pop_back_val(); + } - DEBUG(dbgs() << "TO: " << *I << '\n'); + DEBUG(dbgs() << "RA: " << *Op << '\n'); + Op->setOperand(0, NewOp); + DEBUG(dbgs() << "TO: " << *Op << '\n'); + ExpressionChanged = Op; MadeChange = true; ++NumChanged; + Op = NewOp; } - - BinaryOperator *LHS = cast<BinaryOperator>(I->getOperand(0)); - assert(LHS->getOpcode() == I->getOpcode() && - "Improper expression tree!"); - - // Compactify the tree instructions together with each other to guarantee - // that the expression tree is dominated by all of Ops. - LHS->moveBefore(I); - RewriteExprTree(LHS, Ops, i+1); -} - + // If the expression changed non-trivially then clear out all subclass data + // starting from the operator specified in ExpressionChanged, and compactify + // the operators to just before the expression root to guarantee that the + // expression tree is dominated by all of Ops. + if (ExpressionChanged) + do { + ExpressionChanged->clearSubclassOptionalData(); + if (ExpressionChanged == I) + break; + ExpressionChanged->moveBefore(I); + ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->use_begin()); + } while (1); + + // Throw away any left over nodes from the original expression. + for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i) + RedoInsts.insert(NodesToRewrite[i]); +} -// NegateValue - Insert instructions before the instruction pointed to by BI, -// that computes the negative version of the value specified. The negative -// version of the value is returned, and BI is left pointing at the instruction -// that should be processed next by the reassociation pass. -// +/// NegateValue - Insert instructions before the instruction pointed to by BI, +/// that computes the negative version of the value specified. The negative +/// version of the value is returned, and BI is left pointing at the instruction +/// that should be processed next by the reassociation pass. static Value *NegateValue(Value *V, Instruction *BI) { if (Constant *C = dyn_cast<Constant>(V)) return ConstantExpr::getNeg(C); - + // We are trying to expose opportunity for reassociation. One of the things // that we want to do to achieve this is to push a negation as deep into an // expression chain as possible, to expose the add instructions. In practice, @@ -412,22 +808,21 @@ static Value *NegateValue(Value *V, Instruction *BI) { // the constants. We assume that instcombine will clean up the mess later if // we introduce tons of unnecessary negation instructions. // - if (Instruction *I = dyn_cast<Instruction>(V)) - if (I->getOpcode() == Instruction::Add && I->hasOneUse()) { - // Push the negates through the add. - I->setOperand(0, NegateValue(I->getOperand(0), BI)); - I->setOperand(1, NegateValue(I->getOperand(1), BI)); - - // We must move the add instruction here, because the neg instructions do - // not dominate the old add instruction in general. By moving it, we are - // assured that the neg instructions we just inserted dominate the - // instruction we are about to insert after them.
- // - I->moveBefore(BI); - I->setName(I->getName()+".neg"); - return I; - } - + if (BinaryOperator *I = isReassociableOp(V, Instruction::Add)) { + // Push the negates through the add. + I->setOperand(0, NegateValue(I->getOperand(0), BI)); + I->setOperand(1, NegateValue(I->getOperand(1), BI)); + + // We must move the add instruction here, because the neg instructions do + // not dominate the old add instruction in general. By moving it, we are + // assured that the neg instructions we just inserted dominate the + // instruction we are about to insert after them. + // + I->moveBefore(BI); + I->setName(I->getName()+".neg"); + return I; + } + // Okay, we need to materialize a negated version of V with an instruction. // Scan the use lists of V to see if we have one already. for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){ @@ -443,7 +838,7 @@ static Value *NegateValue(Value *V, Instruction *BI) { // Verify that the negate is in this function, V might be a constant expr. if (TheNeg->getParent()->getParent() != BI->getParent()->getParent()) continue; - + BasicBlock::iterator InsertPt; if (Instruction *InstInput = dyn_cast<Instruction>(V)) { if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) { @@ -471,7 +866,7 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) { // If this is a negation, we can't split it up! if (BinaryOperator::isNeg(Sub)) return false; - + // Don't bother to break this up unless either the LHS is an associable add or // subtract or if this is only used by one. if (isReassociableOp(Sub->getOperand(0), Instruction::Add) || @@ -480,19 +875,18 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) { if (isReassociableOp(Sub->getOperand(1), Instruction::Add) || isReassociableOp(Sub->getOperand(1), Instruction::Sub)) return true; - if (Sub->hasOneUse() && + if (Sub->hasOneUse() && (isReassociableOp(Sub->use_back(), Instruction::Add) || isReassociableOp(Sub->use_back(), Instruction::Sub))) return true; - + return false; } /// BreakUpSubtract - If we have (X-Y), and if either X is an add, or if this is /// only used by an add, transform this into (X+(0-Y)) to promote better /// reassociation. -static Instruction *BreakUpSubtract(Instruction *Sub, - DenseMap<AssertingVH<Value>, unsigned> &ValueRankMap) { +static BinaryOperator *BreakUpSubtract(Instruction *Sub) { // Convert a subtract into an add and a neg instruction. This allows sub // instructions to be commuted with other add instructions. // @@ -500,15 +894,15 @@ static Instruction *BreakUpSubtract(Instruction *Sub, // and set it as the RHS of the add instruction we just made. // Value *NegVal = NegateValue(Sub->getOperand(1), Sub); - Instruction *New = + BinaryOperator *New = BinaryOperator::CreateAdd(Sub->getOperand(0), NegVal, "", Sub); + Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op. + Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op. New->takeName(Sub); // Everyone now refers to the add instruction. - ValueRankMap.erase(Sub); Sub->replaceAllUsesWith(New); New->setDebugLoc(Sub->getDebugLoc()); - Sub->eraseFromParent(); DEBUG(dbgs() << "Negated: " << *New << '\n'); return New; @@ -517,32 +911,23 @@ static Instruction *BreakUpSubtract(Instruction *Sub, /// ConvertShiftToMul - If this is a shift of a reassociable multiply or is used /// by one, change this into a multiply by a constant to assist with further /// reassociation. 
-static Instruction *ConvertShiftToMul(Instruction *Shl, - DenseMap<AssertingVH<Value>, unsigned> &ValueRankMap) { - // If an operand of this shift is a reassociable multiply, or if the shift - // is used by a reassociable multiply or add, turn into a multiply. - if (isReassociableOp(Shl->getOperand(0), Instruction::Mul) || - (Shl->hasOneUse() && - (isReassociableOp(Shl->use_back(), Instruction::Mul) || - isReassociableOp(Shl->use_back(), Instruction::Add)))) { - Constant *MulCst = ConstantInt::get(Shl->getType(), 1); - MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1))); - - Instruction *Mul = - BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl); - ValueRankMap.erase(Shl); - Mul->takeName(Shl); - Shl->replaceAllUsesWith(Mul); - Mul->setDebugLoc(Shl->getDebugLoc()); - Shl->eraseFromParent(); - return Mul; - } - return 0; +static BinaryOperator *ConvertShiftToMul(Instruction *Shl) { + Constant *MulCst = ConstantInt::get(Shl->getType(), 1); + MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1))); + + BinaryOperator *Mul = + BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl); + Shl->setOperand(0, UndefValue::get(Shl->getType())); // Drop use of op. + Mul->takeName(Shl); + Shl->replaceAllUsesWith(Mul); + Mul->setDebugLoc(Shl->getDebugLoc()); + return Mul; } -// Scan backwards and forwards among values with the same rank as element i to -// see if X exists. If X does not exist, return i. This is useful when -// scanning for 'x' when we see '-x' because they both get the same rank. +/// FindInOperandList - Scan backwards and forwards among values with the same +/// rank as element i to see if X exists. If X does not exist, return i. This +/// is useful when scanning for 'x' when we see '-x' because they both get the +/// same rank. static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i, Value *X) { unsigned XRank = Ops[i].Rank; @@ -559,24 +944,32 @@ static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i, /// EmitAddTreeOfValues - Emit a tree of add instructions, summing Ops together /// and returning the result. Insert the tree before I. -static Value *EmitAddTreeOfValues(Instruction *I, SmallVectorImpl<Value*> &Ops){ +static Value *EmitAddTreeOfValues(Instruction *I, + SmallVectorImpl<WeakVH> &Ops){ if (Ops.size() == 1) return Ops.back(); - + Value *V1 = Ops.back(); Ops.pop_back(); Value *V2 = EmitAddTreeOfValues(I, Ops); return BinaryOperator::CreateAdd(V2, V1, "tmp", I); } -/// RemoveFactorFromExpression - If V is an expression tree that is a +/// RemoveFactorFromExpression - If V is an expression tree that is a /// multiplication sequence, and if this sequence contains a multiply by Factor, /// remove Factor from the tree and return the new tree. 
Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { BinaryOperator *BO = isReassociableOp(V, Instruction::Mul); if (!BO) return 0; - + + SmallVector<RepeatedValue, 8> Tree; + MadeChange |= LinearizeExprTree(BO, Tree); SmallVector<ValueEntry, 8> Factors; - LinearizeExprTree(BO, Factors); + Factors.reserve(Tree.size()); + for (unsigned i = 0, e = Tree.size(); i != e; ++i) { + RepeatedValue E = Tree[i]; + Factors.append(E.second.getZExtValue(), + ValueEntry(getRank(E.first), E.first)); + } bool FoundFactor = false; bool NeedsNegate = false; @@ -586,7 +979,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { Factors.erase(Factors.begin()+i); break; } - + // If this is a negative version of this factor, remove it. if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor)) if (ConstantInt *FC2 = dyn_cast<ConstantInt>(Factors[i].Op)) @@ -596,29 +989,28 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { break; } } - + if (!FoundFactor) { // Make sure to restore the operands to the expression tree. RewriteExprTree(BO, Factors); return 0; } - + BasicBlock::iterator InsertPt = BO; ++InsertPt; - + // If this was just a single multiply, remove the multiply and return the only // remaining operand. if (Factors.size() == 1) { - ValueRankMap.erase(BO); - DeadInsts.push_back(BO); + RedoInsts.insert(BO); V = Factors[0].Op; } else { RewriteExprTree(BO, Factors); V = BO; } - + if (NeedsNegate) V = BinaryOperator::CreateNeg(V, "neg", InsertPt); - + return V; } @@ -628,31 +1020,16 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { /// Ops is the top-level list of add operands we're trying to factor. static void FindSingleUseMultiplyFactors(Value *V, SmallVectorImpl<Value*> &Factors, - const SmallVectorImpl<ValueEntry> &Ops, - bool IsRoot) { - BinaryOperator *BO; - if (!(V->hasOneUse() || V->use_empty()) || // More than one use. - !(BO = dyn_cast<BinaryOperator>(V)) || - BO->getOpcode() != Instruction::Mul) { + const SmallVectorImpl<ValueEntry> &Ops) { + BinaryOperator *BO = isReassociableOp(V, Instruction::Mul); + if (!BO) { Factors.push_back(V); return; } - - // If this value has a single use because it is another input to the add - // tree we're reassociating and we dropped its use, it actually has two - // uses and we can't factor it. - if (!IsRoot) { - for (unsigned i = 0, e = Ops.size(); i != e; ++i) - if (Ops[i].Op == V) { - Factors.push_back(V); - return; - } - } - - + // Otherwise, add the LHS and RHS to the list of factors. - FindSingleUseMultiplyFactors(BO->getOperand(1), Factors, Ops, false); - FindSingleUseMultiplyFactors(BO->getOperand(0), Factors, Ops, false); + FindSingleUseMultiplyFactors(BO->getOperand(1), Factors, Ops); + FindSingleUseMultiplyFactors(BO->getOperand(0), Factors, Ops); } /// OptimizeAndOrXor - Optimize a series of operands to an 'and', 'or', or 'xor' @@ -672,12 +1049,12 @@ static Value *OptimizeAndOrXor(unsigned Opcode, if (FoundX != i) { if (Opcode == Instruction::And) // ...&X&~X = 0 return Constant::getNullValue(X->getType()); - + if (Opcode == Instruction::Or) // ...|X|~X = -1 return Constant::getAllOnesValue(X->getType()); } } - + // Next, check for duplicate pairs of values, which we assume are next to // each other, due to our sorting criteria. assert(i < Ops.size()); @@ -689,12 +1066,12 @@ static Value *OptimizeAndOrXor(unsigned Opcode, ++NumAnnihil; continue; } - + // Drop pairs of values for Xor. 
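// (Editor's note.) E.g. "A ^ X ^ X" reduces to "A" here, while a bare
// "X ^ X" is the e == 2 case below and folds to zero.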
assert(Opcode == Instruction::Xor); if (e == 2) return Constant::getNullValue(Ops[0].Op->getType()); - + // Y ^ X^X -> Y Ops.erase(Ops.begin()+i, Ops.begin()+i+2); i -= 1; e -= 2; @@ -727,46 +1104,46 @@ Value *Reassociate::OptimizeAdd(Instruction *I, Ops.erase(Ops.begin()+i); ++NumFound; } while (i != Ops.size() && Ops[i].Op == TheOp); - + DEBUG(errs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n'); ++NumFactor; - + // Insert a new multiply. Value *Mul = ConstantInt::get(cast<IntegerType>(I->getType()), NumFound); Mul = BinaryOperator::CreateMul(TheOp, Mul, "factor", I); - + // Now that we have inserted a multiply, optimize it. This allows us to // handle cases that require multiple factoring steps, such as this: // (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6 - RedoInsts.push_back(Mul); - + RedoInsts.insert(cast<Instruction>(Mul)); + // If every add operand was a duplicate, return the multiply. if (Ops.empty()) return Mul; - + // Otherwise, we had some input that didn't have the dupe, such as // "A + A + B" -> "A*2 + B". Add the new multiply to the list of // things being added by this operation. Ops.insert(Ops.begin(), ValueEntry(getRank(Mul), Mul)); - + --i; e = Ops.size(); continue; } - + // Check for X and -X in the operand list. if (!BinaryOperator::isNeg(TheOp)) continue; - + Value *X = BinaryOperator::getNegArgument(TheOp); unsigned FoundX = FindInOperandList(Ops, i, X); if (FoundX == i) continue; - + // Remove X and -X from the operand list. if (Ops.size() == 2) return Constant::getNullValue(X->getType()); - + Ops.erase(Ops.begin()+i); if (i < FoundX) --FoundX; @@ -777,37 +1154,37 @@ Value *Reassociate::OptimizeAdd(Instruction *I, --i; // Revisit element. e -= 2; // Removed two elements. } - + // Scan the operand list, checking to see if there are any common factors // between operands. Consider something like A*A+A*B*C+D. We would like to // reassociate this to A*(A+B*C)+D, which reduces the number of multiplies. // To efficiently find this, we count the number of times a factor occurs // for any ADD operands that are MULs. DenseMap<Value*, unsigned> FactorOccurrences; - + // Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4) // where they are actually the same multiply. unsigned MaxOcc = 0; Value *MaxOccVal = 0; for (unsigned i = 0, e = Ops.size(); i != e; ++i) { - BinaryOperator *BOp = dyn_cast<BinaryOperator>(Ops[i].Op); - if (BOp == 0 || BOp->getOpcode() != Instruction::Mul || !BOp->use_empty()) + BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul); + if (!BOp) continue; - + // Compute all of the factors of this added value. SmallVector<Value*, 8> Factors; - FindSingleUseMultiplyFactors(BOp, Factors, Ops, true); + FindSingleUseMultiplyFactors(BOp, Factors, Ops); assert(Factors.size() > 1 && "Bad linearize!"); - + // Add one to FactorOccurrences for each unique factor in this op. SmallPtrSet<Value*, 8> Duplicates; for (unsigned i = 0, e = Factors.size(); i != e; ++i) { Value *Factor = Factors[i]; if (!Duplicates.insert(Factor)) continue; - + unsigned Occ = ++FactorOccurrences[Factor]; if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; } - + // If Factor is a negative constant, add the negated value as a factor // because we can percolate the negate out. Watch for minint, which // cannot be positivified. 
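// (Editor's illustrative aside.) E.g. given the operands A*(-2) and B*2, the
// constant -2 is also counted as +2, so the factor 2 occurs twice and the sum
// can be factored as 2*(B - A). INT_MIN is skipped because negating it wraps
// back to INT_MIN.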
@@ -816,13 +1193,13 @@ Value *Reassociate::OptimizeAdd(Instruction *I, Factor = ConstantInt::get(CI->getContext(), -CI->getValue()); assert(!Duplicates.count(Factor) && "Shouldn't have two constant factors, missed a canonicalize"); - + unsigned Occ = ++FactorOccurrences[Factor]; if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; } } } } - + // If any factor occurred more than one time, we can pull it out. if (MaxOcc > 1) { DEBUG(errs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n'); @@ -830,16 +1207,16 @@ // Create a new instruction that uses the MaxOccVal twice. If we don't do // this, we could otherwise run into situations where removing a factor - // from an expression will drop a use of maxocc, and this can cause + // from an expression will drop a use of maxocc, and this can cause // RemoveFactorFromExpression on successive values to behave differently. Instruction *DummyInst = BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal); - SmallVector<Value*, 4> NewMulOps; + SmallVector<WeakVH, 4> NewMulOps; for (unsigned i = 0; i != Ops.size(); ++i) { // Only try to remove factors from expressions we're allowed to. - BinaryOperator *BOp = dyn_cast<BinaryOperator>(Ops[i].Op); - if (BOp == 0 || BOp->getOpcode() != Instruction::Mul || !BOp->use_empty()) + BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul); + if (!BOp) continue; - + if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) { // The factorized operand may occur several times. Convert them all in // one fell swoop. @@ -853,7 +1230,7 @@ --i; } } - + // No need for extra uses anymore. delete DummyInst; @@ -865,26 +1242,201 @@ // A*A*B + A*A*C --> A*(A*B+A*C) --> A*(A*(B+C)) assert(NumAddedValues > 1 && "Each occurrence should contribute a value"); (void)NumAddedValues; - V = ReassociateExpression(cast<BinaryOperator>(V)); + if (Instruction *VI = dyn_cast<Instruction>(V)) + RedoInsts.insert(VI); // Create the multiply. - Value *V2 = BinaryOperator::CreateMul(V, MaxOccVal, "tmp", I); + Instruction *V2 = BinaryOperator::CreateMul(V, MaxOccVal, "tmp", I); // Rerun associate on the multiply in case the inner expression turned into // a multiply. We want to make sure that we keep things in canonical form. - V2 = ReassociateExpression(cast<BinaryOperator>(V2)); - + RedoInsts.insert(V2); + // If every add operand included the factor (e.g. "A*B + A*C"), then the // entire result expression is just the multiply "A*(B+C)". if (Ops.empty()) return V2; - + // Otherwise, we had some input that didn't have the factor, such as // "A*B + A*C + D" -> "A*(B+C) + D". Add the new multiply to the list of // things being added by this operation. Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2)); } - + + return 0; +} + +namespace { + /// \brief Predicate tests whether a ValueEntry's op is in a map. + struct IsValueInMap { + const DenseMap<Value *, unsigned> &Map; + + IsValueInMap(const DenseMap<Value *, unsigned> &Map) : Map(Map) {} + + bool operator()(const ValueEntry &Entry) { + return Map.find(Entry.Op) != Map.end(); + } + }; +} + +/// \brief Build up a vector of value/power pairs factoring a product. +/// +/// Given a series of multiplication operands, build a vector of factors and +/// the powers each is raised to when forming the final product. Sort them in +/// the order of descending power.
+/// +/// (x*x) -> [(x, 2)] +/// ((x*x)*x) -> [(x, 3)] +/// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)] +/// +/// \returns Whether any factors have a power greater than one. +bool Reassociate::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops, + SmallVectorImpl<Factor> &Factors) { + // FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this. + // Compute the sum of powers of simplifiable factors. + unsigned FactorPowerSum = 0; + for (unsigned Idx = 1, Size = Ops.size(); Idx < Size; ++Idx) { + Value *Op = Ops[Idx-1].Op; + + // Count the number of occurrences of this value. + unsigned Count = 1; + for (; Idx < Size && Ops[Idx].Op == Op; ++Idx) + ++Count; + // Track for simplification all factors which occur 2 or more times. + if (Count > 1) + FactorPowerSum += Count; + } + + // We can only simplify factors if the sum of the powers of our simplifiable + // factors is 4 or higher. When that is the case, we will *always* have + // a simplification. This is an important invariant to prevent cyclically + // trying to simplify already minimal formations. + if (FactorPowerSum < 4) + return false; + + // Now gather the simplifiable factors, removing them from Ops. + FactorPowerSum = 0; + for (unsigned Idx = 1; Idx < Ops.size(); ++Idx) { + Value *Op = Ops[Idx-1].Op; + + // Count the number of occurrences of this value. + unsigned Count = 1; + for (; Idx < Ops.size() && Ops[Idx].Op == Op; ++Idx) + ++Count; + if (Count == 1) + continue; + // Move an even number of occurrences to Factors. + Count &= ~1U; + Idx -= Count; + FactorPowerSum += Count; + Factors.push_back(Factor(Op, Count)); + Ops.erase(Ops.begin()+Idx, Ops.begin()+Idx+Count); + } + + // None of the adjustments above should have reduced the sum of factor powers + // below our minimum of '4'. + assert(FactorPowerSum >= 4); + + std::sort(Factors.begin(), Factors.end(), Factor::PowerDescendingSorter()); + return true; +} + +/// \brief Build a tree of multiplies, computing the product of Ops. +static Value *buildMultiplyTree(IRBuilder<> &Builder, + SmallVectorImpl<Value*> &Ops) { + if (Ops.size() == 1) + return Ops.back(); + + Value *LHS = Ops.pop_back_val(); + do { + LHS = Builder.CreateMul(LHS, Ops.pop_back_val()); + } while (!Ops.empty()); + + return LHS; +} + +/// \brief Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*... +/// +/// Given a vector of values raised to various powers, where no two values are +/// equal and the powers are sorted in decreasing order, compute the minimal +/// DAG of multiplies to compute the final product, and return that product +/// value. +Value *Reassociate::buildMinimalMultiplyDAG(IRBuilder<> &Builder, + SmallVectorImpl<Factor> &Factors) { + assert(Factors[0].Power); + SmallVector<Value *, 4> OuterProduct; + for (unsigned LastIdx = 0, Idx = 1, Size = Factors.size(); + Idx < Size && Factors[Idx].Power > 0; ++Idx) { + if (Factors[Idx].Power != Factors[LastIdx].Power) { + LastIdx = Idx; + continue; + } + + // We want to multiply across all the factors with the same power so that + // we can raise them to that power as a single entity. Build a mini tree + // for that. + SmallVector<Value *, 4> InnerProduct; + InnerProduct.push_back(Factors[LastIdx].Base); + do { + InnerProduct.push_back(Factors[Idx].Base); + ++Idx; + } while (Idx < Size && Factors[Idx].Power == Factors[LastIdx].Power); + + // Reset the base value of the first factor to the new expression tree. + // We'll remove all the factors with the same power in a second pass.
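// (Editor's worked example, not part of the patch.) For Factors =
// [(x,3), (y,3), (z,2)], the equal-power run x, y folds into ((x*y), 3),
// leaving [(x*y, 3), (z, 2)]. The odd power contributes x*y to the outer
// product, the powers halve to [(x*y, 1), (z, 1)], and the recursive call
// yields (x*y)*z, so the result is (x*y) * ((x*y)*z) * ((x*y)*z)
// = x^3 * y^3 * z^2 in 4 multiplies instead of the 7 a linear chain needs.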
+ Value *M = Factors[LastIdx].Base = buildMultiplyTree(Builder, InnerProduct); + if (Instruction *MI = dyn_cast<Instruction>(M)) + RedoInsts.insert(MI); + + LastIdx = Idx; + } + // Unique factors with equal powers -- we've folded them into the first one's + // base. + Factors.erase(std::unique(Factors.begin(), Factors.end(), + Factor::PowerEqual()), + Factors.end()); + + // Iteratively collect the base of each factor with an odd power into the + // outer product, and halve each power in preparation for squaring the + // expression. + for (unsigned Idx = 0, Size = Factors.size(); Idx != Size; ++Idx) { + if (Factors[Idx].Power & 1) + OuterProduct.push_back(Factors[Idx].Base); + Factors[Idx].Power >>= 1; + } + if (Factors[0].Power) { + Value *SquareRoot = buildMinimalMultiplyDAG(Builder, Factors); + OuterProduct.push_back(SquareRoot); + OuterProduct.push_back(SquareRoot); + } + if (OuterProduct.size() == 1) + return OuterProduct.front(); + + Value *V = buildMultiplyTree(Builder, OuterProduct); + return V; +} + +Value *Reassociate::OptimizeMul(BinaryOperator *I, + SmallVectorImpl<ValueEntry> &Ops) { + // We can only optimize the multiplies when there is a chain of more than + // three, such that a balanced tree might require fewer total multiplies. + if (Ops.size() < 4) + return 0; + + // Try to turn linear trees of multiplies without other uses of the + // intermediate stages into minimal multiply DAGs with perfect sub-expression + // re-use. + SmallVector<Factor, 4> Factors; + if (!collectMultiplyFactors(Ops, Factors)) + return 0; // All distinct factors, so nothing left for us to do. + + IRBuilder<> Builder(I); + Value *V = buildMinimalMultiplyDAG(Builder, Factors); + if (Ops.empty()) + return V; + + ValueEntry NewEntry = ValueEntry(getRank(V), V); + Ops.insert(std::lower_bound(Ops.begin(), Ops.end(), NewEntry), NewEntry); return 0; } @@ -892,95 +1444,105 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops) { // Now that we have the linearized expression tree, try to optimize it. // Start by folding any constants that we found. - bool IterateOptimization = false; if (Ops.size() == 1) return Ops[0].Op; unsigned Opcode = I->getOpcode(); - - if (Constant *V1 = dyn_cast<Constant>(Ops[Ops.size()-2].Op)) - if (Constant *V2 = dyn_cast<Constant>(Ops.back().Op)) { - Ops.pop_back(); - Ops.back().Op = ConstantExpr::get(Opcode, V1, V2); - return OptimizeExpression(I, Ops); - } - - // Check for destructive annihilation due to a constant being used. - if (ConstantInt *CstVal = dyn_cast<ConstantInt>(Ops.back().Op)) - switch (Opcode) { - default: break; - case Instruction::And: - if (CstVal->isZero()) // X & 0 -> 0 - return CstVal; - if (CstVal->isAllOnesValue()) // X & -1 -> X - Ops.pop_back(); - break; - case Instruction::Mul: - if (CstVal->isZero()) { // X * 0 -> 0 - ++NumAnnihil; - return CstVal; - } - - if (cast<ConstantInt>(CstVal)->isOne()) - Ops.pop_back(); // X * 1 -> X - break; - case Instruction::Or: - if (CstVal->isAllOnesValue()) // X | -1 -> -1 - return CstVal; - // FALLTHROUGH! - case Instruction::Add: - case Instruction::Xor: - if (CstVal->isZero()) // X [|^+] 0 -> X - Ops.pop_back(); - break; - } - if (Ops.size() == 1) return Ops[0].Op; // Handle destructive annihilation due to identities between elements in the // argument list here.
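// (Editor's note.) "Destructive annihilation" covers folds such as
// X & ~X -> 0, X | ~X -> -1, X ^ X -> 0 and X + (-X) -> 0, all dispatched
// by opcode in the switch below.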
+ unsigned NumOps = Ops.size(); switch (Opcode) { default: break; case Instruction::And: case Instruction::Or: - case Instruction::Xor: { - unsigned NumOps = Ops.size(); + case Instruction::Xor: if (Value *Result = OptimizeAndOrXor(Opcode, Ops)) return Result; - IterateOptimization |= Ops.size() != NumOps; break; - } - case Instruction::Add: { - unsigned NumOps = Ops.size(); + case Instruction::Add: if (Value *Result = OptimizeAdd(I, Ops)) return Result; - IterateOptimization |= Ops.size() != NumOps; - } + break; + case Instruction::Mul: + if (Value *Result = OptimizeMul(I, Ops)) + return Result; break; - //case Instruction::Mul: } - if (IterateOptimization) + if (Ops.size() != NumOps) return OptimizeExpression(I, Ops); return 0; } +/// EraseInst - Zap the given instruction, adding interesting operands to the +/// work list. +void Reassociate::EraseInst(Instruction *I) { + assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!"); + SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end()); + // Erase the dead instruction. + ValueRankMap.erase(I); + RedoInsts.remove(I); + I->eraseFromParent(); + // Optimize its operands. + SmallPtrSet<Instruction *, 8> Visited; // Detect self-referential nodes. + for (unsigned i = 0, e = Ops.size(); i != e; ++i) + if (Instruction *Op = dyn_cast<Instruction>(Ops[i])) { + // If this is a node in an expression tree, climb to the expression root + // and add that since that's where optimization actually happens. + unsigned Opcode = Op->getOpcode(); + while (Op->hasOneUse() && Op->use_back()->getOpcode() == Opcode && + Visited.insert(Op)) + Op = Op->use_back(); + RedoInsts.insert(Op); + } +} + +/// OptimizeInst - Inspect and optimize the given instruction. Note that erasing +/// instructions is not allowed. +void Reassociate::OptimizeInst(Instruction *I) { + // Only consider operations that we understand. + if (!isa<BinaryOperator>(I)) + return; -/// ReassociateInst - Inspect and reassociate the instruction at the -/// given position, post-incrementing the position. -void Reassociate::ReassociateInst(BasicBlock::iterator &BBI) { - Instruction *BI = BBI++; - if (BI->getOpcode() == Instruction::Shl && - isa<ConstantInt>(BI->getOperand(1))) - if (Instruction *NI = ConvertShiftToMul(BI, ValueRankMap)) { + if (I->getOpcode() == Instruction::Shl && + isa<ConstantInt>(I->getOperand(1))) + // If an operand of this shift is a reassociable multiply, or if the shift + // is used by a reassociable multiply or add, turn into a multiply. + if (isReassociableOp(I->getOperand(0), Instruction::Mul) || + (I->hasOneUse() && + (isReassociableOp(I->use_back(), Instruction::Mul) || + isReassociableOp(I->use_back(), Instruction::Add)))) { + Instruction *NI = ConvertShiftToMul(I); + RedoInsts.insert(I); MadeChange = true; - BI = NI; + I = NI; + } + + // Floating point binary operators are not associative, but we can still + // commute (some) of them, to canonicalize the order of their operands. + // This can potentially expose more CSE opportunities, and makes writing + // other transformations simpler. + if ((I->getType()->isFloatingPointTy() || I->getType()->isVectorTy())) { + // FAdd and FMul can be commuted. + if (I->getOpcode() != Instruction::FMul && + I->getOpcode() != Instruction::FAdd) + return; + + Value *LHS = I->getOperand(0); + Value *RHS = I->getOperand(1); + unsigned LHSRank = getRank(LHS); + unsigned RHSRank = getRank(RHS); + + // Sort the operands by rank. 
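// (Editor's note.) Ranks impose a deterministic total order, so e.g.
// "fmul %b, %a" and "fmul %a, %b" elsewhere in the function end up with the
// same operand order, exposing them to CSE even though FP operations are
// never reassociated here.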
+ if (RHSRank < LHSRank) { + I->setOperand(0, RHS); + I->setOperand(1, LHS); } - // Reject cases where it is pointless to do this. - if (!isa<BinaryOperator>(BI) || BI->getType()->isFloatingPointTy() || - BI->getType()->isVectorTy()) - return; // Floating point ops are not associative. + return; + } // Do not reassociate boolean (i1) expressions. We want to preserve the // original order of evaluation for short-circuited comparisons that @@ -988,58 +1550,66 @@ void Reassociate::ReassociateInst(BasicBlock::iterator &BBI) { // is not further optimized, it is likely to be transformed back to a // short-circuited form for code gen, and the source order may have been // optimized for the most likely conditions. - if (BI->getType()->isIntegerTy(1)) + if (I->getType()->isIntegerTy(1)) return; // If this is a subtract instruction which is not already in negate form, // see if we can convert it to X+-Y. - if (BI->getOpcode() == Instruction::Sub) { - if (ShouldBreakUpSubtract(BI)) { - BI = BreakUpSubtract(BI, ValueRankMap); - // Reset the BBI iterator in case BreakUpSubtract changed the - // instruction it points to. - BBI = BI; - ++BBI; + if (I->getOpcode() == Instruction::Sub) { + if (ShouldBreakUpSubtract(I)) { + Instruction *NI = BreakUpSubtract(I); + RedoInsts.insert(I); MadeChange = true; - } else if (BinaryOperator::isNeg(BI)) { + I = NI; + } else if (BinaryOperator::isNeg(I)) { // Otherwise, this is a negation. See if the operand is a multiply tree // and if this is not an inner node of a multiply tree. - if (isReassociableOp(BI->getOperand(1), Instruction::Mul) && - (!BI->hasOneUse() || - !isReassociableOp(BI->use_back(), Instruction::Mul))) { - BI = LowerNegateToMultiply(BI, ValueRankMap); + if (isReassociableOp(I->getOperand(1), Instruction::Mul) && + (!I->hasOneUse() || + !isReassociableOp(I->use_back(), Instruction::Mul))) { + Instruction *NI = LowerNegateToMultiply(I); + RedoInsts.insert(I); MadeChange = true; + I = NI; } } } - // If this instruction is a commutative binary operator, process it. - if (!BI->isAssociative()) return; - BinaryOperator *I = cast<BinaryOperator>(BI); + // If this instruction is an associative binary operator, process it. + if (!I->isAssociative()) return; + BinaryOperator *BO = cast<BinaryOperator>(I); // If this is an interior node of a reassociable tree, ignore it until we // get to the root of the tree, to avoid N^2 analysis. - if (I->hasOneUse() && isReassociableOp(I->use_back(), I->getOpcode())) + unsigned Opcode = BO->getOpcode(); + if (BO->hasOneUse() && BO->use_back()->getOpcode() == Opcode) return; - // If this is an add tree that is used by a sub instruction, ignore it + // If this is an add tree that is used by a sub instruction, ignore it // until we process the subtract. - if (I->hasOneUse() && I->getOpcode() == Instruction::Add && - cast<Instruction>(I->use_back())->getOpcode() == Instruction::Sub) + if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add && + cast<Instruction>(BO->use_back())->getOpcode() == Instruction::Sub) return; - ReassociateExpression(I); + ReassociateExpression(BO); } -Value *Reassociate::ReassociateExpression(BinaryOperator *I) { - +void Reassociate::ReassociateExpression(BinaryOperator *I) { + // First, walk the expression tree, linearizing the tree, collecting the // operand information. 
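// (Editor's note.) Each RepeatedValue (V, W) returned by LinearizeExprTree
// is expanded below into W identical ValueEntry elements, so the sorting and
// optimization code can stay oblivious to weights.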
+ SmallVector<RepeatedValue, 8> Tree; + MadeChange |= LinearizeExprTree(I, Tree); SmallVector<ValueEntry, 8> Ops; - LinearizeExprTree(I, Ops); - + Ops.reserve(Tree.size()); + for (unsigned i = 0, e = Tree.size(); i != e; ++i) { + RepeatedValue E = Tree[i]; + Ops.append(E.second.getZExtValue(), + ValueEntry(getRank(E.first), E.first)); + } + DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n'); - + // Now that we have linearized the tree to a list and have gathered all of // the operands and their ranks, sort the operands by their rank. Use a // stable_sort so that values with equal ranks will have their relative @@ -1047,21 +1617,24 @@ Value *Reassociate::ReassociateExpression(BinaryOperator *I) { // this sorts so that the highest ranking values end up at the beginning of // the vector. std::stable_sort(Ops.begin(), Ops.end()); - + // OptimizeExpression - Now that we have the expression tree in a convenient // sorted form, optimize it globally if possible. if (Value *V = OptimizeExpression(I, Ops)) { + if (V == I) + // Self-referential expression in unreachable code. + return; // This expression tree simplified to something that isn't a tree, // eliminate it. DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n'); I->replaceAllUsesWith(V); if (Instruction *VI = dyn_cast<Instruction>(V)) VI->setDebugLoc(I->getDebugLoc()); - RemoveDeadBinaryOp(I); + RedoInsts.insert(I); ++NumAnnihil; - return V; + return; } - + // We want to sink immediates as deeply as possible except in the case where // this is a multiply tree used only by an add, and the immediate is a -1. // In this case we reassociate to put the negation on the outside so that we @@ -1073,51 +1646,57 @@ Value *Reassociate::ReassociateExpression(BinaryOperator *I) { ValueEntry Tmp = Ops.pop_back_val(); Ops.insert(Ops.begin(), Tmp); } - + DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n'); - + if (Ops.size() == 1) { + if (Ops[0].Op == I) + // Self-referential expression in unreachable code. + return; + // This expression tree simplified to something that isn't a tree, // eliminate it. I->replaceAllUsesWith(Ops[0].Op); if (Instruction *OI = dyn_cast<Instruction>(Ops[0].Op)) OI->setDebugLoc(I->getDebugLoc()); - RemoveDeadBinaryOp(I); - return Ops[0].Op; + RedoInsts.insert(I); + return; } - + // Now that we ordered and optimized the expressions, splat them back into // the expression tree, removing any unneeded nodes. RewriteExprTree(I, Ops); - return I; } - bool Reassociate::runOnFunction(Function &F) { - // Recalculate the rank map for F + // Calculate the rank map for F BuildRankMap(F); MadeChange = false; - for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) - for (BasicBlock::iterator BBI = FI->begin(); BBI != FI->end(); ) - ReassociateInst(BBI); - - // Now that we're done, revisit any instructions which are likely to - // have secondary reassociation opportunities. - while (!RedoInsts.empty()) - if (Value *V = RedoInsts.pop_back_val()) { - BasicBlock::iterator BBI = cast<Instruction>(V); - ReassociateInst(BBI); - } + for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { + // Optimize every instruction in the basic block. + for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; ) + if (isInstructionTriviallyDead(II)) { + EraseInst(II++); + } else { + OptimizeInst(II); + assert(II->getParent() == BI && "Moved to a different block!"); + ++II; + } - // Now that we're done, delete any instructions which are no longer used. 
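+    // EraseInst queues the roots of any affected expression trees on
+    // RedoInsts, so the old separate DeadInsts sweep is no longer needed;
+    // dead instructions are erased as the worklist below is drained.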
- while (!DeadInsts.empty()) - if (Value *V = DeadInsts.pop_back_val()) - RecursivelyDeleteTriviallyDeadInstructions(V); + // If this produced extra instructions to optimize, handle them now. + while (!RedoInsts.empty()) { + Instruction *I = RedoInsts.pop_back_val(); + if (isInstructionTriviallyDead(I)) + EraseInst(I); + else + OptimizeInst(I); + } + } // We are done with the rank map. RankMap.clear(); ValueRankMap.clear(); + return MadeChange; } - diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp index 47afc77..ea1de63 100644 --- a/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/lib/Transforms/Scalar/Reg2Mem.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file demotes all registers to memory references. It is intented to be +// This file demotes all registers to memory references. It is intended to be // the inverse of PromoteMemoryToRegister. By converting to loads, the only // values live across basic blocks are allocas and loads before phi nodes. // It is intended that this should make CFG hacking much easier. @@ -59,7 +59,7 @@ namespace { virtual bool runOnFunction(Function &F); }; } - + char RegToMem::ID = 0; INITIALIZE_PASS_BEGIN(RegToMem, "reg2mem", "Demote all values to stack slots", false, false) @@ -68,25 +68,25 @@ INITIALIZE_PASS_END(RegToMem, "reg2mem", "Demote all values to stack slots", false, false) bool RegToMem::runOnFunction(Function &F) { - if (F.isDeclaration()) + if (F.isDeclaration()) return false; - + // Insert all new allocas into entry block. BasicBlock *BBEntry = &F.getEntryBlock(); assert(pred_begin(BBEntry) == pred_end(BBEntry) && "Entry block to function must not have predecessors!"); - + // Find first non-alloca instruction and create insertion point. This is // safe if the block is well-formed: it always has a terminator; otherwise // we'll get an assertion. BasicBlock::iterator I = BBEntry->begin(); while (isa<AllocaInst>(I)) ++I; - + CastInst *AllocaInsertionPoint = new BitCastInst(Constant::getNullValue(Type::getInt32Ty(F.getContext())), Type::getInt32Ty(F.getContext()), "reg2mem alloca point", I); - + // Find the escaped instructions. But don't create stack slots for // allocas in entry block. std::list<Instruction*> WorkList; @@ -99,15 +99,15 @@ bool RegToMem::runOnFunction(Function &F) { WorkList.push_front(&*iib); } } - + // Demote escaped instructions NumRegsDemoted += WorkList.size(); - for (std::list<Instruction*>::iterator ilb = WorkList.begin(), + for (std::list<Instruction*>::iterator ilb = WorkList.begin(), ile = WorkList.end(); ilb != ile; ++ilb) DemoteRegToStack(**ilb, false, AllocaInsertionPoint); - + WorkList.clear(); - + // Find all phi's for (Function::iterator ibb = F.begin(), ibe = F.end(); ibb != ibe; ++ibb) @@ -115,19 +115,18 @@ bool RegToMem::runOnFunction(Function &F) { iib != iie; ++iib) if (isa<PHINode>(iib)) WorkList.push_front(&*iib); - + // Demote phi nodes NumPhisDemoted += WorkList.size(); - for (std::list<Instruction*>::iterator ilb = WorkList.begin(), + for (std::list<Instruction*>::iterator ilb = WorkList.begin(), ile = WorkList.end(); ilb != ile; ++ilb) DemotePHIToStack(cast<PHINode>(*ilb), AllocaInsertionPoint); - + return true; } // createDemoteRegisterToMemory - Provide an entry point to create this pass.
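For reference, the demotion this pass performs can be sketched with the public utilities alone. A minimal illustration (not part of the patch; usedOutsideDefiningBlock is a simplified stand-in for the pass's escape test, which additionally attributes phi uses to their incoming blocks):

    #include "llvm/Function.h"
    #include "llvm/Instructions.h"
    #include "llvm/Transforms/Utils/Local.h"
    #include <vector>
    using namespace llvm;

    // Simplified escape test (illustration only): a value must live in memory
    // if any user sits in another block.
    static bool usedOutsideDefiningBlock(Instruction &I) {
      for (Value::use_iterator U = I.use_begin(), E = I.use_end(); U != E; ++U)
        if (cast<Instruction>(*U)->getParent() != I.getParent())
          return true;
      return false;
    }

    static void demoteEscapees(Function &F, Instruction *AllocaInsertPt) {
      std::vector<Instruction*> Worklist; // collect first; demotion rewrites uses
      for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE; ++BB)
        for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); I != IE; ++I)
          if (!isa<AllocaInst>(I) && usedOutsideDefiningBlock(*I))
            Worklist.push_back(&*I);
      for (unsigned i = 0, e = Worklist.size(); i != e; ++i)
        // Creates one alloca, a store after the definition, and a load in
        // front of each use.
        DemoteRegToStack(*Worklist[i], false, AllocaInsertPt);
    }

Phi nodes themselves are demoted separately with DemotePHIToStack, exactly as in runOnFunction above.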
-// char &llvm::DemoteRegisterToMemoryID = RegToMem::ID; FunctionPass *llvm::createDemoteRegisterToMemoryPass() { return new RegToMem(); diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 16b64a5..2c39aab 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -409,7 +409,7 @@ private: if (Constant *C = dyn_cast<Constant>(V)) { Constant *Elt = C->getAggregateElement(i); - + if (Elt == 0) LV.markOverdefined(); // Unknown sort of constant. else if (isa<UndefValue>(Elt)) diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 7d65bcc..48318c8 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file implements common infrastructure for libLLVMScalarOpts.a, which +// This file implements common infrastructure for libLLVMScalarOpts.a, which // implements several scalar transformations over the LLVM intermediate // representation, including the C bindings for that library. // @@ -24,7 +24,7 @@ using namespace llvm; -/// initializeScalarOptsPasses - Initialize all passes linked into the +/// initializeScalarOptsPasses - Initialize all passes linked into the /// ScalarOpts library. void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeADCEPass(Registry); diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index 026fea1..ec835b1 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -22,33 +22,34 @@ #define DEBUG_TYPE "scalarrepl" #include "llvm/Transforms/Scalar.h" #include "llvm/Constants.h" +#include "llvm/DIBuilder.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" #include "llvm/GlobalVariable.h" +#include "llvm/IRBuilder.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" #include "llvm/Module.h" +#include "llvm/Operator.h" #include "llvm/Pass.h" -#include "llvm/Analysis/DebugInfo.h" -#include "llvm/Analysis/DIBuilder.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Transforms/Utils/PromoteMemToReg.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GetElementPtrTypeIterator.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; STATISTIC(NumReplaced, "Number of allocas broken up"); @@ -59,12 +60,25 @@ STATISTIC(NumGlobals, "Number of allocas copied from constant global"); namespace { struct SROA : public FunctionPass { - SROA(int T, bool hasDT, char &ID) + SROA(int T, bool hasDT, char &ID, int ST, int AT, int SLT) : FunctionPass(ID), HasDomTree(hasDT) { if (T == -1) SRThreshold = 128; else SRThreshold = T; + if 
(ST == -1) + StructMemberThreshold = 32; + else + StructMemberThreshold = ST; + if (AT == -1) + ArrayElementThreshold = 8; + else + ArrayElementThreshold = AT; + if (SLT == -1) + // Do not limit the scalar integer load size if no threshold is given. + ScalarLoadThreshold = -1; + else + ScalarLoadThreshold = SLT; } bool runOnFunction(Function &F); @@ -86,11 +100,11 @@ namespace { struct AllocaInfo { /// The alloca to promote. AllocaInst *AI; - + /// CheckedPHIs - This is a set of verified PHI nodes, to prevent infinite /// looping and avoid redundant work. SmallPtrSet<PHINode*, 8> CheckedPHIs; - + /// isUnsafe - This is set to true if the alloca cannot be SROA'd. bool isUnsafe : 1; @@ -104,19 +118,32 @@ namespace { /// ever accessed, or false if the alloca is only accessed with mem /// intrinsics or load/store that only access the entire alloca at once. bool hasSubelementAccess : 1; - + /// hasALoadOrStore - This is true if there are any loads or stores to it. /// The alloca may just be accessed with memcpy, for example, which would /// not set this. bool hasALoadOrStore : 1; - + explicit AllocaInfo(AllocaInst *ai) : AI(ai), isUnsafe(false), isMemCpySrc(false), isMemCpyDst(false), hasSubelementAccess(false), hasALoadOrStore(false) {} }; + /// SRThreshold - The maximum alloca size to be considered for SROA. unsigned SRThreshold; + /// StructMemberThreshold - The maximum number of members a struct can + /// contain to be considered for SROA. + unsigned StructMemberThreshold; + + /// ArrayElementThreshold - The maximum number of elements an array can + /// have to be considered for SROA. + unsigned ArrayElementThreshold; + + /// ScalarLoadThreshold - The maximum size in bits of scalars to load when + /// converting to a scalar. + unsigned ScalarLoadThreshold; + void MarkUnsafe(AllocaInfo &I, Instruction *User) { I.isUnsafe = true; DEBUG(dbgs() << "  Transformation preventing inst: " << *User << '\n'); @@ -155,19 +182,21 @@ namespace { SmallVector<AllocaInst*, 32> &NewElts); void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, SmallVector<AllocaInst*, 32> &NewElts); + bool ShouldAttemptScalarRepl(AllocaInst *AI); static MemTransferInst *isOnlyCopiedFromConstantGlobal( AllocaInst *AI, SmallVector<Instruction*, 4> &ToDelete); }; - + // SROA_DT - SROA that uses DominatorTree. struct SROA_DT : public SROA { static char ID; public: - SROA_DT(int T = -1) : SROA(T, true, ID) { + SROA_DT(int T = -1, int ST = -1, int AT = -1, int SLT = -1) : + SROA(T, true, ID, ST, AT, SLT) { initializeSROA_DTPass(*PassRegistry::getPassRegistry()); } - + // getAnalysisUsage - This pass does not require any passes, but we know it // will not alter the CFG, so say so. virtual void getAnalysisUsage(AnalysisUsage &AU) const { @@ -175,22 +204,23 @@ namespace { AU.setPreservesCFG(); } }; - + // SROA_SSAUp - SROA that uses SSAUpdater. struct SROA_SSAUp : public SROA { static char ID; public: - SROA_SSAUp(int T = -1) : SROA(T, false, ID) { + SROA_SSAUp(int T = -1, int ST = -1, int AT = -1, int SLT = -1) : + SROA(T, false, ID, ST, AT, SLT) { initializeSROA_SSAUpPass(*PassRegistry::getPassRegistry()); } - + // getAnalysisUsage - This pass does not require any passes, but we know it // will not alter the CFG, so say so.
virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); } }; - + } char SROA_DT::ID = 0; @@ -209,10 +239,15 @@ INITIALIZE_PASS_END(SROA_SSAUp, "scalarrepl-ssa", // Public interface to the ScalarReplAggregates pass FunctionPass *llvm::createScalarReplAggregatesPass(int Threshold, - bool UseDomTree) { + bool UseDomTree, + int StructMemberThreshold, + int ArrayElementThreshold, + int ScalarLoadThreshold) { if (UseDomTree) - return new SROA_DT(Threshold); - return new SROA_SSAUp(Threshold); + return new SROA_DT(Threshold, StructMemberThreshold, ArrayElementThreshold, + ScalarLoadThreshold); + return new SROA_SSAUp(Threshold, StructMemberThreshold, + ArrayElementThreshold, ScalarLoadThreshold); } @@ -228,6 +263,7 @@ class ConvertToScalarInfo { /// AllocaSize - The size of the alloca being considered in bytes. unsigned AllocaSize; const TargetData &TD; + unsigned ScalarLoadThreshold; /// IsNotTrivial - This is set to true if there is some access to the object /// which means that mem2reg can't promote it. @@ -258,28 +294,38 @@ class ConvertToScalarInfo { /// isn't possible to turn into a vector type, it gets set to VoidTy. VectorType *VectorTy; - /// HadNonMemTransferAccess - True if there is at least one access to the + /// HadNonMemTransferAccess - True if there is at least one access to the /// alloca that is not a MemTransferInst. We don't want to turn structs into /// large integers unless there is some potential for optimization. bool HadNonMemTransferAccess; + /// HadDynamicAccess - True if some element of this alloca was dynamic. + /// We don't yet have support for turning a dynamic access into a large + /// integer. + bool HadDynamicAccess; + public: - explicit ConvertToScalarInfo(unsigned Size, const TargetData &td) - : AllocaSize(Size), TD(td), IsNotTrivial(false), ScalarKind(Unknown), - VectorTy(0), HadNonMemTransferAccess(false) { } + explicit ConvertToScalarInfo(unsigned Size, const TargetData &td, + unsigned SLT) + : AllocaSize(Size), TD(td), ScalarLoadThreshold(SLT), IsNotTrivial(false), + ScalarKind(Unknown), VectorTy(0), HadNonMemTransferAccess(false), + HadDynamicAccess(false) { } AllocaInst *TryConvert(AllocaInst *AI); private: - bool CanConvertToScalar(Value *V, uint64_t Offset); + bool CanConvertToScalar(Value *V, uint64_t Offset, Value* NonConstantIdx); void MergeInTypeForLoadOrStore(Type *In, uint64_t Offset); bool MergeInVectorType(VectorType *VInTy, uint64_t Offset); - void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset); + void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset, + Value *NonConstantIdx); Value *ConvertScalar_ExtractValue(Value *NV, Type *ToType, - uint64_t Offset, IRBuilder<> &Builder); + uint64_t Offset, Value* NonConstantIdx, + IRBuilder<> &Builder); Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal, - uint64_t Offset, IRBuilder<> &Builder); + uint64_t Offset, Value* NonConstantIdx, + IRBuilder<> &Builder); }; } // end anonymous namespace. @@ -290,7 +336,7 @@ private: AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { // If we can't convert this scalar, or if mem2reg can trivially do it, bail // out. - if (!CanConvertToScalar(AI, 0) || !IsNotTrivial) + if (!CanConvertToScalar(AI, 0, 0) || !IsNotTrivial) return 0; // If an alloca has only memset / memcpy uses, it may still have an Unknown @@ -315,16 +361,27 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { NewTy = VectorTy; // Use the vector type. 
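+    // (A dynamic index is fine on this path: vectors support extractelement
+    // and insertelement with a variable index, which is exactly what the
+    // rewrite emits.)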
} else { unsigned BitWidth = AllocaSize * 8; + + // Do not convert to scalar integer if the alloca size exceeds the + // scalar load threshold. + if (BitWidth > ScalarLoadThreshold) + return 0; + if ((ScalarKind == ImplicitVector || ScalarKind == Integer) && !HadNonMemTransferAccess && !TD.fitsInLegalInteger(BitWidth)) return 0; + // Dynamic accesses on integers aren't yet supported. They need us to shift + // by a dynamic amount which could be difficult to work out as we might not + // know whether to use a left or right shift. + if (ScalarKind == Integer && HadDynamicAccess) + return 0; DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n"); // Create and insert the integer alloca. NewTy = IntegerType::get(AI->getContext(), BitWidth); } AllocaInst *NewAI = new AllocaInst(NewTy, 0, "", AI->getParent()->begin()); - ConvertUsesToScalar(AI, NewAI, 0); + ConvertUsesToScalar(AI, NewAI, 0, 0); return NewAI; } @@ -411,7 +468,8 @@ bool ConvertToScalarInfo::MergeInVectorType(VectorType *VInTy, /// /// If we see at least one access to the value that is as a vector type, set the /// SawVec flag. -bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { +bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, + Value* NonConstantIdx) { for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) { Instruction *User = cast<Instruction>(*UI); @@ -441,24 +499,35 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) { if (!onlyUsedByLifetimeMarkers(BCI)) IsNotTrivial = true; // Can't be mem2reg'd. - if (!CanConvertToScalar(BCI, Offset)) + if (!CanConvertToScalar(BCI, Offset, NonConstantIdx)) return false; continue; } if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) { // If this is a GEP with a variable indices, we can't handle it. - if (!GEP->hasAllConstantIndices()) + PointerType* PtrTy = dyn_cast<PointerType>(GEP->getPointerOperandType()); + if (!PtrTy) return false; // Compute the offset that this GEP adds to the pointer. SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); - if (!GEP->getPointerOperandType()->isPointerTy()) - return false; - uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(), + Value *GEPNonConstantIdx = 0; + if (!GEP->hasAllConstantIndices()) { + if (!isa<VectorType>(PtrTy->getElementType())) + return false; + if (NonConstantIdx) + return false; + GEPNonConstantIdx = Indices.pop_back_val(); + if (!GEPNonConstantIdx->getType()->isIntegerTy(32)) + return false; + HadDynamicAccess = true; + } else + GEPNonConstantIdx = NonConstantIdx; + uint64_t GEPOffset = TD.getIndexedOffset(PtrTy, Indices); // See if all uses can be converted. - if (!CanConvertToScalar(GEP, Offset+GEPOffset)) + if (!CanConvertToScalar(GEP, Offset+GEPOffset, GEPNonConstantIdx)) return false; IsNotTrivial = true; // Can't be mem2reg'd. HadNonMemTransferAccess = true; @@ -468,6 +537,9 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { // If this is a constant sized memset of a constant value (e.g. 0) we can // handle it. if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { + // Store to dynamic index. + if (NonConstantIdx) + return false; // Store of constant value. 
if (!isa<ConstantInt>(MSI->getValue())) return false; @@ -492,6 +564,9 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { // If this is a memcpy or memmove into or out of the whole allocation, we // can handle it like a load or store of the scalar type. if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) { + // Store to dynamic index. + if (NonConstantIdx) + return false; ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength()); if (Len == 0 || Len->getZExtValue() != AllocaSize || Offset != 0) return false; @@ -523,12 +598,13 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { /// Offset is an offset from the original alloca, in bits that need to be /// shifted to the right. By the end of this, there should be no uses of Ptr. void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, - uint64_t Offset) { + uint64_t Offset, + Value* NonConstantIdx) { while (!Ptr->use_empty()) { Instruction *User = cast<Instruction>(Ptr->use_back()); if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) { - ConvertUsesToScalar(CI, NewAI, Offset); + ConvertUsesToScalar(CI, NewAI, Offset, NonConstantIdx); CI->eraseFromParent(); continue; } @@ -536,9 +612,11 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) { // Compute the offset that this GEP adds to the pointer. SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); + if (!GEP->hasAllConstantIndices()) + NonConstantIdx = Indices.pop_back_val(); uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(), Indices); - ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8); + ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8, NonConstantIdx); GEP->eraseFromParent(); continue; } @@ -549,7 +627,8 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, // The load is a bit extract from NewAI shifted right by Offset bits. Value *LoadedVal = Builder.CreateLoad(NewAI); Value *NewLoadVal - = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, Builder); + = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, + NonConstantIdx, Builder); LI->replaceAllUsesWith(NewLoadVal); LI->eraseFromParent(); continue; @@ -559,7 +638,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, assert(SI->getOperand(0) != Ptr && "Consistency error!"); Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset, - Builder); + NonConstantIdx, Builder); Builder.CreateStore(New, NewAI); SI->eraseFromParent(); @@ -574,6 +653,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, // transform it into a store of the expanded constant value. 
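+  // For example, a memset of the byte 0xAB over an i32-sized alloca becomes
+  // a store of the splatted constant 0xABABABAB.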
if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { assert(MSI->getRawDest() == Ptr && "Consistency error!"); + assert(!NonConstantIdx && "Cannot replace dynamic memset with insert"); int64_t SNumBytes = cast<ConstantInt>(MSI->getLength())->getSExtValue(); if (SNumBytes > 0 && (SNumBytes >> 32) == 0) { unsigned NumBytes = static_cast<unsigned>(SNumBytes); @@ -590,7 +670,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); Value *New = ConvertScalar_InsertValue( ConstantInt::get(User->getContext(), APVal), - Old, Offset, Builder); + Old, Offset, 0, Builder); Builder.CreateStore(New, NewAI); // If the load we just inserted is now dead, then the memset overwrote @@ -606,6 +686,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, // can handle it like a load or store of the scalar type. if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) { assert(Offset == 0 && "must be store to start of alloca"); + assert(!NonConstantIdx && "Cannot replace dynamic transfer with insert"); // If the source and destination are both to the same alloca, then this is // a noop copy-to-self, just delete it. Otherwise, emit a load and store @@ -678,7 +759,8 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, /// shifted to the right. Value *ConvertToScalarInfo:: ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, - uint64_t Offset, IRBuilder<> &Builder) { + uint64_t Offset, Value* NonConstantIdx, + IRBuilder<> &Builder) { // If the load is of the whole new alloca, no conversion is needed. Type *FromType = FromVal->getType(); if (FromType == ToType && Offset == 0) @@ -700,7 +782,17 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); } // Return the element extracted out of it. - Value *V = Builder.CreateExtractElement(FromVal, Builder.getInt32(Elt)); + Value *Idx; + if (NonConstantIdx) { + if (Elt) + Idx = Builder.CreateAdd(NonConstantIdx, + Builder.getInt32(Elt), + "dyn.offset"); + else + Idx = NonConstantIdx; + } else + Idx = Builder.getInt32(Elt); + Value *V = Builder.CreateExtractElement(FromVal, Idx); if (V->getType() != ToType) V = Builder.CreateBitCast(V, ToType); return V; @@ -709,23 +801,27 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, // If ToType is a first class aggregate, extract out each of the pieces and // use insertvalue's to form the FCA. 
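+  // For example, for ToType = {i32, i32} each element is recursively
+  // extracted at its StructLayout offset and the results are stitched
+  // together with insertvalue instructions.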
if (StructType *ST = dyn_cast<StructType>(ToType)) { + assert(!NonConstantIdx && + "Dynamic indexing into struct types not supported"); const StructLayout &Layout = *TD.getStructLayout(ST); Value *Res = UndefValue::get(ST); for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i), Offset+Layout.getElementOffsetInBits(i), - Builder); + 0, Builder); Res = Builder.CreateInsertValue(Res, Elt, i); } return Res; } if (ArrayType *AT = dyn_cast<ArrayType>(ToType)) { + assert(!NonConstantIdx && + "Dynamic indexing into array types not supported"); uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); Value *Res = UndefValue::get(AT); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(), - Offset+i*EltSize, Builder); + Offset+i*EltSize, 0, Builder); Res = Builder.CreateInsertValue(Res, Elt, i); } return Res; @@ -791,9 +887,14 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, /// /// Offset is an offset from the original alloca, in bits that need to be /// shifted to the right. +/// +/// NonConstantIdx is an index value if there was a GEP with a non-constant +/// index value. If this is 0 then all GEPs used to find this insert address +/// are constant. Value *ConvertToScalarInfo:: ConvertScalar_InsertValue(Value *SV, Value *Old, - uint64_t Offset, IRBuilder<> &Builder) { + uint64_t Offset, Value* NonConstantIdx, + IRBuilder<> &Builder) { // Convert the stored type to the actual type, shift it left to insert // then 'or' into place. Type *AllocaType = Old->getType(); @@ -814,26 +915,40 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, SV = Builder.CreateBitCast(SV, EltTy); uint64_t EltSize = TD.getTypeAllocSizeInBits(EltTy); unsigned Elt = Offset/EltSize; - return Builder.CreateInsertElement(Old, SV, Builder.getInt32(Elt)); + Value *Idx; + if (NonConstantIdx) { + if (Elt) + Idx = Builder.CreateAdd(NonConstantIdx, + Builder.getInt32(Elt), + "dyn.offset"); + else + Idx = NonConstantIdx; + } else + Idx = Builder.getInt32(Elt); + return Builder.CreateInsertElement(Old, SV, Idx); } // If SV is a first-class aggregate value, insert each value recursively. if (StructType *ST = dyn_cast<StructType>(SV->getType())) { + assert(!NonConstantIdx && + "Dynamic indexing into struct types not supported"); const StructLayout &Layout = *TD.getStructLayout(ST); for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { Value *Elt = Builder.CreateExtractValue(SV, i); Old = ConvertScalar_InsertValue(Elt, Old, Offset+Layout.getElementOffsetInBits(i), - Builder); + 0, Builder); } return Old; } if (ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) { + assert(!NonConstantIdx && + "Dynamic indexing into array types not supported"); uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { Value *Elt = Builder.CreateExtractValue(SV, i); - Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, Builder); + Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, 0, Builder); } return Old; } @@ -935,7 +1050,7 @@ public: AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, DIBuilder *DB) : LoadAndStorePromoter(Insts, S), AI(0), DIB(DB) {} - + void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) { // Remember which alloca we're promoting (for isInstInList). 
this->AI = AI; @@ -950,18 +1065,18 @@ public: LoadAndStorePromoter::run(Insts); AI->eraseFromParent(); - for (SmallVector<DbgDeclareInst *, 4>::iterator I = DDIs.begin(), + for (SmallVector<DbgDeclareInst *, 4>::iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; DDI->eraseFromParent(); } - for (SmallVector<DbgValueInst *, 4>::iterator I = DVIs.begin(), + for (SmallVector<DbgValueInst *, 4>::iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; DVI->eraseFromParent(); } } - + virtual bool isInstInList(Instruction *I, const SmallVectorImpl<Instruction*> &Insts) const { if (LoadInst *LI = dyn_cast<LoadInst>(I)) @@ -970,7 +1085,7 @@ public: } virtual void updateDebugInfo(Instruction *Inst) const { - for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), + for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) @@ -978,7 +1093,7 @@ public: else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) ConvertDebugDeclareToDebugValue(DDI, LI, *DIB); } - for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), + for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; Value *Arg = NULL; @@ -1021,12 +1136,12 @@ public: static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) { bool TDerefable = SI->getTrueValue()->isDereferenceablePointer(); bool FDerefable = SI->getFalseValue()->isDereferenceablePointer(); - + for (Value::use_iterator UI = SI->use_begin(), UE = SI->use_end(); UI != UE; ++UI) { LoadInst *LI = dyn_cast<LoadInst>(*UI); if (LI == 0 || !LI->isSimple()) return false; - + // Both operands to the select need to be dereferencable, either absolutely // (e.g. allocas) or at this point because we can see other accesses to it. if (!TDerefable && !isSafeToLoadUnconditionally(SI->getTrueValue(), LI, @@ -1036,7 +1151,7 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) { LI->getAlignment(), TD)) return false; } - + return true; } @@ -1067,20 +1182,20 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { UI != UE; ++UI) { LoadInst *LI = dyn_cast<LoadInst>(*UI); if (LI == 0 || !LI->isSimple()) return false; - + // For now we only allow loads in the same block as the PHI. This is a // common case that happens when instcombine merges two loads through a PHI. if (LI->getParent() != BB) return false; - + // Ensure that there are no instructions between the PHI and the load that // could store. for (BasicBlock::iterator BBI = PN; &*BBI != LI; ++BBI) if (BBI->mayWriteToMemory()) return false; - + MaxAlign = std::max(MaxAlign, LI->getAlignment()); } - + // Okay, we know that we have one or more loads in the same block as the PHI. // We can transform this if it is safe to push the loads into the predecessor // blocks. 
The only thing to watch out for is that we can't put a possibly @@ -1108,10 +1223,10 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { if (InVal->isDereferenceablePointer() || isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign, TD)) continue; - + return false; } - + return true; } @@ -1123,7 +1238,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { SetVector<Instruction*, SmallVector<Instruction*, 4>, SmallPtrSet<Instruction*, 4> > InstsToRewrite; - + for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end(); UI != UE; ++UI) { User *U = *UI; @@ -1132,7 +1247,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { return false; continue; } - + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { if (SI->getOperand(0) == AI || !SI->isSimple()) return false; // Don't allow a store OF the AI, only INTO the AI. @@ -1146,7 +1261,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { Value *Result = SI->getOperand(1+CI->isZero()); SI->replaceAllUsesWith(Result); SI->eraseFromParent(); - + // This is very rare and we just scrambled the use list of AI, start // over completely. return tryToMakeAllocaBePromotable(AI, TD); @@ -1156,33 +1271,33 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { // loads, then we can transform this by rewriting the select. if (!isSafeSelectToSpeculate(SI, TD)) return false; - + InstsToRewrite.insert(SI); continue; } - + if (PHINode *PN = dyn_cast<PHINode>(U)) { if (PN->use_empty()) { // Dead PHIs can be stripped. InstsToRewrite.insert(PN); continue; } - + // If it is safe to turn "load (phi [AI, ptr, ...])" into a PHI of loads // in the pred blocks, then we can transform this by rewriting the PHI. if (!isSafePHIToSpeculate(PN, TD)) return false; - + InstsToRewrite.insert(PN); continue; } - + if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) { if (onlyUsedByLifetimeMarkers(BCI)) { InstsToRewrite.insert(BCI); continue; } } - + return false; } @@ -1190,7 +1305,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { // we're done! if (InstsToRewrite.empty()) return true; - + // If we have instructions that need to be rewritten for this to be promotable // take care of it now. for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) { @@ -1211,13 +1326,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { // loads with a new select. while (!SI->use_empty()) { LoadInst *LI = cast<LoadInst>(SI->use_back()); - + IRBuilder<> Builder(LI); - LoadInst *TrueLoad = + LoadInst *TrueLoad = Builder.CreateLoad(SI->getTrueValue(), LI->getName()+".t"); - LoadInst *FalseLoad = + LoadInst *FalseLoad = Builder.CreateLoad(SI->getFalseValue(), LI->getName()+".f"); - + // Transfer alignment and TBAA info if present. TrueLoad->setAlignment(LI->getAlignment()); FalseLoad->setAlignment(LI->getAlignment()); @@ -1225,18 +1340,18 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { TrueLoad->setMetadata(LLVMContext::MD_tbaa, Tag); FalseLoad->setMetadata(LLVMContext::MD_tbaa, Tag); } - + Value *V = Builder.CreateSelect(SI->getCondition(), TrueLoad, FalseLoad); V->takeName(LI); LI->replaceAllUsesWith(V); LI->eraseFromParent(); } - + // Now that all the loads are gone, the select is gone too. 
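+      // Net effect: "load (select %c, %p, %q)" has become
+      // "select %c, (load %p), (load %q)", leaving only direct loads and
+      // stores of the alloca for mem2reg to promote.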
SI->eraseFromParent(); continue; } - + // Otherwise, we have a PHI node which allows us to push the loads into the // predecessors. PHINode *PN = cast<PHINode>(InstsToRewrite[i]); @@ -1244,7 +1359,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { PN->eraseFromParent(); continue; } - + Type *LoadTy = cast<PointerType>(PN->getType())->getElementType(); PHINode *NewPN = PHINode::Create(LoadTy, PN->getNumIncomingValues(), PN->getName()+".ld", PN); @@ -1254,18 +1369,18 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { LoadInst *SomeLoad = cast<LoadInst>(PN->use_back()); MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); unsigned Align = SomeLoad->getAlignment(); - + // Rewrite all loads of the PN to use the new PHI. while (!PN->use_empty()) { LoadInst *LI = cast<LoadInst>(PN->use_back()); LI->replaceAllUsesWith(NewPN); LI->eraseFromParent(); } - + // Inject loads into all of the pred blocks. Keep track of which blocks we // insert them into in case we have multiple edges from the same block. DenseMap<BasicBlock*, LoadInst*> InsertedLoads; - + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *Pred = PN->getIncomingBlock(i); LoadInst *&Load = InsertedLoads[Pred]; @@ -1276,13 +1391,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { Load->setAlignment(Align); if (TBAATag) Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); } - + NewPN->addIncoming(Load, Pred); } - + PN->eraseFromParent(); } - + ++NumAdjusted; return true; } @@ -1315,7 +1430,7 @@ bool SROA::performPromotion(Function &F) { SSAUpdater SSA; for (unsigned i = 0, e = Allocas.size(); i != e; ++i) { AllocaInst *AI = Allocas[i]; - + // Build list of instructions to promote. for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E; ++UI) @@ -1334,18 +1449,36 @@ bool SROA::performPromotion(Function &F) { /// ShouldAttemptScalarRepl - Decide if an alloca is a good candidate for /// SROA. It must be a struct or array type with a small number of elements. -static bool ShouldAttemptScalarRepl(AllocaInst *AI) { +bool SROA::ShouldAttemptScalarRepl(AllocaInst *AI) { Type *T = AI->getAllocatedType(); - // Do not promote any struct into more than 32 separate vars. + // Do not promote any struct that has too many members. if (StructType *ST = dyn_cast<StructType>(T)) - return ST->getNumElements() <= 32; - // Arrays are much less likely to be safe for SROA; only consider - // them if they are very small. + return ST->getNumElements() <= StructMemberThreshold; + // Do not promote any array that has too many elements. if (ArrayType *AT = dyn_cast<ArrayType>(T)) - return AT->getNumElements() <= 8; + return AT->getNumElements() <= ArrayElementThreshold; return false; } +/// getPointeeAlignment - Compute the minimum alignment of the value pointed +/// to by the given pointer. 
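+/// For example, for a memcpy source that is a defined global variable this
+/// returns the target's preferred alignment for that global, which is what
+/// performScalarRepl compares against the alloca's own alignment below.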
+static unsigned getPointeeAlignment(Value *V, const TargetData &TD) { + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) + if (CE->getOpcode() == Instruction::BitCast || + (CE->getOpcode() == Instruction::GetElementPtr && + cast<GEPOperator>(CE)->hasAllZeroIndices())) + return getPointeeAlignment(CE->getOperand(0), TD); + + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) + if (!GV->isDeclaration()) + return TD.getPreferredAlignment(GV); + + if (PointerType *PT = dyn_cast<PointerType>(V->getType())) + return TD.getABITypeAlignment(PT->getElementType()); + + return 0; +} + // performScalarRepl - This algorithm is a simple worklist driven algorithm, // which runs on all of the alloca instructions in the function, removing them @@ -1379,23 +1512,26 @@ bool SROA::performScalarRepl(Function &F) { continue; // Check to see if this allocation is only modified by a memcpy/memmove from - // a constant global. If this is the case, we can change all users to use + // a constant global whose alignment is equal to or exceeds that of the + // allocation. If this is the case, we can change all users to use // the constant global instead. This is commonly produced by the CFE by // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A' // is only subsequently read. SmallVector<Instruction *, 4> ToDelete; if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(AI, ToDelete)) { - DEBUG(dbgs() << "Found alloca equal to global: " << *AI << '\n'); - DEBUG(dbgs() << " memcpy = " << *Copy << '\n'); - for (unsigned i = 0, e = ToDelete.size(); i != e; ++i) - ToDelete[i]->eraseFromParent(); - Constant *TheSrc = cast<Constant>(Copy->getSource()); - AI->replaceAllUsesWith(ConstantExpr::getBitCast(TheSrc, AI->getType())); - Copy->eraseFromParent(); // Don't mutate the global. - AI->eraseFromParent(); - ++NumGlobals; - Changed = true; - continue; + if (AI->getAlignment() <= getPointeeAlignment(Copy->getSource(), *TD)) { + DEBUG(dbgs() << "Found alloca equal to global: " << *AI << '\n'); + DEBUG(dbgs() << " memcpy = " << *Copy << '\n'); + for (unsigned i = 0, e = ToDelete.size(); i != e; ++i) + ToDelete[i]->eraseFromParent(); + Constant *TheSrc = cast<Constant>(Copy->getSource()); + AI->replaceAllUsesWith(ConstantExpr::getBitCast(TheSrc, AI->getType())); + Copy->eraseFromParent(); // Don't mutate the global. + AI->eraseFromParent(); + ++NumGlobals; + Changed = true; + continue; + } } // Check to see if we can perform the core SROA transformation. We cannot @@ -1425,8 +1561,8 @@ bool SROA::performScalarRepl(Function &F) { // promoted itself. If so, we don't want to transform it needlessly. Note // that we can't just check based on the type: the alloca may be of an i32 // but that has pointer arithmetic to set byte 3 of it or something. 
- if (AllocaInst *NewAI = - ConvertToScalarInfo((unsigned)AllocaSize, *TD).TryConvert(AI)) { + if (AllocaInst *NewAI = ConvertToScalarInfo( + (unsigned)AllocaSize, *TD, ScalarLoadThreshold).TryConvert(AI)) { NewAI->takeName(AI); AI->eraseFromParent(); ++NumConverted; @@ -1531,12 +1667,12 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType), LIType, false, Info, LI, true /*AllowWholeAccess*/); Info.hasALoadOrStore = true; - + } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { // Store is ok if storing INTO the pointer, not storing the pointer if (!SI->isSimple() || SI->getOperand(0) == I) return MarkUnsafe(Info, User); - + Type *SIType = SI->getOperand(0)->getType(); isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType), SIType, true, Info, SI, true /*AllowWholeAccess*/); @@ -1553,7 +1689,7 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, if (Info.isUnsafe) return; } } - + /// isSafePHIUseForScalarRepl - If we see a PHI node or select using a pointer /// derived from the alloca, we can often still split the alloca into elements. @@ -1570,10 +1706,10 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, if (PHINode *PN = dyn_cast<PHINode>(I)) if (!Info.CheckedPHIs.insert(PN)) return; - + for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) { Instruction *User = cast<Instruction>(*UI); - + if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) { isSafePHISelectUseForScalarRepl(BC, Offset, Info); } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) { @@ -1590,12 +1726,12 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType), LIType, false, Info, LI, false /*AllowWholeAccess*/); Info.hasALoadOrStore = true; - + } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { // Store is ok if storing INTO the pointer, not storing the pointer if (!SI->isSimple() || SI->getOperand(0) == I) return MarkUnsafe(Info, User); - + Type *SIType = SI->getOperand(0)->getType(); isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType), SIType, true, Info, SI, false /*AllowWholeAccess*/); @@ -1619,6 +1755,8 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI, gep_type_iterator GEPIt = gep_type_begin(GEPI), E = gep_type_end(GEPI); if (GEPIt == E) return; + bool NonConstant = false; + unsigned NonConstantIdxSize = 0; // Walk through the GEP type indices, checking the types that this indexes // into. @@ -1628,15 +1766,30 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI, continue; ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPIt.getOperand()); - if (!IdxVal) - return MarkUnsafe(Info, GEPI); + if (!IdxVal) { + // Non-constant GEPs are only a problem on arrays, structs, and pointers. + // Vectors can be dynamically indexed. + // FIXME: Add support for dynamic indexing on arrays. This should be + // ok on any subarrays of the alloca array, e.g., a[0][i] is ok, but a[i][0] + // isn't. + if (!(*GEPIt)->isVectorTy()) + return MarkUnsafe(Info, GEPI); + NonConstant = true; + NonConstantIdxSize = TD->getTypeAllocSize(*GEPIt); + } } // Compute the offset due to this GEP and check if the alloca has a // component element at that offset. SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); + // If this GEP is non-constant then the last operand must have been a + // dynamic index into a vector. Pop this now as it has no impact on the + // constant part of the offset.
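+    // For example, in "getelementptr [4 x <4 x float>]* %p, i32 0, i32 1,
+    // i32 %i" only the trailing %i is non-constant; the leading constant
+    // indices still yield a fixed byte offset for the component check.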
+ if (NonConstant) + Indices.pop_back(); Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices); - if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, 0)) + if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, + NonConstantIdxSize)) MarkUnsafe(Info, GEPI); } @@ -1741,6 +1894,12 @@ bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) { if (Offset >= AT->getNumElements() * EltSize) return false; Offset %= EltSize; + } else if (VectorType *VT = dyn_cast<VectorType>(T)) { + EltTy = VT->getElementType(); + EltSize = TD->getTypeAllocSize(EltTy); + if (Offset >= VT->getNumElements() * EltSize) + return false; + Offset %= EltSize; } else { return false; } @@ -1766,12 +1925,12 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, RewriteBitCast(BC, AI, Offset, NewElts); continue; } - + if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) { RewriteGEP(GEPI, AI, Offset, NewElts); continue; } - + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) { ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength()); uint64_t MemSize = Length->getZExtValue(); @@ -1790,10 +1949,10 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, } continue; } - + if (LoadInst *LI = dyn_cast<LoadInst>(User)) { Type *LIType = LI->getType(); - + if (isCompatibleAggregate(LIType, AI->getAllocatedType())) { // Replace: // %res = load { i32, i32 }* %alloc @@ -1819,7 +1978,7 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, } continue; } - + if (StoreInst *SI = dyn_cast<StoreInst>(User)) { Value *Val = SI->getOperand(0); Type *SIType = Val->getType(); @@ -1846,16 +2005,16 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, } continue; } - + if (isa<SelectInst>(User) || isa<PHINode>(User)) { - // If we have a PHI user of the alloca itself (as opposed to a GEP or + // If we have a PHI user of the alloca itself (as opposed to a GEP or // bitcast) we have to rewrite it. GEP and bitcast uses will be RAUW'd to // the new pointer. if (!isa<AllocaInst>(I)) continue; - + assert(Offset == 0 && NewElts[0] && "Direct alloca use should have a zero offset"); - + // If we have a use of the alloca, we know the derived uses will be // utilizing just the first element of the scalarized result. Insert a // bitcast of the first alloca before the user as required. @@ -1908,9 +2067,16 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, Offset -= Layout->getElementOffset(Idx); IdxTy = Type::getInt32Ty(T->getContext()); return Idx; + } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) { + T = AT->getElementType(); + uint64_t EltSize = TD->getTypeAllocSize(T); + Idx = Offset / EltSize; + Offset -= Idx * EltSize; + IdxTy = Type::getInt64Ty(T->getContext()); + return Idx; } - ArrayType *AT = cast<ArrayType>(T); - T = AT->getElementType(); + VectorType *VT = cast<VectorType>(T); + T = VT->getElementType(); uint64_t EltSize = TD->getTypeAllocSize(T); Idx = Offset / EltSize; Offset -= Idx * EltSize; @@ -1925,6 +2091,13 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, SmallVector<AllocaInst*, 32> &NewElts) { uint64_t OldOffset = Offset; SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); + // If the GEP was dynamic then it must have been a dynamic vector lookup. 
+ // In this case, it must be the last GEP operand which is dynamic so keep that + // aside until we've found the constant GEP offset then add it back in at the + // end. + Value* NonConstantIdx = 0; + if (!GEPI->hasAllConstantIndices()) + NonConstantIdx = Indices.pop_back_val(); Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices); RewriteForScalarRepl(GEPI, AI, Offset, NewElts); @@ -1951,6 +2124,17 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, uint64_t EltIdx = FindElementAndOffset(T, EltOffset, IdxTy); NewArgs.push_back(ConstantInt::get(IdxTy, EltIdx)); } + if (NonConstantIdx) { + Type* GepTy = T; + // This GEP has a dynamic index. We need to add "i32 0" to index through + // any structs or arrays in the original type until we get to the vector + // to index. + while (!isa<VectorType>(GepTy)) { + NewArgs.push_back(Constant::getNullValue(i32Ty)); + GepTy = cast<CompositeType>(GepTy)->getTypeAtIndex(0U); + } + NewArgs.push_back(NonConstantIdx); + } Instruction *Val = NewElts[Idx]; if (NewArgs.size() > 1) { Val = GetElementPtrInst::CreateInBounds(Val, NewArgs, "", GEPI); @@ -2202,7 +2386,7 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy); IRBuilder<> Builder(SI); - + // Handle tail padding by extending the operand if (TD->getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits) SrcVal = Builder.CreateZExt(SrcVal, @@ -2464,7 +2648,7 @@ bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) { return false; } } - + return true; } diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index a66b3e3..d13e4ab 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -67,7 +67,7 @@ static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) { // nodes. for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) (*SI)->removePredecessor(BB); - + // Insert a call to llvm.trap right before this. This turns the undefined // behavior into a hard fail instead of falling through into random code. if (UseLLVMTrap) { @@ -77,7 +77,7 @@ static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) { CallTrap->setDebugLoc(I->getDebugLoc()); } new UnreachableInst(I->getContext(), I); - + // All instructions after this are dead. BasicBlock::iterator BBI = I, BBE = BB->end(); while (BBI != BBE) { @@ -89,7 +89,6 @@ static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) { /// ChangeToCall - Convert the specified invoke into a normal call. 
static void ChangeToCall(InvokeInst *II) { - BasicBlock *BB = II->getParent(); SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II); NewCall->takeName(II); @@ -102,19 +101,19 @@ static void ChangeToCall(InvokeInst *II) { BranchInst::Create(II->getNormalDest(), II); // Update PHI nodes in the unwind destination - II->getUnwindDest()->removePredecessor(BB); - BB->getInstList().erase(II); + II->getUnwindDest()->removePredecessor(II->getParent()); + II->eraseFromParent(); } static bool MarkAliveBlocks(BasicBlock *BB, SmallPtrSet<BasicBlock*, 128> &Reachable) { - + SmallVector<BasicBlock*, 128> Worklist; Worklist.push_back(BB); bool Changed = false; do { BB = Worklist.pop_back_val(); - + if (!Reachable.insert(BB)) continue; @@ -136,7 +135,7 @@ static bool MarkAliveBlocks(BasicBlock *BB, break; } } - + // Store to undef and store to null are undefined and used to signal that // they should be changed to unreachable by passes that can't modify the // CFG. @@ -145,7 +144,7 @@ static bool MarkAliveBlocks(BasicBlock *BB, if (SI->isVolatile()) continue; Value *Ptr = SI->getOperand(1); - + if (isa<UndefValue>(Ptr) || (isa<ConstantPointerNull>(Ptr) && SI->getPointerAddressSpace() == 0)) { @@ -157,11 +156,22 @@ static bool MarkAliveBlocks(BasicBlock *BB, } // Turn invokes that call 'nounwind' functions into ordinary calls. - if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) - if (II->doesNotThrow()) { - ChangeToCall(II); + if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) { + Value *Callee = II->getCalledValue(); + if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { + ChangeToUnreachable(II, true); + Changed = true; + } else if (II->doesNotThrow()) { + if (II->use_empty() && II->onlyReadsMemory()) { + // jump to the normal destination branch. + BranchInst::Create(II->getNormalDest(), II); + II->getUnwindDest()->removePredecessor(II->getParent()); + II->eraseFromParent(); + } else + ChangeToCall(II); Changed = true; } + } Changed |= ConstantFoldTerminator(BB, true); for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) @@ -170,38 +180,38 @@ static bool MarkAliveBlocks(BasicBlock *BB, return Changed; } -/// RemoveUnreachableBlocksFromFn - Remove blocks that are not reachable, even -/// if they are in a dead cycle. Return true if a change was made, false +/// RemoveUnreachableBlocksFromFn - Remove blocks that are not reachable, even +/// if they are in a dead cycle. Return true if a change was made, false /// otherwise. static bool RemoveUnreachableBlocksFromFn(Function &F) { SmallPtrSet<BasicBlock*, 128> Reachable; bool Changed = MarkAliveBlocks(F.begin(), Reachable); - + // If there are unreachable blocks in the CFG... if (Reachable.size() == F.size()) return Changed; - + assert(Reachable.size() < F.size()); NumSimpl += F.size()-Reachable.size(); - + // Loop over all of the basic blocks that are not reachable, dropping all of // their internal references... 
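+  // (The references must be dropped before any block is deleted: unreachable
+  // blocks can form cycles that still refer to one another, so a direct
+  // erase would delete values that other dead blocks still use.)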
for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) { if (Reachable.count(BB)) continue; - + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) if (Reachable.count(*SI)) (*SI)->removePredecessor(BB); BB->dropAllReferences(); } - + for (Function::iterator I = ++F.begin(); I != F.end();) if (!Reachable.count(I)) I = F.getBasicBlockList().erase(I); else ++I; - + return true; } @@ -209,17 +219,17 @@ static bool RemoveUnreachableBlocksFromFn(Function &F) { /// node) return blocks, merge them together to promote recursive block merging. static bool MergeEmptyReturnBlocks(Function &F) { bool Changed = false; - + BasicBlock *RetBlock = 0; - + // Scan all the blocks in the function, looking for empty return blocks. for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; ) { BasicBlock &BB = *BBI++; - + // Only look at return blocks. ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator()); if (Ret == 0) continue; - + // Only look at the block if it is empty or the only other thing in it is a // single PHI node that is the operand to the return. if (Ret != &BB.front()) { @@ -241,21 +251,21 @@ static bool MergeEmptyReturnBlocks(Function &F) { RetBlock = &BB; continue; } - + // Otherwise, we found a duplicate return block. Merge the two. Changed = true; - + // Case when there is no input to the return or when the returned values // agree is trivial. Note that they can't agree if there are phis in the // blocks. if (Ret->getNumOperands() == 0 || - Ret->getOperand(0) == + Ret->getOperand(0) == cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0)) { BB.replaceAllUsesWith(RetBlock); BB.eraseFromParent(); continue; } - + // If the canonical return block has no PHI node, create one now. PHINode *RetBlockPHI = dyn_cast<PHINode>(RetBlock->begin()); if (RetBlockPHI == 0) { @@ -264,12 +274,12 @@ static bool MergeEmptyReturnBlocks(Function &F) { RetBlockPHI = PHINode::Create(Ret->getOperand(0)->getType(), std::distance(PB, PE), "merge", &RetBlock->front()); - + for (pred_iterator PI = PB; PI != PE; ++PI) RetBlockPHI->addIncoming(InVal, *PI); RetBlock->getTerminator()->setOperand(0, RetBlockPHI); } - + // Turn BB into a block that just unconditionally branches to the return // block. This handles the case when the two return blocks have a common // predecessor but that return different things. @@ -277,7 +287,7 @@ static bool MergeEmptyReturnBlocks(Function &F) { BB.getTerminator()->eraseFromParent(); BranchInst::Create(RetBlock, &BB); } - + return Changed; } @@ -288,7 +298,7 @@ static bool MergeEmptyReturnBlocks(Function &F) { bool LocalChange = true; while (LocalChange) { LocalChange = false; - + // Loop over all of the basic blocks and remove them if they are unneeded... // for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { @@ -317,7 +327,7 @@ bool CFGSimplifyPass::runOnFunction(Function &F) { // IterativeSimplifyCFG can (rarely) make some loops dead. If this happens, // RemoveUnreachableBlocksFromFn is needed to nuke them, which means we should // iterate between the two optimizations. We structure the code like this to - // avoid reruning IterativeSimplifyCFG if the second pass of + // avoid rerunning IterativeSimplifyCFG if the second pass of
if (!RemoveUnreachableBlocksFromFn(F)) return true; diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index f7b6941..a1a8a41 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -18,20 +18,20 @@ #define DEBUG_TYPE "simplify-libcalls" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" +#include "llvm/IRBuilder.h" #include "llvm/Intrinsics.h" #include "llvm/LLVMContext.h" #include "llvm/Module.h" #include "llvm/Pass.h" -#include "llvm/Support/IRBuilder.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/StringMap.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Config/config.h" // FIXME: Shouldn't depend on host! using namespace llvm; @@ -100,7 +100,7 @@ static bool IsOnlyUsedInZeroEqualityComparison(Value *V) { } return true; } - + static bool CallHasFloatingPointArgument(const CallInst *CI) { for (CallInst::const_op_iterator it = CI->op_begin(), e = CI->op_end(); it != e; ++it) { @@ -256,7 +256,7 @@ struct StrChrOpt : public LibCallOptimization { ConstantInt::get(TD->getIntPtrType(*Context), Len), B, TD); } - + // Otherwise, the character is a constant, see if the first argument is // a string literal. If so, we can constant fold. StringRef Str; @@ -459,6 +459,50 @@ struct StrCpyOpt : public LibCallOptimization { }; //===---------------------------------------===// +// 'stpcpy' Optimizations + +struct StpCpyOpt: public LibCallOptimization { + bool OptChkCall; // True if it's optimizing a __stpcpy_chk libcall. + + StpCpyOpt(bool c) : OptChkCall(c) {} + + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "stpcpy" function prototype. + unsigned NumParams = OptChkCall ? 3 : 2; + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != NumParams || + FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != B.getInt8PtrTy()) + return 0; + + // These optimizations require TargetData. + if (!TD) return 0; + + Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); + if (Dst == Src) // stpcpy(x,x) -> x+strlen(x) + return B.CreateInBoundsGEP(Dst, EmitStrLen(Src, B, TD)); + + // See if we can get the length of the input string. + uint64_t Len = GetStringLength(Src); + if (Len == 0) return 0; + + Value *LenV = ConstantInt::get(TD->getIntPtrType(*Context), Len); + Value *DstEnd = B.CreateGEP(Dst, + ConstantInt::get(TD->getIntPtrType(*Context), + Len - 1)); + + // We have enough information to now generate the memcpy call to do the + // copy for us. Make a memcpy to copy the nul byte with align = 1. 
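// (For reference: stpcpy returns a pointer to the terminating NUL it
//  stores, not to Dst. A minimal worked example with Src == "abc", where
//  GetStringLength reports Len == 4:
//    memcpy(Dst, "abc", 4);   // LenV bytes, i.e. the NUL is copied too
//    return Dst + 3;          // DstEnd == Dst + (Len - 1), the NUL's address
//  which is exactly what the code below emits.)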
+ if (OptChkCall) + EmitMemCpyChk(Dst, Src, LenV, CI->getArgOperand(2), B, TD); + else + B.CreateMemCpy(Dst, Src, LenV, 1); + return DstEnd; + } +}; + +//===---------------------------------------===// // 'strncpy' Optimizations struct StrNCpyOpt : public LibCallOptimization { @@ -1470,12 +1514,15 @@ namespace { /// class SimplifyLibCalls : public FunctionPass { TargetLibraryInfo *TLI; - + StringMap<LibCallOptimization*> Optimizations; // String and Memory LibCall Optimizations StrCatOpt StrCat; StrNCatOpt StrNCat; StrChrOpt StrChr; StrRChrOpt StrRChr; - StrCmpOpt StrCmp; StrNCmpOpt StrNCmp; StrCpyOpt StrCpy; StrCpyOpt StrCpyChk; - StrNCpyOpt StrNCpy; StrLenOpt StrLen; StrPBrkOpt StrPBrk; + StrCmpOpt StrCmp; StrNCmpOpt StrNCmp; + StrCpyOpt StrCpy; StrCpyOpt StrCpyChk; + StpCpyOpt StpCpy; StpCpyOpt StpCpyChk; + StrNCpyOpt StrNCpy; + StrLenOpt StrLen; StrPBrkOpt StrPBrk; StrToOpt StrTo; StrSpnOpt StrSpn; StrCSpnOpt StrCSpn; StrStrOpt StrStr; MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove; MemSetOpt MemSet; // Math Library Optimizations @@ -1487,11 +1534,12 @@ namespace { SPrintFOpt SPrintF; PrintFOpt PrintF; FWriteOpt FWrite; FPutsOpt FPuts; FPrintFOpt FPrintF; PutsOpt Puts; - + bool Modified; // This is only used by doInitialization. public: static char ID; // Pass identification - SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true) { + SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true), + StpCpy(false), StpCpyChk(true) { initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry()); } void AddOpt(LibFunc::Func F, LibCallOptimization* Opt); @@ -1542,6 +1590,7 @@ void SimplifyLibCalls::InitOptimizations() { Optimizations["strncmp"] = &StrNCmp; Optimizations["strcpy"] = &StrCpy; Optimizations["strncpy"] = &StrNCpy; + Optimizations["stpcpy"] = &StpCpy; Optimizations["strlen"] = &StrLen; Optimizations["strpbrk"] = &StrPBrk; Optimizations["strtol"] = &StrTo; @@ -1561,6 +1610,7 @@ void SimplifyLibCalls::InitOptimizations() { // _chk variants of String and Memory LibCall Optimizations. 
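// (The _chk entries are the _FORTIFY_SOURCE variants: they take the
//  destination object size as an extra argument, which is why StpCpyOpt
//  above emits EmitMemCpyChk, keeping the runtime size check, instead of
//  a plain memcpy when OptChkCall is set.)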
Optimizations["__strcpy_chk"] = &StrCpyChk; + Optimizations["__stpcpy_chk"] = &StpCpyChk; // Math Library Optimizations Optimizations["cosf"] = &Cos; @@ -1717,7 +1767,7 @@ void SimplifyLibCalls::setDoesNotAlias(Function &F, unsigned n) { void SimplifyLibCalls::inferPrototypeAttributes(Function &F) { FunctionType *FTy = F.getFunctionType(); - + StringRef Name = F.getName(); switch (Name[0]) { case 's': @@ -1746,6 +1796,7 @@ void SimplifyLibCalls::inferPrototypeAttributes(Function &F) { Name == "strtold" || Name == "strncat" || Name == "strncpy" || + Name == "stpncpy" || Name == "strtoull") { if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) @@ -2406,10 +2457,6 @@ bool SimplifyLibCalls::doInitialization(Module &M) { // * sqrt(Nroot(x)) -> pow(x,1/(2*N)) // * sqrt(pow(x,y)) -> pow(|x|,y*0.5) // -// stpcpy: -// * stpcpy(str, "literal") -> -// llvm.memcpy(str,"literal",strlen("literal")+1,1) -// // strchr: // * strchr(p, 0) -> strlen(p) // tan, tanf, tanl: diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp index ef65c0a..34f1d6c 100644 --- a/lib/Transforms/Scalar/Sink.cpp +++ b/lib/Transforms/Scalar/Sink.cpp @@ -27,6 +27,7 @@ using namespace llvm; STATISTIC(NumSunk, "Number of instructions sunk"); +STATISTIC(NumSinkIter, "Number of sinking iterations"); namespace { class Sinking : public FunctionPass { @@ -39,9 +40,9 @@ namespace { Sinking() : FunctionPass(ID) { initializeSinkingPass(*PassRegistry::getPassRegistry()); } - + virtual bool runOnFunction(Function &F); - + virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); FunctionPass::getAnalysisUsage(AU); @@ -55,9 +56,10 @@ namespace { bool ProcessBlock(BasicBlock &BB); bool SinkInstruction(Instruction *I, SmallPtrSet<Instruction *, 8> &Stores); bool AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB) const; + bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo) const; }; } // end anonymous namespace - + char Sinking::ID = 0; INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfo) @@ -69,7 +71,7 @@ FunctionPass *llvm::createSinkingPass() { return new Sinking(); } /// AllUsesDominatedByBlock - Return true if all uses of the specified value /// occur in blocks dominated by the specified block. -bool Sinking::AllUsesDominatedByBlock(Instruction *Inst, +bool Sinking::AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB) const { // Ignoring debug uses is necessary so debug info doesn't affect the code. // This may leave a referencing dbg_value in the original block, before @@ -98,20 +100,19 @@ bool Sinking::runOnFunction(Function &F) { LI = &getAnalysis<LoopInfo>(); AA = &getAnalysis<AliasAnalysis>(); - bool EverMadeChange = false; - - while (1) { - bool MadeChange = false; + bool MadeChange, EverMadeChange = false; + do { + MadeChange = false; + DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n"); // Process all basic blocks. - for (Function::iterator I = F.begin(), E = F.end(); + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) MadeChange |= ProcessBlock(*I); - - // If this iteration over the code changed anything, keep iterating. - if (!MadeChange) break; - EverMadeChange = true; - } + EverMadeChange |= MadeChange; + NumSinkIter++; + } while (MadeChange); + return EverMadeChange; } @@ -120,8 +121,8 @@ bool Sinking::ProcessBlock(BasicBlock &BB) { if (BB.getTerminator()->getNumSuccessors() <= 1 || BB.empty()) return false; // Don't bother sinking code out of unreachable blocks. 
In addition to being - // unprofitable, it can also lead to infinite looping, because in an unreachable - // loop there may be nowhere to stop. + // unprofitable, it can also lead to infinite looping, because in an + // unreachable loop there may be nowhere to stop. if (!DT->isReachableFromEntry(&BB)) return false; bool MadeChange = false; @@ -133,7 +134,7 @@ bool Sinking::ProcessBlock(BasicBlock &BB) { SmallPtrSet<Instruction *, 8> Stores; do { Instruction *Inst = I; // The instruction to sink. - + // Predecrement I (if it's not begin) so that it isn't invalidated by // sinking. ProcessedBegin = I == BB.begin(); @@ -145,10 +146,10 @@ bool Sinking::ProcessBlock(BasicBlock &BB) { if (SinkInstruction(Inst, Stores)) ++NumSunk, MadeChange = true; - + // If we just processed the first instruction in the block, we're done. } while (!ProcessedBegin); - + return MadeChange; } @@ -174,6 +175,45 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA, return true; } +/// IsAcceptableTarget - Return true if it is possible to sink the instruction +/// in the specified basic block. +bool Sinking::IsAcceptableTarget(Instruction *Inst, + BasicBlock *SuccToSinkTo) const { + assert(Inst && "Instruction to be sunk is null"); + assert(SuccToSinkTo && "Candidate sink target is null"); + + // It is not possible to sink an instruction into its own block. This can + // happen with loops. + if (Inst->getParent() == SuccToSinkTo) + return false; + + // If the block has multiple predecessors, this would introduce computation + // on different code paths. We could split the critical edge, but for now we + // just punt. + // FIXME: Split critical edges if not backedges. + if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) { + // We cannot sink a load across a critical edge - there may be stores in + // other code paths. + if (!isSafeToSpeculativelyExecute(Inst)) + return false; + + // We don't want to sink across a critical edge if we don't dominate the + // successor. We could be introducing calculations to new code paths. + if (!DT->dominates(Inst->getParent(), SuccToSinkTo)) + return false; + + // Don't sink instructions into a loop. + Loop *succ = LI->getLoopFor(SuccToSinkTo); + Loop *cur = LI->getLoopFor(Inst->getParent()); + if (succ != 0 && succ != cur) + return false; + } + + // Finally, check that all the uses of the instruction are actually + // dominated by the candidate + return AllUsesDominatedByBlock(Inst, SuccToSinkTo); +} + /// SinkInstruction - Determine whether it is safe to sink the specified machine /// instruction out of its current block into a successor. bool Sinking::SinkInstruction(Instruction *Inst, @@ -181,7 +221,7 @@ bool Sinking::SinkInstruction(Instruction *Inst, // Check if it's safe to move the instruction. if (!isSafeToMove(Inst, AA, Stores)) return false; - + // FIXME: This should include support for sinking instructions within the // block they are currently in to shorten the live ranges. We often get // instructions sunk into the top of a large block, but it would be better to @@ -189,86 +229,42 @@ bool Sinking::SinkInstruction(Instruction *Inst, // be careful not to *increase* register pressure though, e.g. sinking // "x = y + z" down if it kills y and z would increase the live ranges of y // and z and only shrink the live range of x. - - // Loop over all the operands of the specified instruction. If there is - // anything we can't handle, bail out. 
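// (Candidate search used by the rewrite below, in outline: first walk the
//  dominator-tree children of Inst's block and take the first child whose
//  immediate dominator is that block and which IsAcceptableTarget accepts;
//  only if that finds nothing are the plain CFG successors scanned.)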
- BasicBlock *ParentBlock = Inst->getParent(); - + // SuccToSinkTo - This is the successor to sink this instruction to, once we // decide. BasicBlock *SuccToSinkTo = 0; - - // FIXME: This picks a successor to sink into based on having one - // successor that dominates all the uses. However, there are cases where - // sinking can happen but where the sink point isn't a successor. For - // example: - // x = computation - // if () {} else {} - // use x - // the instruction could be sunk over the whole diamond for the - // if/then/else (or loop, etc), allowing it to be sunk into other blocks - // after that. - + // Instructions can only be sunk if all their uses are in blocks // dominated by one of the successors. - // Look at all the successors and decide which one - // we should sink to. - for (succ_iterator SI = succ_begin(ParentBlock), - E = succ_end(ParentBlock); SI != E; ++SI) { - if (AllUsesDominatedByBlock(Inst, *SI)) { - SuccToSinkTo = *SI; - break; - } + // Look at all the postdominators and see if we can sink it in one. + DomTreeNode *DTN = DT->getNode(Inst->getParent()); + for (DomTreeNode::iterator I = DTN->begin(), E = DTN->end(); + I != E && SuccToSinkTo == 0; ++I) { + BasicBlock *Candidate = (*I)->getBlock(); + if ((*I)->getIDom()->getBlock() == Inst->getParent() && + IsAcceptableTarget(Inst, Candidate)) + SuccToSinkTo = Candidate; + } + + // If no suitable postdominator was found, look at all the successors and + // decide which one we should sink to, if any. + for (succ_iterator I = succ_begin(Inst->getParent()), + E = succ_end(Inst->getParent()); I != E && SuccToSinkTo == 0; ++I) { + if (IsAcceptableTarget(Inst, *I)) + SuccToSinkTo = *I; } - + // If we couldn't find a block to sink to, ignore this instruction. if (SuccToSinkTo == 0) return false; - - // It is not possible to sink an instruction into its own block. This can - // happen with loops. - if (Inst->getParent() == SuccToSinkTo) - return false; - - DEBUG(dbgs() << "Sink instr " << *Inst); - DEBUG(dbgs() << "to block "; - WriteAsOperand(dbgs(), SuccToSinkTo, false)); - - // If the block has multiple predecessors, this would introduce computation on - // a path that it doesn't already exist. We could split the critical edge, - // but for now we just punt. - // FIXME: Split critical edges if not backedges. - if (SuccToSinkTo->getUniquePredecessor() != ParentBlock) { - // We cannot sink a load across a critical edge - there may be stores in - // other code paths. - if (!isSafeToSpeculativelyExecute(Inst)) { - DEBUG(dbgs() << " *** PUNTING: Wont sink load along critical edge.\n"); - return false; - } - // We don't want to sink across a critical edge if we don't dominate the - // successor. We could be introducing calculations to new code paths. - if (!DT->dominates(ParentBlock, SuccToSinkTo)) { - DEBUG(dbgs() << " *** PUNTING: Critical edge found\n"); - return false; - } - - // Don't sink instructions into a loop. - if (LI->isLoopHeader(SuccToSinkTo)) { - DEBUG(dbgs() << " *** PUNTING: Loop header found\n"); - return false; - } + DEBUG(dbgs() << "Sink" << *Inst << " ("; + WriteAsOperand(dbgs(), Inst->getParent(), false); + dbgs() << " -> "; + WriteAsOperand(dbgs(), SuccToSinkTo, false); + dbgs() << ")\n"); - // Otherwise we are OK with sinking along a critical edge. - DEBUG(dbgs() << "Sinking along critical edge.\n"); - } - - // Determine where to insert into. Skip phi nodes. 
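// (The hand-rolled scan below is replaced by
//    Inst->moveBefore(SuccToSinkTo->getFirstInsertionPt());
//  getFirstInsertionPt() skips leading PHI nodes and also refuses to place
//  anything before a landingpad instruction.)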
- BasicBlock::iterator InsertPos = SuccToSinkTo->begin(); - while (InsertPos != SuccToSinkTo->end() && isa<PHINode>(InsertPos)) - ++InsertPos; - // Move the instruction. - Inst->moveBefore(InsertPos); + Inst->moveBefore(SuccToSinkTo->getFirstInsertionPt()); return true; } diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index e21eb9d..6557d63 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -172,7 +172,7 @@ bool TailCallElim::runOnFunction(Function &F) { FunctionContainsEscapingAllocas |= CheckForEscapingAllocas(BB, CannotTCETailMarkedCall); } - + /// FIXME: The code generator produces really bad code when an 'escaping /// alloca' is changed from being a static alloca to being a dynamic alloca. /// Until this is resolved, disable this transformation if that would ever @@ -234,7 +234,7 @@ bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) { // call does not mod/ref the memory location being processed. if (I->mayHaveSideEffects()) // This also handles volatile loads. return false; - + if (LoadInst *L = dyn_cast<LoadInst>(I)) { // Loads may always be moved above calls without side effects. if (CI->mayHaveSideEffects()) { @@ -364,7 +364,7 @@ TailCallElim::FindTRECandidate(Instruction *TI, if (&BB->front() == TI) // Make sure there is something before the terminator. return 0; - + // Scan backwards from the return, checking to see if there is a tail call in // this block. If so, set CI to it. CallInst *CI = 0; @@ -388,10 +388,10 @@ TailCallElim::FindTRECandidate(Instruction *TI, // double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call // and disable this xform in this case, because the code generator will // lower the call to fabs into inline code. - if (BB == &F->getEntryBlock() && + if (BB == &F->getEntryBlock() && FirstNonDbg(BB->front()) == CI && FirstNonDbg(llvm::next(BB->begin())) == TI && - callIsSmall(F)) { + callIsSmall(CI)) { // A single-block function with just a call and a return. Check that // the arguments match. CallSite::arg_iterator I = CallSite(CI).arg_begin(), @@ -432,7 +432,7 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, BasicBlock::iterator BBI = CI; for (++BBI; &*BBI != Ret; ++BBI) { if (CanMoveAboveCall(BBI, CI)) continue; - + // If we can't move the instruction above the call, it might be because it // is an associative and commutative operation that could be transformed // using accumulator recursion elimination. Check to see if this is the diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp index 3859a1a..5576432 100644 --- a/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -671,12 +671,3 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB, return cast<ReturnInst>(NewRet); } -/// GetFirstDebugLocInBasicBlock - Return first valid DebugLoc entry in a -/// given basic block. -DebugLoc llvm::GetFirstDebugLocInBasicBlock(const BasicBlock *BB) { - if (const Instruction *I = BB->getFirstNonPHI()) - return I->getDebugLoc(); - // Scanning entire block may be too expensive, if the first instruction - // does not have valid location info. 
- return DebugLoc(); -} diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp index f752d79..6b04e3d 100644 --- a/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -117,33 +117,38 @@ bool llvm::isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum, return false; } -/// CreatePHIsForSplitLoopExit - When a loop exit edge is split, LCSSA form +/// createPHIsForSplitLoopExit - When a loop exit edge is split, LCSSA form /// may require new PHIs in the new exit block. This function inserts the -/// new PHIs, as needed. Preds is a list of preds inside the loop, SplitBB +/// new PHIs, as needed. Preds is a list of preds inside the loop, SplitBB /// is the new loop exit block, and DestBB is the old loop exit, now the /// successor of SplitBB. -static void CreatePHIsForSplitLoopExit(SmallVectorImpl<BasicBlock *> &Preds, +static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds, BasicBlock *SplitBB, BasicBlock *DestBB) { // SplitBB shouldn't have anything non-trivial in it yet. - assert(SplitBB->getFirstNonPHI() == SplitBB->getTerminator() && - "SplitBB has non-PHI nodes!"); + assert((SplitBB->getFirstNonPHI() == SplitBB->getTerminator() || + SplitBB->isLandingPad()) && "SplitBB has non-PHI nodes!"); - // For each PHI in the destination block... + // For each PHI in the destination block. for (BasicBlock::iterator I = DestBB->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I) { unsigned Idx = PN->getBasicBlockIndex(SplitBB); Value *V = PN->getIncomingValue(Idx); + // If the input is a PHI which already satisfies LCSSA, don't create // a new one. if (const PHINode *VP = dyn_cast<PHINode>(V)) if (VP->getParent() == SplitBB) continue; + // Otherwise a new PHI is needed. Create one and populate it. - PHINode *NewPN = PHINode::Create(PN->getType(), Preds.size(), "split", - SplitBB->getTerminator()); + PHINode *NewPN = + PHINode::Create(PN->getType(), Preds.size(), "split", + SplitBB->isLandingPad() ? + SplitBB->begin() : SplitBB->getTerminator()); for (unsigned i = 0, e = Preds.size(); i != e; ++i) NewPN->addIncoming(V, Preds[i]); + // Update the original PHI. PN->setIncomingValue(Idx, NewPN); } @@ -168,7 +173,8 @@ static void CreatePHIsForSplitLoopExit(SmallVectorImpl<BasicBlock *> &Preds, /// BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, Pass *P, bool MergeIdenticalEdges, - bool DontDeleteUselessPhis) { + bool DontDeleteUselessPhis, + bool SplitLandingPads) { if (!isCriticalEdge(TI, SuccNum, MergeIdenticalEdges)) return 0; assert(!isa<IndirectBrInst>(TI) && @@ -335,11 +341,8 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, "Split point for loop exit is contained in loop!"); // Update LCSSA form in the newly created exit block. - if (P->mustPreserveAnalysisID(LCSSAID)) { - SmallVector<BasicBlock *, 1> OrigPred; - OrigPred.push_back(TIBB); - CreatePHIsForSplitLoopExit(OrigPred, NewBB, DestBB); - } + if (P->mustPreserveAnalysisID(LCSSAID)) + createPHIsForSplitLoopExit(TIBB, NewBB, DestBB); // For each unique exit block... // FIXME: This code is functionally equivalent to the corresponding @@ -371,10 +374,19 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, // getUniqueExitBlocks above because that depends on LoopSimplify // form, which we're in the process of restoring! 
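// (Two cases are handled below: an ordinary exit block is split with
//  SplitBlockPredecessors, while a landing-pad exit needs
//  SplitLandingPadPredecessors, which produces up to two new blocks; the
//  LCSSA PHIs are then recreated in the first of them, NewBBs[0].)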
if (!Preds.empty() && HasPredOutsideOfLoop) { - BasicBlock *NewExitBB = - SplitBlockPredecessors(Exit, Preds, "split", P); - if (P->mustPreserveAnalysisID(LCSSAID)) - CreatePHIsForSplitLoopExit(Preds, NewExitBB, Exit); + if (!Exit->isLandingPad()) { + BasicBlock *NewExitBB = + SplitBlockPredecessors(Exit, Preds, "split", P); + if (P->mustPreserveAnalysisID(LCSSAID)) + createPHIsForSplitLoopExit(Preds, NewExitBB, Exit); + } else if (SplitLandingPads) { + SmallVector<BasicBlock*, 8> NewBBs; + SplitLandingPadPredecessors(Exit, Preds, + ".split1", ".split2", + P, NewBBs); + if (P->mustPreserveAnalysisID(LCSSAID)) + createPHIsForSplitLoopExit(Preds, NewBBs[0], Exit); + } } } } diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp index a808303..27f7724 100644 --- a/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/lib/Transforms/Utils/BuildLibCalls.cpp @@ -12,18 +12,18 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/BuildLibCalls.h" -#include "llvm/Type.h" #include "llvm/Constants.h" #include "llvm/Function.h" +#include "llvm/IRBuilder.h" +#include "llvm/Intrinsics.h" #include "llvm/Intrinsics.h" #include "llvm/LLVMContext.h" +#include "llvm/LLVMContext.h" #include "llvm/Module.h" -#include "llvm/Support/IRBuilder.h" +#include "llvm/Type.h" +#include "llvm/ADT/SmallString.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/LLVMContext.h" -#include "llvm/Intrinsics.h" -#include "llvm/ADT/SmallString.h" using namespace llvm; @@ -42,7 +42,7 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const TargetData *TD) { Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Constant *StrLen = M->getOrInsertFunction("strlen", AttrListPtr::get(AWI, 2), + Constant *StrLen = M->getOrInsertFunction("strlen", AttrListPtr::get(AWI), TD->getIntPtrType(Context), B.getInt8PtrTy(), NULL); @@ -64,7 +64,7 @@ Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B, Type *I8Ptr = B.getInt8PtrTy(); Type *I32Ty = B.getInt32Ty(); - Constant *StrChr = M->getOrInsertFunction("strchr", AttrListPtr::get(&AWI, 1), + Constant *StrChr = M->getOrInsertFunction("strchr", AttrListPtr::get(AWI), I8Ptr, I8Ptr, I32Ty, NULL); CallInst *CI = B.CreateCall2(StrChr, CastToCStr(Ptr, B), ConstantInt::get(I32Ty, C), "strchr"); @@ -84,7 +84,7 @@ Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Value *StrNCmp = M->getOrInsertFunction("strncmp", AttrListPtr::get(AWI, 3), + Value *StrNCmp = M->getOrInsertFunction("strncmp", AttrListPtr::get(AWI), B.getInt32Ty(), B.getInt8PtrTy(), B.getInt8PtrTy(), @@ -107,7 +107,7 @@ Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B, AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture); AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); Type *I8Ptr = B.getInt8PtrTy(); - Value *StrCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI, 2), + Value *StrCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI), I8Ptr, I8Ptr, I8Ptr, NULL); CallInst *CI = B.CreateCall2(StrCpy, CastToCStr(Dst, B), CastToCStr(Src, B), Name); @@ -125,7 +125,7 @@ Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len, AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture); AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); Type *I8Ptr = B.getInt8PtrTy(); - Value *StrNCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI, 
2), + Value *StrNCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI), I8Ptr, I8Ptr, I8Ptr, Len->getType(), NULL); CallInst *CI = B.CreateCall3(StrNCpy, CastToCStr(Dst, B), CastToCStr(Src, B), @@ -145,7 +145,7 @@ Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize, AWI = AttributeWithIndex::get(~0u, Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); Value *MemCpy = M->getOrInsertFunction("__memcpy_chk", - AttrListPtr::get(&AWI, 1), + AttrListPtr::get(AWI), B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt8PtrTy(), @@ -167,7 +167,7 @@ Value *llvm::EmitMemChr(Value *Ptr, Value *Val, AttributeWithIndex AWI; AWI = AttributeWithIndex::get(~0u, Attribute::ReadOnly | Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Value *MemChr = M->getOrInsertFunction("memchr", AttrListPtr::get(&AWI, 1), + Value *MemChr = M->getOrInsertFunction("memchr", AttrListPtr::get(AWI), B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt32Ty(), @@ -192,7 +192,7 @@ Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2, Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Value *MemCmp = M->getOrInsertFunction("memcmp", AttrListPtr::get(AWI, 3), + Value *MemCmp = M->getOrInsertFunction("memcmp", AttrListPtr::get(AWI), B.getInt32Ty(), B.getInt8PtrTy(), B.getInt8PtrTy(), @@ -260,7 +260,7 @@ void llvm::EmitPutS(Value *Str, IRBuilder<> &B, const TargetData *TD) { AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); - Value *PutS = M->getOrInsertFunction("puts", AttrListPtr::get(AWI, 2), + Value *PutS = M->getOrInsertFunction("puts", AttrListPtr::get(AWI), B.getInt32Ty(), B.getInt8PtrTy(), NULL); @@ -280,7 +280,7 @@ void llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B, AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); Constant *F; if (File->getType()->isPointerTy()) - F = M->getOrInsertFunction("fputc", AttrListPtr::get(AWI, 2), + F = M->getOrInsertFunction("fputc", AttrListPtr::get(AWI), B.getInt32Ty(), B.getInt32Ty(), File->getType(), NULL); @@ -309,7 +309,7 @@ void llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, StringRef FPutsName = TLI->getName(LibFunc::fputs); Constant *F; if (File->getType()->isPointerTy()) - F = M->getOrInsertFunction(FPutsName, AttrListPtr::get(AWI, 3), + F = M->getOrInsertFunction(FPutsName, AttrListPtr::get(AWI), B.getInt32Ty(), B.getInt8PtrTy(), File->getType(), NULL); @@ -337,7 +337,7 @@ void llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, StringRef FWriteName = TLI->getName(LibFunc::fwrite); Constant *F; if (File->getType()->isPointerTy()) - F = M->getOrInsertFunction(FWriteName, AttrListPtr::get(AWI, 3), + F = M->getOrInsertFunction(FWriteName, AttrListPtr::get(AWI), TD->getIntPtrType(Context), B.getInt8PtrTy(), TD->getIntPtrType(Context), diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt index 7f5cb5e..4ff31ca 100644 --- a/lib/Transforms/Utils/CMakeLists.txt +++ b/lib/Transforms/Utils/CMakeLists.txt @@ -29,3 +29,5 @@ add_llvm_library(LLVMTransformUtils Utils.cpp ValueMapper.cpp ) + +add_dependencies(LLVMTransformUtils intrinsics_gen) diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index 20052a4..99237b8 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -15,6 +15,7 @@ #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" 
#include "llvm/DerivedTypes.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" @@ -28,7 +29,6 @@ #include "llvm/Transforms/Utils/ValueMapper.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/ADT/SmallVector.h" #include <map> using namespace llvm; diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp index a0e027b..1dac6b5 100644 --- a/lib/Transforms/Utils/CloneModule.cpp +++ b/lib/Transforms/Utils/CloneModule.cpp @@ -53,7 +53,7 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { I->isConstant(), I->getLinkage(), (Constant*) 0, I->getName(), (GlobalVariable*) 0, - I->isThreadLocal(), + I->getThreadLocalMode(), I->getType()->getAddressSpace()); GV->copyAttributesFrom(I); VMap[I] = GV; diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index e8c0b80..c545cd6 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -13,7 +13,7 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Utils/FunctionUtils.h" +#include "llvm/Transforms/Utils/CodeExtractor.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Instructions.h" @@ -23,6 +23,8 @@ #include "llvm/Pass.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/RegionInfo.h" +#include "llvm/Analysis/RegionIterator.h" #include "llvm/Analysis/Verifier.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Support/CommandLine.h" @@ -43,61 +45,139 @@ static cl::opt<bool> AggregateArgsOpt("aggregate-extracted-args", cl::Hidden, cl::desc("Aggregate arguments to code-extracted functions")); -namespace { - class CodeExtractor { - typedef SetVector<Value*> Values; - SetVector<BasicBlock*> BlocksToExtract; - DominatorTree* DT; - bool AggregateArgs; - unsigned NumExitBlocks; - Type *RetTy; - public: - CodeExtractor(DominatorTree* dt = 0, bool AggArgs = false) - : DT(dt), AggregateArgs(AggArgs||AggregateArgsOpt), NumExitBlocks(~0U) {} - - Function *ExtractCodeRegion(ArrayRef<BasicBlock*> code); - - bool isEligible(ArrayRef<BasicBlock*> code); - - private: - /// definedInRegion - Return true if the specified value is defined in the - /// extracted region. - bool definedInRegion(Value *V) const { - if (Instruction *I = dyn_cast<Instruction>(V)) - if (BlocksToExtract.count(I->getParent())) - return true; - return false; - } +/// \brief Test whether a block is valid for extraction. +static bool isBlockValidForExtraction(const BasicBlock &BB) { + // Landing pads must be in the function where they were inserted for cleanup. + if (BB.isLandingPad()) + return false; - /// definedInCaller - Return true if the specified value is defined in the - /// function being code extracted, but not in the region being extracted. - /// These values must be passed in as live-ins to the function. - bool definedInCaller(Value *V) const { - if (isa<Argument>(V)) return true; - if (Instruction *I = dyn_cast<Instruction>(V)) - if (!BlocksToExtract.count(I->getParent())) - return true; + // Don't hoist code containing allocas, invokes, or vastarts. 
+ for (BasicBlock::const_iterator I = BB.begin(), E = BB.end(); I != E; ++I) { + if (isa<AllocaInst>(I) || isa<InvokeInst>(I)) return false; + if (const CallInst *CI = dyn_cast<CallInst>(I)) + if (const Function *F = CI->getCalledFunction()) + if (F->getIntrinsicID() == Intrinsic::vastart) + return false; + } + + return true; +} + +/// \brief Build a set of blocks to extract if the input blocks are viable. +template <typename IteratorT> +static SetVector<BasicBlock *> buildExtractionBlockSet(IteratorT BBBegin, + IteratorT BBEnd) { + SetVector<BasicBlock *> Result; + + assert(BBBegin != BBEnd); + + // Loop over the blocks, adding them to our set-vector, and aborting with an + // empty set if we encounter invalid blocks. + for (IteratorT I = BBBegin, E = BBEnd; I != E; ++I) { + if (!Result.insert(*I)) + llvm_unreachable("Repeated basic blocks in extraction input"); + + if (!isBlockValidForExtraction(**I)) { + Result.clear(); + return Result; } + } + +#ifndef NDEBUG + for (SetVector<BasicBlock *>::iterator I = llvm::next(Result.begin()), + E = Result.end(); + I != E; ++I) + for (pred_iterator PI = pred_begin(*I), PE = pred_end(*I); + PI != PE; ++PI) + assert(Result.count(*PI) && + "No blocks in this region may have entries from outside the region" + " except for the first block!"); +#endif + + return Result; +} + +/// \brief Helper to call buildExtractionBlockSet with an ArrayRef. +static SetVector<BasicBlock *> +buildExtractionBlockSet(ArrayRef<BasicBlock *> BBs) { + return buildExtractionBlockSet(BBs.begin(), BBs.end()); +} + +/// \brief Helper to call buildExtractionBlockSet with a RegionNode. +static SetVector<BasicBlock *> +buildExtractionBlockSet(const RegionNode &RN) { + if (!RN.isSubRegion()) + // Just a single BasicBlock. + return buildExtractionBlockSet(RN.getNodeAs<BasicBlock>()); - void severSplitPHINodes(BasicBlock *&Header); - void splitReturnBlocks(); - void findInputsOutputs(Values &inputs, Values &outputs); + const Region &R = *RN.getNodeAs<Region>(); - Function *constructFunction(const Values &inputs, - const Values &outputs, - BasicBlock *header, - BasicBlock *newRootNode, BasicBlock *newHeader, - Function *oldFunction, Module *M); + return buildExtractionBlockSet(R.block_begin(), R.block_end()); +} - void moveCodeToFunction(Function *newFunction); +CodeExtractor::CodeExtractor(BasicBlock *BB, bool AggregateArgs) + : DT(0), AggregateArgs(AggregateArgs||AggregateArgsOpt), + Blocks(buildExtractionBlockSet(BB)), NumExitBlocks(~0U) {} + +CodeExtractor::CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT, + bool AggregateArgs) + : DT(DT), AggregateArgs(AggregateArgs||AggregateArgsOpt), + Blocks(buildExtractionBlockSet(BBs)), NumExitBlocks(~0U) {} + +CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs) + : DT(&DT), AggregateArgs(AggregateArgs||AggregateArgsOpt), + Blocks(buildExtractionBlockSet(L.getBlocks())), NumExitBlocks(~0U) {} + +CodeExtractor::CodeExtractor(DominatorTree &DT, const RegionNode &RN, + bool AggregateArgs) + : DT(&DT), AggregateArgs(AggregateArgs||AggregateArgsOpt), + Blocks(buildExtractionBlockSet(RN)), NumExitBlocks(~0U) {} + +/// definedInRegion - Return true if the specified value is defined in the +/// extracted region. 
+static bool definedInRegion(const SetVector<BasicBlock *> &Blocks, Value *V) { + if (Instruction *I = dyn_cast<Instruction>(V)) + if (Blocks.count(I->getParent())) + return true; + return false; +} - void emitCallAndSwitchStatement(Function *newFunction, - BasicBlock *newHeader, - Values &inputs, - Values &outputs); +/// definedInCaller - Return true if the specified value is defined in the +/// function being code extracted, but not in the region being extracted. +/// These values must be passed in as live-ins to the function. +static bool definedInCaller(const SetVector<BasicBlock *> &Blocks, Value *V) { + if (isa<Argument>(V)) return true; + if (Instruction *I = dyn_cast<Instruction>(V)) + if (!Blocks.count(I->getParent())) + return true; + return false; +} - }; +void CodeExtractor::findInputsOutputs(ValueSet &Inputs, + ValueSet &Outputs) const { + for (SetVector<BasicBlock *>::const_iterator I = Blocks.begin(), + E = Blocks.end(); + I != E; ++I) { + BasicBlock *BB = *I; + + // If a used value is defined outside the region, it's an input. If an + // instruction is used outside the region, it's an output. + for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); + II != IE; ++II) { + for (User::op_iterator OI = II->op_begin(), OE = II->op_end(); + OI != OE; ++OI) + if (definedInCaller(Blocks, *OI)) + Inputs.insert(*OI); + + for (Value::use_iterator UI = II->use_begin(), UE = II->use_end(); + UI != UE; ++UI) + if (!definedInRegion(Blocks, *UI)) { + Outputs.insert(II); + break; + } + } + } } /// severSplitPHINodes - If a PHI node has multiple inputs from outside of the @@ -115,7 +195,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { // than one entry from outside the region. If so, we need to sever the // header block into two. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - if (BlocksToExtract.count(PN->getIncomingBlock(i))) + if (Blocks.count(PN->getIncomingBlock(i))) ++NumPredsFromRegion; else ++NumPredsOutsideRegion; @@ -136,8 +216,8 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { // We only want to code extract the second block now, and it becomes the new // header of the region. BasicBlock *OldPred = Header; - BlocksToExtract.remove(OldPred); - BlocksToExtract.insert(NewBB); + Blocks.remove(OldPred); + Blocks.insert(NewBB); Header = NewBB; // Okay, update dominator sets. The blocks that dominate the new one are the @@ -152,7 +232,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { // Loop over all of the predecessors of OldPred that are in the region, // changing them to branch to NewBB instead. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - if (BlocksToExtract.count(PN->getIncomingBlock(i))) { + if (Blocks.count(PN->getIncomingBlock(i))) { TerminatorInst *TI = PN->getIncomingBlock(i)->getTerminator(); TI->replaceUsesOfWith(OldPred, NewBB); } @@ -170,7 +250,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { // Loop over all of the incoming value in PN, moving them to NewPN if they // are from the extracted region. 
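// (The effect, roughly: a header PHI such as
//    %p = phi [ %a, %outside1 ], [ %b, %outside2 ], [ %c, %inside ]
//  keeps its outside entries in the caller-side half of the split header,
//  while a new PHI in the region's new header merges that result with the
//  region-internal entries, so the region is entered along a single edge.)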
for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) { - if (BlocksToExtract.count(PN->getIncomingBlock(i))) { + if (Blocks.count(PN->getIncomingBlock(i))) { NewPN->addIncoming(PN->getIncomingValue(i), PN->getIncomingBlock(i)); PN->removeIncomingValue(i); --i; @@ -181,8 +261,8 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { } void CodeExtractor::splitReturnBlocks() { - for (SetVector<BasicBlock*>::iterator I = BlocksToExtract.begin(), - E = BlocksToExtract.end(); I != E; ++I) + for (SetVector<BasicBlock *>::iterator I = Blocks.begin(), E = Blocks.end(); + I != E; ++I) if (ReturnInst *RI = dyn_cast<ReturnInst>((*I)->getTerminator())) { BasicBlock *New = (*I)->splitBasicBlock(RI, (*I)->getName()+".ret"); if (DT) { @@ -203,45 +283,11 @@ void CodeExtractor::splitReturnBlocks() { } } -// findInputsOutputs - Find inputs to, outputs from the code region. -// -void CodeExtractor::findInputsOutputs(Values &inputs, Values &outputs) { - std::set<BasicBlock*> ExitBlocks; - for (SetVector<BasicBlock*>::const_iterator ci = BlocksToExtract.begin(), - ce = BlocksToExtract.end(); ci != ce; ++ci) { - BasicBlock *BB = *ci; - - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - // If a used value is defined outside the region, it's an input. If an - // instruction is used outside the region, it's an output. - for (User::op_iterator O = I->op_begin(), E = I->op_end(); O != E; ++O) - if (definedInCaller(*O)) - inputs.insert(*O); - - // Consider uses of this instruction (outputs). - for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); - UI != E; ++UI) - if (!definedInRegion(*UI)) { - outputs.insert(I); - break; - } - } // for: insts - - // Keep track of the exit blocks from the region. - TerminatorInst *TI = BB->getTerminator(); - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - if (!BlocksToExtract.count(TI->getSuccessor(i))) - ExitBlocks.insert(TI->getSuccessor(i)); - } // for: basic blocks - - NumExitBlocks = ExitBlocks.size(); -} - /// constructFunction - make a function based on inputs and outputs, as follows: /// f(in0, ..., inN, out0, ..., outN) /// -Function *CodeExtractor::constructFunction(const Values &inputs, - const Values &outputs, +Function *CodeExtractor::constructFunction(const ValueSet &inputs, + const ValueSet &outputs, BasicBlock *header, BasicBlock *newRootNode, BasicBlock *newHeader, @@ -261,15 +307,15 @@ Function *CodeExtractor::constructFunction(const Values &inputs, std::vector<Type*> paramTy; // Add the types of the input values to the function's argument list - for (Values::const_iterator i = inputs.begin(), - e = inputs.end(); i != e; ++i) { + for (ValueSet::const_iterator i = inputs.begin(), e = inputs.end(); + i != e; ++i) { const Value *value = *i; DEBUG(dbgs() << "value used in func: " << *value << "\n"); paramTy.push_back(value->getType()); } // Add the types of the output values to the function's argument list. 
- for (Values::const_iterator I = outputs.begin(), E = outputs.end(); + for (ValueSet::const_iterator I = outputs.begin(), E = outputs.end(); I != E; ++I) { DEBUG(dbgs() << "instr used in func: " << **I << "\n"); if (AggregateArgs) @@ -326,7 +372,7 @@ Function *CodeExtractor::constructFunction(const Values &inputs, for (std::vector<User*>::iterator use = Users.begin(), useE = Users.end(); use != useE; ++use) if (Instruction* inst = dyn_cast<Instruction>(*use)) - if (BlocksToExtract.count(inst->getParent())) + if (Blocks.count(inst->getParent())) inst->replaceUsesOfWith(inputs[i], RewriteVal); } @@ -347,7 +393,7 @@ Function *CodeExtractor::constructFunction(const Values &inputs, // The BasicBlock which contains the branch is not in the region // modify the branch target to a new block if (TerminatorInst *TI = dyn_cast<TerminatorInst>(Users[i])) - if (!BlocksToExtract.count(TI->getParent()) && + if (!Blocks.count(TI->getParent()) && TI->getParent()->getParent() == oldFunction) TI->replaceUsesOfWith(header, newHeader); @@ -373,7 +419,7 @@ static BasicBlock* FindPhiPredForUseInBlock(Value* Used, BasicBlock* BB) { /// necessary. void CodeExtractor:: emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, - Values &inputs, Values &outputs) { + ValueSet &inputs, ValueSet &outputs) { // Emit a call to the new function, passing in: *pointer to struct (if // aggregating parameters), or plan inputs and allocated memory for outputs std::vector<Value*> params, StructValues, ReloadOutputs, Reloads; @@ -381,14 +427,14 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, LLVMContext &Context = newFunction->getContext(); // Add inputs as params, or to be filled into the struct - for (Values::iterator i = inputs.begin(), e = inputs.end(); i != e; ++i) + for (ValueSet::iterator i = inputs.begin(), e = inputs.end(); i != e; ++i) if (AggregateArgs) StructValues.push_back(*i); else params.push_back(*i); // Create allocas for the outputs - for (Values::iterator i = outputs.begin(), e = outputs.end(); i != e; ++i) { + for (ValueSet::iterator i = outputs.begin(), e = outputs.end(); i != e; ++i) { if (AggregateArgs) { StructValues.push_back(*i); } else { @@ -403,7 +449,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, AllocaInst *Struct = 0; if (AggregateArgs && (inputs.size() + outputs.size() > 0)) { std::vector<Type*> ArgTypes; - for (Values::iterator v = StructValues.begin(), + for (ValueSet::iterator v = StructValues.begin(), ve = StructValues.end(); v != ve; ++v) ArgTypes.push_back((*v)->getType()); @@ -458,7 +504,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, std::vector<User*> Users(outputs[i]->use_begin(), outputs[i]->use_end()); for (unsigned u = 0, e = Users.size(); u != e; ++u) { Instruction *inst = cast<Instruction>(Users[u]); - if (!BlocksToExtract.count(inst->getParent())) + if (!Blocks.count(inst->getParent())) inst->replaceUsesOfWith(outputs[i], load); } } @@ -476,11 +522,11 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, std::map<BasicBlock*, BasicBlock*> ExitBlockMap; unsigned switchVal = 0; - for (SetVector<BasicBlock*>::const_iterator i = BlocksToExtract.begin(), - e = BlocksToExtract.end(); i != e; ++i) { + for (SetVector<BasicBlock*>::const_iterator i = Blocks.begin(), + e = Blocks.end(); i != e; ++i) { TerminatorInst *TI = (*i)->getTerminator(); for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - if 
(!BlocksToExtract.count(TI->getSuccessor(i))) { + if (!Blocks.count(TI->getSuccessor(i))) { BasicBlock *OldTarget = TI->getSuccessor(i); // add a new basic block which returns the appropriate value BasicBlock *&NewTarget = ExitBlockMap[OldTarget]; @@ -618,18 +664,19 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, TheSwitch->setCondition(call); TheSwitch->setDefaultDest(TheSwitch->getSuccessor(NumExitBlocks)); // Remove redundant case - TheSwitch->removeCase(SwitchInst::CaseIt(TheSwitch, NumExitBlocks-1)); + SwitchInst::CaseIt ToBeRemoved(TheSwitch, NumExitBlocks-1); + TheSwitch->removeCase(ToBeRemoved); break; } } void CodeExtractor::moveCodeToFunction(Function *newFunction) { - Function *oldFunc = (*BlocksToExtract.begin())->getParent(); + Function *oldFunc = (*Blocks.begin())->getParent(); Function::BasicBlockListType &oldBlocks = oldFunc->getBasicBlockList(); Function::BasicBlockListType &newBlocks = newFunction->getBasicBlockList(); - for (SetVector<BasicBlock*>::const_iterator i = BlocksToExtract.begin(), - e = BlocksToExtract.end(); i != e; ++i) { + for (SetVector<BasicBlock*>::const_iterator i = Blocks.begin(), + e = Blocks.end(); i != e; ++i) { // Delete the basic block from the old function, and the list of blocks oldBlocks.remove(*i); @@ -638,47 +685,15 @@ void CodeExtractor::moveCodeToFunction(Function *newFunction) { } } -/// ExtractRegion - Removes a loop from a function, replaces it with a call to -/// new function. Returns pointer to the new function. -/// -/// algorithm: -/// -/// find inputs and outputs for the region -/// -/// for inputs: add to function as args, map input instr* to arg# -/// for outputs: add allocas for scalars, -/// add to func as args, map output instr* to arg# -/// -/// rewrite func to use argument #s instead of instr* -/// -/// for each scalar output in the function: at every exit, store intermediate -/// computed result back into memory. -/// -Function *CodeExtractor:: -ExtractCodeRegion(ArrayRef<BasicBlock*> code) { - if (!isEligible(code)) +Function *CodeExtractor::extractCodeRegion() { + if (!isEligible()) return 0; - // 1) Find inputs, outputs - // 2) Construct new function - // * Add allocas for defs, pass as args by reference - // * Pass in uses as args - // 3) Move code region, add call instr to func - // - BlocksToExtract.insert(code.begin(), code.end()); - - Values inputs, outputs; + ValueSet inputs, outputs; // Assumption: this is a single-entry code region, and the header is the first // block in the region. - BasicBlock *header = code[0]; - - for (unsigned i = 1, e = code.size(); i != e; ++i) - for (pred_iterator PI = pred_begin(code[i]), E = pred_end(code[i]); - PI != E; ++PI) - assert(BlocksToExtract.count(*PI) && - "No blocks in this region may have entries from outside the region" - " except for the first block!"); + BasicBlock *header = *Blocks.begin(); // If we have to split PHI nodes or the entry block, do so now. severSplitPHINodes(header); @@ -703,6 +718,14 @@ ExtractCodeRegion(ArrayRef<BasicBlock*> code) { // Find inputs to, outputs from the code region. findInputsOutputs(inputs, outputs); + SmallPtrSet<BasicBlock *, 1> ExitBlocks; + for (SetVector<BasicBlock *>::iterator I = Blocks.begin(), E = Blocks.end(); + I != E; ++I) + for (succ_iterator SI = succ_begin(*I), SE = succ_end(*I); SI != SE; ++SI) + if (!Blocks.count(*SI)) + ExitBlocks.insert(*SI); + NumExitBlocks = ExitBlocks.size(); + // Construct new function based on inputs/outputs & add allocas for all defs. 
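// (constructFunction builds the callee as f(in0, ..., inN, out0, ..., outN):
//  each input becomes an argument and each output becomes a pointer
//  argument the callee stores results through; with AggregateArgs set, the
//  same values travel through a single struct pointer instead.)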
Function *newFunction = constructFunction(inputs, outputs, header, newFuncRoot, @@ -718,7 +741,7 @@ ExtractCodeRegion(ArrayRef<BasicBlock*> code) { for (BasicBlock::iterator I = header->begin(); isa<PHINode>(I); ++I) { PHINode *PN = cast<PHINode>(I); for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - if (!BlocksToExtract.count(PN->getIncomingBlock(i))) + if (!Blocks.count(PN->getIncomingBlock(i))) PN->setIncomingBlock(i, newFuncRoot); } @@ -732,7 +755,7 @@ ExtractCodeRegion(ArrayRef<BasicBlock*> code) { PHINode *PN = cast<PHINode>(I); std::set<BasicBlock*> ProcessedPreds; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - if (BlocksToExtract.count(PN->getIncomingBlock(i))) { + if (Blocks.count(PN->getIncomingBlock(i))) { if (ProcessedPreds.insert(PN->getIncomingBlock(i)).second) PN->setIncomingBlock(i, codeReplacer); else { @@ -754,44 +777,3 @@ ExtractCodeRegion(ArrayRef<BasicBlock*> code) { report_fatal_error("verifyFunction failed!")); return newFunction; } - -bool CodeExtractor::isEligible(ArrayRef<BasicBlock*> code) { - // Deny a single basic block that's a landing pad block. - if (code.size() == 1 && code[0]->isLandingPad()) - return false; - - // Deny code region if it contains allocas or vastarts. - for (ArrayRef<BasicBlock*>::iterator BB = code.begin(), e=code.end(); - BB != e; ++BB) - for (BasicBlock::const_iterator I = (*BB)->begin(), Ie = (*BB)->end(); - I != Ie; ++I) - if (isa<AllocaInst>(*I)) - return false; - else if (const CallInst *CI = dyn_cast<CallInst>(I)) - if (const Function *F = CI->getCalledFunction()) - if (F->getIntrinsicID() == Intrinsic::vastart) - return false; - return true; -} - - -/// ExtractCodeRegion - Slurp a sequence of basic blocks into a brand new -/// function. -/// -Function* llvm::ExtractCodeRegion(DominatorTree &DT, - ArrayRef<BasicBlock*> code, - bool AggregateArgs) { - return CodeExtractor(&DT, AggregateArgs).ExtractCodeRegion(code); -} - -/// ExtractLoop - Slurp a natural loop into a brand new function. -/// -Function* llvm::ExtractLoop(DominatorTree &DT, Loop *L, bool AggregateArgs) { - return CodeExtractor(&DT, AggregateArgs).ExtractCodeRegion(L->getBlocks()); -} - -/// ExtractBasicBlock - Slurp a basic block into a brand new function. 
-/// -Function* llvm::ExtractBasicBlock(ArrayRef<BasicBlock*> BBs, bool AggregateArgs){ - return CodeExtractor(0, AggregateArgs).ExtractCodeRegion(BBs); -} diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index d2b167a..89e89e7 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -13,22 +13,22 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Attributes.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" -#include "llvm/Module.h" +#include "llvm/IRBuilder.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/Intrinsics.h" -#include "llvm/Attributes.h" +#include "llvm/Module.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/CallGraph.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Support/CallSite.h" #include "llvm/Target/TargetData.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Support/CallSite.h" -#include "llvm/Support/IRBuilder.h" using namespace llvm; bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI, @@ -43,10 +43,10 @@ bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI, namespace { /// A class for recording information about inlining through an invoke. class InvokeInliningInfo { - BasicBlock *OuterResumeDest; //< Destination of the invoke's unwind. - BasicBlock *InnerResumeDest; //< Destination for the callee's resume. - LandingPadInst *CallerLPad; //< LandingPadInst associated with the invoke. - PHINode *InnerEHValuesPHI; //< PHI for EH values from landingpad insts. + BasicBlock *OuterResumeDest; ///< Destination of the invoke's unwind. + BasicBlock *InnerResumeDest; ///< Destination for the callee's resume. + LandingPadInst *CallerLPad; ///< LandingPadInst associated with the invoke. + PHINode *InnerEHValuesPHI; ///< PHI for EH values from landingpad insts. 
SmallVector<Value*, 8> UnwindDestPHIValues; public: diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index d1c4d59..bed7d72 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -14,31 +14,31 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Constants.h" +#include "llvm/DIBuilder.h" +#include "llvm/DebugInfo.h" +#include "llvm/DerivedTypes.h" #include "llvm/GlobalAlias.h" #include "llvm/GlobalVariable.h" -#include "llvm/DerivedTypes.h" +#include "llvm/IRBuilder.h" #include "llvm/Instructions.h" -#include "llvm/Intrinsics.h" #include "llvm/IntrinsicInst.h" +#include "llvm/Intrinsics.h" #include "llvm/Metadata.h" #include "llvm/Operator.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Analysis/DebugInfo.h" -#include "llvm/Analysis/DIBuilder.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ProfileInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Target/TargetData.h" #include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GetElementPtrTypeIterator.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" using namespace llvm; //===----------------------------------------------------------------------===// @@ -169,16 +169,21 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) { // Otherwise, we can fold this switch into a conditional branch // instruction if it has only one non-default destination. SwitchInst::CaseIt FirstCase = SI->case_begin(); - Value *Cond = Builder.CreateICmpEQ(SI->getCondition(), - FirstCase.getCaseValue(), "cond"); - - // Insert the new branch. - Builder.CreateCondBr(Cond, FirstCase.getCaseSuccessor(), - SI->getDefaultDest()); - - // Delete the old switch. - SI->eraseFromParent(); - return true; + IntegersSubset& Case = FirstCase.getCaseValueEx(); + if (Case.isSingleNumber()) { + // FIXME: Currently work with ConstantInt based numbers. + Value *Cond = Builder.CreateICmpEQ(SI->getCondition(), + Case.getSingleNumber(0).toConstantInt(), + "cond"); + + // Insert the new branch. + Builder.CreateCondBr(Cond, FirstCase.getCaseSuccessor(), + SI->getDefaultDest()); + + // Delete the old switch. + SI->eraseFromParent(); + return true; + } } return false; } @@ -260,7 +265,7 @@ bool llvm::isInstructionTriviallyDead(Instruction *I) { return isa<UndefValue>(II->getArgOperand(1)); } - if (extractMallocCall(I)) return true; + if (isAllocLikeFn(I)) return true; if (CallInst *CI = isFreeCall(I)) if (Constant *C = dyn_cast<Constant>(CI->getArgOperand(0))) @@ -700,7 +705,7 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) { CollisionMap[PN] = Old; break; } - // Procede to the next PHI in the list. + // Proceed to the next PHI in the list. OtherPN = I->second; } } diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index e15497a..2023750 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -95,9 +95,11 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, // Erase basic block from the function... // ScalarEvolution holds references to loop exit blocks. 
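// (ScalarEvolution caches backedge-taken counts and similar data that
//  refer to these blocks, so the loop must be forgotten before a block is
//  erased. The LPM guard added below also lets this utility run without a
//  LoopPassManager; with a null LPM there is no ScalarEvolution to fetch,
//  and no invalidation is attempted.)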
- if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>()) { - if (Loop *L = LI->getLoopFor(BB)) - SE->forgetLoop(L); + if (LPM) { + if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>()) { + if (Loop *L = LI->getLoopFor(BB)) + SE->forgetLoop(L); + } } LI->removeBlock(BB); BB->eraseFromParent(); @@ -204,9 +206,11 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, // Notify ScalarEvolution that the loop will be substantially changed, // if not outright eliminated. - ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); - if (SE) - SE->forgetLoop(L); + if (LPM) { + ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); + if (SE) + SE->forgetLoop(L); + } // If we know the trip count, we know the multiple... unsigned BreakoutTrip = 0; @@ -405,24 +409,26 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, } } - // FIXME: Reconstruct dom info, because it is not preserved properly. - // Incrementally updating domtree after loop unrolling would be easy. - if (DominatorTree *DT = LPM->getAnalysisIfAvailable<DominatorTree>()) - DT->runOnFunction(*L->getHeader()->getParent()); - - // Simplify any new induction variables in the partially unrolled loop. - if (SE && !CompletelyUnroll) { - SmallVector<WeakVH, 16> DeadInsts; - simplifyLoopIVs(L, SE, LPM, DeadInsts); - - // Aggressively clean up dead instructions that simplifyLoopIVs already - // identified. Any remaining should be cleaned up below. - while (!DeadInsts.empty()) - if (Instruction *Inst = - dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) - RecursivelyDeleteTriviallyDeadInstructions(Inst); + if (LPM) { + // FIXME: Reconstruct dom info, because it is not preserved properly. + // Incrementally updating domtree after loop unrolling would be easy. + if (DominatorTree *DT = LPM->getAnalysisIfAvailable<DominatorTree>()) + DT->runOnFunction(*L->getHeader()->getParent()); + + // Simplify any new induction variables in the partially unrolled loop. + ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); + if (SE && !CompletelyUnroll) { + SmallVector<WeakVH, 16> DeadInsts; + simplifyLoopIVs(L, SE, LPM, DeadInsts); + + // Aggressively clean up dead instructions that simplifyLoopIVs already + // identified. Any remaining should be cleaned up below. + while (!DeadInsts.empty()) + if (Instruction *Inst = + dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) + RecursivelyDeleteTriviallyDeadInstructions(Inst); + } } - // At this point, the code is well formed. We now do a quick sweep over the // inserted code, doing constant propagation and dead code elimination as we // go. diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 3aa6bef..67e17f4 100644 --- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -131,7 +131,7 @@ static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count, /// There are two value maps that are defined and used. VMap is /// for the values in the current loop instance. LVMap contains /// the values from the last loop instance. We need the LVMap values -/// to update the inital values for the current loop instance. +/// to update the initial values for the current loop instance. /// static void CloneLoopBlocks(Loop *L, bool FirstCopy, @@ -237,6 +237,8 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, // Use Scalar Evolution to compute the trip count. 
This allows more // loops to be unrolled than relying on induction var simplification + if (!LPM) + return false; ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); if (SE == 0) return false; diff --git a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp index c70ced1..02bdcda 100644 --- a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp +++ b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp @@ -12,18 +12,19 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "lower-expect-intrinsic" +#include "llvm/BasicBlock.h" #include "llvm/Constants.h" #include "llvm/Function.h" -#include "llvm/BasicBlock.h" -#include "llvm/LLVMContext.h" #include "llvm/Instructions.h" #include "llvm/Intrinsics.h" +#include "llvm/LLVMContext.h" +#include "llvm/MDBuilder.h" #include "llvm/Metadata.h" #include "llvm/Pass.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/ADT/Statistic.h" #include <vector> using namespace llvm; @@ -70,24 +71,18 @@ bool LowerExpectIntrinsic::HandleSwitchExpect(SwitchInst *SI) { if (!ExpectedValue) return false; - LLVMContext &Context = CI->getContext(); - Type *Int32Ty = Type::getInt32Ty(Context); - SwitchInst::CaseIt Case = SI->findCaseValue(ExpectedValue); - std::vector<Value *> Vec; - unsigned n = SI->getNumCases(); - Vec.resize(n + 1 + 1); // +1 for MDString and +1 for default case - - Vec[0] = MDString::get(Context, "branch_weights"); - Vec[1] = ConstantInt::get(Int32Ty, Case == SI->case_default() ? - LikelyBranchWeight : UnlikelyBranchWeight); - for (unsigned i = 0; i < n; ++i) { - Vec[i + 1 + 1] = ConstantInt::get(Int32Ty, i == Case.getCaseIndex() ? - LikelyBranchWeight : UnlikelyBranchWeight); - } + unsigned n = SI->getNumCases(); // +1 for default case. + std::vector<uint32_t> Weights(n + 1); - MDNode *WeightsNode = llvm::MDNode::get(Context, Vec); - SI->setMetadata(LLVMContext::MD_prof, WeightsNode); + Weights[0] = Case == SI->case_default() ? LikelyBranchWeight + : UnlikelyBranchWeight; + for (unsigned i = 0; i != n; ++i) + Weights[i + 1] = i == Case.getCaseIndex() ? LikelyBranchWeight + : UnlikelyBranchWeight; + + SI->setMetadata(LLVMContext::MD_prof, + MDBuilder(CI->getContext()).createBranchWeights(Weights)); SI->setCondition(ArgValue); return true; @@ -120,20 +115,17 @@ bool LowerExpectIntrinsic::HandleIfExpect(BranchInst *BI) { if (!ExpectedValue) return false; - LLVMContext &Context = CI->getContext(); - Type *Int32Ty = Type::getInt32Ty(Context); - bool Likely = ExpectedValue->isOne(); + MDBuilder MDB(CI->getContext()); + MDNode *Node; // If expect value is equal to 1 it means that we are more likely to take // branch 0, in other case more likely is branch 1. - Value *Ops[] = { - MDString::get(Context, "branch_weights"), - ConstantInt::get(Int32Ty, Likely ? LikelyBranchWeight : UnlikelyBranchWeight), - ConstantInt::get(Int32Ty, Likely ? 
UnlikelyBranchWeight : LikelyBranchWeight) - }; + if (ExpectedValue->isOne()) + Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight); + else + Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight); - MDNode *WeightsNode = MDNode::get(Context, Ops); - BI->setMetadata(LLVMContext::MD_prof, WeightsNode); + BI->setMetadata(LLVMContext::MD_prof, Node); CmpI->setOperand(0, ArgValue); return true; diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp index a16130d..1547439 100644 --- a/lib/Transforms/Utils/LowerSwitch.cpp +++ b/lib/Transforms/Utils/LowerSwitch.cpp @@ -66,18 +66,6 @@ namespace { BasicBlock* OrigBlock, BasicBlock* Default); unsigned Clusterify(CaseVector& Cases, SwitchInst *SI); }; - - /// The comparison function for sorting the switch case values in the vector. - /// WARNING: Case ranges should be disjoint! - struct CaseCmp { - bool operator () (const LowerSwitch::CaseRange& C1, - const LowerSwitch::CaseRange& C2) { - - const ConstantInt* CI1 = cast<const ConstantInt>(C1.Low); - const ConstantInt* CI2 = cast<const ConstantInt>(C2.High); - return CI1->getValue().slt(CI2->getValue()); - } - }; } char LowerSwitch::ID = 0; @@ -159,7 +147,7 @@ BasicBlock* LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, Function::iterator FI = OrigBlock; F->getBasicBlockList().insert(++FI, NewNode); - ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_SLT, + ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_ULT, Val, Pivot.Low, "Pivot"); NewNode->getInstList().push_back(Comp); BranchInst::Create(LBranch, RBranch, Comp, NewNode); @@ -234,40 +222,34 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val, // Clusterify - Transform simple list of Cases into list of CaseRange's unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) { - unsigned numCmps = 0; + + IntegersSubsetToBB TheClusterifier; // Start with "simple" cases - for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) - Cases.push_back(CaseRange(i.getCaseValue(), i.getCaseValue(), - i.getCaseSuccessor())); + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) { + BasicBlock *SuccBB = i.getCaseSuccessor(); + IntegersSubset CaseRanges = i.getCaseValueEx(); + TheClusterifier.add(CaseRanges, SuccBB); + } - std::sort(Cases.begin(), Cases.end(), CaseCmp()); - - // Merge case into clusters - if (Cases.size()>=2) - for (CaseItr I=Cases.begin(), J=llvm::next(Cases.begin()); J!=Cases.end(); ) { - int64_t nextValue = cast<ConstantInt>(J->Low)->getSExtValue(); - int64_t currentValue = cast<ConstantInt>(I->High)->getSExtValue(); - BasicBlock* nextBB = J->BB; - BasicBlock* currentBB = I->BB; - - // If the two neighboring cases go to the same destination, merge them - // into a single case. - if ((nextValue-currentValue==1) && (currentBB == nextBB)) { - I->High = J->High; - J = Cases.erase(J); - } else { - I = J++; - } - } - - for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) { - if (I->Low != I->High) + TheClusterifier.optimize(); + + size_t numCmps = 0; + for (IntegersSubsetToBB::RangeIterator i = TheClusterifier.begin(), + e = TheClusterifier.end(); i != e; ++i, ++numCmps) { + IntegersSubsetToBB::Cluster &C = *i; + + // FIXME: Currently work with ConstantInt based numbers. + // Changing it to APInt based is pretty heavy for this commit.
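The LowerExpectIntrinsic hunks above replace hand-built branch_weights nodes (an MDString plus ConstantInt operands) with MDBuilder. A minimal sketch of the resulting idiom, with illustrative weights:

    MDBuilder MDB(BI->getContext());
    // The first weight belongs to successor 0 (the edge the hint favors).
    BI->setMetadata(LLVMContext::MD_prof,
                    MDB.createBranchWeights(/*TrueWeight=*/64,
                                            /*FalseWeight=*/4));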
+ Cases.push_back(CaseRange(C.first.getLow().toConstantInt(), + C.first.getHigh().toConstantInt(), C.second)); + if (C.first.isSingleNumber()) // A range counts double, since it requires two compares. ++numCmps; } - return numCmps; + return numCmps; } // processSwitchInst - Replace the specified switch instruction with a sequence diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp index 8491c55..dbcf3b2 100644 --- a/lib/Transforms/Utils/ModuleUtils.cpp +++ b/lib/Transforms/Utils/ModuleUtils.cpp @@ -14,8 +14,8 @@ #include "llvm/Transforms/Utils/ModuleUtils.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" +#include "llvm/IRBuilder.h" #include "llvm/Module.h" -#include "llvm/Support/IRBuilder.h" using namespace llvm; diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 2357d81..dd5e20e 100644 --- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -28,14 +28,14 @@ #define DEBUG_TYPE "mem2reg" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" +#include "llvm/DIBuilder.h" #include "llvm/Function.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/Metadata.h" #include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/DebugInfo.h" -#include "llvm/Analysis/DIBuilder.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ValueTracking.h" diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp index e60a41b..b3f5289 100644 --- a/lib/Transforms/Utils/SSAUpdater.cpp +++ b/lib/Transforms/Utils/SSAUpdater.cpp @@ -190,8 +190,11 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { return V; } - // Set DebugLoc. - InsertedPHI->setDebugLoc(GetFirstDebugLocInBasicBlock(BB)); + // Set the DebugLoc of the inserted PHI, if available. + DebugLoc DL; + if (const Instruction *I = BB->getFirstNonPHI()) + DL = I->getDebugLoc(); + InsertedPHI->setDebugLoc(DL); // If the client wants to know about all new instructions, tell it. if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI); @@ -230,28 +233,6 @@ void SSAUpdater::RewriteUseAfterInsertions(Use &U) { U.set(V); } -/// PHIiter - Iterator for PHI operands. This is used for the PHI_iterator -/// in the SSAUpdaterImpl template. -namespace { - class PHIiter { - private: - PHINode *PHI; - unsigned idx; - - public: - explicit PHIiter(PHINode *P) // begin iterator - : PHI(P), idx(0) {} - PHIiter(PHINode *P, bool) // end iterator - : PHI(P), idx(PHI->getNumIncomingValues()) {} - - PHIiter &operator++() { ++idx; return *this; } - bool operator==(const PHIiter& x) const { return idx == x.idx; } - bool operator!=(const PHIiter& x) const { return !operator==(x); } - Value *getIncomingValue() { return PHI->getIncomingValue(idx); } - BasicBlock *getIncomingBlock() { return PHI->getIncomingBlock(idx); } - }; -} - /// SSAUpdaterTraits<SSAUpdater> - Traits for the SSAUpdaterImpl template, /// specialized for SSAUpdater. 
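The SSAUpdater.cpp hunk below folds the former file-local PHIiter into SSAUpdaterTraits<SSAUpdater> as a nested PHI_iterator, removing the anonymous-namespace indirection. A usage sketch under that naming, where PN is an assumed PHINode*:

    typedef SSAUpdaterTraits<SSAUpdater>::PHI_iterator PHI_iterator;
    for (PHI_iterator I(PN), E(PN, true); I != E; ++I)
      dbgs() << I.getIncomingBlock()->getName() << " -> "
             << *I.getIncomingValue() << "\n";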
namespace llvm { @@ -266,9 +247,26 @@ public: static BlkSucc_iterator BlkSucc_begin(BlkT *BB) { return succ_begin(BB); } static BlkSucc_iterator BlkSucc_end(BlkT *BB) { return succ_end(BB); } - typedef PHIiter PHI_iterator; - static inline PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); } - static inline PHI_iterator PHI_end(PhiT *PHI) { + class PHI_iterator { + private: + PHINode *PHI; + unsigned idx; + + public: + explicit PHI_iterator(PHINode *P) // begin iterator + : PHI(P), idx(0) {} + PHI_iterator(PHINode *P, bool) // end iterator + : PHI(P), idx(PHI->getNumIncomingValues()) {} + + PHI_iterator &operator++() { ++idx; return *this; } + bool operator==(const PHI_iterator& x) const { return idx == x.idx; } + bool operator!=(const PHI_iterator& x) const { return !operator==(x); } + Value *getIncomingValue() { return PHI->getIncomingValue(idx); } + BasicBlock *getIncomingBlock() { return PHI->getIncomingBlock(idx); } + }; + + static PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); } + static PHI_iterator PHI_end(PhiT *PHI) { return PHI_iterator(PHI, true); } diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 66dd2c9..518df7c 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -16,29 +16,30 @@ #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/GlobalVariable.h" +#include "llvm/IRBuilder.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" +#include "llvm/MDBuilder.h" #include "llvm/Metadata.h" #include "llvm/Operator.h" #include "llvm/Type.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ConstantRange.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/NoFolder.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include <algorithm> #include <set> #include <map> @@ -55,12 +56,26 @@ DupRet("simplifycfg-dup-ret", cl::Hidden, cl::init(false), STATISTIC(NumSpeculations, "Number of speculative executed instructions"); namespace { + /// ValueEqualityComparisonCase - Represents a case of a switch. + struct ValueEqualityComparisonCase { + ConstantInt *Value; + BasicBlock *Dest; + + ValueEqualityComparisonCase(ConstantInt *Value, BasicBlock *Dest) + : Value(Value), Dest(Dest) {} + + bool operator<(ValueEqualityComparisonCase RHS) const { + // Comparing pointers is ok as we only rely on the order for uniquing. 
+ return Value < RHS.Value; + } + }; + class SimplifyCFGOpt { const TargetData *const TD; Value *isValueEqualityComparison(TerminatorInst *TI); BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI, - std::vector<std::pair<ConstantInt*, BasicBlock*> > &Cases); + std::vector<ValueEqualityComparisonCase> &Cases); bool SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, BasicBlock *Pred, IRBuilder<> &Builder); @@ -107,6 +122,47 @@ static bool SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2) { return true; } +/// isProfitableToFoldUnconditional - Return true if it is safe and profitable +/// to merge these two terminator instructions together, where SI1 is an +/// unconditional branch. PhiNodes will store all PHI nodes in common +/// successors. +/// +static bool isProfitableToFoldUnconditional(BranchInst *SI1, + BranchInst *SI2, + Instruction *Cond, + SmallVectorImpl<PHINode*> &PhiNodes) { + if (SI1 == SI2) return false; // Can't merge with self! + assert(SI1->isUnconditional() && SI2->isConditional()); + + // We fold the unconditional branch if we can easily update all PHI nodes in + // common successors: + // 1> We have a constant incoming value for the conditional branch; + // 2> We have "Cond" as the incoming value for the unconditional branch; + // 3> SI2->getCondition() and Cond have same operands. + CmpInst *Ci2 = dyn_cast<CmpInst>(SI2->getCondition()); + if (!Ci2) return false; + if (!(Cond->getOperand(0) == Ci2->getOperand(0) && + Cond->getOperand(1) == Ci2->getOperand(1)) && + !(Cond->getOperand(0) == Ci2->getOperand(1) && + Cond->getOperand(1) == Ci2->getOperand(0))) + return false; + + BasicBlock *SI1BB = SI1->getParent(); + BasicBlock *SI2BB = SI2->getParent(); + SmallPtrSet<BasicBlock*, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB)); + for (succ_iterator I = succ_begin(SI2BB), E = succ_end(SI2BB); I != E; ++I) + if (SI1Succs.count(*I)) + for (BasicBlock::iterator BBI = (*I)->begin(); + isa<PHINode>(BBI); ++BBI) { + PHINode *PN = cast<PHINode>(BBI); + if (PN->getIncomingValueForBlock(SI1BB) != Cond || + !isa<ConstantInt>(PN->getIncomingValueForBlock(SI2BB))) + return false; + PhiNodes.push_back(PN); + } + return true; +} + /// AddPredecessorToBlock - Update PHI nodes in Succ to indicate that there will /// now be entries in it from the 'NewPred' block. The values that will be /// flowing into the PHI nodes will be the same as those coming in from @@ -476,21 +532,22 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) { /// decode all of the 'cases' that it represents and return the 'default' block. 
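The SimplifyCFG.cpp hunks below thread the new ValueEqualityComparisonCase struct through every place that previously used std::pair<ConstantInt*, BasicBlock*>. Because the struct carries its own operator< (pointer order, used only for uniquing), the existing sort-based helpers keep working unchanged. A sketch, with CI and Succ assumed:

    std::vector<ValueEqualityComparisonCase> Cases;
    Cases.push_back(ValueEqualityComparisonCase(CI, Succ));
    array_pod_sort(Cases.begin(), Cases.end()); // uses the struct's operator<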
BasicBlock *SimplifyCFGOpt:: GetValueEqualityComparisonCases(TerminatorInst *TI, - std::vector<std::pair<ConstantInt*, - BasicBlock*> > &Cases) { + std::vector<ValueEqualityComparisonCase> + &Cases) { if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { Cases.reserve(SI->getNumCases()); for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) - Cases.push_back(std::make_pair(i.getCaseValue(), - i.getCaseSuccessor())); + Cases.push_back(ValueEqualityComparisonCase(i.getCaseValue(), + i.getCaseSuccessor())); return SI->getDefaultDest(); } BranchInst *BI = cast<BranchInst>(TI); ICmpInst *ICI = cast<ICmpInst>(BI->getCondition()); - Cases.push_back(std::make_pair(GetConstantInt(ICI->getOperand(1), TD), - BI->getSuccessor(ICI->getPredicate() == - ICmpInst::ICMP_NE))); + BasicBlock *Succ = BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_NE); + Cases.push_back(ValueEqualityComparisonCase(GetConstantInt(ICI->getOperand(1), + TD), + Succ)); return BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_EQ); } @@ -498,9 +555,9 @@ GetValueEqualityComparisonCases(TerminatorInst *TI, /// EliminateBlockCases - Given a vector of bb/value pairs, remove any entries /// in the list that match the specified block. static void EliminateBlockCases(BasicBlock *BB, - std::vector<std::pair<ConstantInt*, BasicBlock*> > &Cases) { + std::vector<ValueEqualityComparisonCase> &Cases) { for (unsigned i = 0, e = Cases.size(); i != e; ++i) - if (Cases[i].second == BB) { + if (Cases[i].Dest == BB) { Cases.erase(Cases.begin()+i); --i; --e; } @@ -509,9 +566,9 @@ static void EliminateBlockCases(BasicBlock *BB, /// ValuesOverlap - Return true if there are any keys in C1 that exist in C2 as /// well. static bool -ValuesOverlap(std::vector<std::pair<ConstantInt*, BasicBlock*> > &C1, - std::vector<std::pair<ConstantInt*, BasicBlock*> > &C2) { - std::vector<std::pair<ConstantInt*, BasicBlock*> > *V1 = &C1, *V2 = &C2; +ValuesOverlap(std::vector<ValueEqualityComparisonCase> &C1, + std::vector<ValueEqualityComparisonCase > &C2) { + std::vector<ValueEqualityComparisonCase> *V1 = &C1, *V2 = &C2; // Make V1 be smaller than V2. if (V1->size() > V2->size()) @@ -520,9 +577,9 @@ ValuesOverlap(std::vector<std::pair<ConstantInt*, BasicBlock*> > &C1, if (V1->size() == 0) return false; if (V1->size() == 1) { // Just scan V2. - ConstantInt *TheVal = (*V1)[0].first; + ConstantInt *TheVal = (*V1)[0].Value; for (unsigned i = 0, e = V2->size(); i != e; ++i) - if (TheVal == (*V2)[i].first) + if (TheVal == (*V2)[i].Value) return true; } @@ -531,9 +588,9 @@ ValuesOverlap(std::vector<std::pair<ConstantInt*, BasicBlock*> > &C1, array_pod_sort(V2->begin(), V2->end()); unsigned i1 = 0, i2 = 0, e1 = V1->size(), e2 = V2->size(); while (i1 != e1 && i2 != e2) { - if ((*V1)[i1].first == (*V2)[i2].first) + if ((*V1)[i1].Value == (*V2)[i2].Value) return true; - if ((*V1)[i1].first < (*V2)[i2].first) + if ((*V1)[i1].Value < (*V2)[i2].Value) ++i1; else ++i2; @@ -559,13 +616,13 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, if (ThisVal != PredVal) return false; // Different predicates. // Find out information about when control will move from Pred to TI's block. - std::vector<std::pair<ConstantInt*, BasicBlock*> > PredCases; + std::vector<ValueEqualityComparisonCase> PredCases; BasicBlock *PredDef = GetValueEqualityComparisonCases(Pred->getTerminator(), PredCases); EliminateBlockCases(PredDef, PredCases); // Remove default from cases. // Find information about how control leaves this block. 
- std::vector<std::pair<ConstantInt*, BasicBlock*> > ThisCases; + std::vector<ValueEqualityComparisonCase> ThisCases; BasicBlock *ThisDef = GetValueEqualityComparisonCases(TI, ThisCases); EliminateBlockCases(ThisDef, ThisCases); // Remove default from cases. @@ -587,7 +644,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, (void) NI; // Remove PHI node entries for the dead edge. - ThisCases[0].second->removePredecessor(TI->getParent()); + ThisCases[0].Dest->removePredecessor(TI->getParent()); DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator() << "Through successor TI: " << *TI << "Leaving: " << *NI << "\n"); @@ -600,7 +657,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, // Okay, TI has cases that are statically dead, prune them away. SmallPtrSet<Constant*, 16> DeadCases; for (unsigned i = 0, e = PredCases.size(); i != e; ++i) - DeadCases.insert(PredCases[i].first); + DeadCases.insert(PredCases[i].Value); DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator() << "Through successor TI: " << *TI); @@ -622,10 +679,10 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, ConstantInt *TIV = 0; BasicBlock *TIBB = TI->getParent(); for (unsigned i = 0, e = PredCases.size(); i != e; ++i) - if (PredCases[i].second == TIBB) { + if (PredCases[i].Dest == TIBB) { if (TIV != 0) return false; // Cannot handle multiple values coming to this block. - TIV = PredCases[i].first; + TIV = PredCases[i].Value; } assert(TIV && "No edge from pred to succ?"); @@ -633,8 +690,8 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, // BB. Find out which successor will unconditionally be branched to. BasicBlock *TheRealDest = 0; for (unsigned i = 0, e = ThisCases.size(); i != e; ++i) - if (ThisCases[i].first == TIV) { - TheRealDest = ThisCases[i].second; + if (ThisCases[i].Value == TIV) { + TheRealDest = ThisCases[i].Dest; break; } @@ -702,10 +759,10 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, if (PCV == CV && SafeToMergeTerminators(TI, PTI)) { // Figure out which 'cases' to copy from SI to PSI. - std::vector<std::pair<ConstantInt*, BasicBlock*> > BBCases; + std::vector<ValueEqualityComparisonCase> BBCases; BasicBlock *BBDefault = GetValueEqualityComparisonCases(TI, BBCases); - std::vector<std::pair<ConstantInt*, BasicBlock*> > PredCases; + std::vector<ValueEqualityComparisonCase> PredCases; BasicBlock *PredDefault = GetValueEqualityComparisonCases(PTI, PredCases); // Based on whether the default edge from PTI goes to BB or not, fill in @@ -718,8 +775,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // that don't occur in PTI, or that branch to BB will be activated. std::set<ConstantInt*, ConstantIntOrdering> PTIHandled; for (unsigned i = 0, e = PredCases.size(); i != e; ++i) - if (PredCases[i].second != BB) - PTIHandled.insert(PredCases[i].first); + if (PredCases[i].Dest != BB) + PTIHandled.insert(PredCases[i].Value); else { // The default destination is BB, we don't need explicit targets. 
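The context that follows completes the O(1) swap-and-pop erase used on both sides of this hunk instead of vector::erase; spelled out:

    std::swap(PredCases[i], PredCases.back()); // move the victim to the end
    PredCases.pop_back();                      // O(1) removal, order not kept
    --i; --e;                                  // revisit the element now in slot i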
std::swap(PredCases[i], PredCases.back()); @@ -734,10 +791,10 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, NewSuccessors.push_back(BBDefault); } for (unsigned i = 0, e = BBCases.size(); i != e; ++i) - if (!PTIHandled.count(BBCases[i].first) && - BBCases[i].second != BBDefault) { + if (!PTIHandled.count(BBCases[i].Value) && + BBCases[i].Dest != BBDefault) { PredCases.push_back(BBCases[i]); - NewSuccessors.push_back(BBCases[i].second); + NewSuccessors.push_back(BBCases[i].Dest); } } else { @@ -746,8 +803,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // activated. std::set<ConstantInt*, ConstantIntOrdering> PTIHandled; for (unsigned i = 0, e = PredCases.size(); i != e; ++i) - if (PredCases[i].second == BB) { - PTIHandled.insert(PredCases[i].first); + if (PredCases[i].Dest == BB) { + PTIHandled.insert(PredCases[i].Value); std::swap(PredCases[i], PredCases.back()); PredCases.pop_back(); --i; --e; @@ -756,11 +813,11 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // Okay, now we know which constants were sent to BB from the // predecessor. Figure out where they will all go now. for (unsigned i = 0, e = BBCases.size(); i != e; ++i) - if (PTIHandled.count(BBCases[i].first)) { + if (PTIHandled.count(BBCases[i].Value)) { // If this is one we are capable of getting... PredCases.push_back(BBCases[i]); - NewSuccessors.push_back(BBCases[i].second); - PTIHandled.erase(BBCases[i].first);// This constant is taken care of + NewSuccessors.push_back(BBCases[i].Dest); + PTIHandled.erase(BBCases[i].Value);// This constant is taken care of } // If there are any constants vectored to BB that TI doesn't handle, @@ -768,7 +825,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, for (std::set<ConstantInt*, ConstantIntOrdering>::iterator I = PTIHandled.begin(), E = PTIHandled.end(); I != E; ++I) { - PredCases.push_back(std::make_pair(*I, BBDefault)); + PredCases.push_back(ValueEqualityComparisonCase(*I, BBDefault)); NewSuccessors.push_back(BBDefault); } } @@ -792,7 +849,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, PredCases.size()); NewSI->setDebugLoc(PTI->getDebugLoc()); for (unsigned i = 0, e = PredCases.size(); i != e; ++i) - NewSI->addCase(PredCases[i].first, PredCases[i].second); + NewSI->addCase(PredCases[i].Value, PredCases[i].Dest); EraseTerminatorInstAndDCECond(PTI); @@ -1273,7 +1330,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) { return false; } - // If we folded the the first phi, PN dangles at this point. Refresh it. If + // If we folded the first phi, PN dangles at this point. Refresh it. If // we ran out of PHIs then we simplified them all. PN = dyn_cast<PHINode>(BB->begin()); if (PN == 0) return true; @@ -1490,6 +1547,23 @@ static APInt MultiplyAndLosePrecision(APInt &A, APInt &B, APInt &C, APInt &D, return Result; } +/// checkCSEInPredecessor - Return true if the given instruction is available +/// in its predecessor block. If yes, the instruction will be removed. +/// +static bool checkCSEInPredecessor(Instruction *Inst, BasicBlock *PB) { + if (!isa<BinaryOperator>(Inst) && !isa<CmpInst>(Inst)) + return false; + for (BasicBlock::iterator I = PB->begin(), E = PB->end(); I != E; I++) { + Instruction *PBI = &*I; + // Check whether Inst and PBI generate the same value. 
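checkCSEInPredecessor relies on Instruction::isIdenticalTo, which demands the same opcode, types, and operands, making this a deliberately local form of CSE. The IR shape it targets, with hypothetical values:

    // PB:  %c  = icmp slt i32 %x, %y
    //      br i1 %c, label %BB, label %Succ
    // BB:  %c2 = icmp slt i32 %x, %y   ; identical to %c
    //      br label %Succ
    // %c2 is RAUW'd with %c and erased, leaving BB with only its compare
    // and branch, the shape the unconditional-branch folding below needs.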
+ if (Inst->isIdenticalTo(PBI)) { + Inst->replaceAllUsesWith(PBI); + Inst->eraseFromParent(); + return true; + } + } + return false; +} /// FoldBranchToCommonDest - If this basic block is simple enough, and if a /// predecessor branches to us and one of our successors, fold the block into @@ -1497,7 +1571,36 @@ static APInt MultiplyAndLosePrecision(APInt &A, APInt &B, APInt &C, APInt &D, bool llvm::FoldBranchToCommonDest(BranchInst *BI) { BasicBlock *BB = BI->getParent(); - Instruction *Cond = dyn_cast<Instruction>(BI->getCondition()); + Instruction *Cond = 0; + if (BI->isConditional()) + Cond = dyn_cast<Instruction>(BI->getCondition()); + else { + // For unconditional branch, check for a simple CFG pattern, where + // BB has a single predecessor and BB's successor is also its predecessor's + // successor. If such a pattern exists, check for CSE between BB and its + // predecessor. + if (BasicBlock *PB = BB->getSinglePredecessor()) + if (BranchInst *PBI = dyn_cast<BranchInst>(PB->getTerminator())) + if (PBI->isConditional() && + (BI->getSuccessor(0) == PBI->getSuccessor(0) || + BI->getSuccessor(0) == PBI->getSuccessor(1))) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); + I != E; ) { + Instruction *Curr = I++; + if (isa<CmpInst>(Curr)) { + Cond = Curr; + break; + } + // Quit if we can't remove this instruction. + if (!checkCSEInPredecessor(Curr, PB)) + return false; + } + } + + if (Cond == 0) + return false; + } + if (Cond == 0 || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) || Cond->getParent() != BB || !Cond->hasOneUse()) return false; @@ -1549,7 +1652,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // Finally, don't infinitely unroll conditional loops. BasicBlock *TrueDest = BI->getSuccessor(0); - BasicBlock *FalseDest = BI->getSuccessor(1); + BasicBlock *FalseDest = (BI->isConditional()) ? BI->getSuccessor(1) : 0; if (TrueDest == BB || FalseDest == BB) return false; @@ -1560,23 +1663,33 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // Check that we have two conditional branches. If there is a PHI node in // the common successor, verify that the same value flows in from both // blocks. - if (PBI == 0 || PBI->isUnconditional() || !SafeToMergeTerminators(BI, PBI)) + SmallVector<PHINode*, 4> PHIs; + if (PBI == 0 || PBI->isUnconditional() || + (BI->isConditional() && + !SafeToMergeTerminators(BI, PBI)) || + (!BI->isConditional() && + !isProfitableToFoldUnconditional(BI, PBI, Cond, PHIs))) continue; // Determine if the two branches share a common destination. Instruction::BinaryOps Opc; bool InvertPredCond = false; - if (PBI->getSuccessor(0) == TrueDest) - Opc = Instruction::Or; - else if (PBI->getSuccessor(1) == FalseDest) - Opc = Instruction::And; - else if (PBI->getSuccessor(0) == FalseDest) - Opc = Instruction::And, InvertPredCond = true; - else if (PBI->getSuccessor(1) == TrueDest) - Opc = Instruction::Or, InvertPredCond = true; - else - continue; + if (BI->isConditional()) { + if (PBI->getSuccessor(0) == TrueDest) + Opc = Instruction::Or; + else if (PBI->getSuccessor(1) == FalseDest) + Opc = Instruction::And; + else if (PBI->getSuccessor(0) == FalseDest) + Opc = Instruction::And, InvertPredCond = true; + else if (PBI->getSuccessor(1) == TrueDest) + Opc = Instruction::Or, InvertPredCond = true; + else + continue; + } else { + if (PBI->getSuccessor(0) != TrueDest && PBI->getSuccessor(1) != TrueDest) + continue; + } // Ensure that any values used in the bonus instruction are also used // by the terminator of the predecessor.
This means that those values @@ -1652,17 +1765,69 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { New->takeName(Cond); Cond->setName(New->getName()+".old"); - Instruction *NewCond = - cast<Instruction>(Builder.CreateBinOp(Opc, PBI->getCondition(), + if (BI->isConditional()) { + Instruction *NewCond = + cast<Instruction>(Builder.CreateBinOp(Opc, PBI->getCondition(), New, "or.cond")); - PBI->setCondition(NewCond); - if (PBI->getSuccessor(0) == BB) { - AddPredecessorToBlock(TrueDest, PredBlock, BB); - PBI->setSuccessor(0, TrueDest); - } - if (PBI->getSuccessor(1) == BB) { - AddPredecessorToBlock(FalseDest, PredBlock, BB); - PBI->setSuccessor(1, FalseDest); + PBI->setCondition(NewCond); + + if (PBI->getSuccessor(0) == BB) { + AddPredecessorToBlock(TrueDest, PredBlock, BB); + PBI->setSuccessor(0, TrueDest); + } + if (PBI->getSuccessor(1) == BB) { + AddPredecessorToBlock(FalseDest, PredBlock, BB); + PBI->setSuccessor(1, FalseDest); + } + } else { + // Update PHI nodes in the common successors. + for (unsigned i = 0, e = PHIs.size(); i != e; ++i) { + ConstantInt *PBI_C = cast<ConstantInt>( + PHIs[i]->getIncomingValueForBlock(PBI->getParent())); + assert(PBI_C->getType()->isIntegerTy(1)); + Instruction *MergedCond = 0; + if (PBI->getSuccessor(0) == TrueDest) { + // Create (PBI_Cond and PBI_C) or (!PBI_Cond and BI_Value) + // PBI_C is true: PBI_Cond or (!PBI_Cond and BI_Value) + // is false: !PBI_Cond and BI_Value + Instruction *NotCond = + cast<Instruction>(Builder.CreateNot(PBI->getCondition(), + "not.cond")); + MergedCond = + cast<Instruction>(Builder.CreateBinOp(Instruction::And, + NotCond, New, + "and.cond")); + if (PBI_C->isOne()) + MergedCond = + cast<Instruction>(Builder.CreateBinOp(Instruction::Or, + PBI->getCondition(), MergedCond, + "or.cond")); + } else { + // Create (PBI_Cond and BI_Value) or (!PBI_Cond and PBI_C) + // PBI_C is true: (PBI_Cond and BI_Value) or (!PBI_Cond) + // is false: PBI_Cond and BI_Value + MergedCond = + cast<Instruction>(Builder.CreateBinOp(Instruction::And, + PBI->getCondition(), New, + "and.cond")); + if (PBI_C->isOne()) { + Instruction *NotCond = + cast<Instruction>(Builder.CreateNot(PBI->getCondition(), + "not.cond")); + MergedCond = + cast<Instruction>(Builder.CreateBinOp(Instruction::Or, + NotCond, MergedCond, + "or.cond")); + } + } + // Update PHI Node. + PHIs[i]->setIncomingValue(PHIs[i]->getBasicBlockIndex(PBI->getParent()), + MergedCond); + } + // Change PBI from Conditional to Unconditional. + BranchInst *New_PBI = BranchInst::Create(TrueDest, PBI); + EraseTerminatorInstAndDCECond(PBI); + PBI = New_PBI; } // TODO: If BB is reachable from all paths through PredBlock, then we @@ -1670,7 +1835,8 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // Merge probability data into PredBlock's branch. 
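The tail of the hunk below renormalizes the merged weights by their greatest common divisor, so that truncating the APInts to 32 bits for the metadata loses as little precision as possible. A condensed sketch, assuming the GCD comes from APIntOps::GreatestCommonDivisor:

    APInt GCD = APIntOps::GreatestCommonDivisor(ProbTrue, ProbFalse);
    ProbTrue = ProbTrue.udiv(GCD);
    ProbFalse = ProbFalse.udiv(GCD);
    MDBuilder MDB(BI->getContext());
    PBI->setMetadata(LLVMContext::MD_prof,
                     MDB.createBranchWeights(ProbTrue.getZExtValue(),
                                             ProbFalse.getZExtValue()));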
APInt A, B, C, D; - if (ExtractBranchMetadata(PBI, C, D) && ExtractBranchMetadata(BI, A, B)) { + if (PBI->isConditional() && BI->isConditional() && + ExtractBranchMetadata(PBI, C, D) && ExtractBranchMetadata(BI, A, B)) { // Given IR which does: // bbA: // br i1 %x, label %bbB, label %bbC @@ -1740,12 +1906,10 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { ProbTrue = ProbTrue.udiv(GCD); ProbFalse = ProbFalse.udiv(GCD); - LLVMContext &Context = BI->getContext(); - Value *Ops[3]; - Ops[0] = BI->getMetadata(LLVMContext::MD_prof)->getOperand(0); - Ops[1] = ConstantInt::get(Context, ProbTrue); - Ops[2] = ConstantInt::get(Context, ProbFalse); - PBI->setMetadata(LLVMContext::MD_prof, MDNode::get(Context, Ops)); + MDBuilder MDB(BI->getContext()); + MDNode *N = MDB.createBranchWeights(ProbTrue.getZExtValue(), + ProbFalse.getZExtValue()); + PBI->setMetadata(LLVMContext::MD_prof, N); } else { PBI->setMetadata(LLVMContext::MD_prof, NULL); } @@ -2758,6 +2922,12 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ return true; } + // If this basic block is ONLY a compare and a branch, and if a predecessor + // branches to us and our successor, fold the comparison into the + // predecessor and use logical operations to update the incoming value + // for PHI nodes in common successor. + if (FoldBranchToCommonDest(BI)) + return SimplifyCFG(BB) | true; return false; } diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp index 4030bef..5d673f1 100644 --- a/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -16,7 +16,6 @@ #define DEBUG_TYPE "indvars" #include "llvm/Instructions.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/IVUsers.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" @@ -44,7 +43,6 @@ namespace { class SimplifyIndvar { Loop *L; LoopInfo *LI; - DominatorTree *DT; ScalarEvolution *SE; const TargetData *TD; // May be NULL diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp index 9d62306..62d23cb 100644 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -23,6 +23,7 @@ #include "llvm/IntrinsicInst.h" #include "llvm/Intrinsics.h" #include "llvm/LLVMContext.h" +#include "llvm/Metadata.h" #include "llvm/Pass.h" #include "llvm/Type.h" #include "llvm/ADT/DenseMap.h" @@ -41,6 +42,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Vectorize.h" #include <algorithm> #include <map> @@ -66,6 +68,10 @@ static cl::opt<unsigned> MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, cl::desc("The maximum number of pairing iterations")); +static cl::opt<bool> +Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden, + cl::desc("Don't try to form non-2^n-length vectors")); + static cl::opt<unsigned> MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, cl::desc("The maximum number of pairable instructions per group")); @@ -76,6 +82,10 @@ MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), " a full cycle check")); static cl::opt<bool> +NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize boolean (i1) values")); + +static cl::opt<bool> NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize integer 
values")); @@ -104,6 +114,10 @@ NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize select instructions")); static cl::opt<bool> +NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize comparison instructions")); + +static cl::opt<bool> NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize getelementptr instructions")); @@ -182,12 +196,12 @@ namespace { // FIXME: const correct? - bool vectorizePairs(BasicBlock &BB); + bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false); bool getCandidatePairs(BasicBlock &BB, BasicBlock::iterator &Start, std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts); + std::vector<Value *> &PairableInsts, bool NonPow2Len); void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs, std::vector<Value *> &PairableInsts, @@ -211,7 +225,7 @@ namespace { bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); bool areInstsCompatible(Instruction *I, Instruction *J, - bool IsSimpleLoadStore); + bool IsSimpleLoadStore, bool NonPow2Len); bool trackUsesOfI(DenseSet<Value *> &Users, AliasSetTracker &WriteSet, Instruction *I, @@ -263,26 +277,32 @@ namespace { bool UseCycleCheck); Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o, bool &FlipMemInputs); + Instruction *J, unsigned o, bool FlipMemInputs); void fillNewShuffleMask(LLVMContext& Context, Instruction *J, - unsigned NumElem, unsigned MaskOffset, unsigned NumInElem, - unsigned IdxOffset, std::vector<Constant*> &Mask); + unsigned MaskOffset, unsigned NumInElem, + unsigned NumInElem1, unsigned IdxOffset, + std::vector<Constant*> &Mask); Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I, Instruction *J); + bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J, + unsigned o, Value *&LOp, unsigned numElemL, + Type *ArgTypeL, Type *ArgTypeR, + unsigned IdxOff = 0); + Value *getReplacementInput(LLVMContext& Context, Instruction *I, Instruction *J, unsigned o, bool FlipMemInputs); void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, - bool &FlipMemInputs); + bool FlipMemInputs); void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, Instruction *J, Instruction *K, Instruction *&InsertionPt, Instruction *&K1, - Instruction *&K2, bool &FlipMemInputs); + Instruction *&K2, bool FlipMemInputs); void collectPairLoadMoveSet(BasicBlock &BB, DenseMap<Value *, Value *> &ChosenPairs, @@ -294,6 +314,10 @@ namespace { DenseMap<Value *, Value *> &ChosenPairs, std::multimap<Value *, Value *> &LoadMoveSet); + void collectPtrInfo(std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *> &ChosenPairs, + DenseSet<Value *> &LowPtrInsts); + bool canMoveUsesOfIAfterJ(BasicBlock &BB, std::multimap<Value *, Value *> &LoadMoveSet, Instruction *I, Instruction *J); @@ -303,12 +327,15 @@ namespace { Instruction *&InsertionPt, Instruction *I, Instruction *J); + void combineMetadata(Instruction *K, const Instruction *J); + bool vectorizeBB(BasicBlock &BB) { bool changed = false; // Iterate a sufficient number of times to merge types of size 1 bit, // then 2 bits, then 4, etc. up to half of the target vector width of the // target vector register. 
- for (unsigned v = 2, n = 1; + unsigned n = 1; + for (unsigned v = 2; v <= Config.VectorBits && (!Config.MaxIter || n <= Config.MaxIter); v *= 2, ++n) { DEBUG(dbgs() << "BBV: fusing loop #" << n << @@ -320,6 +347,16 @@ namespace { break; } + if (changed && !Pow2LenOnly) { + ++n; + for (; !Config.MaxIter || n <= Config.MaxIter; ++n) { + DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " << + n << " for " << BB.getName() << " in " << + BB.getParent()->getName() << "...\n"); + if (!vectorizePairs(BB, true)) break; + } + } + DEBUG(dbgs() << "BBV: done!\n"); return changed; } @@ -341,15 +378,43 @@ namespace { AU.setPreservesCFG(); } - // This returns the vector type that holds a pair of the provided type. - // If the provided type is already a vector, then its length is doubled. - static inline VectorType *getVecTypeForPair(Type *ElemTy) { + static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) { + assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() && + "Cannot form vector from incompatible scalar types"); + Type *STy = ElemTy->getScalarType(); + + unsigned numElem; if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) { - unsigned numElem = VTy->getNumElements(); - return VectorType::get(ElemTy->getScalarType(), numElem*2); + numElem = VTy->getNumElements(); + } else { + numElem = 1; } - return VectorType::get(ElemTy, 2); + if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) { + numElem += VTy->getNumElements(); + } else { + numElem += 1; + } + + return VectorType::get(STy, numElem); + } + + static inline void getInstructionTypes(Instruction *I, + Type *&T1, Type *&T2) { + if (isa<StoreInst>(I)) { + // For stores, it is the value type, not the pointer type that matters + // because the value is what will come from a vector register. + + Value *IVal = cast<StoreInst>(I)->getValueOperand(); + T1 = IVal->getType(); + } else { + T1 = I->getType(); + } + + if (I->isCast()) + T2 = cast<CastInst>(I)->getSrcTy(); + else + T2 = T1; } // Returns the weight associated with the provided value. A chain of @@ -385,8 +450,7 @@ namespace { // true if the offset could be determined to be some constant value. // For example, if OffsetInElmts == 1, then J accesses the memory directly // after I; if OffsetInElmts == -1 then I accesses the memory - // directly after J. This function assumes that both instructions - // have the same type. + // directly after J. bool getPairPtrInfo(Instruction *I, Instruction *J, Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, int64_t &OffsetInElmts) { @@ -418,7 +482,12 @@ namespace { Type *VTy = cast<PointerType>(IPtr->getType())->getElementType(); int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy); - assert(VTy == cast<PointerType>(JPtr->getType())->getElementType()); + Type *VTy2 = cast<PointerType>(JPtr->getType())->getElementType(); + if (VTy != VTy2 && Offset < 0) { + int64_t VTy2TSS = (int64_t) TD->getTypeStoreSize(VTy2); + OffsetInElmts = Offset/VTy2TSS; + return (abs64(Offset) % VTy2TSS) == 0; + } OffsetInElmts = Offset/VTyTSS; return (abs64(Offset) % VTyTSS) == 0; @@ -471,7 +540,7 @@ namespace { // This function implements one vectorization iteration on the provided // basic block. It returns true if the block is changed. 
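The generalized getVecTypeForPair above concatenates the two element counts instead of doubling a single length, which is what makes non-power-of-two fusion possible. For example, with hypothetical types:

    Type *FloatTy = Type::getFloatTy(Context);
    VectorType *VT = getVecTypeForPair(VectorType::get(FloatTy, 2),
                                       VectorType::get(FloatTy, 3));
    // VT is <5 x float>; the assert requires only matching scalar types,
    // no longer matching lengths.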
- bool BBVectorize::vectorizePairs(BasicBlock &BB) { + bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) { bool ShouldContinue; BasicBlock::iterator Start = BB.getFirstInsertionPt(); @@ -482,7 +551,7 @@ namespace { std::vector<Value *> PairableInsts; std::multimap<Value *, Value *> CandidatePairs; ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, - PairableInsts); + PairableInsts, NonPow2Len); if (PairableInsts.empty()) continue; // Now we have a map of all of the pairable instructions and we need to @@ -529,6 +598,10 @@ namespace { // passes should coalesce the build/extract combinations. fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs); + + // It is important to cleanup here so that future iterations of this + // function have less work to do. + (void) SimplifyInstructionsInBlock(&BB, TD); return true; } @@ -567,6 +640,9 @@ namespace { } else if (isa<SelectInst>(I)) { if (!Config.VectorizeSelect) return false; + } else if (isa<CmpInst>(I)) { + if (!Config.VectorizeCmp) + return false; } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) { if (!Config.VectorizeGEP) return false; @@ -584,41 +660,39 @@ namespace { return false; Type *T1, *T2; - if (isa<StoreInst>(I)) { - // For stores, it is the value type, not the pointer type that matters - // because the value is what will come from a vector register. - - Value *IVal = cast<StoreInst>(I)->getValueOperand(); - T1 = IVal->getType(); - } else { - T1 = I->getType(); - } - - if (I->isCast()) - T2 = cast<CastInst>(I)->getSrcTy(); - else - T2 = T1; + getInstructionTypes(I, T1, T2); // Not every type can be vectorized... if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) || !(VectorType::isValidElementType(T2) || T2->isVectorTy())) return false; - if (!Config.VectorizeInts - && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy())) - return false; - + if (T1->getScalarSizeInBits() == 1 && T2->getScalarSizeInBits() == 1) { + if (!Config.VectorizeBools) + return false; + } else { + if (!Config.VectorizeInts + && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy())) + return false; + } + if (!Config.VectorizeFloats && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) return false; + // Don't vectorize target-specific types. + if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy()) + return false; + if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy()) + return false; + if ((!Config.VectorizePointers || TD == 0) && (T1->getScalarType()->isPointerTy() || T2->getScalarType()->isPointerTy())) return false; - if (T1->getPrimitiveSizeInBits() > Config.VectorBits/2 || - T2->getPrimitiveSizeInBits() > Config.VectorBits/2) + if (T1->getPrimitiveSizeInBits() >= Config.VectorBits || + T2->getPrimitiveSizeInBits() >= Config.VectorBits) return false; return true; @@ -629,36 +703,25 @@ namespace { // that I has already been determined to be vectorizable and that J is not // in the use tree of I. bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, - bool IsSimpleLoadStore) { + bool IsSimpleLoadStore, bool NonPow2Len) { DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << " <-> " << *J << "\n"); // Loads and stores can be merged if they have different alignments, // but are otherwise the same. 
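The rewrite below collapses the hand-written per-opcode load/store field comparisons into a single isSameOperationAs query parameterized by comparison flags. Condensed, with the flags as used in the hunk:

    unsigned Flags = Instruction::CompareIgnoringAlignment;
    if (NonPow2Len)   // compare by scalar type, so differing vector lengths pair
      Flags |= Instruction::CompareUsingScalarTypes;
    if (!J->isSameOperationAs(I, Flags))
      return false;   // opcode, operand types, or attributes differ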
- LoadInst *LI, *LJ; - StoreInst *SI, *SJ; - if ((LI = dyn_cast<LoadInst>(I)) && (LJ = dyn_cast<LoadInst>(J))) { - if (I->getType() != J->getType()) - return false; + if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment | + (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0))) + return false; - if (LI->getPointerOperand()->getType() != - LJ->getPointerOperand()->getType() || - LI->isVolatile() != LJ->isVolatile() || - LI->getOrdering() != LJ->getOrdering() || - LI->getSynchScope() != LJ->getSynchScope()) - return false; - } else if ((SI = dyn_cast<StoreInst>(I)) && (SJ = dyn_cast<StoreInst>(J))) { - if (SI->getValueOperand()->getType() != - SJ->getValueOperand()->getType() || - SI->getPointerOperand()->getType() != - SJ->getPointerOperand()->getType() || - SI->isVolatile() != SJ->isVolatile() || - SI->getOrdering() != SJ->getOrdering() || - SI->getSynchScope() != SJ->getSynchScope()) - return false; - } else if (!J->isSameOperationAs(I)) { + Type *IT1, *IT2, *JT1, *JT2; + getInstructionTypes(I, IT1, IT2); + getInstructionTypes(J, JT1, JT2); + unsigned MaxTypeBits = std::max( + IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(), + IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits()); + if (MaxTypeBits > Config.VectorBits) return false; - } + // FIXME: handle addsub-type operations! if (IsSimpleLoadStore) { @@ -668,8 +731,11 @@ namespace { if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, OffsetInElmts) && abs64(OffsetInElmts) == 1) { if (Config.AlignedOnly) { - Type *aType = isa<StoreInst>(I) ? + Type *aTypeI = isa<StoreInst>(I) ? cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); + Type *aTypeJ = isa<StoreInst>(J) ? + cast<StoreInst>(J)->getValueOperand()->getType() : J->getType(); + // An aligned load or store is possible only if the instruction // with the lower offset has an alignment suitable for the // vector type. @@ -677,7 +743,7 @@ namespace { unsigned BottomAlignment = IAlignment; if (OffsetInElmts < 0) BottomAlignment = JAlignment; - Type *VType = getVecTypeForPair(aType); + Type *VType = getVecTypeForPair(aTypeI, aTypeJ); unsigned VecAlignment = TD->getPrefTypeAlignment(VType); if (BottomAlignment < VecAlignment) return false; @@ -685,11 +751,6 @@ namespace { } else { return false; } - } else if (isa<ShuffleVectorInst>(I)) { - // Only merge two shuffles if they're both constant - return isa<Constant>(I->getOperand(2)) && - isa<Constant>(J->getOperand(2)); - // FIXME: We may want to vectorize non-constant shuffles also. } // The powi intrinsic is special because only the first argument is @@ -772,7 +833,7 @@ namespace { bool BBVectorize::getCandidatePairs(BasicBlock &BB, BasicBlock::iterator &Start, std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts) { + std::vector<Value *> &PairableInsts, bool NonPow2Len) { BasicBlock::iterator E = BB.end(); if (Start == E) return false; @@ -808,7 +869,7 @@ namespace { // J does not use I, and comes before the first use of I, so it can be // merged with I if the instructions are compatible. - if (!areInstsCompatible(I, J, IsSimpleLoadStore)) continue; + if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len)) continue; // J is a candidate for merging with I. if (!PairableInsts.size() || @@ -1430,24 +1491,27 @@ namespace { // instruction that fuses I with J. 
Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context, Instruction *I, Instruction *J, unsigned o, - bool &FlipMemInputs) { + bool FlipMemInputs) { Value *IPtr, *JPtr; unsigned IAlignment, JAlignment; int64_t OffsetInElmts; + + // Note: the analysis might fail here, that is why FlipMemInputs has + // been precomputed (OffsetInElmts must be unused here). (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, OffsetInElmts); // The pointer value is taken to be the one with the lowest offset. Value *VPtr; - if (OffsetInElmts > 0) { + if (!FlipMemInputs) { VPtr = IPtr; } else { - FlipMemInputs = true; VPtr = JPtr; } - Type *ArgType = cast<PointerType>(IPtr->getType())->getElementType(); - Type *VArgType = getVecTypeForPair(ArgType); + Type *ArgTypeI = cast<PointerType>(IPtr->getType())->getElementType(); + Type *ArgTypeJ = cast<PointerType>(JPtr->getType())->getElementType(); + Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); Type *VArgPtrType = PointerType::get(VArgType, cast<PointerType>(IPtr->getType())->getAddressSpace()); return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), @@ -1455,15 +1519,17 @@ namespace { } void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J, - unsigned NumElem, unsigned MaskOffset, unsigned NumInElem, - unsigned IdxOffset, std::vector<Constant*> &Mask) { - for (unsigned v = 0; v < NumElem/2; ++v) { + unsigned MaskOffset, unsigned NumInElem, + unsigned NumInElem1, unsigned IdxOffset, + std::vector<Constant*> &Mask) { + unsigned NumElem1 = cast<VectorType>(J->getType())->getNumElements(); + for (unsigned v = 0; v < NumElem1; ++v) { int m = cast<ShuffleVectorInst>(J)->getMaskValue(v); if (m < 0) { Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context)); } else { unsigned mm = m + (int) IdxOffset; - if (m >= (int) NumInElem) + if (m >= (int) NumInElem1) mm += (int) NumInElem; Mask[v+MaskOffset] = @@ -1479,8 +1545,11 @@ namespace { // This is the shuffle mask. We need to append the second // mask to the first, and the numbers need to be adjusted. - Type *ArgType = I->getType(); - Type *VArgType = getVecTypeForPair(ArgType); + Type *ArgTypeI = I->getType(); + Type *ArgTypeJ = J->getType(); + Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); + + unsigned NumElemI = cast<VectorType>(ArgTypeI)->getNumElements(); // Get the total number of elements in the fused vector type. // By definition, this must equal the number of elements in @@ -1488,19 +1557,81 @@ namespace { unsigned NumElem = cast<VectorType>(VArgType)->getNumElements(); std::vector<Constant*> Mask(NumElem); - Type *OpType = I->getOperand(0)->getType(); - unsigned NumInElem = cast<VectorType>(OpType)->getNumElements(); + Type *OpTypeI = I->getOperand(0)->getType(); + unsigned NumInElemI = cast<VectorType>(OpTypeI)->getNumElements(); + Type *OpTypeJ = J->getOperand(0)->getType(); + unsigned NumInElemJ = cast<VectorType>(OpTypeJ)->getNumElements(); + + // The fused vector will be: + // ----------------------------------------------------- + // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ | + // ----------------------------------------------------- + // from which we'll extract NumElem total elements (where the first NumElemI + // of them come from the mask in I and the remainder come from the mask + // in J. // For the mask from the first pair... - fillNewShuffleMask(Context, I, NumElem, 0, NumInElem, 0, Mask); + fillNewShuffleMask(Context, I, 0, NumInElemJ, NumInElemI, + 0, Mask); // For the mask from the second pair... 
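A worked instance of the layout diagram above, with hypothetical sizes NumInElemI == 2 and NumInElemJ == 3, clarifies the second fillNewShuffleMask call below:

    // Fused shuffle inputs span ten lanes:
    //   | I.op0 : 0-1 | J.op0 : 2-4 | I.op1 : 5-6 | J.op1 : 7-9 |
    // For J's mask entries: mm = m + IdxOffset (= 2); if m >= NumInElem1 (= 3),
    // i.e. m selects from J's second input, add NumInElem (= 2) more.
    //   m = 0 -> lane 2 (start of J.op0),  m = 3 -> lane 7 (start of J.op1).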
- fillNewShuffleMask(Context, J, NumElem, NumElem/2, NumInElem, NumInElem, - Mask); + fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ, + NumInElemI, Mask); return ConstantVector::get(Mask); } + bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I, + Instruction *J, unsigned o, Value *&LOp, + unsigned numElemL, + Type *ArgTypeL, Type *ArgTypeH, + unsigned IdxOff) { + bool ExpandedIEChain = false; + if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) { + // If we have a pure insertelement chain, then this can be rewritten + // into a chain that directly builds the larger type. + bool PureChain = true; + InsertElementInst *LIENext = LIE; + do { + if (!isa<UndefValue>(LIENext->getOperand(0)) && + !isa<InsertElementInst>(LIENext->getOperand(0))) { + PureChain = false; + break; + } + } while ((LIENext = + dyn_cast<InsertElementInst>(LIENext->getOperand(0)))); + + if (PureChain) { + SmallVector<Value *, 8> VectElemts(numElemL, + UndefValue::get(ArgTypeL->getScalarType())); + InsertElementInst *LIENext = LIE; + do { + unsigned Idx = + cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue(); + VectElemts[Idx] = LIENext->getOperand(1); + } while ((LIENext = + dyn_cast<InsertElementInst>(LIENext->getOperand(0)))); + + LIENext = 0; + Value *LIEPrev = UndefValue::get(ArgTypeH); + for (unsigned i = 0; i < numElemL; ++i) { + if (isa<UndefValue>(VectElemts[i])) continue; + LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i], + ConstantInt::get(Type::getInt32Ty(Context), + i + IdxOff), + getReplacementName(I, true, o, i+1)); + LIENext->insertBefore(J); + LIEPrev = LIENext; + } + + LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH); + ExpandedIEChain = true; + } + } + + return ExpandedIEChain; + } + // Returns the value to be used as the specified operand of the vector // instruction that fuses I with J. Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, @@ -1508,84 +1639,333 @@ namespace { Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); - // Compute the fused vector type for this operand - Type *ArgType = I->getOperand(o)->getType(); - VectorType *VArgType = getVecTypeForPair(ArgType); + // Compute the fused vector type for this operand + Type *ArgTypeI = I->getOperand(o)->getType(); + Type *ArgTypeJ = J->getOperand(o)->getType(); + VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); Instruction *L = I, *H = J; + Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ; if (FlipMemInputs) { L = J; H = I; + ArgTypeL = ArgTypeJ; + ArgTypeH = ArgTypeI; } - if (ArgType->isVectorTy()) { - unsigned numElem = cast<VectorType>(VArgType)->getNumElements(); - std::vector<Constant*> Mask(numElem); - for (unsigned v = 0; v < numElem; ++v) - Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + unsigned numElemL; + if (ArgTypeL->isVectorTy()) + numElemL = cast<VectorType>(ArgTypeL)->getNumElements(); + else + numElemL = 1; - Instruction *BV = new ShuffleVectorInst(L->getOperand(o), - H->getOperand(o), - ConstantVector::get(Mask), - getReplacementName(I, true, o)); - BV->insertBefore(J); - return BV; + unsigned numElemH; + if (ArgTypeH->isVectorTy()) + numElemH = cast<VectorType>(ArgTypeH)->getNumElements(); + else + numElemH = 1; + + Value *LOp = L->getOperand(o); + Value *HOp = H->getOperand(o); + unsigned numElem = VArgType->getNumElements(); + + // First, we check if we can reuse the "original" vector outputs (if these + // exist). 
We might need a shuffle. + ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp); + ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp); + ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp); + ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp); + + // FIXME: If we're fusing shuffle instructions, then we can't apply this + // optimization. The input vectors to the shuffle might be a different + // length from the shuffle outputs. Unfortunately, the replacement + // shuffle mask has already been formed, and the mask entries are sensitive + // to the sizes of the inputs. + bool IsSizeChangeShuffle = + isa<ShuffleVectorInst>(L) && + (LOp->getType() != L->getType() || HOp->getType() != H->getType()); + + if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) { + // We can have at most two unique vector inputs. + bool CanUseInputs = true; + Value *I1, *I2 = 0; + if (LEE) { + I1 = LEE->getOperand(0); + } else { + I1 = LSV->getOperand(0); + I2 = LSV->getOperand(1); + if (I2 == I1 || isa<UndefValue>(I2)) + I2 = 0; + } + + if (HEE) { + Value *I3 = HEE->getOperand(0); + if (!I2 && I3 != I1) + I2 = I3; + else if (I3 != I1 && I3 != I2) + CanUseInputs = false; + } else { + Value *I3 = HSV->getOperand(0); + if (!I2 && I3 != I1) + I2 = I3; + else if (I3 != I1 && I3 != I2) + CanUseInputs = false; + + if (CanUseInputs) { + Value *I4 = HSV->getOperand(1); + if (!isa<UndefValue>(I4)) { + if (!I2 && I4 != I1) + I2 = I4; + else if (I4 != I1 && I4 != I2) + CanUseInputs = false; + } + } + } + + if (CanUseInputs) { + unsigned LOpElem = + cast<VectorType>(cast<Instruction>(LOp)->getOperand(0)->getType()) + ->getNumElements(); + unsigned HOpElem = + cast<VectorType>(cast<Instruction>(HOp)->getOperand(0)->getType()) + ->getNumElements(); + + // We have one or two input vectors. We need to map each index of the + // operands to the index of the original vector. + SmallVector<std::pair<int, int>, 8> II(numElem); + for (unsigned i = 0; i < numElemL; ++i) { + int Idx, INum; + if (LEE) { + Idx = + cast<ConstantInt>(LEE->getOperand(1))->getSExtValue(); + INum = LEE->getOperand(0) == I1 ? 0 : 1; + } else { + Idx = LSV->getMaskValue(i); + if (Idx < (int) LOpElem) { + INum = LSV->getOperand(0) == I1 ? 0 : 1; + } else { + Idx -= LOpElem; + INum = LSV->getOperand(1) == I1 ? 0 : 1; + } + } + + II[i] = std::pair<int, int>(Idx, INum); + } + for (unsigned i = 0; i < numElemH; ++i) { + int Idx, INum; + if (HEE) { + Idx = + cast<ConstantInt>(HEE->getOperand(1))->getSExtValue(); + INum = HEE->getOperand(0) == I1 ? 0 : 1; + } else { + Idx = HSV->getMaskValue(i); + if (Idx < (int) HOpElem) { + INum = HSV->getOperand(0) == I1 ? 0 : 1; + } else { + Idx -= HOpElem; + INum = HSV->getOperand(1) == I1 ? 0 : 1; + } + } + + II[i + numElemL] = std::pair<int, int>(Idx, INum); + } + + // We now have an array which tells us from which index of which + // input vector each element of the operand comes. + VectorType *I1T = cast<VectorType>(I1->getType()); + unsigned I1Elem = I1T->getNumElements(); + + if (!I2) { + // In this case there is only one underlying vector input. Check for + // the trivial case where we can use the input directly. + if (I1Elem == numElem) { + bool ElemInOrder = true; + for (unsigned i = 0; i < numElem; ++i) { + if (II[i].first != (int) i && II[i].first != -1) { + ElemInOrder = false; + break; + } + } + + if (ElemInOrder) + return I1; + } + + // A shuffle is needed. 
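          // (For example, with hypothetical values: if I1 is <4 x float> and
          // II maps the fused operand's four lanes to I1 indices 2, 3, 0, 1,
          // the loop below builds the mask <i32 2, i32 3, i32 0, i32 1>; a
          // lane recorded as -1 becomes an undef mask entry instead.)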
+ std::vector<Constant *> Mask(numElem); + for (unsigned i = 0; i < numElem; ++i) { + int Idx = II[i].first; + if (Idx == -1) + Mask[i] = UndefValue::get(Type::getInt32Ty(Context)); + else + Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx); + } + + Instruction *S = + new ShuffleVectorInst(I1, UndefValue::get(I1T), + ConstantVector::get(Mask), + getReplacementName(I, true, o)); + S->insertBefore(J); + return S; + } + + VectorType *I2T = cast<VectorType>(I2->getType()); + unsigned I2Elem = I2T->getNumElements(); + + // This input comes from two distinct vectors. The first step is to + // make sure that both vectors are the same length. If not, the + // smaller one will need to grow before they can be shuffled together. + if (I1Elem < I2Elem) { + std::vector<Constant *> Mask(I2Elem); + unsigned v = 0; + for (; v < I1Elem; ++v) + Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + for (; v < I2Elem; ++v) + Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); + + Instruction *NewI1 = + new ShuffleVectorInst(I1, UndefValue::get(I1T), + ConstantVector::get(Mask), + getReplacementName(I, true, o, 1)); + NewI1->insertBefore(J); + I1 = NewI1; + I1T = I2T; + I1Elem = I2Elem; + } else if (I1Elem > I2Elem) { + std::vector<Constant *> Mask(I1Elem); + unsigned v = 0; + for (; v < I2Elem; ++v) + Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + for (; v < I1Elem; ++v) + Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); + + Instruction *NewI2 = + new ShuffleVectorInst(I2, UndefValue::get(I2T), + ConstantVector::get(Mask), + getReplacementName(I, true, o, 1)); + NewI2->insertBefore(J); + I2 = NewI2; + I2T = I1T; + I2Elem = I1Elem; + } + + // Now that both I1 and I2 are the same length we can shuffle them + // together (and use the result). + std::vector<Constant *> Mask(numElem); + for (unsigned v = 0; v < numElem; ++v) { + if (II[v].first == -1) { + Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); + } else { + int Idx = II[v].first + II[v].second * I1Elem; + Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); + } + } + + Instruction *NewOp = + new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask), + getReplacementName(I, true, o)); + NewOp->insertBefore(J); + return NewOp; + } } - // If these two inputs are the output of another vector instruction, - // then we should use that output directly. It might be necessary to - // permute it first. [When pairings are fused recursively, you can - // end up with cases where a large vector is decomposed into scalars - // using extractelement instructions, then built into size-2 - // vectors using insertelement and the into larger vectors using - // shuffles. InstCombine does not simplify all of these cases well, - // and so we make sure that shuffles are generated here when possible. 
- ExtractElementInst *LEE - = dyn_cast<ExtractElementInst>(L->getOperand(o)); - ExtractElementInst *HEE - = dyn_cast<ExtractElementInst>(H->getOperand(o)); - - if (LEE && HEE && - LEE->getOperand(0)->getType() == HEE->getOperand(0)->getType()) { - VectorType *EEType = cast<VectorType>(LEE->getOperand(0)->getType()); - unsigned LowIndx = cast<ConstantInt>(LEE->getOperand(1))->getZExtValue(); - unsigned HighIndx = cast<ConstantInt>(HEE->getOperand(1))->getZExtValue(); - if (LEE->getOperand(0) == HEE->getOperand(0)) { - if (LowIndx == 0 && HighIndx == 1) - return LEE->getOperand(0); - - std::vector<Constant*> Mask(2); - Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx); - Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx); - - Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0), - UndefValue::get(EEType), - ConstantVector::get(Mask), - getReplacementName(I, true, o)); - BV->insertBefore(J); - return BV; + Type *ArgType = ArgTypeL; + if (numElemL < numElemH) { + if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH, + ArgTypeL, VArgType, 1)) { + // This is another short-circuit case: we're combining a scalar into + // a vector that is formed by an IE chain. We've just expanded the IE + // chain, now insert the scalar and we're done. + + Instruction *S = InsertElementInst::Create(HOp, LOp, CV0, + getReplacementName(I, true, o)); + S->insertBefore(J); + return S; + } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL, + ArgTypeH)) { + // The two vector inputs to the shuffle must be the same length, + // so extend the smaller vector to be the same length as the larger one. + Instruction *NLOp; + if (numElemL > 1) { + + std::vector<Constant *> Mask(numElemH); + unsigned v = 0; + for (; v < numElemL; ++v) + Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + for (; v < numElemH; ++v) + Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); + + NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL), + ConstantVector::get(Mask), + getReplacementName(I, true, o, 1)); + } else { + NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0, + getReplacementName(I, true, o, 1)); + } + + NLOp->insertBefore(J); + LOp = NLOp; } - std::vector<Constant*> Mask(2); - HighIndx += EEType->getNumElements(); - Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx); - Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx); + ArgType = ArgTypeH; + } else if (numElemL > numElemH) { + if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL, + ArgTypeH, VArgType)) { + Instruction *S = + InsertElementInst::Create(LOp, HOp, + ConstantInt::get(Type::getInt32Ty(Context), + numElemL), + getReplacementName(I, true, o)); + S->insertBefore(J); + return S; + } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH, + ArgTypeL)) { + Instruction *NHOp; + if (numElemH > 1) { + std::vector<Constant *> Mask(numElemL); + unsigned v = 0; + for (; v < numElemH; ++v) + Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + for (; v < numElemL; ++v) + Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); + + NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH), + ConstantVector::get(Mask), + getReplacementName(I, true, o, 1)); + } else { + NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0, + getReplacementName(I, true, o, 1)); + } + + NHOp->insertBefore(J); + HOp = NHOp; + } + } - Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0), - HEE->getOperand(0), - 
ConstantVector::get(Mask), - getReplacementName(I, true, o)); + if (ArgType->isVectorTy()) { + unsigned numElem = cast<VectorType>(VArgType)->getNumElements(); + std::vector<Constant*> Mask(numElem); + for (unsigned v = 0; v < numElem; ++v) { + unsigned Idx = v; + // If the low vector was expanded, we need to skip the extra + // undefined entries. + if (v >= numElemL && numElemH > numElemL) + Idx += (numElemH - numElemL); + Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); + } + + Instruction *BV = new ShuffleVectorInst(LOp, HOp, + ConstantVector::get(Mask), + getReplacementName(I, true, o)); BV->insertBefore(J); return BV; } Instruction *BV1 = InsertElementInst::Create( - UndefValue::get(VArgType), - L->getOperand(o), CV0, + UndefValue::get(VArgType), LOp, CV0, getReplacementName(I, true, o, 1)); BV1->insertBefore(I); - Instruction *BV2 = InsertElementInst::Create(BV1, H->getOperand(o), - CV1, + Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1, getReplacementName(I, true, o, 2)); BV2->insertBefore(J); return BV2; @@ -1596,8 +1976,7 @@ namespace { void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, Instruction *I, Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, - bool &FlipMemInputs) { - FlipMemInputs = false; + bool FlipMemInputs) { unsigned NumOperands = I->getNumOperands(); for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) { @@ -1616,10 +1995,10 @@ namespace { BasicBlock &BB = *I->getParent(); Module *M = BB.getParent()->getParent(); - Type *ArgType = I->getType(); - Type *VArgType = getVecTypeForPair(ArgType); + Type *ArgTypeI = I->getType(); + Type *ArgTypeJ = J->getType(); + Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); - // FIXME: is it safe to do this here? ReplacedOperands[o] = Intrinsic::getDeclaration(M, (Intrinsic::ID) IID, VArgType); continue; @@ -1648,36 +2027,60 @@ namespace { Instruction *J, Instruction *K, Instruction *&InsertionPt, Instruction *&K1, Instruction *&K2, - bool &FlipMemInputs) { - Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); - Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); - + bool FlipMemInputs) { if (isa<StoreInst>(I)) { AA->replaceWithNewValue(I, K); AA->replaceWithNewValue(J, K); } else { Type *IType = I->getType(); - Type *VType = getVecTypeForPair(IType); + Type *JType = J->getType(); + + VectorType *VType = getVecTypeForPair(IType, JType); + unsigned numElem = VType->getNumElements(); + + unsigned numElemI, numElemJ; + if (IType->isVectorTy()) + numElemI = cast<VectorType>(IType)->getNumElements(); + else + numElemI = 1; + + if (JType->isVectorTy()) + numElemJ = cast<VectorType>(JType)->getNumElements(); + else + numElemJ = 1; if (IType->isVectorTy()) { - unsigned numElem = cast<VectorType>(IType)->getNumElements(); - std::vector<Constant*> Mask1(numElem), Mask2(numElem); - for (unsigned v = 0; v < numElem; ++v) { - Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElem+v); - } + std::vector<Constant*> Mask1(numElemI), Mask2(numElemI); + for (unsigned v = 0; v < numElemI; ++v) { + Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ+v); + } - K1 = new ShuffleVectorInst(K, UndefValue::get(VType), - ConstantVector::get( - FlipMemInputs ? Mask2 : Mask1), - getReplacementName(K, false, 1)); - K2 = new ShuffleVectorInst(K, UndefValue::get(VType), - ConstantVector::get( - FlipMemInputs ? 
Mask1 : Mask2), - getReplacementName(K, false, 2)); + K1 = new ShuffleVectorInst(K, UndefValue::get(VType), + ConstantVector::get( + FlipMemInputs ? Mask2 : Mask1), + getReplacementName(K, false, 1)); } else { + Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); + Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1); K1 = ExtractElementInst::Create(K, FlipMemInputs ? CV1 : CV0, getReplacementName(K, false, 1)); + } + + if (JType->isVectorTy()) { + std::vector<Constant*> Mask1(numElemJ), Mask2(numElemJ); + for (unsigned v = 0; v < numElemJ; ++v) { + Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI+v); + } + + K2 = new ShuffleVectorInst(K, UndefValue::get(VType), + ConstantVector::get( + FlipMemInputs ? Mask1 : Mask2), + getReplacementName(K, false, 2)); + } else { + Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); + Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1); K2 = ExtractElementInst::Create(K, FlipMemInputs ? CV0 : CV1, getReplacementName(K, false, 2)); } @@ -1778,6 +2181,61 @@ namespace { } } + // As with the aliasing information, SCEV can also change because of + // vectorization. This information is used to compute relative pointer + // offsets; the necessary information will be cached here prior to + // fusion. + void BBVectorize::collectPtrInfo(std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *> &ChosenPairs, + DenseSet<Value *> &LowPtrInsts) { + for (std::vector<Value *>::iterator PI = PairableInsts.begin(), + PIE = PairableInsts.end(); PI != PIE; ++PI) { + DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); + if (P == ChosenPairs.end()) continue; + + Instruction *I = cast<Instruction>(P->first); + Instruction *J = cast<Instruction>(P->second); + + if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) + continue; + + Value *IPtr, *JPtr; + unsigned IAlignment, JAlignment; + int64_t OffsetInElmts; + if (!getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, + OffsetInElmts) || abs64(OffsetInElmts) != 1) + llvm_unreachable("Pre-fusion pointer analysis failed"); + + Value *LowPI = (OffsetInElmts > 0) ? I : J; + LowPtrInsts.insert(LowPI); + } + } + + // When the first instruction in each pair is cloned, it will inherit its + // parent's metadata. This metadata must be combined with that of the other + // instruction in a safe way. 
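  // (A small concrete case, for illustration: if I carries
  // !fpmath !{double 2.5} and J carries !fpmath !{double 5.0}, the fused
  // instruction keeps the stricter 2.5 ulp bound, which is what
  // MDNode::getMostGenericFPMath returns; TBAA tags are merged to a common
  // ancestor by MDNode::getMostGenericTBAA; and any metadata kind this pass
  // does not understand is dropped, the conservatively safe choice.)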
+  void BBVectorize::combineMetadata(Instruction *K, const Instruction *J) {
+    SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata;
+    K->getAllMetadataOtherThanDebugLoc(Metadata);
+    for (unsigned i = 0, n = Metadata.size(); i < n; ++i) {
+      unsigned Kind = Metadata[i].first;
+      MDNode *JMD = J->getMetadata(Kind);
+      MDNode *KMD = Metadata[i].second;
+
+      switch (Kind) {
+      default:
+        K->setMetadata(Kind, 0); // Remove unknown metadata
+        break;
+      case LLVMContext::MD_tbaa:
+        K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD));
+        break;
+      case LLVMContext::MD_fpmath:
+        K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD));
+        break;
+      }
+    }
+  }
+
   // This function fuses the chosen instruction pairs into vector instructions,
   // taking care to preserve any needed scalar outputs and, then, it reorders
   // the remaining instructions as needed (users of the first member of the pair
@@ -1804,6 +2262,9 @@ namespace {
     std::multimap<Value *, Value *> LoadMoveSet;
     collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet);
 
+    DenseSet<Value *> LowPtrInsts;
+    collectPtrInfo(PairableInsts, ChosenPairs, LowPtrInsts);
+
     DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
 
     for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
@@ -1843,7 +2304,10 @@ namespace {
         continue;
       }
 
-      bool FlipMemInputs;
+      bool FlipMemInputs = false;
+      if (isa<LoadInst>(I) || isa<StoreInst>(I))
+        FlipMemInputs = (LowPtrInsts.find(I) == LowPtrInsts.end());
+
       unsigned NumOperands = I->getNumOperands();
       SmallVector<Value *, 3> ReplacedOperands(NumOperands);
       getReplacementInputsForPair(Context, I, J, ReplacedOperands,
@@ -1855,7 +2319,9 @@ namespace {
       if (I->hasName()) K->takeName(I);
 
       if (!isa<StoreInst>(K))
-        K->mutateType(getVecTypeForPair(I->getType()));
+        K->mutateType(getVecTypeForPair(I->getType(), J->getType()));
+
+      combineMetadata(K, J);
 
       for (unsigned o = 0; o < NumOperands; ++o)
         K->setOperand(o, ReplacedOperands[o]);
@@ -1947,6 +2413,7 @@ llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) {
 //===----------------------------------------------------------------------===//
 VectorizeConfig::VectorizeConfig() {
   VectorBits = ::VectorBits;
+  VectorizeBools = !::NoBools;
   VectorizeInts = !::NoInts;
   VectorizeFloats = !::NoFloats;
   VectorizePointers = !::NoPointers;
@@ -1954,6 +2421,7 @@ VectorizeConfig::VectorizeConfig() {
   VectorizeMath = !::NoMath;
   VectorizeFMA = !::NoFMA;
   VectorizeSelect = !::NoSelect;
+  VectorizeCmp = !::NoCmp;
   VectorizeGEP = !::NoGEP;
   VectorizeMemOps = !::NoMemOps;
   AlignedOnly = ::AlignedOnly;
@@ -1963,6 +2431,7 @@ VectorizeConfig::VectorizeConfig() {
   SplatBreaksChain = ::SplatBreaksChain;
   MaxInsts = ::MaxInsts;
   MaxIter = ::MaxIter;
+  Pow2LenOnly = ::Pow2LenOnly;
   NoMemOpBoost = ::NoMemOpBoost;
   FastDep = ::FastDep;
 }
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt
index 4b66930..06cf1e4 100644
--- a/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/lib/Transforms/Vectorize/CMakeLists.txt
@@ -2,3 +2,5 @@ add_llvm_library(LLVMVectorize
   BBVectorize.cpp
   Vectorize.cpp
   )
+
+add_dependencies(LLVMVectorize intrinsics_gen)
diff --git a/lib/VMCore/AsmWriter.cpp b/lib/VMCore/AsmWriter.cpp
index 7b39efb..aedb86b 100644
--- a/lib/VMCore/AsmWriter.cpp
+++ b/lib/VMCore/AsmWriter.cpp
@@ -20,6 +20,7 @@
 #include "llvm/LLVMContext.h"
 #include "llvm/CallingConv.h"
 #include "llvm/Constants.h"
+#include "llvm/DebugInfo.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/InlineAsm.h"
 #include "llvm/IntrinsicInst.h"
@@ -99,7 +100,11 @@ static void
PrintLLVMName(raw_ostream &OS, StringRef Name, PrefixType Prefix) { bool NeedsQuotes = isdigit(Name[0]); if (!NeedsQuotes) { for (unsigned i = 0, e = Name.size(); i != e; ++i) { - char C = Name[i]; + // By making this unsigned, the value passed in to isalnum will always be + // in the range 0-255. This is important when building with MSVC because + // its implementation will assert. This situation can arise when dealing + // with UTF-8 multibyte characters. + unsigned char C = Name[i]; if (!isalnum(C) && C != '-' && C != '.' && C != '_') { NeedsQuotes = true; break; @@ -708,8 +713,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, } if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) { - if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEhalf || - &CFP->getValueAPF().getSemantics() == &APFloat::IEEEsingle || + if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEsingle || &CFP->getValueAPF().getSemantics() == &APFloat::IEEEdouble) { // We would like to output the FP constant value in exponential notation, // but we cannot do this if doing so will lose precision. Check here to @@ -759,16 +763,20 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, return; } - // Some form of long double. These appear as a magic letter identifying - // the type, then a fixed number of hex digits. + // Either half, or some form of long double. + // These appear as a magic letter identifying the type, then a + // fixed number of hex digits. Out << "0x"; + // Bit position, in the current word, of the next nibble to print. + int shiftcount; + if (&CFP->getValueAPF().getSemantics() == &APFloat::x87DoubleExtended) { Out << 'K'; // api needed to prevent premature destruction APInt api = CFP->getValueAPF().bitcastToAPInt(); const uint64_t* p = api.getRawData(); uint64_t word = p[1]; - int shiftcount=12; + shiftcount = 12; int width = api.getBitWidth(); for (int j=0; j<width; j+=4, shiftcount-=4) { unsigned int nibble = (word>>shiftcount) & 15; @@ -784,17 +792,21 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, } } return; - } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEquad) + } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEquad) { + shiftcount = 60; Out << 'L'; - else if (&CFP->getValueAPF().getSemantics() == &APFloat::PPCDoubleDouble) + } else if (&CFP->getValueAPF().getSemantics() == &APFloat::PPCDoubleDouble) { + shiftcount = 60; Out << 'M'; - else + } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEhalf) { + shiftcount = 12; + Out << 'H'; + } else llvm_unreachable("Unsupported floating point type"); // api needed to prevent premature destruction APInt api = CFP->getValueAPF().bitcastToAPInt(); const uint64_t* p = api.getRawData(); uint64_t word = *p; - int shiftcount=60; int width = api.getBitWidth(); for (int j=0; j<width; j+=4, shiftcount-=4) { unsigned int nibble = (word>>shiftcount) & 15; @@ -1369,6 +1381,26 @@ static void PrintVisibility(GlobalValue::VisibilityTypes Vis, } } +static void PrintThreadLocalModel(GlobalVariable::ThreadLocalMode TLM, + formatted_raw_ostream &Out) { + switch (TLM) { + case GlobalVariable::NotThreadLocal: + break; + case GlobalVariable::GeneralDynamicTLSModel: + Out << "thread_local "; + break; + case GlobalVariable::LocalDynamicTLSModel: + Out << "thread_local(localdynamic) "; + break; + case GlobalVariable::InitialExecTLSModel: + Out << "thread_local(initialexec) "; + break; + case GlobalVariable::LocalExecTLSModel: + Out << "thread_local(localexec) 
"; + break; + } +} + void AssemblyWriter::printGlobal(const GlobalVariable *GV) { if (GV->isMaterializable()) Out << "; Materializable\n"; @@ -1381,8 +1413,8 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) { PrintLinkage(GV->getLinkage(), Out); PrintVisibility(GV->getVisibility(), Out); + PrintThreadLocalModel(GV->getThreadLocalMode(), Out); - if (GV->isThreadLocal()) Out << "thread_local "; if (unsigned AddressSpace = GV->getType()->getAddressSpace()) Out << "addrspace(" << AddressSpace << ") "; if (GV->hasUnnamedAddr()) Out << "unnamed_addr "; @@ -2004,19 +2036,22 @@ static void WriteMDNodeComment(const MDNode *Node, formatted_raw_ostream &Out) { if (Node->getNumOperands() < 1) return; - ConstantInt *CI = dyn_cast_or_null<ConstantInt>(Node->getOperand(0)); - if (!CI) return; - APInt Val = CI->getValue(); - APInt Tag = Val & ~APInt(Val.getBitWidth(), LLVMDebugVersionMask); - if (Val.ult(LLVMDebugVersion11)) + + Value *Op = Node->getOperand(0); + if (!Op || !isa<ConstantInt>(Op) || cast<ConstantInt>(Op)->getBitWidth() < 32) + return; + + DIDescriptor Desc(Node); + if (Desc.getVersion() < LLVMDebugVersion11) return; + unsigned Tag = Desc.getTag(); Out.PadToColumn(50); - if (Tag == dwarf::DW_TAG_user_base) + if (dwarf::TagString(Tag)) { + Out << "; "; + Desc.print(Out); + } else if (Tag == dwarf::DW_TAG_user_base) { Out << "; [ DW_TAG_user_base ]"; - else if (Tag.isIntN(32)) { - if (const char *TagName = dwarf::TagString(Tag.getZExtValue())) - Out << "; [ " << TagName << " ]"; } } diff --git a/lib/VMCore/Attributes.cpp b/lib/VMCore/Attributes.cpp index c05132b..d466ac6 100644 --- a/lib/VMCore/Attributes.cpp +++ b/lib/VMCore/Attributes.cpp @@ -131,8 +131,8 @@ class AttributeListImpl : public FoldingSetNode { public: SmallVector<AttributeWithIndex, 4> Attrs; - AttributeListImpl(const AttributeWithIndex *Attr, unsigned NumAttrs) - : Attrs(Attr, Attr+NumAttrs) { + AttributeListImpl(ArrayRef<AttributeWithIndex> attrs) + : Attrs(attrs.begin(), attrs.end()) { RefCount = 0; } @@ -150,13 +150,12 @@ public: } void Profile(FoldingSetNodeID &ID) const { - Profile(ID, Attrs.data(), Attrs.size()); + Profile(ID, Attrs); } - static void Profile(FoldingSetNodeID &ID, const AttributeWithIndex *Attr, - unsigned NumAttrs) { - for (unsigned i = 0; i != NumAttrs; ++i) { - ID.AddInteger(Attr[i].Attrs.Raw()); - ID.AddInteger(Attr[i].Index); + static void Profile(FoldingSetNodeID &ID, ArrayRef<AttributeWithIndex> Attrs){ + for (unsigned i = 0, e = Attrs.size(); i != e; ++i) { + ID.AddInteger(Attrs[i].Attrs.Raw()); + ID.AddInteger(Attrs[i].Index); } } }; @@ -168,13 +167,13 @@ AttributeListImpl::~AttributeListImpl() { } -AttrListPtr AttrListPtr::get(const AttributeWithIndex *Attrs, unsigned NumAttrs) { +AttrListPtr AttrListPtr::get(ArrayRef<AttributeWithIndex> Attrs) { // If there are no attributes then return a null AttributesList pointer. - if (NumAttrs == 0) + if (Attrs.empty()) return AttrListPtr(); #ifndef NDEBUG - for (unsigned i = 0; i != NumAttrs; ++i) { + for (unsigned i = 0, e = Attrs.size(); i != e; ++i) { assert(Attrs[i].Attrs != Attribute::None && "Pointless attribute!"); assert((!i || Attrs[i-1].Index < Attrs[i].Index) && @@ -184,7 +183,7 @@ AttrListPtr AttrListPtr::get(const AttributeWithIndex *Attrs, unsigned NumAttrs) // Otherwise, build a key to look up the existing attributes. 
FoldingSetNodeID ID; - AttributeListImpl::Profile(ID, Attrs, NumAttrs); + AttributeListImpl::Profile(ID, Attrs); void *InsertPos; sys::SmartScopedLock<true> Lock(*ALMutex); @@ -195,7 +194,7 @@ AttrListPtr AttrListPtr::get(const AttributeWithIndex *Attrs, unsigned NumAttrs) // If we didn't find any existing attributes of the same shape then // create a new one and insert it. if (!PAL) { - PAL = new AttributeListImpl(Attrs, NumAttrs); + PAL = new AttributeListImpl(Attrs); AttributesLists->InsertNode(PAL, InsertPos); } @@ -308,7 +307,7 @@ AttrListPtr AttrListPtr::addAttr(unsigned Idx, Attributes Attrs) const { OldAttrList.begin()+i, OldAttrList.end()); } - return get(NewAttrList.data(), NewAttrList.size()); + return get(NewAttrList); } AttrListPtr AttrListPtr::removeAttr(unsigned Idx, Attributes Attrs) const { @@ -343,7 +342,7 @@ AttrListPtr AttrListPtr::removeAttr(unsigned Idx, Attributes Attrs) const { NewAttrList.insert(NewAttrList.end(), OldAttrList.begin()+i, OldAttrList.end()); - return get(NewAttrList.data(), NewAttrList.size()); + return get(NewAttrList); } void AttrListPtr::dump() const { diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp index 2e16372..094ca75 100644 --- a/lib/VMCore/AutoUpgrade.cpp +++ b/lib/VMCore/AutoUpgrade.cpp @@ -14,17 +14,32 @@ #include "llvm/AutoUpgrade.h" #include "llvm/Constants.h" #include "llvm/Function.h" +#include "llvm/IRBuilder.h" #include "llvm/Instruction.h" +#include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" #include "llvm/Module.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Support/CallSite.h" #include "llvm/Support/CFG.h" +#include "llvm/Support/CallSite.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/IRBuilder.h" #include <cstring> using namespace llvm; +// Upgrade the declarations of the SSE4.1 functions whose arguments have +// changed their type from v4f32 to v2i64. +static bool UpgradeSSE41Function(Function* F, Intrinsic::ID IID, + Function *&NewFn) { + // Check whether this is an old version of the function, which received + // v4f32 arguments. + Type *Arg0Type = F->getFunctionType()->getParamType(0); + if (Arg0Type != VectorType::get(Type::getFloatTy(F->getContext()), 4)) + return false; + + // Yes, it's old, replace it with new version. + F->setName(F->getName() + ".old"); + NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + return true; +} static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { assert(F && "Illegal to upgrade a non-existent Function."); @@ -37,6 +52,27 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { switch (Name[0]) { default: break; + case 'a': { + if (Name.startswith("arm.neon.vclz")) { + Type* args[2] = { + F->arg_begin()->getType(), + Type::getInt1Ty(F->getContext()) + }; + // Can't use Intrinsic::getDeclaration here as it adds a ".i1" to + // the end of the name. Change name from llvm.arm.neon.vclz.* to + // llvm.ctlz.* + FunctionType* fType = FunctionType::get(F->getReturnType(), args, false); + NewFn = Function::Create(fType, F->getLinkage(), + "llvm.ctlz." 
+ Name.substr(14), F->getParent()); + return true; + } + if (Name.startswith("arm.neon.vcnt")) { + NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop, + F->arg_begin()->getType()); + return true; + } + break; + } case 'c': { if (Name.startswith("ctlz.") && F->arg_size() == 1) { F->setName(Name + ".old"); @@ -57,17 +93,49 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { Name.startswith("x86.sse2.pcmpgt.") || Name.startswith("x86.avx2.pcmpeq.") || Name.startswith("x86.avx2.pcmpgt.") || - Name.startswith("x86.avx.vpermil.")) { + Name.startswith("x86.avx.vpermil.") || + Name == "x86.avx.movnt.dq.256" || + Name == "x86.avx.movnt.pd.256" || + Name == "x86.avx.movnt.ps.256" || + (Name.startswith("x86.xop.vpcom") && F->arg_size() == 2)) { NewFn = 0; return true; } + // SSE4.1 ptest functions may have an old signature. + if (Name.startswith("x86.sse41.ptest")) { + if (Name == "x86.sse41.ptestc") + return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestc, NewFn); + if (Name == "x86.sse41.ptestz") + return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestz, NewFn); + if (Name == "x86.sse41.ptestnzc") + return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestnzc, NewFn); + } + // frcz.ss/sd may need to have an argument dropped + if (Name.startswith("x86.xop.vfrcz.ss") && F->arg_size() == 2) { + F->setName(Name + ".old"); + NewFn = Intrinsic::getDeclaration(F->getParent(), + Intrinsic::x86_xop_vfrcz_ss); + return true; + } + if (Name.startswith("x86.xop.vfrcz.sd") && F->arg_size() == 2) { + F->setName(Name + ".old"); + NewFn = Intrinsic::getDeclaration(F->getParent(), + Intrinsic::x86_xop_vfrcz_sd); + return true; + } + // Fix the FMA4 intrinsics to remove the 4 + if (Name.startswith("x86.fma4.")) { + F->setName("llvm.x86.fma" + Name.substr(8)); + NewFn = F; + return true; + } break; } } - // This may not belong here. This function is effectively being overloaded - // to both detect an intrinsic which needs upgrading, and to provide the - // upgraded form of the intrinsic. We should perhaps have two separate + // This may not belong here. This function is effectively being overloaded + // to both detect an intrinsic which needs upgrading, and to provide the + // upgraded form of the intrinsic. We should perhaps have two separate // functions for this. return false; } @@ -89,8 +157,8 @@ bool llvm::UpgradeGlobalVariable(GlobalVariable *GV) { return false; } -// UpgradeIntrinsicCall - Upgrade a call to an old intrinsic to be a call the -// upgraded intrinsic. All argument and return casting must be provided in +// UpgradeIntrinsicCall - Upgrade a call to an old intrinsic to be a call the +// upgraded intrinsic. All argument and return casting must be provided in // order to seamlessly integrate with existing context. 
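// (Sketch of the intended call pattern, for illustration only; the real
// driver lives in the bitcode/IR readers and may differ in detail:
//
//   Function *NewFn;
//   if (UpgradeIntrinsicFunction(F, NewFn))
//     for (Value::use_iterator UI = F->use_begin(), UE = F->use_end();
//          UI != UE; )
//       if (CallInst *CI = dyn_cast<CallInst>(*UI++))
//         UpgradeIntrinsicCall(CI, NewFn);
//
// A null NewFn means the call is rewritten in place from its old name;
// otherwise the call is retargeted at NewFn, with whatever casts the new
// signature requires inserted around it.)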
void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Function *F = CI->getCalledFunction(); @@ -118,15 +186,85 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { "pcmpgt"); // need to sign extend since icmp returns vector of i1 Rep = Builder.CreateSExt(Rep, CI->getType(), ""); + } else if (Name == "llvm.x86.avx.movnt.dq.256" || + Name == "llvm.x86.avx.movnt.ps.256" || + Name == "llvm.x86.avx.movnt.pd.256") { + IRBuilder<> Builder(C); + Builder.SetInsertPoint(CI->getParent(), CI); + + Module *M = F->getParent(); + SmallVector<Value *, 1> Elts; + Elts.push_back(ConstantInt::get(Type::getInt32Ty(C), 1)); + MDNode *Node = MDNode::get(C, Elts); + + Value *Arg0 = CI->getArgOperand(0); + Value *Arg1 = CI->getArgOperand(1); + + // Convert the type of the pointer to a pointer to the stored type. + Value *BC = Builder.CreateBitCast(Arg0, + PointerType::getUnqual(Arg1->getType()), + "cast"); + StoreInst *SI = Builder.CreateStore(Arg1, BC); + SI->setMetadata(M->getMDKindID("nontemporal"), Node); + SI->setAlignment(16); + + // Remove intrinsic. + CI->eraseFromParent(); + return; + } else if (Name.startswith("llvm.x86.xop.vpcom")) { + Intrinsic::ID intID; + if (Name.endswith("ub")) + intID = Intrinsic::x86_xop_vpcomub; + else if (Name.endswith("uw")) + intID = Intrinsic::x86_xop_vpcomuw; + else if (Name.endswith("ud")) + intID = Intrinsic::x86_xop_vpcomud; + else if (Name.endswith("uq")) + intID = Intrinsic::x86_xop_vpcomuq; + else if (Name.endswith("b")) + intID = Intrinsic::x86_xop_vpcomb; + else if (Name.endswith("w")) + intID = Intrinsic::x86_xop_vpcomw; + else if (Name.endswith("d")) + intID = Intrinsic::x86_xop_vpcomd; + else if (Name.endswith("q")) + intID = Intrinsic::x86_xop_vpcomq; + else + llvm_unreachable("Unknown suffix"); + + Name = Name.substr(18); // strip off "llvm.x86.xop.vpcom" + unsigned Imm; + if (Name.startswith("lt")) + Imm = 0; + else if (Name.startswith("le")) + Imm = 1; + else if (Name.startswith("gt")) + Imm = 2; + else if (Name.startswith("ge")) + Imm = 3; + else if (Name.startswith("eq")) + Imm = 4; + else if (Name.startswith("ne")) + Imm = 5; + else if (Name.startswith("true")) + Imm = 6; + else if (Name.startswith("false")) + Imm = 7; + else + llvm_unreachable("Unknown condition"); + + Function *VPCOM = Intrinsic::getDeclaration(F->getParent(), intID); + Rep = Builder.CreateCall3(VPCOM, CI->getArgOperand(0), + CI->getArgOperand(1), Builder.getInt8(Imm)); } else { bool PD128 = false, PD256 = false, PS128 = false, PS256 = false; - if (Name.startswith("llvm.x86.avx.vpermil.pd.256")) + if (Name == "llvm.x86.avx.vpermil.pd.256") PD256 = true; - else if (Name.startswith("llvm.x86.avx.vpermil.pd")) + else if (Name == "llvm.x86.avx.vpermil.pd") PD128 = true; - else if (Name.startswith("llvm.x86.avx.vpermil.ps.256")) + else if (Name == "llvm.x86.avx.vpermil.ps.256") PS256 = true; - else if (Name.startswith("llvm.x86.avx.vpermil.ps")) + else if (Name == "llvm.x86.avx.vpermil.ps") PS128 = true; if (PD256 || PD128 || PS256 || PS128) { @@ -162,6 +300,9 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { return; } + std::string Name = CI->getName().str(); + CI->setName(Name + ".old"); + switch (NewFn->getIntrinsicID()) { default: llvm_unreachable("Unknown function for CallInst upgrade."); @@ -170,12 +311,60 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { case Intrinsic::cttz: assert(CI->getNumArgOperands() == 1 && "Mismatch between function args and call args"); - StringRef Name = CI->getName(); - CI->setName(Name + 
".old"); CI->replaceAllUsesWith(Builder.CreateCall2(NewFn, CI->getArgOperand(0), Builder.getFalse(), Name)); CI->eraseFromParent(); return; + + case Intrinsic::arm_neon_vclz: { + // Change name from llvm.arm.neon.vclz.* to llvm.ctlz.* + CI->replaceAllUsesWith(Builder.CreateCall2(NewFn, CI->getArgOperand(0), + Builder.getFalse(), + "llvm.ctlz." + Name.substr(14))); + CI->eraseFromParent(); + return; + } + case Intrinsic::ctpop: { + CI->replaceAllUsesWith(Builder.CreateCall(NewFn, CI->getArgOperand(0))); + CI->eraseFromParent(); + return; + } + + case Intrinsic::x86_xop_vfrcz_ss: + case Intrinsic::x86_xop_vfrcz_sd: + CI->replaceAllUsesWith(Builder.CreateCall(NewFn, CI->getArgOperand(1), + Name)); + CI->eraseFromParent(); + return; + + case Intrinsic::x86_sse41_ptestc: + case Intrinsic::x86_sse41_ptestz: + case Intrinsic::x86_sse41_ptestnzc: { + // The arguments for these intrinsics used to be v4f32, and changed + // to v2i64. This is purely a nop, since those are bitwise intrinsics. + // So, the only thing required is a bitcast for both arguments. + // First, check the arguments have the old type. + Value *Arg0 = CI->getArgOperand(0); + if (Arg0->getType() != VectorType::get(Type::getFloatTy(C), 4)) + return; + + // Old intrinsic, add bitcasts + Value *Arg1 = CI->getArgOperand(1); + + Value *BC0 = + Builder.CreateBitCast(Arg0, + VectorType::get(Type::getInt64Ty(C), 2), + "cast"); + Value *BC1 = + Builder.CreateBitCast(Arg1, + VectorType::get(Type::getInt64Ty(C), 2), + "cast"); + + CallInst* NewCall = Builder.CreateCall2(NewFn, BC0, BC1, Name); + CI->replaceAllUsesWith(NewCall); + CI->eraseFromParent(); + return; + } } } diff --git a/lib/VMCore/CMakeLists.txt b/lib/VMCore/CMakeLists.txt index e1efcda..648ccbd 100644 --- a/lib/VMCore/CMakeLists.txt +++ b/lib/VMCore/CMakeLists.txt @@ -8,7 +8,9 @@ add_llvm_library(LLVMCore ConstantFold.cpp Constants.cpp Core.cpp + DebugInfo.cpp DebugLoc.cpp + DIBuilder.cpp Dominators.cpp Function.cpp GCOV.cpp @@ -36,3 +38,14 @@ add_llvm_library(LLVMCore ValueTypes.cpp Verifier.cpp ) + +# Workaround: It takes over 20 minutes to compile with msvc10. +# FIXME: Suppressing optimizations to core libraries would not be good thing. +if( MSVC_VERSION EQUAL 1600 ) +set_property( + SOURCE Function.cpp + PROPERTY COMPILE_FLAGS "/Og-" + ) +endif() + +add_dependencies(LLVMCore intrinsics_gen) diff --git a/lib/VMCore/ConstantFold.cpp b/lib/VMCore/ConstantFold.cpp index b743287..8e82876 100644 --- a/lib/VMCore/ConstantFold.cpp +++ b/lib/VMCore/ConstantFold.cpp @@ -55,13 +55,12 @@ static Constant *BitCastConstantVector(Constant *CV, VectorType *DstTy) { Type *DstEltTy = DstTy->getElementType(); - // Check to verify that all elements of the input are simple. 
SmallVector<Constant*, 16> Result; + Type *Ty = IntegerType::get(CV->getContext(), 32); for (unsigned i = 0; i != NumElts; ++i) { - Constant *C = CV->getAggregateElement(i); - if (C == 0) return 0; + Constant *C = + ConstantExpr::getExtractElement(CV, ConstantInt::get(Ty, i)); C = ConstantExpr::getBitCast(C, DstEltTy); - if (isa<ConstantExpr>(C)) return 0; Result.push_back(C); } @@ -553,9 +552,12 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V, SmallVector<Constant*, 16> res; VectorType *DestVecTy = cast<VectorType>(DestTy); Type *DstEltTy = DestVecTy->getElementType(); - for (unsigned i = 0, e = V->getType()->getVectorNumElements(); i != e; ++i) - res.push_back(ConstantExpr::getCast(opc, - V->getAggregateElement(i), DstEltTy)); + Type *Ty = IntegerType::get(V->getContext(), 32); + for (unsigned i = 0, e = V->getType()->getVectorNumElements(); i != e; ++i) { + Constant *C = + ConstantExpr::getExtractElement(V, ConstantInt::get(Ty, i)); + res.push_back(ConstantExpr::getCast(opc, C, DstEltTy)); + } return ConstantVector::get(res); } @@ -696,12 +698,13 @@ Constant *llvm::ConstantFoldSelectInstruction(Constant *Cond, // If the condition is a vector constant, fold the result elementwise. if (ConstantVector *CondV = dyn_cast<ConstantVector>(Cond)) { SmallVector<Constant*, 16> Result; + Type *Ty = IntegerType::get(CondV->getContext(), 32); for (unsigned i = 0, e = V1->getType()->getVectorNumElements(); i != e;++i){ ConstantInt *Cond = dyn_cast<ConstantInt>(CondV->getOperand(i)); if (Cond == 0) break; - Constant *Res = (Cond->getZExtValue() ? V2 : V1)->getAggregateElement(i); - if (Res == 0) break; + Constant *V = Cond->isNullValue() ? V2 : V1; + Constant *Res = ConstantExpr::getExtractElement(V, ConstantInt::get(Ty, i)); Result.push_back(Res); } @@ -721,12 +724,12 @@ Constant *llvm::ConstantFoldSelectInstruction(Constant *Cond, if (ConstantExpr *TrueVal = dyn_cast<ConstantExpr>(V1)) { if (TrueVal->getOpcode() == Instruction::Select) if (TrueVal->getOperand(0) == Cond) - return ConstantExpr::getSelect(Cond, TrueVal->getOperand(1), V2); + return ConstantExpr::getSelect(Cond, TrueVal->getOperand(1), V2); } if (ConstantExpr *FalseVal = dyn_cast<ConstantExpr>(V2)) { if (FalseVal->getOpcode() == Instruction::Select) if (FalseVal->getOperand(0) == Cond) - return ConstantExpr::getSelect(Cond, V1, FalseVal->getOperand(2)); + return ConstantExpr::getSelect(Cond, V1, FalseVal->getOperand(2)); } return 0; @@ -760,16 +763,16 @@ Constant *llvm::ConstantFoldInsertElementInstruction(Constant *Val, const APInt &IdxVal = CIdx->getValue(); SmallVector<Constant*, 16> Result; + Type *Ty = IntegerType::get(Val->getContext(), 32); for (unsigned i = 0, e = Val->getType()->getVectorNumElements(); i != e; ++i){ if (i == IdxVal) { Result.push_back(Elt); continue; } - if (Constant *C = Val->getAggregateElement(i)) - Result.push_back(C); - else - return 0; + Constant *C = + ConstantExpr::getExtractElement(Val, ConstantInt::get(Ty, i)); + Result.push_back(C); } return ConstantVector::get(Result); @@ -801,11 +804,15 @@ Constant *llvm::ConstantFoldShuffleVectorInstruction(Constant *V1, Constant *InElt; if (unsigned(Elt) >= SrcNumElts*2) InElt = UndefValue::get(EltTy); - else if (unsigned(Elt) >= SrcNumElts) - InElt = V2->getAggregateElement(Elt - SrcNumElts); - else - InElt = V1->getAggregateElement(Elt); - if (InElt == 0) return 0; + else if (unsigned(Elt) >= SrcNumElts) { + Type *Ty = IntegerType::get(V2->getContext(), 32); + InElt = + ConstantExpr::getExtractElement(V2, + ConstantInt::get(Ty, Elt - 
SrcNumElts)); + } else { + Type *Ty = IntegerType::get(V1->getContext(), 32); + InElt = ConstantExpr::getExtractElement(V1, ConstantInt::get(Ty, Elt)); + } Result.push_back(InElt); } @@ -1130,16 +1137,17 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, } else if (VectorType *VTy = dyn_cast<VectorType>(C1->getType())) { // Perform elementwise folding. SmallVector<Constant*, 16> Result; + Type *Ty = IntegerType::get(VTy->getContext(), 32); for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) { - Constant *LHS = C1->getAggregateElement(i); - Constant *RHS = C2->getAggregateElement(i); - if (LHS == 0 || RHS == 0) break; + Constant *LHS = + ConstantExpr::getExtractElement(C1, ConstantInt::get(Ty, i)); + Constant *RHS = + ConstantExpr::getExtractElement(C2, ConstantInt::get(Ty, i)); Result.push_back(ConstantExpr::get(Opcode, LHS, RHS)); } - if (Result.size() == VTy->getNumElements()) - return ConstantVector::get(Result); + return ConstantVector::get(Result); } if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1)) { @@ -1697,17 +1705,18 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, // If we can constant fold the comparison of each element, constant fold // the whole vector comparison. SmallVector<Constant*, 4> ResElts; + Type *Ty = IntegerType::get(C1->getContext(), 32); // Compare the elements, producing an i1 result or constant expr. for (unsigned i = 0, e = C1->getType()->getVectorNumElements(); i != e;++i){ - Constant *C1E = C1->getAggregateElement(i); - Constant *C2E = C2->getAggregateElement(i); - if (C1E == 0 || C2E == 0) break; + Constant *C1E = + ConstantExpr::getExtractElement(C1, ConstantInt::get(Ty, i)); + Constant *C2E = + ConstantExpr::getExtractElement(C2, ConstantInt::get(Ty, i)); ResElts.push_back(ConstantExpr::getCompare(pred, C1E, C2E)); } - if (ResElts.size() == C1->getType()->getVectorNumElements()) - return ConstantVector::get(ResElts); + return ConstantVector::get(ResElts); } if (C1->getType()->isFloatingPointTy()) { diff --git a/lib/VMCore/Constants.cpp b/lib/VMCore/Constants.cpp index 6dbc144..a4e21e1 100644 --- a/lib/VMCore/Constants.cpp +++ b/lib/VMCore/Constants.cpp @@ -46,7 +46,7 @@ bool Constant::isNegativeZeroValue() const { // Floating point values have an explicit -0.0 value. if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this)) return CFP->isZero() && CFP->isNegative(); - + // Otherwise, just use +0.0. return isNullValue(); } @@ -55,7 +55,7 @@ bool Constant::isNullValue() const { // 0 is null. if (const ConstantInt *CI = dyn_cast<ConstantInt>(this)) return CI->isZero(); - + // +0.0 is null. if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this)) return CFP->isZero() && !CFP->isNegative(); @@ -161,19 +161,19 @@ Constant *Constant::getAllOnesValue(Type *Ty) { Constant *Constant::getAggregateElement(unsigned Elt) const { if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(this)) return Elt < CS->getNumOperands() ? CS->getOperand(Elt) : 0; - + if (const ConstantArray *CA = dyn_cast<ConstantArray>(this)) return Elt < CA->getNumOperands() ? CA->getOperand(Elt) : 0; - + if (const ConstantVector *CV = dyn_cast<ConstantVector>(this)) return Elt < CV->getNumOperands() ? CV->getOperand(Elt) : 0; - + if (const ConstantAggregateZero *CAZ =dyn_cast<ConstantAggregateZero>(this)) return CAZ->getElementValue(Elt); - + if (const UndefValue *UV = dyn_cast<UndefValue>(this)) return UV->getElementValue(Elt); - + if (const ConstantDataSequential *CDS =dyn_cast<ConstantDataSequential>(this)) return Elt < CDS->getNumElements() ? 
CDS->getElementAsConstant(Elt) : 0; return 0; @@ -222,10 +222,10 @@ bool Constant::canTrap() const { // The only thing that could possibly trap are constant exprs. const ConstantExpr *CE = dyn_cast<ConstantExpr>(this); if (!CE) return false; - - // ConstantExpr traps if any operands can trap. + + // ConstantExpr traps if any operands can trap. for (unsigned i = 0, e = getNumOperands(); i != e; ++i) - if (CE->getOperand(i)->canTrap()) + if (CE->getOperand(i)->canTrap()) return true; // Otherwise, only specific operations can trap. @@ -252,7 +252,7 @@ bool Constant::isConstantUsed() const { const Constant *UC = dyn_cast<Constant>(*UI); if (UC == 0 || isa<GlobalValue>(UC)) return true; - + if (UC->isConstantUsed()) return true; } @@ -302,12 +302,12 @@ Constant::PossibleRelocationsTy Constant::getRelocationInfo() const { cast<BlockAddress>(RHS->getOperand(0))->getFunction()) return NoRelocation; } - + PossibleRelocationsTy Result = NoRelocation; for (unsigned i = 0, e = getNumOperands(); i != e; ++i) Result = std::max(Result, cast<Constant>(getOperand(i))->getRelocationInfo()); - + return Result; } @@ -316,14 +316,14 @@ Constant::PossibleRelocationsTy Constant::getRelocationInfo() const { /// constantexpr. static bool removeDeadUsersOfConstant(const Constant *C) { if (isa<GlobalValue>(C)) return false; // Cannot remove this - + while (!C->use_empty()) { const Constant *User = dyn_cast<Constant>(C->use_back()); if (!User) return false; // Non-constant usage; if (!removeDeadUsersOfConstant(User)) return false; // Constant wasn't dead } - + const_cast<Constant*>(C)->destroyConstant(); return true; } @@ -343,7 +343,7 @@ void Constant::removeDeadConstantUsers() const { ++I; continue; } - + if (!removeDeadUsersOfConstant(User)) { // If the constant wasn't dead, remember that this was the last live use // and move on to the next constant. @@ -351,7 +351,7 @@ void Constant::removeDeadConstantUsers() const { ++I; continue; } - + // If the constant was dead, then the iterator is invalidated. if (LastNonDeadUser == E) { I = use_begin(); @@ -485,7 +485,7 @@ static const fltSemantics *TypeToFloatSemantics(Type *Ty) { return &APFloat::x87DoubleExtended; else if (Ty->isFP128Ty()) return &APFloat::IEEEquad; - + assert(Ty->isPPC_FP128Ty() && "Unknown FP format"); return &APFloat::PPCDoubleDouble; } @@ -497,7 +497,7 @@ void ConstantFP::anchor() { } /// 2.0/1.0 etc, that are known-valid both as double and as the target format. Constant *ConstantFP::get(Type *Ty, double V) { LLVMContext &Context = Ty->getContext(); - + APFloat FV(V); bool ignored; FV.convert(*TypeToFloatSemantics(Ty->getScalarType()), @@ -550,11 +550,11 @@ Constant *ConstantFP::getZeroValueForNegation(Type *Ty) { // ConstantFP accessors. ConstantFP* ConstantFP::get(LLVMContext &Context, const APFloat& V) { DenseMapAPFloatKeyInfo::KeyTy Key(V); - + LLVMContextImpl* pImpl = Context.pImpl; - + ConstantFP *&Slot = pImpl->FPConstants[Key]; - + if (!Slot) { Type *Ty; if (&V.getSemantics() == &APFloat::IEEEhalf) @@ -574,7 +574,7 @@ ConstantFP* ConstantFP::get(LLVMContext &Context, const APFloat& V) { } Slot = new ConstantFP(Ty, V); } - + return Slot; } @@ -695,7 +695,7 @@ Constant *ConstantArray::get(ArrayType *Ty, ArrayRef<Constant*> V) { "Wrong type in array element initializer"); } LLVMContextImpl *pImpl = Ty->getContext().pImpl; - + // If this is an all-zero array, return a ConstantAggregateZero object. If // all undef, return an UndefValue, if "all simple", then return a // ConstantDataArray. 
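  // (Concretely, for illustration: an element list of four zero i8s collapses
  // to a single ConstantAggregateZero; four undefs collapse to one UndefValue;
  // [i8 1, i8 2, i8 3, i8 4] is packed into a ConstantDataArray; only element
  // lists containing things like ConstantExprs fall through to a real
  // ConstantArray node.)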
@@ -751,7 +751,7 @@ Constant *ConstantArray::get(ArrayType *Ty, ArrayRef<Constant*> V) { return ConstantDataArray::get(C->getContext(), Elts); } } - + if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) { if (CFP->getType()->isFloatTy()) { SmallVector<float, 16> Elts; @@ -788,7 +788,7 @@ StructType *ConstantStruct::getTypeForElements(LLVMContext &Context, SmallVector<Type*, 16> EltTypes(VecSize); for (unsigned i = 0; i != VecSize; ++i) EltTypes[i] = V[i]->getType(); - + return StructType::get(Context, EltTypes, Packed); } @@ -833,12 +833,12 @@ Constant *ConstantStruct::get(StructType *ST, ArrayRef<Constant*> V) { isUndef = false; } } - } + } if (isZero) return ConstantAggregateZero::get(ST); if (isUndef) return UndefValue::get(ST); - + return ST->getContext().pImpl->StructConstants.getOrCreate(ST, V); } @@ -881,12 +881,12 @@ Constant *ConstantVector::get(ArrayRef<Constant*> V) { break; } } - + if (isZero) return ConstantAggregateZero::get(T); if (isUndef) return UndefValue::get(T); - + // Check to see if all of the elements are ConstantFP or ConstantInt and if // the element type is compatible with ConstantDataVector. If so, use it. if (ConstantDataSequential::isElementTypeCompatible(C->getType())) { @@ -932,7 +932,7 @@ Constant *ConstantVector::get(ArrayRef<Constant*> V) { return ConstantDataVector::get(C->getContext(), Elts); } } - + if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) { if (CFP->getType()->isFloatTy()) { SmallVector<float, 16> Elts; @@ -955,7 +955,7 @@ Constant *ConstantVector::get(ArrayRef<Constant*> V) { } } } - + // Otherwise, the element type isn't compatible with ConstantDataVector, or // the operand list constants a ConstantExpr or something else strange. return pImpl->VectorConstants.getOrCreate(T, V); @@ -967,7 +967,7 @@ Constant *ConstantVector::getSplat(unsigned NumElts, Constant *V) { if ((isa<ConstantFP>(V) || isa<ConstantInt>(V)) && ConstantDataSequential::isElementTypeCompatible(V->getType())) return ConstantDataVector::getSplat(NumElts, V); - + SmallVector<Constant*, 32> Elts(NumElts, V); return get(Elts); } @@ -1039,7 +1039,7 @@ ConstantExpr::getWithOperandReplaced(unsigned OpNo, Constant *Op) const { SmallVector<Constant*, 8> NewOps; for (unsigned i = 0, e = getNumOperands(); i != e; ++i) NewOps.push_back(i == OpNo ? Op : getOperand(i)); - + return getWithOperands(NewOps); } @@ -1052,7 +1052,7 @@ getWithOperands(ArrayRef<Constant*> Ops, Type *Ty) const { bool AnyChange = Ty != getType(); for (unsigned i = 0; i != Ops.size(); ++i) AnyChange |= Ops[i] != getOperand(i); - + if (!AnyChange) // No operands changed, return self. 
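  // (Callers rely on this identity property: because constants are uniqued,
  // an unchanged operand list must hand back the very same node, so pointer
  // equality keeps working.)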
return const_cast<ConstantExpr*>(this); @@ -1177,7 +1177,7 @@ ConstantAggregateZero *ConstantAggregateZero::get(Type *Ty) { ConstantAggregateZero *&Entry = Ty->getContext().pImpl->CAZConstants[Ty]; if (Entry == 0) Entry = new ConstantAggregateZero(Ty); - + return Entry; } @@ -1232,7 +1232,7 @@ ConstantPointerNull *ConstantPointerNull::get(PointerType *Ty) { ConstantPointerNull *&Entry = Ty->getContext().pImpl->CPNConstants[Ty]; if (Entry == 0) Entry = new ConstantPointerNull(Ty); - + return Entry; } @@ -1252,7 +1252,7 @@ UndefValue *UndefValue::get(Type *Ty) { UndefValue *&Entry = Ty->getContext().pImpl->UVConstants[Ty]; if (Entry == 0) Entry = new UndefValue(Ty); - + return Entry; } @@ -1277,7 +1277,7 @@ BlockAddress *BlockAddress::get(Function *F, BasicBlock *BB) { F->getContext().pImpl->BlockAddresses[std::make_pair(F, BB)]; if (BA == 0) BA = new BlockAddress(F, BB); - + assert(BA->getFunction() == F && "Basic block moved between functions"); return BA; } @@ -1305,19 +1305,19 @@ void BlockAddress::replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U) { // case, we have to remove the map entry. Function *NewF = getFunction(); BasicBlock *NewBB = getBasicBlock(); - + if (U == &Op<0>()) NewF = cast<Function>(To); else NewBB = cast<BasicBlock>(To); - + // See if the 'new' entry already exists, if not, just update this in place // and return early. BlockAddress *&NewBA = getContext().pImpl->BlockAddresses[std::make_pair(NewF, NewBB)]; if (NewBA == 0) { getBasicBlock()->AdjustBlockAddressRefCount(-1); - + // Remove the old entry, this can't cause the map to rehash (just a // tombstone will get added). getContext().pImpl->BlockAddresses.erase(std::make_pair(getFunction(), @@ -1331,10 +1331,10 @@ void BlockAddress::replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U) { // Otherwise, I do need to replace this with an existing value. assert(NewBA != this && "I didn't contain From!"); - + // Everyone using this now uses the replacement. replaceAllUsesWith(NewBA); - + destroyConstant(); } @@ -1355,10 +1355,10 @@ static inline Constant *getFoldedCast( // Look up the constant in the table first to ensure uniqueness std::vector<Constant*> argVec(1, C); ExprMapKeyType Key(opc, argVec); - + return pImpl->ExprConstants.getOrCreate(Ty, Key); } - + Constant *ConstantExpr::getCast(unsigned oc, Constant *C, Type *Ty) { Instruction::CastOps opc = Instruction::CastOps(oc); assert(Instruction::isCast(opc) && "opcode out of range"); @@ -1381,7 +1381,7 @@ Constant *ConstantExpr::getCast(unsigned oc, Constant *C, Type *Ty) { case Instruction::IntToPtr: return getIntToPtr(C, Ty); case Instruction::BitCast: return getBitCast(C, Ty); } -} +} Constant *ConstantExpr::getZExtOrBitCast(Constant *C, Type *Ty) { if (C->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits()) @@ -1572,11 +1572,11 @@ Constant *ConstantExpr::getIntToPtr(Constant *C, Type *DstTy) { Constant *ConstantExpr::getBitCast(Constant *C, Type *DstTy) { assert(CastInst::castIsValid(Instruction::BitCast, C, DstTy) && "Invalid constantexpr bitcast!"); - + // It is common to ask for a bitcast of a value to its own type, handle this // speedily. 
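  // (For example, ConstantExpr::getBitCast(C, C->getType()) just returns C;
  // any other bitcast falls through to getFoldedCast, which first tries to
  // constant-fold the cast and only then uniques a new constant-expression
  // node.)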
if (C->getType() == DstTy) return C; - + return getFoldedCast(Instruction::BitCast, C, DstTy); } @@ -1588,7 +1588,7 @@ Constant *ConstantExpr::get(unsigned Opcode, Constant *C1, Constant *C2, "Invalid opcode in binary constant expression"); assert(C1->getType() == C2->getType() && "Operand types in binary constant expression should match"); - + #ifndef NDEBUG switch (Opcode) { case Instruction::Add: @@ -1649,11 +1649,11 @@ Constant *ConstantExpr::get(unsigned Opcode, Constant *C1, Constant *C2, if (Constant *FC = ConstantFoldBinaryInstruction(Opcode, C1, C2)) return FC; // Fold a few common cases. - + std::vector<Constant*> argVec(1, C1); argVec.push_back(C2); ExprMapKeyType Key(Opcode, argVec, 0, Flags); - + LLVMContextImpl *pImpl = C1->getContext().pImpl; return pImpl->ExprConstants.getOrCreate(C1->getType(), Key); } @@ -1703,7 +1703,7 @@ Constant *ConstantExpr::getOffsetOf(Type* Ty, Constant *FieldNo) { Constant *ConstantExpr::getCompare(unsigned short Predicate, Constant *C1, Constant *C2) { assert(C1->getType() == C2->getType() && "Op types should be identical!"); - + switch (Predicate) { default: llvm_unreachable("Invalid CmpInst predicate"); case CmpInst::FCMP_FALSE: case CmpInst::FCMP_OEQ: case CmpInst::FCMP_OGT: @@ -1713,7 +1713,7 @@ Constant *ConstantExpr::getCompare(unsigned short Predicate, case CmpInst::FCMP_ULT: case CmpInst::FCMP_ULE: case CmpInst::FCMP_UNE: case CmpInst::FCMP_TRUE: return getFCmp(Predicate, C1, C2); - + case CmpInst::ICMP_EQ: case CmpInst::ICMP_NE: case CmpInst::ICMP_UGT: case CmpInst::ICMP_UGE: case CmpInst::ICMP_ULT: case CmpInst::ICMP_ULE: case CmpInst::ICMP_SGT: case CmpInst::ICMP_SGE: case CmpInst::ICMP_SLT: @@ -1732,7 +1732,7 @@ Constant *ConstantExpr::getSelect(Constant *C, Constant *V1, Constant *V2) { argVec[1] = V1; argVec[2] = V2; ExprMapKeyType Key(Instruction::Select, argVec); - + LLVMContextImpl *pImpl = C->getContext().pImpl; return pImpl->ExprConstants.getOrCreate(V1->getType(), Key); } @@ -1747,7 +1747,7 @@ Constant *ConstantExpr::getGetElementPtr(Constant *C, ArrayRef<Value *> Idxs, assert(Ty && "GEP indices invalid!"); unsigned AS = C->getType()->getPointerAddressSpace(); Type *ReqTy = Ty->getPointerTo(AS); - + assert(C->getType()->isPointerTy() && "Non-pointer type for constant GetElementPtr expression"); // Look up the constant in the table first to ensure uniqueness @@ -1758,7 +1758,7 @@ Constant *ConstantExpr::getGetElementPtr(Constant *C, ArrayRef<Value *> Idxs, ArgVec.push_back(cast<Constant>(Idxs[i])); const ExprMapKeyType Key(Instruction::GetElementPtr, ArgVec, 0, InBounds ? GEPOperator::IsInBounds : 0); - + LLVMContextImpl *pImpl = C->getContext().pImpl; return pImpl->ExprConstants.getOrCreate(ReqTy, Key); } @@ -1815,15 +1815,15 @@ Constant *ConstantExpr::getExtractElement(Constant *Val, Constant *Idx) { "Tried to create extractelement operation on non-vector type!"); assert(Idx->getType()->isIntegerTy(32) && "Extractelement index must be i32 type!"); - + if (Constant *FC = ConstantFoldExtractElementInstruction(Val, Idx)) return FC; // Fold a few common cases. 
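  // (Illustration: extractelement <2 x i32> <i32 7, i32 9>, i32 1 folds to
  // i32 9 right here; an extract whose lane is not a simple constant, e.g.
  // one holding a global's address, falls through and is uniqued as an
  // extractelement constant expression below. That guarantee is what lets
  // the ConstantFold.cpp changes above use getExtractElement per lane
  // instead of bailing out when getAggregateElement returned null.)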
- + // Look up the constant in the table first to ensure uniqueness std::vector<Constant*> ArgVec(1, Val); ArgVec.push_back(Idx); const ExprMapKeyType Key(Instruction::ExtractElement,ArgVec); - + LLVMContextImpl *pImpl = Val->getContext().pImpl; Type *ReqTy = Val->getType()->getVectorElementType(); return pImpl->ExprConstants.getOrCreate(ReqTy, Key); @@ -1845,7 +1845,7 @@ Constant *ConstantExpr::getInsertElement(Constant *Val, Constant *Elt, ArgVec.push_back(Elt); ArgVec.push_back(Idx); const ExprMapKeyType Key(Instruction::InsertElement,ArgVec); - + LLVMContextImpl *pImpl = Val->getContext().pImpl; return pImpl->ExprConstants.getOrCreate(Val->getType(), Key); } @@ -1867,7 +1867,7 @@ Constant *ConstantExpr::getShuffleVector(Constant *V1, Constant *V2, ArgVec.push_back(V2); ArgVec.push_back(Mask); const ExprMapKeyType Key(Instruction::ShuffleVector,ArgVec); - + LLVMContextImpl *pImpl = ShufTy->getContext().pImpl; return pImpl->ExprConstants.getOrCreate(ShufTy, Key); } @@ -1892,7 +1892,7 @@ Constant *ConstantExpr::getExtractValue(Constant *Agg, Type *ReqTy = ExtractValueInst::getIndexedType(Agg->getType(), Idxs); (void)ReqTy; assert(ReqTy && "extractvalue indices invalid!"); - + assert(Agg->getType()->isFirstClassType() && "Non-first-class type for constant extractvalue expression"); Constant *FC = ConstantFoldExtractValueInstruction(Agg, Idxs); @@ -2007,6 +2007,47 @@ Constant *ConstantExpr::getAShr(Constant *C1, Constant *C2, bool isExact) { isExact ? PossiblyExactOperator::IsExact : 0); } +/// getBinOpIdentity - Return the identity for the given binary operation, +/// i.e. a constant C such that X op C = X and C op X = X for every X. It +/// returns null if the operator doesn't have an identity. +Constant *ConstantExpr::getBinOpIdentity(unsigned Opcode, Type *Ty) { + switch (Opcode) { + default: + // Doesn't have an identity. + return 0; + + case Instruction::Add: + case Instruction::Or: + case Instruction::Xor: + return Constant::getNullValue(Ty); + + case Instruction::Mul: + return ConstantInt::get(Ty, 1); + + case Instruction::And: + return Constant::getAllOnesValue(Ty); + } +} + +/// getBinOpAbsorber - Return the absorbing element for the given binary +/// operation, i.e. a constant C such that X op C = C and C op X = C for +/// every X. For example, this returns zero for integer multiplication. +/// It returns null if the operator doesn't have an absorbing element. +Constant *ConstantExpr::getBinOpAbsorber(unsigned Opcode, Type *Ty) { + switch (Opcode) { + default: + // Doesn't have an absorber. + return 0; + + case Instruction::Or: + return Constant::getAllOnesValue(Ty); + + case Instruction::And: + case Instruction::Mul: + return Constant::getNullValue(Ty); + } +} + // destroyConstant - Remove the constant from the constant table... // void ConstantExpr::destroyConstant() { @@ -2107,7 +2148,7 @@ Constant *ConstantDataSequential::getImpl(StringRef Elements, Type *Ty) { // Do a lookup to see if we have already formed one of these. StringMap<ConstantDataSequential*>::MapEntryTy &Slot = Ty->getContext().pImpl->CDSConstants.GetOrCreateValue(Elements); - + // The bucket can point to a linked list of different CDS's that have the same // body but different types. For example, 0,0,0,1 could be a 4 element array // of i8, or a 1-element array of i32. They'll both end up in the same @@ -2117,7 +2158,7 @@ Constant *ConstantDataSequential::getImpl(StringRef Elements, Type *Ty) { Entry = &Node->Next, Node = *Entry) if (Node->getType() == Ty) return Node; - + // Okay, we didn't get a hit. 
Create a node of the right class, link it in, // and return it. if (isa<ArrayType>(Ty)) @@ -2131,7 +2172,7 @@ void ConstantDataSequential::destroyConstant() { // Remove the constant from the StringMap. StringMap<ConstantDataSequential*> &CDSConstants = getType()->getContext().pImpl->CDSConstants; - + StringMap<ConstantDataSequential*>::iterator Slot = CDSConstants.find(getRawDataValues()); @@ -2158,11 +2199,11 @@ void ConstantDataSequential::destroyConstant() { } } } - + // If we were part of a list, make sure that we don't delete the list that is // still owned by the uniquing map. Next = 0; - + // Finally, actually delete it. destroyConstantImpl(); } @@ -2172,27 +2213,33 @@ void ConstantDataSequential::destroyConstant() { /// can return a ConstantAggregateZero object. Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint8_t> Elts) { Type *Ty = ArrayType::get(Type::getInt8Ty(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*1), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*1), Ty); } Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint16_t> Elts){ Type *Ty = ArrayType::get(Type::getInt16Ty(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*2), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*2), Ty); } Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint32_t> Elts){ Type *Ty = ArrayType::get(Type::getInt32Ty(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*4), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*4), Ty); } Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint64_t> Elts){ Type *Ty = ArrayType::get(Type::getInt64Ty(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*8), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*8), Ty); } Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<float> Elts) { Type *Ty = ArrayType::get(Type::getFloatTy(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*4), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*4), Ty); } Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<double> Elts) { Type *Ty = ArrayType::get(Type::getDoubleTy(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*8), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*8), Ty); } /// getString - This method constructs a CDS and initializes it with a text @@ -2202,9 +2249,12 @@ Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<double> Elts) { /// to disable this behavior. 
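
The getBinOpIdentity/getBinOpAbsorber helpers added above exist for consumers such as instruction simplifiers. A minimal sketch of a hypothetical caller (trySimplifyBinOp is illustrative, not part of this patch); it leans on the fact that LLVM constants are uniqued, so pointer equality suffices:

static Value *trySimplifyBinOp(unsigned Opcode, Value *X, Constant *C) {
  Type *Ty = C->getType();
  // X op Identity == X, e.g. X + 0, X * 1, X & ~0.
  if (Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, Ty))
    if (C == Identity)
      return X;
  // X op Absorber == Absorber, e.g. X * 0, X & 0, X | ~0.
  if (Constant *Absorber = ConstantExpr::getBinOpAbsorber(Opcode, Ty))
    if (C == Absorber)
      return Absorber;
  return 0; // No simplification applies.
}
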
Constant *ConstantDataArray::getString(LLVMContext &Context, StringRef Str, bool AddNull) { - if (!AddNull) - return get(Context, ArrayRef<uint8_t>((uint8_t*)Str.data(), Str.size())); - + if (!AddNull) { + const uint8_t *Data = reinterpret_cast<const uint8_t *>(Str.data()); + return get(Context, ArrayRef<uint8_t>(const_cast<uint8_t *>(Data), + Str.size())); + } + SmallVector<uint8_t, 64> ElementVals; ElementVals.append(Str.begin(), Str.end()); ElementVals.push_back(0); @@ -2216,27 +2266,33 @@ Constant *ConstantDataArray::getString(LLVMContext &Context, /// can return a ConstantAggregateZero object. Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint8_t> Elts){ Type *Ty = VectorType::get(Type::getInt8Ty(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*1), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*1), Ty); } Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint16_t> Elts){ Type *Ty = VectorType::get(Type::getInt16Ty(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*2), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*2), Ty); } Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint32_t> Elts){ Type *Ty = VectorType::get(Type::getInt32Ty(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*4), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*4), Ty); } Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint64_t> Elts){ Type *Ty = VectorType::get(Type::getInt64Ty(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*8), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*8), Ty); } Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<float> Elts) { Type *Ty = VectorType::get(Type::getFloatTy(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*4), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*4), Ty); } Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<double> Elts) { Type *Ty = VectorType::get(Type::getDoubleTy(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*8), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*8), Ty); } Constant *ConstantDataVector::getSplat(unsigned NumElts, Constant *V) { @@ -2281,15 +2337,19 @@ uint64_t ConstantDataSequential::getElementAsInteger(unsigned Elt) const { assert(isa<IntegerType>(getElementType()) && "Accessor can only be used when element is an integer"); const char *EltPtr = getElementPointer(Elt); - + // The data is stored in host byte order, make sure to cast back to the right // type to load with the right endianness. 
switch (getElementType()->getIntegerBitWidth()) { default: llvm_unreachable("Invalid bitwidth for CDS"); - case 8: return *(uint8_t*)EltPtr; - case 16: return *(uint16_t*)EltPtr; - case 32: return *(uint32_t*)EltPtr; - case 64: return *(uint64_t*)EltPtr; + case 8: + return *const_cast<uint8_t *>(reinterpret_cast<const uint8_t *>(EltPtr)); + case 16: + return *const_cast<uint16_t *>(reinterpret_cast<const uint16_t *>(EltPtr)); + case 32: + return *const_cast<uint32_t *>(reinterpret_cast<const uint32_t *>(EltPtr)); + case 64: + return *const_cast<uint64_t *>(reinterpret_cast<const uint64_t *>(EltPtr)); } } @@ -2301,8 +2361,14 @@ APFloat ConstantDataSequential::getElementAsAPFloat(unsigned Elt) const { switch (getElementType()->getTypeID()) { default: llvm_unreachable("Accessor can only be used when element is float/double!"); - case Type::FloatTyID: return APFloat(*(float*)EltPtr); - case Type::DoubleTyID: return APFloat(*(double*)EltPtr); + case Type::FloatTyID: { + const float *FloatPtr = reinterpret_cast<const float *>(EltPtr); + return APFloat(*const_cast<float *>(FloatPtr)); + } + case Type::DoubleTyID: { + const double *DoublePtr = reinterpret_cast<const double *>(EltPtr); + return APFloat(*const_cast<double *>(DoublePtr)); + } } } @@ -2311,7 +2377,8 @@ APFloat ConstantDataSequential::getElementAsAPFloat(unsigned Elt) const { float ConstantDataSequential::getElementAsFloat(unsigned Elt) const { assert(getElementType()->isFloatTy() && "Accessor can only be used when element is a 'float'"); - return *(float*)getElementPointer(Elt); + const float *EltPtr = reinterpret_cast<const float *>(getElementPointer(Elt)); + return *const_cast<float *>(EltPtr); } /// getElementAsDouble - If this is a sequential container of doubles, return @@ -2319,7 +2386,9 @@ float ConstantDataSequential::getElementAsFloat(unsigned Elt) const { double ConstantDataSequential::getElementAsDouble(unsigned Elt) const { assert(getElementType()->isDoubleTy() && "Accessor can only be used when element is a 'double'"); - return *(double*)getElementPointer(Elt); + const double *EltPtr = + reinterpret_cast<const double *>(getElementPointer(Elt)); + return *const_cast<double *>(EltPtr); } /// getElementAsConstant - Return a Constant for a specified index's element. @@ -2328,7 +2397,7 @@ double ConstantDataSequential::getElementAsDouble(unsigned Elt) const { Constant *ConstantDataSequential::getElementAsConstant(unsigned Elt) const { if (getElementType()->isFloatTy() || getElementType()->isDoubleTy()) return ConstantFP::get(getContext(), getElementAsAPFloat(Elt)); - + return ConstantInt::get(getElementType(), getElementAsInteger(Elt)); } @@ -2342,12 +2411,12 @@ bool ConstantDataSequential::isString() const { bool ConstantDataSequential::isCString() const { if (!isString()) return false; - + StringRef Str = getAsString(); - + // The last value must be nul. if (Str.back() != 0) return false; - + // Other elements must be non-nul. return Str.drop_back().find(0) == StringRef::npos; } @@ -2356,13 +2425,13 @@ bool ConstantDataSequential::isCString() const { /// elements have the same value, return that value. Otherwise return NULL. Constant *ConstantDataVector::getSplatValue() const { const char *Base = getRawDataValues().data(); - + // Compare elements 1+ to the 0'th element. unsigned EltSize = getElementByteSize(); for (unsigned i = 1, e = getNumElements(); i != e; ++i) if (memcmp(Base, Base+i*EltSize, EltSize)) return 0; - + // If they're all the same, return the 0th one as a representative.
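
A round-trip sketch of the ConstantData API touched above (illustrative only; Ctx is an assumed existing LLVMContext): the get() overloads pack the elements into a host-byte-order byte string, getImpl() uniques on that string, and the getElementAs* accessors decode the bytes again.

uint32_t Vals[] = { 7, 7, 7, 7 };
Constant *C = ConstantDataArray::get(Ctx, ArrayRef<uint32_t>(Vals));
// An all-zero input would have produced a ConstantAggregateZero instead.
ConstantDataArray *CDA = cast<ConstantDataArray>(C);
uint64_t Second = CDA->getElementAsInteger(1); // 7, decoded from the raw bytes
Constant *Elt = CDA->getElementAsConstant(1);  // the uniqued i32 7
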
return getElementAsConstant(0); } @@ -2393,10 +2462,10 @@ void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To, Lookup.first = cast<ArrayType>(getType()); Values.reserve(getNumOperands()); // Build replacement array. - // Fill values with the modified operands of the constant array. Also, + // Fill values with the modified operands of the constant array. Also, // compute whether this turns into an all-zeros array. unsigned NumUpdated = 0; - + // Keep track of whether all the values in the array are "ToC". bool AllSame = true; for (Use *O = OperandList, *E = OperandList+getNumOperands(); O != E; ++O) { @@ -2408,7 +2477,7 @@ void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To, Values.push_back(Val); AllSame &= Val == ToC; } - + Constant *Replacement = 0; if (AllSame && ToC->isNullValue()) { Replacement = ConstantAggregateZero::get(getType()); @@ -2419,7 +2488,7 @@ void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To, Lookup.second = makeArrayRef(Values); LLVMContextImpl::ArrayConstantsTy::MapTy::iterator I = pImpl->ArrayConstants.find(Lookup); - + if (I != pImpl->ArrayConstants.map_end()) { Replacement = I->first; } else { @@ -2428,7 +2497,7 @@ void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To, // old with the new, then deleting the old... just update the current one // in place! pImpl->ArrayConstants.remove(this); - + // Update to the new value. Optimize for the case when we have a single // operand that we're changing, but handle bulk updates efficiently. if (NumUpdated == 1) { @@ -2445,13 +2514,13 @@ void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To, return; } } - + // Otherwise, I do need to replace this with an existing value. assert(Replacement != this && "I didn't contain From!"); - + // Everyone using this now uses the replacement. replaceAllUsesWith(Replacement); - + // Delete the old constant! destroyConstant(); } @@ -2468,8 +2537,8 @@ void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To, LLVMContextImpl::StructConstantsTy::LookupKey Lookup; Lookup.first = cast<StructType>(getType()); Values.reserve(getNumOperands()); // Build replacement struct. - - // Fill values with the modified operands of the constant struct. Also, + + // Fill values with the modified operands of the constant struct. Also, // compute whether this turns into an all-zeros struct. bool isAllZeros = false; bool isAllUndef = false; @@ -2492,9 +2561,9 @@ void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To, Values.push_back(cast<Constant>(O->get())); } Values[OperandToUpdate] = ToC; - + LLVMContextImpl *pImpl = getContext().pImpl; - + Constant *Replacement = 0; if (isAllZeros) { Replacement = ConstantAggregateZero::get(getType()); @@ -2505,7 +2574,7 @@ void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To, Lookup.second = makeArrayRef(Values); LLVMContextImpl::StructConstantsTy::MapTy::iterator I = pImpl->StructConstants.find(Lookup); - + if (I != pImpl->StructConstants.map_end()) { Replacement = I->first; } else { @@ -2514,19 +2583,19 @@ void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To, // old with the new, then deleting the old... just update the current one // in place! pImpl->StructConstants.remove(this); - + // Update to the new value. setOperand(OperandToUpdate, ToC); pImpl->StructConstants.insert(this); return; } } - + assert(Replacement != this && "I didn't contain From!"); - + // Everyone using this now uses the replacement. 
replaceAllUsesWith(Replacement); - + // Delete the old constant! destroyConstant(); } @@ -2534,7 +2603,7 @@ void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To, void ConstantVector::replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U) { assert(isa<Constant>(To) && "Cannot make Constant refer to non-constant!"); - + SmallVector<Constant*, 8> Values; Values.reserve(getNumOperands()); // Build replacement array... for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { @@ -2542,13 +2611,13 @@ void ConstantVector::replaceUsesOfWithOnConstant(Value *From, Value *To, if (Val == From) Val = cast<Constant>(To); Values.push_back(Val); } - + Constant *Replacement = get(Values); assert(Replacement != this && "I didn't contain From!"); - + // Everyone using this now uses the replacement. replaceAllUsesWith(Replacement); - + // Delete the old constant! destroyConstant(); } @@ -2557,19 +2626,19 @@ void ConstantExpr::replaceUsesOfWithOnConstant(Value *From, Value *ToV, Use *U) { assert(isa<Constant>(ToV) && "Cannot make Constant refer to non-constant!"); Constant *To = cast<Constant>(ToV); - + SmallVector<Constant*, 8> NewOps; for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { Constant *Op = getOperand(i); NewOps.push_back(Op == From ? To : Op); } - + Constant *Replacement = getWithOperands(NewOps); assert(Replacement != this && "I didn't contain From!"); - + // Everyone using this now uses the replacement. replaceAllUsesWith(Replacement); - + // Delete the old constant! destroyConstant(); } diff --git a/lib/VMCore/Core.cpp b/lib/VMCore/Core.cpp index a9cca22..972db3c 100644 --- a/lib/VMCore/Core.cpp +++ b/lib/VMCore/Core.cpp @@ -115,6 +115,25 @@ void LLVMDumpModule(LLVMModuleRef M) { unwrap(M)->dump(); } +LLVMBool LLVMPrintModuleToFile(LLVMModuleRef M, const char *Filename, + char **ErrorMessage) { + std::string error; + raw_fd_ostream dest(Filename, error); + if (!error.empty()) { + *ErrorMessage = strdup(error.c_str()); + return true; + } + + unwrap(M)->print(dest, NULL); + + if (!error.empty()) { + *ErrorMessage = strdup(error.c_str()); + return true; + } + dest.flush(); + return false; +} + /*--.. 
Operations on inline assembler ......................................--*/ void LLVMSetModuleInlineAsm(LLVMModuleRef M, const char *Asm) { unwrap(M)->setModuleInlineAsm(StringRef(Asm)); @@ -1191,7 +1210,7 @@ LLVMValueRef LLVMAddGlobalInAddressSpace(LLVMModuleRef M, LLVMTypeRef Ty, unsigned AddressSpace) { return wrap(new GlobalVariable(*unwrap(M), unwrap(Ty), false, GlobalValue::ExternalLinkage, 0, Name, 0, - false, AddressSpace)); + GlobalVariable::NotThreadLocal, AddressSpace)); } LLVMValueRef LLVMGetNamedGlobal(LLVMModuleRef M, const char *Name) { diff --git a/lib/Analysis/DIBuilder.cpp b/lib/VMCore/DIBuilder.cpp index 85913b1..f5894e9 100644 --- a/lib/Analysis/DIBuilder.cpp +++ b/lib/VMCore/DIBuilder.cpp @@ -11,9 +11,9 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Analysis/DIBuilder.h" -#include "llvm/Analysis/DebugInfo.h" +#include "llvm/DIBuilder.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/IntrinsicInst.h" #include "llvm/Module.h" #include "llvm/ADT/STLExtras.h" @@ -47,16 +47,16 @@ void DIBuilder::finalize() { DIType(TempSubprograms).replaceAllUsesWith(SPs); for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) { DISubprogram SP(SPs.getElement(i)); + SmallVector<Value *, 4> Variables; if (NamedMDNode *NMD = getFnSpecificMDNode(M, SP)) { - SmallVector<Value *, 4> Variables; for (unsigned ii = 0, ee = NMD->getNumOperands(); ii != ee; ++ii) Variables.push_back(NMD->getOperand(ii)); - if (MDNode *Temp = SP.getVariablesNodes()) { - DIArray AV = getOrCreateArray(Variables); - DIType(Temp).replaceAllUsesWith(AV); - } NMD->eraseFromParent(); } + if (MDNode *Temp = SP.getVariablesNodes()) { + DIArray AV = getOrCreateArray(Variables); + DIType(Temp).replaceAllUsesWith(AV); + } } DIArray GVs = getOrCreateArray(AllGVs); @@ -101,7 +101,7 @@ void DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename, Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_compile_unit), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)), ConstantInt::get(Type::getInt32Ty(VMContext), Lang), MDString::get(VMContext, Filename), MDString::get(VMContext, Directory), @@ -163,7 +163,7 @@ DIType DIBuilder::createNullPtrType(StringRef Name) { ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags; - ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Encoding + ConstantInt::get(Type::getInt32Ty(VMContext), 0) // Encoding }; return DIType(MDNode::get(VMContext, Elts)); } @@ -229,12 +229,13 @@ DIType DIBuilder::createPointerType(DIType PointeeTy, uint64_t SizeInBits, return DIType(MDNode::get(VMContext, Elts)); } -/// createReferenceType - Create debugging information entry for a reference. -DIType DIBuilder::createReferenceType(DIType RTy) { +/// createReferenceType - Create debugging information entry for a reference +/// type. +DIType DIBuilder::createReferenceType(unsigned Tag, DIType RTy) { assert(RTy.Verify() && "Unable to create reference type"); // References are encoded in DIDerivedType format. Value *Elts[] = { - GetTagConstant(VMContext, dwarf::DW_TAG_reference_type), + GetTagConstant(VMContext, Tag), NULL, // TheCU, NULL, // Name NULL, // Filename @@ -387,11 +388,11 @@ DIType DIBuilder::createObjCIVar(StringRef Name, /// createObjCProperty - Create debugging information entry for Objective-C /// property. 
DIObjCProperty DIBuilder::createObjCProperty(StringRef Name, - DIFile File, unsigned LineNumber, + DIFile File, unsigned LineNumber, StringRef GetterName, StringRef SetterName, unsigned PropertyAttributes, - DIType Ty) { + DIType Ty) { Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_APPLE_property), MDString::get(VMContext, Name), @@ -405,33 +406,6 @@ DIObjCProperty DIBuilder::createObjCProperty(StringRef Name, return DIObjCProperty(MDNode::get(VMContext, Elts)); } -/// createClassType - Create debugging information entry for a class. -DIType DIBuilder::createClassType(DIDescriptor Context, StringRef Name, - DIFile File, unsigned LineNumber, - uint64_t SizeInBits, uint64_t AlignInBits, - uint64_t OffsetInBits, unsigned Flags, - DIType DerivedFrom, DIArray Elements, - MDNode *VTableHolder, MDNode *TemplateParams) { - // TAG_class_type is encoded in DICompositeType format. - Value *Elts[] = { - GetTagConstant(VMContext, dwarf::DW_TAG_class_type), - getNonCompileUnitScope(Context), - MDString::get(VMContext, Name), - File, - ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber), - ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits), - ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits), - ConstantInt::get(Type::getInt32Ty(VMContext), OffsetInBits), - ConstantInt::get(Type::getInt32Ty(VMContext), Flags), - DerivedFrom, - Elements, - ConstantInt::get(Type::getInt32Ty(VMContext), 0), - VTableHolder, - TemplateParams - }; - return DIType(MDNode::get(VMContext, Elts)); -} - /// createTemplateTypeParameter - Create debugging information for template /// type parameter. DITemplateTypeParameter @@ -470,6 +444,34 @@ DIBuilder::createTemplateValueParameter(DIDescriptor Context, StringRef Name, return DITemplateValueParameter(MDNode::get(VMContext, Elts)); } +/// createClassType - Create debugging information entry for a class. +DIType DIBuilder::createClassType(DIDescriptor Context, StringRef Name, + DIFile File, unsigned LineNumber, + uint64_t SizeInBits, uint64_t AlignInBits, + uint64_t OffsetInBits, unsigned Flags, + DIType DerivedFrom, DIArray Elements, + MDNode *VTableHolder, + MDNode *TemplateParams) { + // TAG_class_type is encoded in DICompositeType format. + Value *Elts[] = { + GetTagConstant(VMContext, dwarf::DW_TAG_class_type), + getNonCompileUnitScope(Context), + MDString::get(VMContext, Name), + File, + ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber), + ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits), + ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits), + ConstantInt::get(Type::getInt32Ty(VMContext), OffsetInBits), + ConstantInt::get(Type::getInt32Ty(VMContext), Flags), + DerivedFrom, + Elements, + ConstantInt::get(Type::getInt32Ty(VMContext), 0), + VTableHolder, + TemplateParams + }; + return DIType(MDNode::get(VMContext, Elts)); +} + /// createStructType - Create debugging information entry for a struct. 
DIType DIBuilder::createStructType(DIDescriptor Context, StringRef Name, DIFile File, unsigned LineNumber, @@ -490,7 +492,7 @@ DIType DIBuilder::createStructType(DIDescriptor Context, StringRef Name, NULL, Elements, ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)) }; return DIType(MDNode::get(VMContext, Elts)); } @@ -515,7 +517,7 @@ DIType DIBuilder::createUnionType(DIDescriptor Scope, StringRef Name, NULL, Elements, ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)) }; return DIType(MDNode::get(VMContext, Elts)); } @@ -525,9 +527,9 @@ DIType DIBuilder::createSubroutineType(DIFile File, DIArray ParameterTypes) { // TAG_subroutine_type is encoded in DICompositeType format. Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_subroutine_type), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)), MDString::get(VMContext, ""), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)), ConstantInt::get(Type::getInt32Ty(VMContext), 0), ConstantInt::get(Type::getInt64Ty(VMContext), 0), ConstantInt::get(Type::getInt64Ty(VMContext), 0), @@ -536,7 +538,7 @@ DIType DIBuilder::createSubroutineType(DIFile File, DIArray ParameterTypes) { NULL, ParameterTypes, ConstantInt::get(Type::getInt32Ty(VMContext), 0), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)) }; return DIType(MDNode::get(VMContext, Elts)); } @@ -547,7 +549,8 @@ DIType DIBuilder::createEnumerationType(DIDescriptor Scope, StringRef Name, DIFile File, unsigned LineNumber, uint64_t SizeInBits, uint64_t AlignInBits, - DIArray Elements) { + DIArray Elements, + DIType ClassType, unsigned Flags) { // TAG_enumeration_type is encoded in DICompositeType format. 
Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_enumeration_type), @@ -558,11 +561,11 @@ DIType DIBuilder::createEnumerationType(DIDescriptor Scope, StringRef Name, ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits), ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits), ConstantInt::get(Type::getInt32Ty(VMContext), 0), - ConstantInt::get(Type::getInt32Ty(VMContext), 0), - NULL, + ConstantInt::get(Type::getInt32Ty(VMContext), Flags), + ClassType, Elements, ConstantInt::get(Type::getInt32Ty(VMContext), 0), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)) }; MDNode *Node = MDNode::get(VMContext, Elts); AllEnumTypes.push_back(Node); @@ -586,7 +589,7 @@ DIType DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits, Ty, Subscripts, ConstantInt::get(Type::getInt32Ty(VMContext), 0), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)) }; return DIType(MDNode::get(VMContext, Elts)); } @@ -608,7 +611,7 @@ DIType DIBuilder::createVectorType(uint64_t Size, uint64_t AlignInBits, Ty, Subscripts, ConstantInt::get(Type::getInt32Ty(VMContext), 0), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)) }; return DIType(MDNode::get(VMContext, Elts)); } @@ -677,12 +680,13 @@ DIType DIBuilder::createTemporaryType(DIFile F) { /// createForwardDecl - Create a temporary forward-declared type that /// can be RAUW'd if the full type is seen. -DIType DIBuilder::createForwardDecl(unsigned Tag, StringRef Name, DIFile F, +DIType DIBuilder::createForwardDecl(unsigned Tag, StringRef Name, + DIDescriptor Scope, DIFile F, unsigned Line, unsigned RuntimeLang) { // Create a temporary MDNode. Value *Elts[] = { GetTagConstant(VMContext, Tag), - NULL, // TheCU + getNonCompileUnitScope(Scope), MDString::get(VMContext, Name), F, ConstantInt::get(Type::getInt32Ty(VMContext), Line), @@ -703,7 +707,7 @@ DIType DIBuilder::createForwardDecl(unsigned Tag, StringRef Name, DIFile F, /// getOrCreateArray - Get a DIArray, create one if required. DIArray DIBuilder::getOrCreateArray(ArrayRef<Value *> Elements) { if (Elements.empty()) { - Value *Null = llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)); + Value *Null = Constant::getNullValue(Type::getInt32Ty(VMContext)); return DIArray(MDNode::get(VMContext, Null)); } return DIArray(MDNode::get(VMContext, Elements)); @@ -724,10 +728,10 @@ DISubrange DIBuilder::getOrCreateSubrange(int64_t Lo, int64_t Hi) { /// createGlobalVariable - Create a new descriptor for the specified global. 
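
A sketch of how a frontend might drive the two signatures changed above, createForwardDecl (which now takes an explicit Scope) and createEnumerationType (which now takes an underlying ClassType and Flags). DIB, CU, File, EnumElts, and UnderlyingTy are assumed, illustrative names, not part of this patch:

// Forward-declare a struct inside CU rather than at an unknown scope.
DIType Fwd = DIB.createForwardDecl(dwarf::DW_TAG_structure_type, "S",
                                   CU, File, /*Line=*/10, /*RuntimeLang=*/0);
// An enumeration can now record a fixed underlying type and flags.
DIType E = DIB.createEnumerationType(CU, "E", File, /*LineNumber=*/12,
                                     /*SizeInBits=*/32, /*AlignInBits=*/32,
                                     EnumElts, /*ClassType=*/UnderlyingTy,
                                     /*Flags=*/0);
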
DIGlobalVariable DIBuilder:: createGlobalVariable(StringRef Name, DIFile F, unsigned LineNumber, - DIType Ty, bool isLocalToUnit, llvm::Value *Val) { + DIType Ty, bool isLocalToUnit, Value *Val) { Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_variable), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)), NULL, // TheCU, MDString::get(VMContext, Name), MDString::get(VMContext, Name), @@ -749,10 +753,10 @@ createGlobalVariable(StringRef Name, DIFile F, unsigned LineNumber, DIGlobalVariable DIBuilder:: createStaticVariable(DIDescriptor Context, StringRef Name, StringRef LinkageName, DIFile F, unsigned LineNumber, - DIType Ty, bool isLocalToUnit, llvm::Value *Val) { + DIType Ty, bool isLocalToUnit, Value *Val) { Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_variable), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)), getNonCompileUnitScope(Context), MDString::get(VMContext, Name), MDString::get(VMContext, Name), @@ -783,7 +787,7 @@ DIVariable DIBuilder::createLocalVariable(unsigned Tag, DIDescriptor Scope, ConstantInt::get(Type::getInt32Ty(VMContext), (LineNo | (ArgNo << 24))), Ty, ConstantInt::get(Type::getInt32Ty(VMContext), Flags), - Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)) }; MDNode *Node = MDNode::get(VMContext, Elts); if (AlwaysPreserve) { @@ -812,8 +816,8 @@ DIVariable DIBuilder::createComplexVariable(unsigned Tag, DIDescriptor Scope, Elts.push_back(ConstantInt::get(Type::getInt32Ty(VMContext), (LineNo | (ArgNo << 24)))); Elts.push_back(Ty); - Elts.push_back(llvm::Constant::getNullValue(Type::getInt32Ty(VMContext))); - Elts.push_back(llvm::Constant::getNullValue(Type::getInt32Ty(VMContext))); + Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext))); + Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext))); Elts.append(Addr.begin(), Addr.end()); return DIVariable(MDNode::get(VMContext, Elts)); @@ -838,7 +842,7 @@ DISubprogram DIBuilder::createFunction(DIDescriptor Context, Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_subprogram), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)), getNonCompileUnitScope(Context), MDString::get(VMContext, Name), MDString::get(VMContext, Name), @@ -887,7 +891,7 @@ DISubprogram DIBuilder::createMethod(DIDescriptor Context, Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_subprogram), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)), getNonCompileUnitScope(Context), MDString::get(VMContext, Name), MDString::get(VMContext, Name), @@ -904,9 +908,9 @@ DISubprogram DIBuilder::createMethod(DIDescriptor Context, ConstantInt::get(Type::getInt1Ty(VMContext), isOptimized), Fn, TParam, - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)), THolder, - // FIXME: Do we want to use a different scope lines? + // FIXME: Do we want to use different scope/lines? 
ConstantInt::get(Type::getInt32Ty(VMContext), LineNo) }; MDNode *Node = MDNode::get(VMContext, Elts); diff --git a/lib/Analysis/DebugInfo.cpp b/lib/VMCore/DebugInfo.cpp index f61a8f3..c8f8f7d 100644 --- a/lib/Analysis/DebugInfo.cpp +++ b/lib/VMCore/DebugInfo.cpp @@ -12,7 +12,7 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Analysis/DebugInfo.h" +#include "llvm/DebugInfo.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Intrinsics.h" @@ -112,16 +112,16 @@ Function *DIDescriptor::getFunctionField(unsigned Elt) const { } unsigned DIVariable::getNumAddrElements() const { - if (getVersion() <= llvm::LLVMDebugVersion8) + if (getVersion() <= LLVMDebugVersion8) return DbgNode->getNumOperands()-6; - if (getVersion() == llvm::LLVMDebugVersion9) + if (getVersion() == LLVMDebugVersion9) return DbgNode->getNumOperands()-7; return DbgNode->getNumOperands()-8; } /// getInlinedAt - If this variable is inlined then return inline location. MDNode *DIVariable::getInlinedAt() const { - if (getVersion() <= llvm::LLVMDebugVersion9) + if (getVersion() <= LLVMDebugVersion9) return NULL; return dyn_cast_or_null<MDNode>(DbgNode->getOperand(7)); } @@ -150,6 +150,7 @@ bool DIDescriptor::isDerivedType() const { case dwarf::DW_TAG_typedef: case dwarf::DW_TAG_pointer_type: case dwarf::DW_TAG_reference_type: + case dwarf::DW_TAG_rvalue_reference_type: case dwarf::DW_TAG_const_type: case dwarf::DW_TAG_volatile_type: case dwarf::DW_TAG_restrict_type: @@ -399,11 +400,13 @@ bool DIType::Verify() const { unsigned Tag = getTag(); if (!isBasicType() && Tag != dwarf::DW_TAG_const_type && Tag != dwarf::DW_TAG_volatile_type && Tag != dwarf::DW_TAG_pointer_type && - Tag != dwarf::DW_TAG_reference_type && Tag != dwarf::DW_TAG_restrict_type - && Tag != dwarf::DW_TAG_vector_type && Tag != dwarf::DW_TAG_array_type - && Tag != dwarf::DW_TAG_enumeration_type - && Tag != dwarf::DW_TAG_subroutine_type - && getFilename().empty()) + Tag != dwarf::DW_TAG_reference_type && + Tag != dwarf::DW_TAG_rvalue_reference_type && + Tag != dwarf::DW_TAG_restrict_type && Tag != dwarf::DW_TAG_vector_type && + Tag != dwarf::DW_TAG_array_type && + Tag != dwarf::DW_TAG_enumeration_type && + Tag != dwarf::DW_TAG_subroutine_type && + getFilename().empty()) return false; return true; } @@ -500,27 +503,28 @@ bool DINameSpace::Verify() const { uint64_t DIDerivedType::getOriginalTypeSize() const { unsigned Tag = getTag(); - if (Tag == dwarf::DW_TAG_member || Tag == dwarf::DW_TAG_typedef || - Tag == dwarf::DW_TAG_const_type || Tag == dwarf::DW_TAG_volatile_type || - Tag == dwarf::DW_TAG_restrict_type) { - DIType BaseType = getTypeDerivedFrom(); - // If this type is not derived from any type then take conservative - // approach. - if (!BaseType.isValid()) - return getSizeInBits(); - // If this is a derived type, go ahead and get the base type, unless - // it's a reference then it's just the size of the field. Pointer types - // have no need of this since they're a different type of qualification - // on the type. 
- if (BaseType.getTag() == dwarf::DW_TAG_reference_type) - return getSizeInBits(); - else if (BaseType.isDerivedType()) - return DIDerivedType(BaseType).getOriginalTypeSize(); - else - return BaseType.getSizeInBits(); - } + if (Tag != dwarf::DW_TAG_member && Tag != dwarf::DW_TAG_typedef && + Tag != dwarf::DW_TAG_const_type && Tag != dwarf::DW_TAG_volatile_type && + Tag != dwarf::DW_TAG_restrict_type) + return getSizeInBits(); + + DIType BaseType = getTypeDerivedFrom(); + + // If this type is not derived from any type then take conservative approach. + if (!BaseType.isValid()) + return getSizeInBits(); + + // If this is a derived type, go ahead and get the base type, unless it's a + // reference then it's just the size of the field. Pointer types have no need + // of this since they're a different type of qualification on the type. + if (BaseType.getTag() == dwarf::DW_TAG_reference_type || + BaseType.getTag() == dwarf::DW_TAG_rvalue_reference_type) + return getSizeInBits(); + + if (BaseType.isDerivedType()) + return DIDerivedType(BaseType).getOriginalTypeSize(); - return getSizeInBits(); + return BaseType.getSizeInBits(); } /// getObjCProperty - Return property node, if this ivar is associated with one. @@ -538,7 +542,7 @@ bool DIVariable::isInlinedFnArgument(const Function *CurFn) { return false; // This variable is not inlined function argument if its scope // does not describe current function. - return !(DISubprogram(getContext()).describes(CurFn)); + return !DISubprogram(getContext()).describes(CurFn); } /// describes - Return true if this subprogram provides debugging @@ -660,257 +664,6 @@ DIArray DICompileUnit::getGlobalVariables() const { return DIArray(); } -//===----------------------------------------------------------------------===// -// DIDescriptor: vtable anchors for all descriptors. -//===----------------------------------------------------------------------===// - -void DIScope::anchor() { } - -void DICompileUnit::anchor() { } - -void DIFile::anchor() { } - -void DIType::anchor() { } - -void DIBasicType::anchor() { } - -void DIDerivedType::anchor() { } - -void DICompositeType::anchor() { } - -void DISubprogram::anchor() { } - -void DILexicalBlock::anchor() { } - -void DINameSpace::anchor() { } - -void DILexicalBlockFile::anchor() { } - -//===----------------------------------------------------------------------===// -// DIDescriptor: dump routines for all descriptors. -//===----------------------------------------------------------------------===// - - -/// print - Print descriptor. -void DIDescriptor::print(raw_ostream &OS) const { - OS << "[" << dwarf::TagString(getTag()) << "] "; - OS.write_hex((intptr_t) &*DbgNode) << ']'; -} - -/// print - Print compile unit. -void DICompileUnit::print(raw_ostream &OS) const { - if (getLanguage()) - OS << " [" << dwarf::LanguageString(getLanguage()) << "] "; - - OS << " [" << getDirectory() << "/" << getFilename() << "]"; -} - -/// print - Print type. 
-void DIType::print(raw_ostream &OS) const { - if (!DbgNode) return; - - StringRef Res = getName(); - if (!Res.empty()) - OS << " [" << Res << "] "; - - unsigned Tag = getTag(); - OS << " [" << dwarf::TagString(Tag) << "] "; - - // TODO : Print context - OS << " [" - << "line " << getLineNumber() << ", " - << getSizeInBits() << " bits, " - << getAlignInBits() << " bit alignment, " - << getOffsetInBits() << " bit offset" - << "] "; - - if (isPrivate()) - OS << " [private] "; - else if (isProtected()) - OS << " [protected] "; - - if (isForwardDecl()) - OS << " [fwd] "; - - if (isBasicType()) - DIBasicType(DbgNode).print(OS); - else if (isDerivedType()) { - DIDerivedType DTy = DIDerivedType(DbgNode); - DTy.print(OS); - DICompositeType CTy = getDICompositeType(DTy); - if (CTy.Verify()) - CTy.print(OS); - } - else if (isCompositeType()) - DICompositeType(DbgNode).print(OS); - else { - OS << "Invalid DIType\n"; - return; - } - - OS << "\n"; -} - -/// print - Print basic type. -void DIBasicType::print(raw_ostream &OS) const { - OS << " [" << dwarf::AttributeEncodingString(getEncoding()) << "] "; -} - -/// print - Print derived type. -void DIDerivedType::print(raw_ostream &OS) const { - OS << "\n\t Derived From: "; - getTypeDerivedFrom().print(OS); - OS << "\n\t"; -} - -/// print - Print composite type. -void DICompositeType::print(raw_ostream &OS) const { - DIArray A = getTypeArray(); - OS << " [" << A.getNumElements() << " elements]"; -} - -/// print - Print subprogram. -void DISubprogram::print(raw_ostream &OS) const { - StringRef Res = getName(); - if (!Res.empty()) - OS << " [" << Res << "] "; - - unsigned Tag = getTag(); - OS << " [" << dwarf::TagString(Tag) << "] "; - - // TODO : Print context - OS << " [" << getLineNumber() << "] "; - - if (isLocalToUnit()) - OS << " [local] "; - - if (isDefinition()) - OS << " [def] "; - - if (getScopeLineNumber() != getLineNumber()) - OS << " [Scope: " << getScopeLineNumber() << "] "; - - OS << "\n"; -} - -/// print - Print global variable. -void DIGlobalVariable::print(raw_ostream &OS) const { - OS << " ["; - StringRef Res = getName(); - if (!Res.empty()) - OS << " [" << Res << "] "; - - unsigned Tag = getTag(); - OS << " [" << dwarf::TagString(Tag) << "] "; - - // TODO : Print context - OS << " [" << getLineNumber() << "] "; - - if (isLocalToUnit()) - OS << " [local] "; - - if (isDefinition()) - OS << " [def] "; - - if (isGlobalVariable()) - DIGlobalVariable(DbgNode).print(OS); - OS << "]\n"; -} - -static void printDebugLoc(DebugLoc DL, raw_ostream &CommentOS, - const LLVMContext &Ctx) { - if (!DL.isUnknown()) { // Print source line info. - DIScope Scope(DL.getScope(Ctx)); - // Omit the directory, because it's likely to be long and uninteresting. 
- if (Scope.Verify()) - CommentOS << Scope.getFilename(); - else - CommentOS << "<unknown>"; - CommentOS << ':' << DL.getLine(); - if (DL.getCol() != 0) - CommentOS << ':' << DL.getCol(); - DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(DL.getInlinedAt(Ctx)); - if (!InlinedAtDL.isUnknown()) { - CommentOS << " @[ "; - printDebugLoc(InlinedAtDL, CommentOS, Ctx); - CommentOS << " ]"; - } - } -} - -void DIVariable::printExtendedName(raw_ostream &OS) const { - const LLVMContext &Ctx = DbgNode->getContext(); - StringRef Res = getName(); - if (!Res.empty()) - OS << Res << "," << getLineNumber(); - if (MDNode *InlinedAt = getInlinedAt()) { - DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(InlinedAt); - if (!InlinedAtDL.isUnknown()) { - OS << " @["; - printDebugLoc(InlinedAtDL, OS, Ctx); - OS << "]"; - } - } -} - -/// print - Print variable. -void DIVariable::print(raw_ostream &OS) const { - StringRef Res = getName(); - if (!Res.empty()) - OS << " [" << Res << "] "; - - OS << " [" << getLineNumber() << "] "; - getType().print(OS); - OS << "\n"; - - // FIXME: Dump complex addresses -} - -/// dump - Print descriptor to dbgs() with a newline. -void DIDescriptor::dump() const { - print(dbgs()); dbgs() << '\n'; -} - -/// dump - Print compile unit to dbgs() with a newline. -void DICompileUnit::dump() const { - print(dbgs()); dbgs() << '\n'; -} - -/// dump - Print type to dbgs() with a newline. -void DIType::dump() const { - print(dbgs()); dbgs() << '\n'; -} - -/// dump - Print basic type to dbgs() with a newline. -void DIBasicType::dump() const { - print(dbgs()); dbgs() << '\n'; -} - -/// dump - Print derived type to dbgs() with a newline. -void DIDerivedType::dump() const { - print(dbgs()); dbgs() << '\n'; -} - -/// dump - Print composite type to dbgs() with a newline. -void DICompositeType::dump() const { - print(dbgs()); dbgs() << '\n'; -} - -/// dump - Print subprogram to dbgs() with a newline. -void DISubprogram::dump() const { - print(dbgs()); dbgs() << '\n'; -} - -/// dump - Print global variable. -void DIGlobalVariable::dump() const { - print(dbgs()); dbgs() << '\n'; -} - -/// dump - Print variable. -void DIVariable::dump() const { - print(dbgs()); dbgs() << '\n'; -} - /// fixupObjcLikeName - Replace contains special characters used /// in a typical Objective-C names with '.' in a given string. static void fixupObjcLikeName(StringRef Str, SmallVectorImpl<char> &Out) { @@ -981,11 +734,50 @@ DIVariable llvm::cleanseInlinedVariable(MDNode *DV, LLVMContext &VMContext) { // Insert inlined scope as 7th element. for (unsigned i = 0, e = DV->getNumOperands(); i != e; ++i) i == 7 ? - Elts.push_back(llvm::Constant::getNullValue(Type::getInt32Ty(VMContext))): + Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext))): Elts.push_back(DV->getOperand(i)); return DIVariable(MDNode::get(VMContext, Elts)); } +/// getDISubprogram - Find subprogram that is enclosing this scope. +DISubprogram llvm::getDISubprogram(const MDNode *Scope) { + DIDescriptor D(Scope); + if (D.isSubprogram()) + return DISubprogram(Scope); + + if (D.isLexicalBlockFile()) + return getDISubprogram(DILexicalBlockFile(Scope).getContext()); + + if (D.isLexicalBlock()) + return getDISubprogram(DILexicalBlock(Scope).getContext()); + + return DISubprogram(); +} + +/// getDICompositeType - Find underlying composite type. 
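
getDISubprogram above walks a lexical-scope chain up to its owning subprogram. A short usage sketch (hypothetical; assumes a DebugLoc DL and an LLVMContext Ctx are in scope):

if (MDNode *Scope = DL.getScope(Ctx)) {
  DISubprogram SP = getDISubprogram(Scope);
  if (SP.Verify()) // A scope chain may not terminate in a subprogram.
    errs() << "scope belongs to '" << SP.getName() << "'\n";
}
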
+DICompositeType llvm::getDICompositeType(DIType T) { + if (T.isCompositeType()) + return DICompositeType(T); + + if (T.isDerivedType()) + return getDICompositeType(DIDerivedType(T).getTypeDerivedFrom()); + + return DICompositeType(); +} + +/// isSubprogramContext - Return true if Context is either a subprogram +/// or another context nested inside a subprogram. +bool llvm::isSubprogramContext(const MDNode *Context) { + if (!Context) + return false; + DIDescriptor D(Context); + if (D.isSubprogram()) + return true; + if (D.isType()) + return isSubprogramContext(DIType(Context).getContext()); + return false; +} + //===----------------------------------------------------------------------===// // DebugInfoFinder implementations. //===----------------------------------------------------------------------===// @@ -1188,42 +980,189 @@ bool DebugInfoFinder::addSubprogram(DISubprogram SP) { return true; } -/// getDISubprogram - Find subprogram that is enclosing this scope. -DISubprogram llvm::getDISubprogram(const MDNode *Scope) { - DIDescriptor D(Scope); - if (D.isSubprogram()) - return DISubprogram(Scope); +//===----------------------------------------------------------------------===// +// DIDescriptor: dump routines for all descriptors. +//===----------------------------------------------------------------------===// - if (D.isLexicalBlockFile()) - return getDISubprogram(DILexicalBlockFile(Scope).getContext()); - - if (D.isLexicalBlock()) - return getDISubprogram(DILexicalBlock(Scope).getContext()); +/// dump - Print descriptor to dbgs() with a newline. +void DIDescriptor::dump() const { + print(dbgs()); dbgs() << '\n'; +} - return DISubprogram(); +/// print - Print descriptor. +void DIDescriptor::print(raw_ostream &OS) const { + if (!DbgNode) return; + + if (const char *Tag = dwarf::TagString(getTag())) + OS << "[ " << Tag << " ]"; + + if (this->isSubrange()) { + DISubrange(DbgNode).printInternal(OS); + } else if (this->isCompileUnit()) { + DICompileUnit(DbgNode).printInternal(OS); + } else if (this->isFile()) { + DIFile(DbgNode).printInternal(OS); + } else if (this->isEnumerator()) { + DIEnumerator(DbgNode).printInternal(OS); + } else if (this->isBasicType()) { + DIType(DbgNode).printInternal(OS); + } else if (this->isDerivedType()) { + DIDerivedType(DbgNode).printInternal(OS); + } else if (this->isCompositeType()) { + DICompositeType(DbgNode).printInternal(OS); + } else if (this->isSubprogram()) { + DISubprogram(DbgNode).printInternal(OS); + } else if (this->isGlobalVariable()) { + DIGlobalVariable(DbgNode).printInternal(OS); + } else if (this->isVariable()) { + DIVariable(DbgNode).printInternal(OS); + } else if (this->isObjCProperty()) { + DIObjCProperty(DbgNode).printInternal(OS); + } else if (this->isScope()) { + DIScope(DbgNode).printInternal(OS); + } } -/// getDICompositeType - Find underlying composite type. 
-DICompositeType llvm::getDICompositeType(DIType T) { - if (T.isCompositeType()) - return DICompositeType(T); +void DISubrange::printInternal(raw_ostream &OS) const { + OS << " [" << getLo() << ", " << getHi() << ']'; +} - if (T.isDerivedType()) - return getDICompositeType(DIDerivedType(T).getTypeDerivedFrom()); +void DIScope::printInternal(raw_ostream &OS) const { + OS << " [" << getDirectory() << "/" << getFilename() << ']'; +} - return DICompositeType(); +void DICompileUnit::printInternal(raw_ostream &OS) const { + DIScope::printInternal(OS); + if (unsigned Lang = getLanguage()) + OS << " [" << dwarf::LanguageString(Lang) << ']'; } -/// isSubprogramContext - Return true if Context is either a subprogram -/// or another context nested inside a subprogram. -bool llvm::isSubprogramContext(const MDNode *Context) { - if (!Context) - return false; - DIDescriptor D(Context); - if (D.isSubprogram()) - return true; - if (D.isType()) - return isSubprogramContext(DIType(Context).getContext()); - return false; +void DIEnumerator::printInternal(raw_ostream &OS) const { + OS << " [" << getName() << " :: " << getEnumValue() << ']'; +} + +void DIType::printInternal(raw_ostream &OS) const { + if (!DbgNode) return; + + StringRef Res = getName(); + if (!Res.empty()) + OS << " [" << Res << "]"; + + // TODO: Print context? + + OS << " [line " << getLineNumber() + << ", size " << getSizeInBits() + << ", align " << getAlignInBits() + << ", offset " << getOffsetInBits(); + if (isBasicType()) + if (const char *Enc = + dwarf::AttributeEncodingString(DIBasicType(DbgNode).getEncoding())) + OS << ", enc " << Enc; + OS << "]"; + + if (isPrivate()) + OS << " [private]"; + else if (isProtected()) + OS << " [protected]"; + + if (isForwardDecl()) + OS << " [fwd]"; +} + +void DIDerivedType::printInternal(raw_ostream &OS) const { + DIType::printInternal(OS); + OS << " [from " << getTypeDerivedFrom().getName() << ']'; +} + +void DICompositeType::printInternal(raw_ostream &OS) const { + DIType::printInternal(OS); + DIArray A = getTypeArray(); + OS << " [" << A.getNumElements() << " elements]"; +} + +void DISubprogram::printInternal(raw_ostream &OS) const { + // TODO : Print context + OS << " [line " << getLineNumber() << ']'; + + if (isLocalToUnit()) + OS << " [local]"; + + if (isDefinition()) + OS << " [def]"; + + if (getScopeLineNumber() != getLineNumber()) + OS << " [scope " << getScopeLineNumber() << "]"; + + StringRef Res = getName(); + if (!Res.empty()) + OS << " [" << Res << ']'; +} + +void DIGlobalVariable::printInternal(raw_ostream &OS) const { + StringRef Res = getName(); + if (!Res.empty()) + OS << " [" << Res << ']'; + + OS << " [line " << getLineNumber() << ']'; + + // TODO : Print context + + if (isLocalToUnit()) + OS << " [local]"; + + if (isDefinition()) + OS << " [def]"; +} + +void DIVariable::printInternal(raw_ostream &OS) const { + StringRef Res = getName(); + if (!Res.empty()) + OS << " [" << Res << ']'; + + OS << " [line " << getLineNumber() << ']'; +} + +void DIObjCProperty::printInternal(raw_ostream &OS) const { + StringRef Name = getObjCPropertyName(); + if (!Name.empty()) + OS << " [" << Name << ']'; + + OS << " [line " << getLineNumber() + << ", properties " << getUnsignedField(6) << ']'; } +static void printDebugLoc(DebugLoc DL, raw_ostream &CommentOS, + const LLVMContext &Ctx) { + if (!DL.isUnknown()) { // Print source line info. + DIScope Scope(DL.getScope(Ctx)); + // Omit the directory, because it's likely to be long and uninteresting. 
+ if (Scope.Verify()) + CommentOS << Scope.getFilename(); + else + CommentOS << "<unknown>"; + CommentOS << ':' << DL.getLine(); + if (DL.getCol() != 0) + CommentOS << ':' << DL.getCol(); + DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(DL.getInlinedAt(Ctx)); + if (!InlinedAtDL.isUnknown()) { + CommentOS << " @[ "; + printDebugLoc(InlinedAtDL, CommentOS, Ctx); + CommentOS << " ]"; + } + } +} + +void DIVariable::printExtendedName(raw_ostream &OS) const { + const LLVMContext &Ctx = DbgNode->getContext(); + StringRef Res = getName(); + if (!Res.empty()) + OS << Res << "," << getLineNumber(); + if (MDNode *InlinedAt = getInlinedAt()) { + DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(InlinedAt); + if (!InlinedAtDL.isUnknown()) { + OS << " @["; + printDebugLoc(InlinedAtDL, OS, Ctx); + OS << "]"; + } + } +} diff --git a/lib/VMCore/DebugLoc.cpp b/lib/VMCore/DebugLoc.cpp index 9013d28..c6a3053 100644 --- a/lib/VMCore/DebugLoc.cpp +++ b/lib/VMCore/DebugLoc.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/DebugLoc.h" +#include "llvm/DebugInfo.h" #include "llvm/ADT/DenseMapInfo.h" #include "LLVMContextImpl.h" using namespace llvm; @@ -114,34 +115,19 @@ MDNode *DebugLoc::getAsMDNode(const LLVMContext &Ctx) const { /// getFromDILocation - Translate the DILocation quad into a DebugLoc. DebugLoc DebugLoc::getFromDILocation(MDNode *N) { - if (N == 0 || N->getNumOperands() != 4) return DebugLoc(); - - MDNode *Scope = dyn_cast_or_null<MDNode>(N->getOperand(2)); + DILocation Loc(N); + MDNode *Scope = Loc.getScope(); if (Scope == 0) return DebugLoc(); - - unsigned LineNo = 0, ColNo = 0; - if (ConstantInt *Line = dyn_cast_or_null<ConstantInt>(N->getOperand(0))) - LineNo = Line->getZExtValue(); - if (ConstantInt *Col = dyn_cast_or_null<ConstantInt>(N->getOperand(1))) - ColNo = Col->getZExtValue(); - - return get(LineNo, ColNo, Scope, dyn_cast_or_null<MDNode>(N->getOperand(3))); + return get(Loc.getLineNumber(), Loc.getColumnNumber(), Scope, + Loc.getOrigLocation()); } /// getFromDILexicalBlock - Translate the DILexicalBlock into a DebugLoc. 
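
The rewrite above routes the raw operand poking through the DILocation wrapper instead. A round-trip sketch (illustrative; ScopeNode is an assumed scope MDNode and Ctx the owning LLVMContext):

DebugLoc DL = DebugLoc::get(/*Line=*/42, /*Col=*/7, ScopeNode);
MDNode *Quad = DL.getAsMDNode(Ctx);          // the line/col/scope/origLoc quad
DebugLoc Back = DebugLoc::getFromDILocation(Quad);
assert(Back.getLine() == 42 && Back.getCol() == 7);
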
DebugLoc DebugLoc::getFromDILexicalBlock(MDNode *N) { - if (N == 0 || N->getNumOperands() < 3) return DebugLoc(); - - MDNode *Scope = dyn_cast_or_null<MDNode>(N->getOperand(1)); + DILexicalBlock LexBlock(N); + MDNode *Scope = LexBlock.getContext(); if (Scope == 0) return DebugLoc(); - - unsigned LineNo = 0, ColNo = 0; - if (ConstantInt *Line = dyn_cast_or_null<ConstantInt>(N->getOperand(2))) - LineNo = Line->getZExtValue(); - if (ConstantInt *Col = dyn_cast_or_null<ConstantInt>(N->getOperand(3))) - ColNo = Col->getZExtValue(); - - return get(LineNo, ColNo, Scope, NULL); + return get(LexBlock.getLineNumber(), LexBlock.getColumnNumber(), Scope, NULL); } void DebugLoc::dump(const LLVMContext &Ctx) const { @@ -164,22 +150,10 @@ void DebugLoc::dump(const LLVMContext &Ctx) const { // DenseMap specialization //===----------------------------------------------------------------------===// -DebugLoc DenseMapInfo<DebugLoc>::getEmptyKey() { - return DebugLoc::getEmptyKey(); -} - -DebugLoc DenseMapInfo<DebugLoc>::getTombstoneKey() { - return DebugLoc::getTombstoneKey(); -} - unsigned DenseMapInfo<DebugLoc>::getHashValue(const DebugLoc &Key) { return static_cast<unsigned>(hash_combine(Key.LineCol, Key.ScopeIdx)); } -bool DenseMapInfo<DebugLoc>::isEqual(const DebugLoc &LHS, const DebugLoc &RHS) { - return LHS == RHS; -} - //===----------------------------------------------------------------------===// // LLVMContextImpl Implementation //===----------------------------------------------------------------------===// diff --git a/lib/VMCore/Function.cpp b/lib/VMCore/Function.cpp index af6344e..2e0b316 100644 --- a/lib/VMCore/Function.cpp +++ b/lib/VMCore/Function.cpp @@ -29,7 +29,6 @@ #include "llvm/ADT/StringExtras.h" using namespace llvm; - // Explicit instantiations of SymbolTableListTraits since some of the methods // are not in the public header file... template class llvm::SymbolTableListTraits<Argument, Function>; @@ -358,17 +357,239 @@ std::string Intrinsic::getName(ID id, ArrayRef<Type*> Tys) { return Result; } -FunctionType *Intrinsic::getType(LLVMContext &Context, - ID id, ArrayRef<Type*> Tys) { - Type *ResultTy = NULL; - SmallVector<Type*, 8> ArgTys; - bool IsVarArg = false; + +/// IIT_Info - These are enumerators that describe the entries returned by the +/// getIntrinsicInfoTableEntries function. +/// +/// NOTE: This must be kept in synch with the copy in TblGen/IntrinsicEmitter! +enum IIT_Info { + // Common values should be encoded with 0-15. + IIT_Done = 0, + IIT_I1 = 1, + IIT_I8 = 2, + IIT_I16 = 3, + IIT_I32 = 4, + IIT_I64 = 5, + IIT_F32 = 6, + IIT_F64 = 7, + IIT_V2 = 8, + IIT_V4 = 9, + IIT_V8 = 10, + IIT_V16 = 11, + IIT_V32 = 12, + IIT_MMX = 13, + IIT_PTR = 14, + IIT_ARG = 15, -#define GET_INTRINSIC_GENERATOR + // Values from 16+ are only encodable with the inefficient encoding. 
+ IIT_METADATA = 16, + IIT_EMPTYSTRUCT = 17, + IIT_STRUCT2 = 18, + IIT_STRUCT3 = 19, + IIT_STRUCT4 = 20, + IIT_STRUCT5 = 21, + IIT_EXTEND_VEC_ARG = 22, + IIT_TRUNC_VEC_ARG = 23, + IIT_ANYPTR = 24 +}; + + +static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos, + SmallVectorImpl<Intrinsic::IITDescriptor> &OutputTable) { + IIT_Info Info = IIT_Info(Infos[NextElt++]); + unsigned StructElts = 2; + using namespace Intrinsic; + + switch (Info) { + case IIT_Done: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Void, 0)); + return; + case IIT_MMX: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::MMX, 0)); + return; + case IIT_METADATA: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Metadata, 0)); + return; + case IIT_F32: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Float, 0)); + return; + case IIT_F64: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Double, 0)); + return; + case IIT_I1: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 1)); + return; + case IIT_I8: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 8)); + return; + case IIT_I16: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer,16)); + return; + case IIT_I32: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 32)); + return; + case IIT_I64: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 64)); + return; + case IIT_V2: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 2)); + DecodeIITType(NextElt, Infos, OutputTable); + return; + case IIT_V4: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 4)); + DecodeIITType(NextElt, Infos, OutputTable); + return; + case IIT_V8: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 8)); + DecodeIITType(NextElt, Infos, OutputTable); + return; + case IIT_V16: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 16)); + DecodeIITType(NextElt, Infos, OutputTable); + return; + case IIT_V32: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 32)); + DecodeIITType(NextElt, Infos, OutputTable); + return; + case IIT_PTR: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Pointer, 0)); + DecodeIITType(NextElt, Infos, OutputTable); + return; + case IIT_ANYPTR: { // [ANYPTR addrspace, subtype] + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Pointer, + Infos[NextElt++])); + DecodeIITType(NextElt, Infos, OutputTable); + return; + } + case IIT_ARG: { + unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Argument, ArgInfo)); + return; + } + case IIT_EXTEND_VEC_ARG: { + unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); + OutputTable.push_back(IITDescriptor::get(IITDescriptor::ExtendVecArgument, + ArgInfo)); + return; + } + case IIT_TRUNC_VEC_ARG: { + unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); + OutputTable.push_back(IITDescriptor::get(IITDescriptor::TruncVecArgument, + ArgInfo)); + return; + } + case IIT_EMPTYSTRUCT: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Struct, 0)); + return; + case IIT_STRUCT5: ++StructElts; // FALL THROUGH. + case IIT_STRUCT4: ++StructElts; // FALL THROUGH. + case IIT_STRUCT3: ++StructElts; // FALL THROUGH. 
+ case IIT_STRUCT2: { + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Struct,StructElts)); + + for (unsigned i = 0; i != StructElts; ++i) + DecodeIITType(NextElt, Infos, OutputTable); + return; + } + } + llvm_unreachable("unhandled"); +} + + +#define GET_INTRINSIC_GENERATOR_GLOBAL #include "llvm/Intrinsics.gen" -#undef GET_INTRINSIC_GENERATOR +#undef GET_INTRINSIC_GENERATOR_GLOBAL + +void Intrinsic::getIntrinsicInfoTableEntries(ID id, + SmallVectorImpl<IITDescriptor> &T){ + // Check to see if the intrinsic's type was expressible by the table. + unsigned TableVal = IIT_Table[id-1]; + + // Decode the TableVal into an array of IITValues. + SmallVector<unsigned char, 8> IITValues; + ArrayRef<unsigned char> IITEntries; + unsigned NextElt = 0; + if ((TableVal >> 31) != 0) { + // This is an offset into the IIT_LongEncodingTable. + IITEntries = IIT_LongEncodingTable; + + // Strip sentinel bit. + NextElt = (TableVal << 1) >> 1; + } else { + // Decode the TableVal into an array of IITValues. If the entry was encoded + // into a single word in the table itself, decode it now. + do { + IITValues.push_back(TableVal & 0xF); + TableVal >>= 4; + } while (TableVal); + + IITEntries = IITValues; + NextElt = 0; + } - return FunctionType::get(ResultTy, ArgTys, IsVarArg); + // Okay, decode the table into the output vector of IITDescriptors. + DecodeIITType(NextElt, IITEntries, T); + while (NextElt != IITEntries.size() && IITEntries[NextElt] != 0) + DecodeIITType(NextElt, IITEntries, T); +} + + +static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos, + ArrayRef<Type*> Tys, LLVMContext &Context) { + using namespace Intrinsic; + IITDescriptor D = Infos.front(); + Infos = Infos.slice(1); + + switch (D.Kind) { + case IITDescriptor::Void: return Type::getVoidTy(Context); + case IITDescriptor::MMX: return Type::getX86_MMXTy(Context); + case IITDescriptor::Metadata: return Type::getMetadataTy(Context); + case IITDescriptor::Float: return Type::getFloatTy(Context); + case IITDescriptor::Double: return Type::getDoubleTy(Context); + + case IITDescriptor::Integer: + return IntegerType::get(Context, D.Integer_Width); + case IITDescriptor::Vector: + return VectorType::get(DecodeFixedType(Infos, Tys, Context),D.Vector_Width); + case IITDescriptor::Pointer: + return PointerType::get(DecodeFixedType(Infos, Tys, Context), + D.Pointer_AddressSpace); + case IITDescriptor::Struct: { + Type *Elts[5]; + assert(D.Struct_NumElements <= 5 && "Can't handle this yet"); + for (unsigned i = 0, e = D.Struct_NumElements; i != e; ++i) + Elts[i] = DecodeFixedType(Infos, Tys, Context); + return StructType::get(Context, ArrayRef<Type*>(Elts,D.Struct_NumElements)); + } + + case IITDescriptor::Argument: + return Tys[D.getArgumentNumber()]; + case IITDescriptor::ExtendVecArgument: + return VectorType::getExtendedElementVectorType(cast<VectorType>( + Tys[D.getArgumentNumber()])); + + case IITDescriptor::TruncVecArgument: + return VectorType::getTruncatedElementVectorType(cast<VectorType>( + Tys[D.getArgumentNumber()])); + } + llvm_unreachable("unhandled"); +} + + + +FunctionType *Intrinsic::getType(LLVMContext &Context, + ID id, ArrayRef<Type*> Tys) { + SmallVector<IITDescriptor, 8> Table; + getIntrinsicInfoTableEntries(id, Table); + + ArrayRef<IITDescriptor> TableRef = Table; + Type *ResultTy = DecodeFixedType(TableRef, Tys, Context); + + SmallVector<Type*, 8> ArgTys; + while (!TableRef.empty()) + ArgTys.push_back(DecodeFixedType(TableRef, Tys, Context)); + + return FunctionType::get(ResultTy, ArgTys, false); } bool 
Intrinsic::isOverloaded(ID id) { @@ -400,7 +621,8 @@ Function *Intrinsic::getDeclaration(Module *M, ID id, ArrayRef<Type*> Tys) { bool Function::hasAddressTaken(const User* *PutOffender) const { for (Value::const_use_iterator I = use_begin(), E = use_end(); I != E; ++I) { const User *U = *I; - // FIXME: Check for blockaddress, which does not take the address. + if (isa<BlockAddress>(U)) + continue; if (!isa<CallInst>(U) && !isa<InvokeInst>(U)) return PutOffender ? (*PutOffender = U, true) : true; ImmutableCallSite CS(cast<Instruction>(U)); @@ -439,4 +661,3 @@ bool Function::callsFunctionThatReturnsTwice() const { return false; } -// vim: sw=2 ai diff --git a/lib/VMCore/GCOV.cpp b/lib/VMCore/GCOV.cpp index 595c452..003a5d4 100644 --- a/lib/VMCore/GCOV.cpp +++ b/lib/VMCore/GCOV.cpp @@ -64,7 +64,7 @@ bool GCOVFile::read(GCOVBuffer &Buffer) { /// dump - Dump GCOVFile content on standard out for debugging purposes. void GCOVFile::dump() { for (SmallVector<GCOVFunction *, 16>::iterator I = Functions.begin(), - E = Functions.end(); I != E; ++I) + E = Functions.end(); I != E; ++I) (*I)->dump(); } @@ -72,7 +72,7 @@ void GCOVFile::dump() { /// reading .gcno and .gcda files. void GCOVFile::collectLineCounts(FileInfo &FI) { for (SmallVector<GCOVFunction *, 16>::iterator I = Functions.begin(), - E = Functions.end(); I != E; ++I) + E = Functions.end(); I != E; ++I) (*I)->collectLineCounts(FI); FI.print(); } @@ -143,7 +143,7 @@ bool GCOVFunction::read(GCOVBuffer &Buff, GCOVFormat Format) { StringRef Filename = Buff.readString(); if (Buff.getCursor() == (Size - 4)) break; while (uint32_t L = Buff.readInt()) - Block->addLine(Filename, L); + Block->addLine(Filename, L); } Buff.readInt(); // flag } @@ -154,7 +154,7 @@ bool GCOVFunction::read(GCOVBuffer &Buff, GCOVFormat Format) { void GCOVFunction::dump() { outs() << "===== " << Name << " @ " << Filename << ":" << LineNumber << "\n"; for (SmallVector<GCOVBlock *, 16>::iterator I = Blocks.begin(), - E = Blocks.end(); I != E; ++I) + E = Blocks.end(); I != E; ++I) (*I)->dump(); } @@ -162,7 +162,7 @@ void GCOVFunction::dump() { /// reading .gcno and .gcda files. void GCOVFunction::collectLineCounts(FileInfo &FI) { for (SmallVector<GCOVBlock *, 16>::iterator I = Blocks.begin(), - E = Blocks.end(); I != E; ++I) + E = Blocks.end(); I != E; ++I) (*I)->collectLineCounts(FI); } @@ -186,7 +186,7 @@ void GCOVBlock::addLine(StringRef Filename, uint32_t LineNo) { /// reading .gcno and .gcda files. void GCOVBlock::collectLineCounts(FileInfo &FI) { for (StringMap<GCOVLines *>::iterator I = Lines.begin(), - E = Lines.end(); I != E; ++I) + E = Lines.end(); I != E; ++I) I->second->collectLineCounts(FI, I->first(), Counter); } @@ -196,14 +196,14 @@ void GCOVBlock::dump() { if (!Edges.empty()) { outs() << "\tEdges : "; for (SmallVector<uint32_t, 16>::iterator I = Edges.begin(), E = Edges.end(); - I != E; ++I) + I != E; ++I) outs() << (*I) << ","; outs() << "\n"; } if (!Lines.empty()) { outs() << "\tLines : "; for (StringMap<GCOVLines *>::iterator LI = Lines.begin(), - LE = Lines.end(); LI != LE; ++LI) { + LE = Lines.end(); LI != LE; ++LI) { outs() << LI->first() << " -> "; LI->second->dump(); outs() << "\n"; @@ -217,16 +217,16 @@ void GCOVBlock::dump() { /// collectLineCounts - Collect line counts. This must be used after /// reading .gcno and .gcda files. 
void GCOVLines::collectLineCounts(FileInfo &FI, StringRef Filename, - uint32_t Count) { + uint32_t Count) { for (SmallVector<uint32_t, 16>::iterator I = Lines.begin(), - E = Lines.end(); I != E; ++I) + E = Lines.end(); I != E; ++I) FI.addLineCount(Filename, *I, Count); } /// dump - Dump GCOVLines content on standard out for debugging purposes. void GCOVLines::dump() { for (SmallVector<uint32_t, 16>::iterator I = Lines.begin(), - E = Lines.end(); I != E; ++I) + E = Lines.end(); I != E; ++I) outs() << (*I) << ","; } @@ -266,12 +266,12 @@ void FileInfo::print() { StringRef AllLines = Buff.take()->getBuffer(); for (unsigned i = 0, e = L.size(); i != e; ++i) { if (L[i]) - outs() << L[i] << ":\t"; + outs() << L[i] << ":\t"; else - outs() << " :\t"; + outs() << " :\t"; std::pair<StringRef, StringRef> P = AllLines.split('\n'); if (AllLines != P.first) - outs() << P.first; + outs() << P.first; outs() << "\n"; AllLines = P.second; } diff --git a/lib/VMCore/Globals.cpp b/lib/VMCore/Globals.cpp index 4254fb2..c428b88 100644 --- a/lib/VMCore/Globals.cpp +++ b/lib/VMCore/Globals.cpp @@ -82,12 +82,12 @@ bool GlobalValue::isDeclaration() const { GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link, Constant *InitVal, const Twine &Name, - bool ThreadLocal, unsigned AddressSpace) - : GlobalValue(PointerType::get(Ty, AddressSpace), + ThreadLocalMode TLMode, unsigned AddressSpace) + : GlobalValue(PointerType::get(Ty, AddressSpace), Value::GlobalVariableVal, OperandTraits<GlobalVariable>::op_begin(this), InitVal != 0, Link, Name), - isConstantGlobal(constant), isThreadLocalSymbol(ThreadLocal) { + isConstantGlobal(constant), threadLocalMode(TLMode) { if (InitVal) { assert(InitVal->getType() == Ty && "Initializer should be the same type as the GlobalVariable!"); @@ -100,13 +100,13 @@ GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link, GlobalVariable::GlobalVariable(Module &M, Type *Ty, bool constant, LinkageTypes Link, Constant *InitVal, const Twine &Name, - GlobalVariable *Before, bool ThreadLocal, + GlobalVariable *Before, ThreadLocalMode TLMode, unsigned AddressSpace) - : GlobalValue(PointerType::get(Ty, AddressSpace), + : GlobalValue(PointerType::get(Ty, AddressSpace), Value::GlobalVariableVal, OperandTraits<GlobalVariable>::op_begin(this), InitVal != 0, Link, Name), - isConstantGlobal(constant), isThreadLocalSymbol(ThreadLocal) { + isConstantGlobal(constant), threadLocalMode(TLMode) { if (InitVal) { assert(InitVal->getType() == Ty && "Initializer should be the same type as the GlobalVariable!"); diff --git a/lib/VMCore/IRBuilder.cpp b/lib/VMCore/IRBuilder.cpp index b459234..5c4e6d9 100644 --- a/lib/VMCore/IRBuilder.cpp +++ b/lib/VMCore/IRBuilder.cpp @@ -12,9 +12,9 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Support/IRBuilder.h" -#include "llvm/GlobalVariable.h" #include "llvm/Function.h" +#include "llvm/GlobalVariable.h" +#include "llvm/IRBuilder.h" #include "llvm/Intrinsics.h" #include "llvm/LLVMContext.h" using namespace llvm; @@ -28,7 +28,7 @@ Value *IRBuilderBase::CreateGlobalString(StringRef Str, const Twine &Name) { Module &M = *BB->getParent()->getParent(); GlobalVariable *GV = new GlobalVariable(M, StrConstant->getType(), true, GlobalValue::PrivateLinkage, - StrConstant, "", 0, false); + StrConstant); GV->setName(Name); GV->setUnnamedAddr(true); return GV; @@ -120,13 +120,13 @@ CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align, CallInst *IRBuilderBase::CreateLifetimeStart(Value 
*Ptr, ConstantInt *Size) { assert(isa<PointerType>(Ptr->getType()) && - "lifetime.start only applies to pointers."); + "lifetime.start only applies to pointers."); Ptr = getCastedInt8PtrValue(Ptr); if (!Size) Size = getInt64(-1); else assert(Size->getType() == getInt64Ty() && - "lifetime.start requires the size to be an i64"); + "lifetime.start requires the size to be an i64"); Value *Ops[] = { Size, Ptr }; Module *M = BB->getParent()->getParent(); Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_start); @@ -135,13 +135,13 @@ CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) { CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr, ConstantInt *Size) { assert(isa<PointerType>(Ptr->getType()) && - "lifetime.end only applies to pointers."); + "lifetime.end only applies to pointers."); Ptr = getCastedInt8PtrValue(Ptr); if (!Size) Size = getInt64(-1); else assert(Size->getType() == getInt64Ty() && - "lifetime.end requires the size to be an i64"); + "lifetime.end requires the size to be an i64"); Value *Ops[] = { Size, Ptr }; Module *M = BB->getParent()->getParent(); Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_end); diff --git a/lib/VMCore/Instruction.cpp b/lib/VMCore/Instruction.cpp index 5449714..66379a0 100644 --- a/lib/VMCore/Instruction.cpp +++ b/lib/VMCore/Instruction.cpp @@ -226,34 +226,52 @@ bool Instruction::isIdenticalToWhenDefined(const Instruction *I) const { RMWI->isVolatile() == cast<AtomicRMWInst>(I)->isVolatile() && RMWI->getOrdering() == cast<AtomicRMWInst>(I)->getOrdering() && RMWI->getSynchScope() == cast<AtomicRMWInst>(I)->getSynchScope(); - + if (const PHINode *thisPHI = dyn_cast<PHINode>(this)) { + const PHINode *otherPHI = cast<PHINode>(I); + for (unsigned i = 0, e = thisPHI->getNumOperands(); i != e; ++i) { + if (thisPHI->getIncomingBlock(i) != otherPHI->getIncomingBlock(i)) + return false; + } + return true; + } return true; } // isSameOperationAs // This should be kept in sync with isEquivalentOperation in // lib/Transforms/IPO/MergeFunctions.cpp. -bool Instruction::isSameOperationAs(const Instruction *I) const { +bool Instruction::isSameOperationAs(const Instruction *I, + unsigned flags) const { + bool IgnoreAlignment = flags & CompareIgnoringAlignment; + bool UseScalarTypes = flags & CompareUsingScalarTypes; + if (getOpcode() != I->getOpcode() || getNumOperands() != I->getNumOperands() || - getType() != I->getType()) + (UseScalarTypes ? + getType()->getScalarType() != I->getType()->getScalarType() : + getType() != I->getType())) return false; // We have two instructions of identical opcode and #operands. Check to see // if all operands are the same type for (unsigned i = 0, e = getNumOperands(); i != e; ++i) - if (getOperand(i)->getType() != I->getOperand(i)->getType()) + if (UseScalarTypes ? + getOperand(i)->getType()->getScalarType() != + I->getOperand(i)->getType()->getScalarType() : + getOperand(i)->getType() != I->getOperand(i)->getType()) return false; // Check special state that is a part of some instructions. 
   if (const LoadInst *LI = dyn_cast<LoadInst>(this))
     return LI->isVolatile() == cast<LoadInst>(I)->isVolatile() &&
-           LI->getAlignment() == cast<LoadInst>(I)->getAlignment() &&
+           (LI->getAlignment() == cast<LoadInst>(I)->getAlignment() ||
+            IgnoreAlignment) &&
            LI->getOrdering() == cast<LoadInst>(I)->getOrdering() &&
            LI->getSynchScope() == cast<LoadInst>(I)->getSynchScope();
   if (const StoreInst *SI = dyn_cast<StoreInst>(this))
     return SI->isVolatile() == cast<StoreInst>(I)->isVolatile() &&
-           SI->getAlignment() == cast<StoreInst>(I)->getAlignment() &&
+           (SI->getAlignment() == cast<StoreInst>(I)->getAlignment() ||
+            IgnoreAlignment) &&
            SI->getOrdering() == cast<StoreInst>(I)->getOrdering() &&
            SI->getSynchScope() == cast<StoreInst>(I)->getSynchScope();
   if (const CmpInst *CI = dyn_cast<CmpInst>(this))
@@ -388,6 +406,29 @@ bool Instruction::isCommutative(unsigned op) {
   }
 }
 
+/// isIdempotent - Return true if the instruction is idempotent:
+///
+/// Idempotent operators satisfy: x op x === x
+///
+/// In LLVM, the And and Or operators are idempotent.
+///
+bool Instruction::isIdempotent(unsigned Opcode) {
+  return Opcode == And || Opcode == Or;
+}
+
+/// isNilpotent - Return true if the instruction is nilpotent:
+///
+/// Nilpotent operators satisfy: x op x === Id,
+///
+/// where Id is the identity for the operator, i.e. a constant such that
+/// x op Id === x and Id op x === x for all x.
+///
+/// In LLVM, the Xor operator is nilpotent.
+///
+bool Instruction::isNilpotent(unsigned Opcode) {
+  return Opcode == Xor;
+}
+
 Instruction *Instruction::clone() const {
   Instruction *New = clone_impl();
   New->SubclassOptionalData = SubclassOptionalData;
diff --git a/lib/VMCore/Instructions.cpp b/lib/VMCore/Instructions.cpp
index 6c5db32..9af98e8 100644
--- a/lib/VMCore/Instructions.cpp
+++ b/lib/VMCore/Instructions.cpp
@@ -161,8 +161,14 @@ Value *PHINode::hasConstantValue() const {
   // Exploit the fact that phi nodes always have at least one entry.
   Value *ConstantValue = getIncomingValue(0);
   for (unsigned i = 1, e = getNumIncomingValues(); i != e; ++i)
-    if (getIncomingValue(i) != ConstantValue)
-      return 0; // Incoming values not all the same.
+    if (getIncomingValue(i) != ConstantValue && getIncomingValue(i) != this) {
+      if (ConstantValue != this)
+        return 0; // Incoming values not all the same.
+      // The case where the first value is this PHI.
+      ConstantValue = getIncomingValue(i);
+    }
+  if (ConstantValue == this)
+    return UndefValue::get(getType());
   return ConstantValue;
 }
 
@@ -3158,6 +3164,7 @@ SwitchInst::SwitchInst(const SwitchInst &SI)
     OL[i] = InOL[i];
     OL[i+1] = InOL[i+1];
   }
+  TheSubsets = SI.TheSubsets;
   SubclassOptionalData = SI.SubclassOptionalData;
 }
 
@@ -3169,6 +3176,16 @@ SwitchInst::~SwitchInst() {
 /// addCase - Add an entry to the switch instruction...
 ///
 void SwitchInst::addCase(ConstantInt *OnVal, BasicBlock *Dest) {
+  IntegersSubsetToBB Mapping;
+
+  // FIXME: Currently we work with ConstantInt based cases.
+  // So initialize IntItem container directly from ConstantInt.
+  Mapping.add(IntItem::fromConstantInt(OnVal));
+  IntegersSubset CaseRanges = Mapping.getCase();
+  addCase(CaseRanges, Dest);
+}
+
+void SwitchInst::addCase(IntegersSubset& OnVal, BasicBlock *Dest) {
   unsigned NewCaseIdx = getNumCases();
   unsigned OpNo = NumOperands;
   if (OpNo+2 > ReservedSpace)
@@ -3176,14 +3193,17 @@ void SwitchInst::addCase(ConstantInt *OnVal, BasicBlock *Dest) {
   // Initialize some new operands.
assert(OpNo+1 < ReservedSpace && "Growing didn't work!"); NumOperands = OpNo+2; - CaseIt Case(this, NewCaseIdx); - Case.setValue(OnVal); + + SubsetsIt TheSubsetsIt = TheSubsets.insert(TheSubsets.end(), OnVal); + + CaseIt Case(this, NewCaseIdx, TheSubsetsIt); + Case.updateCaseValueOperand(OnVal); Case.setSuccessor(Dest); } /// removeCase - This method removes the specified case and its successor /// from the switch instruction. -void SwitchInst::removeCase(CaseIt i) { +void SwitchInst::removeCase(CaseIt& i) { unsigned idx = i.getCaseIndex(); assert(2 + idx*2 < getNumOperands() && "Case index out of range!!!"); @@ -3200,6 +3220,16 @@ void SwitchInst::removeCase(CaseIt i) { // Nuke the last value. OL[NumOps-2].set(0); OL[NumOps-2+1].set(0); + + // Do the same with TheCases collection: + if (i.SubsetIt != --TheSubsets.end()) { + *i.SubsetIt = TheSubsets.back(); + TheSubsets.pop_back(); + } else { + TheSubsets.pop_back(); + i.SubsetIt = TheSubsets.end(); + } + NumOperands = NumOps-2; } diff --git a/lib/VMCore/Metadata.cpp b/lib/VMCore/Metadata.cpp index 090b09a..ede4626 100644 --- a/lib/VMCore/Metadata.cpp +++ b/lib/VMCore/Metadata.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/STLExtras.h" #include "SymbolTableListTraitsImpl.h" +#include "llvm/Support/ConstantRange.h" #include "llvm/Support/LeakDetector.h" #include "llvm/Support/ValueHandle.h" using namespace llvm; @@ -66,7 +67,11 @@ public: MDNodeOperand(Value *V) : CallbackVH(V) {} ~MDNodeOperand() {} - void set(Value *V) { this->setValPtr(V); } + void set(Value *V) { + unsigned IsFirst = this->getValPtrInt(); + this->setValPtr(V); + this->setAsFirstOperand(IsFirst); + } /// setAsFirstOperand - Accessor method to mark the operand as the first in /// the list. @@ -95,7 +100,7 @@ void MDNodeOperand::allUsesReplacedWith(Value *NV) { static MDNodeOperand *getOperandPtr(MDNode *N, unsigned Op) { // Use <= instead of < to permit a one-past-the-end address. assert(Op <= N->getNumOperands() && "Invalid operand number"); - return reinterpret_cast<MDNodeOperand*>(N+1)+Op; + return reinterpret_cast<MDNodeOperand*>(N + 1) + Op; } void MDNode::replaceOperandWith(unsigned i, Value *Val) { @@ -122,7 +127,6 @@ MDNode::MDNode(LLVMContext &C, ArrayRef<Value*> Vals, bool isFunctionLocal) } } - /// ~MDNode - Destroy MDNode. MDNode::~MDNode() { assert((getSubclassDataFromValue() & DestroyFlag) != 0 && @@ -247,7 +251,7 @@ MDNode *MDNode::getMDNode(LLVMContext &Context, ArrayRef<Value*> Vals, } // Coallocate space for the node and Operands together, then placement new. - void *Ptr = malloc(sizeof(MDNode)+Vals.size()*sizeof(MDNodeOperand)); + void *Ptr = malloc(sizeof(MDNode) + Vals.size() * sizeof(MDNodeOperand)); N = new (Ptr) MDNode(Context, Vals, isFunctionLocal); // Cache the operand hash. 
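
[Editorial aside] The Metadata.cpp hunks above keep MDNode's operand array in the same malloc block as the node itself: one allocation of sizeof(MDNode) plus the operand array, placement-new for the header, and operand i found at reinterpret_cast<MDNodeOperand*>(N + 1) + i. Here is a minimal self-contained sketch of that co-allocation pattern, with a toy Node in place of MDNode and plain ints in place of operands; the real code must also guarantee the trailing array's alignment, which holds in this sketch because int is no more strictly aligned than the header.

#include <cstdio>
#include <cstdlib>
#include <new>

struct Node {
  unsigned NumOps;
  explicit Node(unsigned N) : NumOps(N) {}
  // The operand array lives immediately after the node header.
  int *op_begin() { return reinterpret_cast<int *>(this + 1); }
};

int main() {
  unsigned NumOps = 3;
  // Co-allocate space for the node and operands together, then placement-new.
  void *Mem = std::malloc(sizeof(Node) + NumOps * sizeof(int));
  Node *N = new (Mem) Node(NumOps);
  for (unsigned i = 0; i != NumOps; ++i)
    N->op_begin()[i] = int(i * 10);
  for (unsigned i = 0; i != NumOps; ++i)
    std::printf("op %u = %d\n", i, N->op_begin()[i]);
  N->~Node();           // destroy in place, then free the single block
  std::free(Mem);
}
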
@@ -275,7 +279,7 @@ MDNode *MDNode::getIfExists(LLVMContext &Context, ArrayRef<Value*> Vals) {
 
 MDNode *MDNode::getTemporary(LLVMContext &Context, ArrayRef<Value*> Vals) {
   MDNode *N =
-    (MDNode *)malloc(sizeof(MDNode)+Vals.size()*sizeof(MDNodeOperand));
+    (MDNode *)malloc(sizeof(MDNode) + Vals.size() * sizeof(MDNodeOperand));
   N = new (N) MDNode(Context, Vals, FL_No);
   N->setValueSubclassData(N->getSubclassDataFromValue() |
                           NotUniquedBit);
@@ -398,6 +402,155 @@ void MDNode::replaceOperand(MDNodeOperand *Op, Value *To) {
   }
 }
 
+MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
+  if (!A || !B)
+    return NULL;
+
+  if (A == B)
+    return A;
+
+  SmallVector<MDNode *, 4> PathA;
+  MDNode *T = A;
+  while (T) {
+    PathA.push_back(T);
+    T = T->getNumOperands() >= 2 ? cast_or_null<MDNode>(T->getOperand(1)) : 0;
+  }
+
+  SmallVector<MDNode *, 4> PathB;
+  T = B;
+  while (T) {
+    PathB.push_back(T);
+    T = T->getNumOperands() >= 2 ? cast_or_null<MDNode>(T->getOperand(1)) : 0;
+  }
+
+  int IA = PathA.size() - 1;
+  int IB = PathB.size() - 1;
+
+  MDNode *Ret = 0;
+  while (IA >= 0 && IB >= 0) {
+    if (PathA[IA] == PathB[IB])
+      Ret = PathA[IA];
+    else
+      break;
+    --IA;
+    --IB;
+  }
+  return Ret;
+}
+
+MDNode *MDNode::getMostGenericFPMath(MDNode *A, MDNode *B) {
+  if (!A || !B)
+    return NULL;
+
+  APFloat AVal = cast<ConstantFP>(A->getOperand(0))->getValueAPF();
+  APFloat BVal = cast<ConstantFP>(B->getOperand(0))->getValueAPF();
+  if (AVal.compare(BVal) == APFloat::cmpLessThan)
+    return A;
+  return B;
+}
+
+static bool isContiguous(const ConstantRange &A, const ConstantRange &B) {
+  return A.getUpper() == B.getLower() || A.getLower() == B.getUpper();
+}
+
+static bool canBeMerged(const ConstantRange &A, const ConstantRange &B) {
+  return !A.intersectWith(B).isEmptySet() || isContiguous(A, B);
+}
+
+static bool tryMergeRange(SmallVector<Value*, 4> &EndPoints, ConstantInt *Low,
+                          ConstantInt *High) {
+  ConstantRange NewRange(Low->getValue(), High->getValue());
+  unsigned Size = EndPoints.size();
+  APInt LB = cast<ConstantInt>(EndPoints[Size - 2])->getValue();
+  APInt LE = cast<ConstantInt>(EndPoints[Size - 1])->getValue();
+  ConstantRange LastRange(LB, LE);
+  if (canBeMerged(NewRange, LastRange)) {
+    ConstantRange Union = LastRange.unionWith(NewRange);
+    Type *Ty = High->getType();
+    EndPoints[Size - 2] = ConstantInt::get(Ty, Union.getLower());
+    EndPoints[Size - 1] = ConstantInt::get(Ty, Union.getUpper());
+    return true;
+  }
+  return false;
+}
+
+static void addRange(SmallVector<Value*, 4> &EndPoints, ConstantInt *Low,
+                     ConstantInt *High) {
+  if (!EndPoints.empty())
+    if (tryMergeRange(EndPoints, Low, High))
+      return;
+
+  EndPoints.push_back(Low);
+  EndPoints.push_back(High);
+}
+
+MDNode *MDNode::getMostGenericRange(MDNode *A, MDNode *B) {
+  // Given two ranges, we want to compute the union of the ranges. This
+  // is slightly complicated by having to combine the intervals and merge
+  // the ones that overlap.
+
+  if (!A || !B)
+    return NULL;
+
+  if (A == B)
+    return A;
+
+  // First, walk both lists in order of the lower boundary of each interval.
+  // At each step, try to merge the new interval to the last one we added.
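
[Editorial aside] The loop that follows merges the two operand lists the way a merge sort would, always consuming the interval with the smaller lower bound and folding it into the last interval emitted when the two overlap or touch. The same idea, reduced to plain signed ints over half-open [lo, hi) intervals, is sketched below; there is no APInt and no wrap-around range handling here, and Range and the local addRange are names invented for the sketch.

#include <algorithm>
#include <cstdio>
#include <vector>

struct Range { int Lo, Hi; }; // half-open [Lo, Hi)

static void addRange(std::vector<Range> &Out, Range R) {
  if (!Out.empty()) {
    Range &Last = Out.back();
    // Overlapping or contiguous with the last interval emitted: merge.
    if (R.Lo <= Last.Hi && Last.Lo <= R.Hi) {
      Last.Lo = std::min(Last.Lo, R.Lo);
      Last.Hi = std::max(Last.Hi, R.Hi);
      return;
    }
  }
  Out.push_back(R);
}

int main() {
  std::vector<Range> A = {{0, 4}, {10, 12}};
  std::vector<Range> B = {{4, 6}, {11, 15}};
  std::vector<Range> Out;
  size_t AI = 0, BI = 0;
  while (AI < A.size() && BI < B.size())   // consume the smaller lower bound
    addRange(Out, A[AI].Lo < B[BI].Lo ? A[AI++] : B[BI++]);
  while (AI < A.size()) addRange(Out, A[AI++]);
  while (BI < B.size()) addRange(Out, B[BI++]);
  for (auto R : Out)
    std::printf("[%d, %d)\n", R.Lo, R.Hi);  // prints [0, 6) and [10, 15)
}
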
+ SmallVector<Value*, 4> EndPoints; + int AI = 0; + int BI = 0; + int AN = A->getNumOperands() / 2; + int BN = B->getNumOperands() / 2; + while (AI < AN && BI < BN) { + ConstantInt *ALow = cast<ConstantInt>(A->getOperand(2 * AI)); + ConstantInt *BLow = cast<ConstantInt>(B->getOperand(2 * BI)); + + if (ALow->getValue().slt(BLow->getValue())) { + addRange(EndPoints, ALow, cast<ConstantInt>(A->getOperand(2 * AI + 1))); + ++AI; + } else { + addRange(EndPoints, BLow, cast<ConstantInt>(B->getOperand(2 * BI + 1))); + ++BI; + } + } + while (AI < AN) { + addRange(EndPoints, cast<ConstantInt>(A->getOperand(2 * AI)), + cast<ConstantInt>(A->getOperand(2 * AI + 1))); + ++AI; + } + while (BI < BN) { + addRange(EndPoints, cast<ConstantInt>(B->getOperand(2 * BI)), + cast<ConstantInt>(B->getOperand(2 * BI + 1))); + ++BI; + } + + // If we have more than 2 ranges (4 endpoints) we have to try to merge + // the last and first ones. + unsigned Size = EndPoints.size(); + if (Size > 4) { + ConstantInt *FB = cast<ConstantInt>(EndPoints[0]); + ConstantInt *FE = cast<ConstantInt>(EndPoints[1]); + if (tryMergeRange(EndPoints, FB, FE)) { + for (unsigned i = 0; i < Size - 2; ++i) { + EndPoints[i] = EndPoints[i + 2]; + } + EndPoints.resize(Size - 2); + } + } + + // If in the end we have a single range, it is possible that it is now the + // full range. Just drop the metadata in that case. + if (EndPoints.size() == 2) { + ConstantRange Range(cast<ConstantInt>(EndPoints[0])->getValue(), + cast<ConstantInt>(EndPoints[1])->getValue()); + if (Range.isFullSet()) + return NULL; + } + + return MDNode::get(A->getContext(), EndPoints); +} + //===----------------------------------------------------------------------===// // NamedMDNode implementation. // diff --git a/lib/VMCore/Module.cpp b/lib/VMCore/Module.cpp index 3c67191..8ea3665 100644 --- a/lib/VMCore/Module.cpp +++ b/lib/VMCore/Module.cpp @@ -65,20 +65,20 @@ Module::~Module() { Module::Endianness Module::getEndianness() const { StringRef temp = DataLayout; Module::Endianness ret = AnyEndianness; - + while (!temp.empty()) { std::pair<StringRef, StringRef> P = getToken(temp, "-"); - + StringRef token = P.first; temp = P.second; - + if (token[0] == 'e') { ret = LittleEndian; } else if (token[0] == 'E') { ret = BigEndian; } } - + return ret; } @@ -86,13 +86,13 @@ Module::Endianness Module::getEndianness() const { Module::PointerSize Module::getPointerSize() const { StringRef temp = DataLayout; Module::PointerSize ret = AnyPointerSize; - + while (!temp.empty()) { std::pair<StringRef, StringRef> TmpP = getToken(temp, "-"); temp = TmpP.second; TmpP = getToken(TmpP.first, ":"); StringRef token = TmpP.second, signalToken = TmpP.first; - + if (signalToken[0] == 'p') { int size = 0; getToken(token, ":").first.getAsInteger(10, size); @@ -102,7 +102,7 @@ Module::PointerSize Module::getPointerSize() const { ret = Pointer64; } } - + return ret; } @@ -164,9 +164,9 @@ Constant *Module::getOrInsertFunction(StringRef Name, // right type. if (F->getType() != PointerType::getUnqual(Ty)) return ConstantExpr::getBitCast(F, PointerType::getUnqual(Ty)); - + // Otherwise, we just found the existing function or a prototype. - return F; + return F; } Constant *Module::getOrInsertTargetIntrinsic(StringRef Name, @@ -183,13 +183,12 @@ Constant *Module::getOrInsertTargetIntrinsic(StringRef Name, } // Otherwise, we just found the existing function or a prototype. 
- return F; + return F; } Constant *Module::getOrInsertFunction(StringRef Name, FunctionType *Ty) { - AttrListPtr AttributeList = AttrListPtr::get((AttributeWithIndex *)0, 0); - return getOrInsertFunction(Name, Ty, AttributeList); + return getOrInsertFunction(Name, Ty, AttrListPtr()); } // getOrInsertFunction - Look up the specified function in the module symbol @@ -229,9 +228,9 @@ Constant *Module::getOrInsertFunction(StringRef Name, va_end(Args); // Build the function type and chain to the other getOrInsertFunction... - return getOrInsertFunction(Name, + return getOrInsertFunction(Name, FunctionType::get(RetTy, ArgTys, false), - AttrListPtr::get((AttributeWithIndex *)0, 0)); + AttrListPtr()); } // getFunction - Look up the specified function in the module symbol table. @@ -254,7 +253,7 @@ Function *Module::getFunction(StringRef Name) const { /// GlobalVariable *Module::getGlobalVariable(StringRef Name, bool AllowLocal) const { - if (GlobalVariable *Result = + if (GlobalVariable *Result = dyn_cast_or_null<GlobalVariable>(getNamedValue(Name))) if (AllowLocal || !Result->hasLocalLinkage()) return Result; @@ -282,7 +281,7 @@ Constant *Module::getOrInsertGlobal(StringRef Name, Type *Ty) { // right type. if (GV->getType() != PointerType::getUnqual(Ty)) return ConstantExpr::getBitCast(GV, PointerType::getUnqual(Ty)); - + // Otherwise, we just found the existing function or a prototype. return GV; } @@ -299,7 +298,7 @@ GlobalAlias *Module::getNamedAlias(StringRef Name) const { } /// getNamedMetadata - Return the first NamedMDNode in the module with the -/// specified name. This method returns null if a NamedMDNode with the +/// specified name. This method returns null if a NamedMDNode with the /// specified name is not found. NamedMDNode *Module::getNamedMetadata(const Twine &Name) const { SmallString<256> NameData; @@ -307,8 +306,8 @@ NamedMDNode *Module::getNamedMetadata(const Twine &Name) const { return static_cast<StringMap<NamedMDNode*> *>(NamedMDSymTab)->lookup(NameRef); } -/// getOrInsertNamedMetadata - Return the first named MDNode in the module -/// with the specified name. This method returns a new NamedMDNode if a +/// getOrInsertNamedMetadata - Return the first named MDNode in the module +/// with the specified name. This method returns a new NamedMDNode if a /// NamedMDNode with the specified name is not found. NamedMDNode *Module::getOrInsertNamedMetadata(StringRef Name) { NamedMDNode *&NMD = @@ -481,12 +480,13 @@ namespace { // objects, we keep several helper maps. DenseSet<const Value*> VisitedConstants; DenseSet<Type*> VisitedTypes; - + std::vector<StructType*> &StructTypes; + bool OnlyNamed; public: - TypeFinder(std::vector<StructType*> &structTypes) - : StructTypes(structTypes) {} - + TypeFinder(std::vector<StructType*> &structTypes, bool onlyNamed) + : StructTypes(structTypes), OnlyNamed(onlyNamed) {} + void run(const Module &M) { // Get types from global variables. for (Module::const_global_iterator I = M.global_begin(), @@ -495,7 +495,7 @@ namespace { if (I->hasInitializer()) incorporateValue(I->getInitializer()); } - + // Get types from aliases. for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E; ++I) { @@ -503,24 +503,32 @@ namespace { if (const Value *Aliasee = I->getAliasee()) incorporateValue(Aliasee); } - - SmallVector<std::pair<unsigned, MDNode*>, 4> MDForInst; // Get types from functions. 
+ SmallVector<std::pair<unsigned, MDNode*>, 4> MDForInst; for (Module::const_iterator FI = M.begin(), E = M.end(); FI != E; ++FI) { incorporateType(FI->getType()); - + + // First incorporate the arguments. + for (Function::const_arg_iterator AI = FI->arg_begin(), + AE = FI->arg_end(); AI != AE; ++AI) + incorporateValue(AI); + for (Function::const_iterator BB = FI->begin(), E = FI->end(); BB != E;++BB) for (BasicBlock::const_iterator II = BB->begin(), E = BB->end(); II != E; ++II) { const Instruction &I = *II; - // Incorporate the type of the instruction and all its operands. + // Incorporate the type of the instruction. incorporateType(I.getType()); + + // Incorporate non-instruction operand types. (We are incorporating + // all instructions with this loop.) for (User::const_op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI) - incorporateValue(*OI); - + if (!isa<Instruction>(OI)) + incorporateValue(*OI); + // Incorporate types hiding in metadata. I.getAllMetadataOtherThanDebugLoc(MDForInst); for (unsigned i = 0, e = MDForInst.size(); i != e; ++i) @@ -528,7 +536,7 @@ namespace { MDForInst.clear(); } } - + for (Module::const_named_metadata_iterator I = M.named_metadata_begin(), E = M.named_metadata_end(); I != E; ++I) { const NamedMDNode *NMD = I; @@ -536,23 +544,24 @@ namespace { incorporateMDNode(NMD->getOperand(i)); } } - + private: void incorporateType(Type *Ty) { // Check to see if we're already visited this type. if (!VisitedTypes.insert(Ty).second) return; - + // If this is a structure or opaque type, add a name for the type. if (StructType *STy = dyn_cast<StructType>(Ty)) - StructTypes.push_back(STy); - + if (!OnlyNamed || STy->hasName()) + StructTypes.push_back(STy); + // Recursively walk all contained types. for (Type::subtype_iterator I = Ty->subtype_begin(), E = Ty->subtype_end(); I != E; ++I) incorporateType(*I); } - + /// incorporateValue - This method is used to walk operand lists finding /// types hiding in constant expressions and other operands that won't be /// walked in other ways. GlobalValues, basic blocks, instructions, and @@ -561,27 +570,31 @@ namespace { if (const MDNode *M = dyn_cast<MDNode>(V)) return incorporateMDNode(M); if (!isa<Constant>(V) || isa<GlobalValue>(V)) return; - + // Already visited? if (!VisitedConstants.insert(V).second) return; - + // Check this type. incorporateType(V->getType()); - + + // If this is an instruction, we incorporate it separately. + if (isa<Instruction>(V)) + return; + // Look in operands for types. const User *U = cast<User>(V); for (Constant::const_op_iterator I = U->op_begin(), E = U->op_end(); I != E;++I) incorporateValue(*I); } - + void incorporateMDNode(const MDNode *V) { - + // Already visited? if (!VisitedConstants.insert(V).second) return; - + // Look in operands for types. for (unsigned i = 0, e = V->getNumOperands(); i != e; ++i) if (Value *Op = V->getOperand(i)) @@ -590,6 +603,7 @@ namespace { }; } // end anonymous namespace -void Module::findUsedStructTypes(std::vector<StructType*> &StructTypes) const { - TypeFinder(StructTypes).run(*this); +void Module::findUsedStructTypes(std::vector<StructType*> &StructTypes, + bool OnlyNamed) const { + TypeFinder(StructTypes, OnlyNamed).run(*this); } diff --git a/lib/VMCore/PassManager.cpp b/lib/VMCore/PassManager.cpp index 28fbaa6..4530c04 100644 --- a/lib/VMCore/PassManager.cpp +++ b/lib/VMCore/PassManager.cpp @@ -478,8 +478,7 @@ PMTopLevelManager::PMTopLevelManager(PMDataManager *PMDM) { /// Set pass P as the last user of the given analysis passes. 
void -PMTopLevelManager::setLastUser(const SmallVectorImpl<Pass *> &AnalysisPasses, - Pass *P) { +PMTopLevelManager::setLastUser(ArrayRef<Pass*> AnalysisPasses, Pass *P) { unsigned PDepth = 0; if (P->getResolver()) PDepth = P->getResolver()->getPMDataManager().getDepth(); @@ -594,6 +593,26 @@ void PMTopLevelManager::schedulePass(Pass *P) { Pass *AnalysisPass = findAnalysisPass(*I); if (!AnalysisPass) { const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I); + + if (PI == NULL) { + // Pass P is not in the global PassRegistry + dbgs() << "Pass '" << P->getPassName() << "' is not initialized." << "\n"; + dbgs() << "Verify if there is a pass dependency cycle." << "\n"; + dbgs() << "Required Passes:" << "\n"; + for (AnalysisUsage::VectorType::const_iterator I2 = RequiredSet.begin(), + E = RequiredSet.end(); I2 != E && I2 != I; ++I2) { + Pass *AnalysisPass2 = findAnalysisPass(*I2); + if (AnalysisPass2) { + dbgs() << "\t" << AnalysisPass2->getPassName() << "\n"; + } + else { + dbgs() << "\t" << "Error: Required pass not found! Possible causes:" << "\n"; + dbgs() << "\t\t" << "- Pass misconfiguration (e.g.: missing macros)" << "\n"; + dbgs() << "\t\t" << "- Corruption of the global PassRegistry" << "\n"; + } + } + } + assert(PI && "Expected required passes to be initialized"); AnalysisPass = PI->createPass(); if (P->getPotentialPassManagerType () == diff --git a/lib/VMCore/Value.cpp b/lib/VMCore/Value.cpp index 4006b2c..d871108 100644 --- a/lib/VMCore/Value.cpp +++ b/lib/VMCore/Value.cpp @@ -686,6 +686,9 @@ void ValueHandleBase::ValueIsRAUWd(Value *Old, Value *New) { #endif } -/// ~CallbackVH. Empty, but defined here to avoid emitting the vtable -/// more than once. -CallbackVH::~CallbackVH() {} +// Default implementation for CallbackVH. +void CallbackVH::allUsesReplacedWith(Value *) {} + +void CallbackVH::deleted() { + setValPtr(NULL); +} diff --git a/lib/VMCore/Verifier.cpp b/lib/VMCore/Verifier.cpp index 47baef3..5d51f41 100644 --- a/lib/VMCore/Verifier.cpp +++ b/lib/VMCore/Verifier.cpp @@ -68,6 +68,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Support/ConstantRange.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> @@ -293,8 +294,9 @@ namespace { void VerifyCallSite(CallSite CS); bool PerformTypeCheck(Intrinsic::ID ID, Function *F, Type *Ty, int VT, unsigned ArgNo, std::string &Suffix); - void VerifyIntrinsicPrototype(Intrinsic::ID ID, Function *F, - unsigned RetNum, unsigned ParamNum, ...); + bool VerifyIntrinsicType(Type *Ty, + ArrayRef<Intrinsic::IITDescriptor> &Infos, + SmallVectorImpl<Type*> &ArgTys); void VerifyParameterAttrs(Attributes Attrs, Type *Ty, bool isReturnValue, const Value *V); void VerifyFunctionAttrs(FunctionType *FT, const AttrListPtr &Attrs, @@ -804,14 +806,29 @@ void Verifier::visitSwitchInst(SwitchInst &SI) { // Check to make sure that all of the constants in the switch instruction // have the same type as the switched-on value. 
Type *SwitchTy = SI.getCondition()->getType(); - SmallPtrSet<ConstantInt*, 32> Constants; + IntegerType *IntTy = cast<IntegerType>(SwitchTy); + IntegersSubsetToBB Mapping; + std::map<IntegersSubset::Range, unsigned> RangeSetMap; for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i) { - Assert1(i.getCaseValue()->getType() == SwitchTy, - "Switch constants must all be same type as switch value!", &SI); - Assert2(Constants.insert(i.getCaseValue()), - "Duplicate integer as switch case", &SI, i.getCaseValue()); + IntegersSubset CaseRanges = i.getCaseValueEx(); + for (unsigned ri = 0, rie = CaseRanges.getNumItems(); ri < rie; ++ri) { + IntegersSubset::Range r = CaseRanges.getItem(ri); + Assert1(((const APInt&)r.getLow()).getBitWidth() == IntTy->getBitWidth(), + "Switch constants must all be same type as switch value!", &SI); + Assert1(((const APInt&)r.getHigh()).getBitWidth() == IntTy->getBitWidth(), + "Switch constants must all be same type as switch value!", &SI); + Mapping.add(r); + RangeSetMap[r] = i.getCaseIndex(); + } } - + + IntegersSubsetToBB::RangeIterator errItem; + if (!Mapping.verify(errItem)) { + unsigned CaseIndex = RangeSetMap[errItem->first]; + SwitchInst::CaseIt i(&SI, CaseIndex); + Assert2(false, "Duplicate integer as switch case", &SI, i.getCaseValueEx()); + } + visitTerminatorInst(SI); } @@ -1346,6 +1363,10 @@ void Verifier::visitGetElementPtrInst(GetElementPtrInst &GEP) { visitInstruction(GEP); } +static bool isContiguous(const ConstantRange &A, const ConstantRange &B) { + return A.getUpper() == B.getLower() || A.getLower() == B.getUpper(); +} + void Verifier::visitLoadInst(LoadInst &LI) { PointerType *PTy = dyn_cast<PointerType>(LI.getOperand(0)->getType()); Assert1(PTy, "Load operand must be a pointer.", &LI); @@ -1367,6 +1388,8 @@ void Verifier::visitLoadInst(LoadInst &LI) { Assert1(NumOperands % 2 == 0, "Unfinished range!", Range); unsigned NumRanges = NumOperands / 2; Assert1(NumRanges >= 1, "It should have at least one range!", Range); + + ConstantRange LastRange(1); // Dummy initial value for (unsigned i = 0; i < NumRanges; ++i) { ConstantInt *Low = dyn_cast<ConstantInt>(Range->getOperand(2*i)); Assert1(Low, "The lower limit must be an integer!", Low); @@ -1375,9 +1398,35 @@ void Verifier::visitLoadInst(LoadInst &LI) { Assert1(High->getType() == Low->getType() && High->getType() == ElTy, "Range types must match load type!", &LI); - Assert1(High->getValue() != Low->getValue(), "Range must not be empty!", + + APInt HighV = High->getValue(); + APInt LowV = Low->getValue(); + ConstantRange CurRange(LowV, HighV); + Assert1(!CurRange.isEmptySet() && !CurRange.isFullSet(), + "Range must not be empty!", Range); + if (i != 0) { + Assert1(CurRange.intersectWith(LastRange).isEmptySet(), + "Intervals are overlapping", Range); + Assert1(LowV.sgt(LastRange.getLower()), "Intervals are not in order", + Range); + Assert1(!isContiguous(CurRange, LastRange), "Intervals are contiguous", + Range); + } + LastRange = ConstantRange(LowV, HighV); + } + if (NumRanges > 2) { + APInt FirstLow = + dyn_cast<ConstantInt>(Range->getOperand(0))->getValue(); + APInt FirstHigh = + dyn_cast<ConstantInt>(Range->getOperand(1))->getValue(); + ConstantRange FirstRange(FirstLow, FirstHigh); + Assert1(FirstRange.intersectWith(LastRange).isEmptySet(), + "Intervals are overlapping", Range); + Assert1(!isContiguous(FirstRange, LastRange), "Intervals are contiguous", Range); } + + } visitInstruction(LI); @@ -1526,53 +1575,9 @@ void Verifier::visitLandingPadInst(LandingPadInst &LPI) { void 
Verifier::verifyDominatesUse(Instruction &I, unsigned i) {
   Instruction *Op = cast<Instruction>(I.getOperand(i));
-  BasicBlock *BB = I.getParent();
-  BasicBlock *OpBlock = Op->getParent();
-  PHINode *PN = dyn_cast<PHINode>(&I);
-
-  // DT can handle non phi instructions for us.
-  if (!PN) {
-    // Definition must dominate use unless use is unreachable!
-    Assert2(InstsInThisBlock.count(Op) || !DT->isReachableFromEntry(BB) ||
-            DT->dominates(Op, &I),
-            "Instruction does not dominate all uses!", Op, &I);
-    return;
-  }
 
-  // Check that a definition dominates all of its uses.
-  if (InvokeInst *II = dyn_cast<InvokeInst>(Op)) {
-    // Invoke results are only usable in the normal destination, not in the
-    // exceptional destination.
-    BasicBlock *NormalDest = II->getNormalDest();
-
-
-    // PHI nodes differ from other nodes because they actually "use" the
-    // value in the predecessor basic blocks they correspond to.
-    BasicBlock *UseBlock = BB;
-    unsigned j = PHINode::getIncomingValueNumForOperand(i);
-    UseBlock = PN->getIncomingBlock(j);
-    Assert2(UseBlock, "Invoke operand is PHI node with bad incoming-BB",
-            Op, &I);
-
-    if (UseBlock == OpBlock) {
-      // Special case of a phi node in the normal destination or the unwind
-      // destination.
-      Assert2(BB == NormalDest || !DT->isReachableFromEntry(UseBlock),
-              "Invoke result not available in the unwind destination!",
-              Op, &I);
-    } else {
-      Assert2(DT->dominates(II, UseBlock) ||
-              !DT->isReachableFromEntry(UseBlock),
-              "Invoke result does not dominate all uses!", Op, &I);
-    }
-  }
-
-  // PHI nodes are more difficult than other nodes because they actually
-  // "use" the value in the predecessor basic blocks they correspond to.
-  unsigned j = PHINode::getIncomingValueNumForOperand(i);
-  BasicBlock *PredBB = PN->getIncomingBlock(j);
-  Assert2(PredBB && (DT->dominates(OpBlock, PredBB) ||
-                     !DT->isReachableFromEntry(PredBB)),
+  const Use &U = I.getOperandUse(i);
+  Assert2(InstsInThisBlock.count(Op) || DT->dominates(Op, U),
           "Instruction does not dominate all uses!", Op, &I);
 }
 
@@ -1631,8 +1636,11 @@ void Verifier::visitInstruction(Instruction &I) {
     if (Function *F = dyn_cast<Function>(I.getOperand(i))) {
       // Check to make sure that the "address of" an intrinsic function is never
       // taken.
-      Assert1(!F->isIntrinsic() || (i + 1 == e && isa<CallInst>(I)),
+      Assert1(!F->isIntrinsic() || i == (isa<CallInst>(I) ? e-1 : 0),
               "Cannot take the address of an intrinsic!", &I);
+      Assert1(!F->isIntrinsic() || isa<CallInst>(I) ||
+              F->getIntrinsicID() == Intrinsic::donothing,
+              "Cannot invoke an intrinsic other than donothing", &I);
       Assert1(F->getParent() == Mod, "Referencing function in another module!",
               &I);
     } else if (BasicBlock *OpBB = dyn_cast<BasicBlock>(I.getOperand(i))) {
@@ -1673,10 +1681,85 @@ void Verifier::visitInstruction(Instruction &I) {
   InstsInThisBlock.insert(&I);
 }
 
-// Flags used by TableGen to mark intrinsic parameters with the
-// LLVMExtendedElementVectorType and LLVMTruncatedElementVectorType classes.
-static const unsigned ExtendedElementVectorType = 0x40000000;
-static const unsigned TruncatedElementVectorType = 0x20000000;
+/// VerifyIntrinsicType - Verify that the specified type (which comes from an
+/// intrinsic argument or return value) matches the type constraints specified
+/// by the .td file (e.g. an "any integer" argument really is an integer).
+///
+/// This returns true on error but does not print a message.
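
[Editorial aside] Before the implementation that follows, here is a standalone model of the convention the comment above spells out: a recursive check consumes one descriptor per call from the front of the stream, recurses into element types, and reports a mismatch by returning true. Ty, Desc, and verifyType are toy stand-ins invented for the sketch, not llvm::Type or Intrinsic::IITDescriptor.

#include <cstdio>

enum Kind { K_Void, K_Int, K_Ptr, K_Vec };

struct Ty {                  // toy stand-in for llvm::Type
  Kind K;
  unsigned Width;            // integer bit width or vector lane count
  const Ty *Elem;            // pointee / element type, if any
};

struct Desc { Kind K; unsigned N; };  // toy stand-in for IITDescriptor

// Returns true on error, mirroring VerifyIntrinsicType's convention.
static bool verifyType(const Ty *T, const Desc *&D, const Desc *End) {
  if (D == End) return true;          // ran out of descriptors
  Desc Cur = *D++;                    // consume the front descriptor
  switch (Cur.K) {
  case K_Void: return T->K != K_Void;
  case K_Int:  return T->K != K_Int || T->Width != Cur.N;
  case K_Ptr:  return T->K != K_Ptr || verifyType(T->Elem, D, End);
  case K_Vec:  return T->K != K_Vec || T->Width != Cur.N ||
                      verifyType(T->Elem, D, End);
  }
  return true;                        // defensive: unknown descriptor kind
}

int main() {
  Ty I32 = {K_Int, 32, nullptr};
  Ty V4I32 = {K_Vec, 4, &I32};
  Desc Table[] = {{K_Vec, 4}, {K_Int, 32}};   // describes "<4 x i32>"
  const Desc *D = Table, *End = Table + 2;
  std::printf("mismatch: %s\n", verifyType(&V4I32, D, End) ? "yes" : "no");
}
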
+bool Verifier::VerifyIntrinsicType(Type *Ty, + ArrayRef<Intrinsic::IITDescriptor> &Infos, + SmallVectorImpl<Type*> &ArgTys) { + using namespace Intrinsic; + + // If we ran out of descriptors, there are too many arguments. + if (Infos.empty()) return true; + IITDescriptor D = Infos.front(); + Infos = Infos.slice(1); + + switch (D.Kind) { + case IITDescriptor::Void: return !Ty->isVoidTy(); + case IITDescriptor::MMX: return !Ty->isX86_MMXTy(); + case IITDescriptor::Metadata: return !Ty->isMetadataTy(); + case IITDescriptor::Float: return !Ty->isFloatTy(); + case IITDescriptor::Double: return !Ty->isDoubleTy(); + case IITDescriptor::Integer: return !Ty->isIntegerTy(D.Integer_Width); + case IITDescriptor::Vector: { + VectorType *VT = dyn_cast<VectorType>(Ty); + return VT == 0 || VT->getNumElements() != D.Vector_Width || + VerifyIntrinsicType(VT->getElementType(), Infos, ArgTys); + } + case IITDescriptor::Pointer: { + PointerType *PT = dyn_cast<PointerType>(Ty); + return PT == 0 || PT->getAddressSpace() != D.Pointer_AddressSpace || + VerifyIntrinsicType(PT->getElementType(), Infos, ArgTys); + } + + case IITDescriptor::Struct: { + StructType *ST = dyn_cast<StructType>(Ty); + if (ST == 0 || ST->getNumElements() != D.Struct_NumElements) + return true; + + for (unsigned i = 0, e = D.Struct_NumElements; i != e; ++i) + if (VerifyIntrinsicType(ST->getElementType(i), Infos, ArgTys)) + return true; + return false; + } + + case IITDescriptor::Argument: + // Two cases here - If this is the second occurrence of an argument, verify + // that the later instance matches the previous instance. + if (D.getArgumentNumber() < ArgTys.size()) + return Ty != ArgTys[D.getArgumentNumber()]; + + // Otherwise, if this is the first instance of an argument, record it and + // verify the "Any" kind. + assert(D.getArgumentNumber() == ArgTys.size() && "Table consistency error"); + ArgTys.push_back(Ty); + + switch (D.getArgumentKind()) { + case IITDescriptor::AK_AnyInteger: return !Ty->isIntOrIntVectorTy(); + case IITDescriptor::AK_AnyFloat: return !Ty->isFPOrFPVectorTy(); + case IITDescriptor::AK_AnyVector: return !isa<VectorType>(Ty); + case IITDescriptor::AK_AnyPointer: return !isa<PointerType>(Ty); + } + llvm_unreachable("all argument kinds not covered"); + + case IITDescriptor::ExtendVecArgument: + // This may only be used when referring to a previous vector argument. + return D.getArgumentNumber() >= ArgTys.size() || + !isa<VectorType>(ArgTys[D.getArgumentNumber()]) || + VectorType::getExtendedElementVectorType( + cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty; + + case IITDescriptor::TruncVecArgument: + // This may only be used when referring to a previous vector argument. + return D.getArgumentNumber() >= ArgTys.size() || + !isa<VectorType>(ArgTys[D.getArgumentNumber()]) || + VectorType::getTruncatedElementVectorType( + cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty; + } + llvm_unreachable("unhandled"); +} /// visitIntrinsicFunction - Allow intrinsics to be verified in different ways. /// @@ -1685,10 +1768,30 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) { Assert1(IF->isDeclaration(), "Intrinsic functions should never be defined!", IF); -#define GET_INTRINSIC_VERIFIER -#include "llvm/Intrinsics.gen" -#undef GET_INTRINSIC_VERIFIER - + // Verify that the intrinsic prototype lines up with what the .td files + // describe. 
+ FunctionType *IFTy = IF->getFunctionType(); + Assert1(!IFTy->isVarArg(), "Intrinsic prototypes are not varargs", IF); + + SmallVector<Intrinsic::IITDescriptor, 8> Table; + getIntrinsicInfoTableEntries(ID, Table); + ArrayRef<Intrinsic::IITDescriptor> TableRef = Table; + + SmallVector<Type *, 4> ArgTys; + Assert1(!VerifyIntrinsicType(IFTy->getReturnType(), TableRef, ArgTys), + "Intrinsic has incorrect return type!", IF); + for (unsigned i = 0, e = IFTy->getNumParams(); i != e; ++i) + Assert1(!VerifyIntrinsicType(IFTy->getParamType(i), TableRef, ArgTys), + "Intrinsic has incorrect argument type!", IF); + Assert1(TableRef.empty(), "Intrinsic has too few arguments!", IF); + + // Now that we have the intrinsic ID and the actual argument types (and we + // know they are legal for the intrinsic!) get the intrinsic name through the + // usual means. This allows us to verify the mangling of argument types into + // the name. + Assert1(Intrinsic::getName(ID, ArgTys) == IF->getName(), + "Intrinsic name not mangled correctly for type arguments!", IF); + // If the intrinsic takes MDNode arguments, verify that they are either global // or are local to *this* function. for (unsigned i = 0, e = CI.getNumArgOperands(); i != e; ++i) @@ -1772,261 +1875,6 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) { } } -/// Produce a string to identify an intrinsic parameter or return value. -/// The ArgNo value numbers the return values from 0 to NumRets-1 and the -/// parameters beginning with NumRets. -/// -static std::string IntrinsicParam(unsigned ArgNo, unsigned NumRets) { - if (ArgNo >= NumRets) - return "Intrinsic parameter #" + utostr(ArgNo - NumRets); - if (NumRets == 1) - return "Intrinsic result type"; - return "Intrinsic result type #" + utostr(ArgNo); -} - -bool Verifier::PerformTypeCheck(Intrinsic::ID ID, Function *F, Type *Ty, - int VT, unsigned ArgNo, std::string &Suffix) { - FunctionType *FTy = F->getFunctionType(); - - unsigned NumElts = 0; - Type *EltTy = Ty; - VectorType *VTy = dyn_cast<VectorType>(Ty); - if (VTy) { - EltTy = VTy->getElementType(); - NumElts = VTy->getNumElements(); - } - - Type *RetTy = FTy->getReturnType(); - StructType *ST = dyn_cast<StructType>(RetTy); - unsigned NumRetVals; - if (RetTy->isVoidTy()) - NumRetVals = 0; - else if (ST) - NumRetVals = ST->getNumElements(); - else - NumRetVals = 1; - - if (VT < 0) { - int Match = ~VT; - - // Check flags that indicate a type that is an integral vector type with - // elements that are larger or smaller than the elements of the matched - // type. - if ((Match & (ExtendedElementVectorType | - TruncatedElementVectorType)) != 0) { - IntegerType *IEltTy = dyn_cast<IntegerType>(EltTy); - if (!VTy || !IEltTy) { - CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not " - "an integral vector type.", F); - return false; - } - // Adjust the current Ty (in the opposite direction) rather than - // the type being matched against. 
- if ((Match & ExtendedElementVectorType) != 0) { - if ((IEltTy->getBitWidth() & 1) != 0) { - CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " vector " - "element bit-width is odd.", F); - return false; - } - Ty = VectorType::getTruncatedElementVectorType(VTy); - } else - Ty = VectorType::getExtendedElementVectorType(VTy); - Match &= ~(ExtendedElementVectorType | TruncatedElementVectorType); - } - - if (Match <= static_cast<int>(NumRetVals - 1)) { - if (ST) - RetTy = ST->getElementType(Match); - - if (Ty != RetTy) { - CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " does not " - "match return type.", F); - return false; - } - } else { - if (Ty != FTy->getParamType(Match - NumRetVals)) { - CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " does not " - "match parameter %" + utostr(Match - NumRetVals) + ".", F); - return false; - } - } - } else if (VT == MVT::iAny) { - if (!EltTy->isIntegerTy()) { - CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not " - "an integer type.", F); - return false; - } - - unsigned GotBits = cast<IntegerType>(EltTy)->getBitWidth(); - Suffix += "."; - - if (EltTy != Ty) - Suffix += "v" + utostr(NumElts); - - Suffix += "i" + utostr(GotBits); - - // Check some constraints on various intrinsics. - switch (ID) { - default: break; // Not everything needs to be checked. - case Intrinsic::bswap: - if (GotBits < 16 || GotBits % 16 != 0) { - CheckFailed("Intrinsic requires even byte width argument", F); - return false; - } - break; - } - } else if (VT == MVT::fAny) { - if (!EltTy->isFloatingPointTy()) { - CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not " - "a floating-point type.", F); - return false; - } - - Suffix += "."; - - if (EltTy != Ty) - Suffix += "v" + utostr(NumElts); - - Suffix += EVT::getEVT(EltTy).getEVTString(); - } else if (VT == MVT::vAny) { - if (!VTy) { - CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not a vector type.", - F); - return false; - } - Suffix += ".v" + utostr(NumElts) + EVT::getEVT(EltTy).getEVTString(); - } else if (VT == MVT::iPTR) { - if (!Ty->isPointerTy()) { - CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not a " - "pointer and a pointer is required.", F); - return false; - } - } else if (VT == MVT::iPTRAny) { - // Outside of TableGen, we don't distinguish iPTRAny (to any address space) - // and iPTR. In the verifier, we can not distinguish which case we have so - // allow either case to be legal. - if (PointerType* PTyp = dyn_cast<PointerType>(Ty)) { - EVT PointeeVT = EVT::getEVT(PTyp->getElementType(), true); - if (PointeeVT == MVT::Other) { - CheckFailed("Intrinsic has pointer to complex type."); - return false; - } - Suffix += ".p" + utostr(PTyp->getAddressSpace()) + - PointeeVT.getEVTString(); - } else { - CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not a " - "pointer and a pointer is required.", F); - return false; - } - } else if (EVT((MVT::SimpleValueType)VT).isVector()) { - EVT VVT = EVT((MVT::SimpleValueType)VT); - - // If this is a vector argument, verify the number and type of elements. 
- if (VVT.getVectorElementType() != EVT::getEVT(EltTy)) { - CheckFailed("Intrinsic prototype has incorrect vector element type!", F); - return false; - } - - if (VVT.getVectorNumElements() != NumElts) { - CheckFailed("Intrinsic prototype has incorrect number of " - "vector elements!", F); - return false; - } - } else if (EVT((MVT::SimpleValueType)VT).getTypeForEVT(Ty->getContext()) != - EltTy) { - CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is wrong!", F); - return false; - } else if (EltTy != Ty) { - CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is a vector " - "and a scalar is required.", F); - return false; - } - - return true; -} - -/// VerifyIntrinsicPrototype - TableGen emits calls to this function into -/// Intrinsics.gen. This implements a little state machine that verifies the -/// prototype of intrinsics. -void Verifier::VerifyIntrinsicPrototype(Intrinsic::ID ID, Function *F, - unsigned NumRetVals, - unsigned NumParams, ...) { - va_list VA; - va_start(VA, NumParams); - FunctionType *FTy = F->getFunctionType(); - - // For overloaded intrinsics, the Suffix of the function name must match the - // types of the arguments. This variable keeps track of the expected - // suffix, to be checked at the end. - std::string Suffix; - - if (FTy->getNumParams() + FTy->isVarArg() != NumParams) { - CheckFailed("Intrinsic prototype has incorrect number of arguments!", F); - return; - } - - Type *Ty = FTy->getReturnType(); - StructType *ST = dyn_cast<StructType>(Ty); - - if (NumRetVals == 0 && !Ty->isVoidTy()) { - CheckFailed("Intrinsic should return void", F); - return; - } - - // Verify the return types. - if (ST && ST->getNumElements() != NumRetVals) { - CheckFailed("Intrinsic prototype has incorrect number of return types!", F); - return; - } - - for (unsigned ArgNo = 0; ArgNo != NumRetVals; ++ArgNo) { - int VT = va_arg(VA, int); // An MVT::SimpleValueType when non-negative. - - if (ST) Ty = ST->getElementType(ArgNo); - if (!PerformTypeCheck(ID, F, Ty, VT, ArgNo, Suffix)) - break; - } - - // Verify the parameter types. - for (unsigned ArgNo = 0; ArgNo != NumParams; ++ArgNo) { - int VT = va_arg(VA, int); // An MVT::SimpleValueType when non-negative. - - if (VT == MVT::isVoid && ArgNo > 0) { - if (!FTy->isVarArg()) - CheckFailed("Intrinsic prototype has no '...'!", F); - break; - } - - if (!PerformTypeCheck(ID, F, FTy->getParamType(ArgNo), VT, - ArgNo + NumRetVals, Suffix)) - break; - } - - va_end(VA); - - // For intrinsics without pointer arguments, if we computed a Suffix then the - // intrinsic is overloaded and we need to make sure that the name of the - // function is correct. We add the suffix to the name of the intrinsic and - // compare against the given function name. If they are not the same, the - // function name is invalid. This ensures that overloading of intrinsics - // uses a sane and consistent naming convention. Note that intrinsics with - // pointer argument may or may not be overloaded so we will check assuming it - // has a suffix and not. - if (!Suffix.empty()) { - std::string Name(Intrinsic::getName(ID)); - if (Name + Suffix != F->getName()) { - CheckFailed("Overloaded intrinsic has incorrect suffix: '" + - F->getName().substr(Name.length()) + "'. It should be '" + - Suffix + "'", F); - } - } - - // Check parameter attributes. 
-  Assert1(F->getAttributes() == Intrinsic::getAttributes(ID),
-          "Intrinsic has wrong parameter attributes!", F);
-}
-
-
 //===----------------------------------------------------------------------===//
 // Implement the public interfaces to this file...
 //===----------------------------------------------------------------------===//
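
[Editorial aside] With the va_arg state machine above deleted, both the verifier and Intrinsic::getType now consume the same descriptor table that getIntrinsicInfoTableEntries decodes earlier in this diff. IIT_Table and IIT_LongEncodingTable are emitted by TableGen, but the packing scheme itself is simple: descriptor values 0-15 fit four bits each, so up to eight of them pack into one 32-bit word, and longer sequences spill into the long table with the word's top bit set as a sentinel. Below is a toy round-trip of the nibble half of that scheme; packNibbles and unpackNibbles are invented helpers, and, like the real encoding, the word form cannot represent trailing zero entries, which is harmless because 0 is the IIT_Done terminator.

#include <cstdint>
#include <cstdio>
#include <vector>

// Pack values 0..15 into nibbles, lowest nibble first; returns 0 on overflow.
static uint32_t packNibbles(const std::vector<uint8_t> &Vals) {
  if (Vals.size() > 8) return 0;           // more than 8 nibbles cannot fit
  uint32_t Word = 0;
  for (size_t i = Vals.size(); i-- > 0;)   // fill from the last value down
    Word = (Word << 4) | (Vals[i] & 0xF);
  return Word;
}

static std::vector<uint8_t> unpackNibbles(uint32_t Word) {
  std::vector<uint8_t> Vals;
  do {                                     // mirrors the do/while decode loop
    Vals.push_back(Word & 0xF);
    Word >>= 4;
  } while (Word);
  return Vals;
}

int main() {
  std::vector<uint8_t> Seq = {4, 14, 4};   // e.g. IIT_I32, IIT_PTR, IIT_I32
  uint32_t W = packNibbles(Seq);
  for (uint8_t V : unpackNibbles(W))
    std::printf("%u ", unsigned(V));       // prints: 4 14 4
  std::printf("\n");
}
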